{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.8328611898017, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 15.287191390991211, "learning_rate": 1.5723270440251572e-09, "logps/chosen": -36.293212890625, "logps/rejected": -54.14521789550781, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 1.333309292793274, "losses/total": 0.6931471824645996, "ref_logps/chosen": -36.293212890625, "ref_logps/rejected": -54.14521789550781, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 16.00140953063965, "learning_rate": 3.1446540880503143e-09, "logps/chosen": -44.562477111816406, "logps/rejected": -46.48662185668945, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 0.9847710132598877, "losses/total": 0.6931471824645996, "ref_logps/chosen": -44.562477111816406, "ref_logps/rejected": -46.48662185668945, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0, "grad_norm": 16.550865173339844, "learning_rate": 4.716981132075472e-09, "logps/chosen": -41.267608642578125, "logps/rejected": -55.37574768066406, "loss": 0.6946, "losses/dpo": 0.6985788941383362, "losses/sft": 1.231009840965271, "losses/total": 0.6985788941383362, "ref_logps/chosen": -41.19807434082031, "ref_logps/rejected": -55.33381652832031, "rewards/accuracies": 0.3125, "rewards/chosen": -0.006953191943466663, "rewards/margins": -0.0027600531466305256, "rewards/rejected": -0.0041931383311748505, "step": 3 }, { "epoch": 0.0, "grad_norm": 19.647977828979492, "learning_rate": 6.289308176100629e-09, "logps/chosen": -31.251550674438477, "logps/rejected": -41.84539031982422, "loss": 0.6929, "losses/dpo": 0.6954505443572998, "losses/sft": 1.0519485473632812, "losses/total": 0.6954505443572998, "ref_logps/chosen": -31.226787567138672, "ref_logps/rejected": -41.81562805175781, "rewards/accuracies": 0.5625, "rewards/chosen": -0.002476316411048174, "rewards/margins": 0.0005000412929803133, "rewards/rejected": -0.002976357936859131, "step": 4 }, { "epoch": 0.0, "grad_norm": 13.318140983581543, "learning_rate": 7.861635220125786e-09, "logps/chosen": -32.866783142089844, "logps/rejected": -37.394134521484375, "loss": 0.6929, "losses/dpo": 0.6889871954917908, "losses/sft": 1.273924469947815, "losses/total": 0.6889871954917908, "ref_logps/chosen": -32.928836822509766, "ref_logps/rejected": -37.45051193237305, "rewards/accuracies": 0.4375, "rewards/chosen": 0.006205156445503235, "rewards/margins": 0.0005674933781847358, "rewards/rejected": 0.005637663416564465, "step": 5 }, { "epoch": 0.01, "grad_norm": 14.464860916137695, "learning_rate": 9.433962264150943e-09, "logps/chosen": -40.548370361328125, "logps/rejected": -41.034645080566406, "loss": 0.6927, "losses/dpo": 0.6911370754241943, "losses/sft": 1.591616153717041, "losses/total": 0.6911370754241943, "ref_logps/chosen": -40.54157257080078, "ref_logps/rejected": -41.017581939697266, "rewards/accuracies": 0.375, "rewards/chosen": -0.000679713673889637, "rewards/margins": 0.0010262848809361458, "rewards/rejected": -0.0017059982055798173, "step": 6 }, { "epoch": 0.01, "grad_norm": 13.795360565185547, "learning_rate": 1.1006289308176099e-08, "logps/chosen": -40.470916748046875, "logps/rejected": -42.38670349121094, "loss": 0.6955, "losses/dpo": 0.6914644837379456, "losses/sft": 1.4545750617980957, "losses/total": 0.6914644837379456, "ref_logps/chosen": -40.477840423583984, "ref_logps/rejected": -42.43923568725586, "rewards/accuracies": 0.375, "rewards/chosen": 0.0006923316977918148, "rewards/margins": -0.004560908768326044, "rewards/rejected": 0.005253240931779146, "step": 7 }, { "epoch": 0.01, "grad_norm": 16.392980575561523, "learning_rate": 1.2578616352201257e-08, "logps/chosen": -42.82190704345703, "logps/rejected": -50.23805236816406, "loss": 0.6902, "losses/dpo": 0.6934096217155457, "losses/sft": 1.8958083391189575, "losses/total": 0.6934096217155457, "ref_logps/chosen": -42.789737701416016, "ref_logps/rejected": -50.1453857421875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0032165111042559147, "rewards/margins": 0.006050032563507557, "rewards/rejected": -0.00926654227077961, "step": 8 }, { "epoch": 0.01, "grad_norm": 14.590455055236816, "learning_rate": 1.4150943396226414e-08, "logps/chosen": -32.913726806640625, "logps/rejected": -40.874839782714844, "loss": 0.696, "losses/dpo": 0.6927824020385742, "losses/sft": 1.417920470237732, "losses/total": 0.6927824020385742, "ref_logps/chosen": -32.87944793701172, "ref_logps/rejected": -40.89663314819336, "rewards/accuracies": 0.4375, "rewards/chosen": -0.003427929012104869, "rewards/margins": -0.005606940947473049, "rewards/rejected": 0.0021790117025375366, "step": 9 }, { "epoch": 0.01, "grad_norm": 13.96774959564209, "learning_rate": 1.5723270440251573e-08, "logps/chosen": -33.643367767333984, "logps/rejected": -39.197227478027344, "loss": 0.6936, "losses/dpo": 0.6882840394973755, "losses/sft": 1.5314520597457886, "losses/total": 0.6882840394973755, "ref_logps/chosen": -33.65266036987305, "ref_logps/rejected": -39.214569091796875, "rewards/accuracies": 0.5, "rewards/chosen": 0.0009294033516198397, "rewards/margins": -0.0008048177696764469, "rewards/rejected": 0.0017342206556349993, "step": 10 }, { "epoch": 0.01, "grad_norm": 15.812729835510254, "learning_rate": 1.729559748427673e-08, "logps/chosen": -33.84188461303711, "logps/rejected": -42.659942626953125, "loss": 0.6923, "losses/dpo": 0.6912386417388916, "losses/sft": 1.217553973197937, "losses/total": 0.6912386417388916, "ref_logps/chosen": -33.8734130859375, "ref_logps/rejected": -42.67314529418945, "rewards/accuracies": 0.625, "rewards/chosen": 0.0031529488041996956, "rewards/margins": 0.0018325985874980688, "rewards/rejected": 0.0013203503331169486, "step": 11 }, { "epoch": 0.01, "grad_norm": 13.129110336303711, "learning_rate": 1.8867924528301887e-08, "logps/chosen": -35.366729736328125, "logps/rejected": -36.78556823730469, "loss": 0.6954, "losses/dpo": 0.6976528167724609, "losses/sft": 1.3461360931396484, "losses/total": 0.6976528167724609, "ref_logps/chosen": -35.39350891113281, "ref_logps/rejected": -36.857025146484375, "rewards/accuracies": 0.375, "rewards/chosen": 0.0026777982711791992, "rewards/margins": -0.004467433784157038, "rewards/rejected": 0.007145232520997524, "step": 12 }, { "epoch": 0.01, "grad_norm": 15.79524040222168, "learning_rate": 2.044025157232704e-08, "logps/chosen": -42.27802658081055, "logps/rejected": -46.32032012939453, "loss": 0.6964, "losses/dpo": 0.704017162322998, "losses/sft": 1.3899930715560913, "losses/total": 0.704017162322998, "ref_logps/chosen": -42.27894973754883, "ref_logps/rejected": -46.3837890625, "rewards/accuracies": 0.25, "rewards/chosen": 9.242584928870201e-05, "rewards/margins": -0.006254673004150391, "rewards/rejected": 0.006347098853439093, "step": 13 }, { "epoch": 0.01, "grad_norm": 13.720596313476562, "learning_rate": 2.2012578616352197e-08, "logps/chosen": -35.25041961669922, "logps/rejected": -43.295013427734375, "loss": 0.691, "losses/dpo": 0.681664764881134, "losses/sft": 1.2782015800476074, "losses/total": 0.681664764881134, "ref_logps/chosen": -35.20989227294922, "ref_logps/rejected": -43.21023178100586, "rewards/accuracies": 0.6875, "rewards/chosen": -0.004052662290632725, "rewards/margins": 0.004425554536283016, "rewards/rejected": -0.00847821868956089, "step": 14 }, { "epoch": 0.01, "grad_norm": 13.796195983886719, "learning_rate": 2.3584905660377358e-08, "logps/chosen": -35.010528564453125, "logps/rejected": -36.66010665893555, "loss": 0.6938, "losses/dpo": 0.6871126890182495, "losses/sft": 1.4939182996749878, "losses/total": 0.6871126890182495, "ref_logps/chosen": -35.017948150634766, "ref_logps/rejected": -36.68010711669922, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0007417916785925627, "rewards/margins": -0.001258552074432373, "rewards/rejected": 0.0020003439858555794, "step": 15 }, { "epoch": 0.02, "grad_norm": 13.905455589294434, "learning_rate": 2.5157232704402515e-08, "logps/chosen": -35.276771545410156, "logps/rejected": -30.285860061645508, "loss": 0.6959, "losses/dpo": 0.6939616203308105, "losses/sft": 1.1386065483093262, "losses/total": 0.6939616203308105, "ref_logps/chosen": -35.17827224731445, "ref_logps/rejected": -30.2420597076416, "rewards/accuracies": 0.5, "rewards/chosen": -0.0098500307649374, "rewards/margins": -0.005470088683068752, "rewards/rejected": -0.004379943013191223, "step": 16 }, { "epoch": 0.02, "grad_norm": 15.296125411987305, "learning_rate": 2.672955974842767e-08, "logps/chosen": -47.09111022949219, "logps/rejected": -43.19539260864258, "loss": 0.6922, "losses/dpo": 0.6896570920944214, "losses/sft": 1.3599190711975098, "losses/total": 0.6896570920944214, "ref_logps/chosen": -47.078372955322266, "ref_logps/rejected": -43.1630859375, "rewards/accuracies": 0.625, "rewards/chosen": -0.0012737751239910722, "rewards/margins": 0.0019566123373806477, "rewards/rejected": -0.0032303868792951107, "step": 17 }, { "epoch": 0.02, "grad_norm": 16.253629684448242, "learning_rate": 2.830188679245283e-08, "logps/chosen": -35.313194274902344, "logps/rejected": -49.410675048828125, "loss": 0.6973, "losses/dpo": 0.7041923999786377, "losses/sft": 1.6521106958389282, "losses/total": 0.7041923999786377, "ref_logps/chosen": -35.28705978393555, "ref_logps/rejected": -49.466217041015625, "rewards/accuracies": 0.3125, "rewards/chosen": -0.002613651566207409, "rewards/margins": -0.008167792111635208, "rewards/rejected": 0.005554139614105225, "step": 18 }, { "epoch": 0.02, "grad_norm": 17.191499710083008, "learning_rate": 2.987421383647799e-08, "logps/chosen": -48.732940673828125, "logps/rejected": -58.556121826171875, "loss": 0.6931, "losses/dpo": 0.6878706812858582, "losses/sft": 1.5482776165008545, "losses/total": 0.6878706812858582, "ref_logps/chosen": -48.78472137451172, "ref_logps/rejected": -58.60450744628906, "rewards/accuracies": 0.5625, "rewards/chosen": 0.005178308114409447, "rewards/margins": 0.0003401516005396843, "rewards/rejected": 0.004838156513869762, "step": 19 }, { "epoch": 0.02, "grad_norm": 15.549980163574219, "learning_rate": 3.1446540880503146e-08, "logps/chosen": -36.84483337402344, "logps/rejected": -39.059471130371094, "loss": 0.6956, "losses/dpo": 0.699995756149292, "losses/sft": 1.1807096004486084, "losses/total": 0.699995756149292, "ref_logps/chosen": -36.83661651611328, "ref_logps/rejected": -39.09865188598633, "rewards/accuracies": 0.3125, "rewards/chosen": -0.000821572495624423, "rewards/margins": -0.004739719443023205, "rewards/rejected": 0.003918147645890713, "step": 20 }, { "epoch": 0.02, "grad_norm": 14.073959350585938, "learning_rate": 3.30188679245283e-08, "logps/chosen": -36.344810485839844, "logps/rejected": -41.956016540527344, "loss": 0.6923, "losses/dpo": 0.6898228526115417, "losses/sft": 1.222610354423523, "losses/total": 0.6898228526115417, "ref_logps/chosen": -36.35798263549805, "ref_logps/rejected": -41.95195770263672, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0013175427448004484, "rewards/margins": 0.0017231464153155684, "rewards/rejected": -0.0004056035540997982, "step": 21 }, { "epoch": 0.02, "grad_norm": 14.14978313446045, "learning_rate": 3.459119496855346e-08, "logps/chosen": -33.29254150390625, "logps/rejected": -47.185489654541016, "loss": 0.6939, "losses/dpo": 0.6981321573257446, "losses/sft": 1.4130489826202393, "losses/total": 0.6981321573257446, "ref_logps/chosen": -33.252281188964844, "ref_logps/rejected": -47.157386779785156, "rewards/accuracies": 0.5, "rewards/chosen": -0.004025830887258053, "rewards/margins": -0.0012156391749158502, "rewards/rejected": -0.002810192294418812, "step": 22 }, { "epoch": 0.02, "grad_norm": 14.848532676696777, "learning_rate": 3.6163522012578617e-08, "logps/chosen": -46.34131622314453, "logps/rejected": -39.75196075439453, "loss": 0.6976, "losses/dpo": 0.696555495262146, "losses/sft": 1.7955430746078491, "losses/total": 0.696555495262146, "ref_logps/chosen": -46.30574035644531, "ref_logps/rejected": -39.8038330078125, "rewards/accuracies": 0.375, "rewards/chosen": -0.003557533025741577, "rewards/margins": -0.008745087310671806, "rewards/rejected": 0.005187553353607655, "step": 23 }, { "epoch": 0.02, "grad_norm": 13.658124923706055, "learning_rate": 3.7735849056603774e-08, "logps/chosen": -31.579673767089844, "logps/rejected": -36.94577407836914, "loss": 0.6991, "losses/dpo": 0.6995179653167725, "losses/sft": 1.3396552801132202, "losses/total": 0.6995179653167725, "ref_logps/chosen": -31.527324676513672, "ref_logps/rejected": -37.01141357421875, "rewards/accuracies": 0.375, "rewards/chosen": -0.005234760232269764, "rewards/margins": -0.011798936873674393, "rewards/rejected": 0.0065641761757433414, "step": 24 }, { "epoch": 0.02, "grad_norm": 14.473611831665039, "learning_rate": 3.930817610062893e-08, "logps/chosen": -33.093265533447266, "logps/rejected": -38.25944519042969, "loss": 0.6981, "losses/dpo": 0.6936898231506348, "losses/sft": 1.3963024616241455, "losses/total": 0.6936898231506348, "ref_logps/chosen": -33.092132568359375, "ref_logps/rejected": -38.356536865234375, "rewards/accuracies": 0.3125, "rewards/chosen": -0.00011318037286400795, "rewards/margins": -0.009822163730859756, "rewards/rejected": 0.009708983823657036, "step": 25 }, { "epoch": 0.02, "grad_norm": 15.082950592041016, "learning_rate": 4.088050314465408e-08, "logps/chosen": -33.5494270324707, "logps/rejected": -38.667518615722656, "loss": 0.6905, "losses/dpo": 0.6945425868034363, "losses/sft": 1.6459039449691772, "losses/total": 0.6945425868034363, "ref_logps/chosen": -33.616607666015625, "ref_logps/rejected": -38.67987823486328, "rewards/accuracies": 0.5625, "rewards/chosen": 0.006717693526297808, "rewards/margins": 0.00548145454376936, "rewards/rejected": 0.001236238982528448, "step": 26 }, { "epoch": 0.03, "grad_norm": 16.236902236938477, "learning_rate": 4.245283018867924e-08, "logps/chosen": -40.37019348144531, "logps/rejected": -49.955196380615234, "loss": 0.6905, "losses/dpo": 0.6870005130767822, "losses/sft": 1.2192702293395996, "losses/total": 0.6870005130767822, "ref_logps/chosen": -40.37604904174805, "ref_logps/rejected": -49.90593338012695, "rewards/accuracies": 0.625, "rewards/chosen": 0.0005852816393598914, "rewards/margins": 0.005511948373168707, "rewards/rejected": -0.004926666617393494, "step": 27 }, { "epoch": 0.03, "grad_norm": 14.55437183380127, "learning_rate": 4.4025157232704395e-08, "logps/chosen": -32.74169158935547, "logps/rejected": -39.09827423095703, "loss": 0.6934, "losses/dpo": 0.6983916759490967, "losses/sft": 1.4402568340301514, "losses/total": 0.6983916759490967, "ref_logps/chosen": -32.714149475097656, "ref_logps/rejected": -39.075469970703125, "rewards/accuracies": 0.5, "rewards/chosen": -0.0027539432048797607, "rewards/margins": -0.0004735295078717172, "rewards/rejected": -0.0022804138716310263, "step": 28 }, { "epoch": 0.03, "grad_norm": 14.335480690002441, "learning_rate": 4.559748427672955e-08, "logps/chosen": -34.224037170410156, "logps/rejected": -37.38675308227539, "loss": 0.6936, "losses/dpo": 0.6951794624328613, "losses/sft": 1.4238932132720947, "losses/total": 0.6951794624328613, "ref_logps/chosen": -34.20547866821289, "ref_logps/rejected": -37.376678466796875, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0018555226270109415, "rewards/margins": -0.0008478106465190649, "rewards/rejected": -0.0010077119804918766, "step": 29 }, { "epoch": 0.03, "grad_norm": 15.342060089111328, "learning_rate": 4.7169811320754715e-08, "logps/chosen": -34.61055374145508, "logps/rejected": -42.71800994873047, "loss": 0.6876, "losses/dpo": 0.6789344549179077, "losses/sft": 1.1305980682373047, "losses/total": 0.6789344549179077, "ref_logps/chosen": -34.644195556640625, "ref_logps/rejected": -42.638336181640625, "rewards/accuracies": 0.625, "rewards/chosen": 0.003364542266353965, "rewards/margins": 0.01133134588599205, "rewards/rejected": -0.007966804318130016, "step": 30 }, { "epoch": 0.03, "grad_norm": 13.72602367401123, "learning_rate": 4.874213836477987e-08, "logps/chosen": -32.430580139160156, "logps/rejected": -33.662437438964844, "loss": 0.6934, "losses/dpo": 0.69603031873703, "losses/sft": 1.152692437171936, "losses/total": 0.69603031873703, "ref_logps/chosen": -32.425933837890625, "ref_logps/rejected": -33.66048049926758, "rewards/accuracies": 0.375, "rewards/chosen": -0.0004647308960556984, "rewards/margins": -0.00026935047935694456, "rewards/rejected": -0.00019538099877536297, "step": 31 }, { "epoch": 0.03, "grad_norm": 15.070570945739746, "learning_rate": 5.031446540880503e-08, "logps/chosen": -40.984222412109375, "logps/rejected": -47.076194763183594, "loss": 0.6897, "losses/dpo": 0.6885380744934082, "losses/sft": 1.491570234298706, "losses/total": 0.6885380744934082, "ref_logps/chosen": -41.050697326660156, "ref_logps/rejected": -47.071189880371094, "rewards/accuracies": 0.5625, "rewards/chosen": 0.006647652946412563, "rewards/margins": 0.00714834313839674, "rewards/rejected": -0.0005006909486837685, "step": 32 }, { "epoch": 0.03, "grad_norm": 13.632221221923828, "learning_rate": 5.1886792452830186e-08, "logps/chosen": -33.047698974609375, "logps/rejected": -34.45159912109375, "loss": 0.6958, "losses/dpo": 0.689713716506958, "losses/sft": 1.2142502069473267, "losses/total": 0.689713716506958, "ref_logps/chosen": -33.024322509765625, "ref_logps/rejected": -34.47886657714844, "rewards/accuracies": 0.4375, "rewards/chosen": -0.002337571932002902, "rewards/margins": -0.005064377095550299, "rewards/rejected": 0.002726804930716753, "step": 33 }, { "epoch": 0.03, "grad_norm": 13.262465476989746, "learning_rate": 5.345911949685534e-08, "logps/chosen": -30.140480041503906, "logps/rejected": -34.00312805175781, "loss": 0.6962, "losses/dpo": 0.6963517665863037, "losses/sft": 1.2056186199188232, "losses/total": 0.6963517665863037, "ref_logps/chosen": -30.103130340576172, "ref_logps/rejected": -34.02630615234375, "rewards/accuracies": 0.375, "rewards/chosen": -0.003735171165317297, "rewards/margins": -0.006053084507584572, "rewards/rejected": 0.0023179128766059875, "step": 34 }, { "epoch": 0.03, "grad_norm": 15.316493034362793, "learning_rate": 5.50314465408805e-08, "logps/chosen": -36.81224822998047, "logps/rejected": -51.80878448486328, "loss": 0.6917, "losses/dpo": 0.6966387629508972, "losses/sft": 1.3699390888214111, "losses/total": 0.6966387629508972, "ref_logps/chosen": -36.79683303833008, "ref_logps/rejected": -51.762454986572266, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0015415906673297286, "rewards/margins": 0.0030912284273654222, "rewards/rejected": -0.004632818978279829, "step": 35 }, { "epoch": 0.03, "grad_norm": 13.993942260742188, "learning_rate": 5.660377358490566e-08, "logps/chosen": -32.81092834472656, "logps/rejected": -35.80175018310547, "loss": 0.6935, "losses/dpo": 0.6949833035469055, "losses/sft": 1.1595534086227417, "losses/total": 0.6949833035469055, "ref_logps/chosen": -32.79785919189453, "ref_logps/rejected": -35.79462814331055, "rewards/accuracies": 0.5, "rewards/chosen": -0.00130692427046597, "rewards/margins": -0.0005950063932687044, "rewards/rejected": -0.000711917644366622, "step": 36 }, { "epoch": 0.03, "grad_norm": 14.649563789367676, "learning_rate": 5.8176100628930814e-08, "logps/chosen": -36.67781066894531, "logps/rejected": -47.96495819091797, "loss": 0.6901, "losses/dpo": 0.6957077980041504, "losses/sft": 1.529431939125061, "losses/total": 0.6957077980041504, "ref_logps/chosen": -36.696502685546875, "ref_logps/rejected": -47.91997146606445, "rewards/accuracies": 0.5625, "rewards/chosen": 0.001869445899501443, "rewards/margins": 0.006367784459143877, "rewards/rejected": -0.004498339258134365, "step": 37 }, { "epoch": 0.04, "grad_norm": 14.203393936157227, "learning_rate": 5.974842767295598e-08, "logps/chosen": -41.23329162597656, "logps/rejected": -35.483360290527344, "loss": 0.693, "losses/dpo": 0.7117766737937927, "losses/sft": 1.2247445583343506, "losses/total": 0.7117766737937927, "ref_logps/chosen": -41.22758483886719, "ref_logps/rejected": -35.47206115722656, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0005709344986826181, "rewards/margins": 0.0005590622313320637, "rewards/rejected": -0.0011299969628453255, "step": 38 }, { "epoch": 0.04, "grad_norm": 15.450180053710938, "learning_rate": 6.132075471698113e-08, "logps/chosen": -39.377593994140625, "logps/rejected": -46.454246520996094, "loss": 0.6975, "losses/dpo": 0.6997748613357544, "losses/sft": 1.5618855953216553, "losses/total": 0.6997748613357544, "ref_logps/chosen": -39.30659866333008, "ref_logps/rejected": -46.469581604003906, "rewards/accuracies": 0.25, "rewards/chosen": -0.007099542301148176, "rewards/margins": -0.008633071556687355, "rewards/rejected": 0.001533529139123857, "step": 39 }, { "epoch": 0.04, "grad_norm": 14.611263275146484, "learning_rate": 6.289308176100629e-08, "logps/chosen": -31.116464614868164, "logps/rejected": -36.931434631347656, "loss": 0.6954, "losses/dpo": 0.6901916861534119, "losses/sft": 1.1429109573364258, "losses/total": 0.6901916861534119, "ref_logps/chosen": -31.093990325927734, "ref_logps/rejected": -36.953338623046875, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0022472681012004614, "rewards/margins": -0.004437911324203014, "rewards/rejected": 0.002190643921494484, "step": 40 }, { "epoch": 0.04, "grad_norm": 13.562875747680664, "learning_rate": 6.446540880503144e-08, "logps/chosen": -33.89849853515625, "logps/rejected": -40.63304138183594, "loss": 0.6924, "losses/dpo": 0.6974458694458008, "losses/sft": 1.3313788175582886, "losses/total": 0.6974458694458008, "ref_logps/chosen": -33.87089538574219, "ref_logps/rejected": -40.59001159667969, "rewards/accuracies": 0.4375, "rewards/chosen": -0.002760761883109808, "rewards/margins": 0.001542425248771906, "rewards/rejected": -0.004303187131881714, "step": 41 }, { "epoch": 0.04, "grad_norm": 13.275260925292969, "learning_rate": 6.60377358490566e-08, "logps/chosen": -31.73421287536621, "logps/rejected": -37.52004623413086, "loss": 0.6971, "losses/dpo": 0.6914031505584717, "losses/sft": 1.2416191101074219, "losses/total": 0.6914031505584717, "ref_logps/chosen": -31.701637268066406, "ref_logps/rejected": -37.56598663330078, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0032576103694736958, "rewards/margins": -0.007851381786167622, "rewards/rejected": 0.004593771882355213, "step": 42 }, { "epoch": 0.04, "grad_norm": 15.27834701538086, "learning_rate": 6.761006289308176e-08, "logps/chosen": -38.450767517089844, "logps/rejected": -45.00822448730469, "loss": 0.6918, "losses/dpo": 0.6955606937408447, "losses/sft": 1.1744383573532104, "losses/total": 0.6955606937408447, "ref_logps/chosen": -38.456146240234375, "ref_logps/rejected": -44.983848571777344, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0005378782516345382, "rewards/margins": 0.002975050127133727, "rewards/rejected": -0.002437171759083867, "step": 43 }, { "epoch": 0.04, "grad_norm": 15.608879089355469, "learning_rate": 6.918238993710692e-08, "logps/chosen": -28.18182945251465, "logps/rejected": -47.542015075683594, "loss": 0.6946, "losses/dpo": 0.6975482702255249, "losses/sft": 0.8388431072235107, "losses/total": 0.6975482702255249, "ref_logps/chosen": -28.196659088134766, "ref_logps/rejected": -47.58372497558594, "rewards/accuracies": 0.375, "rewards/chosen": 0.0014829186256974936, "rewards/margins": -0.002688285894691944, "rewards/rejected": 0.004171204753220081, "step": 44 }, { "epoch": 0.04, "grad_norm": 14.393813133239746, "learning_rate": 7.075471698113207e-08, "logps/chosen": -38.44715118408203, "logps/rejected": -42.931541442871094, "loss": 0.6936, "losses/dpo": 0.6936181783676147, "losses/sft": 1.3454254865646362, "losses/total": 0.6936181783676147, "ref_logps/chosen": -38.42930221557617, "ref_logps/rejected": -42.92062759399414, "rewards/accuracies": 0.5, "rewards/chosen": -0.0017848550342023373, "rewards/margins": -0.0006935299606993794, "rewards/rejected": -0.0010913253063336015, "step": 45 }, { "epoch": 0.04, "grad_norm": 16.733362197875977, "learning_rate": 7.232704402515723e-08, "logps/chosen": -37.33485412597656, "logps/rejected": -41.190853118896484, "loss": 0.6903, "losses/dpo": 0.6862717270851135, "losses/sft": 1.1105400323867798, "losses/total": 0.6862717270851135, "ref_logps/chosen": -37.37635040283203, "ref_logps/rejected": -41.17495346069336, "rewards/accuracies": 0.75, "rewards/chosen": 0.004149759188294411, "rewards/margins": 0.0057398732751607895, "rewards/rejected": -0.0015901147853583097, "step": 46 }, { "epoch": 0.04, "grad_norm": 15.69994831085205, "learning_rate": 7.389937106918238e-08, "logps/chosen": -44.16721725463867, "logps/rejected": -37.57732009887695, "loss": 0.6932, "losses/dpo": 0.6937659978866577, "losses/sft": 1.4528712034225464, "losses/total": 0.6937659978866577, "ref_logps/chosen": -44.15205764770508, "ref_logps/rejected": -37.56299591064453, "rewards/accuracies": 0.5625, "rewards/chosen": -0.001515796990133822, "rewards/margins": -8.350861025974154e-05, "rewards/rejected": -0.001432287972420454, "step": 47 }, { "epoch": 0.05, "grad_norm": 15.128890991210938, "learning_rate": 7.547169811320755e-08, "logps/chosen": -44.43276596069336, "logps/rejected": -46.47834777832031, "loss": 0.6911, "losses/dpo": 0.690741240978241, "losses/sft": 1.3188337087631226, "losses/total": 0.690741240978241, "ref_logps/chosen": -44.44622802734375, "ref_logps/rejected": -46.45027160644531, "rewards/accuracies": 0.6875, "rewards/chosen": 0.00134660629555583, "rewards/margins": 0.0041544619016349316, "rewards/rejected": -0.0028078556060791016, "step": 48 }, { "epoch": 0.05, "grad_norm": 16.965360641479492, "learning_rate": 7.70440251572327e-08, "logps/chosen": -39.97547912597656, "logps/rejected": -48.518775939941406, "loss": 0.6885, "losses/dpo": 0.6853836178779602, "losses/sft": 1.380096673965454, "losses/total": 0.6853836178779602, "ref_logps/chosen": -40.01591491699219, "ref_logps/rejected": -48.46247863769531, "rewards/accuracies": 0.625, "rewards/chosen": 0.004043960943818092, "rewards/margins": 0.009673237800598145, "rewards/rejected": -0.0056292773224413395, "step": 49 }, { "epoch": 0.05, "grad_norm": 15.04533576965332, "learning_rate": 7.861635220125786e-08, "logps/chosen": -39.61756896972656, "logps/rejected": -49.66996383666992, "loss": 0.6942, "losses/dpo": 0.6997429728507996, "losses/sft": 1.417793869972229, "losses/total": 0.6997429728507996, "ref_logps/chosen": -39.60337829589844, "ref_logps/rejected": -49.67588424682617, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0014191538793966174, "rewards/margins": -0.0020113971550017595, "rewards/rejected": 0.0005922436248511076, "step": 50 }, { "epoch": 0.05, "grad_norm": 15.650992393493652, "learning_rate": 8.018867924528301e-08, "logps/chosen": -34.93170928955078, "logps/rejected": -45.80840301513672, "loss": 0.6917, "losses/dpo": 0.6938539147377014, "losses/sft": 1.172393798828125, "losses/total": 0.6938539147377014, "ref_logps/chosen": -34.88665771484375, "ref_logps/rejected": -45.73363494873047, "rewards/accuracies": 0.5, "rewards/chosen": -0.004505685530602932, "rewards/margins": 0.0029702871106565, "rewards/rejected": -0.0074759721755981445, "step": 51 }, { "epoch": 0.05, "grad_norm": 12.861350059509277, "learning_rate": 8.176100628930816e-08, "logps/chosen": -30.98968505859375, "logps/rejected": -37.95505142211914, "loss": 0.6912, "losses/dpo": 0.6984187364578247, "losses/sft": 1.143085241317749, "losses/total": 0.6984187364578247, "ref_logps/chosen": -31.017133712768555, "ref_logps/rejected": -37.94151306152344, "rewards/accuracies": 0.625, "rewards/chosen": 0.0027447701431810856, "rewards/margins": 0.004098537378013134, "rewards/rejected": -0.0013537677004933357, "step": 52 }, { "epoch": 0.05, "grad_norm": 15.835521697998047, "learning_rate": 8.333333333333333e-08, "logps/chosen": -41.60312271118164, "logps/rejected": -39.74080276489258, "loss": 0.6937, "losses/dpo": 0.6793924570083618, "losses/sft": 1.4429981708526611, "losses/total": 0.6793924570083618, "ref_logps/chosen": -41.5811767578125, "ref_logps/rejected": -39.72795867919922, "rewards/accuracies": 0.375, "rewards/chosen": -0.002194375265389681, "rewards/margins": -0.0009097992442548275, "rewards/rejected": -0.0012845754390582442, "step": 53 }, { "epoch": 0.05, "grad_norm": 15.634050369262695, "learning_rate": 8.490566037735848e-08, "logps/chosen": -46.38857650756836, "logps/rejected": -40.06861114501953, "loss": 0.6962, "losses/dpo": 0.6969350576400757, "losses/sft": 1.2307177782058716, "losses/total": 0.6969350576400757, "ref_logps/chosen": -46.36830520629883, "ref_logps/rejected": -40.10871505737305, "rewards/accuracies": 0.375, "rewards/chosen": -0.002026684582233429, "rewards/margins": -0.006037433166056871, "rewards/rejected": 0.00401074904948473, "step": 54 }, { "epoch": 0.05, "grad_norm": 14.544160842895508, "learning_rate": 8.647798742138364e-08, "logps/chosen": -32.85659408569336, "logps/rejected": -40.02392578125, "loss": 0.6964, "losses/dpo": 0.689262866973877, "losses/sft": 1.1000863313674927, "losses/total": 0.689262866973877, "ref_logps/chosen": -32.840023040771484, "ref_logps/rejected": -40.07231140136719, "rewards/accuracies": 0.375, "rewards/chosen": -0.0016567648854106665, "rewards/margins": -0.0064952559769153595, "rewards/rejected": 0.004838490858674049, "step": 55 }, { "epoch": 0.05, "grad_norm": 14.296195983886719, "learning_rate": 8.805031446540879e-08, "logps/chosen": -41.2823371887207, "logps/rejected": -38.29021072387695, "loss": 0.693, "losses/dpo": 0.6974284648895264, "losses/sft": 1.3975054025650024, "losses/total": 0.6974284648895264, "ref_logps/chosen": -41.26557922363281, "ref_logps/rejected": -38.268707275390625, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0016760170692577958, "rewards/margins": 0.0004747032653540373, "rewards/rejected": -0.002150720451027155, "step": 56 }, { "epoch": 0.05, "grad_norm": 14.8149995803833, "learning_rate": 8.962264150943395e-08, "logps/chosen": -33.73179626464844, "logps/rejected": -41.118045806884766, "loss": 0.6936, "losses/dpo": 0.6961352229118347, "losses/sft": 1.2129688262939453, "losses/total": 0.6961352229118347, "ref_logps/chosen": -33.73820877075195, "ref_logps/rejected": -41.132591247558594, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0006412325892597437, "rewards/margins": -0.0008131118956953287, "rewards/rejected": 0.0014543444849550724, "step": 57 }, { "epoch": 0.05, "grad_norm": 13.560029983520508, "learning_rate": 9.11949685534591e-08, "logps/chosen": -36.52886962890625, "logps/rejected": -34.41619873046875, "loss": 0.6932, "losses/dpo": 0.6908233165740967, "losses/sft": 1.0413261651992798, "losses/total": 0.6908233165740967, "ref_logps/chosen": -36.54736328125, "ref_logps/rejected": -34.43437957763672, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0018498122226446867, "rewards/margins": 3.185553941875696e-05, "rewards/rejected": 0.0018179567996412516, "step": 58 }, { "epoch": 0.06, "grad_norm": 15.819250106811523, "learning_rate": 9.276729559748427e-08, "logps/chosen": -42.62882614135742, "logps/rejected": -42.30875015258789, "loss": 0.6906, "losses/dpo": 0.6812754273414612, "losses/sft": 1.4014006853103638, "losses/total": 0.6812754273414612, "ref_logps/chosen": -42.63201904296875, "ref_logps/rejected": -42.25872802734375, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00031913910061120987, "rewards/margins": 0.005321601405739784, "rewards/rejected": -0.005002461373806, "step": 59 }, { "epoch": 0.06, "grad_norm": 13.456873893737793, "learning_rate": 9.433962264150943e-08, "logps/chosen": -30.847614288330078, "logps/rejected": -33.43515396118164, "loss": 0.6919, "losses/dpo": 0.6852722764015198, "losses/sft": 1.3770960569381714, "losses/total": 0.6852722764015198, "ref_logps/chosen": -30.881393432617188, "ref_logps/rejected": -33.4415283203125, "rewards/accuracies": 0.5, "rewards/chosen": 0.003377854824066162, "rewards/margins": 0.002740193158388138, "rewards/rejected": 0.0006376623641699553, "step": 60 }, { "epoch": 0.06, "grad_norm": 15.81886100769043, "learning_rate": 9.59119496855346e-08, "logps/chosen": -41.415042877197266, "logps/rejected": -50.666526794433594, "loss": 0.6946, "losses/dpo": 0.7051351070404053, "losses/sft": 1.4612579345703125, "losses/total": 0.7051351070404053, "ref_logps/chosen": -41.415164947509766, "ref_logps/rejected": -50.69341278076172, "rewards/accuracies": 0.5, "rewards/chosen": 1.2171454727649689e-05, "rewards/margins": -0.002675973577424884, "rewards/rejected": 0.0026881457306444645, "step": 61 }, { "epoch": 0.06, "grad_norm": 14.154650688171387, "learning_rate": 9.748427672955974e-08, "logps/chosen": -43.80131530761719, "logps/rejected": -36.09864807128906, "loss": 0.6868, "losses/dpo": 0.6857415437698364, "losses/sft": 2.0101566314697266, "losses/total": 0.6857415437698364, "ref_logps/chosen": -43.869712829589844, "ref_logps/rejected": -36.038368225097656, "rewards/accuracies": 0.6875, "rewards/chosen": 0.006840109825134277, "rewards/margins": 0.01286797784268856, "rewards/rejected": -0.0060278684832155704, "step": 62 }, { "epoch": 0.06, "grad_norm": 13.230729103088379, "learning_rate": 9.905660377358491e-08, "logps/chosen": -31.131973266601562, "logps/rejected": -37.758209228515625, "loss": 0.6926, "losses/dpo": 0.6912837624549866, "losses/sft": 1.1980384588241577, "losses/total": 0.6912837624549866, "ref_logps/chosen": -31.096126556396484, "ref_logps/rejected": -37.710594177246094, "rewards/accuracies": 0.5625, "rewards/chosen": -0.003584663849323988, "rewards/margins": 0.0011769132688641548, "rewards/rejected": -0.004761577118188143, "step": 63 }, { "epoch": 0.06, "grad_norm": 17.403446197509766, "learning_rate": 1.0062893081761006e-07, "logps/chosen": -41.96482849121094, "logps/rejected": -53.58015823364258, "loss": 0.6955, "losses/dpo": 0.6856638193130493, "losses/sft": 1.075535774230957, "losses/total": 0.6856638193130493, "ref_logps/chosen": -41.92784118652344, "ref_logps/rejected": -53.586753845214844, "rewards/accuracies": 0.5625, "rewards/chosen": -0.003698870772495866, "rewards/margins": -0.004358276724815369, "rewards/rejected": 0.0006594061851501465, "step": 64 }, { "epoch": 0.06, "grad_norm": 14.0162992477417, "learning_rate": 1.0220125786163522e-07, "logps/chosen": -23.421993255615234, "logps/rejected": -43.844642639160156, "loss": 0.6926, "losses/dpo": 0.7009305953979492, "losses/sft": 1.3693844079971313, "losses/total": 0.7009305953979492, "ref_logps/chosen": -23.462329864501953, "ref_logps/rejected": -43.87372589111328, "rewards/accuracies": 0.375, "rewards/chosen": 0.00403362512588501, "rewards/margins": 0.0011253058910369873, "rewards/rejected": 0.0029083192348480225, "step": 65 }, { "epoch": 0.06, "grad_norm": 14.804617881774902, "learning_rate": 1.0377358490566037e-07, "logps/chosen": -39.25425338745117, "logps/rejected": -40.46149444580078, "loss": 0.6966, "losses/dpo": 0.7018302083015442, "losses/sft": 1.4890705347061157, "losses/total": 0.7018302083015442, "ref_logps/chosen": -39.18394088745117, "ref_logps/rejected": -40.45819091796875, "rewards/accuracies": 0.5, "rewards/chosen": -0.0070314290933310986, "rewards/margins": -0.006701166275888681, "rewards/rejected": -0.00033026374876499176, "step": 66 }, { "epoch": 0.06, "grad_norm": 13.262336730957031, "learning_rate": 1.0534591194968554e-07, "logps/chosen": -30.260812759399414, "logps/rejected": -33.28879165649414, "loss": 0.6948, "losses/dpo": 0.6856677532196045, "losses/sft": 0.8565500974655151, "losses/total": 0.6856677532196045, "ref_logps/chosen": -30.227874755859375, "ref_logps/rejected": -33.28705596923828, "rewards/accuracies": 0.5, "rewards/chosen": -0.003294035093858838, "rewards/margins": -0.0031204731203615665, "rewards/rejected": -0.0001735622063279152, "step": 67 }, { "epoch": 0.06, "grad_norm": 14.190892219543457, "learning_rate": 1.0691823899371069e-07, "logps/chosen": -31.034976959228516, "logps/rejected": -41.680450439453125, "loss": 0.6918, "losses/dpo": 0.6886014938354492, "losses/sft": 1.6718907356262207, "losses/total": 0.6886014938354492, "ref_logps/chosen": -31.038402557373047, "ref_logps/rejected": -41.65570831298828, "rewards/accuracies": 0.625, "rewards/chosen": 0.00034259981475770473, "rewards/margins": 0.0028164724353700876, "rewards/rejected": -0.0024738728534430265, "step": 68 }, { "epoch": 0.07, "grad_norm": 14.235614776611328, "learning_rate": 1.0849056603773585e-07, "logps/chosen": -38.295806884765625, "logps/rejected": -40.39185333251953, "loss": 0.6901, "losses/dpo": 0.6912087202072144, "losses/sft": 1.3809698820114136, "losses/total": 0.6912087202072144, "ref_logps/chosen": -38.35113525390625, "ref_logps/rejected": -40.38447952270508, "rewards/accuracies": 0.625, "rewards/chosen": 0.0055329324677586555, "rewards/margins": 0.006270730402320623, "rewards/rejected": -0.0007377980509772897, "step": 69 }, { "epoch": 0.07, "grad_norm": 14.858607292175293, "learning_rate": 1.10062893081761e-07, "logps/chosen": -36.434654235839844, "logps/rejected": -42.416053771972656, "loss": 0.6965, "losses/dpo": 0.6960601806640625, "losses/sft": 1.3788210153579712, "losses/total": 0.6960601806640625, "ref_logps/chosen": -36.4222526550293, "ref_logps/rejected": -42.46902084350586, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0012397617101669312, "rewards/margins": -0.006536379922181368, "rewards/rejected": 0.005296617280691862, "step": 70 }, { "epoch": 0.07, "grad_norm": 13.156291007995605, "learning_rate": 1.1163522012578616e-07, "logps/chosen": -24.545652389526367, "logps/rejected": -35.784202575683594, "loss": 0.6923, "losses/dpo": 0.6898338794708252, "losses/sft": 1.1819456815719604, "losses/total": 0.6898338794708252, "ref_logps/chosen": -24.52574920654297, "ref_logps/rejected": -35.74748992919922, "rewards/accuracies": 0.5625, "rewards/chosen": -0.001990178134292364, "rewards/margins": 0.0016812352696433663, "rewards/rejected": -0.0036714139860123396, "step": 71 }, { "epoch": 0.07, "grad_norm": 16.261690139770508, "learning_rate": 1.1320754716981131e-07, "logps/chosen": -30.026611328125, "logps/rejected": -44.707176208496094, "loss": 0.6919, "losses/dpo": 0.6989429593086243, "losses/sft": 1.3058668375015259, "losses/total": 0.6989429593086243, "ref_logps/chosen": -30.060457229614258, "ref_logps/rejected": -44.71483612060547, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0033847063314169645, "rewards/margins": 0.0026185151655226946, "rewards/rejected": 0.0007661909912712872, "step": 72 }, { "epoch": 0.07, "grad_norm": 14.12116813659668, "learning_rate": 1.1477987421383648e-07, "logps/chosen": -30.411203384399414, "logps/rejected": -43.75214385986328, "loss": 0.6931, "losses/dpo": 0.6866962909698486, "losses/sft": 0.9678082466125488, "losses/total": 0.6866962909698486, "ref_logps/chosen": -30.391040802001953, "ref_logps/rejected": -43.72895812988281, "rewards/accuracies": 0.5, "rewards/chosen": -0.0020164516754448414, "rewards/margins": 0.00030208518728613853, "rewards/rejected": -0.002318537561222911, "step": 73 }, { "epoch": 0.07, "grad_norm": 15.123022079467773, "learning_rate": 1.1635220125786163e-07, "logps/chosen": -47.57429504394531, "logps/rejected": -44.82411193847656, "loss": 0.6922, "losses/dpo": 0.6984449028968811, "losses/sft": 1.6888502836227417, "losses/total": 0.6984449028968811, "ref_logps/chosen": -47.54075241088867, "ref_logps/rejected": -44.771324157714844, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0033542518503963947, "rewards/margins": 0.001925045158714056, "rewards/rejected": -0.0052792965434491634, "step": 74 }, { "epoch": 0.07, "grad_norm": 15.112242698669434, "learning_rate": 1.1792452830188679e-07, "logps/chosen": -35.57346725463867, "logps/rejected": -47.27886962890625, "loss": 0.6945, "losses/dpo": 0.6906975507736206, "losses/sft": 1.2371470928192139, "losses/total": 0.6906975507736206, "ref_logps/chosen": -35.503482818603516, "ref_logps/rejected": -47.23637771606445, "rewards/accuracies": 0.625, "rewards/chosen": -0.006998538970947266, "rewards/margins": -0.002749508712440729, "rewards/rejected": -0.004249030724167824, "step": 75 }, { "epoch": 0.07, "grad_norm": 15.0462007522583, "learning_rate": 1.1949685534591195e-07, "logps/chosen": -44.30376434326172, "logps/rejected": -45.30217742919922, "loss": 0.6937, "losses/dpo": 0.7008363604545593, "losses/sft": 1.2435119152069092, "losses/total": 0.7008363604545593, "ref_logps/chosen": -44.33555603027344, "ref_logps/rejected": -45.34375, "rewards/accuracies": 0.5625, "rewards/chosen": 0.003179407212883234, "rewards/margins": -0.0009777904488146305, "rewards/rejected": 0.0041571976616978645, "step": 76 }, { "epoch": 0.07, "grad_norm": 16.324993133544922, "learning_rate": 1.210691823899371e-07, "logps/chosen": -40.5933723449707, "logps/rejected": -46.27098846435547, "loss": 0.6981, "losses/dpo": 0.694791316986084, "losses/sft": 1.267210841178894, "losses/total": 0.694791316986084, "ref_logps/chosen": -40.50887680053711, "ref_logps/rejected": -46.2847900390625, "rewards/accuracies": 0.3125, "rewards/chosen": -0.00844934955239296, "rewards/margins": -0.009829377755522728, "rewards/rejected": 0.0013800286687910557, "step": 77 }, { "epoch": 0.07, "grad_norm": 15.344736099243164, "learning_rate": 1.2264150943396226e-07, "logps/chosen": -35.888084411621094, "logps/rejected": -47.26556396484375, "loss": 0.6926, "losses/dpo": 0.6868501901626587, "losses/sft": 1.6895915269851685, "losses/total": 0.6868501901626587, "ref_logps/chosen": -35.859771728515625, "ref_logps/rejected": -47.22634506225586, "rewards/accuracies": 0.5, "rewards/chosen": -0.0028314590454101562, "rewards/margins": 0.001090294448658824, "rewards/rejected": -0.003921753726899624, "step": 78 }, { "epoch": 0.07, "grad_norm": 16.02104949951172, "learning_rate": 1.242138364779874e-07, "logps/chosen": -51.301265716552734, "logps/rejected": -40.96278381347656, "loss": 0.6922, "losses/dpo": 0.6897770166397095, "losses/sft": 1.4705162048339844, "losses/total": 0.6897770166397095, "ref_logps/chosen": -51.24794006347656, "ref_logps/rejected": -40.89030456542969, "rewards/accuracies": 0.5, "rewards/chosen": -0.00533277727663517, "rewards/margins": 0.0019149510189890862, "rewards/rejected": -0.007247728295624256, "step": 79 }, { "epoch": 0.08, "grad_norm": 14.240321159362793, "learning_rate": 1.2578616352201258e-07, "logps/chosen": -38.717018127441406, "logps/rejected": -43.53178405761719, "loss": 0.6922, "losses/dpo": 0.6946658492088318, "losses/sft": 1.1348018646240234, "losses/total": 0.6946658492088318, "ref_logps/chosen": -38.72174072265625, "ref_logps/rejected": -43.515045166015625, "rewards/accuracies": 0.375, "rewards/chosen": 0.00047174119390547276, "rewards/margins": 0.0021452102810144424, "rewards/rejected": -0.0016734690871089697, "step": 80 }, { "epoch": 0.08, "grad_norm": 15.212409973144531, "learning_rate": 1.2735849056603773e-07, "logps/chosen": -39.79193878173828, "logps/rejected": -44.109989166259766, "loss": 0.691, "losses/dpo": 0.6931477785110474, "losses/sft": 1.2996903657913208, "losses/total": 0.6931477785110474, "ref_logps/chosen": -39.76420593261719, "ref_logps/rejected": -44.03834533691406, "rewards/accuracies": 0.625, "rewards/chosen": -0.0027731030713766813, "rewards/margins": 0.0043914709240198135, "rewards/rejected": -0.007164573762565851, "step": 81 }, { "epoch": 0.08, "grad_norm": 14.307190895080566, "learning_rate": 1.2893081761006288e-07, "logps/chosen": -39.145469665527344, "logps/rejected": -42.11958312988281, "loss": 0.6941, "losses/dpo": 0.6958881616592407, "losses/sft": 1.587785243988037, "losses/total": 0.6958881616592407, "ref_logps/chosen": -39.09599304199219, "ref_logps/rejected": -42.08744812011719, "rewards/accuracies": 0.4375, "rewards/chosen": -0.004947358276695013, "rewards/margins": -0.0017338457982987165, "rewards/rejected": -0.00321351271122694, "step": 82 }, { "epoch": 0.08, "grad_norm": 13.738253593444824, "learning_rate": 1.3050314465408803e-07, "logps/chosen": -34.46840286254883, "logps/rejected": -32.53174591064453, "loss": 0.6924, "losses/dpo": 0.6979929804801941, "losses/sft": 1.4558571577072144, "losses/total": 0.6979929804801941, "ref_logps/chosen": -34.381011962890625, "ref_logps/rejected": -32.42715835571289, "rewards/accuracies": 0.4375, "rewards/chosen": -0.008738812990486622, "rewards/margins": 0.0017202908638864756, "rewards/rejected": -0.010459104552865028, "step": 83 }, { "epoch": 0.08, "grad_norm": 14.270866394042969, "learning_rate": 1.320754716981132e-07, "logps/chosen": -36.263126373291016, "logps/rejected": -32.43574523925781, "loss": 0.6934, "losses/dpo": 0.6902810335159302, "losses/sft": 1.0569183826446533, "losses/total": 0.6902810335159302, "ref_logps/chosen": -36.21672439575195, "ref_logps/rejected": -32.393104553222656, "rewards/accuracies": 0.4375, "rewards/chosen": -0.004640126600861549, "rewards/margins": -0.0003763199783861637, "rewards/rejected": -0.004263806156814098, "step": 84 }, { "epoch": 0.08, "grad_norm": 13.939360618591309, "learning_rate": 1.3364779874213836e-07, "logps/chosen": -27.650066375732422, "logps/rejected": -39.44580078125, "loss": 0.6917, "losses/dpo": 0.6909595727920532, "losses/sft": 0.8875692486763, "losses/total": 0.6909595727920532, "ref_logps/chosen": -27.60795783996582, "ref_logps/rejected": -39.37433624267578, "rewards/accuracies": 0.625, "rewards/chosen": -0.004210913088172674, "rewards/margins": 0.0029358803294599056, "rewards/rejected": -0.007146793883293867, "step": 85 }, { "epoch": 0.08, "grad_norm": 15.56495475769043, "learning_rate": 1.352201257861635e-07, "logps/chosen": -46.32075500488281, "logps/rejected": -45.338584899902344, "loss": 0.6895, "losses/dpo": 0.6981450319290161, "losses/sft": 1.4593093395233154, "losses/total": 0.6981450319290161, "ref_logps/chosen": -46.183692932128906, "ref_logps/rejected": -45.12657165527344, "rewards/accuracies": 0.625, "rewards/chosen": -0.01370624266564846, "rewards/margins": 0.007495338097214699, "rewards/rejected": -0.02120158076286316, "step": 86 }, { "epoch": 0.08, "grad_norm": 15.097458839416504, "learning_rate": 1.3679245283018866e-07, "logps/chosen": -39.066123962402344, "logps/rejected": -45.30772399902344, "loss": 0.6881, "losses/dpo": 0.682632565498352, "losses/sft": 1.6743701696395874, "losses/total": 0.682632565498352, "ref_logps/chosen": -38.98908233642578, "ref_logps/rejected": -45.1273078918457, "rewards/accuracies": 0.75, "rewards/chosen": -0.007704532239586115, "rewards/margins": 0.010336978361010551, "rewards/rejected": -0.01804151013493538, "step": 87 }, { "epoch": 0.08, "grad_norm": 16.17900276184082, "learning_rate": 1.3836477987421384e-07, "logps/chosen": -25.759960174560547, "logps/rejected": -41.42595291137695, "loss": 0.6879, "losses/dpo": 0.6917724609375, "losses/sft": 1.3302946090698242, "losses/total": 0.6917724609375, "ref_logps/chosen": -25.710800170898438, "ref_logps/rejected": -41.27067184448242, "rewards/accuracies": 0.625, "rewards/chosen": -0.004916036501526833, "rewards/margins": 0.010611901059746742, "rewards/rejected": -0.015527937561273575, "step": 88 }, { "epoch": 0.08, "grad_norm": 15.869852066040039, "learning_rate": 1.39937106918239e-07, "logps/chosen": -37.822227478027344, "logps/rejected": -47.868438720703125, "loss": 0.6939, "losses/dpo": 0.6867597699165344, "losses/sft": 1.1006416082382202, "losses/total": 0.6867597699165344, "ref_logps/chosen": -37.74897003173828, "ref_logps/rejected": -47.808197021484375, "rewards/accuracies": 0.4375, "rewards/chosen": -0.007325470447540283, "rewards/margins": -0.0013015333097428083, "rewards/rejected": -0.0060239373706281185, "step": 89 }, { "epoch": 0.08, "grad_norm": 13.955927848815918, "learning_rate": 1.4150943396226414e-07, "logps/chosen": -29.049095153808594, "logps/rejected": -53.24306869506836, "loss": 0.6938, "losses/dpo": 0.6909089088439941, "losses/sft": 1.1799514293670654, "losses/total": 0.6909089088439941, "ref_logps/chosen": -28.953819274902344, "ref_logps/rejected": -53.159454345703125, "rewards/accuracies": 0.5, "rewards/chosen": -0.009527618065476418, "rewards/margins": -0.001166248694062233, "rewards/rejected": -0.008361369371414185, "step": 90 }, { "epoch": 0.09, "grad_norm": 14.775657653808594, "learning_rate": 1.430817610062893e-07, "logps/chosen": -34.025177001953125, "logps/rejected": -42.15654373168945, "loss": 0.6929, "losses/dpo": 0.7020770311355591, "losses/sft": 1.0969116687774658, "losses/total": 0.7020770311355591, "ref_logps/chosen": -33.92861557006836, "ref_logps/rejected": -42.053688049316406, "rewards/accuracies": 0.5, "rewards/chosen": -0.009656224399805069, "rewards/margins": 0.0006290348246693611, "rewards/rejected": -0.01028525922447443, "step": 91 }, { "epoch": 0.09, "grad_norm": 14.214162826538086, "learning_rate": 1.4465408805031447e-07, "logps/chosen": -34.177757263183594, "logps/rejected": -43.930545806884766, "loss": 0.6881, "losses/dpo": 0.6853083372116089, "losses/sft": 0.9370521306991577, "losses/total": 0.6853083372116089, "ref_logps/chosen": -34.10899353027344, "ref_logps/rejected": -43.759490966796875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.006876480299979448, "rewards/margins": 0.010229195468127728, "rewards/rejected": -0.017105676233768463, "step": 92 }, { "epoch": 0.09, "grad_norm": 17.3109188079834, "learning_rate": 1.4622641509433962e-07, "logps/chosen": -40.37968444824219, "logps/rejected": -49.63119125366211, "loss": 0.6886, "losses/dpo": 0.6832884550094604, "losses/sft": 1.7671897411346436, "losses/total": 0.6832884550094604, "ref_logps/chosen": -40.22066116333008, "ref_logps/rejected": -49.377174377441406, "rewards/accuracies": 0.625, "rewards/chosen": -0.01590237021446228, "rewards/margins": 0.00949946604669094, "rewards/rejected": -0.02540183812379837, "step": 93 }, { "epoch": 0.09, "grad_norm": 13.591806411743164, "learning_rate": 1.4779874213836477e-07, "logps/chosen": -34.08275604248047, "logps/rejected": -32.793453216552734, "loss": 0.6869, "losses/dpo": 0.6806268095970154, "losses/sft": 1.2122366428375244, "losses/total": 0.6806268095970154, "ref_logps/chosen": -34.06282043457031, "ref_logps/rejected": -32.647281646728516, "rewards/accuracies": 0.625, "rewards/chosen": -0.0019936240278184414, "rewards/margins": 0.012623312883079052, "rewards/rejected": -0.014616936445236206, "step": 94 }, { "epoch": 0.09, "grad_norm": 14.241443634033203, "learning_rate": 1.4937106918238992e-07, "logps/chosen": -43.90531921386719, "logps/rejected": -38.043739318847656, "loss": 0.6917, "losses/dpo": 0.6911299824714661, "losses/sft": 1.7745474576950073, "losses/total": 0.6911299824714661, "ref_logps/chosen": -43.74835968017578, "ref_logps/rejected": -37.856956481933594, "rewards/accuracies": 0.5625, "rewards/chosen": -0.015696100890636444, "rewards/margins": 0.0029826308600604534, "rewards/rejected": -0.018678732216358185, "step": 95 }, { "epoch": 0.09, "grad_norm": 15.963578224182129, "learning_rate": 1.509433962264151e-07, "logps/chosen": -33.27313995361328, "logps/rejected": -43.820289611816406, "loss": 0.6911, "losses/dpo": 0.6779791712760925, "losses/sft": 1.113797903060913, "losses/total": 0.6779791712760925, "ref_logps/chosen": -33.136817932128906, "ref_logps/rejected": -43.64072036743164, "rewards/accuracies": 0.5625, "rewards/chosen": -0.01363192219287157, "rewards/margins": 0.004325224086642265, "rewards/rejected": -0.01795714534819126, "step": 96 }, { "epoch": 0.09, "grad_norm": 15.912907600402832, "learning_rate": 1.5251572327044024e-07, "logps/chosen": -41.62563705444336, "logps/rejected": -55.13963317871094, "loss": 0.6969, "losses/dpo": 0.6952822208404541, "losses/sft": 1.8095005750656128, "losses/total": 0.6952822208404541, "ref_logps/chosen": -41.45159149169922, "ref_logps/rejected": -55.037925720214844, "rewards/accuracies": 0.25, "rewards/chosen": -0.01740449108183384, "rewards/margins": -0.00723386462777853, "rewards/rejected": -0.010170625522732735, "step": 97 }, { "epoch": 0.09, "grad_norm": 15.8721342086792, "learning_rate": 1.540880503144654e-07, "logps/chosen": -48.8435173034668, "logps/rejected": -45.31690979003906, "loss": 0.6936, "losses/dpo": 0.6874356269836426, "losses/sft": 1.448114275932312, "losses/total": 0.6874356269836426, "ref_logps/chosen": -48.658470153808594, "ref_logps/rejected": -45.13918685913086, "rewards/accuracies": 0.5, "rewards/chosen": -0.018504882231354713, "rewards/margins": -0.0007324693724513054, "rewards/rejected": -0.017772411927580833, "step": 98 }, { "epoch": 0.09, "grad_norm": 12.944564819335938, "learning_rate": 1.5566037735849055e-07, "logps/chosen": -30.8857364654541, "logps/rejected": -35.680885314941406, "loss": 0.6971, "losses/dpo": 0.6944862008094788, "losses/sft": 0.959463894367218, "losses/total": 0.6944862008094788, "ref_logps/chosen": -30.673364639282227, "ref_logps/rejected": -35.54549789428711, "rewards/accuracies": 0.5, "rewards/chosen": -0.021237188950181007, "rewards/margins": -0.007698586210608482, "rewards/rejected": -0.013538602739572525, "step": 99 }, { "epoch": 0.09, "grad_norm": 14.438765525817871, "learning_rate": 1.5723270440251572e-07, "logps/chosen": -45.89412307739258, "logps/rejected": -47.31294250488281, "loss": 0.6932, "losses/dpo": 0.695717453956604, "losses/sft": 1.817944884300232, "losses/total": 0.695717453956604, "ref_logps/chosen": -45.63401794433594, "ref_logps/rejected": -47.05308532714844, "rewards/accuracies": 0.5625, "rewards/chosen": -0.02601074054837227, "rewards/margins": -2.5463057681918144e-05, "rewards/rejected": -0.02598527818918228, "step": 100 }, { "epoch": 0.1, "grad_norm": 16.887371063232422, "learning_rate": 1.5880503144654087e-07, "logps/chosen": -47.991641998291016, "logps/rejected": -57.681365966796875, "loss": 0.694, "losses/dpo": 0.7069874405860901, "losses/sft": 1.730445384979248, "losses/total": 0.7069874405860901, "ref_logps/chosen": -47.78053283691406, "ref_logps/rejected": -57.486690521240234, "rewards/accuracies": 0.5, "rewards/chosen": -0.02111116796731949, "rewards/margins": -0.0016439557075500488, "rewards/rejected": -0.01946721225976944, "step": 101 }, { "epoch": 0.1, "grad_norm": 15.702824592590332, "learning_rate": 1.6037735849056602e-07, "logps/chosen": -38.05979919433594, "logps/rejected": -51.93192672729492, "loss": 0.6956, "losses/dpo": 0.6968280076980591, "losses/sft": 1.1726560592651367, "losses/total": 0.6968280076980591, "ref_logps/chosen": -37.90008544921875, "ref_logps/rejected": -51.82126235961914, "rewards/accuracies": 0.3125, "rewards/chosen": -0.01597147062420845, "rewards/margins": -0.004904856905341148, "rewards/rejected": -0.011066612787544727, "step": 102 }, { "epoch": 0.1, "grad_norm": 14.684089660644531, "learning_rate": 1.6194968553459117e-07, "logps/chosen": -36.708221435546875, "logps/rejected": -37.9134521484375, "loss": 0.6868, "losses/dpo": 0.6792709827423096, "losses/sft": 1.3039885759353638, "losses/total": 0.6792709827423096, "ref_logps/chosen": -36.589111328125, "ref_logps/rejected": -37.66527557373047, "rewards/accuracies": 0.6875, "rewards/chosen": -0.011910857632756233, "rewards/margins": 0.01290722656995058, "rewards/rejected": -0.02481808327138424, "step": 103 }, { "epoch": 0.1, "grad_norm": 16.684906005859375, "learning_rate": 1.6352201257861632e-07, "logps/chosen": -43.59468460083008, "logps/rejected": -40.17591857910156, "loss": 0.6898, "losses/dpo": 0.6985071301460266, "losses/sft": 1.7022027969360352, "losses/total": 0.6985071301460266, "ref_logps/chosen": -43.42711639404297, "ref_logps/rejected": -39.94116973876953, "rewards/accuracies": 0.5625, "rewards/chosen": -0.016756635159254074, "rewards/margins": 0.006718737538903952, "rewards/rejected": -0.023475375026464462, "step": 104 }, { "epoch": 0.1, "grad_norm": 14.9424409866333, "learning_rate": 1.650943396226415e-07, "logps/chosen": -39.1132926940918, "logps/rejected": -45.54408264160156, "loss": 0.6981, "losses/dpo": 0.6998525857925415, "losses/sft": 1.4071406126022339, "losses/total": 0.6998525857925415, "ref_logps/chosen": -38.84712600708008, "ref_logps/rejected": -45.37468719482422, "rewards/accuracies": 0.5, "rewards/chosen": -0.026616401970386505, "rewards/margins": -0.009676598012447357, "rewards/rejected": -0.016939803957939148, "step": 105 }, { "epoch": 0.1, "grad_norm": 14.410820960998535, "learning_rate": 1.6666666666666665e-07, "logps/chosen": -34.343482971191406, "logps/rejected": -38.83158874511719, "loss": 0.6916, "losses/dpo": 0.6919582486152649, "losses/sft": 1.2401819229125977, "losses/total": 0.6919582486152649, "ref_logps/chosen": -34.14746856689453, "ref_logps/rejected": -38.6029167175293, "rewards/accuracies": 0.375, "rewards/chosen": -0.019601475447416306, "rewards/margins": 0.00326578039675951, "rewards/rejected": -0.02286725677549839, "step": 106 }, { "epoch": 0.1, "grad_norm": 14.090592384338379, "learning_rate": 1.682389937106918e-07, "logps/chosen": -29.85173988342285, "logps/rejected": -42.94633483886719, "loss": 0.693, "losses/dpo": 0.6914762258529663, "losses/sft": 1.1770298480987549, "losses/total": 0.6914762258529663, "ref_logps/chosen": -29.699792861938477, "ref_logps/rejected": -42.7900276184082, "rewards/accuracies": 0.4375, "rewards/chosen": -0.015194655396044254, "rewards/margins": 0.00043619866482913494, "rewards/rejected": -0.015630854293704033, "step": 107 }, { "epoch": 0.1, "grad_norm": 15.35986328125, "learning_rate": 1.6981132075471695e-07, "logps/chosen": -42.16456985473633, "logps/rejected": -45.15089416503906, "loss": 0.6885, "losses/dpo": 0.6948836445808411, "losses/sft": 1.6860700845718384, "losses/total": 0.6948836445808411, "ref_logps/chosen": -41.986881256103516, "ref_logps/rejected": -44.87779235839844, "rewards/accuracies": 0.5625, "rewards/chosen": -0.01776862144470215, "rewards/margins": 0.009541459381580353, "rewards/rejected": -0.0273100808262825, "step": 108 }, { "epoch": 0.1, "grad_norm": 13.9297513961792, "learning_rate": 1.7138364779874213e-07, "logps/chosen": -26.50198745727539, "logps/rejected": -36.50336456298828, "loss": 0.6885, "losses/dpo": 0.6882284879684448, "losses/sft": 1.036218285560608, "losses/total": 0.6882284879684448, "ref_logps/chosen": -26.43675422668457, "ref_logps/rejected": -36.34293746948242, "rewards/accuracies": 0.75, "rewards/chosen": -0.0065233525820076466, "rewards/margins": 0.009519243612885475, "rewards/rejected": -0.016042595729231834, "step": 109 }, { "epoch": 0.1, "grad_norm": 14.998244285583496, "learning_rate": 1.7295597484276728e-07, "logps/chosen": -36.74858093261719, "logps/rejected": -43.12199401855469, "loss": 0.6895, "losses/dpo": 0.6972934603691101, "losses/sft": 1.3377923965454102, "losses/total": 0.6972934603691101, "ref_logps/chosen": -36.58707809448242, "ref_logps/rejected": -42.88616943359375, "rewards/accuracies": 0.5, "rewards/chosen": -0.016150321811437607, "rewards/margins": 0.007431963458657265, "rewards/rejected": -0.02358228713274002, "step": 110 }, { "epoch": 0.1, "grad_norm": 13.215498924255371, "learning_rate": 1.7452830188679243e-07, "logps/chosen": -32.85944747924805, "logps/rejected": -34.17639923095703, "loss": 0.6879, "losses/dpo": 0.6819064021110535, "losses/sft": 1.4440749883651733, "losses/total": 0.6819064021110535, "ref_logps/chosen": -32.7509765625, "ref_logps/rejected": -33.96112823486328, "rewards/accuracies": 0.625, "rewards/chosen": -0.010846990160644054, "rewards/margins": 0.010679739527404308, "rewards/rejected": -0.021526731550693512, "step": 111 }, { "epoch": 0.11, "grad_norm": 14.666735649108887, "learning_rate": 1.7610062893081758e-07, "logps/chosen": -39.56743240356445, "logps/rejected": -45.411155700683594, "loss": 0.6848, "losses/dpo": 0.6862828731536865, "losses/sft": 1.5352342128753662, "losses/total": 0.6862828731536865, "ref_logps/chosen": -39.37195587158203, "ref_logps/rejected": -45.04680252075195, "rewards/accuracies": 0.875, "rewards/chosen": -0.019547976553440094, "rewards/margins": 0.016887344419956207, "rewards/rejected": -0.036435317248106, "step": 112 }, { "epoch": 0.11, "grad_norm": 14.673539161682129, "learning_rate": 1.7767295597484276e-07, "logps/chosen": -34.470298767089844, "logps/rejected": -39.43095397949219, "loss": 0.6918, "losses/dpo": 0.6949492692947388, "losses/sft": 1.1301006078720093, "losses/total": 0.6949492692947388, "ref_logps/chosen": -34.285438537597656, "ref_logps/rejected": -39.21759033203125, "rewards/accuracies": 0.5, "rewards/chosen": -0.018486076965928078, "rewards/margins": 0.002850135788321495, "rewards/rejected": -0.021336212754249573, "step": 113 }, { "epoch": 0.11, "grad_norm": 14.95534610748291, "learning_rate": 1.792452830188679e-07, "logps/chosen": -39.47813415527344, "logps/rejected": -32.72713088989258, "loss": 0.6953, "losses/dpo": 0.6829326152801514, "losses/sft": 1.2921595573425293, "losses/total": 0.6829326152801514, "ref_logps/chosen": -39.1935920715332, "ref_logps/rejected": -32.48295211791992, "rewards/accuracies": 0.375, "rewards/chosen": -0.028454016894102097, "rewards/margins": -0.004036164842545986, "rewards/rejected": -0.024417854845523834, "step": 114 }, { "epoch": 0.11, "grad_norm": 13.93600082397461, "learning_rate": 1.8081761006289306e-07, "logps/chosen": -34.593177795410156, "logps/rejected": -35.36073303222656, "loss": 0.697, "losses/dpo": 0.7119002938270569, "losses/sft": 1.3550841808319092, "losses/total": 0.7119002938270569, "ref_logps/chosen": -34.31936264038086, "ref_logps/rejected": -35.16082000732422, "rewards/accuracies": 0.4375, "rewards/chosen": -0.027381660416722298, "rewards/margins": -0.007390107028186321, "rewards/rejected": -0.01999155431985855, "step": 115 }, { "epoch": 0.11, "grad_norm": 16.06708526611328, "learning_rate": 1.823899371069182e-07, "logps/chosen": -45.849769592285156, "logps/rejected": -50.03912353515625, "loss": 0.6977, "losses/dpo": 0.6969462633132935, "losses/sft": 1.747936487197876, "losses/total": 0.6969462633132935, "ref_logps/chosen": -45.534278869628906, "ref_logps/rejected": -49.81377029418945, "rewards/accuracies": 0.3125, "rewards/chosen": -0.03154875710606575, "rewards/margins": -0.009013607166707516, "rewards/rejected": -0.02253514900803566, "step": 116 }, { "epoch": 0.11, "grad_norm": 14.117581367492676, "learning_rate": 1.8396226415094338e-07, "logps/chosen": -34.1506233215332, "logps/rejected": -47.759986877441406, "loss": 0.6952, "losses/dpo": 0.6971405744552612, "losses/sft": 1.3164403438568115, "losses/total": 0.6971405744552612, "ref_logps/chosen": -33.87939453125, "ref_logps/rejected": -47.52716827392578, "rewards/accuracies": 0.4375, "rewards/chosen": -0.027122847735881805, "rewards/margins": -0.003840741002932191, "rewards/rejected": -0.023282108828425407, "step": 117 }, { "epoch": 0.11, "grad_norm": 13.412751197814941, "learning_rate": 1.8553459119496853e-07, "logps/chosen": -34.454444885253906, "logps/rejected": -44.52323532104492, "loss": 0.6962, "losses/dpo": 0.6933097839355469, "losses/sft": 1.301590919494629, "losses/total": 0.6933097839355469, "ref_logps/chosen": -34.16710662841797, "ref_logps/rejected": -44.295814514160156, "rewards/accuracies": 0.4375, "rewards/chosen": -0.028733599931001663, "rewards/margins": -0.00599172106012702, "rewards/rejected": -0.022741876542568207, "step": 118 }, { "epoch": 0.11, "grad_norm": 15.68087387084961, "learning_rate": 1.8710691823899368e-07, "logps/chosen": -47.611175537109375, "logps/rejected": -46.28931427001953, "loss": 0.6876, "losses/dpo": 0.6986205577850342, "losses/sft": 1.2177549600601196, "losses/total": 0.6986205577850342, "ref_logps/chosen": -47.382904052734375, "ref_logps/rejected": -45.94834518432617, "rewards/accuracies": 0.75, "rewards/chosen": -0.022827334702014923, "rewards/margins": 0.011269832029938698, "rewards/rejected": -0.03409716486930847, "step": 119 }, { "epoch": 0.11, "grad_norm": 12.786795616149902, "learning_rate": 1.8867924528301886e-07, "logps/chosen": -30.233591079711914, "logps/rejected": -37.38566970825195, "loss": 0.6963, "losses/dpo": 0.7035396099090576, "losses/sft": 0.7588335871696472, "losses/total": 0.7035396099090576, "ref_logps/chosen": -29.987346649169922, "ref_logps/rejected": -37.2010498046875, "rewards/accuracies": 0.3125, "rewards/chosen": -0.024624522775411606, "rewards/margins": -0.006162205711007118, "rewards/rejected": -0.018462317064404488, "step": 120 }, { "epoch": 0.11, "grad_norm": 14.822002410888672, "learning_rate": 1.9025157232704404e-07, "logps/chosen": -42.0538330078125, "logps/rejected": -43.76673889160156, "loss": 0.6878, "losses/dpo": 0.690367579460144, "losses/sft": 1.4329962730407715, "losses/total": 0.690367579460144, "ref_logps/chosen": -41.80472183227539, "ref_logps/rejected": -43.407867431640625, "rewards/accuracies": 0.75, "rewards/chosen": -0.024911170825362206, "rewards/margins": 0.010975952260196209, "rewards/rejected": -0.03588712215423584, "step": 121 }, { "epoch": 0.12, "grad_norm": 13.923846244812012, "learning_rate": 1.918238993710692e-07, "logps/chosen": -33.28826904296875, "logps/rejected": -38.56500244140625, "loss": 0.692, "losses/dpo": 0.6966415643692017, "losses/sft": 1.658522129058838, "losses/total": 0.6966415643692017, "ref_logps/chosen": -32.97311019897461, "ref_logps/rejected": -38.22522735595703, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0315156951546669, "rewards/margins": 0.0024614857975393534, "rewards/rejected": -0.03397718071937561, "step": 122 }, { "epoch": 0.12, "grad_norm": 15.002264976501465, "learning_rate": 1.9339622641509434e-07, "logps/chosen": -34.423828125, "logps/rejected": -52.01573944091797, "loss": 0.6885, "losses/dpo": 0.6879132986068726, "losses/sft": 1.094735860824585, "losses/total": 0.6879132986068726, "ref_logps/chosen": -34.19406509399414, "ref_logps/rejected": -51.68952941894531, "rewards/accuracies": 0.5625, "rewards/chosen": -0.022976290434598923, "rewards/margins": 0.00964451115578413, "rewards/rejected": -0.03262079879641533, "step": 123 }, { "epoch": 0.12, "grad_norm": 19.52489471435547, "learning_rate": 1.949685534591195e-07, "logps/chosen": -32.851112365722656, "logps/rejected": -39.58111572265625, "loss": 0.6905, "losses/dpo": 0.685728907585144, "losses/sft": 1.1572200059890747, "losses/total": 0.685728907585144, "ref_logps/chosen": -32.63593292236328, "ref_logps/rejected": -39.31063461303711, "rewards/accuracies": 0.625, "rewards/chosen": -0.021518178284168243, "rewards/margins": 0.005529931280761957, "rewards/rejected": -0.027048110961914062, "step": 124 }, { "epoch": 0.12, "grad_norm": 15.122300148010254, "learning_rate": 1.9654088050314467e-07, "logps/chosen": -36.94127655029297, "logps/rejected": -39.3349609375, "loss": 0.6945, "losses/dpo": 0.6853357553482056, "losses/sft": 1.2431758642196655, "losses/total": 0.6853357553482056, "ref_logps/chosen": -36.56884002685547, "ref_logps/rejected": -38.9855842590332, "rewards/accuracies": 0.5, "rewards/chosen": -0.03724343329668045, "rewards/margins": -0.0023057940416038036, "rewards/rejected": -0.03493763878941536, "step": 125 }, { "epoch": 0.12, "grad_norm": 13.548483848571777, "learning_rate": 1.9811320754716982e-07, "logps/chosen": -31.962743759155273, "logps/rejected": -35.55200958251953, "loss": 0.6924, "losses/dpo": 0.7094372510910034, "losses/sft": 1.4011971950531006, "losses/total": 0.7094372510910034, "ref_logps/chosen": -31.544071197509766, "ref_logps/rejected": -35.11526107788086, "rewards/accuracies": 0.5, "rewards/chosen": -0.041867226362228394, "rewards/margins": 0.0018077259883284569, "rewards/rejected": -0.043674953281879425, "step": 126 }, { "epoch": 0.12, "grad_norm": 15.32440185546875, "learning_rate": 1.9968553459119497e-07, "logps/chosen": -47.51824951171875, "logps/rejected": -49.325355529785156, "loss": 0.6896, "losses/dpo": 0.6987741589546204, "losses/sft": 1.8628894090652466, "losses/total": 0.6987741589546204, "ref_logps/chosen": -47.120445251464844, "ref_logps/rejected": -48.85082244873047, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03978060558438301, "rewards/margins": 0.007672748528420925, "rewards/rejected": -0.04745335131883621, "step": 127 }, { "epoch": 0.12, "grad_norm": 14.225200653076172, "learning_rate": 2.0125786163522012e-07, "logps/chosen": -36.54164505004883, "logps/rejected": -35.82334899902344, "loss": 0.6879, "losses/dpo": 0.6846950650215149, "losses/sft": 1.3227465152740479, "losses/total": 0.6846950650215149, "ref_logps/chosen": -36.33439636230469, "ref_logps/rejected": -35.509422302246094, "rewards/accuracies": 0.75, "rewards/chosen": -0.020724685862660408, "rewards/margins": 0.010668057017028332, "rewards/rejected": -0.031392741948366165, "step": 128 }, { "epoch": 0.12, "grad_norm": 17.420419692993164, "learning_rate": 2.028301886792453e-07, "logps/chosen": -39.37260055541992, "logps/rejected": -57.38317108154297, "loss": 0.6847, "losses/dpo": 0.692078173160553, "losses/sft": 1.5656405687332153, "losses/total": 0.692078173160553, "ref_logps/chosen": -39.03068161010742, "ref_logps/rejected": -56.86867141723633, "rewards/accuracies": 0.75, "rewards/chosen": -0.03419186919927597, "rewards/margins": 0.01725781336426735, "rewards/rejected": -0.05144967883825302, "step": 129 }, { "epoch": 0.12, "grad_norm": 14.964306831359863, "learning_rate": 2.0440251572327044e-07, "logps/chosen": -43.3221321105957, "logps/rejected": -41.251625061035156, "loss": 0.6868, "losses/dpo": 0.6975349187850952, "losses/sft": 1.6676081418991089, "losses/total": 0.6975349187850952, "ref_logps/chosen": -43.071380615234375, "ref_logps/rejected": -40.869747161865234, "rewards/accuracies": 0.5625, "rewards/chosen": -0.025075269863009453, "rewards/margins": 0.01311260461807251, "rewards/rejected": -0.03818787261843681, "step": 130 }, { "epoch": 0.12, "grad_norm": 14.855781555175781, "learning_rate": 2.059748427672956e-07, "logps/chosen": -32.41206359863281, "logps/rejected": -54.820892333984375, "loss": 0.6897, "losses/dpo": 0.694727897644043, "losses/sft": 1.23732328414917, "losses/total": 0.694727897644043, "ref_logps/chosen": -32.06626892089844, "ref_logps/rejected": -54.403839111328125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03457948565483093, "rewards/margins": 0.007125252857804298, "rewards/rejected": -0.04170474037528038, "step": 131 }, { "epoch": 0.12, "grad_norm": 14.342564582824707, "learning_rate": 2.0754716981132074e-07, "logps/chosen": -37.349639892578125, "logps/rejected": -44.60934829711914, "loss": 0.6907, "losses/dpo": 0.6890225410461426, "losses/sft": 1.7710987329483032, "losses/total": 0.6890225410461426, "ref_logps/chosen": -37.06349563598633, "ref_logps/rejected": -44.271053314208984, "rewards/accuracies": 0.5, "rewards/chosen": -0.028614195063710213, "rewards/margins": 0.005215257406234741, "rewards/rejected": -0.033829450607299805, "step": 132 }, { "epoch": 0.13, "grad_norm": 14.29720401763916, "learning_rate": 2.0911949685534592e-07, "logps/chosen": -36.476844787597656, "logps/rejected": -41.18645477294922, "loss": 0.687, "losses/dpo": 0.672049880027771, "losses/sft": 1.412061095237732, "losses/total": 0.672049880027771, "ref_logps/chosen": -36.14765930175781, "ref_logps/rejected": -40.729766845703125, "rewards/accuracies": 0.75, "rewards/chosen": -0.03291814774274826, "rewards/margins": 0.01275054644793272, "rewards/rejected": -0.04566869139671326, "step": 133 }, { "epoch": 0.13, "grad_norm": 14.889425277709961, "learning_rate": 2.1069182389937107e-07, "logps/chosen": -32.16847610473633, "logps/rejected": -41.37443923950195, "loss": 0.6867, "losses/dpo": 0.6890193819999695, "losses/sft": 1.1969457864761353, "losses/total": 0.6890193819999695, "ref_logps/chosen": -31.79351043701172, "ref_logps/rejected": -40.86709213256836, "rewards/accuracies": 0.625, "rewards/chosen": -0.037496283650398254, "rewards/margins": 0.01323840394616127, "rewards/rejected": -0.050734687596559525, "step": 134 }, { "epoch": 0.13, "grad_norm": 14.49722957611084, "learning_rate": 2.1226415094339622e-07, "logps/chosen": -28.457765579223633, "logps/rejected": -45.70714569091797, "loss": 0.6841, "losses/dpo": 0.6867278814315796, "losses/sft": 1.2185068130493164, "losses/total": 0.6867278814315796, "ref_logps/chosen": -28.06502914428711, "ref_logps/rejected": -45.12978744506836, "rewards/accuracies": 0.8125, "rewards/chosen": -0.039273701608181, "rewards/margins": 0.01846211403608322, "rewards/rejected": -0.05773581564426422, "step": 135 }, { "epoch": 0.13, "grad_norm": 14.271917343139648, "learning_rate": 2.1383647798742137e-07, "logps/chosen": -38.469852447509766, "logps/rejected": -39.8077392578125, "loss": 0.6937, "losses/dpo": 0.687812328338623, "losses/sft": 1.3850359916687012, "losses/total": 0.687812328338623, "ref_logps/chosen": -38.16452407836914, "ref_logps/rejected": -39.51031494140625, "rewards/accuracies": 0.4375, "rewards/chosen": -0.030532920733094215, "rewards/margins": -0.0007908491534180939, "rewards/rejected": -0.02974206954240799, "step": 136 }, { "epoch": 0.13, "grad_norm": 15.589990615844727, "learning_rate": 2.1540880503144655e-07, "logps/chosen": -39.6287956237793, "logps/rejected": -44.67558288574219, "loss": 0.6898, "losses/dpo": 0.6974602341651917, "losses/sft": 1.5750739574432373, "losses/total": 0.6974602341651917, "ref_logps/chosen": -39.27399826049805, "ref_logps/rejected": -44.25121307373047, "rewards/accuracies": 0.5, "rewards/chosen": -0.03547965735197067, "rewards/margins": 0.00695746298879385, "rewards/rejected": -0.0424371175467968, "step": 137 }, { "epoch": 0.13, "grad_norm": 15.114520072937012, "learning_rate": 2.169811320754717e-07, "logps/chosen": -44.047691345214844, "logps/rejected": -40.24201202392578, "loss": 0.6913, "losses/dpo": 0.6949135065078735, "losses/sft": 2.0087459087371826, "losses/total": 0.6949135065078735, "ref_logps/chosen": -43.595733642578125, "ref_logps/rejected": -39.75062561035156, "rewards/accuracies": 0.625, "rewards/chosen": -0.04519592598080635, "rewards/margins": 0.0039425380527973175, "rewards/rejected": -0.04913846030831337, "step": 138 }, { "epoch": 0.13, "grad_norm": 12.832149505615234, "learning_rate": 2.1855345911949685e-07, "logps/chosen": -32.86917495727539, "logps/rejected": -33.458431243896484, "loss": 0.6965, "losses/dpo": 0.699682354927063, "losses/sft": 0.9208551645278931, "losses/total": 0.699682354927063, "ref_logps/chosen": -32.45710754394531, "ref_logps/rejected": -33.11075210571289, "rewards/accuracies": 0.4375, "rewards/chosen": -0.041206683963537216, "rewards/margins": -0.006438740529119968, "rewards/rejected": -0.03476794809103012, "step": 139 }, { "epoch": 0.13, "grad_norm": 16.490097045898438, "learning_rate": 2.20125786163522e-07, "logps/chosen": -41.00999450683594, "logps/rejected": -51.05350875854492, "loss": 0.6794, "losses/dpo": 0.6706057786941528, "losses/sft": 1.1546810865402222, "losses/total": 0.6706057786941528, "ref_logps/chosen": -40.791709899902344, "ref_logps/rejected": -50.55547332763672, "rewards/accuracies": 0.8125, "rewards/chosen": -0.02182871848344803, "rewards/margins": 0.02797437272965908, "rewards/rejected": -0.04980308935046196, "step": 140 }, { "epoch": 0.13, "grad_norm": 15.765511512756348, "learning_rate": 2.2169811320754718e-07, "logps/chosen": -45.249046325683594, "logps/rejected": -52.27911376953125, "loss": 0.6951, "losses/dpo": 0.6822272539138794, "losses/sft": 1.4385101795196533, "losses/total": 0.6822272539138794, "ref_logps/chosen": -44.595176696777344, "ref_logps/rejected": -51.66139221191406, "rewards/accuracies": 0.375, "rewards/chosen": -0.06538671255111694, "rewards/margins": -0.0036148373037576675, "rewards/rejected": -0.061771877110004425, "step": 141 }, { "epoch": 0.13, "grad_norm": 16.23243522644043, "learning_rate": 2.2327044025157233e-07, "logps/chosen": -38.73735046386719, "logps/rejected": -48.82923126220703, "loss": 0.6944, "losses/dpo": 0.7174341678619385, "losses/sft": 1.6384347677230835, "losses/total": 0.7174341678619385, "ref_logps/chosen": -37.99863815307617, "ref_logps/rejected": -48.11299133300781, "rewards/accuracies": 0.4375, "rewards/chosen": -0.07387126982212067, "rewards/margins": -0.0022472080308943987, "rewards/rejected": -0.07162405550479889, "step": 142 }, { "epoch": 0.14, "grad_norm": 17.4960994720459, "learning_rate": 2.2484276729559748e-07, "logps/chosen": -48.301883697509766, "logps/rejected": -50.832313537597656, "loss": 0.6949, "losses/dpo": 0.6918096542358398, "losses/sft": 1.2325555086135864, "losses/total": 0.6918096542358398, "ref_logps/chosen": -47.70981979370117, "ref_logps/rejected": -50.27460479736328, "rewards/accuracies": 0.375, "rewards/chosen": -0.05920600891113281, "rewards/margins": -0.003434740472584963, "rewards/rejected": -0.05577126890420914, "step": 143 }, { "epoch": 0.14, "grad_norm": 15.367589950561523, "learning_rate": 2.2641509433962263e-07, "logps/chosen": -33.164772033691406, "logps/rejected": -40.99664306640625, "loss": 0.6876, "losses/dpo": 0.6855679154396057, "losses/sft": 1.2458962202072144, "losses/total": 0.6855679154396057, "ref_logps/chosen": -32.71955108642578, "ref_logps/rejected": -40.43425750732422, "rewards/accuracies": 0.5625, "rewards/chosen": -0.044522032141685486, "rewards/margins": 0.011716343462467194, "rewards/rejected": -0.05623837187886238, "step": 144 }, { "epoch": 0.14, "grad_norm": 14.423078536987305, "learning_rate": 2.279874213836478e-07, "logps/chosen": -38.37190628051758, "logps/rejected": -43.500038146972656, "loss": 0.6819, "losses/dpo": 0.6875548362731934, "losses/sft": 1.3889192342758179, "losses/total": 0.6875548362731934, "ref_logps/chosen": -37.958709716796875, "ref_logps/rejected": -42.858619689941406, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0413198322057724, "rewards/margins": 0.022821854799985886, "rewards/rejected": -0.06414168328046799, "step": 145 }, { "epoch": 0.14, "grad_norm": 15.48110294342041, "learning_rate": 2.2955974842767295e-07, "logps/chosen": -34.765846252441406, "logps/rejected": -41.104759216308594, "loss": 0.6814, "losses/dpo": 0.669724702835083, "losses/sft": 1.7875714302062988, "losses/total": 0.669724702835083, "ref_logps/chosen": -34.194915771484375, "ref_logps/rejected": -40.29185485839844, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05709293484687805, "rewards/margins": 0.024197574704885483, "rewards/rejected": -0.08129051327705383, "step": 146 }, { "epoch": 0.14, "grad_norm": 16.652830123901367, "learning_rate": 2.311320754716981e-07, "logps/chosen": -36.488487243652344, "logps/rejected": -48.65363311767578, "loss": 0.6813, "losses/dpo": 0.6807320713996887, "losses/sft": 1.1820979118347168, "losses/total": 0.6807320713996887, "ref_logps/chosen": -36.11376953125, "ref_logps/rejected": -48.032691955566406, "rewards/accuracies": 0.75, "rewards/chosen": -0.0374714694917202, "rewards/margins": 0.02462271973490715, "rewards/rejected": -0.06209418922662735, "step": 147 }, { "epoch": 0.14, "grad_norm": 15.308725357055664, "learning_rate": 2.3270440251572326e-07, "logps/chosen": -40.74299240112305, "logps/rejected": -44.69352722167969, "loss": 0.6965, "losses/dpo": 0.709496021270752, "losses/sft": 1.0641168355941772, "losses/total": 0.709496021270752, "ref_logps/chosen": -40.02014923095703, "ref_logps/rejected": -44.02622604370117, "rewards/accuracies": 0.375, "rewards/chosen": -0.07228401303291321, "rewards/margins": -0.005553926341235638, "rewards/rejected": -0.0667300820350647, "step": 148 }, { "epoch": 0.14, "grad_norm": 13.749775886535645, "learning_rate": 2.3427672955974843e-07, "logps/chosen": -31.36304473876953, "logps/rejected": -37.66726303100586, "loss": 0.6768, "losses/dpo": 0.6756744384765625, "losses/sft": 1.764005422592163, "losses/total": 0.6756744384765625, "ref_logps/chosen": -30.908706665039062, "ref_logps/rejected": -36.87897491455078, "rewards/accuracies": 0.75, "rewards/chosen": -0.045433878898620605, "rewards/margins": 0.033394597470760345, "rewards/rejected": -0.07882846891880035, "step": 149 }, { "epoch": 0.14, "grad_norm": 13.469657897949219, "learning_rate": 2.3584905660377358e-07, "logps/chosen": -35.03850555419922, "logps/rejected": -41.29573059082031, "loss": 0.6887, "losses/dpo": 0.6853890419006348, "losses/sft": 1.3369121551513672, "losses/total": 0.6853890419006348, "ref_logps/chosen": -34.43486022949219, "ref_logps/rejected": -40.60087966918945, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06036418676376343, "rewards/margins": 0.009120948612689972, "rewards/rejected": -0.0694851353764534, "step": 150 }, { "epoch": 0.14, "grad_norm": 14.294488906860352, "learning_rate": 2.3742138364779873e-07, "logps/chosen": -37.178802490234375, "logps/rejected": -44.86241149902344, "loss": 0.6993, "losses/dpo": 0.7003527879714966, "losses/sft": 1.4542131423950195, "losses/total": 0.7003527879714966, "ref_logps/chosen": -36.377647399902344, "ref_logps/rejected": -44.17764663696289, "rewards/accuracies": 0.5, "rewards/chosen": -0.08011550456285477, "rewards/margins": -0.011639060452580452, "rewards/rejected": -0.06847643852233887, "step": 151 }, { "epoch": 0.14, "grad_norm": 16.434444427490234, "learning_rate": 2.389937106918239e-07, "logps/chosen": -43.262386322021484, "logps/rejected": -50.735694885253906, "loss": 0.6843, "losses/dpo": 0.6821097135543823, "losses/sft": 1.3412997722625732, "losses/total": 0.6821097135543823, "ref_logps/chosen": -42.7374153137207, "ref_logps/rejected": -50.02814483642578, "rewards/accuracies": 0.625, "rewards/chosen": -0.05249696224927902, "rewards/margins": 0.018258104100823402, "rewards/rejected": -0.07075506448745728, "step": 152 }, { "epoch": 0.14, "grad_norm": 17.17867660522461, "learning_rate": 2.4056603773584903e-07, "logps/chosen": -45.7003173828125, "logps/rejected": -48.128639221191406, "loss": 0.6866, "losses/dpo": 0.6841785311698914, "losses/sft": 1.4632898569107056, "losses/total": 0.6841785311698914, "ref_logps/chosen": -44.91077423095703, "ref_logps/rejected": -47.199195861816406, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07895435392856598, "rewards/margins": 0.013990305364131927, "rewards/rejected": -0.09294465184211731, "step": 153 }, { "epoch": 0.15, "grad_norm": 15.123295783996582, "learning_rate": 2.421383647798742e-07, "logps/chosen": -41.161293029785156, "logps/rejected": -43.288719177246094, "loss": 0.6898, "losses/dpo": 0.6806986331939697, "losses/sft": 1.2774852514266968, "losses/total": 0.6806986331939697, "ref_logps/chosen": -40.295387268066406, "ref_logps/rejected": -42.34626007080078, "rewards/accuracies": 0.5, "rewards/chosen": -0.08659013360738754, "rewards/margins": 0.007655314169824123, "rewards/rejected": -0.09424544870853424, "step": 154 }, { "epoch": 0.15, "grad_norm": 15.345965385437012, "learning_rate": 2.437106918238994e-07, "logps/chosen": -36.4453010559082, "logps/rejected": -43.52362060546875, "loss": 0.6933, "losses/dpo": 0.69317626953125, "losses/sft": 1.4723988771438599, "losses/total": 0.69317626953125, "ref_logps/chosen": -35.49375915527344, "ref_logps/rejected": -42.57204055786133, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09515412151813507, "rewards/margins": 3.3443793654441833e-06, "rewards/rejected": -0.09515747427940369, "step": 155 }, { "epoch": 0.15, "grad_norm": 15.515606880187988, "learning_rate": 2.452830188679245e-07, "logps/chosen": -41.18972396850586, "logps/rejected": -46.37871170043945, "loss": 0.6899, "losses/dpo": 0.6924268007278442, "losses/sft": 1.5120503902435303, "losses/total": 0.6924268007278442, "ref_logps/chosen": -40.387245178222656, "ref_logps/rejected": -45.50495529174805, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08024749904870987, "rewards/margins": 0.007128346711397171, "rewards/rejected": -0.08737584948539734, "step": 156 }, { "epoch": 0.15, "grad_norm": 15.6599702835083, "learning_rate": 2.468553459119497e-07, "logps/chosen": -37.020111083984375, "logps/rejected": -43.15111541748047, "loss": 0.6947, "losses/dpo": 0.7032310962677002, "losses/sft": 1.466179609298706, "losses/total": 0.7032310962677002, "ref_logps/chosen": -36.081417083740234, "ref_logps/rejected": -42.236480712890625, "rewards/accuracies": 0.5, "rewards/chosen": -0.09386956691741943, "rewards/margins": -0.0024061878211796284, "rewards/rejected": -0.0914633721113205, "step": 157 }, { "epoch": 0.15, "grad_norm": 15.054862022399902, "learning_rate": 2.484276729559748e-07, "logps/chosen": -43.759910583496094, "logps/rejected": -43.61105728149414, "loss": 0.6849, "losses/dpo": 0.6913665533065796, "losses/sft": 1.5183358192443848, "losses/total": 0.6913665533065796, "ref_logps/chosen": -42.80980682373047, "ref_logps/rejected": -42.48993682861328, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09501078724861145, "rewards/margins": 0.017101164907217026, "rewards/rejected": -0.11211195588111877, "step": 158 }, { "epoch": 0.15, "grad_norm": 17.373764038085938, "learning_rate": 2.5e-07, "logps/chosen": -48.36299133300781, "logps/rejected": -61.741703033447266, "loss": 0.691, "losses/dpo": 0.6970544457435608, "losses/sft": 1.3365906476974487, "losses/total": 0.6970544457435608, "ref_logps/chosen": -47.605804443359375, "ref_logps/rejected": -60.936859130859375, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0757184773683548, "rewards/margins": 0.004765537567436695, "rewards/rejected": -0.08048401772975922, "step": 159 }, { "epoch": 0.15, "grad_norm": 16.456748962402344, "learning_rate": 2.5157232704402517e-07, "logps/chosen": -42.988807678222656, "logps/rejected": -48.046600341796875, "loss": 0.6972, "losses/dpo": 0.6922672986984253, "losses/sft": 1.4732236862182617, "losses/total": 0.6922672986984253, "ref_logps/chosen": -41.93547058105469, "ref_logps/rejected": -47.06855010986328, "rewards/accuracies": 0.5, "rewards/chosen": -0.1053338348865509, "rewards/margins": -0.0075288740918040276, "rewards/rejected": -0.097804956138134, "step": 160 }, { "epoch": 0.15, "grad_norm": 13.012798309326172, "learning_rate": 2.531446540880503e-07, "logps/chosen": -25.559913635253906, "logps/rejected": -38.070777893066406, "loss": 0.6787, "losses/dpo": 0.6694801449775696, "losses/sft": 1.0408848524093628, "losses/total": 0.6694801449775696, "ref_logps/chosen": -24.90559196472168, "ref_logps/rejected": -37.12018585205078, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06543232500553131, "rewards/margins": 0.029626626521348953, "rewards/rejected": -0.09505894780158997, "step": 161 }, { "epoch": 0.15, "grad_norm": 16.118104934692383, "learning_rate": 2.5471698113207547e-07, "logps/chosen": -44.049354553222656, "logps/rejected": -47.15470886230469, "loss": 0.6829, "losses/dpo": 0.7058506608009338, "losses/sft": 1.5558570623397827, "losses/total": 0.7058506608009338, "ref_logps/chosen": -43.05775833129883, "ref_logps/rejected": -45.945899963378906, "rewards/accuracies": 0.5, "rewards/chosen": -0.09915965050458908, "rewards/margins": 0.021722018718719482, "rewards/rejected": -0.12088166177272797, "step": 162 }, { "epoch": 0.15, "grad_norm": 14.646360397338867, "learning_rate": 2.562893081761006e-07, "logps/chosen": -37.91931915283203, "logps/rejected": -35.62671661376953, "loss": 0.6849, "losses/dpo": 0.6865489482879639, "losses/sft": 1.2324689626693726, "losses/total": 0.6865489482879639, "ref_logps/chosen": -37.11323547363281, "ref_logps/rejected": -34.651885986328125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08060857653617859, "rewards/margins": 0.016874689608812332, "rewards/rejected": -0.09748326241970062, "step": 163 }, { "epoch": 0.15, "grad_norm": 14.869216918945312, "learning_rate": 2.5786163522012577e-07, "logps/chosen": -40.927040100097656, "logps/rejected": -42.58702087402344, "loss": 0.6836, "losses/dpo": 0.6857635378837585, "losses/sft": 1.0527235269546509, "losses/total": 0.6857635378837585, "ref_logps/chosen": -39.78255081176758, "ref_logps/rejected": -41.23955535888672, "rewards/accuracies": 0.5625, "rewards/chosen": -0.11444923281669617, "rewards/margins": 0.020297367125749588, "rewards/rejected": -0.13474659621715546, "step": 164 }, { "epoch": 0.16, "grad_norm": 11.951457977294922, "learning_rate": 2.5943396226415094e-07, "logps/chosen": -27.83062744140625, "logps/rejected": -28.856121063232422, "loss": 0.6832, "losses/dpo": 0.6878472566604614, "losses/sft": 1.077296495437622, "losses/total": 0.6878472566604614, "ref_logps/chosen": -27.205949783325195, "ref_logps/rejected": -28.023914337158203, "rewards/accuracies": 0.625, "rewards/chosen": -0.06246752291917801, "rewards/margins": 0.02075308747589588, "rewards/rejected": -0.08322061598300934, "step": 165 }, { "epoch": 0.16, "grad_norm": 14.284201622009277, "learning_rate": 2.6100628930817607e-07, "logps/chosen": -38.03638458251953, "logps/rejected": -38.48820495605469, "loss": 0.6943, "losses/dpo": 0.7134230136871338, "losses/sft": 1.8575987815856934, "losses/total": 0.7134230136871338, "ref_logps/chosen": -37.123382568359375, "ref_logps/rejected": -37.59544372558594, "rewards/accuracies": 0.5, "rewards/chosen": -0.09130043536424637, "rewards/margins": -0.002024741843342781, "rewards/rejected": -0.08927569538354874, "step": 166 }, { "epoch": 0.16, "grad_norm": 17.11756134033203, "learning_rate": 2.6257861635220124e-07, "logps/chosen": -42.22746276855469, "logps/rejected": -45.72160720825195, "loss": 0.6843, "losses/dpo": 0.6774294972419739, "losses/sft": 1.6261186599731445, "losses/total": 0.6774294972419739, "ref_logps/chosen": -41.04143524169922, "ref_logps/rejected": -44.35469055175781, "rewards/accuracies": 0.625, "rewards/chosen": -0.11860252916812897, "rewards/margins": 0.018089205026626587, "rewards/rejected": -0.13669173419475555, "step": 167 }, { "epoch": 0.16, "grad_norm": 15.555020332336426, "learning_rate": 2.641509433962264e-07, "logps/chosen": -38.277732849121094, "logps/rejected": -47.20541763305664, "loss": 0.6879, "losses/dpo": 0.6990188360214233, "losses/sft": 1.5325950384140015, "losses/total": 0.6990188360214233, "ref_logps/chosen": -37.04336929321289, "ref_logps/rejected": -45.858497619628906, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12343662977218628, "rewards/margins": 0.01125560887157917, "rewards/rejected": -0.1346922367811203, "step": 168 }, { "epoch": 0.16, "grad_norm": 16.52756118774414, "learning_rate": 2.6572327044025154e-07, "logps/chosen": -41.617828369140625, "logps/rejected": -41.552860260009766, "loss": 0.6899, "losses/dpo": 0.6829442977905273, "losses/sft": 1.0690327882766724, "losses/total": 0.6829442977905273, "ref_logps/chosen": -40.609249114990234, "ref_logps/rejected": -40.46549987792969, "rewards/accuracies": 0.5, "rewards/chosen": -0.10085811465978622, "rewards/margins": 0.007877673022449017, "rewards/rejected": -0.10873579233884811, "step": 169 }, { "epoch": 0.16, "grad_norm": 14.34033489227295, "learning_rate": 2.672955974842767e-07, "logps/chosen": -36.80204391479492, "logps/rejected": -47.43988800048828, "loss": 0.6904, "losses/dpo": 0.7027833461761475, "losses/sft": 1.1950372457504272, "losses/total": 0.7027833461761475, "ref_logps/chosen": -35.8398551940918, "ref_logps/rejected": -46.41313171386719, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09621897339820862, "rewards/margins": 0.006456691771745682, "rewards/rejected": -0.1026756763458252, "step": 170 }, { "epoch": 0.16, "grad_norm": 14.940962791442871, "learning_rate": 2.6886792452830185e-07, "logps/chosen": -43.187530517578125, "logps/rejected": -49.84062194824219, "loss": 0.69, "losses/dpo": 0.6816960573196411, "losses/sft": 1.5521328449249268, "losses/total": 0.6816960573196411, "ref_logps/chosen": -41.991539001464844, "ref_logps/rejected": -48.575469970703125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.11959922313690186, "rewards/margins": 0.006915335543453693, "rewards/rejected": -0.12651455402374268, "step": 171 }, { "epoch": 0.16, "grad_norm": 15.463927268981934, "learning_rate": 2.70440251572327e-07, "logps/chosen": -32.57449722290039, "logps/rejected": -46.29249572753906, "loss": 0.6773, "losses/dpo": 0.687432587146759, "losses/sft": 1.2540098428726196, "losses/total": 0.687432587146759, "ref_logps/chosen": -31.729053497314453, "ref_logps/rejected": -45.11859130859375, "rewards/accuracies": 0.625, "rewards/chosen": -0.08454425632953644, "rewards/margins": 0.03284570947289467, "rewards/rejected": -0.11738996207714081, "step": 172 }, { "epoch": 0.16, "grad_norm": 16.008506774902344, "learning_rate": 2.720125786163522e-07, "logps/chosen": -44.71931457519531, "logps/rejected": -49.086971282958984, "loss": 0.6803, "losses/dpo": 0.6806377172470093, "losses/sft": 1.4626554250717163, "losses/total": 0.6806377172470093, "ref_logps/chosen": -43.528839111328125, "ref_logps/rejected": -47.61699676513672, "rewards/accuracies": 0.5, "rewards/chosen": -0.11904732137918472, "rewards/margins": 0.027950255200266838, "rewards/rejected": -0.1469975709915161, "step": 173 }, { "epoch": 0.16, "grad_norm": 16.596031188964844, "learning_rate": 2.735849056603773e-07, "logps/chosen": -41.14557647705078, "logps/rejected": -51.88591003417969, "loss": 0.6778, "losses/dpo": 0.6837175488471985, "losses/sft": 1.5990148782730103, "losses/total": 0.6837175488471985, "ref_logps/chosen": -40.182373046875, "ref_logps/rejected": -50.60072708129883, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09632086008787155, "rewards/margins": 0.032197415828704834, "rewards/rejected": -0.12851828336715698, "step": 174 }, { "epoch": 0.17, "grad_norm": 14.481203079223633, "learning_rate": 2.751572327044025e-07, "logps/chosen": -32.59597396850586, "logps/rejected": -39.09141540527344, "loss": 0.6745, "losses/dpo": 0.6526998281478882, "losses/sft": 1.1885079145431519, "losses/total": 0.6526998281478882, "ref_logps/chosen": -31.67422103881836, "ref_logps/rejected": -37.783836364746094, "rewards/accuracies": 0.625, "rewards/chosen": -0.0921751856803894, "rewards/margins": 0.03858298808336258, "rewards/rejected": -0.1307581663131714, "step": 175 }, { "epoch": 0.17, "grad_norm": 13.180888175964355, "learning_rate": 2.767295597484277e-07, "logps/chosen": -30.781482696533203, "logps/rejected": -33.86185836791992, "loss": 0.7028, "losses/dpo": 0.702089786529541, "losses/sft": 1.3457748889923096, "losses/total": 0.702089786529541, "ref_logps/chosen": -29.678794860839844, "ref_logps/rejected": -32.9378662109375, "rewards/accuracies": 0.375, "rewards/chosen": -0.11026884615421295, "rewards/margins": -0.017869500443339348, "rewards/rejected": -0.09239935129880905, "step": 176 }, { "epoch": 0.17, "grad_norm": 16.37763214111328, "learning_rate": 2.783018867924528e-07, "logps/chosen": -39.206024169921875, "logps/rejected": -49.604888916015625, "loss": 0.6964, "losses/dpo": 0.68863445520401, "losses/sft": 1.305910587310791, "losses/total": 0.68863445520401, "ref_logps/chosen": -38.11943817138672, "ref_logps/rejected": -48.57234573364258, "rewards/accuracies": 0.625, "rewards/chosen": -0.10865846276283264, "rewards/margins": -0.005403862800449133, "rewards/rejected": -0.10325458645820618, "step": 177 }, { "epoch": 0.17, "grad_norm": 17.375797271728516, "learning_rate": 2.79874213836478e-07, "logps/chosen": -45.335411071777344, "logps/rejected": -59.22425842285156, "loss": 0.6907, "losses/dpo": 0.6877986192703247, "losses/sft": 1.5931527614593506, "losses/total": 0.6877986192703247, "ref_logps/chosen": -43.8665885925293, "ref_logps/rejected": -57.69758224487305, "rewards/accuracies": 0.5, "rewards/chosen": -0.14688171446323395, "rewards/margins": 0.005785716697573662, "rewards/rejected": -0.15266743302345276, "step": 178 }, { "epoch": 0.17, "grad_norm": 13.111441612243652, "learning_rate": 2.814465408805031e-07, "logps/chosen": -29.578392028808594, "logps/rejected": -31.886646270751953, "loss": 0.7002, "losses/dpo": 0.7050355672836304, "losses/sft": 1.4341094493865967, "losses/total": 0.7050355672836304, "ref_logps/chosen": -28.493379592895508, "ref_logps/rejected": -30.934284210205078, "rewards/accuracies": 0.375, "rewards/chosen": -0.1085013598203659, "rewards/margins": -0.013265163637697697, "rewards/rejected": -0.09523618966341019, "step": 179 }, { "epoch": 0.17, "grad_norm": 16.71872901916504, "learning_rate": 2.830188679245283e-07, "logps/chosen": -46.59730911254883, "logps/rejected": -50.84010314941406, "loss": 0.6928, "losses/dpo": 0.6997765302658081, "losses/sft": 1.3833726644515991, "losses/total": 0.6997765302658081, "ref_logps/chosen": -45.3701171875, "ref_logps/rejected": -49.58867263793945, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1227194219827652, "rewards/margins": 0.0024234289303421974, "rewards/rejected": -0.12514284253120422, "step": 180 }, { "epoch": 0.17, "grad_norm": 13.005847930908203, "learning_rate": 2.8459119496855345e-07, "logps/chosen": -28.849990844726562, "logps/rejected": -40.01360321044922, "loss": 0.6859, "losses/dpo": 0.6663920283317566, "losses/sft": 1.5015794038772583, "losses/total": 0.6663920283317566, "ref_logps/chosen": -27.75012969970703, "ref_logps/rejected": -38.7610969543457, "rewards/accuracies": 0.625, "rewards/chosen": -0.1099858433008194, "rewards/margins": 0.015264769084751606, "rewards/rejected": -0.12525062263011932, "step": 181 }, { "epoch": 0.17, "grad_norm": 16.202726364135742, "learning_rate": 2.861635220125786e-07, "logps/chosen": -47.63661193847656, "logps/rejected": -43.00746536254883, "loss": 0.7008, "losses/dpo": 0.7257519364356995, "losses/sft": 1.6272774934768677, "losses/total": 0.7257519364356995, "ref_logps/chosen": -46.10846710205078, "ref_logps/rejected": -41.60276412963867, "rewards/accuracies": 0.375, "rewards/chosen": -0.15281397104263306, "rewards/margins": -0.012343762442469597, "rewards/rejected": -0.1404702067375183, "step": 182 }, { "epoch": 0.17, "grad_norm": 15.507378578186035, "learning_rate": 2.8773584905660376e-07, "logps/chosen": -34.826133728027344, "logps/rejected": -35.629150390625, "loss": 0.6983, "losses/dpo": 0.7034717202186584, "losses/sft": 1.0290791988372803, "losses/total": 0.7034717202186584, "ref_logps/chosen": -33.809837341308594, "ref_logps/rejected": -34.70735549926758, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10162919014692307, "rewards/margins": -0.00944942981004715, "rewards/rejected": -0.09217976033687592, "step": 183 }, { "epoch": 0.17, "grad_norm": 16.532779693603516, "learning_rate": 2.8930817610062893e-07, "logps/chosen": -39.020416259765625, "logps/rejected": -49.396759033203125, "loss": 0.6692, "losses/dpo": 0.6778005957603455, "losses/sft": 1.1891576051712036, "losses/total": 0.6778005957603455, "ref_logps/chosen": -37.64043426513672, "ref_logps/rejected": -47.520320892333984, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13799801468849182, "rewards/margins": 0.04964599758386612, "rewards/rejected": -0.18764400482177734, "step": 184 }, { "epoch": 0.17, "grad_norm": 17.674861907958984, "learning_rate": 2.9088050314465406e-07, "logps/chosen": -44.64285659790039, "logps/rejected": -58.697853088378906, "loss": 0.6659, "losses/dpo": 0.6724292039871216, "losses/sft": 1.3990806341171265, "losses/total": 0.6724292039871216, "ref_logps/chosen": -43.65705871582031, "ref_logps/rejected": -57.150028228759766, "rewards/accuracies": 0.75, "rewards/chosen": -0.09857955574989319, "rewards/margins": 0.056202761828899384, "rewards/rejected": -0.15478231012821198, "step": 185 }, { "epoch": 0.18, "grad_norm": 16.24513053894043, "learning_rate": 2.9245283018867923e-07, "logps/chosen": -43.595157623291016, "logps/rejected": -50.3731575012207, "loss": 0.6705, "losses/dpo": 0.6837365627288818, "losses/sft": 1.6158891916275024, "losses/total": 0.6837365627288818, "ref_logps/chosen": -42.28122329711914, "ref_logps/rejected": -48.581905364990234, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13139350712299347, "rewards/margins": 0.047731611877679825, "rewards/rejected": -0.1791251301765442, "step": 186 }, { "epoch": 0.18, "grad_norm": 14.867253303527832, "learning_rate": 2.9402515723270436e-07, "logps/chosen": -38.93950653076172, "logps/rejected": -38.7735595703125, "loss": 0.6962, "losses/dpo": 0.7279604077339172, "losses/sft": 1.3980575799942017, "losses/total": 0.7279604077339172, "ref_logps/chosen": -37.841529846191406, "ref_logps/rejected": -37.7196044921875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1097974181175232, "rewards/margins": -0.004402121528983116, "rewards/rejected": -0.10539529472589493, "step": 187 }, { "epoch": 0.18, "grad_norm": 16.037687301635742, "learning_rate": 2.9559748427672953e-07, "logps/chosen": -39.63153839111328, "logps/rejected": -51.349578857421875, "loss": 0.6738, "losses/dpo": 0.6700311899185181, "losses/sft": 1.102718472480774, "losses/total": 0.6700311899185181, "ref_logps/chosen": -38.3155517578125, "ref_logps/rejected": -49.63243865966797, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13159829378128052, "rewards/margins": 0.040115997195243835, "rewards/rejected": -0.17171429097652435, "step": 188 }, { "epoch": 0.18, "grad_norm": 14.470324516296387, "learning_rate": 2.971698113207547e-07, "logps/chosen": -31.057300567626953, "logps/rejected": -39.35655212402344, "loss": 0.6921, "losses/dpo": 0.6860572099685669, "losses/sft": 1.5276051759719849, "losses/total": 0.6860572099685669, "ref_logps/chosen": -29.975032806396484, "ref_logps/rejected": -38.23912811279297, "rewards/accuracies": 0.4375, "rewards/chosen": -0.10822691768407822, "rewards/margins": 0.003515336662530899, "rewards/rejected": -0.11174225062131882, "step": 189 }, { "epoch": 0.18, "grad_norm": 15.651127815246582, "learning_rate": 2.9874213836477983e-07, "logps/chosen": -34.63383483886719, "logps/rejected": -50.765357971191406, "loss": 0.685, "losses/dpo": 0.6855325698852539, "losses/sft": 1.1889837980270386, "losses/total": 0.6855325698852539, "ref_logps/chosen": -33.375396728515625, "ref_logps/rejected": -49.328575134277344, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12584339082241058, "rewards/margins": 0.017834901809692383, "rewards/rejected": -0.14367829263210297, "step": 190 }, { "epoch": 0.18, "grad_norm": 15.785958290100098, "learning_rate": 3.00314465408805e-07, "logps/chosen": -45.23821258544922, "logps/rejected": -42.64419174194336, "loss": 0.6782, "losses/dpo": 0.7013839483261108, "losses/sft": 1.2799322605133057, "losses/total": 0.7013839483261108, "ref_logps/chosen": -43.82655334472656, "ref_logps/rejected": -40.90449523925781, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1411656141281128, "rewards/margins": 0.03280426189303398, "rewards/rejected": -0.17396987974643707, "step": 191 }, { "epoch": 0.18, "grad_norm": 15.278894424438477, "learning_rate": 3.018867924528302e-07, "logps/chosen": -48.549888610839844, "logps/rejected": -48.774085998535156, "loss": 0.6952, "losses/dpo": 0.7167978286743164, "losses/sft": 1.6310853958129883, "losses/total": 0.7167978286743164, "ref_logps/chosen": -47.09099197387695, "ref_logps/rejected": -47.336204528808594, "rewards/accuracies": 0.5, "rewards/chosen": -0.1458895206451416, "rewards/margins": -0.002101265825331211, "rewards/rejected": -0.14378824830055237, "step": 192 }, { "epoch": 0.18, "grad_norm": 15.971733093261719, "learning_rate": 3.034591194968553e-07, "logps/chosen": -40.196651458740234, "logps/rejected": -38.4633674621582, "loss": 0.6789, "losses/dpo": 0.6922624111175537, "losses/sft": 1.6729313135147095, "losses/total": 0.6922624111175537, "ref_logps/chosen": -38.757503509521484, "ref_logps/rejected": -36.720375061035156, "rewards/accuracies": 0.8125, "rewards/chosen": -0.14391487836837769, "rewards/margins": 0.030384279787540436, "rewards/rejected": -0.17429916560649872, "step": 193 }, { "epoch": 0.18, "grad_norm": 14.058929443359375, "learning_rate": 3.050314465408805e-07, "logps/chosen": -31.338830947875977, "logps/rejected": -37.29695129394531, "loss": 0.6764, "losses/dpo": 0.6699336767196655, "losses/sft": 1.2760751247406006, "losses/total": 0.6699336767196655, "ref_logps/chosen": -30.212322235107422, "ref_logps/rejected": -35.824440002441406, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11265072226524353, "rewards/margins": 0.03460048511624336, "rewards/rejected": -0.1472511887550354, "step": 194 }, { "epoch": 0.18, "grad_norm": 15.570208549499512, "learning_rate": 3.066037735849056e-07, "logps/chosen": -31.415109634399414, "logps/rejected": -47.4658203125, "loss": 0.6769, "losses/dpo": 0.6840342879295349, "losses/sft": 1.202999234199524, "losses/total": 0.6840342879295349, "ref_logps/chosen": -30.24490737915039, "ref_logps/rejected": -45.958744049072266, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1170201227068901, "rewards/margins": 0.03368773311376572, "rewards/rejected": -0.15070785582065582, "step": 195 }, { "epoch": 0.19, "grad_norm": 16.667709350585938, "learning_rate": 3.081761006289308e-07, "logps/chosen": -43.68022918701172, "logps/rejected": -56.54926300048828, "loss": 0.6836, "losses/dpo": 0.6808040142059326, "losses/sft": 1.133742094039917, "losses/total": 0.6808040142059326, "ref_logps/chosen": -41.85504150390625, "ref_logps/rejected": -54.516883850097656, "rewards/accuracies": 0.625, "rewards/chosen": -0.18251898884773254, "rewards/margins": 0.02071884088218212, "rewards/rejected": -0.20323783159255981, "step": 196 }, { "epoch": 0.19, "grad_norm": 15.58508586883545, "learning_rate": 3.0974842767295597e-07, "logps/chosen": -39.37928009033203, "logps/rejected": -50.64099884033203, "loss": 0.682, "losses/dpo": 0.6437791585922241, "losses/sft": 1.2103878259658813, "losses/total": 0.6437791585922241, "ref_logps/chosen": -37.771575927734375, "ref_logps/rejected": -48.79432678222656, "rewards/accuracies": 0.5625, "rewards/chosen": -0.16077077388763428, "rewards/margins": 0.023896407335996628, "rewards/rejected": -0.1846671849489212, "step": 197 }, { "epoch": 0.19, "grad_norm": 16.314849853515625, "learning_rate": 3.113207547169811e-07, "logps/chosen": -50.09778594970703, "logps/rejected": -46.592864990234375, "loss": 0.6833, "losses/dpo": 0.6780909299850464, "losses/sft": 1.2175432443618774, "losses/total": 0.6780909299850464, "ref_logps/chosen": -48.372596740722656, "ref_logps/rejected": -44.658721923828125, "rewards/accuracies": 0.5, "rewards/chosen": -0.17251861095428467, "rewards/margins": 0.02089569717645645, "rewards/rejected": -0.19341430068016052, "step": 198 }, { "epoch": 0.19, "grad_norm": 15.102590560913086, "learning_rate": 3.1289308176100627e-07, "logps/chosen": -36.87873840332031, "logps/rejected": -47.39272689819336, "loss": 0.6813, "losses/dpo": 0.7059723138809204, "losses/sft": 1.406247615814209, "losses/total": 0.7059723138809204, "ref_logps/chosen": -35.17682647705078, "ref_logps/rejected": -45.424652099609375, "rewards/accuracies": 0.625, "rewards/chosen": -0.1701907515525818, "rewards/margins": 0.02661687135696411, "rewards/rejected": -0.1968076229095459, "step": 199 }, { "epoch": 0.19, "grad_norm": 14.590982437133789, "learning_rate": 3.1446540880503144e-07, "logps/chosen": -35.96052551269531, "logps/rejected": -39.116111755371094, "loss": 0.7049, "losses/dpo": 0.7237061262130737, "losses/sft": 1.5074063539505005, "losses/total": 0.7237061262130737, "ref_logps/chosen": -33.97023010253906, "ref_logps/rejected": -37.34395217895508, "rewards/accuracies": 0.375, "rewards/chosen": -0.19902947545051575, "rewards/margins": -0.021813856437802315, "rewards/rejected": -0.1772156059741974, "step": 200 }, { "epoch": 0.19, "grad_norm": 17.552453994750977, "learning_rate": 3.1603773584905657e-07, "logps/chosen": -50.09906768798828, "logps/rejected": -51.57411193847656, "loss": 0.6844, "losses/dpo": 0.6874098181724548, "losses/sft": 1.8443374633789062, "losses/total": 0.6874098181724548, "ref_logps/chosen": -48.258975982666016, "ref_logps/rejected": -49.54026794433594, "rewards/accuracies": 0.5, "rewards/chosen": -0.18400876224040985, "rewards/margins": 0.01937580667436123, "rewards/rejected": -0.20338457822799683, "step": 201 }, { "epoch": 0.19, "grad_norm": 15.17917537689209, "learning_rate": 3.1761006289308174e-07, "logps/chosen": -41.997772216796875, "logps/rejected": -42.95301055908203, "loss": 0.6651, "losses/dpo": 0.6975255012512207, "losses/sft": 1.6166423559188843, "losses/total": 0.6975255012512207, "ref_logps/chosen": -40.2783203125, "ref_logps/rejected": -40.63972473144531, "rewards/accuracies": 0.75, "rewards/chosen": -0.17194506525993347, "rewards/margins": 0.059383779764175415, "rewards/rejected": -0.23132885992527008, "step": 202 }, { "epoch": 0.19, "grad_norm": 13.554970741271973, "learning_rate": 3.1918238993710687e-07, "logps/chosen": -32.481422424316406, "logps/rejected": -41.664222717285156, "loss": 0.6747, "losses/dpo": 0.6949838399887085, "losses/sft": 1.0620595216751099, "losses/total": 0.6949838399887085, "ref_logps/chosen": -31.102386474609375, "ref_logps/rejected": -39.886993408203125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13790348172187805, "rewards/margins": 0.03981912508606911, "rewards/rejected": -0.17772261798381805, "step": 203 }, { "epoch": 0.19, "grad_norm": 15.56489372253418, "learning_rate": 3.2075471698113204e-07, "logps/chosen": -37.09054946899414, "logps/rejected": -53.35906219482422, "loss": 0.692, "losses/dpo": 0.6879791021347046, "losses/sft": 1.4524227380752563, "losses/total": 0.6879791021347046, "ref_logps/chosen": -35.559696197509766, "ref_logps/rejected": -51.78603744506836, "rewards/accuracies": 0.4375, "rewards/chosen": -0.15308555960655212, "rewards/margins": 0.004216962028294802, "rewards/rejected": -0.15730252861976624, "step": 204 }, { "epoch": 0.19, "grad_norm": 16.473398208618164, "learning_rate": 3.223270440251572e-07, "logps/chosen": -47.96939468383789, "logps/rejected": -42.30652618408203, "loss": 0.6717, "losses/dpo": 0.6861964464187622, "losses/sft": 1.3401845693588257, "losses/total": 0.6861964464187622, "ref_logps/chosen": -46.041831970214844, "ref_logps/rejected": -39.90606689453125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.19275638461112976, "rewards/margins": 0.04728955030441284, "rewards/rejected": -0.2400459349155426, "step": 205 }, { "epoch": 0.19, "grad_norm": 15.597003936767578, "learning_rate": 3.2389937106918235e-07, "logps/chosen": -41.01193618774414, "logps/rejected": -51.83057403564453, "loss": 0.6682, "losses/dpo": 0.6791011095046997, "losses/sft": 1.6842013597488403, "losses/total": 0.6791011095046997, "ref_logps/chosen": -39.343719482421875, "ref_logps/rejected": -49.630767822265625, "rewards/accuracies": 0.75, "rewards/chosen": -0.16682153940200806, "rewards/margins": 0.05315908417105675, "rewards/rejected": -0.2199806123971939, "step": 206 }, { "epoch": 0.2, "grad_norm": 16.27007484436035, "learning_rate": 3.254716981132075e-07, "logps/chosen": -32.553504943847656, "logps/rejected": -45.2066650390625, "loss": 0.6929, "losses/dpo": 0.690405011177063, "losses/sft": 1.352427363395691, "losses/total": 0.690405011177063, "ref_logps/chosen": -30.952503204345703, "ref_logps/rejected": -43.589378356933594, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1601003110408783, "rewards/margins": 0.0016281567513942719, "rewards/rejected": -0.16172844171524048, "step": 207 }, { "epoch": 0.2, "grad_norm": 17.231746673583984, "learning_rate": 3.2704402515723265e-07, "logps/chosen": -43.998146057128906, "logps/rejected": -41.52692794799805, "loss": 0.7, "losses/dpo": 0.6533277630805969, "losses/sft": 1.257144570350647, "losses/total": 0.6533277630805969, "ref_logps/chosen": -42.212215423583984, "ref_logps/rejected": -39.830039978027344, "rewards/accuracies": 0.5625, "rewards/chosen": -0.17859303951263428, "rewards/margins": -0.008904272690415382, "rewards/rejected": -0.16968876123428345, "step": 208 }, { "epoch": 0.2, "grad_norm": 16.129486083984375, "learning_rate": 3.286163522012578e-07, "logps/chosen": -41.640602111816406, "logps/rejected": -46.48876190185547, "loss": 0.6814, "losses/dpo": 0.6967303156852722, "losses/sft": 0.8777698874473572, "losses/total": 0.6967303156852722, "ref_logps/chosen": -39.685237884521484, "ref_logps/rejected": -44.27562713623047, "rewards/accuracies": 0.625, "rewards/chosen": -0.19553638994693756, "rewards/margins": 0.025777162984013557, "rewards/rejected": -0.22131355106830597, "step": 209 }, { "epoch": 0.2, "grad_norm": 14.743343353271484, "learning_rate": 3.30188679245283e-07, "logps/chosen": -32.78865432739258, "logps/rejected": -42.84953689575195, "loss": 0.6634, "losses/dpo": 0.6717252135276794, "losses/sft": 1.4664636850357056, "losses/total": 0.6717252135276794, "ref_logps/chosen": -31.449596405029297, "ref_logps/rejected": -40.8954963684082, "rewards/accuracies": 0.75, "rewards/chosen": -0.13390573859214783, "rewards/margins": 0.06149820238351822, "rewards/rejected": -0.19540394842624664, "step": 210 }, { "epoch": 0.2, "grad_norm": 13.537386894226074, "learning_rate": 3.317610062893081e-07, "logps/chosen": -33.905540466308594, "logps/rejected": -35.38450622558594, "loss": 0.6864, "losses/dpo": 0.6876325011253357, "losses/sft": 1.122238039970398, "losses/total": 0.6876325011253357, "ref_logps/chosen": -32.07237243652344, "ref_logps/rejected": -33.40351486206055, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1833166927099228, "rewards/margins": 0.014782454818487167, "rewards/rejected": -0.19809913635253906, "step": 211 }, { "epoch": 0.2, "grad_norm": 17.885358810424805, "learning_rate": 3.333333333333333e-07, "logps/chosen": -50.72590637207031, "logps/rejected": -57.39551544189453, "loss": 0.6716, "losses/dpo": 0.6695274710655212, "losses/sft": 1.7289897203445435, "losses/total": 0.6695274710655212, "ref_logps/chosen": -48.49372100830078, "ref_logps/rejected": -54.711021423339844, "rewards/accuracies": 0.75, "rewards/chosen": -0.22321867942810059, "rewards/margins": 0.04523022100329399, "rewards/rejected": -0.2684488892555237, "step": 212 }, { "epoch": 0.2, "grad_norm": 13.853524208068848, "learning_rate": 3.349056603773585e-07, "logps/chosen": -29.290006637573242, "logps/rejected": -40.776973724365234, "loss": 0.6819, "losses/dpo": 0.6762322783470154, "losses/sft": 1.0710937976837158, "losses/total": 0.6762322783470154, "ref_logps/chosen": -27.652446746826172, "ref_logps/rejected": -38.88838195800781, "rewards/accuracies": 0.625, "rewards/chosen": -0.16375604271888733, "rewards/margins": 0.025102993473410606, "rewards/rejected": -0.18885904550552368, "step": 213 }, { "epoch": 0.2, "grad_norm": 14.594182014465332, "learning_rate": 3.364779874213836e-07, "logps/chosen": -31.246246337890625, "logps/rejected": -41.12478256225586, "loss": 0.6688, "losses/dpo": 0.6594405174255371, "losses/sft": 1.2172948122024536, "losses/total": 0.6594405174255371, "ref_logps/chosen": -29.770009994506836, "ref_logps/rejected": -39.10675048828125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14762377738952637, "rewards/margins": 0.05417914688587189, "rewards/rejected": -0.20180290937423706, "step": 214 }, { "epoch": 0.2, "grad_norm": 16.192296981811523, "learning_rate": 3.380503144654088e-07, "logps/chosen": -44.14506530761719, "logps/rejected": -55.825077056884766, "loss": 0.6631, "losses/dpo": 0.656355082988739, "losses/sft": 1.545645833015442, "losses/total": 0.656355082988739, "ref_logps/chosen": -41.701148986816406, "ref_logps/rejected": -52.75156021118164, "rewards/accuracies": 0.8125, "rewards/chosen": -0.24439160525798798, "rewards/margins": 0.06296008080244064, "rewards/rejected": -0.307351678609848, "step": 215 }, { "epoch": 0.2, "grad_norm": 14.246395111083984, "learning_rate": 3.396226415094339e-07, "logps/chosen": -35.00162887573242, "logps/rejected": -43.11711120605469, "loss": 0.6689, "losses/dpo": 0.6910299062728882, "losses/sft": 1.5011862516403198, "losses/total": 0.6910299062728882, "ref_logps/chosen": -33.433834075927734, "ref_logps/rejected": -41.03712463378906, "rewards/accuracies": 0.75, "rewards/chosen": -0.1567794233560562, "rewards/margins": 0.05121920257806778, "rewards/rejected": -0.2079986333847046, "step": 216 }, { "epoch": 0.2, "grad_norm": 15.941094398498535, "learning_rate": 3.411949685534591e-07, "logps/chosen": -38.74461364746094, "logps/rejected": -53.71509552001953, "loss": 0.6728, "losses/dpo": 0.708710789680481, "losses/sft": 1.4716625213623047, "losses/total": 0.708710789680481, "ref_logps/chosen": -36.182579040527344, "ref_logps/rejected": -50.68135070800781, "rewards/accuracies": 0.5, "rewards/chosen": -0.25620341300964355, "rewards/margins": 0.04717123508453369, "rewards/rejected": -0.30337464809417725, "step": 217 }, { "epoch": 0.21, "grad_norm": 15.452842712402344, "learning_rate": 3.4276729559748426e-07, "logps/chosen": -46.325130462646484, "logps/rejected": -41.82459259033203, "loss": 0.6763, "losses/dpo": 0.7202698588371277, "losses/sft": 1.8678845167160034, "losses/total": 0.7202698588371277, "ref_logps/chosen": -44.23219299316406, "ref_logps/rejected": -39.33218002319336, "rewards/accuracies": 0.5625, "rewards/chosen": -0.20929360389709473, "rewards/margins": 0.039947494864463806, "rewards/rejected": -0.24924111366271973, "step": 218 }, { "epoch": 0.21, "grad_norm": 14.239889144897461, "learning_rate": 3.443396226415094e-07, "logps/chosen": -35.42744445800781, "logps/rejected": -39.29603576660156, "loss": 0.6799, "losses/dpo": 0.6879767179489136, "losses/sft": 1.5547114610671997, "losses/total": 0.6879767179489136, "ref_logps/chosen": -33.429893493652344, "ref_logps/rejected": -36.999755859375, "rewards/accuracies": 0.5, "rewards/chosen": -0.19975495338439941, "rewards/margins": 0.029872652143239975, "rewards/rejected": -0.2296276092529297, "step": 219 }, { "epoch": 0.21, "grad_norm": 15.082650184631348, "learning_rate": 3.4591194968553456e-07, "logps/chosen": -34.29314422607422, "logps/rejected": -47.68400573730469, "loss": 0.6497, "losses/dpo": 0.6668439507484436, "losses/sft": 1.0790891647338867, "losses/total": 0.6668439507484436, "ref_logps/chosen": -32.624237060546875, "ref_logps/rejected": -45.09401321411133, "rewards/accuracies": 0.875, "rewards/chosen": -0.1668909788131714, "rewards/margins": 0.09210819005966187, "rewards/rejected": -0.25899916887283325, "step": 220 }, { "epoch": 0.21, "grad_norm": 15.518473625183105, "learning_rate": 3.4748427672955973e-07, "logps/chosen": -40.32875442504883, "logps/rejected": -52.40559005737305, "loss": 0.6599, "losses/dpo": 0.6081426739692688, "losses/sft": 1.261610507965088, "losses/total": 0.6081426739692688, "ref_logps/chosen": -38.299949645996094, "ref_logps/rejected": -49.6461181640625, "rewards/accuracies": 0.75, "rewards/chosen": -0.20288079977035522, "rewards/margins": 0.07306638360023499, "rewards/rejected": -0.2759471833705902, "step": 221 }, { "epoch": 0.21, "grad_norm": 14.866722106933594, "learning_rate": 3.4905660377358486e-07, "logps/chosen": -38.81988525390625, "logps/rejected": -49.87083435058594, "loss": 0.6693, "losses/dpo": 0.6657773852348328, "losses/sft": 1.343111276626587, "losses/total": 0.6657773852348328, "ref_logps/chosen": -36.71820068359375, "ref_logps/rejected": -47.21161651611328, "rewards/accuracies": 0.625, "rewards/chosen": -0.21016836166381836, "rewards/margins": 0.05575356259942055, "rewards/rejected": -0.265921950340271, "step": 222 }, { "epoch": 0.21, "grad_norm": 15.532499313354492, "learning_rate": 3.5062893081761003e-07, "logps/chosen": -45.165679931640625, "logps/rejected": -42.318572998046875, "loss": 0.6893, "losses/dpo": 0.6813037395477295, "losses/sft": 1.8235492706298828, "losses/total": 0.6813037395477295, "ref_logps/chosen": -42.794921875, "ref_logps/rejected": -39.800296783447266, "rewards/accuracies": 0.4375, "rewards/chosen": -0.23707585036754608, "rewards/margins": 0.01475165132433176, "rewards/rejected": -0.2518274784088135, "step": 223 }, { "epoch": 0.21, "grad_norm": 16.2032527923584, "learning_rate": 3.5220125786163516e-07, "logps/chosen": -43.01076889038086, "logps/rejected": -43.869937896728516, "loss": 0.6603, "losses/dpo": 0.6653252840042114, "losses/sft": 1.4910627603530884, "losses/total": 0.6653252840042114, "ref_logps/chosen": -41.44512176513672, "ref_logps/rejected": -41.61030197143555, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1565646529197693, "rewards/margins": 0.06939879059791565, "rewards/rejected": -0.22596344351768494, "step": 224 }, { "epoch": 0.21, "grad_norm": 13.993906021118164, "learning_rate": 3.5377358490566033e-07, "logps/chosen": -37.451271057128906, "logps/rejected": -41.676597595214844, "loss": 0.6771, "losses/dpo": 0.6879667043685913, "losses/sft": 1.3644269704818726, "losses/total": 0.6879667043685913, "ref_logps/chosen": -35.446128845214844, "ref_logps/rejected": -39.31643295288086, "rewards/accuracies": 0.625, "rewards/chosen": -0.2005145400762558, "rewards/margins": 0.03550197184085846, "rewards/rejected": -0.23601651191711426, "step": 225 }, { "epoch": 0.21, "grad_norm": 15.624147415161133, "learning_rate": 3.553459119496855e-07, "logps/chosen": -38.95832061767578, "logps/rejected": -47.245208740234375, "loss": 0.6808, "losses/dpo": 0.6604455709457397, "losses/sft": 1.2016445398330688, "losses/total": 0.6604455709457397, "ref_logps/chosen": -36.592918395996094, "ref_logps/rejected": -44.59624481201172, "rewards/accuracies": 0.4375, "rewards/chosen": -0.23654010891914368, "rewards/margins": 0.028356090188026428, "rewards/rejected": -0.2648961842060089, "step": 226 }, { "epoch": 0.21, "grad_norm": 15.331037521362305, "learning_rate": 3.5691823899371064e-07, "logps/chosen": -33.339542388916016, "logps/rejected": -47.19365692138672, "loss": 0.694, "losses/dpo": 0.696473240852356, "losses/sft": 1.533835530281067, "losses/total": 0.696473240852356, "ref_logps/chosen": -31.02646255493164, "ref_logps/rejected": -44.7705078125, "rewards/accuracies": 0.625, "rewards/chosen": -0.2313082218170166, "rewards/margins": 0.0110064297914505, "rewards/rejected": -0.2423146367073059, "step": 227 }, { "epoch": 0.22, "grad_norm": 14.999101638793945, "learning_rate": 3.584905660377358e-07, "logps/chosen": -34.735633850097656, "logps/rejected": -40.994468688964844, "loss": 0.674, "losses/dpo": 0.6969943046569824, "losses/sft": 1.3561644554138184, "losses/total": 0.6969943046569824, "ref_logps/chosen": -33.20204162597656, "ref_logps/rejected": -39.01994323730469, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1533590853214264, "rewards/margins": 0.04409363120794296, "rewards/rejected": -0.19745272397994995, "step": 228 }, { "epoch": 0.22, "grad_norm": 14.841408729553223, "learning_rate": 3.60062893081761e-07, "logps/chosen": -37.207210540771484, "logps/rejected": -39.67007827758789, "loss": 0.6915, "losses/dpo": 0.7072970867156982, "losses/sft": 1.3001089096069336, "losses/total": 0.7072970867156982, "ref_logps/chosen": -34.878719329833984, "ref_logps/rejected": -37.27781677246094, "rewards/accuracies": 0.5625, "rewards/chosen": -0.23284918069839478, "rewards/margins": 0.006377032026648521, "rewards/rejected": -0.23922622203826904, "step": 229 }, { "epoch": 0.22, "grad_norm": 15.088621139526367, "learning_rate": 3.616352201257861e-07, "logps/chosen": -37.07647705078125, "logps/rejected": -43.55615234375, "loss": 0.6722, "losses/dpo": 0.7194198369979858, "losses/sft": 1.6237705945968628, "losses/total": 0.7194198369979858, "ref_logps/chosen": -34.825340270996094, "ref_logps/rejected": -40.844993591308594, "rewards/accuracies": 0.625, "rewards/chosen": -0.22511407732963562, "rewards/margins": 0.0460018664598465, "rewards/rejected": -0.2711159586906433, "step": 230 }, { "epoch": 0.22, "grad_norm": 15.284753799438477, "learning_rate": 3.632075471698113e-07, "logps/chosen": -39.28997039794922, "logps/rejected": -43.157981872558594, "loss": 0.6755, "losses/dpo": 0.6688922643661499, "losses/sft": 1.4212239980697632, "losses/total": 0.6688922643661499, "ref_logps/chosen": -37.02705383300781, "ref_logps/rejected": -40.469093322753906, "rewards/accuracies": 0.625, "rewards/chosen": -0.22629138827323914, "rewards/margins": 0.042597681283950806, "rewards/rejected": -0.26888906955718994, "step": 231 }, { "epoch": 0.22, "grad_norm": 14.206757545471191, "learning_rate": 3.647798742138364e-07, "logps/chosen": -34.42321014404297, "logps/rejected": -43.79088592529297, "loss": 0.6681, "losses/dpo": 0.664331316947937, "losses/sft": 1.4147409200668335, "losses/total": 0.664331316947937, "ref_logps/chosen": -32.078857421875, "ref_logps/rejected": -40.853614807128906, "rewards/accuracies": 0.6875, "rewards/chosen": -0.23443545401096344, "rewards/margins": 0.05929189175367355, "rewards/rejected": -0.2937273383140564, "step": 232 }, { "epoch": 0.22, "grad_norm": 15.06775951385498, "learning_rate": 3.663522012578616e-07, "logps/chosen": -43.315086364746094, "logps/rejected": -45.40140914916992, "loss": 0.661, "losses/dpo": 0.6380653977394104, "losses/sft": 1.9001001119613647, "losses/total": 0.6380653977394104, "ref_logps/chosen": -40.40079116821289, "ref_logps/rejected": -41.77799987792969, "rewards/accuracies": 0.625, "rewards/chosen": -0.2914295196533203, "rewards/margins": 0.0709112212061882, "rewards/rejected": -0.3623407483100891, "step": 233 }, { "epoch": 0.22, "grad_norm": 15.058982849121094, "learning_rate": 3.6792452830188677e-07, "logps/chosen": -41.931304931640625, "logps/rejected": -43.587646484375, "loss": 0.6634, "losses/dpo": 0.6785554885864258, "losses/sft": 1.5310620069503784, "losses/total": 0.6785554885864258, "ref_logps/chosen": -39.47883605957031, "ref_logps/rejected": -40.498313903808594, "rewards/accuracies": 0.625, "rewards/chosen": -0.2452470362186432, "rewards/margins": 0.06368651241064072, "rewards/rejected": -0.3089335560798645, "step": 234 }, { "epoch": 0.22, "grad_norm": 15.68648910522461, "learning_rate": 3.694968553459119e-07, "logps/chosen": -45.422271728515625, "logps/rejected": -33.526824951171875, "loss": 0.7061, "losses/dpo": 0.717628002166748, "losses/sft": 1.4144009351730347, "losses/total": 0.717628002166748, "ref_logps/chosen": -42.57256317138672, "ref_logps/rejected": -30.87489128112793, "rewards/accuracies": 0.5, "rewards/chosen": -0.2849707007408142, "rewards/margins": -0.019777163863182068, "rewards/rejected": -0.26519352197647095, "step": 235 }, { "epoch": 0.22, "grad_norm": 16.10345458984375, "learning_rate": 3.7106918238993707e-07, "logps/chosen": -44.631004333496094, "logps/rejected": -47.2867546081543, "loss": 0.6631, "losses/dpo": 0.6312600374221802, "losses/sft": 1.4290958642959595, "losses/total": 0.6312600374221802, "ref_logps/chosen": -42.252628326416016, "ref_logps/rejected": -44.24822235107422, "rewards/accuracies": 0.5, "rewards/chosen": -0.23783786594867706, "rewards/margins": 0.06601482629776001, "rewards/rejected": -0.30385270714759827, "step": 236 }, { "epoch": 0.22, "grad_norm": 15.954336166381836, "learning_rate": 3.7264150943396224e-07, "logps/chosen": -39.63313293457031, "logps/rejected": -52.807403564453125, "loss": 0.6736, "losses/dpo": 0.6659075617790222, "losses/sft": 1.698724627494812, "losses/total": 0.6659075617790222, "ref_logps/chosen": -36.858253479003906, "ref_logps/rejected": -49.60414123535156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2774878144264221, "rewards/margins": 0.04283905774354935, "rewards/rejected": -0.32032686471939087, "step": 237 }, { "epoch": 0.22, "grad_norm": 14.88613510131836, "learning_rate": 3.7421383647798737e-07, "logps/chosen": -39.489967346191406, "logps/rejected": -52.311187744140625, "loss": 0.6459, "losses/dpo": 0.6612695455551147, "losses/sft": 1.4787285327911377, "losses/total": 0.6612695455551147, "ref_logps/chosen": -37.394012451171875, "ref_logps/rejected": -49.207279205322266, "rewards/accuracies": 0.8125, "rewards/chosen": -0.20959532260894775, "rewards/margins": 0.10079538822174072, "rewards/rejected": -0.3103907108306885, "step": 238 }, { "epoch": 0.23, "grad_norm": 17.082839965820312, "learning_rate": 3.757861635220126e-07, "logps/chosen": -47.36906051635742, "logps/rejected": -52.26832962036133, "loss": 0.6787, "losses/dpo": 0.6704140305519104, "losses/sft": 1.844097375869751, "losses/total": 0.6704140305519104, "ref_logps/chosen": -44.38483428955078, "ref_logps/rejected": -48.93090057373047, "rewards/accuracies": 0.625, "rewards/chosen": -0.2984226942062378, "rewards/margins": 0.03532020002603531, "rewards/rejected": -0.3337429165840149, "step": 239 }, { "epoch": 0.23, "grad_norm": 15.276880264282227, "learning_rate": 3.773584905660377e-07, "logps/chosen": -41.31379699707031, "logps/rejected": -45.109554290771484, "loss": 0.6721, "losses/dpo": 0.679213285446167, "losses/sft": 1.7050467729568481, "losses/total": 0.679213285446167, "ref_logps/chosen": -38.43292236328125, "ref_logps/rejected": -41.68553924560547, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2880876064300537, "rewards/margins": 0.05431407690048218, "rewards/rejected": -0.3424016833305359, "step": 240 }, { "epoch": 0.23, "grad_norm": 15.54065227508545, "learning_rate": 3.789308176100629e-07, "logps/chosen": -42.323673248291016, "logps/rejected": -44.42234802246094, "loss": 0.6537, "losses/dpo": 0.6742671728134155, "losses/sft": 1.3836464881896973, "losses/total": 0.6742671728134155, "ref_logps/chosen": -39.27201461791992, "ref_logps/rejected": -40.444923400878906, "rewards/accuracies": 0.625, "rewards/chosen": -0.30516576766967773, "rewards/margins": 0.09257683157920837, "rewards/rejected": -0.3977426290512085, "step": 241 }, { "epoch": 0.23, "grad_norm": 15.982912063598633, "learning_rate": 3.805031446540881e-07, "logps/chosen": -45.311492919921875, "logps/rejected": -44.36945724487305, "loss": 0.666, "losses/dpo": 0.6028597354888916, "losses/sft": 1.141249656677246, "losses/total": 0.6028597354888916, "ref_logps/chosen": -42.80768585205078, "ref_logps/rejected": -41.26680374145508, "rewards/accuracies": 0.625, "rewards/chosen": -0.2503806948661804, "rewards/margins": 0.05988464504480362, "rewards/rejected": -0.31026530265808105, "step": 242 }, { "epoch": 0.23, "grad_norm": 15.576713562011719, "learning_rate": 3.820754716981132e-07, "logps/chosen": -40.11058807373047, "logps/rejected": -49.348941802978516, "loss": 0.6546, "losses/dpo": 0.6521026492118835, "losses/sft": 1.044120192527771, "losses/total": 0.6521026492118835, "ref_logps/chosen": -37.294281005859375, "ref_logps/rejected": -45.683921813964844, "rewards/accuracies": 0.75, "rewards/chosen": -0.281631201505661, "rewards/margins": 0.08487124741077423, "rewards/rejected": -0.36650246381759644, "step": 243 }, { "epoch": 0.23, "grad_norm": 16.844026565551758, "learning_rate": 3.836477987421384e-07, "logps/chosen": -45.600990295410156, "logps/rejected": -52.28462219238281, "loss": 0.6774, "losses/dpo": 0.7089597582817078, "losses/sft": 1.6474016904830933, "losses/total": 0.7089597582817078, "ref_logps/chosen": -42.518218994140625, "ref_logps/rejected": -48.75550079345703, "rewards/accuracies": 0.6875, "rewards/chosen": -0.30827704071998596, "rewards/margins": 0.044635359197854996, "rewards/rejected": -0.35291242599487305, "step": 244 }, { "epoch": 0.23, "grad_norm": 15.172090530395508, "learning_rate": 3.8522012578616355e-07, "logps/chosen": -36.581722259521484, "logps/rejected": -45.127784729003906, "loss": 0.7012, "losses/dpo": 0.7287442088127136, "losses/sft": 1.337358832359314, "losses/total": 0.7287442088127136, "ref_logps/chosen": -33.399070739746094, "ref_logps/rejected": -42.060569763183594, "rewards/accuracies": 0.5625, "rewards/chosen": -0.31826525926589966, "rewards/margins": -0.011543730273842812, "rewards/rejected": -0.3067215085029602, "step": 245 }, { "epoch": 0.23, "grad_norm": 14.574108123779297, "learning_rate": 3.867924528301887e-07, "logps/chosen": -36.471771240234375, "logps/rejected": -40.091285705566406, "loss": 0.6782, "losses/dpo": 0.6828131079673767, "losses/sft": 1.4105702638626099, "losses/total": 0.6828131079673767, "ref_logps/chosen": -34.01863098144531, "ref_logps/rejected": -37.2685432434082, "rewards/accuracies": 0.625, "rewards/chosen": -0.24531395733356476, "rewards/margins": 0.036960236728191376, "rewards/rejected": -0.28227418661117554, "step": 246 }, { "epoch": 0.23, "grad_norm": 15.863611221313477, "learning_rate": 3.8836477987421385e-07, "logps/chosen": -36.288108825683594, "logps/rejected": -38.02286148071289, "loss": 0.6863, "losses/dpo": 0.7561929821968079, "losses/sft": 1.3613510131835938, "losses/total": 0.7561929821968079, "ref_logps/chosen": -33.26061248779297, "ref_logps/rejected": -34.768577575683594, "rewards/accuracies": 0.625, "rewards/chosen": -0.3027498424053192, "rewards/margins": 0.022678393870592117, "rewards/rejected": -0.3254282474517822, "step": 247 }, { "epoch": 0.23, "grad_norm": 13.649224281311035, "learning_rate": 3.89937106918239e-07, "logps/chosen": -33.580169677734375, "logps/rejected": -30.676929473876953, "loss": 0.7073, "losses/dpo": 0.6862790584564209, "losses/sft": 1.3503104448318481, "losses/total": 0.6862790584564209, "ref_logps/chosen": -30.99078941345215, "ref_logps/rejected": -28.324487686157227, "rewards/accuracies": 0.5, "rewards/chosen": -0.2589379549026489, "rewards/margins": -0.023694012314081192, "rewards/rejected": -0.23524394631385803, "step": 248 }, { "epoch": 0.24, "grad_norm": 14.472437858581543, "learning_rate": 3.9150943396226415e-07, "logps/chosen": -35.0002555847168, "logps/rejected": -43.43840026855469, "loss": 0.6854, "losses/dpo": 0.7300747632980347, "losses/sft": 1.217039704322815, "losses/total": 0.7300747632980347, "ref_logps/chosen": -31.94352912902832, "ref_logps/rejected": -40.115211486816406, "rewards/accuracies": 0.625, "rewards/chosen": -0.3056725859642029, "rewards/margins": 0.0266465712338686, "rewards/rejected": -0.33231914043426514, "step": 249 }, { "epoch": 0.24, "grad_norm": 18.260669708251953, "learning_rate": 3.9308176100628933e-07, "logps/chosen": -48.50001525878906, "logps/rejected": -54.786956787109375, "loss": 0.6785, "losses/dpo": 0.7375392317771912, "losses/sft": 1.5034102201461792, "losses/total": 0.7375392317771912, "ref_logps/chosen": -45.462913513183594, "ref_logps/rejected": -51.376548767089844, "rewards/accuracies": 0.375, "rewards/chosen": -0.30370983481407166, "rewards/margins": 0.037330590188503265, "rewards/rejected": -0.3410404324531555, "step": 250 }, { "epoch": 0.24, "grad_norm": 17.827125549316406, "learning_rate": 3.9465408805031445e-07, "logps/chosen": -43.956756591796875, "logps/rejected": -53.212440490722656, "loss": 0.7021, "losses/dpo": 0.728506326675415, "losses/sft": 1.50521719455719, "losses/total": 0.728506326675415, "ref_logps/chosen": -40.286964416503906, "ref_logps/rejected": -49.678077697753906, "rewards/accuracies": 0.375, "rewards/chosen": -0.3669794201850891, "rewards/margins": -0.01354297250509262, "rewards/rejected": -0.3534364700317383, "step": 251 }, { "epoch": 0.24, "grad_norm": 18.261459350585938, "learning_rate": 3.9622641509433963e-07, "logps/chosen": -48.157283782958984, "logps/rejected": -55.378719329833984, "loss": 0.6956, "losses/dpo": 0.6914754509925842, "losses/sft": 1.2750729322433472, "losses/total": 0.6914754509925842, "ref_logps/chosen": -44.9898567199707, "ref_logps/rejected": -52.20886993408203, "rewards/accuracies": 0.375, "rewards/chosen": -0.31674253940582275, "rewards/margins": 0.00024258531630039215, "rewards/rejected": -0.3169851005077362, "step": 252 }, { "epoch": 0.24, "grad_norm": 17.766359329223633, "learning_rate": 3.977987421383648e-07, "logps/chosen": -52.382164001464844, "logps/rejected": -55.79834747314453, "loss": 0.6847, "losses/dpo": 0.70313560962677, "losses/sft": 1.7460511922836304, "losses/total": 0.70313560962677, "ref_logps/chosen": -48.68946075439453, "ref_logps/rejected": -51.823795318603516, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3692704439163208, "rewards/margins": 0.02818494290113449, "rewards/rejected": -0.3974553644657135, "step": 253 }, { "epoch": 0.24, "grad_norm": 13.266200065612793, "learning_rate": 3.9937106918238993e-07, "logps/chosen": -24.98733901977539, "logps/rejected": -41.37847900390625, "loss": 0.6617, "losses/dpo": 0.6861698031425476, "losses/sft": 1.0828763246536255, "losses/total": 0.6861698031425476, "ref_logps/chosen": -22.569034576416016, "ref_logps/rejected": -38.25971984863281, "rewards/accuracies": 0.75, "rewards/chosen": -0.24183058738708496, "rewards/margins": 0.07004562020301819, "rewards/rejected": -0.31187620759010315, "step": 254 }, { "epoch": 0.24, "grad_norm": 16.188703536987305, "learning_rate": 4.009433962264151e-07, "logps/chosen": -42.302978515625, "logps/rejected": -50.74462890625, "loss": 0.6724, "losses/dpo": 0.7926273941993713, "losses/sft": 1.7149397134780884, "losses/total": 0.7926273941993713, "ref_logps/chosen": -38.7346076965332, "ref_logps/rejected": -46.588783264160156, "rewards/accuracies": 0.625, "rewards/chosen": -0.3568369746208191, "rewards/margins": 0.0587477833032608, "rewards/rejected": -0.4155848026275635, "step": 255 }, { "epoch": 0.24, "grad_norm": 15.104988098144531, "learning_rate": 4.0251572327044023e-07, "logps/chosen": -41.74165344238281, "logps/rejected": -51.38977813720703, "loss": 0.6435, "losses/dpo": 0.6382113695144653, "losses/sft": 1.4787259101867676, "losses/total": 0.6382113695144653, "ref_logps/chosen": -38.84921646118164, "ref_logps/rejected": -47.40968322753906, "rewards/accuracies": 0.75, "rewards/chosen": -0.28924357891082764, "rewards/margins": 0.10876599699258804, "rewards/rejected": -0.3980095684528351, "step": 256 }, { "epoch": 0.24, "grad_norm": 14.39992618560791, "learning_rate": 4.040880503144654e-07, "logps/chosen": -34.94757080078125, "logps/rejected": -42.57470703125, "loss": 0.6857, "losses/dpo": 0.7048845291137695, "losses/sft": 1.443289041519165, "losses/total": 0.7048845291137695, "ref_logps/chosen": -31.906789779663086, "ref_logps/rejected": -39.276458740234375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.304077684879303, "rewards/margins": 0.025747492909431458, "rewards/rejected": -0.32982516288757324, "step": 257 }, { "epoch": 0.24, "grad_norm": 17.034503936767578, "learning_rate": 4.056603773584906e-07, "logps/chosen": -45.911746978759766, "logps/rejected": -57.7626953125, "loss": 0.6624, "losses/dpo": 0.7345125675201416, "losses/sft": 1.5799616575241089, "losses/total": 0.7345125675201416, "ref_logps/chosen": -42.90734100341797, "ref_logps/rejected": -54.03882598876953, "rewards/accuracies": 0.625, "rewards/chosen": -0.3004402220249176, "rewards/margins": 0.07194683700799942, "rewards/rejected": -0.3723870813846588, "step": 258 }, { "epoch": 0.24, "grad_norm": 14.38132381439209, "learning_rate": 4.072327044025157e-07, "logps/chosen": -30.72813606262207, "logps/rejected": -42.7314567565918, "loss": 0.6639, "losses/dpo": 0.6390981078147888, "losses/sft": 1.2096134424209595, "losses/total": 0.6390981078147888, "ref_logps/chosen": -28.13254165649414, "ref_logps/rejected": -39.48625183105469, "rewards/accuracies": 0.625, "rewards/chosen": -0.25955939292907715, "rewards/margins": 0.06496097147464752, "rewards/rejected": -0.3245203495025635, "step": 259 }, { "epoch": 0.25, "grad_norm": 13.774007797241211, "learning_rate": 4.088050314465409e-07, "logps/chosen": -34.30670166015625, "logps/rejected": -39.893516540527344, "loss": 0.6894, "losses/dpo": 0.6727636456489563, "losses/sft": 1.287360429763794, "losses/total": 0.6727636456489563, "ref_logps/chosen": -31.98028564453125, "ref_logps/rejected": -37.41706085205078, "rewards/accuracies": 0.5, "rewards/chosen": -0.23264184594154358, "rewards/margins": 0.015003605745732784, "rewards/rejected": -0.24764545261859894, "step": 260 }, { "epoch": 0.25, "grad_norm": 15.514729499816895, "learning_rate": 4.1037735849056606e-07, "logps/chosen": -40.19240188598633, "logps/rejected": -44.82511901855469, "loss": 0.6775, "losses/dpo": 0.6544908285140991, "losses/sft": 1.601394534111023, "losses/total": 0.6544908285140991, "ref_logps/chosen": -37.0391845703125, "ref_logps/rejected": -41.2575798034668, "rewards/accuracies": 0.625, "rewards/chosen": -0.31532156467437744, "rewards/margins": 0.04143207520246506, "rewards/rejected": -0.3567536473274231, "step": 261 }, { "epoch": 0.25, "grad_norm": 17.19778060913086, "learning_rate": 4.119496855345912e-07, "logps/chosen": -40.04396057128906, "logps/rejected": -54.078495025634766, "loss": 0.6426, "losses/dpo": 0.7065551280975342, "losses/sft": 1.3404619693756104, "losses/total": 0.7065551280975342, "ref_logps/chosen": -37.421043395996094, "ref_logps/rejected": -50.26447677612305, "rewards/accuracies": 0.625, "rewards/chosen": -0.2622915506362915, "rewards/margins": 0.11911033093929291, "rewards/rejected": -0.3814018964767456, "step": 262 }, { "epoch": 0.25, "grad_norm": 14.092816352844238, "learning_rate": 4.1352201257861636e-07, "logps/chosen": -39.34889221191406, "logps/rejected": -41.527000427246094, "loss": 0.6621, "losses/dpo": 0.6810022592544556, "losses/sft": 1.7040151357650757, "losses/total": 0.6810022592544556, "ref_logps/chosen": -37.01990509033203, "ref_logps/rejected": -38.464027404785156, "rewards/accuracies": 0.625, "rewards/chosen": -0.23289905488491058, "rewards/margins": 0.07339858263731003, "rewards/rejected": -0.3062976598739624, "step": 263 }, { "epoch": 0.25, "grad_norm": 17.33518409729004, "learning_rate": 4.150943396226415e-07, "logps/chosen": -50.4212532043457, "logps/rejected": -47.210941314697266, "loss": 0.7211, "losses/dpo": 0.7034216523170471, "losses/sft": 1.845947027206421, "losses/total": 0.7034216523170471, "ref_logps/chosen": -45.92155456542969, "ref_logps/rejected": -43.09968566894531, "rewards/accuracies": 0.625, "rewards/chosen": -0.4499703645706177, "rewards/margins": -0.03884504735469818, "rewards/rejected": -0.4111253023147583, "step": 264 }, { "epoch": 0.25, "grad_norm": 15.276748657226562, "learning_rate": 4.1666666666666667e-07, "logps/chosen": -35.351654052734375, "logps/rejected": -52.00558090209961, "loss": 0.6312, "losses/dpo": 0.7081926465034485, "losses/sft": 1.2190580368041992, "losses/total": 0.7081926465034485, "ref_logps/chosen": -32.42749786376953, "ref_logps/rejected": -47.676856994628906, "rewards/accuracies": 0.75, "rewards/chosen": -0.2924157679080963, "rewards/margins": 0.1404569447040558, "rewards/rejected": -0.4328727126121521, "step": 265 }, { "epoch": 0.25, "grad_norm": 17.348905563354492, "learning_rate": 4.1823899371069184e-07, "logps/chosen": -39.68585968017578, "logps/rejected": -52.619834899902344, "loss": 0.6579, "losses/dpo": 0.614298939704895, "losses/sft": 1.1146241426467896, "losses/total": 0.614298939704895, "ref_logps/chosen": -37.08367919921875, "ref_logps/rejected": -49.21800231933594, "rewards/accuracies": 0.5625, "rewards/chosen": -0.26021838188171387, "rewards/margins": 0.07996508479118347, "rewards/rejected": -0.3401834964752197, "step": 266 }, { "epoch": 0.25, "grad_norm": 15.063830375671387, "learning_rate": 4.1981132075471697e-07, "logps/chosen": -40.76624298095703, "logps/rejected": -52.1614990234375, "loss": 0.632, "losses/dpo": 0.6369799375534058, "losses/sft": 1.5126986503601074, "losses/total": 0.6369799375534058, "ref_logps/chosen": -38.486122131347656, "ref_logps/rejected": -48.50868606567383, "rewards/accuracies": 0.75, "rewards/chosen": -0.22801154851913452, "rewards/margins": 0.13726995885372162, "rewards/rejected": -0.36528149247169495, "step": 267 }, { "epoch": 0.25, "grad_norm": 15.28031063079834, "learning_rate": 4.2138364779874214e-07, "logps/chosen": -31.515867233276367, "logps/rejected": -45.040252685546875, "loss": 0.6305, "losses/dpo": 0.623494029045105, "losses/sft": 1.4976038932800293, "losses/total": 0.623494029045105, "ref_logps/chosen": -29.37891387939453, "ref_logps/rejected": -41.54468536376953, "rewards/accuracies": 0.875, "rewards/chosen": -0.21369558572769165, "rewards/margins": 0.13586114346981049, "rewards/rejected": -0.34955674409866333, "step": 268 }, { "epoch": 0.25, "grad_norm": 19.609994888305664, "learning_rate": 4.229559748427673e-07, "logps/chosen": -46.337158203125, "logps/rejected": -61.737632751464844, "loss": 0.6453, "losses/dpo": 0.7226425409317017, "losses/sft": 1.4349713325500488, "losses/total": 0.7226425409317017, "ref_logps/chosen": -43.272056579589844, "ref_logps/rejected": -57.550846099853516, "rewards/accuracies": 0.625, "rewards/chosen": -0.3065102696418762, "rewards/margins": 0.11216841638088226, "rewards/rejected": -0.4186787009239197, "step": 269 }, { "epoch": 0.25, "grad_norm": 15.402117729187012, "learning_rate": 4.2452830188679244e-07, "logps/chosen": -38.63148880004883, "logps/rejected": -51.09016418457031, "loss": 0.6697, "losses/dpo": 0.668036699295044, "losses/sft": 1.8736562728881836, "losses/total": 0.668036699295044, "ref_logps/chosen": -35.629493713378906, "ref_logps/rejected": -47.55656814575195, "rewards/accuracies": 0.75, "rewards/chosen": -0.3001996576786041, "rewards/margins": 0.05315982550382614, "rewards/rejected": -0.35335949063301086, "step": 270 }, { "epoch": 0.26, "grad_norm": 16.25444984436035, "learning_rate": 4.261006289308176e-07, "logps/chosen": -45.08386993408203, "logps/rejected": -40.419898986816406, "loss": 0.7001, "losses/dpo": 0.6437594890594482, "losses/sft": 1.251446008682251, "losses/total": 0.6437594890594482, "ref_logps/chosen": -41.53697967529297, "ref_logps/rejected": -36.89950942993164, "rewards/accuracies": 0.5, "rewards/chosen": -0.35468873381614685, "rewards/margins": -0.0026499181985855103, "rewards/rejected": -0.35203880071640015, "step": 271 }, { "epoch": 0.26, "grad_norm": 14.902688980102539, "learning_rate": 4.2767295597484274e-07, "logps/chosen": -28.657167434692383, "logps/rejected": -48.08135223388672, "loss": 0.6213, "losses/dpo": 0.5799668431282043, "losses/sft": 1.2232944965362549, "losses/total": 0.5799668431282043, "ref_logps/chosen": -26.796960830688477, "ref_logps/rejected": -44.65399169921875, "rewards/accuracies": 0.875, "rewards/chosen": -0.18602056801319122, "rewards/margins": 0.1567152738571167, "rewards/rejected": -0.3427358567714691, "step": 272 }, { "epoch": 0.26, "grad_norm": 15.471696853637695, "learning_rate": 4.292452830188679e-07, "logps/chosen": -42.2933235168457, "logps/rejected": -45.91864013671875, "loss": 0.6648, "losses/dpo": 0.6357539892196655, "losses/sft": 1.821222186088562, "losses/total": 0.6357539892196655, "ref_logps/chosen": -39.12578582763672, "ref_logps/rejected": -42.10855484008789, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3167540431022644, "rewards/margins": 0.06425458937883377, "rewards/rejected": -0.3810086250305176, "step": 273 }, { "epoch": 0.26, "grad_norm": 15.276728630065918, "learning_rate": 4.308176100628931e-07, "logps/chosen": -34.20111846923828, "logps/rejected": -56.20793151855469, "loss": 0.5948, "losses/dpo": 0.6406265497207642, "losses/sft": 1.4846235513687134, "losses/total": 0.6406265497207642, "ref_logps/chosen": -31.65646743774414, "ref_logps/rejected": -51.374366760253906, "rewards/accuracies": 0.875, "rewards/chosen": -0.2544652223587036, "rewards/margins": 0.22889135777950287, "rewards/rejected": -0.4833565652370453, "step": 274 }, { "epoch": 0.26, "grad_norm": 16.227323532104492, "learning_rate": 4.323899371069182e-07, "logps/chosen": -44.5732421875, "logps/rejected": -54.36494445800781, "loss": 0.6407, "losses/dpo": 0.6271330118179321, "losses/sft": 1.2677981853485107, "losses/total": 0.6271330118179321, "ref_logps/chosen": -41.26808547973633, "ref_logps/rejected": -49.88677978515625, "rewards/accuracies": 0.625, "rewards/chosen": -0.33051571249961853, "rewards/margins": 0.11730123311281204, "rewards/rejected": -0.44781696796417236, "step": 275 }, { "epoch": 0.26, "grad_norm": 14.46500015258789, "learning_rate": 4.339622641509434e-07, "logps/chosen": -32.21154022216797, "logps/rejected": -40.5195198059082, "loss": 0.6569, "losses/dpo": 0.657302737236023, "losses/sft": 1.18999183177948, "losses/total": 0.657302737236023, "ref_logps/chosen": -29.70412826538086, "ref_logps/rejected": -37.229637145996094, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2507411539554596, "rewards/margins": 0.07824680209159851, "rewards/rejected": -0.3289879560470581, "step": 276 }, { "epoch": 0.26, "grad_norm": 15.547643661499023, "learning_rate": 4.355345911949685e-07, "logps/chosen": -38.443050384521484, "logps/rejected": -50.91439437866211, "loss": 0.608, "losses/dpo": 0.5969237685203552, "losses/sft": 1.4887139797210693, "losses/total": 0.5969237685203552, "ref_logps/chosen": -35.83464431762695, "ref_logps/rejected": -46.121028900146484, "rewards/accuracies": 0.625, "rewards/chosen": -0.2608405649662018, "rewards/margins": 0.21849608421325684, "rewards/rejected": -0.4793366491794586, "step": 277 }, { "epoch": 0.26, "grad_norm": 15.859171867370605, "learning_rate": 4.371069182389937e-07, "logps/chosen": -45.779476165771484, "logps/rejected": -41.80155944824219, "loss": 0.6846, "losses/dpo": 0.6578966379165649, "losses/sft": 1.501111388206482, "losses/total": 0.6578966379165649, "ref_logps/chosen": -43.15447235107422, "ref_logps/rejected": -38.86341857910156, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2625007927417755, "rewards/margins": 0.031313199549913406, "rewards/rejected": -0.2938140034675598, "step": 278 }, { "epoch": 0.26, "grad_norm": 16.175655364990234, "learning_rate": 4.386792452830189e-07, "logps/chosen": -42.040733337402344, "logps/rejected": -45.3043212890625, "loss": 0.6454, "losses/dpo": 0.6217872500419617, "losses/sft": 1.1767728328704834, "losses/total": 0.6217872500419617, "ref_logps/chosen": -39.03452682495117, "ref_logps/rejected": -41.25267791748047, "rewards/accuracies": 0.75, "rewards/chosen": -0.3006206452846527, "rewards/margins": 0.10454416275024414, "rewards/rejected": -0.40516480803489685, "step": 279 }, { "epoch": 0.26, "grad_norm": 15.101045608520508, "learning_rate": 4.40251572327044e-07, "logps/chosen": -39.660789489746094, "logps/rejected": -51.88922882080078, "loss": 0.6431, "losses/dpo": 0.673517107963562, "losses/sft": 1.537069320678711, "losses/total": 0.673517107963562, "ref_logps/chosen": -37.26958465576172, "ref_logps/rejected": -48.36371612548828, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2391205132007599, "rewards/margins": 0.11343112587928772, "rewards/rejected": -0.3525516390800476, "step": 280 }, { "epoch": 0.27, "grad_norm": 18.64141273498535, "learning_rate": 4.418238993710692e-07, "logps/chosen": -49.59341812133789, "logps/rejected": -55.329978942871094, "loss": 0.7113, "losses/dpo": 0.7187132239341736, "losses/sft": 1.7550328969955444, "losses/total": 0.7187132239341736, "ref_logps/chosen": -45.04663848876953, "ref_logps/rejected": -51.04995346069336, "rewards/accuracies": 0.5, "rewards/chosen": -0.4546777307987213, "rewards/margins": -0.026675017550587654, "rewards/rejected": -0.4280027151107788, "step": 281 }, { "epoch": 0.27, "grad_norm": 15.597990036010742, "learning_rate": 4.4339622641509435e-07, "logps/chosen": -39.32229995727539, "logps/rejected": -57.20562744140625, "loss": 0.5939, "losses/dpo": 0.5364837646484375, "losses/sft": 0.9851723313331604, "losses/total": 0.5364837646484375, "ref_logps/chosen": -35.8394775390625, "ref_logps/rejected": -51.483863830566406, "rewards/accuracies": 0.6875, "rewards/chosen": -0.348282128572464, "rewards/margins": 0.22389397025108337, "rewards/rejected": -0.5721760988235474, "step": 282 }, { "epoch": 0.27, "grad_norm": 14.092550277709961, "learning_rate": 4.449685534591195e-07, "logps/chosen": -40.98548889160156, "logps/rejected": -47.132606506347656, "loss": 0.6047, "losses/dpo": 0.6200289726257324, "losses/sft": 1.5759824514389038, "losses/total": 0.6200289726257324, "ref_logps/chosen": -37.910160064697266, "ref_logps/rejected": -41.932498931884766, "rewards/accuracies": 0.625, "rewards/chosen": -0.3075331151485443, "rewards/margins": 0.21247738599777222, "rewards/rejected": -0.5200105309486389, "step": 283 }, { "epoch": 0.27, "grad_norm": 17.39341926574707, "learning_rate": 4.4654088050314465e-07, "logps/chosen": -41.997745513916016, "logps/rejected": -46.48918151855469, "loss": 0.6652, "losses/dpo": 0.6076216697692871, "losses/sft": 1.3608059883117676, "losses/total": 0.6076216697692871, "ref_logps/chosen": -39.23186492919922, "ref_logps/rejected": -42.779693603515625, "rewards/accuracies": 0.625, "rewards/chosen": -0.276588499546051, "rewards/margins": 0.09436051547527313, "rewards/rejected": -0.37094900012016296, "step": 284 }, { "epoch": 0.27, "grad_norm": 16.304155349731445, "learning_rate": 4.481132075471698e-07, "logps/chosen": -40.0236930847168, "logps/rejected": -44.648338317871094, "loss": 0.674, "losses/dpo": 0.6296372413635254, "losses/sft": 1.485594391822815, "losses/total": 0.6296372413635254, "ref_logps/chosen": -36.0826530456543, "ref_logps/rejected": -40.076412200927734, "rewards/accuracies": 0.5, "rewards/chosen": -0.39410385489463806, "rewards/margins": 0.06308881938457489, "rewards/rejected": -0.45719265937805176, "step": 285 }, { "epoch": 0.27, "grad_norm": 15.726293563842773, "learning_rate": 4.4968553459119495e-07, "logps/chosen": -39.88897705078125, "logps/rejected": -43.01799392700195, "loss": 0.684, "losses/dpo": 0.6568284630775452, "losses/sft": 1.9258939027786255, "losses/total": 0.6568284630775452, "ref_logps/chosen": -36.286231994628906, "ref_logps/rejected": -39.07062911987305, "rewards/accuracies": 0.5, "rewards/chosen": -0.36027443408966064, "rewards/margins": 0.03446207940578461, "rewards/rejected": -0.39473646879196167, "step": 286 }, { "epoch": 0.27, "grad_norm": 15.3701753616333, "learning_rate": 4.5125786163522013e-07, "logps/chosen": -35.64639663696289, "logps/rejected": -52.39430236816406, "loss": 0.6388, "losses/dpo": 0.6017330288887024, "losses/sft": 1.4482841491699219, "losses/total": 0.6017330288887024, "ref_logps/chosen": -32.51789093017578, "ref_logps/rejected": -48.03721237182617, "rewards/accuracies": 0.625, "rewards/chosen": -0.31285086274147034, "rewards/margins": 0.12285785377025604, "rewards/rejected": -0.4357087016105652, "step": 287 }, { "epoch": 0.27, "grad_norm": 16.409990310668945, "learning_rate": 4.5283018867924526e-07, "logps/chosen": -40.90259552001953, "logps/rejected": -34.40892791748047, "loss": 0.7107, "losses/dpo": 0.658980667591095, "losses/sft": 1.2520112991333008, "losses/total": 0.658980667591095, "ref_logps/chosen": -37.784332275390625, "ref_logps/rejected": -31.530216217041016, "rewards/accuracies": 0.5, "rewards/chosen": -0.31182605028152466, "rewards/margins": -0.023954953998327255, "rewards/rejected": -0.2878710925579071, "step": 288 }, { "epoch": 0.27, "grad_norm": 16.037878036499023, "learning_rate": 4.5440251572327043e-07, "logps/chosen": -46.21577453613281, "logps/rejected": -39.91151428222656, "loss": 0.6883, "losses/dpo": 0.6805859804153442, "losses/sft": 1.5793442726135254, "losses/total": 0.6805859804153442, "ref_logps/chosen": -42.846458435058594, "ref_logps/rejected": -36.31890106201172, "rewards/accuracies": 0.5, "rewards/chosen": -0.33693137764930725, "rewards/margins": 0.02232995815575123, "rewards/rejected": -0.35926133394241333, "step": 289 }, { "epoch": 0.27, "grad_norm": 17.23445701599121, "learning_rate": 4.559748427672956e-07, "logps/chosen": -44.772178649902344, "logps/rejected": -57.92347717285156, "loss": 0.6689, "losses/dpo": 0.5138028264045715, "losses/sft": 1.6963366270065308, "losses/total": 0.5138028264045715, "ref_logps/chosen": -40.34746551513672, "ref_logps/rejected": -52.718589782714844, "rewards/accuracies": 0.6875, "rewards/chosen": -0.44247108697891235, "rewards/margins": 0.07801740616559982, "rewards/rejected": -0.5204885005950928, "step": 290 }, { "epoch": 0.27, "grad_norm": 16.649431228637695, "learning_rate": 4.5754716981132073e-07, "logps/chosen": -43.871299743652344, "logps/rejected": -52.043487548828125, "loss": 0.6531, "losses/dpo": 0.6663308143615723, "losses/sft": 1.8109893798828125, "losses/total": 0.6663308143615723, "ref_logps/chosen": -39.87586975097656, "ref_logps/rejected": -47.08435821533203, "rewards/accuracies": 0.625, "rewards/chosen": -0.39954280853271484, "rewards/margins": 0.09637051820755005, "rewards/rejected": -0.4959132671356201, "step": 291 }, { "epoch": 0.28, "grad_norm": 15.538848876953125, "learning_rate": 4.591194968553459e-07, "logps/chosen": -40.258453369140625, "logps/rejected": -43.68415069580078, "loss": 0.6393, "losses/dpo": 0.7158225774765015, "losses/sft": 1.4569058418273926, "losses/total": 0.7158225774765015, "ref_logps/chosen": -37.33110427856445, "ref_logps/rejected": -39.56146240234375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.29273444414138794, "rewards/margins": 0.11953487992286682, "rewards/rejected": -0.41226935386657715, "step": 292 }, { "epoch": 0.28, "grad_norm": 14.913241386413574, "learning_rate": 4.6069182389937103e-07, "logps/chosen": -36.74955749511719, "logps/rejected": -49.41448211669922, "loss": 0.656, "losses/dpo": 0.5618067979812622, "losses/sft": 0.859252393245697, "losses/total": 0.5618067979812622, "ref_logps/chosen": -33.257293701171875, "ref_logps/rejected": -44.97230911254883, "rewards/accuracies": 0.5625, "rewards/chosen": -0.34922653436660767, "rewards/margins": 0.09499108791351318, "rewards/rejected": -0.44421762228012085, "step": 293 }, { "epoch": 0.28, "grad_norm": 13.554713249206543, "learning_rate": 4.622641509433962e-07, "logps/chosen": -37.023826599121094, "logps/rejected": -33.654483795166016, "loss": 0.6445, "losses/dpo": 0.6806570887565613, "losses/sft": 1.4043481349945068, "losses/total": 0.6806570887565613, "ref_logps/chosen": -34.33789825439453, "ref_logps/rejected": -29.86209487915039, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2685929536819458, "rewards/margins": 0.11064592003822327, "rewards/rejected": -0.37923890352249146, "step": 294 }, { "epoch": 0.28, "grad_norm": 17.03995704650879, "learning_rate": 4.638364779874214e-07, "logps/chosen": -43.4785041809082, "logps/rejected": -44.47311782836914, "loss": 0.6641, "losses/dpo": 0.6484063267707825, "losses/sft": 1.5728685855865479, "losses/total": 0.6484063267707825, "ref_logps/chosen": -39.81079864501953, "ref_logps/rejected": -40.02204513549805, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3667706549167633, "rewards/margins": 0.07833677530288696, "rewards/rejected": -0.4451074004173279, "step": 295 }, { "epoch": 0.28, "grad_norm": 15.064468383789062, "learning_rate": 4.654088050314465e-07, "logps/chosen": -35.53520202636719, "logps/rejected": -44.49199676513672, "loss": 0.6791, "losses/dpo": 0.7307789325714111, "losses/sft": 1.2095739841461182, "losses/total": 0.7307789325714111, "ref_logps/chosen": -31.74903678894043, "ref_logps/rejected": -40.314491271972656, "rewards/accuracies": 0.625, "rewards/chosen": -0.3786165714263916, "rewards/margins": 0.039134155958890915, "rewards/rejected": -0.4177507162094116, "step": 296 }, { "epoch": 0.28, "grad_norm": 19.06511116027832, "learning_rate": 4.669811320754717e-07, "logps/chosen": -47.629703521728516, "logps/rejected": -59.77693176269531, "loss": 0.6795, "losses/dpo": 0.6098648309707642, "losses/sft": 0.9918664693832397, "losses/total": 0.6098648309707642, "ref_logps/chosen": -42.36842346191406, "ref_logps/rejected": -53.96903610229492, "rewards/accuracies": 0.625, "rewards/chosen": -0.5261276960372925, "rewards/margins": 0.05466217175126076, "rewards/rejected": -0.5807898640632629, "step": 297 }, { "epoch": 0.28, "grad_norm": 17.147062301635742, "learning_rate": 4.6855345911949686e-07, "logps/chosen": -47.28531265258789, "logps/rejected": -46.57255554199219, "loss": 0.663, "losses/dpo": 0.7391225695610046, "losses/sft": 1.6611230373382568, "losses/total": 0.7391225695610046, "ref_logps/chosen": -43.058223724365234, "ref_logps/rejected": -41.360260009765625, "rewards/accuracies": 0.625, "rewards/chosen": -0.4227089285850525, "rewards/margins": 0.09852064400911331, "rewards/rejected": -0.5212295651435852, "step": 298 }, { "epoch": 0.28, "grad_norm": 13.381155967712402, "learning_rate": 4.70125786163522e-07, "logps/chosen": -29.36941146850586, "logps/rejected": -36.3071403503418, "loss": 0.6522, "losses/dpo": 0.5655578374862671, "losses/sft": 1.265622854232788, "losses/total": 0.5655578374862671, "ref_logps/chosen": -25.40846824645996, "ref_logps/rejected": -31.21912956237793, "rewards/accuracies": 0.625, "rewards/chosen": -0.39609450101852417, "rewards/margins": 0.1127067506313324, "rewards/rejected": -0.5088012218475342, "step": 299 }, { "epoch": 0.28, "grad_norm": 14.62690544128418, "learning_rate": 4.7169811320754717e-07, "logps/chosen": -37.77018737792969, "logps/rejected": -48.2723388671875, "loss": 0.6441, "losses/dpo": 0.6590859889984131, "losses/sft": 1.4876482486724854, "losses/total": 0.6590859889984131, "ref_logps/chosen": -34.07875061035156, "ref_logps/rejected": -43.16485595703125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.369143545627594, "rewards/margins": 0.1416051685810089, "rewards/rejected": -0.5107487440109253, "step": 300 }, { "epoch": 0.28, "grad_norm": 18.411983489990234, "learning_rate": 4.732704402515723e-07, "logps/chosen": -46.09126281738281, "logps/rejected": -44.139404296875, "loss": 0.6791, "losses/dpo": 0.6460752487182617, "losses/sft": 1.3547742366790771, "losses/total": 0.6460752487182617, "ref_logps/chosen": -41.491634368896484, "ref_logps/rejected": -38.95182418823242, "rewards/accuracies": 0.5625, "rewards/chosen": -0.45996329188346863, "rewards/margins": 0.058794502168893814, "rewards/rejected": -0.5187578201293945, "step": 301 }, { "epoch": 0.29, "grad_norm": 18.769363403320312, "learning_rate": 4.7484276729559747e-07, "logps/chosen": -44.56507873535156, "logps/rejected": -48.06944274902344, "loss": 0.7326, "losses/dpo": 0.6647197008132935, "losses/sft": 1.8934367895126343, "losses/total": 0.6647197008132935, "ref_logps/chosen": -39.720436096191406, "ref_logps/rejected": -43.856685638427734, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4844643771648407, "rewards/margins": -0.0631885826587677, "rewards/rejected": -0.4212757647037506, "step": 302 }, { "epoch": 0.29, "grad_norm": 13.678750038146973, "learning_rate": 4.7641509433962264e-07, "logps/chosen": -29.137710571289062, "logps/rejected": -34.092384338378906, "loss": 0.6465, "losses/dpo": 0.7419373989105225, "losses/sft": 1.431159496307373, "losses/total": 0.7419373989105225, "ref_logps/chosen": -26.39360809326172, "ref_logps/rejected": -30.094730377197266, "rewards/accuracies": 0.75, "rewards/chosen": -0.2744103968143463, "rewards/margins": 0.12535496056079865, "rewards/rejected": -0.39976537227630615, "step": 303 }, { "epoch": 0.29, "grad_norm": 16.425840377807617, "learning_rate": 4.779874213836478e-07, "logps/chosen": -38.671836853027344, "logps/rejected": -46.90407943725586, "loss": 0.6697, "losses/dpo": 0.6034525036811829, "losses/sft": 1.153171420097351, "losses/total": 0.6034525036811829, "ref_logps/chosen": -35.337196350097656, "ref_logps/rejected": -42.98271942138672, "rewards/accuracies": 0.625, "rewards/chosen": -0.3334636688232422, "rewards/margins": 0.058672383427619934, "rewards/rejected": -0.3921360373497009, "step": 304 }, { "epoch": 0.29, "grad_norm": 15.527347564697266, "learning_rate": 4.795597484276729e-07, "logps/chosen": -43.831520080566406, "logps/rejected": -53.383304595947266, "loss": 0.5983, "losses/dpo": 0.6423917412757874, "losses/sft": 1.5524685382843018, "losses/total": 0.6423917412757874, "ref_logps/chosen": -40.01023864746094, "ref_logps/rejected": -47.300376892089844, "rewards/accuracies": 0.75, "rewards/chosen": -0.38212764263153076, "rewards/margins": 0.22616487741470337, "rewards/rejected": -0.6082925200462341, "step": 305 }, { "epoch": 0.29, "grad_norm": 15.913620948791504, "learning_rate": 4.811320754716981e-07, "logps/chosen": -36.82072448730469, "logps/rejected": -43.5571174621582, "loss": 0.6416, "losses/dpo": 0.5956966280937195, "losses/sft": 1.20928156375885, "losses/total": 0.5956966280937195, "ref_logps/chosen": -34.06861877441406, "ref_logps/rejected": -39.47810363769531, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2752102315425873, "rewards/margins": 0.1326913982629776, "rewards/rejected": -0.4079016447067261, "step": 306 }, { "epoch": 0.29, "grad_norm": 15.387386322021484, "learning_rate": 4.827044025157232e-07, "logps/chosen": -35.70310974121094, "logps/rejected": -40.58681106567383, "loss": 0.6603, "losses/dpo": 0.5975790619850159, "losses/sft": 1.0019665956497192, "losses/total": 0.5975790619850159, "ref_logps/chosen": -32.418556213378906, "ref_logps/rejected": -36.39390182495117, "rewards/accuracies": 0.625, "rewards/chosen": -0.32845550775527954, "rewards/margins": 0.09083528816699982, "rewards/rejected": -0.41929078102111816, "step": 307 }, { "epoch": 0.29, "grad_norm": 17.168498992919922, "learning_rate": 4.842767295597484e-07, "logps/chosen": -37.426513671875, "logps/rejected": -38.88938903808594, "loss": 0.7197, "losses/dpo": 0.7147113084793091, "losses/sft": 1.8871713876724243, "losses/total": 0.7147113084793091, "ref_logps/chosen": -33.53988265991211, "ref_logps/rejected": -35.286476135253906, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3886632025241852, "rewards/margins": -0.028372403234243393, "rewards/rejected": -0.3602908253669739, "step": 308 }, { "epoch": 0.29, "grad_norm": 15.583175659179688, "learning_rate": 4.858490566037736e-07, "logps/chosen": -38.89422607421875, "logps/rejected": -37.618263244628906, "loss": 0.7198, "losses/dpo": 0.7783017158508301, "losses/sft": 1.1059563159942627, "losses/total": 0.7783017158508301, "ref_logps/chosen": -33.88743591308594, "ref_logps/rejected": -33.0002326965332, "rewards/accuracies": 0.5, "rewards/chosen": -0.5006794333457947, "rewards/margins": -0.03887690603733063, "rewards/rejected": -0.46180254220962524, "step": 309 }, { "epoch": 0.29, "grad_norm": 18.440391540527344, "learning_rate": 4.874213836477988e-07, "logps/chosen": -42.746482849121094, "logps/rejected": -54.17639923095703, "loss": 0.7278, "losses/dpo": 0.7887463569641113, "losses/sft": 1.51253342628479, "losses/total": 0.7887463569641113, "ref_logps/chosen": -37.78423309326172, "ref_logps/rejected": -49.63832092285156, "rewards/accuracies": 0.5, "rewards/chosen": -0.49622464179992676, "rewards/margins": -0.04241668060421944, "rewards/rejected": -0.4538079500198364, "step": 310 }, { "epoch": 0.29, "grad_norm": 13.605622291564941, "learning_rate": 4.889937106918238e-07, "logps/chosen": -31.56808090209961, "logps/rejected": -46.14800262451172, "loss": 0.5595, "losses/dpo": 0.5758316516876221, "losses/sft": 1.1419419050216675, "losses/total": 0.5758316516876221, "ref_logps/chosen": -28.66701889038086, "ref_logps/rejected": -40.081214904785156, "rewards/accuracies": 0.875, "rewards/chosen": -0.29010629653930664, "rewards/margins": 0.316572904586792, "rewards/rejected": -0.6066792011260986, "step": 311 }, { "epoch": 0.29, "grad_norm": 13.29858112335205, "learning_rate": 4.90566037735849e-07, "logps/chosen": -33.10694885253906, "logps/rejected": -40.835933685302734, "loss": 0.6434, "losses/dpo": 0.6015459299087524, "losses/sft": 1.8418209552764893, "losses/total": 0.6015459299087524, "ref_logps/chosen": -30.316028594970703, "ref_logps/rejected": -36.822509765625, "rewards/accuracies": 0.75, "rewards/chosen": -0.2790922522544861, "rewards/margins": 0.12225019186735153, "rewards/rejected": -0.4013424217700958, "step": 312 }, { "epoch": 0.3, "grad_norm": 15.46154499053955, "learning_rate": 4.921383647798742e-07, "logps/chosen": -45.18656539916992, "logps/rejected": -46.45176696777344, "loss": 0.641, "losses/dpo": 0.6185716986656189, "losses/sft": 1.4683407545089722, "losses/total": 0.6185716986656189, "ref_logps/chosen": -40.97571563720703, "ref_logps/rejected": -41.00298309326172, "rewards/accuracies": 0.75, "rewards/chosen": -0.4210848808288574, "rewards/margins": 0.1237938180565834, "rewards/rejected": -0.5448787212371826, "step": 313 }, { "epoch": 0.3, "grad_norm": 19.87638282775879, "learning_rate": 4.937106918238994e-07, "logps/chosen": -50.0240478515625, "logps/rejected": -55.17597198486328, "loss": 0.7361, "losses/dpo": 0.7950786352157593, "losses/sft": 1.617371916770935, "losses/total": 0.7950786352157593, "ref_logps/chosen": -44.95348358154297, "ref_logps/rejected": -50.52076721191406, "rewards/accuracies": 0.4375, "rewards/chosen": -0.507056713104248, "rewards/margins": -0.041536275297403336, "rewards/rejected": -0.465520441532135, "step": 314 }, { "epoch": 0.3, "grad_norm": 15.329681396484375, "learning_rate": 4.952830188679246e-07, "logps/chosen": -42.27714920043945, "logps/rejected": -46.185882568359375, "loss": 0.6733, "losses/dpo": 0.6167937517166138, "losses/sft": 1.1877564191818237, "losses/total": 0.6167937517166138, "ref_logps/chosen": -38.369590759277344, "ref_logps/rejected": -41.6512336730957, "rewards/accuracies": 0.5625, "rewards/chosen": -0.39075565338134766, "rewards/margins": 0.06270917505025864, "rewards/rejected": -0.4534648358821869, "step": 315 }, { "epoch": 0.3, "grad_norm": 13.328225135803223, "learning_rate": 4.968553459119496e-07, "logps/chosen": -29.752016067504883, "logps/rejected": -41.991233825683594, "loss": 0.6141, "losses/dpo": 0.6650824546813965, "losses/sft": 1.4727708101272583, "losses/total": 0.6650824546813965, "ref_logps/chosen": -27.185718536376953, "ref_logps/rejected": -37.60988235473633, "rewards/accuracies": 0.75, "rewards/chosen": -0.2566297650337219, "rewards/margins": 0.18150556087493896, "rewards/rejected": -0.4381353557109833, "step": 316 }, { "epoch": 0.3, "grad_norm": 13.903385162353516, "learning_rate": 4.984276729559748e-07, "logps/chosen": -28.24636459350586, "logps/rejected": -53.85102081298828, "loss": 0.5736, "losses/dpo": 0.544606626033783, "losses/sft": 1.4416906833648682, "losses/total": 0.544606626033783, "ref_logps/chosen": -25.089126586914062, "ref_logps/rejected": -47.75431823730469, "rewards/accuracies": 0.8125, "rewards/chosen": -0.31572383642196655, "rewards/margins": 0.29394596815109253, "rewards/rejected": -0.6096698045730591, "step": 317 }, { "epoch": 0.3, "grad_norm": 16.531278610229492, "learning_rate": 5e-07, "logps/chosen": -46.63761520385742, "logps/rejected": -52.220245361328125, "loss": 0.6263, "losses/dpo": 0.5461878180503845, "losses/sft": 1.6149392127990723, "losses/total": 0.5461878180503845, "ref_logps/chosen": -42.58404541015625, "ref_logps/rejected": -46.41576385498047, "rewards/accuracies": 0.6875, "rewards/chosen": -0.40535682439804077, "rewards/margins": 0.17509107291698456, "rewards/rejected": -0.5804479122161865, "step": 318 }, { "epoch": 0.3, "grad_norm": 16.40178871154785, "learning_rate": 4.998251136761105e-07, "logps/chosen": -35.51566696166992, "logps/rejected": -49.572017669677734, "loss": 0.6223, "losses/dpo": 0.6416157484054565, "losses/sft": 1.3145924806594849, "losses/total": 0.6416157484054565, "ref_logps/chosen": -32.12255859375, "ref_logps/rejected": -44.47797393798828, "rewards/accuracies": 0.75, "rewards/chosen": -0.3393109142780304, "rewards/margins": 0.1700935810804367, "rewards/rejected": -0.5094045400619507, "step": 319 }, { "epoch": 0.3, "grad_norm": 19.217369079589844, "learning_rate": 4.99650227352221e-07, "logps/chosen": -43.56651306152344, "logps/rejected": -43.700965881347656, "loss": 0.7009, "losses/dpo": 0.7487107515335083, "losses/sft": 1.6572529077529907, "losses/total": 0.7487107515335083, "ref_logps/chosen": -39.02032470703125, "ref_logps/rejected": -38.94599151611328, "rewards/accuracies": 0.375, "rewards/chosen": -0.45461931824684143, "rewards/margins": 0.020878277719020844, "rewards/rejected": -0.4754975736141205, "step": 320 }, { "epoch": 0.3, "grad_norm": 16.932418823242188, "learning_rate": 4.994753410283315e-07, "logps/chosen": -45.408233642578125, "logps/rejected": -43.65406799316406, "loss": 0.618, "losses/dpo": 0.6029354333877563, "losses/sft": 1.5400285720825195, "losses/total": 0.6029354333877563, "ref_logps/chosen": -41.826934814453125, "ref_logps/rejected": -38.315242767333984, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3581298291683197, "rewards/margins": 0.1757524013519287, "rewards/rejected": -0.5338822603225708, "step": 321 }, { "epoch": 0.3, "grad_norm": 18.416934967041016, "learning_rate": 4.993004547044421e-07, "logps/chosen": -39.20286560058594, "logps/rejected": -55.60207748413086, "loss": 0.673, "losses/dpo": 0.6724597215652466, "losses/sft": 1.8351119756698608, "losses/total": 0.6724597215652466, "ref_logps/chosen": -34.607269287109375, "ref_logps/rejected": -50.36186218261719, "rewards/accuracies": 0.6875, "rewards/chosen": -0.45955926179885864, "rewards/margins": 0.06446242332458496, "rewards/rejected": -0.5240216851234436, "step": 322 }, { "epoch": 0.31, "grad_norm": 14.532320022583008, "learning_rate": 4.991255683805526e-07, "logps/chosen": -38.145042419433594, "logps/rejected": -46.594970703125, "loss": 0.6046, "losses/dpo": 0.5145504474639893, "losses/sft": 1.2098132371902466, "losses/total": 0.5145504474639893, "ref_logps/chosen": -34.41341781616211, "ref_logps/rejected": -40.75916290283203, "rewards/accuracies": 0.625, "rewards/chosen": -0.37316226959228516, "rewards/margins": 0.21041880548000336, "rewards/rejected": -0.5835810899734497, "step": 323 }, { "epoch": 0.31, "grad_norm": 18.384719848632812, "learning_rate": 4.989506820566632e-07, "logps/chosen": -53.385643005371094, "logps/rejected": -59.70347595214844, "loss": 0.6883, "losses/dpo": 0.7504025101661682, "losses/sft": 1.5278271436691284, "losses/total": 0.7504025101661682, "ref_logps/chosen": -47.14939880371094, "ref_logps/rejected": -52.856712341308594, "rewards/accuracies": 0.625, "rewards/chosen": -0.6236246824264526, "rewards/margins": 0.061051853001117706, "rewards/rejected": -0.6846765279769897, "step": 324 }, { "epoch": 0.31, "grad_norm": 19.458702087402344, "learning_rate": 4.987757957327737e-07, "logps/chosen": -51.701927185058594, "logps/rejected": -49.56433868408203, "loss": 0.6573, "losses/dpo": 0.6533963680267334, "losses/sft": 1.3683087825775146, "losses/total": 0.6533963680267334, "ref_logps/chosen": -47.254798889160156, "ref_logps/rejected": -44.163543701171875, "rewards/accuracies": 0.75, "rewards/chosen": -0.444713294506073, "rewards/margins": 0.09536644071340561, "rewards/rejected": -0.5400797128677368, "step": 325 }, { "epoch": 0.31, "grad_norm": 16.315893173217773, "learning_rate": 4.986009094088842e-07, "logps/chosen": -44.640541076660156, "logps/rejected": -46.98735427856445, "loss": 0.6286, "losses/dpo": 0.5390735268592834, "losses/sft": 1.1266926527023315, "losses/total": 0.5390735268592834, "ref_logps/chosen": -40.5927619934082, "ref_logps/rejected": -41.14649963378906, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4047783613204956, "rewards/margins": 0.1793069839477539, "rewards/rejected": -0.5840853452682495, "step": 326 }, { "epoch": 0.31, "grad_norm": 15.500914573669434, "learning_rate": 4.984260230849947e-07, "logps/chosen": -45.44721984863281, "logps/rejected": -47.130775451660156, "loss": 0.6625, "losses/dpo": 0.6081020832061768, "losses/sft": 1.2721569538116455, "losses/total": 0.6081020832061768, "ref_logps/chosen": -40.59376907348633, "ref_logps/rejected": -41.537742614746094, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4853447675704956, "rewards/margins": 0.07395829260349274, "rewards/rejected": -0.5593030452728271, "step": 327 }, { "epoch": 0.31, "grad_norm": 17.627470016479492, "learning_rate": 4.982511367611052e-07, "logps/chosen": -47.881874084472656, "logps/rejected": -57.78369140625, "loss": 0.6523, "losses/dpo": 0.5345410108566284, "losses/sft": 1.894019365310669, "losses/total": 0.5345410108566284, "ref_logps/chosen": -42.40503692626953, "ref_logps/rejected": -51.038658142089844, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5476834774017334, "rewards/margins": 0.12682020664215088, "rewards/rejected": -0.6745036840438843, "step": 328 }, { "epoch": 0.31, "grad_norm": 16.869220733642578, "learning_rate": 4.980762504372158e-07, "logps/chosen": -46.1259765625, "logps/rejected": -49.09648132324219, "loss": 0.6417, "losses/dpo": 0.6740195751190186, "losses/sft": 1.8484153747558594, "losses/total": 0.6740195751190186, "ref_logps/chosen": -41.715606689453125, "ref_logps/rejected": -43.21916580200195, "rewards/accuracies": 0.5, "rewards/chosen": -0.441037118434906, "rewards/margins": 0.14669471979141235, "rewards/rejected": -0.5877318382263184, "step": 329 }, { "epoch": 0.31, "grad_norm": 16.697179794311523, "learning_rate": 4.979013641133263e-07, "logps/chosen": -44.72831344604492, "logps/rejected": -45.09574508666992, "loss": 0.6805, "losses/dpo": 0.6937955617904663, "losses/sft": 1.5024570226669312, "losses/total": 0.6937955617904663, "ref_logps/chosen": -39.872230529785156, "ref_logps/rejected": -39.67607879638672, "rewards/accuracies": 0.4375, "rewards/chosen": -0.48560816049575806, "rewards/margins": 0.05635836347937584, "rewards/rejected": -0.5419665575027466, "step": 330 }, { "epoch": 0.31, "grad_norm": 14.972700119018555, "learning_rate": 4.977264777894369e-07, "logps/chosen": -37.015037536621094, "logps/rejected": -52.2890625, "loss": 0.6358, "losses/dpo": 0.597869873046875, "losses/sft": 1.2581983804702759, "losses/total": 0.597869873046875, "ref_logps/chosen": -32.54080581665039, "ref_logps/rejected": -46.212852478027344, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4474233090877533, "rewards/margins": 0.1601974070072174, "rewards/rejected": -0.6076207160949707, "step": 331 }, { "epoch": 0.31, "grad_norm": 18.237545013427734, "learning_rate": 4.975515914655474e-07, "logps/chosen": -42.3123779296875, "logps/rejected": -54.46826171875, "loss": 0.6737, "losses/dpo": 0.7524250745773315, "losses/sft": 1.1088095903396606, "losses/total": 0.7524250745773315, "ref_logps/chosen": -36.97213363647461, "ref_logps/rejected": -48.25359344482422, "rewards/accuracies": 0.75, "rewards/chosen": -0.5340246558189392, "rewards/margins": 0.08744189143180847, "rewards/rejected": -0.6214665174484253, "step": 332 }, { "epoch": 0.31, "grad_norm": 15.017657279968262, "learning_rate": 4.973767051416579e-07, "logps/chosen": -42.169715881347656, "logps/rejected": -40.91410827636719, "loss": 0.6542, "losses/dpo": 0.7065522074699402, "losses/sft": 1.538163423538208, "losses/total": 0.7065522074699402, "ref_logps/chosen": -37.17097854614258, "ref_logps/rejected": -35.0113525390625, "rewards/accuracies": 0.625, "rewards/chosen": -0.49987372756004333, "rewards/margins": 0.09040142595767975, "rewards/rejected": -0.5902751684188843, "step": 333 }, { "epoch": 0.32, "grad_norm": 15.769179344177246, "learning_rate": 4.972018188177684e-07, "logps/chosen": -37.79359817504883, "logps/rejected": -43.023658752441406, "loss": 0.6463, "losses/dpo": 0.6829018592834473, "losses/sft": 1.1957991123199463, "losses/total": 0.6829018592834473, "ref_logps/chosen": -34.46954345703125, "ref_logps/rejected": -38.55085754394531, "rewards/accuracies": 0.5625, "rewards/chosen": -0.33240556716918945, "rewards/margins": 0.11487418413162231, "rewards/rejected": -0.44727975130081177, "step": 334 }, { "epoch": 0.32, "grad_norm": 19.166316986083984, "learning_rate": 4.970269324938789e-07, "logps/chosen": -43.17356491088867, "logps/rejected": -43.072242736816406, "loss": 0.6281, "losses/dpo": 0.45466917753219604, "losses/sft": 1.2970844507217407, "losses/total": 0.45466917753219604, "ref_logps/chosen": -39.01023864746094, "ref_logps/rejected": -37.182247161865234, "rewards/accuracies": 0.5625, "rewards/chosen": -0.41633301973342896, "rewards/margins": 0.1726670116186142, "rewards/rejected": -0.5889999866485596, "step": 335 }, { "epoch": 0.32, "grad_norm": 17.412860870361328, "learning_rate": 4.968520461699895e-07, "logps/chosen": -37.92523193359375, "logps/rejected": -52.40209197998047, "loss": 0.5764, "losses/dpo": 0.5267322063446045, "losses/sft": 1.3428879976272583, "losses/total": 0.5267322063446045, "ref_logps/chosen": -34.152950286865234, "ref_logps/rejected": -45.85675048828125, "rewards/accuracies": 0.75, "rewards/chosen": -0.3772285580635071, "rewards/margins": 0.27730560302734375, "rewards/rejected": -0.654534101486206, "step": 336 }, { "epoch": 0.32, "grad_norm": 16.409772872924805, "learning_rate": 4.966771598461e-07, "logps/chosen": -48.05484390258789, "logps/rejected": -48.352081298828125, "loss": 0.583, "losses/dpo": 0.5335006713867188, "losses/sft": 1.2728737592697144, "losses/total": 0.5335006713867188, "ref_logps/chosen": -43.75884246826172, "ref_logps/rejected": -41.4486083984375, "rewards/accuracies": 0.875, "rewards/chosen": -0.4296002984046936, "rewards/margins": 0.2607470452785492, "rewards/rejected": -0.6903473138809204, "step": 337 }, { "epoch": 0.32, "grad_norm": 17.714736938476562, "learning_rate": 4.965022735222105e-07, "logps/chosen": -43.32847213745117, "logps/rejected": -53.19118881225586, "loss": 0.6944, "losses/dpo": 0.68752521276474, "losses/sft": 1.2535369396209717, "losses/total": 0.68752521276474, "ref_logps/chosen": -37.737205505371094, "ref_logps/rejected": -47.28385925292969, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5591270923614502, "rewards/margins": 0.031605955213308334, "rewards/rejected": -0.5907330513000488, "step": 338 }, { "epoch": 0.32, "grad_norm": 18.90288543701172, "learning_rate": 4.963273871983211e-07, "logps/chosen": -43.99187469482422, "logps/rejected": -59.82645034790039, "loss": 0.6313, "losses/dpo": 0.6904228329658508, "losses/sft": 1.516309380531311, "losses/total": 0.6904228329658508, "ref_logps/chosen": -38.480098724365234, "ref_logps/rejected": -52.933963775634766, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5511775016784668, "rewards/margins": 0.1380712389945984, "rewards/rejected": -0.6892487406730652, "step": 339 }, { "epoch": 0.32, "grad_norm": 14.638052940368652, "learning_rate": 4.961525008744316e-07, "logps/chosen": -32.529903411865234, "logps/rejected": -44.50151443481445, "loss": 0.6092, "losses/dpo": 0.4741971492767334, "losses/sft": 1.4373074769973755, "losses/total": 0.4741971492767334, "ref_logps/chosen": -28.500930786132812, "ref_logps/rejected": -38.405235290527344, "rewards/accuracies": 0.75, "rewards/chosen": -0.4028974771499634, "rewards/margins": 0.20673055946826935, "rewards/rejected": -0.6096280813217163, "step": 340 }, { "epoch": 0.32, "grad_norm": 17.94290542602539, "learning_rate": 4.959776145505421e-07, "logps/chosen": -54.526126861572266, "logps/rejected": -56.8326530456543, "loss": 0.6461, "losses/dpo": 0.7271311283111572, "losses/sft": 1.667887568473816, "losses/total": 0.7271311283111572, "ref_logps/chosen": -48.099334716796875, "ref_logps/rejected": -48.829864501953125, "rewards/accuracies": 0.5, "rewards/chosen": -0.6426790952682495, "rewards/margins": 0.15759959816932678, "rewards/rejected": -0.8002787232398987, "step": 341 }, { "epoch": 0.32, "grad_norm": 16.220985412597656, "learning_rate": 4.958027282266526e-07, "logps/chosen": -46.70875549316406, "logps/rejected": -44.699867248535156, "loss": 0.6325, "losses/dpo": 0.6219876408576965, "losses/sft": 1.7357341051101685, "losses/total": 0.6219876408576965, "ref_logps/chosen": -41.75714874267578, "ref_logps/rejected": -38.12370300292969, "rewards/accuracies": 0.625, "rewards/chosen": -0.49516016244888306, "rewards/margins": 0.16245661675930023, "rewards/rejected": -0.6576167345046997, "step": 342 }, { "epoch": 0.32, "grad_norm": 20.682252883911133, "learning_rate": 4.956278419027632e-07, "logps/chosen": -49.13526916503906, "logps/rejected": -47.93041229248047, "loss": 0.7831, "losses/dpo": 0.764042854309082, "losses/sft": 1.4871896505355835, "losses/total": 0.764042854309082, "ref_logps/chosen": -42.53851318359375, "ref_logps/rejected": -42.67013168334961, "rewards/accuracies": 0.375, "rewards/chosen": -0.6596754789352417, "rewards/margins": -0.1336475908756256, "rewards/rejected": -0.5260279178619385, "step": 343 }, { "epoch": 0.32, "grad_norm": 17.956283569335938, "learning_rate": 4.954529555788737e-07, "logps/chosen": -48.28112030029297, "logps/rejected": -59.63841247558594, "loss": 0.6415, "losses/dpo": 0.6288716793060303, "losses/sft": 1.1391419172286987, "losses/total": 0.6288716793060303, "ref_logps/chosen": -43.74011993408203, "ref_logps/rejected": -53.666351318359375, "rewards/accuracies": 0.5625, "rewards/chosen": -0.45410019159317017, "rewards/margins": 0.14310592412948608, "rewards/rejected": -0.5972061157226562, "step": 344 }, { "epoch": 0.33, "grad_norm": 14.91905403137207, "learning_rate": 4.952780692549842e-07, "logps/chosen": -42.038238525390625, "logps/rejected": -54.24596405029297, "loss": 0.5755, "losses/dpo": 0.759334921836853, "losses/sft": 1.369367003440857, "losses/total": 0.759334921836853, "ref_logps/chosen": -38.41636276245117, "ref_logps/rejected": -47.42622375488281, "rewards/accuracies": 0.8125, "rewards/chosen": -0.36218756437301636, "rewards/margins": 0.3197862505912781, "rewards/rejected": -0.6819738149642944, "step": 345 }, { "epoch": 0.33, "grad_norm": 15.398578643798828, "learning_rate": 4.951031829310948e-07, "logps/chosen": -40.799774169921875, "logps/rejected": -41.05747985839844, "loss": 0.6483, "losses/dpo": 0.6527115702629089, "losses/sft": 1.4417846202850342, "losses/total": 0.6527115702629089, "ref_logps/chosen": -35.516326904296875, "ref_logps/rejected": -34.40104675292969, "rewards/accuracies": 0.75, "rewards/chosen": -0.5283448696136475, "rewards/margins": 0.13729813694953918, "rewards/rejected": -0.665643036365509, "step": 346 }, { "epoch": 0.33, "grad_norm": 22.32353973388672, "learning_rate": 4.949282966072053e-07, "logps/chosen": -62.92329406738281, "logps/rejected": -56.66679382324219, "loss": 0.704, "losses/dpo": 0.982194185256958, "losses/sft": 1.8895689249038696, "losses/total": 0.982194185256958, "ref_logps/chosen": -56.06339645385742, "ref_logps/rejected": -49.39789581298828, "rewards/accuracies": 0.625, "rewards/chosen": -0.6859898567199707, "rewards/margins": 0.04089978337287903, "rewards/rejected": -0.7268896102905273, "step": 347 }, { "epoch": 0.33, "grad_norm": 15.723599433898926, "learning_rate": 4.947534102833158e-07, "logps/chosen": -35.60626220703125, "logps/rejected": -35.96453857421875, "loss": 0.6729, "losses/dpo": 0.6972857713699341, "losses/sft": 1.229810357093811, "losses/total": 0.6972857713699341, "ref_logps/chosen": -31.716310501098633, "ref_logps/rejected": -31.524150848388672, "rewards/accuracies": 0.625, "rewards/chosen": -0.38899534940719604, "rewards/margins": 0.055043164640665054, "rewards/rejected": -0.4440385103225708, "step": 348 }, { "epoch": 0.33, "grad_norm": 17.474153518676758, "learning_rate": 4.945785239594263e-07, "logps/chosen": -42.06867980957031, "logps/rejected": -32.13497543334961, "loss": 0.749, "losses/dpo": 0.7319649457931519, "losses/sft": 1.6740304231643677, "losses/total": 0.7319649457931519, "ref_logps/chosen": -36.58716583251953, "ref_logps/rejected": -27.30141830444336, "rewards/accuracies": 0.5, "rewards/chosen": -0.548151433467865, "rewards/margins": -0.06479551643133163, "rewards/rejected": -0.48335593938827515, "step": 349 }, { "epoch": 0.33, "grad_norm": 16.306217193603516, "learning_rate": 4.944036376355369e-07, "logps/chosen": -36.958343505859375, "logps/rejected": -53.405094146728516, "loss": 0.6313, "losses/dpo": 0.593835175037384, "losses/sft": 1.8788634538650513, "losses/total": 0.593835175037384, "ref_logps/chosen": -32.361385345458984, "ref_logps/rejected": -47.33667755126953, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4596959352493286, "rewards/margins": 0.14714574813842773, "rewards/rejected": -0.6068416833877563, "step": 350 }, { "epoch": 0.33, "grad_norm": 16.265230178833008, "learning_rate": 4.942287513116474e-07, "logps/chosen": -43.46842575073242, "logps/rejected": -49.64971160888672, "loss": 0.6093, "losses/dpo": 0.5821921229362488, "losses/sft": 1.5536878108978271, "losses/total": 0.5821921229362488, "ref_logps/chosen": -39.623558044433594, "ref_logps/rejected": -43.57435607910156, "rewards/accuracies": 0.75, "rewards/chosen": -0.3844863176345825, "rewards/margins": 0.22304952144622803, "rewards/rejected": -0.6075358390808105, "step": 351 }, { "epoch": 0.33, "grad_norm": 20.127544403076172, "learning_rate": 4.940538649877579e-07, "logps/chosen": -46.149234771728516, "logps/rejected": -53.44261932373047, "loss": 0.7208, "losses/dpo": 0.4625735580921173, "losses/sft": 1.6604089736938477, "losses/total": 0.4625735580921173, "ref_logps/chosen": -39.44435501098633, "ref_logps/rejected": -46.366180419921875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6704879999160767, "rewards/margins": 0.03715598210692406, "rewards/rejected": -0.707643985748291, "step": 352 }, { "epoch": 0.33, "grad_norm": 15.963071823120117, "learning_rate": 4.938789786638684e-07, "logps/chosen": -36.14772033691406, "logps/rejected": -47.00101089477539, "loss": 0.6377, "losses/dpo": 0.5668531656265259, "losses/sft": 1.6079418659210205, "losses/total": 0.5668531656265259, "ref_logps/chosen": -31.208240509033203, "ref_logps/rejected": -40.485137939453125, "rewards/accuracies": 0.75, "rewards/chosen": -0.49394795298576355, "rewards/margins": 0.1576395481824875, "rewards/rejected": -0.6515874862670898, "step": 353 }, { "epoch": 0.33, "grad_norm": 16.185941696166992, "learning_rate": 4.93704092339979e-07, "logps/chosen": -43.763179779052734, "logps/rejected": -52.58757019042969, "loss": 0.6172, "losses/dpo": 0.5382827520370483, "losses/sft": 1.4193828105926514, "losses/total": 0.5382827520370483, "ref_logps/chosen": -37.45954132080078, "ref_logps/rejected": -44.3360595703125, "rewards/accuracies": 0.75, "rewards/chosen": -0.6303641200065613, "rewards/margins": 0.19478727877140045, "rewards/rejected": -0.8251514434814453, "step": 354 }, { "epoch": 0.34, "grad_norm": 16.672767639160156, "learning_rate": 4.935292060160895e-07, "logps/chosen": -41.033485412597656, "logps/rejected": -47.182586669921875, "loss": 0.7167, "losses/dpo": 0.8603551387786865, "losses/sft": 1.820441484451294, "losses/total": 0.8603551387786865, "ref_logps/chosen": -35.289642333984375, "ref_logps/rejected": -41.65888214111328, "rewards/accuracies": 0.5, "rewards/chosen": -0.5743842124938965, "rewards/margins": -0.022013980895280838, "rewards/rejected": -0.5523703098297119, "step": 355 }, { "epoch": 0.34, "grad_norm": 18.10701560974121, "learning_rate": 4.933543196922e-07, "logps/chosen": -43.38030242919922, "logps/rejected": -50.14466857910156, "loss": 0.6917, "losses/dpo": 0.5257072448730469, "losses/sft": 1.6851675510406494, "losses/total": 0.5257072448730469, "ref_logps/chosen": -37.954803466796875, "ref_logps/rejected": -44.21272277832031, "rewards/accuracies": 0.5, "rewards/chosen": -0.5425497889518738, "rewards/margins": 0.050644807517528534, "rewards/rejected": -0.5931944847106934, "step": 356 }, { "epoch": 0.34, "grad_norm": 19.319822311401367, "learning_rate": 4.931794333683106e-07, "logps/chosen": -52.50119400024414, "logps/rejected": -62.10511016845703, "loss": 0.6744, "losses/dpo": 0.7592508792877197, "losses/sft": 1.8160649538040161, "losses/total": 0.7592508792877197, "ref_logps/chosen": -45.70870590209961, "ref_logps/rejected": -54.46797180175781, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6792489886283875, "rewards/margins": 0.08446483314037323, "rewards/rejected": -0.7637137770652771, "step": 357 }, { "epoch": 0.34, "grad_norm": 19.426149368286133, "learning_rate": 4.930045470444211e-07, "logps/chosen": -57.17057418823242, "logps/rejected": -61.41523742675781, "loss": 0.6438, "losses/dpo": 0.7625062465667725, "losses/sft": 1.9373992681503296, "losses/total": 0.7625062465667725, "ref_logps/chosen": -49.13054656982422, "ref_logps/rejected": -51.94956588745117, "rewards/accuracies": 0.625, "rewards/chosen": -0.8040027022361755, "rewards/margins": 0.14256440103054047, "rewards/rejected": -0.9465670585632324, "step": 358 }, { "epoch": 0.34, "grad_norm": 18.105295181274414, "learning_rate": 4.928296607205316e-07, "logps/chosen": -52.00062561035156, "logps/rejected": -52.21385192871094, "loss": 0.5934, "losses/dpo": 0.5580828189849854, "losses/sft": 1.7423279285430908, "losses/total": 0.5580828189849854, "ref_logps/chosen": -46.851287841796875, "ref_logps/rejected": -44.67858123779297, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5149338245391846, "rewards/margins": 0.23859325051307678, "rewards/rejected": -0.7535271048545837, "step": 359 }, { "epoch": 0.34, "grad_norm": 18.62017059326172, "learning_rate": 4.926547743966421e-07, "logps/chosen": -55.59669494628906, "logps/rejected": -46.74395751953125, "loss": 0.6991, "losses/dpo": 0.6631979942321777, "losses/sft": 1.880706548690796, "losses/total": 0.6631979942321777, "ref_logps/chosen": -49.36800003051758, "ref_logps/rejected": -40.283843994140625, "rewards/accuracies": 0.5, "rewards/chosen": -0.6228693127632141, "rewards/margins": 0.02314191684126854, "rewards/rejected": -0.646011233329773, "step": 360 }, { "epoch": 0.34, "grad_norm": 17.774625778198242, "learning_rate": 4.924798880727527e-07, "logps/chosen": -46.10017395019531, "logps/rejected": -45.3880729675293, "loss": 0.6913, "losses/dpo": 0.7544475197792053, "losses/sft": 1.6384199857711792, "losses/total": 0.7544475197792053, "ref_logps/chosen": -41.199554443359375, "ref_logps/rejected": -39.884803771972656, "rewards/accuracies": 0.4375, "rewards/chosen": -0.49006187915802, "rewards/margins": 0.0602647066116333, "rewards/rejected": -0.5503265857696533, "step": 361 }, { "epoch": 0.34, "grad_norm": 15.296215057373047, "learning_rate": 4.923050017488632e-07, "logps/chosen": -42.792118072509766, "logps/rejected": -65.90992736816406, "loss": 0.5411, "losses/dpo": 0.4318642318248749, "losses/sft": 2.0659215450286865, "losses/total": 0.4318642318248749, "ref_logps/chosen": -36.07505416870117, "ref_logps/rejected": -54.86952209472656, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6717066764831543, "rewards/margins": 0.4323332905769348, "rewards/rejected": -1.1040399074554443, "step": 362 }, { "epoch": 0.34, "grad_norm": 15.614508628845215, "learning_rate": 4.921301154249738e-07, "logps/chosen": -39.033714294433594, "logps/rejected": -50.13581085205078, "loss": 0.6104, "losses/dpo": 0.558932900428772, "losses/sft": 1.314379096031189, "losses/total": 0.558932900428772, "ref_logps/chosen": -35.31792449951172, "ref_logps/rejected": -44.301700592041016, "rewards/accuracies": 0.5625, "rewards/chosen": -0.37157878279685974, "rewards/margins": 0.21183189749717712, "rewards/rejected": -0.5834106206893921, "step": 363 }, { "epoch": 0.34, "grad_norm": 14.529922485351562, "learning_rate": 4.919552291010843e-07, "logps/chosen": -31.802534103393555, "logps/rejected": -45.50495910644531, "loss": 0.6177, "losses/dpo": 0.7741323709487915, "losses/sft": 1.8115882873535156, "losses/total": 0.7741323709487915, "ref_logps/chosen": -29.133575439453125, "ref_logps/rejected": -40.823123931884766, "rewards/accuracies": 0.875, "rewards/chosen": -0.26689612865448, "rewards/margins": 0.2012873888015747, "rewards/rejected": -0.46818357706069946, "step": 364 }, { "epoch": 0.34, "grad_norm": 14.948827743530273, "learning_rate": 4.917803427771948e-07, "logps/chosen": -35.900238037109375, "logps/rejected": -44.39055633544922, "loss": 0.6441, "losses/dpo": 0.645287811756134, "losses/sft": 1.2219266891479492, "losses/total": 0.645287811756134, "ref_logps/chosen": -31.56234359741211, "ref_logps/rejected": -38.54907989501953, "rewards/accuracies": 0.625, "rewards/chosen": -0.43378946185112, "rewards/margins": 0.15035782754421234, "rewards/rejected": -0.5841472744941711, "step": 365 }, { "epoch": 0.35, "grad_norm": 17.084739685058594, "learning_rate": 4.916054564533053e-07, "logps/chosen": -46.07752990722656, "logps/rejected": -55.04396057128906, "loss": 0.663, "losses/dpo": 0.6777514219284058, "losses/sft": 1.5951437950134277, "losses/total": 0.6777514219284058, "ref_logps/chosen": -40.62417984008789, "ref_logps/rejected": -48.55836486816406, "rewards/accuracies": 0.625, "rewards/chosen": -0.5453349947929382, "rewards/margins": 0.1032242476940155, "rewards/rejected": -0.6485592722892761, "step": 366 }, { "epoch": 0.35, "grad_norm": 18.58281707763672, "learning_rate": 4.914305701294158e-07, "logps/chosen": -40.653499603271484, "logps/rejected": -46.78325653076172, "loss": 0.682, "losses/dpo": 0.6739465594291687, "losses/sft": 1.5690735578536987, "losses/total": 0.6739465594291687, "ref_logps/chosen": -35.23616027832031, "ref_logps/rejected": -40.62438201904297, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5417338013648987, "rewards/margins": 0.0741538554430008, "rewards/rejected": -0.6158876419067383, "step": 367 }, { "epoch": 0.35, "grad_norm": 18.19317626953125, "learning_rate": 4.912556838055264e-07, "logps/chosen": -46.61870193481445, "logps/rejected": -34.94660186767578, "loss": 0.7033, "losses/dpo": 0.9036322236061096, "losses/sft": 1.325303554534912, "losses/total": 0.9036322236061096, "ref_logps/chosen": -41.60350036621094, "ref_logps/rejected": -29.104267120361328, "rewards/accuracies": 0.625, "rewards/chosen": -0.5015201568603516, "rewards/margins": 0.08271355926990509, "rewards/rejected": -0.5842337608337402, "step": 368 }, { "epoch": 0.35, "grad_norm": 15.949390411376953, "learning_rate": 4.910807974816369e-07, "logps/chosen": -44.95087432861328, "logps/rejected": -38.99454879760742, "loss": 0.6674, "losses/dpo": 0.7117958068847656, "losses/sft": 1.5104433298110962, "losses/total": 0.7117958068847656, "ref_logps/chosen": -39.73406219482422, "ref_logps/rejected": -32.87041473388672, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5216814279556274, "rewards/margins": 0.09073203057050705, "rewards/rejected": -0.6124134659767151, "step": 369 }, { "epoch": 0.35, "grad_norm": 17.29485321044922, "learning_rate": 4.909059111577475e-07, "logps/chosen": -35.46530532836914, "logps/rejected": -46.79058074951172, "loss": 0.6979, "losses/dpo": 0.7862976789474487, "losses/sft": 1.4087018966674805, "losses/total": 0.7862976789474487, "ref_logps/chosen": -28.886497497558594, "ref_logps/rejected": -39.95635223388672, "rewards/accuracies": 0.4375, "rewards/chosen": -0.657880961894989, "rewards/margins": 0.025541625916957855, "rewards/rejected": -0.6834225654602051, "step": 370 }, { "epoch": 0.35, "grad_norm": 18.99604034423828, "learning_rate": 4.90731024833858e-07, "logps/chosen": -45.85717010498047, "logps/rejected": -47.745880126953125, "loss": 0.7043, "losses/dpo": 0.773084282875061, "losses/sft": 2.0893893241882324, "losses/total": 0.773084282875061, "ref_logps/chosen": -39.34242248535156, "ref_logps/rejected": -40.57661819458008, "rewards/accuracies": 0.5, "rewards/chosen": -0.6514745950698853, "rewards/margins": 0.06545138359069824, "rewards/rejected": -0.7169260382652283, "step": 371 }, { "epoch": 0.35, "grad_norm": 17.263776779174805, "learning_rate": 4.905561385099685e-07, "logps/chosen": -31.50356674194336, "logps/rejected": -49.4587287902832, "loss": 0.6661, "losses/dpo": 0.7274957895278931, "losses/sft": 1.4706470966339111, "losses/total": 0.7274957895278931, "ref_logps/chosen": -27.50067138671875, "ref_logps/rejected": -44.39161682128906, "rewards/accuracies": 0.625, "rewards/chosen": -0.40028929710388184, "rewards/margins": 0.1064220741391182, "rewards/rejected": -0.5067113637924194, "step": 372 }, { "epoch": 0.35, "grad_norm": 13.453235626220703, "learning_rate": 4.90381252186079e-07, "logps/chosen": -30.153182983398438, "logps/rejected": -39.06922149658203, "loss": 0.6136, "losses/dpo": 0.6532871723175049, "losses/sft": 1.5000978708267212, "losses/total": 0.6532871723175049, "ref_logps/chosen": -26.363821029663086, "ref_logps/rejected": -33.32054138183594, "rewards/accuracies": 0.75, "rewards/chosen": -0.3789364695549011, "rewards/margins": 0.19593185186386108, "rewards/rejected": -0.5748683214187622, "step": 373 }, { "epoch": 0.35, "grad_norm": 15.723969459533691, "learning_rate": 4.902063658621895e-07, "logps/chosen": -42.195716857910156, "logps/rejected": -45.46604919433594, "loss": 0.5879, "losses/dpo": 0.6335541605949402, "losses/sft": 1.487048864364624, "losses/total": 0.6335541605949402, "ref_logps/chosen": -38.526390075683594, "ref_logps/rejected": -39.2433967590332, "rewards/accuracies": 0.75, "rewards/chosen": -0.3669326603412628, "rewards/margins": 0.2553326189517975, "rewards/rejected": -0.6222652792930603, "step": 374 }, { "epoch": 0.35, "grad_norm": 14.40339183807373, "learning_rate": 4.900314795383e-07, "logps/chosen": -31.525100708007812, "logps/rejected": -45.984596252441406, "loss": 0.5898, "losses/dpo": 0.6913972496986389, "losses/sft": 1.099683403968811, "losses/total": 0.6913972496986389, "ref_logps/chosen": -27.74881935119629, "ref_logps/rejected": -39.44319152832031, "rewards/accuracies": 0.6875, "rewards/chosen": -0.37762802839279175, "rewards/margins": 0.2765120565891266, "rewards/rejected": -0.654140055179596, "step": 375 }, { "epoch": 0.36, "grad_norm": 16.11861228942871, "learning_rate": 4.898565932144106e-07, "logps/chosen": -36.8918571472168, "logps/rejected": -50.09154510498047, "loss": 0.5972, "losses/dpo": 0.7275481224060059, "losses/sft": 1.7201942205429077, "losses/total": 0.7275481224060059, "ref_logps/chosen": -32.64515686035156, "ref_logps/rejected": -43.260528564453125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4246698021888733, "rewards/margins": 0.2584313750267029, "rewards/rejected": -0.6831011772155762, "step": 376 }, { "epoch": 0.36, "grad_norm": 19.990903854370117, "learning_rate": 4.896817068905212e-07, "logps/chosen": -53.96989822387695, "logps/rejected": -46.38031005859375, "loss": 0.7189, "losses/dpo": 0.7240732908248901, "losses/sft": 1.6825931072235107, "losses/total": 0.7240732908248901, "ref_logps/chosen": -47.78759002685547, "ref_logps/rejected": -40.2942008972168, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6182307600975037, "rewards/margins": -0.009619642049074173, "rewards/rejected": -0.6086111068725586, "step": 377 }, { "epoch": 0.36, "grad_norm": 19.15579605102539, "learning_rate": 4.895068205666317e-07, "logps/chosen": -48.938873291015625, "logps/rejected": -59.844417572021484, "loss": 0.6221, "losses/dpo": 0.600677490234375, "losses/sft": 1.450546383857727, "losses/total": 0.600677490234375, "ref_logps/chosen": -42.355220794677734, "ref_logps/rejected": -51.14586639404297, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6583653688430786, "rewards/margins": 0.21148985624313354, "rewards/rejected": -0.8698551654815674, "step": 378 }, { "epoch": 0.36, "grad_norm": 17.557723999023438, "learning_rate": 4.893319342427422e-07, "logps/chosen": -45.916561126708984, "logps/rejected": -57.7324333190918, "loss": 0.6019, "losses/dpo": 0.6280567646026611, "losses/sft": 1.4994337558746338, "losses/total": 0.6280567646026611, "ref_logps/chosen": -41.315452575683594, "ref_logps/rejected": -50.54290771484375, "rewards/accuracies": 0.75, "rewards/chosen": -0.4601111114025116, "rewards/margins": 0.25884193181991577, "rewards/rejected": -0.718953013420105, "step": 379 }, { "epoch": 0.36, "grad_norm": 18.44418716430664, "learning_rate": 4.891570479188527e-07, "logps/chosen": -51.42871856689453, "logps/rejected": -57.7188720703125, "loss": 0.5943, "losses/dpo": 0.6480692028999329, "losses/sft": 1.7022343873977661, "losses/total": 0.6480692028999329, "ref_logps/chosen": -46.33269500732422, "ref_logps/rejected": -49.920387268066406, "rewards/accuracies": 0.625, "rewards/chosen": -0.5096026659011841, "rewards/margins": 0.27024561166763306, "rewards/rejected": -0.7798483371734619, "step": 380 }, { "epoch": 0.36, "grad_norm": 16.115467071533203, "learning_rate": 4.889821615949632e-07, "logps/chosen": -43.42619323730469, "logps/rejected": -46.72793960571289, "loss": 0.5855, "losses/dpo": 0.4910088777542114, "losses/sft": 1.433046579360962, "losses/total": 0.4910088777542114, "ref_logps/chosen": -39.071250915527344, "ref_logps/rejected": -39.33418273925781, "rewards/accuracies": 0.75, "rewards/chosen": -0.4354940950870514, "rewards/margins": 0.30388131737709045, "rewards/rejected": -0.7393754124641418, "step": 381 }, { "epoch": 0.36, "grad_norm": 17.226011276245117, "learning_rate": 4.888072752710738e-07, "logps/chosen": -37.94596481323242, "logps/rejected": -48.29046630859375, "loss": 0.589, "losses/dpo": 0.4702170789241791, "losses/sft": 0.9514347314834595, "losses/total": 0.4702170789241791, "ref_logps/chosen": -32.395713806152344, "ref_logps/rejected": -39.96636962890625, "rewards/accuracies": 0.75, "rewards/chosen": -0.5550253987312317, "rewards/margins": 0.2773841619491577, "rewards/rejected": -0.8324095010757446, "step": 382 }, { "epoch": 0.36, "grad_norm": 15.372961044311523, "learning_rate": 4.886323889471843e-07, "logps/chosen": -39.563194274902344, "logps/rejected": -51.458526611328125, "loss": 0.553, "losses/dpo": 0.5169616341590881, "losses/sft": 1.6448532342910767, "losses/total": 0.5169616341590881, "ref_logps/chosen": -34.72569274902344, "ref_logps/rejected": -42.97579574584961, "rewards/accuracies": 0.75, "rewards/chosen": -0.483749657869339, "rewards/margins": 0.3645235002040863, "rewards/rejected": -0.8482731580734253, "step": 383 }, { "epoch": 0.36, "grad_norm": 16.232027053833008, "learning_rate": 4.884575026232949e-07, "logps/chosen": -44.31641387939453, "logps/rejected": -61.96830368041992, "loss": 0.5499, "losses/dpo": 0.5958530902862549, "losses/sft": 1.2350316047668457, "losses/total": 0.5958530902862549, "ref_logps/chosen": -39.33867645263672, "ref_logps/rejected": -52.458457946777344, "rewards/accuracies": 0.75, "rewards/chosen": -0.4977739155292511, "rewards/margins": 0.45321106910705566, "rewards/rejected": -0.9509849548339844, "step": 384 }, { "epoch": 0.36, "grad_norm": 17.623497009277344, "learning_rate": 4.882826162994054e-07, "logps/chosen": -41.76802062988281, "logps/rejected": -51.88357925415039, "loss": 0.6588, "losses/dpo": 0.6860288977622986, "losses/sft": 1.4751417636871338, "losses/total": 0.6860288977622986, "ref_logps/chosen": -37.24629211425781, "ref_logps/rejected": -46.162330627441406, "rewards/accuracies": 0.75, "rewards/chosen": -0.45217248797416687, "rewards/margins": 0.11995206773281097, "rewards/rejected": -0.5721245408058167, "step": 385 }, { "epoch": 0.36, "grad_norm": 17.865766525268555, "learning_rate": 4.881077299755159e-07, "logps/chosen": -48.415687561035156, "logps/rejected": -59.801910400390625, "loss": 0.6429, "losses/dpo": 0.576919436454773, "losses/sft": 1.5451867580413818, "losses/total": 0.576919436454773, "ref_logps/chosen": -42.79845428466797, "ref_logps/rejected": -52.516029357910156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5617234706878662, "rewards/margins": 0.16686515510082245, "rewards/rejected": -0.7285885810852051, "step": 386 }, { "epoch": 0.37, "grad_norm": 15.3607759475708, "learning_rate": 4.879328436516264e-07, "logps/chosen": -37.83244323730469, "logps/rejected": -51.96109390258789, "loss": 0.5846, "losses/dpo": 0.6534067392349243, "losses/sft": 1.4005818367004395, "losses/total": 0.6534067392349243, "ref_logps/chosen": -33.5261344909668, "ref_logps/rejected": -44.703426361083984, "rewards/accuracies": 0.8125, "rewards/chosen": -0.43063056468963623, "rewards/margins": 0.29513639211654663, "rewards/rejected": -0.7257668972015381, "step": 387 }, { "epoch": 0.37, "grad_norm": 18.870868682861328, "learning_rate": 4.877579573277369e-07, "logps/chosen": -52.28619384765625, "logps/rejected": -58.13663101196289, "loss": 0.6719, "losses/dpo": 0.6589078903198242, "losses/sft": 1.8530817031860352, "losses/total": 0.6589078903198242, "ref_logps/chosen": -46.614654541015625, "ref_logps/rejected": -51.617279052734375, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5671533346176147, "rewards/margins": 0.08478206396102905, "rewards/rejected": -0.6519354581832886, "step": 388 }, { "epoch": 0.37, "grad_norm": 17.403518676757812, "learning_rate": 4.875830710038475e-07, "logps/chosen": -44.04420852661133, "logps/rejected": -41.051490783691406, "loss": 0.622, "losses/dpo": 0.7141726016998291, "losses/sft": 1.9304901361465454, "losses/total": 0.7141726016998291, "ref_logps/chosen": -39.160709381103516, "ref_logps/rejected": -33.75501251220703, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4883498549461365, "rewards/margins": 0.241298109292984, "rewards/rejected": -0.7296479940414429, "step": 389 }, { "epoch": 0.37, "grad_norm": 15.864351272583008, "learning_rate": 4.874081846799579e-07, "logps/chosen": -38.41652297973633, "logps/rejected": -52.740264892578125, "loss": 0.5355, "losses/dpo": 0.5309996604919434, "losses/sft": 1.4430912733078003, "losses/total": 0.5309996604919434, "ref_logps/chosen": -34.61224365234375, "ref_logps/rejected": -45.08299255371094, "rewards/accuracies": 0.75, "rewards/chosen": -0.38042739033699036, "rewards/margins": 0.3852997124195099, "rewards/rejected": -0.7657271027565002, "step": 390 }, { "epoch": 0.37, "grad_norm": 17.13785171508789, "learning_rate": 4.872332983560686e-07, "logps/chosen": -37.88313293457031, "logps/rejected": -39.423465728759766, "loss": 0.6859, "losses/dpo": 0.690115213394165, "losses/sft": 1.644648551940918, "losses/total": 0.690115213394165, "ref_logps/chosen": -33.61713409423828, "ref_logps/rejected": -34.702396392822266, "rewards/accuracies": 0.4375, "rewards/chosen": -0.42660024762153625, "rewards/margins": 0.04550663381814957, "rewards/rejected": -0.4721068739891052, "step": 391 }, { "epoch": 0.37, "grad_norm": 14.988529205322266, "learning_rate": 4.870584120321791e-07, "logps/chosen": -41.488441467285156, "logps/rejected": -37.75962829589844, "loss": 0.6294, "losses/dpo": 0.6864627599716187, "losses/sft": 1.48981773853302, "losses/total": 0.6864627599716187, "ref_logps/chosen": -37.671180725097656, "ref_logps/rejected": -32.284706115722656, "rewards/accuracies": 0.625, "rewards/chosen": -0.38172605633735657, "rewards/margins": 0.16576623916625977, "rewards/rejected": -0.547492265701294, "step": 392 }, { "epoch": 0.37, "grad_norm": 17.260526657104492, "learning_rate": 4.868835257082896e-07, "logps/chosen": -37.648902893066406, "logps/rejected": -47.992645263671875, "loss": 0.6936, "losses/dpo": 0.5706920027732849, "losses/sft": 1.1341158151626587, "losses/total": 0.5706920027732849, "ref_logps/chosen": -33.21958541870117, "ref_logps/rejected": -43.15337371826172, "rewards/accuracies": 0.4375, "rewards/chosen": -0.44293180108070374, "rewards/margins": 0.04099569469690323, "rewards/rejected": -0.48392748832702637, "step": 393 }, { "epoch": 0.37, "grad_norm": 16.488170623779297, "learning_rate": 4.867086393844001e-07, "logps/chosen": -45.87257385253906, "logps/rejected": -48.47898864746094, "loss": 0.6302, "losses/dpo": 0.49739784002304077, "losses/sft": 1.7258594036102295, "losses/total": 0.49739784002304077, "ref_logps/chosen": -41.582820892333984, "ref_logps/rejected": -42.0963249206543, "rewards/accuracies": 0.625, "rewards/chosen": -0.42897525429725647, "rewards/margins": 0.20929084718227386, "rewards/rejected": -0.6382660865783691, "step": 394 }, { "epoch": 0.37, "grad_norm": 15.787715911865234, "learning_rate": 4.865337530605106e-07, "logps/chosen": -38.56987380981445, "logps/rejected": -57.91104507446289, "loss": 0.6043, "losses/dpo": 0.49941176176071167, "losses/sft": 1.359432339668274, "losses/total": 0.49941176176071167, "ref_logps/chosen": -34.32741165161133, "ref_logps/rejected": -51.366233825683594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4242461621761322, "rewards/margins": 0.2302348017692566, "rewards/rejected": -0.6544809341430664, "step": 395 }, { "epoch": 0.37, "grad_norm": 15.207798957824707, "learning_rate": 4.863588667366212e-07, "logps/chosen": -30.377071380615234, "logps/rejected": -36.83739471435547, "loss": 0.5979, "losses/dpo": 0.543952465057373, "losses/sft": 1.4746674299240112, "losses/total": 0.543952465057373, "ref_logps/chosen": -27.675073623657227, "ref_logps/rejected": -31.649330139160156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2701996862888336, "rewards/margins": 0.24860700964927673, "rewards/rejected": -0.5188066959381104, "step": 396 }, { "epoch": 0.37, "grad_norm": 17.54619598388672, "learning_rate": 4.861839804127316e-07, "logps/chosen": -47.32435989379883, "logps/rejected": -58.10388946533203, "loss": 0.6519, "losses/dpo": 0.5839875936508179, "losses/sft": 1.435477375984192, "losses/total": 0.5839875936508179, "ref_logps/chosen": -41.70479202270508, "ref_logps/rejected": -50.81410217285156, "rewards/accuracies": 0.625, "rewards/chosen": -0.561956524848938, "rewards/margins": 0.1670229434967041, "rewards/rejected": -0.7289795279502869, "step": 397 }, { "epoch": 0.38, "grad_norm": 15.378534317016602, "learning_rate": 4.860090940888423e-07, "logps/chosen": -35.75560760498047, "logps/rejected": -41.20283508300781, "loss": 0.6299, "losses/dpo": 0.6845287084579468, "losses/sft": 1.1644686460494995, "losses/total": 0.6845287084579468, "ref_logps/chosen": -31.902124404907227, "ref_logps/rejected": -35.534156799316406, "rewards/accuracies": 0.625, "rewards/chosen": -0.38534846901893616, "rewards/margins": 0.18151944875717163, "rewards/rejected": -0.5668679475784302, "step": 398 }, { "epoch": 0.38, "grad_norm": 16.789806365966797, "learning_rate": 4.858342077649528e-07, "logps/chosen": -33.036827087402344, "logps/rejected": -40.6373291015625, "loss": 0.6909, "losses/dpo": 0.5921821594238281, "losses/sft": 1.5299408435821533, "losses/total": 0.5921821594238281, "ref_logps/chosen": -30.053743362426758, "ref_logps/rejected": -37.085121154785156, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2983085513114929, "rewards/margins": 0.0569118931889534, "rewards/rejected": -0.3552204370498657, "step": 399 }, { "epoch": 0.38, "grad_norm": 16.89141082763672, "learning_rate": 4.856593214410633e-07, "logps/chosen": -47.697715759277344, "logps/rejected": -61.035888671875, "loss": 0.6096, "losses/dpo": 0.5185248851776123, "losses/sft": 1.6526906490325928, "losses/total": 0.5185248851776123, "ref_logps/chosen": -42.63914489746094, "ref_logps/rejected": -53.86378860473633, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5058566331863403, "rewards/margins": 0.21135367453098297, "rewards/rejected": -0.7172103524208069, "step": 400 }, { "epoch": 0.38, "grad_norm": 14.587214469909668, "learning_rate": 4.854844351171738e-07, "logps/chosen": -34.169090270996094, "logps/rejected": -39.14319610595703, "loss": 0.6158, "losses/dpo": 0.5357730388641357, "losses/sft": 1.3998308181762695, "losses/total": 0.5357730388641357, "ref_logps/chosen": -30.625774383544922, "ref_logps/rejected": -33.343929290771484, "rewards/accuracies": 0.75, "rewards/chosen": -0.354331374168396, "rewards/margins": 0.2255951315164566, "rewards/rejected": -0.5799264907836914, "step": 401 }, { "epoch": 0.38, "grad_norm": 16.121726989746094, "learning_rate": 4.853095487932843e-07, "logps/chosen": -32.05437469482422, "logps/rejected": -60.37568664550781, "loss": 0.4951, "losses/dpo": 0.48003166913986206, "losses/sft": 1.5877383947372437, "losses/total": 0.48003166913986206, "ref_logps/chosen": -28.26840591430664, "ref_logps/rejected": -51.92275619506836, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3785969614982605, "rewards/margins": 0.46669596433639526, "rewards/rejected": -0.8452929258346558, "step": 402 }, { "epoch": 0.38, "grad_norm": 16.64834213256836, "learning_rate": 4.851346624693949e-07, "logps/chosen": -38.816585540771484, "logps/rejected": -53.78671646118164, "loss": 0.5811, "losses/dpo": 0.5250800848007202, "losses/sft": 1.79682457447052, "losses/total": 0.5250800848007202, "ref_logps/chosen": -34.659847259521484, "ref_logps/rejected": -46.36677551269531, "rewards/accuracies": 0.6875, "rewards/chosen": -0.415674090385437, "rewards/margins": 0.32631999254226685, "rewards/rejected": -0.7419940233230591, "step": 403 }, { "epoch": 0.38, "grad_norm": 17.353267669677734, "learning_rate": 4.849597761455053e-07, "logps/chosen": -43.05952835083008, "logps/rejected": -48.76091384887695, "loss": 0.6148, "losses/dpo": 0.4252875745296478, "losses/sft": 1.267093539237976, "losses/total": 0.4252875745296478, "ref_logps/chosen": -38.705326080322266, "ref_logps/rejected": -42.27796173095703, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4354203939437866, "rewards/margins": 0.21287499368190765, "rewards/rejected": -0.6482954025268555, "step": 404 }, { "epoch": 0.38, "grad_norm": 14.371841430664062, "learning_rate": 4.84784889821616e-07, "logps/chosen": -26.627094268798828, "logps/rejected": -35.829750061035156, "loss": 0.6266, "losses/dpo": 0.5315425395965576, "losses/sft": 1.3527321815490723, "losses/total": 0.5315425395965576, "ref_logps/chosen": -23.816089630126953, "ref_logps/rejected": -31.18358039855957, "rewards/accuracies": 0.625, "rewards/chosen": -0.2811005413532257, "rewards/margins": 0.1835164874792099, "rewards/rejected": -0.4646170735359192, "step": 405 }, { "epoch": 0.38, "grad_norm": 17.5135440826416, "learning_rate": 4.846100034977265e-07, "logps/chosen": -40.98727798461914, "logps/rejected": -60.82196044921875, "loss": 0.5499, "losses/dpo": 0.6684725880622864, "losses/sft": 1.9783414602279663, "losses/total": 0.6684725880622864, "ref_logps/chosen": -36.28944778442383, "ref_logps/rejected": -52.15388870239258, "rewards/accuracies": 0.8125, "rewards/chosen": -0.46978288888931274, "rewards/margins": 0.39702415466308594, "rewards/rejected": -0.8668070435523987, "step": 406 }, { "epoch": 0.38, "grad_norm": 15.416147232055664, "learning_rate": 4.84435117173837e-07, "logps/chosen": -32.149574279785156, "logps/rejected": -41.04954528808594, "loss": 0.674, "losses/dpo": 0.6583499312400818, "losses/sft": 1.5759145021438599, "losses/total": 0.6583499312400818, "ref_logps/chosen": -28.559553146362305, "ref_logps/rejected": -36.13202667236328, "rewards/accuracies": 0.5625, "rewards/chosen": -0.35900166630744934, "rewards/margins": 0.13275033235549927, "rewards/rejected": -0.4917519986629486, "step": 407 }, { "epoch": 0.39, "grad_norm": 16.129085540771484, "learning_rate": 4.842602308499475e-07, "logps/chosen": -40.663230895996094, "logps/rejected": -56.94084167480469, "loss": 0.5149, "losses/dpo": 0.45456379652023315, "losses/sft": 1.2015033960342407, "losses/total": 0.45456379652023315, "ref_logps/chosen": -37.173484802246094, "ref_logps/rejected": -47.41035461425781, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3489745259284973, "rewards/margins": 0.6040740013122559, "rewards/rejected": -0.953048586845398, "step": 408 }, { "epoch": 0.39, "grad_norm": 19.411956787109375, "learning_rate": 4.84085344526058e-07, "logps/chosen": -42.17294692993164, "logps/rejected": -50.168800354003906, "loss": 0.6934, "losses/dpo": 0.5932608842849731, "losses/sft": 1.406678557395935, "losses/total": 0.5932608842849731, "ref_logps/chosen": -35.548667907714844, "ref_logps/rejected": -42.467041015625, "rewards/accuracies": 0.5, "rewards/chosen": -0.6624277830123901, "rewards/margins": 0.10774798691272736, "rewards/rejected": -0.7701758146286011, "step": 409 }, { "epoch": 0.39, "grad_norm": 16.154325485229492, "learning_rate": 4.839104582021686e-07, "logps/chosen": -42.76899719238281, "logps/rejected": -50.167564392089844, "loss": 0.6151, "losses/dpo": 0.8264023065567017, "losses/sft": 1.948232650756836, "losses/total": 0.8264023065567017, "ref_logps/chosen": -36.968727111816406, "ref_logps/rejected": -41.93342590332031, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5800269842147827, "rewards/margins": 0.24338719248771667, "rewards/rejected": -0.823414146900177, "step": 410 }, { "epoch": 0.39, "grad_norm": 18.531009674072266, "learning_rate": 4.83735571878279e-07, "logps/chosen": -48.12474822998047, "logps/rejected": -54.27491760253906, "loss": 0.6067, "losses/dpo": 0.46124640107154846, "losses/sft": 1.0872917175292969, "losses/total": 0.46124640107154846, "ref_logps/chosen": -42.32415008544922, "ref_logps/rejected": -45.653709411621094, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5800597667694092, "rewards/margins": 0.2820609509944916, "rewards/rejected": -0.8621206879615784, "step": 411 }, { "epoch": 0.39, "grad_norm": 16.040817260742188, "learning_rate": 4.835606855543896e-07, "logps/chosen": -41.7850456237793, "logps/rejected": -44.34109115600586, "loss": 0.5943, "losses/dpo": 0.5554689168930054, "losses/sft": 1.4853107929229736, "losses/total": 0.5554689168930054, "ref_logps/chosen": -38.30476760864258, "ref_logps/rejected": -38.22357940673828, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3480280041694641, "rewards/margins": 0.26372280716896057, "rewards/rejected": -0.6117507815361023, "step": 412 }, { "epoch": 0.39, "grad_norm": 15.917330741882324, "learning_rate": 4.833857992305002e-07, "logps/chosen": -41.395469665527344, "logps/rejected": -49.830352783203125, "loss": 0.6196, "losses/dpo": 0.5293267965316772, "losses/sft": 1.0707151889801025, "losses/total": 0.5293267965316772, "ref_logps/chosen": -35.795894622802734, "ref_logps/rejected": -41.86262130737305, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5599576234817505, "rewards/margins": 0.2368154525756836, "rewards/rejected": -0.7967730760574341, "step": 413 }, { "epoch": 0.39, "grad_norm": 16.770217895507812, "learning_rate": 4.832109129066107e-07, "logps/chosen": -52.91405487060547, "logps/rejected": -59.947940826416016, "loss": 0.5564, "losses/dpo": 0.5155785083770752, "losses/sft": 1.9012634754180908, "losses/total": 0.5155785083770752, "ref_logps/chosen": -46.752349853515625, "ref_logps/rejected": -49.959800720214844, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6161705255508423, "rewards/margins": 0.38264378905296326, "rewards/rejected": -0.9988143444061279, "step": 414 }, { "epoch": 0.39, "grad_norm": 17.09412956237793, "learning_rate": 4.830360265827212e-07, "logps/chosen": -41.7603874206543, "logps/rejected": -53.3480339050293, "loss": 0.5886, "losses/dpo": 0.5072081685066223, "losses/sft": 1.2812321186065674, "losses/total": 0.5072081685066223, "ref_logps/chosen": -35.59248733520508, "ref_logps/rejected": -44.63600158691406, "rewards/accuracies": 0.6875, "rewards/chosen": -0.616789698600769, "rewards/margins": 0.25441354513168335, "rewards/rejected": -0.8712032437324524, "step": 415 }, { "epoch": 0.39, "grad_norm": 21.589744567871094, "learning_rate": 4.828611402588317e-07, "logps/chosen": -55.79002380371094, "logps/rejected": -65.64686584472656, "loss": 0.6915, "losses/dpo": 0.5672193169593811, "losses/sft": 1.7146742343902588, "losses/total": 0.5672193169593811, "ref_logps/chosen": -47.1961784362793, "ref_logps/rejected": -55.97956085205078, "rewards/accuracies": 0.5, "rewards/chosen": -0.8593848943710327, "rewards/margins": 0.10734502971172333, "rewards/rejected": -0.9667298793792725, "step": 416 }, { "epoch": 0.39, "grad_norm": 17.905376434326172, "learning_rate": 4.826862539349423e-07, "logps/chosen": -41.799400329589844, "logps/rejected": -51.18357849121094, "loss": 0.5399, "losses/dpo": 0.5003094673156738, "losses/sft": 1.5253819227218628, "losses/total": 0.5003094673156738, "ref_logps/chosen": -38.23586654663086, "ref_logps/rejected": -43.49443054199219, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3563530445098877, "rewards/margins": 0.41256183385849, "rewards/rejected": -0.7689148783683777, "step": 417 }, { "epoch": 0.39, "grad_norm": 16.941661834716797, "learning_rate": 4.825113676110527e-07, "logps/chosen": -37.555171966552734, "logps/rejected": -42.37037658691406, "loss": 0.6635, "losses/dpo": 0.9041906595230103, "losses/sft": 1.7020913362503052, "losses/total": 0.9041906595230103, "ref_logps/chosen": -32.227359771728516, "ref_logps/rejected": -35.716827392578125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5327807664871216, "rewards/margins": 0.1325741708278656, "rewards/rejected": -0.6653549671173096, "step": 418 }, { "epoch": 0.4, "grad_norm": 18.61013412475586, "learning_rate": 4.823364812871633e-07, "logps/chosen": -43.78700256347656, "logps/rejected": -71.27808380126953, "loss": 0.5131, "losses/dpo": 0.4810955822467804, "losses/sft": 1.5377532243728638, "losses/total": 0.4810955822467804, "ref_logps/chosen": -39.11919403076172, "ref_logps/rejected": -59.33916091918945, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4667811989784241, "rewards/margins": 0.7271108627319336, "rewards/rejected": -1.193892240524292, "step": 419 }, { "epoch": 0.4, "grad_norm": 22.28360939025879, "learning_rate": 4.821615949632739e-07, "logps/chosen": -49.553157806396484, "logps/rejected": -48.26500701904297, "loss": 0.7759, "losses/dpo": 0.7567892074584961, "losses/sft": 1.5547449588775635, "losses/total": 0.7567892074584961, "ref_logps/chosen": -41.395145416259766, "ref_logps/rejected": -40.95943069458008, "rewards/accuracies": 0.3125, "rewards/chosen": -0.8158011436462402, "rewards/margins": -0.08524350821971893, "rewards/rejected": -0.7305576801300049, "step": 420 }, { "epoch": 0.4, "grad_norm": 18.713472366333008, "learning_rate": 4.819867086393844e-07, "logps/chosen": -40.23215103149414, "logps/rejected": -52.624839782714844, "loss": 0.6301, "losses/dpo": 0.5053095817565918, "losses/sft": 1.4159964323043823, "losses/total": 0.5053095817565918, "ref_logps/chosen": -33.68476104736328, "ref_logps/rejected": -43.69927978515625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6547389030456543, "rewards/margins": 0.23781681060791016, "rewards/rejected": -0.8925557136535645, "step": 421 }, { "epoch": 0.4, "grad_norm": 14.797178268432617, "learning_rate": 4.818118223154949e-07, "logps/chosen": -47.91529846191406, "logps/rejected": -60.493133544921875, "loss": 0.4971, "losses/dpo": 0.531101405620575, "losses/sft": 1.6579855680465698, "losses/total": 0.531101405620575, "ref_logps/chosen": -43.0427360534668, "ref_logps/rejected": -50.19813537597656, "rewards/accuracies": 0.75, "rewards/chosen": -0.4872565269470215, "rewards/margins": 0.5422431826591492, "rewards/rejected": -1.0294997692108154, "step": 422 }, { "epoch": 0.4, "grad_norm": 17.529640197753906, "learning_rate": 4.816369359916054e-07, "logps/chosen": -46.716304779052734, "logps/rejected": -57.9948616027832, "loss": 0.6419, "losses/dpo": 0.7770130634307861, "losses/sft": 1.5371500253677368, "losses/total": 0.7770130634307861, "ref_logps/chosen": -39.979270935058594, "ref_logps/rejected": -48.78118133544922, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6737032532691956, "rewards/margins": 0.24766482412815094, "rewards/rejected": -0.9213681221008301, "step": 423 }, { "epoch": 0.4, "grad_norm": 16.264522552490234, "learning_rate": 4.81462049667716e-07, "logps/chosen": -44.645259857177734, "logps/rejected": -61.48662185668945, "loss": 0.5151, "losses/dpo": 0.4992883801460266, "losses/sft": 1.765952229499817, "losses/total": 0.4992883801460266, "ref_logps/chosen": -38.91535949707031, "ref_logps/rejected": -50.420440673828125, "rewards/accuracies": 0.75, "rewards/chosen": -0.5729899406433105, "rewards/margins": 0.5336276292800903, "rewards/rejected": -1.1066175699234009, "step": 424 }, { "epoch": 0.4, "grad_norm": 17.553157806396484, "learning_rate": 4.812871633438264e-07, "logps/chosen": -43.50630569458008, "logps/rejected": -56.305152893066406, "loss": 0.5832, "losses/dpo": 0.7116658687591553, "losses/sft": 1.435996651649475, "losses/total": 0.7116658687591553, "ref_logps/chosen": -37.015892028808594, "ref_logps/rejected": -46.49329376220703, "rewards/accuracies": 0.75, "rewards/chosen": -0.6490410566329956, "rewards/margins": 0.33214494585990906, "rewards/rejected": -0.981186032295227, "step": 425 }, { "epoch": 0.4, "grad_norm": 15.7957124710083, "learning_rate": 4.81112277019937e-07, "logps/chosen": -43.427886962890625, "logps/rejected": -48.535614013671875, "loss": 0.5, "losses/dpo": 0.4772017300128937, "losses/sft": 1.3629473447799683, "losses/total": 0.4772017300128937, "ref_logps/chosen": -39.44947814941406, "ref_logps/rejected": -39.24213409423828, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3978408873081207, "rewards/margins": 0.531507134437561, "rewards/rejected": -0.9293479919433594, "step": 426 }, { "epoch": 0.4, "grad_norm": 23.079870223999023, "learning_rate": 4.809373906960476e-07, "logps/chosen": -55.994503021240234, "logps/rejected": -48.09328079223633, "loss": 0.7933, "losses/dpo": 0.8312777280807495, "losses/sft": 1.7263953685760498, "losses/total": 0.8312777280807495, "ref_logps/chosen": -47.46648025512695, "ref_logps/rejected": -40.24591827392578, "rewards/accuracies": 0.375, "rewards/chosen": -0.8528022766113281, "rewards/margins": -0.06806611269712448, "rewards/rejected": -0.784736156463623, "step": 427 }, { "epoch": 0.4, "grad_norm": 14.267626762390137, "learning_rate": 4.807625043721581e-07, "logps/chosen": -37.362342834472656, "logps/rejected": -52.87950134277344, "loss": 0.4932, "losses/dpo": 0.3793169856071472, "losses/sft": 1.7072707414627075, "losses/total": 0.3793169856071472, "ref_logps/chosen": -31.649919509887695, "ref_logps/rejected": -41.25990295410156, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5712425112724304, "rewards/margins": 0.5907171964645386, "rewards/rejected": -1.1619596481323242, "step": 428 }, { "epoch": 0.41, "grad_norm": 24.212587356567383, "learning_rate": 4.805876180482686e-07, "logps/chosen": -62.959625244140625, "logps/rejected": -58.49671173095703, "loss": 0.6562, "losses/dpo": 0.7208305597305298, "losses/sft": 1.8471555709838867, "losses/total": 0.7208305597305298, "ref_logps/chosen": -55.51588439941406, "ref_logps/rejected": -49.259849548339844, "rewards/accuracies": 0.625, "rewards/chosen": -0.7443743944168091, "rewards/margins": 0.17931221425533295, "rewards/rejected": -0.9236866235733032, "step": 429 }, { "epoch": 0.41, "grad_norm": 16.95990562438965, "learning_rate": 4.804127317243791e-07, "logps/chosen": -45.03351974487305, "logps/rejected": -51.542999267578125, "loss": 0.6483, "losses/dpo": 0.6551743745803833, "losses/sft": 1.555639624595642, "losses/total": 0.6551743745803833, "ref_logps/chosen": -37.09907531738281, "ref_logps/rejected": -41.29612350463867, "rewards/accuracies": 0.5, "rewards/chosen": -0.7934445142745972, "rewards/margins": 0.2312435805797577, "rewards/rejected": -1.0246880054473877, "step": 430 }, { "epoch": 0.41, "grad_norm": 19.56223487854004, "learning_rate": 4.802378454004897e-07, "logps/chosen": -51.68017578125, "logps/rejected": -60.513671875, "loss": 0.5672, "losses/dpo": 0.49754729866981506, "losses/sft": 1.8137422800064087, "losses/total": 0.49754729866981506, "ref_logps/chosen": -44.28087615966797, "ref_logps/rejected": -48.87751388549805, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7399299144744873, "rewards/margins": 0.4236864149570465, "rewards/rejected": -1.1636161804199219, "step": 431 }, { "epoch": 0.41, "grad_norm": 19.554107666015625, "learning_rate": 4.800629590766002e-07, "logps/chosen": -38.52039337158203, "logps/rejected": -38.24866485595703, "loss": 0.7718, "losses/dpo": 0.7348898649215698, "losses/sft": 2.0776398181915283, "losses/total": 0.7348898649215698, "ref_logps/chosen": -32.058921813964844, "ref_logps/rejected": -32.68750762939453, "rewards/accuracies": 0.3125, "rewards/chosen": -0.6461471319198608, "rewards/margins": -0.09003150463104248, "rewards/rejected": -0.5561156868934631, "step": 432 }, { "epoch": 0.41, "grad_norm": 15.3414888381958, "learning_rate": 4.798880727527107e-07, "logps/chosen": -33.04411315917969, "logps/rejected": -49.62543487548828, "loss": 0.5423, "losses/dpo": 0.36758744716644287, "losses/sft": 1.2707054615020752, "losses/total": 0.36758744716644287, "ref_logps/chosen": -26.833080291748047, "ref_logps/rejected": -38.800392150878906, "rewards/accuracies": 0.875, "rewards/chosen": -0.6211034059524536, "rewards/margins": 0.46140095591545105, "rewards/rejected": -1.0825042724609375, "step": 433 }, { "epoch": 0.41, "grad_norm": 17.68614959716797, "learning_rate": 4.797131864288212e-07, "logps/chosen": -42.37980270385742, "logps/rejected": -47.62135314941406, "loss": 0.602, "losses/dpo": 0.713486909866333, "losses/sft": 1.7730909585952759, "losses/total": 0.713486909866333, "ref_logps/chosen": -35.75381851196289, "ref_logps/rejected": -38.149776458740234, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6625986099243164, "rewards/margins": 0.28455933928489685, "rewards/rejected": -0.9471579194068909, "step": 434 }, { "epoch": 0.41, "grad_norm": 17.736204147338867, "learning_rate": 4.795383001049318e-07, "logps/chosen": -43.941314697265625, "logps/rejected": -56.237457275390625, "loss": 0.589, "losses/dpo": 0.7260459065437317, "losses/sft": 1.7363533973693848, "losses/total": 0.7260459065437317, "ref_logps/chosen": -36.130069732666016, "ref_logps/rejected": -43.96063995361328, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7811243534088135, "rewards/margins": 0.4465577006340027, "rewards/rejected": -1.227682113647461, "step": 435 }, { "epoch": 0.41, "grad_norm": 20.706064224243164, "learning_rate": 4.793634137810423e-07, "logps/chosen": -52.48896026611328, "logps/rejected": -58.83586120605469, "loss": 0.6204, "losses/dpo": 0.5095518827438354, "losses/sft": 1.1213394403457642, "losses/total": 0.5095518827438354, "ref_logps/chosen": -42.20005416870117, "ref_logps/rejected": -46.325347900390625, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0288909673690796, "rewards/margins": 0.22216051816940308, "rewards/rejected": -1.251051425933838, "step": 436 }, { "epoch": 0.41, "grad_norm": 20.205711364746094, "learning_rate": 4.791885274571528e-07, "logps/chosen": -41.39351272583008, "logps/rejected": -49.85081481933594, "loss": 0.6644, "losses/dpo": 0.48256945610046387, "losses/sft": 1.8185701370239258, "losses/total": 0.48256945610046387, "ref_logps/chosen": -33.18410110473633, "ref_logps/rejected": -40.282981872558594, "rewards/accuracies": 0.625, "rewards/chosen": -0.8209414482116699, "rewards/margins": 0.13584190607070923, "rewards/rejected": -0.9567833542823792, "step": 437 }, { "epoch": 0.41, "grad_norm": 19.14922332763672, "learning_rate": 4.790136411332634e-07, "logps/chosen": -42.25700378417969, "logps/rejected": -54.612281799316406, "loss": 0.6119, "losses/dpo": 0.45903971791267395, "losses/sft": 1.8760770559310913, "losses/total": 0.45903971791267395, "ref_logps/chosen": -33.964603424072266, "ref_logps/rejected": -43.75865173339844, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8292405605316162, "rewards/margins": 0.2561222314834595, "rewards/rejected": -1.0853627920150757, "step": 438 }, { "epoch": 0.41, "grad_norm": 16.795570373535156, "learning_rate": 4.788387548093739e-07, "logps/chosen": -47.49351501464844, "logps/rejected": -68.00299072265625, "loss": 0.5174, "losses/dpo": 0.5840404033660889, "losses/sft": 1.597842812538147, "losses/total": 0.5840404033660889, "ref_logps/chosen": -41.466434478759766, "ref_logps/rejected": -56.268890380859375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6027083396911621, "rewards/margins": 0.5707011222839355, "rewards/rejected": -1.1734094619750977, "step": 439 }, { "epoch": 0.42, "grad_norm": 18.438379287719727, "learning_rate": 4.786638684854844e-07, "logps/chosen": -37.86119842529297, "logps/rejected": -55.4619140625, "loss": 0.5908, "losses/dpo": 0.5164816379547119, "losses/sft": 1.6231499910354614, "losses/total": 0.5164816379547119, "ref_logps/chosen": -31.267627716064453, "ref_logps/rejected": -45.998756408691406, "rewards/accuracies": 0.75, "rewards/chosen": -0.6593573093414307, "rewards/margins": 0.286958247423172, "rewards/rejected": -0.946315586566925, "step": 440 }, { "epoch": 0.42, "grad_norm": 22.680397033691406, "learning_rate": 4.784889821615949e-07, "logps/chosen": -47.43533706665039, "logps/rejected": -46.34278869628906, "loss": 0.7527, "losses/dpo": 0.5242143869400024, "losses/sft": 1.235299825668335, "losses/total": 0.5242143869400024, "ref_logps/chosen": -37.58125305175781, "ref_logps/rejected": -37.05174255371094, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9854083061218262, "rewards/margins": -0.05630387365818024, "rewards/rejected": -0.9291044473648071, "step": 441 }, { "epoch": 0.42, "grad_norm": 15.34579086303711, "learning_rate": 4.783140958377055e-07, "logps/chosen": -38.032371520996094, "logps/rejected": -58.148345947265625, "loss": 0.5269, "losses/dpo": 0.3750840127468109, "losses/sft": 1.2935688495635986, "losses/total": 0.3750840127468109, "ref_logps/chosen": -31.074722290039062, "ref_logps/rejected": -46.435943603515625, "rewards/accuracies": 0.75, "rewards/chosen": -0.6957647800445557, "rewards/margins": 0.4754754602909088, "rewards/rejected": -1.171240210533142, "step": 442 }, { "epoch": 0.42, "grad_norm": 14.68557357788086, "learning_rate": 4.78139209513816e-07, "logps/chosen": -40.66063690185547, "logps/rejected": -54.25035858154297, "loss": 0.4711, "losses/dpo": 0.5740063786506653, "losses/sft": 1.3817611932754517, "losses/total": 0.5740063786506653, "ref_logps/chosen": -35.359954833984375, "ref_logps/rejected": -42.793907165527344, "rewards/accuracies": 0.875, "rewards/chosen": -0.5300679802894592, "rewards/margins": 0.6155773401260376, "rewards/rejected": -1.1456453800201416, "step": 443 }, { "epoch": 0.42, "grad_norm": 18.490785598754883, "learning_rate": 4.779643231899265e-07, "logps/chosen": -41.23979949951172, "logps/rejected": -56.92577362060547, "loss": 0.5916, "losses/dpo": 0.555675745010376, "losses/sft": 1.7466236352920532, "losses/total": 0.555675745010376, "ref_logps/chosen": -32.979679107666016, "ref_logps/rejected": -45.45464324951172, "rewards/accuracies": 0.625, "rewards/chosen": -0.8260120153427124, "rewards/margins": 0.3211010694503784, "rewards/rejected": -1.1471130847930908, "step": 444 }, { "epoch": 0.42, "grad_norm": 16.052154541015625, "learning_rate": 4.777894368660371e-07, "logps/chosen": -38.793434143066406, "logps/rejected": -45.60943603515625, "loss": 0.6371, "losses/dpo": 0.7017737627029419, "losses/sft": 1.5881775617599487, "losses/total": 0.7017737627029419, "ref_logps/chosen": -30.47211265563965, "ref_logps/rejected": -34.91651916503906, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8321321606636047, "rewards/margins": 0.23715932667255402, "rewards/rejected": -1.069291591644287, "step": 445 }, { "epoch": 0.42, "grad_norm": 19.183164596557617, "learning_rate": 4.776145505421476e-07, "logps/chosen": -47.78739929199219, "logps/rejected": -57.933128356933594, "loss": 0.5936, "losses/dpo": 0.41501307487487793, "losses/sft": 1.1630043983459473, "losses/total": 0.41501307487487793, "ref_logps/chosen": -40.3790397644043, "ref_logps/rejected": -47.480621337890625, "rewards/accuracies": 0.625, "rewards/chosen": -0.7408355474472046, "rewards/margins": 0.30441516637802124, "rewards/rejected": -1.045250654220581, "step": 446 }, { "epoch": 0.42, "grad_norm": 19.889230728149414, "learning_rate": 4.774396642182581e-07, "logps/chosen": -49.03253936767578, "logps/rejected": -60.4730339050293, "loss": 0.6154, "losses/dpo": 0.5732200145721436, "losses/sft": 1.8470089435577393, "losses/total": 0.5732200145721436, "ref_logps/chosen": -38.5589714050293, "ref_logps/rejected": -46.730350494384766, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0473569631576538, "rewards/margins": 0.32691168785095215, "rewards/rejected": -1.374268651008606, "step": 447 }, { "epoch": 0.42, "grad_norm": 22.205015182495117, "learning_rate": 4.772647778943686e-07, "logps/chosen": -52.47529983520508, "logps/rejected": -57.048583984375, "loss": 0.6867, "losses/dpo": 0.43346765637397766, "losses/sft": 1.9497685432434082, "losses/total": 0.43346765637397766, "ref_logps/chosen": -43.684791564941406, "ref_logps/rejected": -46.37071990966797, "rewards/accuracies": 0.625, "rewards/chosen": -0.8790508508682251, "rewards/margins": 0.18873567879199982, "rewards/rejected": -1.067786455154419, "step": 448 }, { "epoch": 0.42, "grad_norm": 21.74654197692871, "learning_rate": 4.770898915704791e-07, "logps/chosen": -55.387115478515625, "logps/rejected": -53.995975494384766, "loss": 0.6333, "losses/dpo": 0.7112340927124023, "losses/sft": 2.041130781173706, "losses/total": 0.7112340927124023, "ref_logps/chosen": -46.39142608642578, "ref_logps/rejected": -42.75465774536133, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8995682597160339, "rewards/margins": 0.22456341981887817, "rewards/rejected": -1.124131679534912, "step": 449 }, { "epoch": 0.42, "grad_norm": 18.93586540222168, "learning_rate": 4.769150052465897e-07, "logps/chosen": -51.54730224609375, "logps/rejected": -55.797611236572266, "loss": 0.6069, "losses/dpo": 0.6966808438301086, "losses/sft": 1.768096923828125, "losses/total": 0.6966808438301086, "ref_logps/chosen": -42.923240661621094, "ref_logps/rejected": -44.025184631347656, "rewards/accuracies": 0.625, "rewards/chosen": -0.8624064922332764, "rewards/margins": 0.31483587622642517, "rewards/rejected": -1.1772422790527344, "step": 450 }, { "epoch": 0.43, "grad_norm": 20.891456604003906, "learning_rate": 4.767401189227002e-07, "logps/chosen": -51.11573028564453, "logps/rejected": -53.578433990478516, "loss": 0.6487, "losses/dpo": 0.5291703939437866, "losses/sft": 1.383335828781128, "losses/total": 0.5291703939437866, "ref_logps/chosen": -42.64189147949219, "ref_logps/rejected": -42.661312103271484, "rewards/accuracies": 0.625, "rewards/chosen": -0.8473836779594421, "rewards/margins": 0.2443285584449768, "rewards/rejected": -1.091712236404419, "step": 451 }, { "epoch": 0.43, "grad_norm": 14.952369689941406, "learning_rate": 4.7656523259881074e-07, "logps/chosen": -45.62141418457031, "logps/rejected": -57.92384338378906, "loss": 0.4864, "losses/dpo": 0.56844162940979, "losses/sft": 1.3780332803726196, "losses/total": 0.56844162940979, "ref_logps/chosen": -40.546173095703125, "ref_logps/rejected": -47.22905731201172, "rewards/accuracies": 0.875, "rewards/chosen": -0.5075246095657349, "rewards/margins": 0.5619542598724365, "rewards/rejected": -1.0694787502288818, "step": 452 }, { "epoch": 0.43, "grad_norm": 16.49131965637207, "learning_rate": 4.763903462749213e-07, "logps/chosen": -35.94932556152344, "logps/rejected": -43.545989990234375, "loss": 0.5882, "losses/dpo": 0.7098357677459717, "losses/sft": 1.9436204433441162, "losses/total": 0.7098357677459717, "ref_logps/chosen": -30.23165512084961, "ref_logps/rejected": -34.85575866699219, "rewards/accuracies": 0.75, "rewards/chosen": -0.5717674493789673, "rewards/margins": 0.29725557565689087, "rewards/rejected": -0.8690229654312134, "step": 453 }, { "epoch": 0.43, "grad_norm": 18.1254825592041, "learning_rate": 4.762154599510318e-07, "logps/chosen": -56.57024383544922, "logps/rejected": -73.067138671875, "loss": 0.5017, "losses/dpo": 0.29225221276283264, "losses/sft": 1.9481486082077026, "losses/total": 0.29225221276283264, "ref_logps/chosen": -46.44720458984375, "ref_logps/rejected": -57.4094123840332, "rewards/accuracies": 0.875, "rewards/chosen": -1.0123039484024048, "rewards/margins": 0.5534693598747253, "rewards/rejected": -1.5657734870910645, "step": 454 }, { "epoch": 0.43, "grad_norm": 17.663000106811523, "learning_rate": 4.7604057362714233e-07, "logps/chosen": -43.21162033081055, "logps/rejected": -50.22489547729492, "loss": 0.564, "losses/dpo": 0.36530470848083496, "losses/sft": 1.479834794998169, "losses/total": 0.36530470848083496, "ref_logps/chosen": -35.02986526489258, "ref_logps/rejected": -38.19993591308594, "rewards/accuracies": 0.75, "rewards/chosen": -0.8181754946708679, "rewards/margins": 0.3843207061290741, "rewards/rejected": -1.2024962902069092, "step": 455 }, { "epoch": 0.43, "grad_norm": 21.488021850585938, "learning_rate": 4.758656873032529e-07, "logps/chosen": -48.68894577026367, "logps/rejected": -58.52639389038086, "loss": 0.6352, "losses/dpo": 0.5732907056808472, "losses/sft": 1.8608347177505493, "losses/total": 0.5732907056808472, "ref_logps/chosen": -41.12689971923828, "ref_logps/rejected": -47.08283615112305, "rewards/accuracies": 0.75, "rewards/chosen": -0.7562047243118286, "rewards/margins": 0.38815122842788696, "rewards/rejected": -1.1443560123443604, "step": 456 }, { "epoch": 0.43, "grad_norm": 16.04104995727539, "learning_rate": 4.7569080097936335e-07, "logps/chosen": -35.751747131347656, "logps/rejected": -48.58073806762695, "loss": 0.5742, "losses/dpo": 0.4595525860786438, "losses/sft": 1.1189483404159546, "losses/total": 0.4595525860786438, "ref_logps/chosen": -28.997360229492188, "ref_logps/rejected": -38.64642333984375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6754388213157654, "rewards/margins": 0.3179926872253418, "rewards/rejected": -0.9934315085411072, "step": 457 }, { "epoch": 0.43, "grad_norm": 22.913747787475586, "learning_rate": 4.755159146554739e-07, "logps/chosen": -51.341373443603516, "logps/rejected": -56.24919891357422, "loss": 0.6936, "losses/dpo": 0.670680582523346, "losses/sft": 1.5938969850540161, "losses/total": 0.670680582523346, "ref_logps/chosen": -42.247589111328125, "ref_logps/rejected": -45.67637252807617, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9093787670135498, "rewards/margins": 0.14790397882461548, "rewards/rejected": -1.0572826862335205, "step": 458 }, { "epoch": 0.43, "grad_norm": 21.591320037841797, "learning_rate": 4.753410283315845e-07, "logps/chosen": -48.552101135253906, "logps/rejected": -71.198974609375, "loss": 0.5838, "losses/dpo": 0.5087445974349976, "losses/sft": 1.36105477809906, "losses/total": 0.5087445974349976, "ref_logps/chosen": -38.676673889160156, "ref_logps/rejected": -58.15402603149414, "rewards/accuracies": 0.75, "rewards/chosen": -0.9875422716140747, "rewards/margins": 0.31695204973220825, "rewards/rejected": -1.3044943809509277, "step": 459 }, { "epoch": 0.43, "grad_norm": 18.097824096679688, "learning_rate": 4.75166142007695e-07, "logps/chosen": -49.69443893432617, "logps/rejected": -55.27891159057617, "loss": 0.618, "losses/dpo": 0.33629435300827026, "losses/sft": 1.4625208377838135, "losses/total": 0.33629435300827026, "ref_logps/chosen": -42.104835510253906, "ref_logps/rejected": -44.32172393798828, "rewards/accuracies": 0.75, "rewards/chosen": -0.7589606046676636, "rewards/margins": 0.3367578387260437, "rewards/rejected": -1.0957183837890625, "step": 460 }, { "epoch": 0.44, "grad_norm": 17.464462280273438, "learning_rate": 4.749912556838055e-07, "logps/chosen": -33.44009780883789, "logps/rejected": -50.5752067565918, "loss": 0.6021, "losses/dpo": 0.4094173312187195, "losses/sft": 1.7006267309188843, "losses/total": 0.4094173312187195, "ref_logps/chosen": -24.201457977294922, "ref_logps/rejected": -38.730926513671875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9238638877868652, "rewards/margins": 0.2605639100074768, "rewards/rejected": -1.1844277381896973, "step": 461 }, { "epoch": 0.44, "grad_norm": 22.261863708496094, "learning_rate": 4.74816369359916e-07, "logps/chosen": -44.44083023071289, "logps/rejected": -42.3175048828125, "loss": 0.7428, "losses/dpo": 0.6097182035446167, "losses/sft": 1.3318395614624023, "losses/total": 0.6097182035446167, "ref_logps/chosen": -36.43798065185547, "ref_logps/rejected": -33.59527587890625, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8002848625183105, "rewards/margins": 0.07193787395954132, "rewards/rejected": -0.8722227811813354, "step": 462 }, { "epoch": 0.44, "grad_norm": 17.269041061401367, "learning_rate": 4.746414830360266e-07, "logps/chosen": -42.34362030029297, "logps/rejected": -50.94217300415039, "loss": 0.6089, "losses/dpo": 0.5902761220932007, "losses/sft": 1.8555960655212402, "losses/total": 0.5902761220932007, "ref_logps/chosen": -34.671905517578125, "ref_logps/rejected": -40.2083740234375, "rewards/accuracies": 0.625, "rewards/chosen": -0.7671712636947632, "rewards/margins": 0.30620861053466797, "rewards/rejected": -1.0733798742294312, "step": 463 }, { "epoch": 0.44, "grad_norm": 17.21843719482422, "learning_rate": 4.7446659671213705e-07, "logps/chosen": -40.25098419189453, "logps/rejected": -46.70378112792969, "loss": 0.6523, "losses/dpo": 0.7448264360427856, "losses/sft": 1.8023455142974854, "losses/total": 0.7448264360427856, "ref_logps/chosen": -31.76789665222168, "ref_logps/rejected": -35.90791320800781, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8483086824417114, "rewards/margins": 0.2312774658203125, "rewards/rejected": -1.0795860290527344, "step": 464 }, { "epoch": 0.44, "grad_norm": 21.60810089111328, "learning_rate": 4.742917103882476e-07, "logps/chosen": -45.472679138183594, "logps/rejected": -46.75816345214844, "loss": 0.7712, "losses/dpo": 0.5438826084136963, "losses/sft": 1.6787500381469727, "losses/total": 0.5438826084136963, "ref_logps/chosen": -34.86170196533203, "ref_logps/rejected": -36.738441467285156, "rewards/accuracies": 0.4375, "rewards/chosen": -1.0610976219177246, "rewards/margins": -0.0591253936290741, "rewards/rejected": -1.0019723176956177, "step": 465 }, { "epoch": 0.44, "grad_norm": 16.170509338378906, "learning_rate": 4.741168240643582e-07, "logps/chosen": -39.71556854248047, "logps/rejected": -46.69499969482422, "loss": 0.5679, "losses/dpo": 0.4159000515937805, "losses/sft": 0.8769636750221252, "losses/total": 0.4159000515937805, "ref_logps/chosen": -33.49738311767578, "ref_logps/rejected": -36.21461486816406, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6218188405036926, "rewards/margins": 0.4262195825576782, "rewards/rejected": -1.0480384826660156, "step": 466 }, { "epoch": 0.44, "grad_norm": 19.162927627563477, "learning_rate": 4.739419377404687e-07, "logps/chosen": -41.015533447265625, "logps/rejected": -48.816219329833984, "loss": 0.661, "losses/dpo": 0.8000115156173706, "losses/sft": 1.8277498483657837, "losses/total": 0.8000115156173706, "ref_logps/chosen": -32.41111373901367, "ref_logps/rejected": -37.9894905090332, "rewards/accuracies": 0.5, "rewards/chosen": -0.860442042350769, "rewards/margins": 0.2222307026386261, "rewards/rejected": -1.0826727151870728, "step": 467 }, { "epoch": 0.44, "grad_norm": 20.709863662719727, "learning_rate": 4.737670514165792e-07, "logps/chosen": -52.0721321105957, "logps/rejected": -68.4088134765625, "loss": 0.5521, "losses/dpo": 0.37309297919273376, "losses/sft": 1.539156198501587, "losses/total": 0.37309297919273376, "ref_logps/chosen": -44.31230163574219, "ref_logps/rejected": -55.56413269042969, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7759832143783569, "rewards/margins": 0.5084854960441589, "rewards/rejected": -1.284468650817871, "step": 468 }, { "epoch": 0.44, "grad_norm": 23.58916473388672, "learning_rate": 4.735921650926897e-07, "logps/chosen": -48.31652069091797, "logps/rejected": -53.9385871887207, "loss": 0.7458, "losses/dpo": 0.7906002998352051, "losses/sft": 1.6727759838104248, "losses/total": 0.7906002998352051, "ref_logps/chosen": -36.828189849853516, "ref_logps/rejected": -42.351768493652344, "rewards/accuracies": 0.375, "rewards/chosen": -1.1488333940505981, "rewards/margins": 0.009848490357398987, "rewards/rejected": -1.1586819887161255, "step": 469 }, { "epoch": 0.44, "grad_norm": 19.42133903503418, "learning_rate": 4.734172787688003e-07, "logps/chosen": -45.268280029296875, "logps/rejected": -64.12366485595703, "loss": 0.5553, "losses/dpo": 0.5998967289924622, "losses/sft": 1.2374427318572998, "losses/total": 0.5998967289924622, "ref_logps/chosen": -36.237701416015625, "ref_logps/rejected": -50.12152862548828, "rewards/accuracies": 0.75, "rewards/chosen": -0.903057873249054, "rewards/margins": 0.49715644121170044, "rewards/rejected": -1.4002143144607544, "step": 470 }, { "epoch": 0.44, "grad_norm": 26.29847526550293, "learning_rate": 4.7324239244491074e-07, "logps/chosen": -60.17868423461914, "logps/rejected": -65.33236694335938, "loss": 0.8158, "losses/dpo": 0.7402694225311279, "losses/sft": 1.5119843482971191, "losses/total": 0.7402694225311279, "ref_logps/chosen": -45.775604248046875, "ref_logps/rejected": -51.926429748535156, "rewards/accuracies": 0.5, "rewards/chosen": -1.4403083324432373, "rewards/margins": -0.0997144877910614, "rewards/rejected": -1.3405938148498535, "step": 471 }, { "epoch": 0.45, "grad_norm": 19.726667404174805, "learning_rate": 4.730675061210213e-07, "logps/chosen": -40.36116027832031, "logps/rejected": -48.045719146728516, "loss": 0.6722, "losses/dpo": 0.5413148999214172, "losses/sft": 1.5484027862548828, "losses/total": 0.5413148999214172, "ref_logps/chosen": -30.498477935791016, "ref_logps/rejected": -35.477359771728516, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9862680435180664, "rewards/margins": 0.2705676257610321, "rewards/rejected": -1.256835699081421, "step": 472 }, { "epoch": 0.45, "grad_norm": 19.40871238708496, "learning_rate": 4.7289261979713187e-07, "logps/chosen": -49.310821533203125, "logps/rejected": -47.04783630371094, "loss": 0.6942, "losses/dpo": 0.9769923686981201, "losses/sft": 2.333613872528076, "losses/total": 0.9769923686981201, "ref_logps/chosen": -40.01148986816406, "ref_logps/rejected": -36.74407958984375, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9299333095550537, "rewards/margins": 0.10044249892234802, "rewards/rejected": -1.0303759574890137, "step": 473 }, { "epoch": 0.45, "grad_norm": 18.19770050048828, "learning_rate": 4.727177334732424e-07, "logps/chosen": -42.30438232421875, "logps/rejected": -55.242313385009766, "loss": 0.4734, "losses/dpo": 0.7860375642776489, "losses/sft": 1.4256479740142822, "losses/total": 0.7860375642776489, "ref_logps/chosen": -34.156639099121094, "ref_logps/rejected": -39.7231559753418, "rewards/accuracies": 0.875, "rewards/chosen": -0.8147742748260498, "rewards/margins": 0.7371412515640259, "rewards/rejected": -1.5519155263900757, "step": 474 }, { "epoch": 0.45, "grad_norm": 18.775230407714844, "learning_rate": 4.725428471493529e-07, "logps/chosen": -46.815521240234375, "logps/rejected": -63.159793853759766, "loss": 0.6024, "losses/dpo": 0.598051905632019, "losses/sft": 1.55747389793396, "losses/total": 0.598051905632019, "ref_logps/chosen": -37.30512237548828, "ref_logps/rejected": -50.9106330871582, "rewards/accuracies": 0.75, "rewards/chosen": -0.951040506362915, "rewards/margins": 0.27387550473213196, "rewards/rejected": -1.2249159812927246, "step": 475 }, { "epoch": 0.45, "grad_norm": 15.7463960647583, "learning_rate": 4.723679608254634e-07, "logps/chosen": -35.400634765625, "logps/rejected": -49.15324401855469, "loss": 0.5159, "losses/dpo": 0.8908594846725464, "losses/sft": 1.7306084632873535, "losses/total": 0.8908594846725464, "ref_logps/chosen": -28.968732833862305, "ref_logps/rejected": -36.71923065185547, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6431903839111328, "rewards/margins": 0.6002112627029419, "rewards/rejected": -1.2434017658233643, "step": 476 }, { "epoch": 0.45, "grad_norm": 20.361690521240234, "learning_rate": 4.72193074501574e-07, "logps/chosen": -49.52460479736328, "logps/rejected": -58.65748977661133, "loss": 0.6242, "losses/dpo": 0.871837854385376, "losses/sft": 1.6254528760910034, "losses/total": 0.871837854385376, "ref_logps/chosen": -41.26051330566406, "ref_logps/rejected": -46.249664306640625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8264090418815613, "rewards/margins": 0.4143737554550171, "rewards/rejected": -1.2407827377319336, "step": 477 }, { "epoch": 0.45, "grad_norm": 17.02433204650879, "learning_rate": 4.7201818817768444e-07, "logps/chosen": -33.82848358154297, "logps/rejected": -38.8887939453125, "loss": 0.6757, "losses/dpo": 0.5560075044631958, "losses/sft": 1.0706725120544434, "losses/total": 0.5560075044631958, "ref_logps/chosen": -26.161861419677734, "ref_logps/rejected": -30.305509567260742, "rewards/accuracies": 0.625, "rewards/chosen": -0.7666621208190918, "rewards/margins": 0.09166643023490906, "rewards/rejected": -0.858328640460968, "step": 478 }, { "epoch": 0.45, "grad_norm": 18.766489028930664, "learning_rate": 4.71843301853795e-07, "logps/chosen": -45.95814514160156, "logps/rejected": -52.7591552734375, "loss": 0.5795, "losses/dpo": 0.5951334834098816, "losses/sft": 1.0161874294281006, "losses/total": 0.5951334834098816, "ref_logps/chosen": -38.287296295166016, "ref_logps/rejected": -42.2863883972168, "rewards/accuracies": 0.75, "rewards/chosen": -0.767085075378418, "rewards/margins": 0.2801916301250458, "rewards/rejected": -1.0472767353057861, "step": 479 }, { "epoch": 0.45, "grad_norm": 17.539127349853516, "learning_rate": 4.7166841552990557e-07, "logps/chosen": -36.53566360473633, "logps/rejected": -54.15878677368164, "loss": 0.6424, "losses/dpo": 0.7673262357711792, "losses/sft": 1.7970187664031982, "losses/total": 0.7673262357711792, "ref_logps/chosen": -28.678905487060547, "ref_logps/rejected": -43.651466369628906, "rewards/accuracies": 0.75, "rewards/chosen": -0.785676121711731, "rewards/margins": 0.2650558054447174, "rewards/rejected": -1.050731897354126, "step": 480 }, { "epoch": 0.45, "grad_norm": 19.44285011291504, "learning_rate": 4.714935292060161e-07, "logps/chosen": -41.32199478149414, "logps/rejected": -59.14490509033203, "loss": 0.5627, "losses/dpo": 0.45880869030952454, "losses/sft": 1.7741310596466064, "losses/total": 0.45880869030952454, "ref_logps/chosen": -33.38575744628906, "ref_logps/rejected": -47.139739990234375, "rewards/accuracies": 0.625, "rewards/chosen": -0.793623685836792, "rewards/margins": 0.40689271688461304, "rewards/rejected": -1.2005164623260498, "step": 481 }, { "epoch": 0.46, "grad_norm": 15.83872127532959, "learning_rate": 4.713186428821266e-07, "logps/chosen": -45.21366882324219, "logps/rejected": -60.75635528564453, "loss": 0.5274, "losses/dpo": 0.4298006296157837, "losses/sft": 1.198513388633728, "losses/total": 0.4298006296157837, "ref_logps/chosen": -37.90647506713867, "ref_logps/rejected": -47.69110870361328, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7307194471359253, "rewards/margins": 0.5758055448532104, "rewards/rejected": -1.3065249919891357, "step": 482 }, { "epoch": 0.46, "grad_norm": 14.492478370666504, "learning_rate": 4.711437565582371e-07, "logps/chosen": -38.78242492675781, "logps/rejected": -44.837562561035156, "loss": 0.5355, "losses/dpo": 0.5942108631134033, "losses/sft": 1.7184935808181763, "losses/total": 0.5942108631134033, "ref_logps/chosen": -32.85670471191406, "ref_logps/rejected": -34.80342102050781, "rewards/accuracies": 0.75, "rewards/chosen": -0.5925720930099487, "rewards/margins": 0.41084229946136475, "rewards/rejected": -1.0034143924713135, "step": 483 }, { "epoch": 0.46, "grad_norm": 18.78915786743164, "learning_rate": 4.7096887023434767e-07, "logps/chosen": -48.04331970214844, "logps/rejected": -68.16919708251953, "loss": 0.5494, "losses/dpo": 0.6037707328796387, "losses/sft": 1.6506601572036743, "losses/total": 0.6037707328796387, "ref_logps/chosen": -38.50733184814453, "ref_logps/rejected": -54.90313720703125, "rewards/accuracies": 0.625, "rewards/chosen": -0.9535988569259644, "rewards/margins": 0.3730068802833557, "rewards/rejected": -1.3266057968139648, "step": 484 }, { "epoch": 0.46, "grad_norm": 19.55449104309082, "learning_rate": 4.707939839104582e-07, "logps/chosen": -54.59739685058594, "logps/rejected": -56.72189712524414, "loss": 0.6101, "losses/dpo": 0.4871029853820801, "losses/sft": 1.7814446687698364, "losses/total": 0.4871029853820801, "ref_logps/chosen": -46.157718658447266, "ref_logps/rejected": -45.65517044067383, "rewards/accuracies": 0.625, "rewards/chosen": -0.8439677953720093, "rewards/margins": 0.2627049684524536, "rewards/rejected": -1.106672763824463, "step": 485 }, { "epoch": 0.46, "grad_norm": 16.17681312561035, "learning_rate": 4.706190975865687e-07, "logps/chosen": -46.72882080078125, "logps/rejected": -73.10804748535156, "loss": 0.4612, "losses/dpo": 0.6411159038543701, "losses/sft": 1.7147176265716553, "losses/total": 0.6411159038543701, "ref_logps/chosen": -38.388999938964844, "ref_logps/rejected": -57.14522933959961, "rewards/accuracies": 0.75, "rewards/chosen": -0.8339822888374329, "rewards/margins": 0.7622995376586914, "rewards/rejected": -1.5962817668914795, "step": 486 }, { "epoch": 0.46, "grad_norm": 23.703916549682617, "learning_rate": 4.7044421126267926e-07, "logps/chosen": -45.02565002441406, "logps/rejected": -43.16627502441406, "loss": 0.7874, "losses/dpo": 0.5986388921737671, "losses/sft": 1.2237434387207031, "losses/total": 0.5986388921737671, "ref_logps/chosen": -36.05559539794922, "ref_logps/rejected": -34.63477325439453, "rewards/accuracies": 0.5, "rewards/chosen": -0.8970053195953369, "rewards/margins": -0.0438552126288414, "rewards/rejected": -0.8531500697135925, "step": 487 }, { "epoch": 0.46, "grad_norm": 18.107013702392578, "learning_rate": 4.702693249387898e-07, "logps/chosen": -44.655616760253906, "logps/rejected": -49.370018005371094, "loss": 0.6378, "losses/dpo": 0.6428918838500977, "losses/sft": 1.8399006128311157, "losses/total": 0.6428918838500977, "ref_logps/chosen": -35.976295471191406, "ref_logps/rejected": -38.152950286865234, "rewards/accuracies": 0.625, "rewards/chosen": -0.8679324984550476, "rewards/margins": 0.2537745237350464, "rewards/rejected": -1.1217069625854492, "step": 488 }, { "epoch": 0.46, "grad_norm": 17.24370002746582, "learning_rate": 4.700944386149003e-07, "logps/chosen": -42.770179748535156, "logps/rejected": -60.08184051513672, "loss": 0.5301, "losses/dpo": 0.5165985822677612, "losses/sft": 1.7652552127838135, "losses/total": 0.5165985822677612, "ref_logps/chosen": -36.12382888793945, "ref_logps/rejected": -48.13255310058594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.664635181427002, "rewards/margins": 0.5302936434745789, "rewards/rejected": -1.194928765296936, "step": 489 }, { "epoch": 0.46, "grad_norm": 18.994253158569336, "learning_rate": 4.699195522910108e-07, "logps/chosen": -46.275970458984375, "logps/rejected": -57.52937316894531, "loss": 0.5453, "losses/dpo": 0.6960052251815796, "losses/sft": 1.6999146938323975, "losses/total": 0.6960052251815796, "ref_logps/chosen": -38.666622161865234, "ref_logps/rejected": -43.64849853515625, "rewards/accuracies": 0.625, "rewards/chosen": -0.760934591293335, "rewards/margins": 0.6271531581878662, "rewards/rejected": -1.3880877494812012, "step": 490 }, { "epoch": 0.46, "grad_norm": 22.115245819091797, "learning_rate": 4.6974466596712137e-07, "logps/chosen": -58.12640380859375, "logps/rejected": -62.0530891418457, "loss": 0.6813, "losses/dpo": 0.5207221508026123, "losses/sft": 1.4057503938674927, "losses/total": 0.5207221508026123, "ref_logps/chosen": -46.12263488769531, "ref_logps/rejected": -48.4195442199707, "rewards/accuracies": 0.625, "rewards/chosen": -1.2003765106201172, "rewards/margins": 0.16297802329063416, "rewards/rejected": -1.3633546829223633, "step": 491 }, { "epoch": 0.46, "grad_norm": 17.894325256347656, "learning_rate": 4.695697796432319e-07, "logps/chosen": -42.253726959228516, "logps/rejected": -54.4864501953125, "loss": 0.5657, "losses/dpo": 0.5236856937408447, "losses/sft": 1.9671494960784912, "losses/total": 0.5236856937408447, "ref_logps/chosen": -34.381656646728516, "ref_logps/rejected": -42.45591354370117, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7872068881988525, "rewards/margins": 0.415846586227417, "rewards/rejected": -1.2030534744262695, "step": 492 }, { "epoch": 0.47, "grad_norm": 17.1875, "learning_rate": 4.693948933193424e-07, "logps/chosen": -42.840702056884766, "logps/rejected": -54.49610900878906, "loss": 0.6294, "losses/dpo": 0.6529833078384399, "losses/sft": 1.8150737285614014, "losses/total": 0.6529833078384399, "ref_logps/chosen": -33.348411560058594, "ref_logps/rejected": -42.27693176269531, "rewards/accuracies": 0.625, "rewards/chosen": -0.9492291212081909, "rewards/margins": 0.2726883888244629, "rewards/rejected": -1.2219175100326538, "step": 493 }, { "epoch": 0.47, "grad_norm": 15.577768325805664, "learning_rate": 4.6922000699545296e-07, "logps/chosen": -42.49756622314453, "logps/rejected": -56.66531753540039, "loss": 0.5734, "losses/dpo": 0.44411545991897583, "losses/sft": 1.9318451881408691, "losses/total": 0.44411545991897583, "ref_logps/chosen": -34.50550842285156, "ref_logps/rejected": -44.91295623779297, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7992058396339417, "rewards/margins": 0.3760302662849426, "rewards/rejected": -1.1752361059188843, "step": 494 }, { "epoch": 0.47, "grad_norm": 22.485267639160156, "learning_rate": 4.6904512067156347e-07, "logps/chosen": -53.93192672729492, "logps/rejected": -56.58464431762695, "loss": 0.739, "losses/dpo": 0.5776136517524719, "losses/sft": 1.7007924318313599, "losses/total": 0.5776136517524719, "ref_logps/chosen": -44.054908752441406, "ref_logps/rejected": -45.867652893066406, "rewards/accuracies": 0.625, "rewards/chosen": -0.9877016544342041, "rewards/margins": 0.0839976891875267, "rewards/rejected": -1.0716992616653442, "step": 495 }, { "epoch": 0.47, "grad_norm": 17.948152542114258, "learning_rate": 4.68870234347674e-07, "logps/chosen": -41.03163146972656, "logps/rejected": -45.37687301635742, "loss": 0.6257, "losses/dpo": 0.5661336183547974, "losses/sft": 2.003648281097412, "losses/total": 0.5661336183547974, "ref_logps/chosen": -34.10818099975586, "ref_logps/rejected": -35.574745178222656, "rewards/accuracies": 0.75, "rewards/chosen": -0.692345380783081, "rewards/margins": 0.28786757588386536, "rewards/rejected": -0.9802129864692688, "step": 496 }, { "epoch": 0.47, "grad_norm": 17.81679916381836, "learning_rate": 4.686953480237845e-07, "logps/chosen": -40.114295959472656, "logps/rejected": -68.06855773925781, "loss": 0.4938, "losses/dpo": 0.3779289722442627, "losses/sft": 1.258560061454773, "losses/total": 0.3779289722442627, "ref_logps/chosen": -34.208866119384766, "ref_logps/rejected": -54.778663635253906, "rewards/accuracies": 0.75, "rewards/chosen": -0.5905427932739258, "rewards/margins": 0.7384467124938965, "rewards/rejected": -1.3289895057678223, "step": 497 }, { "epoch": 0.47, "grad_norm": 17.091392517089844, "learning_rate": 4.6852046169989506e-07, "logps/chosen": -48.91603469848633, "logps/rejected": -78.88955688476562, "loss": 0.494, "losses/dpo": 0.3647302985191345, "losses/sft": 2.0997064113616943, "losses/total": 0.3647302985191345, "ref_logps/chosen": -42.14565658569336, "ref_logps/rejected": -65.46869659423828, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6770377159118652, "rewards/margins": 0.6650483012199402, "rewards/rejected": -1.3420860767364502, "step": 498 }, { "epoch": 0.47, "grad_norm": 19.506563186645508, "learning_rate": 4.6834557537600557e-07, "logps/chosen": -38.25553894042969, "logps/rejected": -47.6988410949707, "loss": 0.6258, "losses/dpo": 0.4752151370048523, "losses/sft": 1.5231549739837646, "losses/total": 0.4752151370048523, "ref_logps/chosen": -31.041873931884766, "ref_logps/rejected": -37.1468391418457, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7213667035102844, "rewards/margins": 0.33383363485336304, "rewards/rejected": -1.0552003383636475, "step": 499 }, { "epoch": 0.47, "grad_norm": 18.562971115112305, "learning_rate": 4.681706890521161e-07, "logps/chosen": -38.205718994140625, "logps/rejected": -44.971038818359375, "loss": 0.6131, "losses/dpo": 0.7807100415229797, "losses/sft": 1.3306481838226318, "losses/total": 0.7807100415229797, "ref_logps/chosen": -31.795337677001953, "ref_logps/rejected": -36.20762252807617, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6410377025604248, "rewards/margins": 0.23530375957489014, "rewards/rejected": -0.8763414025306702, "step": 500 }, { "epoch": 0.47, "grad_norm": 22.99134635925293, "learning_rate": 4.6799580272822665e-07, "logps/chosen": -55.033180236816406, "logps/rejected": -56.21544647216797, "loss": 0.7198, "losses/dpo": 0.7807220816612244, "losses/sft": 1.5503177642822266, "losses/total": 0.7807220816612244, "ref_logps/chosen": -43.038021087646484, "ref_logps/rejected": -42.46408462524414, "rewards/accuracies": 0.625, "rewards/chosen": -1.199515461921692, "rewards/margins": 0.17562085390090942, "rewards/rejected": -1.375136375427246, "step": 501 }, { "epoch": 0.47, "grad_norm": 16.69556999206543, "learning_rate": 4.6782091640433716e-07, "logps/chosen": -46.3759651184082, "logps/rejected": -60.395477294921875, "loss": 0.5059, "losses/dpo": 0.37844979763031006, "losses/sft": 1.8363982439041138, "losses/total": 0.37844979763031006, "ref_logps/chosen": -39.96241760253906, "ref_logps/rejected": -47.32520294189453, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6413544416427612, "rewards/margins": 0.6656726598739624, "rewards/rejected": -1.3070271015167236, "step": 502 }, { "epoch": 0.47, "grad_norm": 18.059375762939453, "learning_rate": 4.676460300804477e-07, "logps/chosen": -40.86088180541992, "logps/rejected": -45.95768356323242, "loss": 0.6248, "losses/dpo": 0.8307574391365051, "losses/sft": 1.4206961393356323, "losses/total": 0.8307574391365051, "ref_logps/chosen": -33.901485443115234, "ref_logps/rejected": -36.60893249511719, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6959394216537476, "rewards/margins": 0.23893572390079498, "rewards/rejected": -0.9348750710487366, "step": 503 }, { "epoch": 0.48, "grad_norm": 22.766759872436523, "learning_rate": 4.6747114375655824e-07, "logps/chosen": -43.040191650390625, "logps/rejected": -54.45762252807617, "loss": 0.67, "losses/dpo": 0.8299270868301392, "losses/sft": 1.9540852308273315, "losses/total": 0.8299270868301392, "ref_logps/chosen": -35.957183837890625, "ref_logps/rejected": -44.16240692138672, "rewards/accuracies": 0.75, "rewards/chosen": -0.7083003520965576, "rewards/margins": 0.32122132182121277, "rewards/rejected": -1.0295215845108032, "step": 504 }, { "epoch": 0.48, "grad_norm": 17.236427307128906, "learning_rate": 4.6729625743266875e-07, "logps/chosen": -40.47539138793945, "logps/rejected": -50.07622528076172, "loss": 0.58, "losses/dpo": 0.6723325848579407, "losses/sft": 1.2062137126922607, "losses/total": 0.6723325848579407, "ref_logps/chosen": -32.4854736328125, "ref_logps/rejected": -38.40970993041992, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7989921569824219, "rewards/margins": 0.3676595985889435, "rewards/rejected": -1.166651725769043, "step": 505 }, { "epoch": 0.48, "grad_norm": 18.677513122558594, "learning_rate": 4.6712137110877927e-07, "logps/chosen": -50.442169189453125, "logps/rejected": -69.31327819824219, "loss": 0.5662, "losses/dpo": 0.9488786458969116, "losses/sft": 1.9949392080307007, "losses/total": 0.9488786458969116, "ref_logps/chosen": -42.01020812988281, "ref_logps/rejected": -54.56141662597656, "rewards/accuracies": 0.75, "rewards/chosen": -0.8431962132453918, "rewards/margins": 0.631990373134613, "rewards/rejected": -1.4751865863800049, "step": 506 }, { "epoch": 0.48, "grad_norm": 14.27094554901123, "learning_rate": 4.669464847848898e-07, "logps/chosen": -41.08687973022461, "logps/rejected": -65.14675903320312, "loss": 0.4831, "losses/dpo": 0.552592396736145, "losses/sft": 1.467944860458374, "losses/total": 0.552592396736145, "ref_logps/chosen": -34.62055969238281, "ref_logps/rejected": -52.06278991699219, "rewards/accuracies": 0.75, "rewards/chosen": -0.6466319561004639, "rewards/margins": 0.6617651581764221, "rewards/rejected": -1.3083970546722412, "step": 507 }, { "epoch": 0.48, "grad_norm": 20.737091064453125, "learning_rate": 4.6677159846100035e-07, "logps/chosen": -39.053672790527344, "logps/rejected": -49.83109664916992, "loss": 0.6259, "losses/dpo": 0.44908422231674194, "losses/sft": 1.63162362575531, "losses/total": 0.44908422231674194, "ref_logps/chosen": -31.550031661987305, "ref_logps/rejected": -39.70254898071289, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7503641843795776, "rewards/margins": 0.26249057054519653, "rewards/rejected": -1.0128546953201294, "step": 508 }, { "epoch": 0.48, "grad_norm": 19.570249557495117, "learning_rate": 4.6659671213711086e-07, "logps/chosen": -37.74221420288086, "logps/rejected": -38.164546966552734, "loss": 0.7403, "losses/dpo": 0.7543376684188843, "losses/sft": 1.7525604963302612, "losses/total": 0.7543376684188843, "ref_logps/chosen": -30.180740356445312, "ref_logps/rejected": -30.425790786743164, "rewards/accuracies": 0.5, "rewards/chosen": -0.7561473250389099, "rewards/margins": 0.017728038132190704, "rewards/rejected": -0.7738754153251648, "step": 509 }, { "epoch": 0.48, "grad_norm": 16.213655471801758, "learning_rate": 4.6642182581322137e-07, "logps/chosen": -43.07244110107422, "logps/rejected": -58.89888000488281, "loss": 0.4809, "losses/dpo": 0.3350576162338257, "losses/sft": 1.128604531288147, "losses/total": 0.3350576162338257, "ref_logps/chosen": -37.63832473754883, "ref_logps/rejected": -47.659271240234375, "rewards/accuracies": 0.875, "rewards/chosen": -0.5434116721153259, "rewards/margins": 0.5805493593215942, "rewards/rejected": -1.123961091041565, "step": 510 }, { "epoch": 0.48, "grad_norm": 20.865148544311523, "learning_rate": 4.6624693948933194e-07, "logps/chosen": -49.687129974365234, "logps/rejected": -51.68817901611328, "loss": 0.6943, "losses/dpo": 0.8964338302612305, "losses/sft": 1.6238555908203125, "losses/total": 0.8964338302612305, "ref_logps/chosen": -40.38511657714844, "ref_logps/rejected": -40.2294807434082, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9302010536193848, "rewards/margins": 0.2156689167022705, "rewards/rejected": -1.1458700895309448, "step": 511 }, { "epoch": 0.48, "grad_norm": 17.870445251464844, "learning_rate": 4.6607205316544245e-07, "logps/chosen": -37.93321228027344, "logps/rejected": -62.82341766357422, "loss": 0.4961, "losses/dpo": 0.618889570236206, "losses/sft": 1.3829941749572754, "losses/total": 0.618889570236206, "ref_logps/chosen": -32.22320556640625, "ref_logps/rejected": -50.822689056396484, "rewards/accuracies": 0.75, "rewards/chosen": -0.5710002183914185, "rewards/margins": 0.6290723085403442, "rewards/rejected": -1.2000726461410522, "step": 512 }, { "epoch": 0.48, "grad_norm": 17.249675750732422, "learning_rate": 4.6589716684155296e-07, "logps/chosen": -46.194698333740234, "logps/rejected": -56.13873291015625, "loss": 0.5549, "losses/dpo": 0.5435903072357178, "losses/sft": 1.60612952709198, "losses/total": 0.5435903072357178, "ref_logps/chosen": -37.873985290527344, "ref_logps/rejected": -43.54327392578125, "rewards/accuracies": 0.75, "rewards/chosen": -0.83207106590271, "rewards/margins": 0.42747408151626587, "rewards/rejected": -1.259545087814331, "step": 513 }, { "epoch": 0.49, "grad_norm": 19.374313354492188, "learning_rate": 4.657222805176635e-07, "logps/chosen": -37.62091064453125, "logps/rejected": -51.132179260253906, "loss": 0.6659, "losses/dpo": 0.6505405902862549, "losses/sft": 1.8584665060043335, "losses/total": 0.6505405902862549, "ref_logps/chosen": -31.067005157470703, "ref_logps/rejected": -43.022987365722656, "rewards/accuracies": 0.5, "rewards/chosen": -0.6553905010223389, "rewards/margins": 0.15552854537963867, "rewards/rejected": -0.8109190464019775, "step": 514 }, { "epoch": 0.49, "grad_norm": 20.405542373657227, "learning_rate": 4.6554739419377404e-07, "logps/chosen": -41.0093994140625, "logps/rejected": -54.108421325683594, "loss": 0.6408, "losses/dpo": 0.5008844137191772, "losses/sft": 1.376187801361084, "losses/total": 0.5008844137191772, "ref_logps/chosen": -34.1032600402832, "ref_logps/rejected": -44.66871643066406, "rewards/accuracies": 0.625, "rewards/chosen": -0.6906140446662903, "rewards/margins": 0.253356397151947, "rewards/rejected": -0.9439705014228821, "step": 515 }, { "epoch": 0.49, "grad_norm": 25.345590591430664, "learning_rate": 4.6537250786988455e-07, "logps/chosen": -54.34903335571289, "logps/rejected": -50.94935607910156, "loss": 0.7937, "losses/dpo": 0.9341498613357544, "losses/sft": 1.5544166564941406, "losses/total": 0.9341498613357544, "ref_logps/chosen": -43.75737762451172, "ref_logps/rejected": -38.691261291503906, "rewards/accuracies": 0.5, "rewards/chosen": -1.0591654777526855, "rewards/margins": 0.16664430499076843, "rewards/rejected": -1.2258098125457764, "step": 516 }, { "epoch": 0.49, "grad_norm": 16.542613983154297, "learning_rate": 4.6519762154599506e-07, "logps/chosen": -41.26185607910156, "logps/rejected": -51.66979217529297, "loss": 0.5381, "losses/dpo": 0.49808430671691895, "losses/sft": 1.7250854969024658, "losses/total": 0.49808430671691895, "ref_logps/chosen": -35.4698600769043, "ref_logps/rejected": -40.898475646972656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5791990160942078, "rewards/margins": 0.4979328513145447, "rewards/rejected": -1.0771318674087524, "step": 517 }, { "epoch": 0.49, "grad_norm": 19.7836856842041, "learning_rate": 4.6502273522210563e-07, "logps/chosen": -37.76109313964844, "logps/rejected": -51.98636245727539, "loss": 0.6677, "losses/dpo": 0.6548014879226685, "losses/sft": 1.4091333150863647, "losses/total": 0.6548014879226685, "ref_logps/chosen": -31.213499069213867, "ref_logps/rejected": -43.97237014770508, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6547592878341675, "rewards/margins": 0.14663998782634735, "rewards/rejected": -0.801399290561676, "step": 518 }, { "epoch": 0.49, "grad_norm": 20.166194915771484, "learning_rate": 4.6484784889821614e-07, "logps/chosen": -50.248191833496094, "logps/rejected": -54.06584167480469, "loss": 0.6071, "losses/dpo": 0.5363638997077942, "losses/sft": 1.1510367393493652, "losses/total": 0.5363638997077942, "ref_logps/chosen": -42.62671661376953, "ref_logps/rejected": -41.8621826171875, "rewards/accuracies": 0.625, "rewards/chosen": -0.7621476650238037, "rewards/margins": 0.4582183361053467, "rewards/rejected": -1.2203660011291504, "step": 519 }, { "epoch": 0.49, "grad_norm": 18.74920082092285, "learning_rate": 4.6467296257432666e-07, "logps/chosen": -48.60076141357422, "logps/rejected": -69.72369384765625, "loss": 0.4872, "losses/dpo": 0.40776166319847107, "losses/sft": 2.0724453926086426, "losses/total": 0.40776166319847107, "ref_logps/chosen": -40.401268005371094, "ref_logps/rejected": -55.63837432861328, "rewards/accuracies": 0.8125, "rewards/chosen": -0.819949746131897, "rewards/margins": 0.58858323097229, "rewards/rejected": -1.4085328578948975, "step": 520 }, { "epoch": 0.49, "grad_norm": 15.707258224487305, "learning_rate": 4.6449807625043717e-07, "logps/chosen": -39.754249572753906, "logps/rejected": -60.420013427734375, "loss": 0.4779, "losses/dpo": 0.3915397822856903, "losses/sft": 1.417066216468811, "losses/total": 0.3915397822856903, "ref_logps/chosen": -33.275630950927734, "ref_logps/rejected": -46.234169006347656, "rewards/accuracies": 0.75, "rewards/chosen": -0.6478619575500488, "rewards/margins": 0.7707231044769287, "rewards/rejected": -1.4185853004455566, "step": 521 }, { "epoch": 0.49, "grad_norm": 16.339000701904297, "learning_rate": 4.6432318992654773e-07, "logps/chosen": -48.527687072753906, "logps/rejected": -62.224937438964844, "loss": 0.4821, "losses/dpo": 0.3421018421649933, "losses/sft": 1.3392013311386108, "losses/total": 0.3421018421649933, "ref_logps/chosen": -40.360755920410156, "ref_logps/rejected": -47.55398941040039, "rewards/accuracies": 0.875, "rewards/chosen": -0.816693127155304, "rewards/margins": 0.6504019498825073, "rewards/rejected": -1.467095136642456, "step": 522 }, { "epoch": 0.49, "grad_norm": 17.996753692626953, "learning_rate": 4.641483036026583e-07, "logps/chosen": -51.03308868408203, "logps/rejected": -67.96345520019531, "loss": 0.5056, "losses/dpo": 0.41352635622024536, "losses/sft": 1.479400873184204, "losses/total": 0.41352635622024536, "ref_logps/chosen": -41.82776641845703, "ref_logps/rejected": -52.19953155517578, "rewards/accuracies": 0.625, "rewards/chosen": -0.9205319881439209, "rewards/margins": 0.6558610200881958, "rewards/rejected": -1.5763931274414062, "step": 523 }, { "epoch": 0.49, "grad_norm": 16.52777099609375, "learning_rate": 4.6397341727876876e-07, "logps/chosen": -39.58159637451172, "logps/rejected": -51.186668395996094, "loss": 0.542, "losses/dpo": 0.3972024917602539, "losses/sft": 1.2694875001907349, "losses/total": 0.3972024917602539, "ref_logps/chosen": -33.00318908691406, "ref_logps/rejected": -39.64402770996094, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6578408479690552, "rewards/margins": 0.49642282724380493, "rewards/rejected": -1.1542637348175049, "step": 524 }, { "epoch": 0.5, "grad_norm": 17.888750076293945, "learning_rate": 4.637985309548793e-07, "logps/chosen": -44.566497802734375, "logps/rejected": -52.6598014831543, "loss": 0.5552, "losses/dpo": 0.6064153909683228, "losses/sft": 1.5405288934707642, "losses/total": 0.6064153909683228, "ref_logps/chosen": -37.5517463684082, "ref_logps/rejected": -40.256202697753906, "rewards/accuracies": 0.75, "rewards/chosen": -0.7014751434326172, "rewards/margins": 0.5388847589492798, "rewards/rejected": -1.2403597831726074, "step": 525 }, { "epoch": 0.5, "grad_norm": 18.140491485595703, "learning_rate": 4.6362364463098984e-07, "logps/chosen": -41.16547393798828, "logps/rejected": -55.885684967041016, "loss": 0.5694, "losses/dpo": 0.46671921014785767, "losses/sft": 1.6265698671340942, "losses/total": 0.46671921014785767, "ref_logps/chosen": -32.732261657714844, "ref_logps/rejected": -43.497371673583984, "rewards/accuracies": 0.75, "rewards/chosen": -0.8433213829994202, "rewards/margins": 0.39550989866256714, "rewards/rejected": -1.2388312816619873, "step": 526 }, { "epoch": 0.5, "grad_norm": 20.340742111206055, "learning_rate": 4.6344875830710035e-07, "logps/chosen": -47.97841262817383, "logps/rejected": -49.0335807800293, "loss": 0.6108, "losses/dpo": 0.7322754859924316, "losses/sft": 1.867557406425476, "losses/total": 0.7322754859924316, "ref_logps/chosen": -41.427330017089844, "ref_logps/rejected": -40.12873840332031, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6551084518432617, "rewards/margins": 0.23537567257881165, "rewards/rejected": -0.8904841542243958, "step": 527 }, { "epoch": 0.5, "grad_norm": 16.094282150268555, "learning_rate": 4.6327387198321086e-07, "logps/chosen": -51.06538009643555, "logps/rejected": -60.69963073730469, "loss": 0.4921, "losses/dpo": 0.5082216262817383, "losses/sft": 1.6114482879638672, "losses/total": 0.5082216262817383, "ref_logps/chosen": -44.52245330810547, "ref_logps/rejected": -48.39900207519531, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6542928218841553, "rewards/margins": 0.5757702589035034, "rewards/rejected": -1.2300630807876587, "step": 528 }, { "epoch": 0.5, "grad_norm": 20.200944900512695, "learning_rate": 4.6309898565932143e-07, "logps/chosen": -43.65022277832031, "logps/rejected": -45.25249481201172, "loss": 0.6692, "losses/dpo": 0.586146354675293, "losses/sft": 1.792749285697937, "losses/total": 0.586146354675293, "ref_logps/chosen": -35.28498458862305, "ref_logps/rejected": -35.23607635498047, "rewards/accuracies": 0.5, "rewards/chosen": -0.8365240097045898, "rewards/margins": 0.16511788964271545, "rewards/rejected": -1.0016417503356934, "step": 529 }, { "epoch": 0.5, "grad_norm": 18.585430145263672, "learning_rate": 4.62924099335432e-07, "logps/chosen": -41.75214767456055, "logps/rejected": -54.3496208190918, "loss": 0.5819, "losses/dpo": 0.6976767182350159, "losses/sft": 1.224543571472168, "losses/total": 0.6976767182350159, "ref_logps/chosen": -34.77529525756836, "ref_logps/rejected": -44.052207946777344, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6976852416992188, "rewards/margins": 0.3320556581020355, "rewards/rejected": -1.0297410488128662, "step": 530 }, { "epoch": 0.5, "grad_norm": 20.460107803344727, "learning_rate": 4.6274921301154245e-07, "logps/chosen": -46.226558685302734, "logps/rejected": -46.63634490966797, "loss": 0.706, "losses/dpo": 0.5846570730209351, "losses/sft": 1.621983289718628, "losses/total": 0.5846570730209351, "ref_logps/chosen": -39.506229400634766, "ref_logps/rejected": -39.551780700683594, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6720329523086548, "rewards/margins": 0.03642330318689346, "rewards/rejected": -0.70845627784729, "step": 531 }, { "epoch": 0.5, "grad_norm": 17.80394172668457, "learning_rate": 4.62574326687653e-07, "logps/chosen": -40.451961517333984, "logps/rejected": -53.589881896972656, "loss": 0.6174, "losses/dpo": 0.5291751027107239, "losses/sft": 1.1986802816390991, "losses/total": 0.5291751027107239, "ref_logps/chosen": -33.04058837890625, "ref_logps/rejected": -42.0926513671875, "rewards/accuracies": 0.75, "rewards/chosen": -0.7411374449729919, "rewards/margins": 0.4085853397846222, "rewards/rejected": -1.149722695350647, "step": 532 }, { "epoch": 0.5, "grad_norm": 19.09368133544922, "learning_rate": 4.6239944036376353e-07, "logps/chosen": -48.723331451416016, "logps/rejected": -54.167301177978516, "loss": 0.547, "losses/dpo": 0.5327960252761841, "losses/sft": 1.3891955614089966, "losses/total": 0.5327960252761841, "ref_logps/chosen": -41.82886505126953, "ref_logps/rejected": -42.401832580566406, "rewards/accuracies": 0.625, "rewards/chosen": -0.6894465684890747, "rewards/margins": 0.48709988594055176, "rewards/rejected": -1.176546335220337, "step": 533 }, { "epoch": 0.5, "grad_norm": 18.902814865112305, "learning_rate": 4.6222455403987404e-07, "logps/chosen": -41.274105072021484, "logps/rejected": -46.66707229614258, "loss": 0.6539, "losses/dpo": 0.678492546081543, "losses/sft": 1.7505420446395874, "losses/total": 0.678492546081543, "ref_logps/chosen": -33.76953125, "ref_logps/rejected": -37.19273376464844, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7504576444625854, "rewards/margins": 0.19697652757167816, "rewards/rejected": -0.9474341869354248, "step": 534 }, { "epoch": 0.51, "grad_norm": 16.190881729125977, "learning_rate": 4.6204966771598456e-07, "logps/chosen": -41.202980041503906, "logps/rejected": -59.25920486450195, "loss": 0.4814, "losses/dpo": 0.5541568994522095, "losses/sft": 1.6433279514312744, "losses/total": 0.5541568994522095, "ref_logps/chosen": -36.29848098754883, "ref_logps/rejected": -48.277801513671875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4904496371746063, "rewards/margins": 0.6076907515525818, "rewards/rejected": -1.0981404781341553, "step": 535 }, { "epoch": 0.51, "grad_norm": 14.119658470153809, "learning_rate": 4.618747813920951e-07, "logps/chosen": -36.84916687011719, "logps/rejected": -50.92412567138672, "loss": 0.5039, "losses/dpo": 0.5139963626861572, "losses/sft": 1.0659281015396118, "losses/total": 0.5139963626861572, "ref_logps/chosen": -32.00421142578125, "ref_logps/rejected": -39.78274154663086, "rewards/accuracies": 0.625, "rewards/chosen": -0.4844956398010254, "rewards/margins": 0.6296426057815552, "rewards/rejected": -1.1141383647918701, "step": 536 }, { "epoch": 0.51, "grad_norm": 17.82438087463379, "learning_rate": 4.616998950682057e-07, "logps/chosen": -43.560691833496094, "logps/rejected": -48.5607795715332, "loss": 0.6292, "losses/dpo": 0.6040087938308716, "losses/sft": 1.223031997680664, "losses/total": 0.6040087938308716, "ref_logps/chosen": -34.930362701416016, "ref_logps/rejected": -37.556705474853516, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8630329370498657, "rewards/margins": 0.23737439513206482, "rewards/rejected": -1.100407361984253, "step": 537 }, { "epoch": 0.51, "grad_norm": 18.799345016479492, "learning_rate": 4.6152500874431615e-07, "logps/chosen": -43.603477478027344, "logps/rejected": -54.45125198364258, "loss": 0.5935, "losses/dpo": 0.5168572664260864, "losses/sft": 1.2226108312606812, "losses/total": 0.5168572664260864, "ref_logps/chosen": -35.670448303222656, "ref_logps/rejected": -43.59748077392578, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7933031916618347, "rewards/margins": 0.29207438230514526, "rewards/rejected": -1.08537757396698, "step": 538 }, { "epoch": 0.51, "grad_norm": 19.961963653564453, "learning_rate": 4.613501224204267e-07, "logps/chosen": -39.98153305053711, "logps/rejected": -48.5761833190918, "loss": 0.6535, "losses/dpo": 1.0349931716918945, "losses/sft": 1.9321941137313843, "losses/total": 1.0349931716918945, "ref_logps/chosen": -33.00051498413086, "ref_logps/rejected": -38.60032653808594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6981016397476196, "rewards/margins": 0.2994837760925293, "rewards/rejected": -0.9975854158401489, "step": 539 }, { "epoch": 0.51, "grad_norm": 18.267555236816406, "learning_rate": 4.611752360965372e-07, "logps/chosen": -49.56752014160156, "logps/rejected": -64.00669860839844, "loss": 0.5189, "losses/dpo": 0.503933846950531, "losses/sft": 1.3660101890563965, "losses/total": 0.503933846950531, "ref_logps/chosen": -44.633785247802734, "ref_logps/rejected": -53.22724151611328, "rewards/accuracies": 0.75, "rewards/chosen": -0.4933737516403198, "rewards/margins": 0.584572434425354, "rewards/rejected": -1.0779461860656738, "step": 540 }, { "epoch": 0.51, "grad_norm": 19.289949417114258, "learning_rate": 4.6100034977264774e-07, "logps/chosen": -55.475101470947266, "logps/rejected": -60.352447509765625, "loss": 0.5869, "losses/dpo": 0.6577602624893188, "losses/sft": 1.8504594564437866, "losses/total": 0.6577602624893188, "ref_logps/chosen": -47.22643280029297, "ref_logps/rejected": -48.38414764404297, "rewards/accuracies": 0.625, "rewards/chosen": -0.8248670101165771, "rewards/margins": 0.3719630837440491, "rewards/rejected": -1.1968300342559814, "step": 541 }, { "epoch": 0.51, "grad_norm": 20.45795440673828, "learning_rate": 4.608254634487583e-07, "logps/chosen": -51.12990188598633, "logps/rejected": -53.9380989074707, "loss": 0.5951, "losses/dpo": 0.8269652724266052, "losses/sft": 1.7129558324813843, "losses/total": 0.8269652724266052, "ref_logps/chosen": -41.75885009765625, "ref_logps/rejected": -40.59892272949219, "rewards/accuracies": 0.75, "rewards/chosen": -0.9371054172515869, "rewards/margins": 0.3968122899532318, "rewards/rejected": -1.3339176177978516, "step": 542 }, { "epoch": 0.51, "grad_norm": 20.200122833251953, "learning_rate": 4.606505771248688e-07, "logps/chosen": -50.2019157409668, "logps/rejected": -49.549659729003906, "loss": 0.5585, "losses/dpo": 0.6339994668960571, "losses/sft": 2.619389772415161, "losses/total": 0.6339994668960571, "ref_logps/chosen": -42.71995544433594, "ref_logps/rejected": -38.63445281982422, "rewards/accuracies": 0.75, "rewards/chosen": -0.7481957674026489, "rewards/margins": 0.343325138092041, "rewards/rejected": -1.09152090549469, "step": 543 }, { "epoch": 0.51, "grad_norm": 20.325183868408203, "learning_rate": 4.604756908009794e-07, "logps/chosen": -37.36647033691406, "logps/rejected": -42.93746566772461, "loss": 0.661, "losses/dpo": 0.7532327175140381, "losses/sft": 2.116211414337158, "losses/total": 0.7532327175140381, "ref_logps/chosen": -31.26726531982422, "ref_logps/rejected": -34.85523223876953, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6099203824996948, "rewards/margins": 0.19830316305160522, "rewards/rejected": -0.8082234859466553, "step": 544 }, { "epoch": 0.51, "grad_norm": 22.209781646728516, "learning_rate": 4.6030080447708984e-07, "logps/chosen": -57.72157669067383, "logps/rejected": -63.55116271972656, "loss": 0.5993, "losses/dpo": 0.4275997281074524, "losses/sft": 2.05446720123291, "losses/total": 0.4275997281074524, "ref_logps/chosen": -47.218875885009766, "ref_logps/rejected": -48.9195556640625, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0502698421478271, "rewards/margins": 0.41289082169532776, "rewards/rejected": -1.463160753250122, "step": 545 }, { "epoch": 0.52, "grad_norm": 17.983835220336914, "learning_rate": 4.601259181532004e-07, "logps/chosen": -40.64090347290039, "logps/rejected": -47.25122833251953, "loss": 0.5603, "losses/dpo": 0.722617506980896, "losses/sft": 1.5367704629898071, "losses/total": 0.722617506980896, "ref_logps/chosen": -35.09404754638672, "ref_logps/rejected": -37.44237518310547, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5546858310699463, "rewards/margins": 0.4261992573738098, "rewards/rejected": -0.9808850288391113, "step": 546 }, { "epoch": 0.52, "grad_norm": 26.236013412475586, "learning_rate": 4.599510318293109e-07, "logps/chosen": -57.17491912841797, "logps/rejected": -62.179683685302734, "loss": 0.7702, "losses/dpo": 0.9172338247299194, "losses/sft": 2.0179269313812256, "losses/total": 0.9172338247299194, "ref_logps/chosen": -46.224395751953125, "ref_logps/rejected": -51.54985427856445, "rewards/accuracies": 0.625, "rewards/chosen": -1.0950521230697632, "rewards/margins": -0.03206905350089073, "rewards/rejected": -1.0629830360412598, "step": 547 }, { "epoch": 0.52, "grad_norm": 22.186540603637695, "learning_rate": 4.5977614550542143e-07, "logps/chosen": -44.1175651550293, "logps/rejected": -49.125244140625, "loss": 0.7027, "losses/dpo": 0.675849437713623, "losses/sft": 1.7110408544540405, "losses/total": 0.675849437713623, "ref_logps/chosen": -35.94295120239258, "ref_logps/rejected": -39.02360534667969, "rewards/accuracies": 0.625, "rewards/chosen": -0.817461371421814, "rewards/margins": 0.19270196557044983, "rewards/rejected": -1.0101633071899414, "step": 548 }, { "epoch": 0.52, "grad_norm": 15.947452545166016, "learning_rate": 4.59601259181532e-07, "logps/chosen": -43.255401611328125, "logps/rejected": -68.79779052734375, "loss": 0.414, "losses/dpo": 0.3710100054740906, "losses/sft": 1.706789255142212, "losses/total": 0.3710100054740906, "ref_logps/chosen": -36.15275573730469, "ref_logps/rejected": -52.01953125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7102651000022888, "rewards/margins": 0.9675604104995728, "rewards/rejected": -1.6778254508972168, "step": 549 }, { "epoch": 0.52, "grad_norm": 20.477737426757812, "learning_rate": 4.594263728576425e-07, "logps/chosen": -42.64044189453125, "logps/rejected": -48.93083953857422, "loss": 0.6011, "losses/dpo": 0.6613829135894775, "losses/sft": 1.7269814014434814, "losses/total": 0.6613829135894775, "ref_logps/chosen": -33.133811950683594, "ref_logps/rejected": -35.95905685424805, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9506632089614868, "rewards/margins": 0.3465149998664856, "rewards/rejected": -1.2971782684326172, "step": 550 }, { "epoch": 0.52, "grad_norm": 16.479202270507812, "learning_rate": 4.592514865337531e-07, "logps/chosen": -50.13810729980469, "logps/rejected": -62.10587692260742, "loss": 0.5175, "losses/dpo": 0.6193148493766785, "losses/sft": 2.2699668407440186, "losses/total": 0.6193148493766785, "ref_logps/chosen": -40.48619079589844, "ref_logps/rejected": -46.197547912597656, "rewards/accuracies": 0.75, "rewards/chosen": -0.9651917815208435, "rewards/margins": 0.6256414651870728, "rewards/rejected": -1.5908331871032715, "step": 551 }, { "epoch": 0.52, "grad_norm": 17.107406616210938, "learning_rate": 4.5907660020986354e-07, "logps/chosen": -38.52226638793945, "logps/rejected": -59.176597595214844, "loss": 0.5397, "losses/dpo": 0.48231440782546997, "losses/sft": 1.854367733001709, "losses/total": 0.48231440782546997, "ref_logps/chosen": -28.64743423461914, "ref_logps/rejected": -44.32575607299805, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9874832630157471, "rewards/margins": 0.4976009130477905, "rewards/rejected": -1.4850842952728271, "step": 552 }, { "epoch": 0.52, "grad_norm": 22.987104415893555, "learning_rate": 4.589017138859741e-07, "logps/chosen": -50.133785247802734, "logps/rejected": -48.91804885864258, "loss": 0.6585, "losses/dpo": 0.6219667196273804, "losses/sft": 1.4681081771850586, "losses/total": 0.6219667196273804, "ref_logps/chosen": -39.487632751464844, "ref_logps/rejected": -35.85594177246094, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0646154880523682, "rewards/margins": 0.24159535765647888, "rewards/rejected": -1.3062108755111694, "step": 553 }, { "epoch": 0.52, "grad_norm": 22.33740234375, "learning_rate": 4.587268275620846e-07, "logps/chosen": -53.852081298828125, "logps/rejected": -60.23436737060547, "loss": 0.6664, "losses/dpo": 0.9895302057266235, "losses/sft": 1.8812607526779175, "losses/total": 0.9895302057266235, "ref_logps/chosen": -41.266414642333984, "ref_logps/rejected": -45.49755096435547, "rewards/accuracies": 0.75, "rewards/chosen": -1.2585668563842773, "rewards/margins": 0.21511486172676086, "rewards/rejected": -1.4736816883087158, "step": 554 }, { "epoch": 0.52, "grad_norm": 24.779027938842773, "learning_rate": 4.5855194123819513e-07, "logps/chosen": -58.739349365234375, "logps/rejected": -66.29627990722656, "loss": 0.6482, "losses/dpo": 0.1946183741092682, "losses/sft": 1.8658313751220703, "losses/total": 0.1946183741092682, "ref_logps/chosen": -46.45478439331055, "ref_logps/rejected": -49.61613845825195, "rewards/accuracies": 0.75, "rewards/chosen": -1.2284562587738037, "rewards/margins": 0.43955832719802856, "rewards/rejected": -1.6680145263671875, "step": 555 }, { "epoch": 0.53, "grad_norm": 19.251026153564453, "learning_rate": 4.583770549143057e-07, "logps/chosen": -51.381412506103516, "logps/rejected": -61.252723693847656, "loss": 0.5333, "losses/dpo": 0.675920307636261, "losses/sft": 1.8289772272109985, "losses/total": 0.675920307636261, "ref_logps/chosen": -43.014129638671875, "ref_logps/rejected": -47.88116455078125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8367282748222351, "rewards/margins": 0.5004271864891052, "rewards/rejected": -1.3371554613113403, "step": 556 }, { "epoch": 0.53, "grad_norm": 19.037635803222656, "learning_rate": 4.582021685904162e-07, "logps/chosen": -42.47540283203125, "logps/rejected": -47.087257385253906, "loss": 0.6694, "losses/dpo": 0.6714488863945007, "losses/sft": 1.8966470956802368, "losses/total": 0.6714488863945007, "ref_logps/chosen": -33.46836853027344, "ref_logps/rejected": -36.699928283691406, "rewards/accuracies": 0.75, "rewards/chosen": -0.9007034301757812, "rewards/margins": 0.13802987337112427, "rewards/rejected": -1.0387332439422607, "step": 557 }, { "epoch": 0.53, "grad_norm": 16.663394927978516, "learning_rate": 4.5802728226652677e-07, "logps/chosen": -49.5675048828125, "logps/rejected": -66.3643569946289, "loss": 0.4732, "losses/dpo": 0.3888000249862671, "losses/sft": 1.814093828201294, "losses/total": 0.3888000249862671, "ref_logps/chosen": -38.756683349609375, "ref_logps/rejected": -47.82711410522461, "rewards/accuracies": 0.875, "rewards/chosen": -1.0810821056365967, "rewards/margins": 0.772642195224762, "rewards/rejected": -1.8537243604660034, "step": 558 }, { "epoch": 0.53, "grad_norm": 14.421659469604492, "learning_rate": 4.5785239594263723e-07, "logps/chosen": -45.214996337890625, "logps/rejected": -66.09407043457031, "loss": 0.3689, "losses/dpo": 0.4478214681148529, "losses/sft": 1.4946962594985962, "losses/total": 0.4478214681148529, "ref_logps/chosen": -37.52229309082031, "ref_logps/rejected": -46.532798767089844, "rewards/accuracies": 0.875, "rewards/chosen": -0.769270658493042, "rewards/margins": 1.1868572235107422, "rewards/rejected": -1.9561278820037842, "step": 559 }, { "epoch": 0.53, "grad_norm": 23.122665405273438, "learning_rate": 4.576775096187478e-07, "logps/chosen": -56.175209045410156, "logps/rejected": -67.87620544433594, "loss": 0.768, "losses/dpo": 0.6455317735671997, "losses/sft": 1.3315482139587402, "losses/total": 0.6455317735671997, "ref_logps/chosen": -44.463558197021484, "ref_logps/rejected": -54.495849609375, "rewards/accuracies": 0.5, "rewards/chosen": -1.1711652278900146, "rewards/margins": 0.1668698787689209, "rewards/rejected": -1.338035225868225, "step": 560 }, { "epoch": 0.53, "grad_norm": 17.649150848388672, "learning_rate": 4.5750262329485836e-07, "logps/chosen": -41.43122100830078, "logps/rejected": -69.98138427734375, "loss": 0.4408, "losses/dpo": 0.4343390464782715, "losses/sft": 1.6604691743850708, "losses/total": 0.4343390464782715, "ref_logps/chosen": -32.734798431396484, "ref_logps/rejected": -53.289634704589844, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8696419596672058, "rewards/margins": 0.799533486366272, "rewards/rejected": -1.669175386428833, "step": 561 }, { "epoch": 0.53, "grad_norm": 17.988561630249023, "learning_rate": 4.573277369709688e-07, "logps/chosen": -39.51237869262695, "logps/rejected": -46.915992736816406, "loss": 0.6035, "losses/dpo": 0.6121284365653992, "losses/sft": 1.4484940767288208, "losses/total": 0.6121284365653992, "ref_logps/chosen": -32.31346130371094, "ref_logps/rejected": -36.85161590576172, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7198917865753174, "rewards/margins": 0.28654569387435913, "rewards/rejected": -1.0064374208450317, "step": 562 }, { "epoch": 0.53, "grad_norm": 18.029293060302734, "learning_rate": 4.571528506470794e-07, "logps/chosen": -47.67314147949219, "logps/rejected": -67.33277130126953, "loss": 0.5051, "losses/dpo": 0.45294126868247986, "losses/sft": 1.9086031913757324, "losses/total": 0.45294126868247986, "ref_logps/chosen": -39.116432189941406, "ref_logps/rejected": -52.71623992919922, "rewards/accuracies": 0.75, "rewards/chosen": -0.855671226978302, "rewards/margins": 0.6059819459915161, "rewards/rejected": -1.461653232574463, "step": 563 }, { "epoch": 0.53, "grad_norm": 14.394112586975098, "learning_rate": 4.569779643231899e-07, "logps/chosen": -42.31031799316406, "logps/rejected": -54.99332046508789, "loss": 0.4608, "losses/dpo": 0.47964540123939514, "losses/sft": 1.343507170677185, "losses/total": 0.47964540123939514, "ref_logps/chosen": -32.94147491455078, "ref_logps/rejected": -37.29802703857422, "rewards/accuracies": 0.625, "rewards/chosen": -0.9368841648101807, "rewards/margins": 0.8326447606086731, "rewards/rejected": -1.7695289850234985, "step": 564 }, { "epoch": 0.53, "grad_norm": 17.398134231567383, "learning_rate": 4.5680307799930047e-07, "logps/chosen": -41.23756408691406, "logps/rejected": -67.37899780273438, "loss": 0.4651, "losses/dpo": 0.369045227766037, "losses/sft": 1.6824448108673096, "losses/total": 0.369045227766037, "ref_logps/chosen": -33.456703186035156, "ref_logps/rejected": -51.3763427734375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7780860662460327, "rewards/margins": 0.8221798539161682, "rewards/rejected": -1.6002659797668457, "step": 565 }, { "epoch": 0.53, "grad_norm": 28.186016082763672, "learning_rate": 4.566281916754109e-07, "logps/chosen": -47.245662689208984, "logps/rejected": -49.52615737915039, "loss": 0.781, "losses/dpo": 0.6183753609657288, "losses/sft": 1.8384548425674438, "losses/total": 0.6183753609657288, "ref_logps/chosen": -34.558082580566406, "ref_logps/rejected": -36.763221740722656, "rewards/accuracies": 0.4375, "rewards/chosen": -1.2687578201293945, "rewards/margins": 0.007535912096500397, "rewards/rejected": -1.2762937545776367, "step": 566 }, { "epoch": 0.54, "grad_norm": 25.419403076171875, "learning_rate": 4.564533053515215e-07, "logps/chosen": -64.38424682617188, "logps/rejected": -77.08365631103516, "loss": 0.791, "losses/dpo": 0.6494208574295044, "losses/sft": 2.211927652359009, "losses/total": 0.6494208574295044, "ref_logps/chosen": -49.56736755371094, "ref_logps/rejected": -62.12215042114258, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4816880226135254, "rewards/margins": 0.014462143182754517, "rewards/rejected": -1.496150255203247, "step": 567 }, { "epoch": 0.54, "grad_norm": 23.816476821899414, "learning_rate": 4.5627841902763206e-07, "logps/chosen": -51.96767807006836, "logps/rejected": -59.69984817504883, "loss": 0.6734, "losses/dpo": 0.4443957209587097, "losses/sft": 1.374029278755188, "losses/total": 0.4443957209587097, "ref_logps/chosen": -42.19104766845703, "ref_logps/rejected": -48.227783203125, "rewards/accuracies": 0.5, "rewards/chosen": -0.9776627421379089, "rewards/margins": 0.1695437729358673, "rewards/rejected": -1.1472065448760986, "step": 568 }, { "epoch": 0.54, "grad_norm": 24.870258331298828, "learning_rate": 4.561035327037425e-07, "logps/chosen": -45.30424499511719, "logps/rejected": -46.708221435546875, "loss": 0.7158, "losses/dpo": 1.1105490922927856, "losses/sft": 1.3865281343460083, "losses/total": 1.1105490922927856, "ref_logps/chosen": -37.21491622924805, "ref_logps/rejected": -37.429046630859375, "rewards/accuracies": 0.625, "rewards/chosen": -0.8089326620101929, "rewards/margins": 0.11898540705442429, "rewards/rejected": -0.9279180765151978, "step": 569 }, { "epoch": 0.54, "grad_norm": 14.619791030883789, "learning_rate": 4.559286463798531e-07, "logps/chosen": -43.014503479003906, "logps/rejected": -67.5992431640625, "loss": 0.3671, "losses/dpo": 0.294307142496109, "losses/sft": 2.140934944152832, "losses/total": 0.294307142496109, "ref_logps/chosen": -35.34181213378906, "ref_logps/rejected": -49.05023956298828, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7672694325447083, "rewards/margins": 1.0876307487487793, "rewards/rejected": -1.8549003601074219, "step": 570 }, { "epoch": 0.54, "grad_norm": 25.318273544311523, "learning_rate": 4.557537600559636e-07, "logps/chosen": -51.31770324707031, "logps/rejected": -45.34050369262695, "loss": 0.7361, "losses/dpo": 0.935380220413208, "losses/sft": 1.6511880159378052, "losses/total": 0.935380220413208, "ref_logps/chosen": -40.02067565917969, "ref_logps/rejected": -33.30419158935547, "rewards/accuracies": 0.4375, "rewards/chosen": -1.1297024488449097, "rewards/margins": 0.07392872869968414, "rewards/rejected": -1.2036311626434326, "step": 571 }, { "epoch": 0.54, "grad_norm": 22.432546615600586, "learning_rate": 4.5557887373207416e-07, "logps/chosen": -47.067108154296875, "logps/rejected": -48.23573303222656, "loss": 0.6426, "losses/dpo": 0.3802192807197571, "losses/sft": 1.8040047883987427, "losses/total": 0.3802192807197571, "ref_logps/chosen": -36.93397521972656, "ref_logps/rejected": -35.979698181152344, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0133131742477417, "rewards/margins": 0.21229061484336853, "rewards/rejected": -1.2256038188934326, "step": 572 }, { "epoch": 0.54, "grad_norm": 23.75169563293457, "learning_rate": 4.554039874081846e-07, "logps/chosen": -47.19709777832031, "logps/rejected": -42.59159851074219, "loss": 0.7487, "losses/dpo": 0.5805943012237549, "losses/sft": 1.385767936706543, "losses/total": 0.5805943012237549, "ref_logps/chosen": -37.008399963378906, "ref_logps/rejected": -32.48085403442383, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0188690423965454, "rewards/margins": -0.007794246077537537, "rewards/rejected": -1.0110747814178467, "step": 573 }, { "epoch": 0.54, "grad_norm": 21.6710205078125, "learning_rate": 4.552291010842952e-07, "logps/chosen": -50.27872085571289, "logps/rejected": -53.5006217956543, "loss": 0.5499, "losses/dpo": 0.5672124624252319, "losses/sft": 1.839987874031067, "losses/total": 0.5672124624252319, "ref_logps/chosen": -39.56987762451172, "ref_logps/rejected": -37.309051513671875, "rewards/accuracies": 0.75, "rewards/chosen": -1.0708839893341064, "rewards/margins": 0.5482732057571411, "rewards/rejected": -1.619157314300537, "step": 574 }, { "epoch": 0.54, "grad_norm": 20.268836975097656, "learning_rate": 4.5505421476040575e-07, "logps/chosen": -51.502044677734375, "logps/rejected": -58.27291488647461, "loss": 0.6278, "losses/dpo": 0.6022443771362305, "losses/sft": 2.057661294937134, "losses/total": 0.6022443771362305, "ref_logps/chosen": -41.06641387939453, "ref_logps/rejected": -44.71141815185547, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0435630083084106, "rewards/margins": 0.3125864863395691, "rewards/rejected": -1.356149435043335, "step": 575 }, { "epoch": 0.54, "grad_norm": 20.673593521118164, "learning_rate": 4.548793284365162e-07, "logps/chosen": -39.24158477783203, "logps/rejected": -51.94776916503906, "loss": 0.5591, "losses/dpo": 0.9681692123413086, "losses/sft": 1.5640063285827637, "losses/total": 0.9681692123413086, "ref_logps/chosen": -32.29969024658203, "ref_logps/rejected": -39.803688049316406, "rewards/accuracies": 0.75, "rewards/chosen": -0.6941896080970764, "rewards/margins": 0.5202183723449707, "rewards/rejected": -1.2144079208374023, "step": 576 }, { "epoch": 0.54, "grad_norm": 25.663684844970703, "learning_rate": 4.547044421126268e-07, "logps/chosen": -53.03947448730469, "logps/rejected": -58.420494079589844, "loss": 0.6881, "losses/dpo": 0.876889169216156, "losses/sft": 2.2121920585632324, "losses/total": 0.876889169216156, "ref_logps/chosen": -41.90216827392578, "ref_logps/rejected": -44.720306396484375, "rewards/accuracies": 0.5, "rewards/chosen": -1.1137303113937378, "rewards/margins": 0.2562887668609619, "rewards/rejected": -1.3700189590454102, "step": 577 }, { "epoch": 0.55, "grad_norm": 21.71703338623047, "learning_rate": 4.545295557887373e-07, "logps/chosen": -62.46804428100586, "logps/rejected": -72.32112121582031, "loss": 0.5451, "losses/dpo": 0.5514512062072754, "losses/sft": 1.7490044832229614, "losses/total": 0.5514512062072754, "ref_logps/chosen": -48.041316986083984, "ref_logps/rejected": -53.017478942871094, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4426724910736084, "rewards/margins": 0.48769164085388184, "rewards/rejected": -1.9303641319274902, "step": 578 }, { "epoch": 0.55, "grad_norm": 21.563751220703125, "learning_rate": 4.5435466946484785e-07, "logps/chosen": -47.78504943847656, "logps/rejected": -54.838382720947266, "loss": 0.572, "losses/dpo": 0.5821988582611084, "losses/sft": 1.9250428676605225, "losses/total": 0.5821988582611084, "ref_logps/chosen": -37.25121307373047, "ref_logps/rejected": -39.90326690673828, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0533840656280518, "rewards/margins": 0.44012749195098877, "rewards/rejected": -1.49351167678833, "step": 579 }, { "epoch": 0.55, "grad_norm": 14.213398933410645, "learning_rate": 4.541797831409583e-07, "logps/chosen": -38.6250114440918, "logps/rejected": -59.392425537109375, "loss": 0.3845, "losses/dpo": 0.41140130162239075, "losses/sft": 1.2234476804733276, "losses/total": 0.41140130162239075, "ref_logps/chosen": -33.129390716552734, "ref_logps/rejected": -43.63618469238281, "rewards/accuracies": 0.9375, "rewards/chosen": -0.549561619758606, "rewards/margins": 1.0260627269744873, "rewards/rejected": -1.5756242275238037, "step": 580 }, { "epoch": 0.55, "grad_norm": 17.11842155456543, "learning_rate": 4.540048968170689e-07, "logps/chosen": -50.15816116333008, "logps/rejected": -63.31660461425781, "loss": 0.4982, "losses/dpo": 0.40851694345474243, "losses/sft": 1.7249274253845215, "losses/total": 0.40851694345474243, "ref_logps/chosen": -38.71712875366211, "ref_logps/rejected": -45.193084716796875, "rewards/accuracies": 0.75, "rewards/chosen": -1.1441032886505127, "rewards/margins": 0.6682482361793518, "rewards/rejected": -1.8123514652252197, "step": 581 }, { "epoch": 0.55, "grad_norm": 20.58220672607422, "learning_rate": 4.5383001049317945e-07, "logps/chosen": -46.187110900878906, "logps/rejected": -48.68061828613281, "loss": 0.6063, "losses/dpo": 0.7693386077880859, "losses/sft": 1.2192364931106567, "losses/total": 0.7693386077880859, "ref_logps/chosen": -36.772674560546875, "ref_logps/rejected": -35.26833724975586, "rewards/accuracies": 0.5, "rewards/chosen": -0.941443920135498, "rewards/margins": 0.3997841477394104, "rewards/rejected": -1.3412280082702637, "step": 582 }, { "epoch": 0.55, "grad_norm": 19.872751235961914, "learning_rate": 4.536551241692899e-07, "logps/chosen": -54.05327224731445, "logps/rejected": -72.29106140136719, "loss": 0.577, "losses/dpo": 0.3492737114429474, "losses/sft": 1.7046188116073608, "losses/total": 0.3492737114429474, "ref_logps/chosen": -43.72734451293945, "ref_logps/rejected": -57.298274993896484, "rewards/accuracies": 0.75, "rewards/chosen": -1.0325927734375, "rewards/margins": 0.46668627858161926, "rewards/rejected": -1.4992791414260864, "step": 583 }, { "epoch": 0.55, "grad_norm": 22.728727340698242, "learning_rate": 4.5348023784540047e-07, "logps/chosen": -54.06036376953125, "logps/rejected": -69.52227783203125, "loss": 0.6209, "losses/dpo": 0.5739734172821045, "losses/sft": 1.489017128944397, "losses/total": 0.5739734172821045, "ref_logps/chosen": -41.19548797607422, "ref_logps/rejected": -52.632259368896484, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2864876985549927, "rewards/margins": 0.4025137722492218, "rewards/rejected": -1.6890015602111816, "step": 584 }, { "epoch": 0.55, "grad_norm": 23.140422821044922, "learning_rate": 4.53305351521511e-07, "logps/chosen": -57.32766342163086, "logps/rejected": -58.91395568847656, "loss": 0.6863, "losses/dpo": 0.44302016496658325, "losses/sft": 1.8704756498336792, "losses/total": 0.44302016496658325, "ref_logps/chosen": -43.63447570800781, "ref_logps/rejected": -43.35402297973633, "rewards/accuracies": 0.5, "rewards/chosen": -1.3693188428878784, "rewards/margins": 0.18667477369308472, "rewards/rejected": -1.555993676185608, "step": 585 }, { "epoch": 0.55, "grad_norm": 21.6605224609375, "learning_rate": 4.5313046519762155e-07, "logps/chosen": -49.245025634765625, "logps/rejected": -52.45628356933594, "loss": 0.6422, "losses/dpo": 0.5428541898727417, "losses/sft": 1.4998265504837036, "losses/total": 0.5428541898727417, "ref_logps/chosen": -37.843074798583984, "ref_logps/rejected": -38.00508117675781, "rewards/accuracies": 0.625, "rewards/chosen": -1.140195369720459, "rewards/margins": 0.3049251437187195, "rewards/rejected": -1.4451203346252441, "step": 586 }, { "epoch": 0.55, "grad_norm": 25.053163528442383, "learning_rate": 4.5295557887373206e-07, "logps/chosen": -61.2729377746582, "logps/rejected": -55.074424743652344, "loss": 0.7245, "losses/dpo": 0.558214008808136, "losses/sft": 1.6333088874816895, "losses/total": 0.558214008808136, "ref_logps/chosen": -47.624298095703125, "ref_logps/rejected": -39.871944427490234, "rewards/accuracies": 0.625, "rewards/chosen": -1.364863634109497, "rewards/margins": 0.155384361743927, "rewards/rejected": -1.5202481746673584, "step": 587 }, { "epoch": 0.56, "grad_norm": 19.314786911010742, "learning_rate": 4.527806925498426e-07, "logps/chosen": -59.835540771484375, "logps/rejected": -83.142822265625, "loss": 0.4663, "losses/dpo": 0.4594837427139282, "losses/sft": 2.0525152683258057, "losses/total": 0.4594837427139282, "ref_logps/chosen": -46.653953552246094, "ref_logps/rejected": -61.53036880493164, "rewards/accuracies": 0.875, "rewards/chosen": -1.3181588649749756, "rewards/margins": 0.8430860042572021, "rewards/rejected": -2.1612448692321777, "step": 588 }, { "epoch": 0.56, "grad_norm": 32.04518508911133, "learning_rate": 4.5260580622595314e-07, "logps/chosen": -59.97466278076172, "logps/rejected": -55.113014221191406, "loss": 0.9089, "losses/dpo": 1.2490079402923584, "losses/sft": 2.043095350265503, "losses/total": 1.2490079402923584, "ref_logps/chosen": -43.02207946777344, "ref_logps/rejected": -38.07715606689453, "rewards/accuracies": 0.5625, "rewards/chosen": -1.695258617401123, "rewards/margins": 0.008327044546604156, "rewards/rejected": -1.7035856246948242, "step": 589 }, { "epoch": 0.56, "grad_norm": 19.12995719909668, "learning_rate": 4.524309199020636e-07, "logps/chosen": -43.498085021972656, "logps/rejected": -48.67451858520508, "loss": 0.5929, "losses/dpo": 0.7954567670822144, "losses/sft": 1.5061460733413696, "losses/total": 0.7954567670822144, "ref_logps/chosen": -35.334983825683594, "ref_logps/rejected": -36.11871337890625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8163097500801086, "rewards/margins": 0.43927085399627686, "rewards/rejected": -1.2555806636810303, "step": 590 }, { "epoch": 0.56, "grad_norm": 22.31923484802246, "learning_rate": 4.5225603357817417e-07, "logps/chosen": -49.24600601196289, "logps/rejected": -58.38401412963867, "loss": 0.598, "losses/dpo": 0.5679134130477905, "losses/sft": 2.1319870948791504, "losses/total": 0.5679134130477905, "ref_logps/chosen": -35.836273193359375, "ref_logps/rejected": -41.80289840698242, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3409734964370728, "rewards/margins": 0.3171381950378418, "rewards/rejected": -1.6581116914749146, "step": 591 }, { "epoch": 0.56, "grad_norm": 24.918611526489258, "learning_rate": 4.520811472542847e-07, "logps/chosen": -49.44950866699219, "logps/rejected": -57.95235824584961, "loss": 0.6847, "losses/dpo": 0.6354164481163025, "losses/sft": 1.8745847940444946, "losses/total": 0.6354164481163025, "ref_logps/chosen": -38.98527908325195, "ref_logps/rejected": -46.05634307861328, "rewards/accuracies": 0.5, "rewards/chosen": -1.0464226007461548, "rewards/margins": 0.14317886531352997, "rewards/rejected": -1.1896014213562012, "step": 592 }, { "epoch": 0.56, "grad_norm": 23.120941162109375, "learning_rate": 4.5190626093039524e-07, "logps/chosen": -44.30820846557617, "logps/rejected": -51.69993591308594, "loss": 0.6163, "losses/dpo": 0.6547796130180359, "losses/sft": 2.007625102996826, "losses/total": 0.6547796130180359, "ref_logps/chosen": -33.460899353027344, "ref_logps/rejected": -37.08675003051758, "rewards/accuracies": 0.625, "rewards/chosen": -1.084730863571167, "rewards/margins": 0.37658780813217163, "rewards/rejected": -1.4613187313079834, "step": 593 }, { "epoch": 0.56, "grad_norm": 27.32744026184082, "learning_rate": 4.5173137460650576e-07, "logps/chosen": -53.49062728881836, "logps/rejected": -66.09136962890625, "loss": 0.769, "losses/dpo": 0.8161667585372925, "losses/sft": 1.976332426071167, "losses/total": 0.8161667585372925, "ref_logps/chosen": -38.28842544555664, "ref_logps/rejected": -49.76551818847656, "rewards/accuracies": 0.5625, "rewards/chosen": -1.520220160484314, "rewards/margins": 0.11236533522605896, "rewards/rejected": -1.6325855255126953, "step": 594 }, { "epoch": 0.56, "grad_norm": 22.235774993896484, "learning_rate": 4.5155648828261627e-07, "logps/chosen": -48.553611755371094, "logps/rejected": -53.690345764160156, "loss": 0.6468, "losses/dpo": 0.7293063998222351, "losses/sft": 1.7181318998336792, "losses/total": 0.7293063998222351, "ref_logps/chosen": -36.56941223144531, "ref_logps/rejected": -37.150569915771484, "rewards/accuracies": 0.625, "rewards/chosen": -1.1984202861785889, "rewards/margins": 0.45555734634399414, "rewards/rejected": -1.653977632522583, "step": 595 }, { "epoch": 0.56, "grad_norm": 16.202119827270508, "learning_rate": 4.5138160195872683e-07, "logps/chosen": -34.22391128540039, "logps/rejected": -50.515464782714844, "loss": 0.5549, "losses/dpo": 0.553246259689331, "losses/sft": 1.7854032516479492, "losses/total": 0.553246259689331, "ref_logps/chosen": -27.792156219482422, "ref_logps/rejected": -38.98239517211914, "rewards/accuracies": 0.75, "rewards/chosen": -0.6431754231452942, "rewards/margins": 0.5101317167282104, "rewards/rejected": -1.1533071994781494, "step": 596 }, { "epoch": 0.56, "grad_norm": 28.784940719604492, "learning_rate": 4.512067156348373e-07, "logps/chosen": -64.26377868652344, "logps/rejected": -59.91178894042969, "loss": 0.8101, "losses/dpo": 0.5749263763427734, "losses/sft": 1.6070976257324219, "losses/total": 0.5749263763427734, "ref_logps/chosen": -49.78087615966797, "ref_logps/rejected": -45.48026657104492, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4482907056808472, "rewards/margins": -0.005138680338859558, "rewards/rejected": -1.4431519508361816, "step": 597 }, { "epoch": 0.56, "grad_norm": 25.374406814575195, "learning_rate": 4.5103182931094786e-07, "logps/chosen": -57.253143310546875, "logps/rejected": -60.46489715576172, "loss": 0.7086, "losses/dpo": 0.6373536586761475, "losses/sft": 2.185122013092041, "losses/total": 0.6373536586761475, "ref_logps/chosen": -45.47064208984375, "ref_logps/rejected": -48.51488494873047, "rewards/accuracies": 0.5, "rewards/chosen": -1.1782500743865967, "rewards/margins": 0.01675093173980713, "rewards/rejected": -1.1950010061264038, "step": 598 }, { "epoch": 0.57, "grad_norm": 17.572267532348633, "learning_rate": 4.5085694298705837e-07, "logps/chosen": -45.471588134765625, "logps/rejected": -52.342628479003906, "loss": 0.5378, "losses/dpo": 0.644324004650116, "losses/sft": 1.1434687376022339, "losses/total": 0.644324004650116, "ref_logps/chosen": -34.827049255371094, "ref_logps/rejected": -37.0857048034668, "rewards/accuracies": 0.75, "rewards/chosen": -1.0644538402557373, "rewards/margins": 0.46123889088630676, "rewards/rejected": -1.5256928205490112, "step": 599 }, { "epoch": 0.57, "grad_norm": 17.340742111206055, "learning_rate": 4.5068205666316894e-07, "logps/chosen": -38.48366928100586, "logps/rejected": -67.47239685058594, "loss": 0.4542, "losses/dpo": 0.39069920778274536, "losses/sft": 1.951080560684204, "losses/total": 0.39069920778274536, "ref_logps/chosen": -31.885093688964844, "ref_logps/rejected": -54.0456657409668, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6598575115203857, "rewards/margins": 0.6828150749206543, "rewards/rejected": -1.34267258644104, "step": 600 }, { "epoch": 0.57, "grad_norm": 16.781187057495117, "learning_rate": 4.5050717033927945e-07, "logps/chosen": -36.43646240234375, "logps/rejected": -46.403053283691406, "loss": 0.5473, "losses/dpo": 0.43342477083206177, "losses/sft": 1.403662919998169, "losses/total": 0.43342477083206177, "ref_logps/chosen": -28.485918045043945, "ref_logps/rejected": -32.811485290527344, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7950546741485596, "rewards/margins": 0.5641021728515625, "rewards/rejected": -1.359156847000122, "step": 601 }, { "epoch": 0.57, "grad_norm": 20.688121795654297, "learning_rate": 4.5033228401538996e-07, "logps/chosen": -47.729129791259766, "logps/rejected": -53.09663391113281, "loss": 0.6226, "losses/dpo": 0.6209818124771118, "losses/sft": 1.6103700399398804, "losses/total": 0.6209818124771118, "ref_logps/chosen": -36.89863967895508, "ref_logps/rejected": -39.051124572753906, "rewards/accuracies": 0.625, "rewards/chosen": -1.083048939704895, "rewards/margins": 0.3215019106864929, "rewards/rejected": -1.4045507907867432, "step": 602 }, { "epoch": 0.57, "grad_norm": 17.99138832092285, "learning_rate": 4.5015739769150053e-07, "logps/chosen": -46.57192611694336, "logps/rejected": -67.58000183105469, "loss": 0.4871, "losses/dpo": 0.41051098704338074, "losses/sft": 1.6824710369110107, "losses/total": 0.41051098704338074, "ref_logps/chosen": -36.0202522277832, "ref_logps/rejected": -50.3137092590332, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0551674365997314, "rewards/margins": 0.671461820602417, "rewards/rejected": -1.7266292572021484, "step": 603 }, { "epoch": 0.57, "grad_norm": 17.019672393798828, "learning_rate": 4.49982511367611e-07, "logps/chosen": -36.29152297973633, "logps/rejected": -54.796356201171875, "loss": 0.5166, "losses/dpo": 0.5871896147727966, "losses/sft": 1.7426890134811401, "losses/total": 0.5871896147727966, "ref_logps/chosen": -28.109407424926758, "ref_logps/rejected": -41.816139221191406, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8182113766670227, "rewards/margins": 0.47981035709381104, "rewards/rejected": -1.2980217933654785, "step": 604 }, { "epoch": 0.57, "grad_norm": 15.666430473327637, "learning_rate": 4.4980762504372155e-07, "logps/chosen": -46.628196716308594, "logps/rejected": -48.52845001220703, "loss": 0.4873, "losses/dpo": 0.45817893743515015, "losses/sft": 1.7076882123947144, "losses/total": 0.45817893743515015, "ref_logps/chosen": -40.01538848876953, "ref_logps/rejected": -36.313812255859375, "rewards/accuracies": 0.875, "rewards/chosen": -0.6612809300422668, "rewards/margins": 0.5601828098297119, "rewards/rejected": -1.221463680267334, "step": 605 }, { "epoch": 0.57, "grad_norm": 17.331449508666992, "learning_rate": 4.496327387198321e-07, "logps/chosen": -48.081321716308594, "logps/rejected": -68.89327239990234, "loss": 0.4777, "losses/dpo": 0.2920644283294678, "losses/sft": 1.9619967937469482, "losses/total": 0.2920644283294678, "ref_logps/chosen": -39.36968231201172, "ref_logps/rejected": -52.335227966308594, "rewards/accuracies": 0.875, "rewards/chosen": -0.8711642026901245, "rewards/margins": 0.7846402525901794, "rewards/rejected": -1.6558043956756592, "step": 606 }, { "epoch": 0.57, "grad_norm": 21.904211044311523, "learning_rate": 4.4945785239594263e-07, "logps/chosen": -46.43212127685547, "logps/rejected": -62.09995651245117, "loss": 0.5793, "losses/dpo": 0.43012261390686035, "losses/sft": 1.255206823348999, "losses/total": 0.43012261390686035, "ref_logps/chosen": -36.983028411865234, "ref_logps/rejected": -46.1136474609375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9449092149734497, "rewards/margins": 0.6537215709686279, "rewards/rejected": -1.5986307859420776, "step": 607 }, { "epoch": 0.57, "grad_norm": 25.75718879699707, "learning_rate": 4.4928296607205315e-07, "logps/chosen": -49.889095306396484, "logps/rejected": -50.12107849121094, "loss": 0.7068, "losses/dpo": 0.6289348602294922, "losses/sft": 1.6405929327011108, "losses/total": 0.6289348602294922, "ref_logps/chosen": -39.404510498046875, "ref_logps/rejected": -38.11316680908203, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0484580993652344, "rewards/margins": 0.1523330807685852, "rewards/rejected": -1.2007912397384644, "step": 608 }, { "epoch": 0.58, "grad_norm": 16.910297393798828, "learning_rate": 4.4910807974816366e-07, "logps/chosen": -54.687355041503906, "logps/rejected": -74.47000122070312, "loss": 0.4881, "losses/dpo": 0.5970364809036255, "losses/sft": 1.9302432537078857, "losses/total": 0.5970364809036255, "ref_logps/chosen": -45.57235336303711, "ref_logps/rejected": -55.18684387207031, "rewards/accuracies": 0.625, "rewards/chosen": -0.9115001559257507, "rewards/margins": 1.0168161392211914, "rewards/rejected": -1.9283162355422974, "step": 609 }, { "epoch": 0.58, "grad_norm": 19.139265060424805, "learning_rate": 4.489331934242742e-07, "logps/chosen": -45.38105773925781, "logps/rejected": -52.451560974121094, "loss": 0.5949, "losses/dpo": 0.519945502281189, "losses/sft": 1.6535450220108032, "losses/total": 0.519945502281189, "ref_logps/chosen": -37.386199951171875, "ref_logps/rejected": -39.85076141357422, "rewards/accuracies": 0.5, "rewards/chosen": -0.7994853854179382, "rewards/margins": 0.4605950117111206, "rewards/rejected": -1.260080337524414, "step": 610 }, { "epoch": 0.58, "grad_norm": 21.23914909362793, "learning_rate": 4.487583071003847e-07, "logps/chosen": -41.983001708984375, "logps/rejected": -48.5654296875, "loss": 0.6184, "losses/dpo": 0.4955393373966217, "losses/sft": 1.6477437019348145, "losses/total": 0.4955393373966217, "ref_logps/chosen": -33.260704040527344, "ref_logps/rejected": -34.713722229003906, "rewards/accuracies": 0.75, "rewards/chosen": -0.8722302913665771, "rewards/margins": 0.5129398703575134, "rewards/rejected": -1.3851702213287354, "step": 611 }, { "epoch": 0.58, "grad_norm": 15.444493293762207, "learning_rate": 4.4858342077649525e-07, "logps/chosen": -38.10261535644531, "logps/rejected": -67.46133422851562, "loss": 0.4426, "losses/dpo": 0.28211790323257446, "losses/sft": 1.3990199565887451, "losses/total": 0.28211790323257446, "ref_logps/chosen": -29.05875015258789, "ref_logps/rejected": -49.289119720458984, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9043869972229004, "rewards/margins": 0.9128339290618896, "rewards/rejected": -1.8172208070755005, "step": 612 }, { "epoch": 0.58, "grad_norm": 22.545305252075195, "learning_rate": 4.484085344526058e-07, "logps/chosen": -53.714263916015625, "logps/rejected": -64.93685913085938, "loss": 0.5222, "losses/dpo": 0.7485692501068115, "losses/sft": 1.8144086599349976, "losses/total": 0.7485692501068115, "ref_logps/chosen": -43.9848747253418, "ref_logps/rejected": -50.34032440185547, "rewards/accuracies": 0.75, "rewards/chosen": -0.9729388952255249, "rewards/margins": 0.4867154359817505, "rewards/rejected": -1.4596543312072754, "step": 613 }, { "epoch": 0.58, "grad_norm": 28.44540786743164, "learning_rate": 4.4823364812871633e-07, "logps/chosen": -67.01869201660156, "logps/rejected": -65.50695037841797, "loss": 0.8167, "losses/dpo": 0.9889156818389893, "losses/sft": 2.019357681274414, "losses/total": 0.9889156818389893, "ref_logps/chosen": -51.936622619628906, "ref_logps/rejected": -51.06785583496094, "rewards/accuracies": 0.625, "rewards/chosen": -1.5082073211669922, "rewards/margins": -0.0642981082201004, "rewards/rejected": -1.4439091682434082, "step": 614 }, { "epoch": 0.58, "grad_norm": 18.57333755493164, "learning_rate": 4.4805876180482684e-07, "logps/chosen": -39.33977508544922, "logps/rejected": -52.94608688354492, "loss": 0.6512, "losses/dpo": 0.3669934868812561, "losses/sft": 1.4202157258987427, "losses/total": 0.3669934868812561, "ref_logps/chosen": -32.420963287353516, "ref_logps/rejected": -42.54301834106445, "rewards/accuracies": 0.75, "rewards/chosen": -0.6918811798095703, "rewards/margins": 0.3484255373477936, "rewards/rejected": -1.0403066873550415, "step": 615 }, { "epoch": 0.58, "grad_norm": 20.795682907104492, "learning_rate": 4.4788387548093735e-07, "logps/chosen": -43.713478088378906, "logps/rejected": -47.93873977661133, "loss": 0.6897, "losses/dpo": 0.574022650718689, "losses/sft": 1.5561370849609375, "losses/total": 0.574022650718689, "ref_logps/chosen": -34.589534759521484, "ref_logps/rejected": -35.650272369384766, "rewards/accuracies": 0.625, "rewards/chosen": -0.9123944044113159, "rewards/margins": 0.3164524435997009, "rewards/rejected": -1.228846788406372, "step": 616 }, { "epoch": 0.58, "grad_norm": 18.406551361083984, "learning_rate": 4.477089891570479e-07, "logps/chosen": -50.79158020019531, "logps/rejected": -59.605995178222656, "loss": 0.5404, "losses/dpo": 0.49215713143348694, "losses/sft": 1.7804259061813354, "losses/total": 0.49215713143348694, "ref_logps/chosen": -44.591209411621094, "ref_logps/rejected": -47.803138732910156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6200370788574219, "rewards/margins": 0.5602489113807678, "rewards/rejected": -1.180285930633545, "step": 617 }, { "epoch": 0.58, "grad_norm": 19.99086570739746, "learning_rate": 4.475341028331584e-07, "logps/chosen": -41.745399475097656, "logps/rejected": -66.42382049560547, "loss": 0.4712, "losses/dpo": 0.2765766382217407, "losses/sft": 1.3556243181228638, "losses/total": 0.2765766382217407, "ref_logps/chosen": -33.391448974609375, "ref_logps/rejected": -51.502716064453125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8353952169418335, "rewards/margins": 0.6567153930664062, "rewards/rejected": -1.4921106100082397, "step": 618 }, { "epoch": 0.58, "grad_norm": 20.219806671142578, "learning_rate": 4.4735921650926894e-07, "logps/chosen": -46.19069290161133, "logps/rejected": -57.514678955078125, "loss": 0.5436, "losses/dpo": 0.7588982582092285, "losses/sft": 1.919982671737671, "losses/total": 0.7588982582092285, "ref_logps/chosen": -38.15788650512695, "ref_logps/rejected": -43.16008377075195, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8032807111740112, "rewards/margins": 0.6321789026260376, "rewards/rejected": -1.4354596138000488, "step": 619 }, { "epoch": 0.59, "grad_norm": 17.956785202026367, "learning_rate": 4.471843301853795e-07, "logps/chosen": -43.92296600341797, "logps/rejected": -52.783172607421875, "loss": 0.5365, "losses/dpo": 0.7335257530212402, "losses/sft": 1.7678004503250122, "losses/total": 0.7335257530212402, "ref_logps/chosen": -35.4367561340332, "ref_logps/rejected": -39.01974105834961, "rewards/accuracies": 0.75, "rewards/chosen": -0.8486208915710449, "rewards/margins": 0.5277221202850342, "rewards/rejected": -1.376343011856079, "step": 620 }, { "epoch": 0.59, "grad_norm": 23.003658294677734, "learning_rate": 4.4700944386149e-07, "logps/chosen": -57.13471984863281, "logps/rejected": -54.509063720703125, "loss": 0.6757, "losses/dpo": 0.6394138932228088, "losses/sft": 1.6702107191085815, "losses/total": 0.6394138932228088, "ref_logps/chosen": -45.88554382324219, "ref_logps/rejected": -41.745384216308594, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1249175071716309, "rewards/margins": 0.15145081281661987, "rewards/rejected": -1.2763683795928955, "step": 621 }, { "epoch": 0.59, "grad_norm": 18.1394100189209, "learning_rate": 4.4683455753760053e-07, "logps/chosen": -41.69705581665039, "logps/rejected": -66.72216796875, "loss": 0.5115, "losses/dpo": 0.47494760155677795, "losses/sft": 2.0901641845703125, "losses/total": 0.47494760155677795, "ref_logps/chosen": -34.35966873168945, "ref_logps/rejected": -53.75463104248047, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7337387204170227, "rewards/margins": 0.5630146265029907, "rewards/rejected": -1.2967534065246582, "step": 622 }, { "epoch": 0.59, "grad_norm": 22.04412841796875, "learning_rate": 4.4665967121371105e-07, "logps/chosen": -36.097251892089844, "logps/rejected": -61.0118408203125, "loss": 0.5032, "losses/dpo": 0.40838390588760376, "losses/sft": 1.519967794418335, "losses/total": 0.40838390588760376, "ref_logps/chosen": -31.266021728515625, "ref_logps/rejected": -49.03853988647461, "rewards/accuracies": 0.6875, "rewards/chosen": -0.48312294483184814, "rewards/margins": 0.7142065763473511, "rewards/rejected": -1.1973296403884888, "step": 623 }, { "epoch": 0.59, "grad_norm": 19.78840446472168, "learning_rate": 4.464847848898216e-07, "logps/chosen": -56.3748893737793, "logps/rejected": -62.040428161621094, "loss": 0.5451, "losses/dpo": 0.5086503028869629, "losses/sft": 2.1892902851104736, "losses/total": 0.5086503028869629, "ref_logps/chosen": -47.25972366333008, "ref_logps/rejected": -48.04988098144531, "rewards/accuracies": 0.75, "rewards/chosen": -0.9115162491798401, "rewards/margins": 0.4875383675098419, "rewards/rejected": -1.3990545272827148, "step": 624 }, { "epoch": 0.59, "grad_norm": 23.76604652404785, "learning_rate": 4.4630989856593207e-07, "logps/chosen": -46.96368408203125, "logps/rejected": -56.02411651611328, "loss": 0.686, "losses/dpo": 0.8924371600151062, "losses/sft": 1.450379490852356, "losses/total": 0.8924371600151062, "ref_logps/chosen": -35.780189514160156, "ref_logps/rejected": -41.658897399902344, "rewards/accuracies": 0.5, "rewards/chosen": -1.1183497905731201, "rewards/margins": 0.3181724548339844, "rewards/rejected": -1.4365222454071045, "step": 625 }, { "epoch": 0.59, "grad_norm": 18.979887008666992, "learning_rate": 4.4613501224204264e-07, "logps/chosen": -48.69410705566406, "logps/rejected": -62.87856674194336, "loss": 0.5634, "losses/dpo": 0.8508898615837097, "losses/sft": 1.8799076080322266, "losses/total": 0.8508898615837097, "ref_logps/chosen": -40.932003021240234, "ref_logps/rejected": -49.844390869140625, "rewards/accuracies": 0.75, "rewards/chosen": -0.7762105464935303, "rewards/margins": 0.5272067189216614, "rewards/rejected": -1.3034173250198364, "step": 626 }, { "epoch": 0.59, "grad_norm": 19.02654457092285, "learning_rate": 4.459601259181532e-07, "logps/chosen": -52.498817443847656, "logps/rejected": -51.22602462768555, "loss": 0.556, "losses/dpo": 0.5481646656990051, "losses/sft": 1.5784449577331543, "losses/total": 0.5481646656990051, "ref_logps/chosen": -44.086944580078125, "ref_logps/rejected": -37.77796936035156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8411874771118164, "rewards/margins": 0.5036177039146423, "rewards/rejected": -1.3448052406311035, "step": 627 }, { "epoch": 0.59, "grad_norm": 20.742963790893555, "learning_rate": 4.457852395942637e-07, "logps/chosen": -50.570465087890625, "logps/rejected": -60.769622802734375, "loss": 0.5196, "losses/dpo": 0.5619484186172485, "losses/sft": 2.273573875427246, "losses/total": 0.5619484186172485, "ref_logps/chosen": -41.500213623046875, "ref_logps/rejected": -46.17858123779297, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9070254564285278, "rewards/margins": 0.552078902721405, "rewards/rejected": -1.459104299545288, "step": 628 }, { "epoch": 0.59, "grad_norm": 16.53354263305664, "learning_rate": 4.4561035327037423e-07, "logps/chosen": -46.26103973388672, "logps/rejected": -61.017906188964844, "loss": 0.5165, "losses/dpo": 0.3043246865272522, "losses/sft": 2.1364951133728027, "losses/total": 0.3043246865272522, "ref_logps/chosen": -37.735923767089844, "ref_logps/rejected": -47.00932693481445, "rewards/accuracies": 0.75, "rewards/chosen": -0.8525112867355347, "rewards/margins": 0.5483470559120178, "rewards/rejected": -1.4008582830429077, "step": 629 }, { "epoch": 0.59, "grad_norm": 17.88193130493164, "learning_rate": 4.4543546694648474e-07, "logps/chosen": -44.97325134277344, "logps/rejected": -53.60231018066406, "loss": 0.5447, "losses/dpo": 0.6644160747528076, "losses/sft": 1.4925224781036377, "losses/total": 0.6644160747528076, "ref_logps/chosen": -38.09996795654297, "ref_logps/rejected": -41.851593017578125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6873289346694946, "rewards/margins": 0.4877431392669678, "rewards/rejected": -1.1750720739364624, "step": 630 }, { "epoch": 0.6, "grad_norm": 17.928112030029297, "learning_rate": 4.452605806225953e-07, "logps/chosen": -50.4836540222168, "logps/rejected": -52.671653747558594, "loss": 0.5926, "losses/dpo": 0.4583841562271118, "losses/sft": 1.5576122999191284, "losses/total": 0.4583841562271118, "ref_logps/chosen": -39.99382019042969, "ref_logps/rejected": -37.47768020629883, "rewards/accuracies": 0.5, "rewards/chosen": -1.0489832162857056, "rewards/margins": 0.47041383385658264, "rewards/rejected": -1.5193971395492554, "step": 631 }, { "epoch": 0.6, "grad_norm": 17.331432342529297, "learning_rate": 4.450856942987058e-07, "logps/chosen": -41.436256408691406, "logps/rejected": -42.20221710205078, "loss": 0.5962, "losses/dpo": 0.4479900896549225, "losses/sft": 1.6780507564544678, "losses/total": 0.4479900896549225, "ref_logps/chosen": -34.681339263916016, "ref_logps/rejected": -31.729753494262695, "rewards/accuracies": 0.8125, "rewards/chosen": -0.675491452217102, "rewards/margins": 0.37175506353378296, "rewards/rejected": -1.0472465753555298, "step": 632 }, { "epoch": 0.6, "grad_norm": 19.252670288085938, "learning_rate": 4.4491080797481633e-07, "logps/chosen": -41.41585922241211, "logps/rejected": -59.671630859375, "loss": 0.6025, "losses/dpo": 0.4925847053527832, "losses/sft": 1.540802240371704, "losses/total": 0.4925847053527832, "ref_logps/chosen": -34.36231994628906, "ref_logps/rejected": -49.26945495605469, "rewards/accuracies": 0.75, "rewards/chosen": -0.7053536176681519, "rewards/margins": 0.33486396074295044, "rewards/rejected": -1.040217638015747, "step": 633 }, { "epoch": 0.6, "grad_norm": 15.870306968688965, "learning_rate": 4.447359216509269e-07, "logps/chosen": -36.664276123046875, "logps/rejected": -61.64225769042969, "loss": 0.5396, "losses/dpo": 0.4136042296886444, "losses/sft": 1.9071153402328491, "losses/total": 0.4136042296886444, "ref_logps/chosen": -28.928457260131836, "ref_logps/rejected": -47.71489715576172, "rewards/accuracies": 0.625, "rewards/chosen": -0.7735821604728699, "rewards/margins": 0.6191542148590088, "rewards/rejected": -1.3927364349365234, "step": 634 }, { "epoch": 0.6, "grad_norm": 21.3395938873291, "learning_rate": 4.445610353270374e-07, "logps/chosen": -46.159767150878906, "logps/rejected": -44.8878173828125, "loss": 0.6575, "losses/dpo": 0.9645848274230957, "losses/sft": 1.457635521888733, "losses/total": 0.9645848274230957, "ref_logps/chosen": -38.900474548339844, "ref_logps/rejected": -35.76042175292969, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7259290218353271, "rewards/margins": 0.18681050837039948, "rewards/rejected": -0.9127395749092102, "step": 635 }, { "epoch": 0.6, "grad_norm": 18.366098403930664, "learning_rate": 4.443861490031479e-07, "logps/chosen": -45.634368896484375, "logps/rejected": -57.911338806152344, "loss": 0.5407, "losses/dpo": 0.8006632328033447, "losses/sft": 1.6804780960083008, "losses/total": 0.8006632328033447, "ref_logps/chosen": -37.43037414550781, "ref_logps/rejected": -44.61968231201172, "rewards/accuracies": 0.75, "rewards/chosen": -0.8203996419906616, "rewards/margins": 0.5087662935256958, "rewards/rejected": -1.3291659355163574, "step": 636 }, { "epoch": 0.6, "grad_norm": 20.716598510742188, "learning_rate": 4.4421126267925844e-07, "logps/chosen": -41.47325134277344, "logps/rejected": -61.56958770751953, "loss": 0.6046, "losses/dpo": 0.5198478698730469, "losses/sft": 0.8199011087417603, "losses/total": 0.5198478698730469, "ref_logps/chosen": -33.44092559814453, "ref_logps/rejected": -50.522911071777344, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8032324314117432, "rewards/margins": 0.30143505334854126, "rewards/rejected": -1.1046674251556396, "step": 637 }, { "epoch": 0.6, "grad_norm": 20.82286262512207, "learning_rate": 4.44036376355369e-07, "logps/chosen": -39.74163055419922, "logps/rejected": -51.326393127441406, "loss": 0.693, "losses/dpo": 0.9570564031600952, "losses/sft": 1.4647154808044434, "losses/total": 0.9570564031600952, "ref_logps/chosen": -31.393878936767578, "ref_logps/rejected": -40.26043701171875, "rewards/accuracies": 0.5, "rewards/chosen": -0.8347752690315247, "rewards/margins": 0.2718203663825989, "rewards/rejected": -1.106595516204834, "step": 638 }, { "epoch": 0.6, "grad_norm": 19.03815269470215, "learning_rate": 4.438614900314795e-07, "logps/chosen": -43.423133850097656, "logps/rejected": -52.632022857666016, "loss": 0.6168, "losses/dpo": 0.5782968997955322, "losses/sft": 1.9146307706832886, "losses/total": 0.5782968997955322, "ref_logps/chosen": -35.6546516418457, "ref_logps/rejected": -41.304603576660156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7768484950065613, "rewards/margins": 0.35589343309402466, "rewards/rejected": -1.132741928100586, "step": 639 }, { "epoch": 0.6, "grad_norm": 16.551855087280273, "learning_rate": 4.4368660370759e-07, "logps/chosen": -37.846988677978516, "logps/rejected": -68.63668060302734, "loss": 0.4806, "losses/dpo": 0.3581385016441345, "losses/sft": 1.4599511623382568, "losses/total": 0.3581385016441345, "ref_logps/chosen": -29.67315673828125, "ref_logps/rejected": -53.69397735595703, "rewards/accuracies": 0.75, "rewards/chosen": -0.8173829913139343, "rewards/margins": 0.6768878102302551, "rewards/rejected": -1.4942708015441895, "step": 640 }, { "epoch": 0.61, "grad_norm": 21.649520874023438, "learning_rate": 4.435117173837006e-07, "logps/chosen": -51.88352966308594, "logps/rejected": -53.07874298095703, "loss": 0.6776, "losses/dpo": 0.7447686791419983, "losses/sft": 1.8003766536712646, "losses/total": 0.7447686791419983, "ref_logps/chosen": -41.13995361328125, "ref_logps/rejected": -40.63636016845703, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0743573904037476, "rewards/margins": 0.16988088190555573, "rewards/rejected": -1.244238257408142, "step": 641 }, { "epoch": 0.61, "grad_norm": 19.520015716552734, "learning_rate": 4.433368310598111e-07, "logps/chosen": -39.8344841003418, "logps/rejected": -54.55594253540039, "loss": 0.5592, "losses/dpo": 0.3267616927623749, "losses/sft": 1.6068446636199951, "losses/total": 0.3267616927623749, "ref_logps/chosen": -34.02265930175781, "ref_logps/rejected": -43.448482513427734, "rewards/accuracies": 0.625, "rewards/chosen": -0.5811823010444641, "rewards/margins": 0.5295640230178833, "rewards/rejected": -1.1107463836669922, "step": 642 }, { "epoch": 0.61, "grad_norm": 17.208520889282227, "learning_rate": 4.431619447359216e-07, "logps/chosen": -50.041473388671875, "logps/rejected": -56.7177619934082, "loss": 0.5225, "losses/dpo": 0.8120496273040771, "losses/sft": 1.8595517873764038, "losses/total": 0.8120496273040771, "ref_logps/chosen": -42.69335174560547, "ref_logps/rejected": -43.57000732421875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7348119020462036, "rewards/margins": 0.5799638032913208, "rewards/rejected": -1.3147757053375244, "step": 643 }, { "epoch": 0.61, "grad_norm": 20.3192138671875, "learning_rate": 4.4298705841203213e-07, "logps/chosen": -37.84234619140625, "logps/rejected": -53.206207275390625, "loss": 0.5602, "losses/dpo": 0.7065169811248779, "losses/sft": 1.7062314748764038, "losses/total": 0.7065169811248779, "ref_logps/chosen": -29.939010620117188, "ref_logps/rejected": -40.785850524902344, "rewards/accuracies": 0.75, "rewards/chosen": -0.7903335094451904, "rewards/margins": 0.4517020285129547, "rewards/rejected": -1.2420353889465332, "step": 644 }, { "epoch": 0.61, "grad_norm": 18.488365173339844, "learning_rate": 4.428121720881427e-07, "logps/chosen": -42.64060974121094, "logps/rejected": -59.79262161254883, "loss": 0.6055, "losses/dpo": 0.6880443096160889, "losses/sft": 1.4448903799057007, "losses/total": 0.6880443096160889, "ref_logps/chosen": -34.36505126953125, "ref_logps/rejected": -47.38486099243164, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8275556564331055, "rewards/margins": 0.4132207930088043, "rewards/rejected": -1.2407763004302979, "step": 645 }, { "epoch": 0.61, "grad_norm": 23.138898849487305, "learning_rate": 4.426372857642532e-07, "logps/chosen": -49.89390563964844, "logps/rejected": -53.07732391357422, "loss": 0.7357, "losses/dpo": 0.6794490218162537, "losses/sft": 1.3935376405715942, "losses/total": 0.6794490218162537, "ref_logps/chosen": -39.706748962402344, "ref_logps/rejected": -42.40960693359375, "rewards/accuracies": 0.375, "rewards/chosen": -1.018715739250183, "rewards/margins": 0.048056092113256454, "rewards/rejected": -1.0667717456817627, "step": 646 }, { "epoch": 0.61, "grad_norm": 16.54840660095215, "learning_rate": 4.424623994403637e-07, "logps/chosen": -42.495155334472656, "logps/rejected": -52.528602600097656, "loss": 0.4561, "losses/dpo": 0.3666003942489624, "losses/sft": 1.7475947141647339, "losses/total": 0.3666003942489624, "ref_logps/chosen": -33.528594970703125, "ref_logps/rejected": -36.515892028808594, "rewards/accuracies": 0.875, "rewards/chosen": -0.8966560363769531, "rewards/margins": 0.7046149373054504, "rewards/rejected": -1.6012709140777588, "step": 647 }, { "epoch": 0.61, "grad_norm": 25.164112091064453, "learning_rate": 4.422875131164743e-07, "logps/chosen": -62.31259536743164, "logps/rejected": -66.88058471679688, "loss": 0.631, "losses/dpo": 0.634787917137146, "losses/sft": 1.5533806085586548, "losses/total": 0.634787917137146, "ref_logps/chosen": -51.22900390625, "ref_logps/rejected": -51.84467315673828, "rewards/accuracies": 0.625, "rewards/chosen": -1.1083593368530273, "rewards/margins": 0.3952321410179138, "rewards/rejected": -1.503591537475586, "step": 648 }, { "epoch": 0.61, "grad_norm": 25.253978729248047, "learning_rate": 4.421126267925848e-07, "logps/chosen": -50.226009368896484, "logps/rejected": -44.45460891723633, "loss": 0.7328, "losses/dpo": 1.0524861812591553, "losses/sft": 2.099217414855957, "losses/total": 1.0524861812591553, "ref_logps/chosen": -39.9755859375, "ref_logps/rejected": -33.808143615722656, "rewards/accuracies": 0.5, "rewards/chosen": -1.0250425338745117, "rewards/margins": 0.039604201912879944, "rewards/rejected": -1.0646467208862305, "step": 649 }, { "epoch": 0.61, "grad_norm": 18.047706604003906, "learning_rate": 4.419377404686953e-07, "logps/chosen": -45.84514617919922, "logps/rejected": -64.69760131835938, "loss": 0.4586, "losses/dpo": 0.6318464279174805, "losses/sft": 1.4103286266326904, "losses/total": 0.6318464279174805, "ref_logps/chosen": -36.6884880065918, "ref_logps/rejected": -48.65917205810547, "rewards/accuracies": 0.875, "rewards/chosen": -0.9156655073165894, "rewards/margins": 0.6881779432296753, "rewards/rejected": -1.603843331336975, "step": 650 }, { "epoch": 0.61, "grad_norm": 20.930429458618164, "learning_rate": 4.417628541448059e-07, "logps/chosen": -50.62479019165039, "logps/rejected": -47.59782028198242, "loss": 0.6128, "losses/dpo": 0.5327115058898926, "losses/sft": 1.6167140007019043, "losses/total": 0.5327115058898926, "ref_logps/chosen": -40.95143127441406, "ref_logps/rejected": -34.05638885498047, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9673357605934143, "rewards/margins": 0.3868076205253601, "rewards/rejected": -1.3541433811187744, "step": 651 }, { "epoch": 0.62, "grad_norm": 20.108413696289062, "learning_rate": 4.415879678209164e-07, "logps/chosen": -50.80863952636719, "logps/rejected": -78.14813232421875, "loss": 0.4678, "losses/dpo": 0.5177967548370361, "losses/sft": 2.116820812225342, "losses/total": 0.5177967548370361, "ref_logps/chosen": -38.272247314453125, "ref_logps/rejected": -59.01039505004883, "rewards/accuracies": 0.875, "rewards/chosen": -1.2536389827728271, "rewards/margins": 0.6601352691650391, "rewards/rejected": -1.9137742519378662, "step": 652 }, { "epoch": 0.62, "grad_norm": 16.709762573242188, "learning_rate": 4.4141308149702696e-07, "logps/chosen": -46.55236053466797, "logps/rejected": -79.44471740722656, "loss": 0.4021, "losses/dpo": 0.32214534282684326, "losses/sft": 2.058011531829834, "losses/total": 0.32214534282684326, "ref_logps/chosen": -38.24629211425781, "ref_logps/rejected": -60.538787841796875, "rewards/accuracies": 0.75, "rewards/chosen": -0.8306071162223816, "rewards/margins": 1.0599861145019531, "rewards/rejected": -1.8905932903289795, "step": 653 }, { "epoch": 0.62, "grad_norm": 16.43296241760254, "learning_rate": 4.412381951731374e-07, "logps/chosen": -53.85081100463867, "logps/rejected": -69.29866027832031, "loss": 0.4691, "losses/dpo": 0.4782261848449707, "losses/sft": 2.026470899581909, "losses/total": 0.4782261848449707, "ref_logps/chosen": -44.15160369873047, "ref_logps/rejected": -51.61612319946289, "rewards/accuracies": 0.75, "rewards/chosen": -0.9699205160140991, "rewards/margins": 0.7983332872390747, "rewards/rejected": -1.7682539224624634, "step": 654 }, { "epoch": 0.62, "grad_norm": 25.55121421813965, "learning_rate": 4.41063308849248e-07, "logps/chosen": -64.85223388671875, "logps/rejected": -68.76423645019531, "loss": 0.5282, "losses/dpo": 0.8686087131500244, "losses/sft": 1.912293553352356, "losses/total": 0.8686087131500244, "ref_logps/chosen": -53.41896057128906, "ref_logps/rejected": -49.61004638671875, "rewards/accuracies": 0.875, "rewards/chosen": -1.1433278322219849, "rewards/margins": 0.772091269493103, "rewards/rejected": -1.9154192209243774, "step": 655 }, { "epoch": 0.62, "grad_norm": 21.51396369934082, "learning_rate": 4.408884225253585e-07, "logps/chosen": -46.20930099487305, "logps/rejected": -54.008995056152344, "loss": 0.6033, "losses/dpo": 0.5834099054336548, "losses/sft": 1.929543375968933, "losses/total": 0.5834099054336548, "ref_logps/chosen": -38.36368942260742, "ref_logps/rejected": -41.02717590332031, "rewards/accuracies": 0.5625, "rewards/chosen": -0.784561038017273, "rewards/margins": 0.513620913028717, "rewards/rejected": -1.2981820106506348, "step": 656 }, { "epoch": 0.62, "grad_norm": 21.454519271850586, "learning_rate": 4.40713536201469e-07, "logps/chosen": -49.7601203918457, "logps/rejected": -56.79192352294922, "loss": 0.5375, "losses/dpo": 0.715327799320221, "losses/sft": 1.7063201665878296, "losses/total": 0.715327799320221, "ref_logps/chosen": -40.21389389038086, "ref_logps/rejected": -42.62712097167969, "rewards/accuracies": 0.8125, "rewards/chosen": -0.954622745513916, "rewards/margins": 0.46185749769210815, "rewards/rejected": -1.416480302810669, "step": 657 }, { "epoch": 0.62, "grad_norm": 22.692399978637695, "learning_rate": 4.4053864987757957e-07, "logps/chosen": -59.7557373046875, "logps/rejected": -68.03887176513672, "loss": 0.5193, "losses/dpo": 0.6416804790496826, "losses/sft": 1.9612102508544922, "losses/total": 0.6416804790496826, "ref_logps/chosen": -45.95321273803711, "ref_logps/rejected": -48.96763610839844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3802525997161865, "rewards/margins": 0.5268707275390625, "rewards/rejected": -1.9071234464645386, "step": 658 }, { "epoch": 0.62, "grad_norm": 19.45255470275879, "learning_rate": 4.403637635536901e-07, "logps/chosen": -43.349327087402344, "logps/rejected": -55.58039093017578, "loss": 0.5812, "losses/dpo": 0.9194954633712769, "losses/sft": 2.0749683380126953, "losses/total": 0.9194954633712769, "ref_logps/chosen": -32.08791732788086, "ref_logps/rejected": -38.647396087646484, "rewards/accuracies": 0.75, "rewards/chosen": -1.126140832901001, "rewards/margins": 0.567158579826355, "rewards/rejected": -1.6932992935180664, "step": 659 }, { "epoch": 0.62, "grad_norm": 17.813053131103516, "learning_rate": 4.4018887722980065e-07, "logps/chosen": -45.078216552734375, "logps/rejected": -54.474239349365234, "loss": 0.4964, "losses/dpo": 0.4895384907722473, "losses/sft": 1.8483972549438477, "losses/total": 0.4895384907722473, "ref_logps/chosen": -36.561851501464844, "ref_logps/rejected": -40.54484176635742, "rewards/accuracies": 0.75, "rewards/chosen": -0.8516362905502319, "rewards/margins": 0.5413033962249756, "rewards/rejected": -1.3929396867752075, "step": 660 }, { "epoch": 0.62, "grad_norm": 17.709857940673828, "learning_rate": 4.400139909059111e-07, "logps/chosen": -44.531532287597656, "logps/rejected": -61.72312545776367, "loss": 0.5045, "losses/dpo": 0.6141805648803711, "losses/sft": 1.736121654510498, "losses/total": 0.6141805648803711, "ref_logps/chosen": -35.72633361816406, "ref_logps/rejected": -44.452491760253906, "rewards/accuracies": 0.6875, "rewards/chosen": -0.880520224571228, "rewards/margins": 0.8465432524681091, "rewards/rejected": -1.7270634174346924, "step": 661 }, { "epoch": 0.63, "grad_norm": 25.278169631958008, "learning_rate": 4.398391045820217e-07, "logps/chosen": -46.960514068603516, "logps/rejected": -52.05955505371094, "loss": 0.7564, "losses/dpo": 0.8389472365379333, "losses/sft": 1.385761022567749, "losses/total": 0.8389472365379333, "ref_logps/chosen": -36.994712829589844, "ref_logps/rejected": -41.69624710083008, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9965801239013672, "rewards/margins": 0.03975026682019234, "rewards/rejected": -1.0363304615020752, "step": 662 }, { "epoch": 0.63, "grad_norm": 24.060327529907227, "learning_rate": 4.396642182581322e-07, "logps/chosen": -51.66042709350586, "logps/rejected": -70.60588073730469, "loss": 0.6368, "losses/dpo": 0.8159767389297485, "losses/sft": 1.8975802659988403, "losses/total": 0.8159767389297485, "ref_logps/chosen": -41.02614212036133, "ref_logps/rejected": -53.883201599121094, "rewards/accuracies": 0.625, "rewards/chosen": -1.0634284019470215, "rewards/margins": 0.6088401079177856, "rewards/rejected": -1.6722686290740967, "step": 663 }, { "epoch": 0.63, "grad_norm": 25.352890014648438, "learning_rate": 4.394893319342427e-07, "logps/chosen": -53.84954071044922, "logps/rejected": -71.38723754882812, "loss": 0.5687, "losses/dpo": 0.8507640361785889, "losses/sft": 2.5388991832733154, "losses/total": 0.8507640361785889, "ref_logps/chosen": -41.39336395263672, "ref_logps/rejected": -52.69435119628906, "rewards/accuracies": 0.6875, "rewards/chosen": -1.245617389678955, "rewards/margins": 0.6236708164215088, "rewards/rejected": -1.8692882061004639, "step": 664 }, { "epoch": 0.63, "grad_norm": 26.868606567382812, "learning_rate": 4.3931444561035327e-07, "logps/chosen": -56.91460418701172, "logps/rejected": -64.68113708496094, "loss": 0.6458, "losses/dpo": 0.9483543634414673, "losses/sft": 2.0880446434020996, "losses/total": 0.9483543634414673, "ref_logps/chosen": -47.140235900878906, "ref_logps/rejected": -49.95226287841797, "rewards/accuracies": 0.6875, "rewards/chosen": -0.977436900138855, "rewards/margins": 0.49545052647590637, "rewards/rejected": -1.472887396812439, "step": 665 }, { "epoch": 0.63, "grad_norm": 25.787260055541992, "learning_rate": 4.391395592864638e-07, "logps/chosen": -45.327796936035156, "logps/rejected": -53.5628776550293, "loss": 0.6558, "losses/dpo": 1.2123247385025024, "losses/sft": 2.302277088165283, "losses/total": 1.2123247385025024, "ref_logps/chosen": -34.81057357788086, "ref_logps/rejected": -39.50253677368164, "rewards/accuracies": 0.625, "rewards/chosen": -1.0517226457595825, "rewards/margins": 0.3543117642402649, "rewards/rejected": -1.4060344696044922, "step": 666 }, { "epoch": 0.63, "grad_norm": 20.70470428466797, "learning_rate": 4.3896467296257434e-07, "logps/chosen": -43.803653717041016, "logps/rejected": -57.881004333496094, "loss": 0.632, "losses/dpo": 0.8098856806755066, "losses/sft": 2.082270622253418, "losses/total": 0.8098856806755066, "ref_logps/chosen": -34.23711395263672, "ref_logps/rejected": -43.208396911621094, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9566540718078613, "rewards/margins": 0.5106068849563599, "rewards/rejected": -1.4672608375549316, "step": 667 }, { "epoch": 0.63, "grad_norm": 27.2071533203125, "learning_rate": 4.387897866386848e-07, "logps/chosen": -53.50001525878906, "logps/rejected": -57.1082878112793, "loss": 0.7066, "losses/dpo": 0.8632234334945679, "losses/sft": 1.7941436767578125, "losses/total": 0.8632234334945679, "ref_logps/chosen": -41.003883361816406, "ref_logps/rejected": -43.136844635009766, "rewards/accuracies": 0.5, "rewards/chosen": -1.2496129274368286, "rewards/margins": 0.1475316286087036, "rewards/rejected": -1.3971445560455322, "step": 668 }, { "epoch": 0.63, "grad_norm": 17.104589462280273, "learning_rate": 4.3861490031479537e-07, "logps/chosen": -41.32619094848633, "logps/rejected": -60.7089729309082, "loss": 0.4586, "losses/dpo": 0.5647233724594116, "losses/sft": 1.8316291570663452, "losses/total": 0.5647233724594116, "ref_logps/chosen": -32.7796630859375, "ref_logps/rejected": -43.463077545166016, "rewards/accuracies": 0.75, "rewards/chosen": -0.8546529412269592, "rewards/margins": 0.8699364066123962, "rewards/rejected": -1.7245893478393555, "step": 669 }, { "epoch": 0.63, "grad_norm": 16.925846099853516, "learning_rate": 4.3844001399090594e-07, "logps/chosen": -47.019798278808594, "logps/rejected": -64.30384063720703, "loss": 0.5102, "losses/dpo": 0.8314761519432068, "losses/sft": 1.7040923833847046, "losses/total": 0.8314761519432068, "ref_logps/chosen": -40.31928253173828, "ref_logps/rejected": -49.528221130371094, "rewards/accuracies": 0.75, "rewards/chosen": -0.6700518131256104, "rewards/margins": 0.8075105547904968, "rewards/rejected": -1.477562427520752, "step": 670 }, { "epoch": 0.63, "grad_norm": 22.852285385131836, "learning_rate": 4.382651276670164e-07, "logps/chosen": -48.86735534667969, "logps/rejected": -60.63975524902344, "loss": 0.5387, "losses/dpo": 0.2187681794166565, "losses/sft": 1.4940122365951538, "losses/total": 0.2187681794166565, "ref_logps/chosen": -39.97687530517578, "ref_logps/rejected": -44.219886779785156, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8890483975410461, "rewards/margins": 0.7529385089874268, "rewards/rejected": -1.6419868469238281, "step": 671 }, { "epoch": 0.63, "grad_norm": 23.92821502685547, "learning_rate": 4.3809024134312696e-07, "logps/chosen": -47.636817932128906, "logps/rejected": -54.7767333984375, "loss": 0.6583, "losses/dpo": 0.7265498042106628, "losses/sft": 1.5916630029678345, "losses/total": 0.7265498042106628, "ref_logps/chosen": -38.55921173095703, "ref_logps/rejected": -42.94745635986328, "rewards/accuracies": 0.5, "rewards/chosen": -0.907760500907898, "rewards/margins": 0.2751672863960266, "rewards/rejected": -1.1829278469085693, "step": 672 }, { "epoch": 0.64, "grad_norm": 18.440404891967773, "learning_rate": 4.3791535501923747e-07, "logps/chosen": -48.635337829589844, "logps/rejected": -51.012855529785156, "loss": 0.5051, "losses/dpo": 0.7917296290397644, "losses/sft": 1.6104592084884644, "losses/total": 0.7917296290397644, "ref_logps/chosen": -40.228145599365234, "ref_logps/rejected": -36.581321716308594, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8407193422317505, "rewards/margins": 0.602434515953064, "rewards/rejected": -1.4431538581848145, "step": 673 }, { "epoch": 0.64, "grad_norm": 21.293899536132812, "learning_rate": 4.3774046869534804e-07, "logps/chosen": -35.00872802734375, "logps/rejected": -61.432472229003906, "loss": 0.4489, "losses/dpo": 0.5239160656929016, "losses/sft": 1.4297856092453003, "losses/total": 0.5239160656929016, "ref_logps/chosen": -29.145401000976562, "ref_logps/rejected": -48.458168029785156, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5863324403762817, "rewards/margins": 0.7110979557037354, "rewards/rejected": -1.2974302768707275, "step": 674 }, { "epoch": 0.64, "grad_norm": 19.253389358520508, "learning_rate": 4.375655823714585e-07, "logps/chosen": -43.62630844116211, "logps/rejected": -57.79328918457031, "loss": 0.4773, "losses/dpo": 0.6137611269950867, "losses/sft": 1.7614896297454834, "losses/total": 0.6137611269950867, "ref_logps/chosen": -36.42924880981445, "ref_logps/rejected": -42.72636413574219, "rewards/accuracies": 0.75, "rewards/chosen": -0.7197059392929077, "rewards/margins": 0.7869865894317627, "rewards/rejected": -1.5066924095153809, "step": 675 }, { "epoch": 0.64, "grad_norm": 22.75641632080078, "learning_rate": 4.3739069604756906e-07, "logps/chosen": -55.21026611328125, "logps/rejected": -53.981781005859375, "loss": 0.7281, "losses/dpo": 0.7902309894561768, "losses/sft": 1.4334278106689453, "losses/total": 0.7902309894561768, "ref_logps/chosen": -43.870574951171875, "ref_logps/rejected": -38.35118865966797, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1339691877365112, "rewards/margins": 0.42909008264541626, "rewards/rejected": -1.5630592107772827, "step": 676 }, { "epoch": 0.64, "grad_norm": 22.33275032043457, "learning_rate": 4.3721580972367963e-07, "logps/chosen": -41.71989440917969, "logps/rejected": -59.97500991821289, "loss": 0.5978, "losses/dpo": 0.517135739326477, "losses/sft": 1.2349271774291992, "losses/total": 0.517135739326477, "ref_logps/chosen": -35.97785949707031, "ref_logps/rejected": -49.64389419555664, "rewards/accuracies": 0.75, "rewards/chosen": -0.5742034316062927, "rewards/margins": 0.4589080214500427, "rewards/rejected": -1.0331114530563354, "step": 677 }, { "epoch": 0.64, "grad_norm": 22.815269470214844, "learning_rate": 4.370409233997901e-07, "logps/chosen": -39.422367095947266, "logps/rejected": -77.874755859375, "loss": 0.5313, "losses/dpo": 0.6275076270103455, "losses/sft": 1.2638832330703735, "losses/total": 0.6275076270103455, "ref_logps/chosen": -33.01189041137695, "ref_logps/rejected": -65.9725341796875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6410475373268127, "rewards/margins": 0.5491743087768555, "rewards/rejected": -1.1902217864990234, "step": 678 }, { "epoch": 0.64, "grad_norm": 17.992204666137695, "learning_rate": 4.3686603707590065e-07, "logps/chosen": -40.244049072265625, "logps/rejected": -64.80023193359375, "loss": 0.4144, "losses/dpo": 0.4164362847805023, "losses/sft": 1.7152419090270996, "losses/total": 0.4164362847805023, "ref_logps/chosen": -33.803955078125, "ref_logps/rejected": -50.13800811767578, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6440093517303467, "rewards/margins": 0.8222131133079529, "rewards/rejected": -1.4662225246429443, "step": 679 }, { "epoch": 0.64, "grad_norm": 16.815841674804688, "learning_rate": 4.3669115075201117e-07, "logps/chosen": -42.92987060546875, "logps/rejected": -66.77555847167969, "loss": 0.3951, "losses/dpo": 0.3958371877670288, "losses/sft": 2.2657477855682373, "losses/total": 0.3958371877670288, "ref_logps/chosen": -36.862754821777344, "ref_logps/rejected": -48.75736618041992, "rewards/accuracies": 0.75, "rewards/chosen": -0.606711208820343, "rewards/margins": 1.1951079368591309, "rewards/rejected": -1.801819086074829, "step": 680 }, { "epoch": 0.64, "grad_norm": 25.332477569580078, "learning_rate": 4.3651626442812173e-07, "logps/chosen": -48.87312316894531, "logps/rejected": -47.05182647705078, "loss": 0.9043, "losses/dpo": 0.7493389844894409, "losses/sft": 1.770767092704773, "losses/total": 0.7493389844894409, "ref_logps/chosen": -37.08683776855469, "ref_logps/rejected": -36.57822799682617, "rewards/accuracies": 0.4375, "rewards/chosen": -1.17862868309021, "rewards/margins": -0.13126879930496216, "rewards/rejected": -1.047359824180603, "step": 681 }, { "epoch": 0.64, "grad_norm": 20.06817626953125, "learning_rate": 4.363413781042322e-07, "logps/chosen": -39.16017150878906, "logps/rejected": -45.40052032470703, "loss": 0.7039, "losses/dpo": 1.1117303371429443, "losses/sft": 1.7223199605941772, "losses/total": 1.1117303371429443, "ref_logps/chosen": -29.940467834472656, "ref_logps/rejected": -32.448307037353516, "rewards/accuracies": 0.625, "rewards/chosen": -0.9219707250595093, "rewards/margins": 0.37325066328048706, "rewards/rejected": -1.2952213287353516, "step": 682 }, { "epoch": 0.64, "grad_norm": 17.64261817932129, "learning_rate": 4.3616649178034276e-07, "logps/chosen": -42.20592498779297, "logps/rejected": -60.50687789916992, "loss": 0.4485, "losses/dpo": 0.4617617428302765, "losses/sft": 1.6748210191726685, "losses/total": 0.4617617428302765, "ref_logps/chosen": -35.366451263427734, "ref_logps/rejected": -44.39671325683594, "rewards/accuracies": 0.75, "rewards/chosen": -0.6839475631713867, "rewards/margins": 0.9270689487457275, "rewards/rejected": -1.6110165119171143, "step": 683 }, { "epoch": 0.65, "grad_norm": 21.765056610107422, "learning_rate": 4.359916054564533e-07, "logps/chosen": -51.79177474975586, "logps/rejected": -49.704383850097656, "loss": 0.6447, "losses/dpo": 0.38130292296409607, "losses/sft": 1.7255353927612305, "losses/total": 0.38130292296409607, "ref_logps/chosen": -41.98659896850586, "ref_logps/rejected": -36.198974609375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9805173873901367, "rewards/margins": 0.37002331018447876, "rewards/rejected": -1.3505406379699707, "step": 684 }, { "epoch": 0.65, "grad_norm": 24.796009063720703, "learning_rate": 4.358167191325638e-07, "logps/chosen": -62.32807922363281, "logps/rejected": -58.585182189941406, "loss": 0.7332, "losses/dpo": 0.9550775289535522, "losses/sft": 2.147249221801758, "losses/total": 0.9550775289535522, "ref_logps/chosen": -49.45254135131836, "ref_logps/rejected": -43.134979248046875, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2875540256500244, "rewards/margins": 0.2574663460254669, "rewards/rejected": -1.545020341873169, "step": 685 }, { "epoch": 0.65, "grad_norm": 20.360877990722656, "learning_rate": 4.3564183280867435e-07, "logps/chosen": -48.447593688964844, "logps/rejected": -61.438575744628906, "loss": 0.4941, "losses/dpo": 0.6078076362609863, "losses/sft": 1.8867300748825073, "losses/total": 0.6078076362609863, "ref_logps/chosen": -39.92787170410156, "ref_logps/rejected": -45.262115478515625, "rewards/accuracies": 0.625, "rewards/chosen": -0.851972222328186, "rewards/margins": 0.7656738758087158, "rewards/rejected": -1.6176462173461914, "step": 686 }, { "epoch": 0.65, "grad_norm": 23.67317008972168, "learning_rate": 4.3546694648478486e-07, "logps/chosen": -47.97858810424805, "logps/rejected": -73.85092163085938, "loss": 0.6241, "losses/dpo": 0.40187883377075195, "losses/sft": 1.4718133211135864, "losses/total": 0.40187883377075195, "ref_logps/chosen": -35.94654846191406, "ref_logps/rejected": -54.152320861816406, "rewards/accuracies": 0.75, "rewards/chosen": -1.2032039165496826, "rewards/margins": 0.7666558027267456, "rewards/rejected": -1.9698598384857178, "step": 687 }, { "epoch": 0.65, "grad_norm": 19.96158790588379, "learning_rate": 4.3529206016089543e-07, "logps/chosen": -41.90129470825195, "logps/rejected": -55.30469512939453, "loss": 0.6205, "losses/dpo": 0.6631141901016235, "losses/sft": 1.4098172187805176, "losses/total": 0.6631141901016235, "ref_logps/chosen": -33.053680419921875, "ref_logps/rejected": -42.67448043823242, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8847612142562866, "rewards/margins": 0.37826040387153625, "rewards/rejected": -1.2630215883255005, "step": 688 }, { "epoch": 0.65, "grad_norm": 27.99469757080078, "learning_rate": 4.351171738370059e-07, "logps/chosen": -51.00370788574219, "logps/rejected": -55.13591003417969, "loss": 0.8336, "losses/dpo": 1.0738999843597412, "losses/sft": 1.9919081926345825, "losses/total": 1.0738999843597412, "ref_logps/chosen": -39.451507568359375, "ref_logps/rejected": -44.67042541503906, "rewards/accuracies": 0.5, "rewards/chosen": -1.1552196741104126, "rewards/margins": -0.10867124050855637, "rewards/rejected": -1.0465483665466309, "step": 689 }, { "epoch": 0.65, "grad_norm": 28.0861873626709, "learning_rate": 4.3494228751311645e-07, "logps/chosen": -44.46216583251953, "logps/rejected": -49.33219528198242, "loss": 0.8589, "losses/dpo": 1.1967506408691406, "losses/sft": 1.7002787590026855, "losses/total": 1.1967506408691406, "ref_logps/chosen": -32.268699645996094, "ref_logps/rejected": -38.94769287109375, "rewards/accuracies": 0.25, "rewards/chosen": -1.2193467617034912, "rewards/margins": -0.18089626729488373, "rewards/rejected": -1.0384504795074463, "step": 690 }, { "epoch": 0.65, "grad_norm": 20.783323287963867, "learning_rate": 4.34767401189227e-07, "logps/chosen": -51.34596252441406, "logps/rejected": -57.26661682128906, "loss": 0.6163, "losses/dpo": 0.7968945503234863, "losses/sft": 1.4953840970993042, "losses/total": 0.7968945503234863, "ref_logps/chosen": -42.618202209472656, "ref_logps/rejected": -44.93904495239258, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8727757930755615, "rewards/margins": 0.35998106002807617, "rewards/rejected": -1.2327568531036377, "step": 691 }, { "epoch": 0.65, "grad_norm": 22.957321166992188, "learning_rate": 4.345925148653375e-07, "logps/chosen": -48.171058654785156, "logps/rejected": -51.01253128051758, "loss": 0.871, "losses/dpo": 0.7004855871200562, "losses/sft": 1.5449622869491577, "losses/total": 0.7004855871200562, "ref_logps/chosen": -37.79178237915039, "ref_logps/rejected": -39.620338439941406, "rewards/accuracies": 0.5, "rewards/chosen": -1.0379281044006348, "rewards/margins": 0.10129156708717346, "rewards/rejected": -1.1392196416854858, "step": 692 }, { "epoch": 0.65, "grad_norm": 22.219606399536133, "learning_rate": 4.3441762854144804e-07, "logps/chosen": -50.45658874511719, "logps/rejected": -68.93952178955078, "loss": 0.5351, "losses/dpo": 0.6337833404541016, "losses/sft": 1.8166413307189941, "losses/total": 0.6337833404541016, "ref_logps/chosen": -41.516822814941406, "ref_logps/rejected": -53.788997650146484, "rewards/accuracies": 0.625, "rewards/chosen": -0.893976628780365, "rewards/margins": 0.6210753917694092, "rewards/rejected": -1.515052080154419, "step": 693 }, { "epoch": 0.66, "grad_norm": 16.8718204498291, "learning_rate": 4.3424274221755856e-07, "logps/chosen": -44.262779235839844, "logps/rejected": -65.33273315429688, "loss": 0.4547, "losses/dpo": 0.5813052654266357, "losses/sft": 1.791420817375183, "losses/total": 0.5813052654266357, "ref_logps/chosen": -33.40455627441406, "ref_logps/rejected": -46.93719482421875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0858221054077148, "rewards/margins": 0.7537323832511902, "rewards/rejected": -1.8395545482635498, "step": 694 }, { "epoch": 0.66, "grad_norm": 18.257749557495117, "learning_rate": 4.340678558936691e-07, "logps/chosen": -33.0558967590332, "logps/rejected": -49.74082946777344, "loss": 0.5253, "losses/dpo": 0.583354115486145, "losses/sft": 1.278853178024292, "losses/total": 0.583354115486145, "ref_logps/chosen": -25.848865509033203, "ref_logps/rejected": -36.68687438964844, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7207031846046448, "rewards/margins": 0.5846923589706421, "rewards/rejected": -1.3053956031799316, "step": 695 }, { "epoch": 0.66, "grad_norm": 24.73470115661621, "learning_rate": 4.3389296956977963e-07, "logps/chosen": -49.288055419921875, "logps/rejected": -59.290550231933594, "loss": 0.6199, "losses/dpo": 0.5122953653335571, "losses/sft": 1.709577202796936, "losses/total": 0.5122953653335571, "ref_logps/chosen": -39.45866394042969, "ref_logps/rejected": -44.76314163208008, "rewards/accuracies": 0.625, "rewards/chosen": -0.9829391241073608, "rewards/margins": 0.4698018431663513, "rewards/rejected": -1.4527409076690674, "step": 696 }, { "epoch": 0.66, "grad_norm": 17.934207916259766, "learning_rate": 4.3371808324589015e-07, "logps/chosen": -37.41145324707031, "logps/rejected": -51.14091873168945, "loss": 0.6256, "losses/dpo": 0.9151169657707214, "losses/sft": 1.8010318279266357, "losses/total": 0.9151169657707214, "ref_logps/chosen": -29.690553665161133, "ref_logps/rejected": -37.15107727050781, "rewards/accuracies": 0.625, "rewards/chosen": -0.7720898985862732, "rewards/margins": 0.6268945336341858, "rewards/rejected": -1.398984432220459, "step": 697 }, { "epoch": 0.66, "grad_norm": 15.5745849609375, "learning_rate": 4.335431969220007e-07, "logps/chosen": -36.08177185058594, "logps/rejected": -47.9534797668457, "loss": 0.5665, "losses/dpo": 0.5802960395812988, "losses/sft": 1.8154224157333374, "losses/total": 0.5802960395812988, "ref_logps/chosen": -27.115325927734375, "ref_logps/rejected": -34.380088806152344, "rewards/accuracies": 0.75, "rewards/chosen": -0.8966445326805115, "rewards/margins": 0.4606950879096985, "rewards/rejected": -1.35733962059021, "step": 698 }, { "epoch": 0.66, "grad_norm": 16.618440628051758, "learning_rate": 4.3336831059811117e-07, "logps/chosen": -44.092445373535156, "logps/rejected": -50.84418487548828, "loss": 0.5017, "losses/dpo": 0.5852111577987671, "losses/sft": 1.4667259454727173, "losses/total": 0.5852111577987671, "ref_logps/chosen": -35.301414489746094, "ref_logps/rejected": -36.55914306640625, "rewards/accuracies": 0.875, "rewards/chosen": -0.8791036605834961, "rewards/margins": 0.5494003891944885, "rewards/rejected": -1.4285039901733398, "step": 699 }, { "epoch": 0.66, "grad_norm": 17.073543548583984, "learning_rate": 4.3319342427422174e-07, "logps/chosen": -46.26211166381836, "logps/rejected": -67.36860656738281, "loss": 0.3976, "losses/dpo": 0.7678629159927368, "losses/sft": 1.8686453104019165, "losses/total": 0.7678629159927368, "ref_logps/chosen": -39.48162841796875, "ref_logps/rejected": -50.303245544433594, "rewards/accuracies": 0.875, "rewards/chosen": -0.6780481934547424, "rewards/margins": 1.0284874439239502, "rewards/rejected": -1.7065356969833374, "step": 700 }, { "epoch": 0.66, "grad_norm": 19.845348358154297, "learning_rate": 4.3301853795033225e-07, "logps/chosen": -45.529266357421875, "logps/rejected": -47.47041320800781, "loss": 0.713, "losses/dpo": 0.8201539516448975, "losses/sft": 1.7441099882125854, "losses/total": 0.8201539516448975, "ref_logps/chosen": -35.924644470214844, "ref_logps/rejected": -35.18165588378906, "rewards/accuracies": 0.625, "rewards/chosen": -0.960462749004364, "rewards/margins": 0.26841315627098083, "rewards/rejected": -1.2288758754730225, "step": 701 }, { "epoch": 0.66, "grad_norm": 27.96426773071289, "learning_rate": 4.328436516264428e-07, "logps/chosen": -56.74497985839844, "logps/rejected": -58.99053192138672, "loss": 0.7965, "losses/dpo": 1.0811152458190918, "losses/sft": 2.0527267456054688, "losses/total": 1.0811152458190918, "ref_logps/chosen": -46.56687927246094, "ref_logps/rejected": -48.54755401611328, "rewards/accuracies": 0.5, "rewards/chosen": -1.0178104639053345, "rewards/margins": 0.02648720145225525, "rewards/rejected": -1.0442975759506226, "step": 702 }, { "epoch": 0.66, "grad_norm": 23.73536491394043, "learning_rate": 4.3266876530255333e-07, "logps/chosen": -53.9489631652832, "logps/rejected": -67.30574035644531, "loss": 0.6054, "losses/dpo": 0.24673229455947876, "losses/sft": 1.821505069732666, "losses/total": 0.24673229455947876, "ref_logps/chosen": -43.61452865600586, "ref_logps/rejected": -52.201171875, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0334436893463135, "rewards/margins": 0.47701317071914673, "rewards/rejected": -1.5104568004608154, "step": 703 }, { "epoch": 0.66, "grad_norm": 16.407470703125, "learning_rate": 4.3249387897866384e-07, "logps/chosen": -27.682621002197266, "logps/rejected": -50.503944396972656, "loss": 0.5236, "losses/dpo": 0.49027788639068604, "losses/sft": 1.0848276615142822, "losses/total": 0.49027788639068604, "ref_logps/chosen": -22.123821258544922, "ref_logps/rejected": -38.551002502441406, "rewards/accuracies": 0.625, "rewards/chosen": -0.555880069732666, "rewards/margins": 0.6394139528274536, "rewards/rejected": -1.19529390335083, "step": 704 }, { "epoch": 0.67, "grad_norm": 21.02859878540039, "learning_rate": 4.323189926547744e-07, "logps/chosen": -46.8189697265625, "logps/rejected": -59.234222412109375, "loss": 0.5939, "losses/dpo": 0.631971001625061, "losses/sft": 1.6848599910736084, "losses/total": 0.631971001625061, "ref_logps/chosen": -38.235374450683594, "ref_logps/rejected": -45.48722457885742, "rewards/accuracies": 0.75, "rewards/chosen": -0.8583594560623169, "rewards/margins": 0.5163403153419495, "rewards/rejected": -1.3746997117996216, "step": 705 }, { "epoch": 0.67, "grad_norm": 23.449262619018555, "learning_rate": 4.3214410633088487e-07, "logps/chosen": -48.120262145996094, "logps/rejected": -59.29852294921875, "loss": 0.6985, "losses/dpo": 0.525850772857666, "losses/sft": 1.7156158685684204, "losses/total": 0.525850772857666, "ref_logps/chosen": -39.4202995300293, "ref_logps/rejected": -49.22132110595703, "rewards/accuracies": 0.625, "rewards/chosen": -0.8699963688850403, "rewards/margins": 0.1377238780260086, "rewards/rejected": -1.0077202320098877, "step": 706 }, { "epoch": 0.67, "grad_norm": 18.30802345275879, "learning_rate": 4.3196922000699543e-07, "logps/chosen": -44.236473083496094, "logps/rejected": -48.68854522705078, "loss": 0.6107, "losses/dpo": 0.5337749123573303, "losses/sft": 1.376046895980835, "losses/total": 0.5337749123573303, "ref_logps/chosen": -36.16105651855469, "ref_logps/rejected": -37.10627746582031, "rewards/accuracies": 0.625, "rewards/chosen": -0.8075416684150696, "rewards/margins": 0.3506850600242615, "rewards/rejected": -1.158226728439331, "step": 707 }, { "epoch": 0.67, "grad_norm": 14.369156837463379, "learning_rate": 4.3179433368310595e-07, "logps/chosen": -36.92255401611328, "logps/rejected": -51.405059814453125, "loss": 0.4603, "losses/dpo": 0.4892197251319885, "losses/sft": 1.0883471965789795, "losses/total": 0.4892197251319885, "ref_logps/chosen": -32.85285186767578, "ref_logps/rejected": -39.484031677246094, "rewards/accuracies": 0.75, "rewards/chosen": -0.40697014331817627, "rewards/margins": 0.7851329445838928, "rewards/rejected": -1.1921030282974243, "step": 708 }, { "epoch": 0.67, "grad_norm": 19.65743064880371, "learning_rate": 4.316194473592165e-07, "logps/chosen": -34.99608612060547, "logps/rejected": -43.15175247192383, "loss": 0.6552, "losses/dpo": 0.4912104606628418, "losses/sft": 1.3308316469192505, "losses/total": 0.4912104606628418, "ref_logps/chosen": -30.362171173095703, "ref_logps/rejected": -36.70307922363281, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4633915424346924, "rewards/margins": 0.1814756691455841, "rewards/rejected": -0.6448671817779541, "step": 709 }, { "epoch": 0.67, "grad_norm": 19.184167861938477, "learning_rate": 4.31444561035327e-07, "logps/chosen": -41.29024887084961, "logps/rejected": -45.53723907470703, "loss": 0.5721, "losses/dpo": 0.3396609425544739, "losses/sft": 1.4803591966629028, "losses/total": 0.3396609425544739, "ref_logps/chosen": -36.41984939575195, "ref_logps/rejected": -35.92561340332031, "rewards/accuracies": 0.6875, "rewards/chosen": -0.48704004287719727, "rewards/margins": 0.4741221070289612, "rewards/rejected": -0.9611622095108032, "step": 710 }, { "epoch": 0.67, "grad_norm": 17.922670364379883, "learning_rate": 4.3126967471143754e-07, "logps/chosen": -38.75063705444336, "logps/rejected": -49.31770324707031, "loss": 0.5491, "losses/dpo": 0.5774656534194946, "losses/sft": 1.4425379037857056, "losses/total": 0.5774656534194946, "ref_logps/chosen": -33.69990921020508, "ref_logps/rejected": -39.866554260253906, "rewards/accuracies": 0.625, "rewards/chosen": -0.5050727725028992, "rewards/margins": 0.4400426149368286, "rewards/rejected": -0.945115327835083, "step": 711 }, { "epoch": 0.67, "grad_norm": 12.123807907104492, "learning_rate": 4.310947883875481e-07, "logps/chosen": -27.21611213684082, "logps/rejected": -45.500240325927734, "loss": 0.4194, "losses/dpo": 0.45889779925346375, "losses/sft": 2.3296761512756348, "losses/total": 0.45889779925346375, "ref_logps/chosen": -23.09859848022461, "ref_logps/rejected": -32.90995788574219, "rewards/accuracies": 0.875, "rewards/chosen": -0.41175127029418945, "rewards/margins": 0.8472768068313599, "rewards/rejected": -1.2590280771255493, "step": 712 }, { "epoch": 0.67, "grad_norm": 16.523786544799805, "learning_rate": 4.3091990206365856e-07, "logps/chosen": -43.260677337646484, "logps/rejected": -72.78237915039062, "loss": 0.4426, "losses/dpo": 0.1860278695821762, "losses/sft": 1.7062638998031616, "losses/total": 0.1860278695821762, "ref_logps/chosen": -36.98164749145508, "ref_logps/rejected": -56.80029296875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6279028654098511, "rewards/margins": 0.9703060388565063, "rewards/rejected": -1.5982087850570679, "step": 713 }, { "epoch": 0.67, "grad_norm": 12.94727611541748, "learning_rate": 4.3074501573976913e-07, "logps/chosen": -31.61503028869629, "logps/rejected": -45.780914306640625, "loss": 0.3611, "losses/dpo": 0.3345387876033783, "losses/sft": 1.019202709197998, "losses/total": 0.3345387876033783, "ref_logps/chosen": -28.76348876953125, "ref_logps/rejected": -33.738136291503906, "rewards/accuracies": 1.0, "rewards/chosen": -0.2851543426513672, "rewards/margins": 0.9191235303878784, "rewards/rejected": -1.204277753829956, "step": 714 }, { "epoch": 0.68, "grad_norm": 15.002126693725586, "learning_rate": 4.305701294158797e-07, "logps/chosen": -34.27792739868164, "logps/rejected": -49.252220153808594, "loss": 0.4541, "losses/dpo": 0.5120267868041992, "losses/sft": 1.7566144466400146, "losses/total": 0.5120267868041992, "ref_logps/chosen": -29.74795150756836, "ref_logps/rejected": -36.67368698120117, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4529974162578583, "rewards/margins": 0.8048561811447144, "rewards/rejected": -1.2578535079956055, "step": 715 }, { "epoch": 0.68, "grad_norm": 23.409090042114258, "learning_rate": 4.303952430919902e-07, "logps/chosen": -57.65196228027344, "logps/rejected": -58.61241912841797, "loss": 0.6757, "losses/dpo": 0.37955763936042786, "losses/sft": 1.9902818202972412, "losses/total": 0.37955763936042786, "ref_logps/chosen": -48.691795349121094, "ref_logps/rejected": -45.262969970703125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8960169553756714, "rewards/margins": 0.43892765045166016, "rewards/rejected": -1.334944486618042, "step": 716 }, { "epoch": 0.68, "grad_norm": 20.81814956665039, "learning_rate": 4.302203567681007e-07, "logps/chosen": -52.60441207885742, "logps/rejected": -54.37532043457031, "loss": 0.6278, "losses/dpo": 0.6902827024459839, "losses/sft": 1.4632818698883057, "losses/total": 0.6902827024459839, "ref_logps/chosen": -45.165748596191406, "ref_logps/rejected": -43.0654296875, "rewards/accuracies": 0.75, "rewards/chosen": -0.7438666224479675, "rewards/margins": 0.3871225118637085, "rewards/rejected": -1.1309890747070312, "step": 717 }, { "epoch": 0.68, "grad_norm": 18.56036949157715, "learning_rate": 4.3004547044421123e-07, "logps/chosen": -48.89460754394531, "logps/rejected": -73.34172058105469, "loss": 0.4418, "losses/dpo": 0.3784174621105194, "losses/sft": 1.4442063570022583, "losses/total": 0.3784174621105194, "ref_logps/chosen": -38.736427307128906, "ref_logps/rejected": -52.50584411621094, "rewards/accuracies": 0.8125, "rewards/chosen": -1.015817642211914, "rewards/margins": 1.0677695274353027, "rewards/rejected": -2.083587169647217, "step": 718 }, { "epoch": 0.68, "grad_norm": 18.991525650024414, "learning_rate": 4.298705841203218e-07, "logps/chosen": -46.43339157104492, "logps/rejected": -58.17090606689453, "loss": 0.6159, "losses/dpo": 0.5434184074401855, "losses/sft": 1.0356076955795288, "losses/total": 0.5434184074401855, "ref_logps/chosen": -36.94187927246094, "ref_logps/rejected": -45.233558654785156, "rewards/accuracies": 0.875, "rewards/chosen": -0.9491511583328247, "rewards/margins": 0.3445841372013092, "rewards/rejected": -1.2937352657318115, "step": 719 }, { "epoch": 0.68, "grad_norm": 14.975947380065918, "learning_rate": 4.2969569779643226e-07, "logps/chosen": -42.52625274658203, "logps/rejected": -54.581756591796875, "loss": 0.4927, "losses/dpo": 0.4607549011707306, "losses/sft": 1.430629014968872, "losses/total": 0.4607549011707306, "ref_logps/chosen": -34.63993835449219, "ref_logps/rejected": -40.030967712402344, "rewards/accuracies": 0.75, "rewards/chosen": -0.7886309027671814, "rewards/margins": 0.6664482355117798, "rewards/rejected": -1.4550790786743164, "step": 720 }, { "epoch": 0.68, "grad_norm": 16.38325309753418, "learning_rate": 4.295208114725428e-07, "logps/chosen": -41.7340087890625, "logps/rejected": -52.14056396484375, "loss": 0.5441, "losses/dpo": 0.5851291418075562, "losses/sft": 1.5170259475708008, "losses/total": 0.5851291418075562, "ref_logps/chosen": -33.08143615722656, "ref_logps/rejected": -38.7519645690918, "rewards/accuracies": 0.875, "rewards/chosen": -0.8652572631835938, "rewards/margins": 0.47360312938690186, "rewards/rejected": -1.3388605117797852, "step": 721 }, { "epoch": 0.68, "grad_norm": 21.832866668701172, "learning_rate": 4.293459251486534e-07, "logps/chosen": -43.31774139404297, "logps/rejected": -62.20421600341797, "loss": 0.6606, "losses/dpo": 0.7681498527526855, "losses/sft": 1.543290376663208, "losses/total": 0.7681498527526855, "ref_logps/chosen": -34.86967849731445, "ref_logps/rejected": -47.756134033203125, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8448069095611572, "rewards/margins": 0.6000010967254639, "rewards/rejected": -1.444808006286621, "step": 722 }, { "epoch": 0.68, "grad_norm": 18.4277400970459, "learning_rate": 4.291710388247639e-07, "logps/chosen": -51.22413635253906, "logps/rejected": -60.75074005126953, "loss": 0.5454, "losses/dpo": 0.44816651940345764, "losses/sft": 1.9517427682876587, "losses/total": 0.44816651940345764, "ref_logps/chosen": -43.811912536621094, "ref_logps/rejected": -47.564964294433594, "rewards/accuracies": 0.75, "rewards/chosen": -0.7412219047546387, "rewards/margins": 0.5773557424545288, "rewards/rejected": -1.3185776472091675, "step": 723 }, { "epoch": 0.68, "grad_norm": 20.5158748626709, "learning_rate": 4.289961525008744e-07, "logps/chosen": -45.1531867980957, "logps/rejected": -66.62257385253906, "loss": 0.5519, "losses/dpo": 0.5188019871711731, "losses/sft": 1.0994102954864502, "losses/total": 0.5188019871711731, "ref_logps/chosen": -34.660316467285156, "ref_logps/rejected": -49.06583786010742, "rewards/accuracies": 0.625, "rewards/chosen": -1.0492873191833496, "rewards/margins": 0.7063854336738586, "rewards/rejected": -1.7556726932525635, "step": 724 }, { "epoch": 0.68, "grad_norm": 21.37078857421875, "learning_rate": 4.288212661769849e-07, "logps/chosen": -50.27540588378906, "logps/rejected": -67.41368103027344, "loss": 0.6139, "losses/dpo": 0.3498527407646179, "losses/sft": 1.631231427192688, "losses/total": 0.3498527407646179, "ref_logps/chosen": -41.15169906616211, "ref_logps/rejected": -52.33612823486328, "rewards/accuracies": 0.625, "rewards/chosen": -0.9123706817626953, "rewards/margins": 0.5953848958015442, "rewards/rejected": -1.5077555179595947, "step": 725 }, { "epoch": 0.69, "grad_norm": 21.714824676513672, "learning_rate": 4.286463798530955e-07, "logps/chosen": -50.311920166015625, "logps/rejected": -50.24692153930664, "loss": 0.6893, "losses/dpo": 0.8099931478500366, "losses/sft": 1.538763165473938, "losses/total": 0.8099931478500366, "ref_logps/chosen": -42.77843475341797, "ref_logps/rejected": -41.30024337768555, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7533482313156128, "rewards/margins": 0.14131946861743927, "rewards/rejected": -0.8946677446365356, "step": 726 }, { "epoch": 0.69, "grad_norm": 18.687244415283203, "learning_rate": 4.2847149352920595e-07, "logps/chosen": -48.927268981933594, "logps/rejected": -56.550621032714844, "loss": 0.6577, "losses/dpo": 0.7903240919113159, "losses/sft": 1.6181057691574097, "losses/total": 0.7903240919113159, "ref_logps/chosen": -40.08326721191406, "ref_logps/rejected": -43.57933044433594, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8843997716903687, "rewards/margins": 0.41272953152656555, "rewards/rejected": -1.2971292734146118, "step": 727 }, { "epoch": 0.69, "grad_norm": 18.43297004699707, "learning_rate": 4.282966072053165e-07, "logps/chosen": -45.604248046875, "logps/rejected": -67.88539123535156, "loss": 0.4841, "losses/dpo": 0.281689316034317, "losses/sft": 1.3659275770187378, "losses/total": 0.281689316034317, "ref_logps/chosen": -36.35345458984375, "ref_logps/rejected": -51.47601318359375, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9250791072845459, "rewards/margins": 0.7158592343330383, "rewards/rejected": -1.6409382820129395, "step": 728 }, { "epoch": 0.69, "grad_norm": 18.301931381225586, "learning_rate": 4.281217208814271e-07, "logps/chosen": -46.586639404296875, "logps/rejected": -55.49821472167969, "loss": 0.513, "losses/dpo": 0.4403191804885864, "losses/sft": 1.4714233875274658, "losses/total": 0.4403191804885864, "ref_logps/chosen": -38.97897720336914, "ref_logps/rejected": -41.933082580566406, "rewards/accuracies": 0.625, "rewards/chosen": -0.7607663869857788, "rewards/margins": 0.5957470536231995, "rewards/rejected": -1.356513500213623, "step": 729 }, { "epoch": 0.69, "grad_norm": 16.465715408325195, "learning_rate": 4.279468345575376e-07, "logps/chosen": -33.934410095214844, "logps/rejected": -49.73075866699219, "loss": 0.6093, "losses/dpo": 0.7005358934402466, "losses/sft": 1.7313860654830933, "losses/total": 0.7005358934402466, "ref_logps/chosen": -27.16747283935547, "ref_logps/rejected": -39.068580627441406, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6766937375068665, "rewards/margins": 0.3895241618156433, "rewards/rejected": -1.0662178993225098, "step": 730 }, { "epoch": 0.69, "grad_norm": 20.65587043762207, "learning_rate": 4.277719482336481e-07, "logps/chosen": -44.32313537597656, "logps/rejected": -36.8135871887207, "loss": 0.6444, "losses/dpo": 0.5208037495613098, "losses/sft": 1.4644453525543213, "losses/total": 0.5208037495613098, "ref_logps/chosen": -37.35309600830078, "ref_logps/rejected": -26.82744789123535, "rewards/accuracies": 0.625, "rewards/chosen": -0.6970037221908569, "rewards/margins": 0.3016101121902466, "rewards/rejected": -0.9986138343811035, "step": 731 }, { "epoch": 0.69, "grad_norm": 30.098657608032227, "learning_rate": 4.275970619097586e-07, "logps/chosen": -51.476871490478516, "logps/rejected": -47.054290771484375, "loss": 0.8557, "losses/dpo": 1.655027151107788, "losses/sft": 2.198712110519409, "losses/total": 1.655027151107788, "ref_logps/chosen": -40.543113708496094, "ref_logps/rejected": -36.189697265625, "rewards/accuracies": 0.4375, "rewards/chosen": -1.0933759212493896, "rewards/margins": -0.006916806101799011, "rewards/rejected": -1.0864590406417847, "step": 732 }, { "epoch": 0.69, "grad_norm": 18.317766189575195, "learning_rate": 4.274221755858692e-07, "logps/chosen": -50.106510162353516, "logps/rejected": -66.85099792480469, "loss": 0.5015, "losses/dpo": 0.6240159273147583, "losses/sft": 1.5468125343322754, "losses/total": 0.6240159273147583, "ref_logps/chosen": -41.568607330322266, "ref_logps/rejected": -51.991661071777344, "rewards/accuracies": 0.8125, "rewards/chosen": -0.853790283203125, "rewards/margins": 0.6321431398391724, "rewards/rejected": -1.4859334230422974, "step": 733 }, { "epoch": 0.69, "grad_norm": 20.739360809326172, "learning_rate": 4.272472892619797e-07, "logps/chosen": -43.271549224853516, "logps/rejected": -48.183555603027344, "loss": 0.6991, "losses/dpo": 0.6731064915657043, "losses/sft": 1.7340065240859985, "losses/total": 0.6731064915657043, "ref_logps/chosen": -33.46013641357422, "ref_logps/rejected": -35.56556701660156, "rewards/accuracies": 0.5625, "rewards/chosen": -0.981141209602356, "rewards/margins": 0.2806575298309326, "rewards/rejected": -1.261798620223999, "step": 734 }, { "epoch": 0.69, "grad_norm": 22.32449722290039, "learning_rate": 4.270724029380902e-07, "logps/chosen": -43.6602783203125, "logps/rejected": -42.747215270996094, "loss": 0.711, "losses/dpo": 0.34072479605674744, "losses/sft": 1.4519860744476318, "losses/total": 0.34072479605674744, "ref_logps/chosen": -35.907073974609375, "ref_logps/rejected": -32.58649444580078, "rewards/accuracies": 0.75, "rewards/chosen": -0.7753205299377441, "rewards/margins": 0.2407512068748474, "rewards/rejected": -1.0160717964172363, "step": 735 }, { "epoch": 0.69, "grad_norm": 17.203248977661133, "learning_rate": 4.268975166142008e-07, "logps/chosen": -38.31851577758789, "logps/rejected": -52.45063018798828, "loss": 0.5077, "losses/dpo": 0.7386688590049744, "losses/sft": 1.5262503623962402, "losses/total": 0.7386688590049744, "ref_logps/chosen": -30.972883224487305, "ref_logps/rejected": -39.24861526489258, "rewards/accuracies": 0.75, "rewards/chosen": -0.7345629930496216, "rewards/margins": 0.5856384038925171, "rewards/rejected": -1.3202013969421387, "step": 736 }, { "epoch": 0.7, "grad_norm": 20.836872100830078, "learning_rate": 4.267226302903113e-07, "logps/chosen": -40.628726959228516, "logps/rejected": -53.397701263427734, "loss": 0.6443, "losses/dpo": 0.9054415225982666, "losses/sft": 1.6336662769317627, "losses/total": 0.9054415225982666, "ref_logps/chosen": -31.981647491455078, "ref_logps/rejected": -41.35036087036133, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8647080659866333, "rewards/margins": 0.3400261402130127, "rewards/rejected": -1.204734206199646, "step": 737 }, { "epoch": 0.7, "grad_norm": 18.280460357666016, "learning_rate": 4.265477439664218e-07, "logps/chosen": -43.923126220703125, "logps/rejected": -53.445716857910156, "loss": 0.5706, "losses/dpo": 0.5863304138183594, "losses/sft": 0.8763689398765564, "losses/total": 0.5863304138183594, "ref_logps/chosen": -36.32550811767578, "ref_logps/rejected": -42.005767822265625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7597619295120239, "rewards/margins": 0.3842333257198334, "rewards/rejected": -1.1439952850341797, "step": 738 }, { "epoch": 0.7, "grad_norm": 20.237655639648438, "learning_rate": 4.263728576425323e-07, "logps/chosen": -50.909202575683594, "logps/rejected": -49.70928192138672, "loss": 0.6183, "losses/dpo": 0.46446093916893005, "losses/sft": 1.925357699394226, "losses/total": 0.46446093916893005, "ref_logps/chosen": -42.42932891845703, "ref_logps/rejected": -37.705322265625, "rewards/accuracies": 0.625, "rewards/chosen": -0.8479871153831482, "rewards/margins": 0.352408766746521, "rewards/rejected": -1.200395941734314, "step": 739 }, { "epoch": 0.7, "grad_norm": 22.662113189697266, "learning_rate": 4.261979713186429e-07, "logps/chosen": -47.26714324951172, "logps/rejected": -73.65606689453125, "loss": 0.5853, "losses/dpo": 0.528915286064148, "losses/sft": 1.3249739408493042, "losses/total": 0.528915286064148, "ref_logps/chosen": -38.37602996826172, "ref_logps/rejected": -60.28350830078125, "rewards/accuracies": 0.75, "rewards/chosen": -0.8891115188598633, "rewards/margins": 0.4481440782546997, "rewards/rejected": -1.3372557163238525, "step": 740 }, { "epoch": 0.7, "grad_norm": 24.209074020385742, "learning_rate": 4.260230849947534e-07, "logps/chosen": -45.871097564697266, "logps/rejected": -60.66228103637695, "loss": 0.7298, "losses/dpo": 1.0251754522323608, "losses/sft": 2.0724728107452393, "losses/total": 1.0251754522323608, "ref_logps/chosen": -37.298606872558594, "ref_logps/rejected": -50.359161376953125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.857248842716217, "rewards/margins": 0.17306344211101532, "rewards/rejected": -1.0303122997283936, "step": 741 }, { "epoch": 0.7, "grad_norm": 21.85954475402832, "learning_rate": 4.258481986708639e-07, "logps/chosen": -46.614173889160156, "logps/rejected": -60.85839080810547, "loss": 0.6027, "losses/dpo": 0.5607704520225525, "losses/sft": 1.6466827392578125, "losses/total": 0.5607704520225525, "ref_logps/chosen": -39.28166580200195, "ref_logps/rejected": -48.896339416503906, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7332503795623779, "rewards/margins": 0.46295464038848877, "rewards/rejected": -1.1962049007415771, "step": 742 }, { "epoch": 0.7, "grad_norm": 22.585887908935547, "learning_rate": 4.2567331234697447e-07, "logps/chosen": -46.457496643066406, "logps/rejected": -58.06674575805664, "loss": 0.6909, "losses/dpo": 0.5679949522018433, "losses/sft": 1.2433321475982666, "losses/total": 0.5679949522018433, "ref_logps/chosen": -35.988277435302734, "ref_logps/rejected": -45.05171203613281, "rewards/accuracies": 0.6875, "rewards/chosen": -1.046921968460083, "rewards/margins": 0.2545810043811798, "rewards/rejected": -1.3015029430389404, "step": 743 }, { "epoch": 0.7, "grad_norm": 16.260671615600586, "learning_rate": 4.25498426023085e-07, "logps/chosen": -40.18232727050781, "logps/rejected": -54.373538970947266, "loss": 0.456, "losses/dpo": 0.4967708885669708, "losses/sft": 1.9124349355697632, "losses/total": 0.4967708885669708, "ref_logps/chosen": -33.05384063720703, "ref_logps/rejected": -39.20069122314453, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7128486037254333, "rewards/margins": 0.8044363260269165, "rewards/rejected": -1.517284870147705, "step": 744 }, { "epoch": 0.7, "grad_norm": 19.383041381835938, "learning_rate": 4.253235396991955e-07, "logps/chosen": -40.77610778808594, "logps/rejected": -45.19227600097656, "loss": 0.5829, "losses/dpo": 0.581690788269043, "losses/sft": 1.2575770616531372, "losses/total": 0.581690788269043, "ref_logps/chosen": -34.16371536254883, "ref_logps/rejected": -35.671607971191406, "rewards/accuracies": 0.75, "rewards/chosen": -0.6612394452095032, "rewards/margins": 0.290827214717865, "rewards/rejected": -0.9520666599273682, "step": 745 }, { "epoch": 0.7, "grad_norm": 19.118419647216797, "learning_rate": 4.25148653375306e-07, "logps/chosen": -53.51045608520508, "logps/rejected": -64.26415252685547, "loss": 0.4855, "losses/dpo": 0.5189401507377625, "losses/sft": 1.8860023021697998, "losses/total": 0.5189401507377625, "ref_logps/chosen": -45.97468566894531, "ref_logps/rejected": -49.77253723144531, "rewards/accuracies": 0.75, "rewards/chosen": -0.7535770535469055, "rewards/margins": 0.6955845952033997, "rewards/rejected": -1.4491616487503052, "step": 746 }, { "epoch": 0.71, "grad_norm": 19.952722549438477, "learning_rate": 4.249737670514166e-07, "logps/chosen": -42.86393356323242, "logps/rejected": -48.983543395996094, "loss": 0.6327, "losses/dpo": 0.7953437566757202, "losses/sft": 1.6539349555969238, "losses/total": 0.7953437566757202, "ref_logps/chosen": -35.93853759765625, "ref_logps/rejected": -39.69743728637695, "rewards/accuracies": 0.625, "rewards/chosen": -0.6925398111343384, "rewards/margins": 0.23607105016708374, "rewards/rejected": -0.9286109209060669, "step": 747 }, { "epoch": 0.71, "grad_norm": 16.43267059326172, "learning_rate": 4.247988807275271e-07, "logps/chosen": -38.21826171875, "logps/rejected": -61.831634521484375, "loss": 0.4886, "losses/dpo": 0.5306552648544312, "losses/sft": 1.4700191020965576, "losses/total": 0.5306552648544312, "ref_logps/chosen": -31.346384048461914, "ref_logps/rejected": -47.10458755493164, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6871877908706665, "rewards/margins": 0.785516619682312, "rewards/rejected": -1.4727044105529785, "step": 748 }, { "epoch": 0.71, "grad_norm": 18.212879180908203, "learning_rate": 4.246239944036376e-07, "logps/chosen": -46.521263122558594, "logps/rejected": -62.309139251708984, "loss": 0.4978, "losses/dpo": 0.4854315221309662, "losses/sft": 2.568373918533325, "losses/total": 0.4854315221309662, "ref_logps/chosen": -37.42906188964844, "ref_logps/rejected": -47.113624572753906, "rewards/accuracies": 0.75, "rewards/chosen": -0.9092199206352234, "rewards/margins": 0.6103312969207764, "rewards/rejected": -1.5195512771606445, "step": 749 }, { "epoch": 0.71, "grad_norm": 20.696321487426758, "learning_rate": 4.2444910807974816e-07, "logps/chosen": -41.21089172363281, "logps/rejected": -45.72018051147461, "loss": 0.6021, "losses/dpo": 0.6415588855743408, "losses/sft": 1.5518860816955566, "losses/total": 0.6415588855743408, "ref_logps/chosen": -34.7630615234375, "ref_logps/rejected": -35.60466766357422, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6447831392288208, "rewards/margins": 0.36676809191703796, "rewards/rejected": -1.0115511417388916, "step": 750 }, { "epoch": 0.71, "grad_norm": 19.7783260345459, "learning_rate": 4.242742217558587e-07, "logps/chosen": -45.709781646728516, "logps/rejected": -51.80164337158203, "loss": 0.5186, "losses/dpo": 0.3710705637931824, "losses/sft": 1.8136144876480103, "losses/total": 0.3710705637931824, "ref_logps/chosen": -36.29664993286133, "ref_logps/rejected": -35.79341506958008, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9413133263587952, "rewards/margins": 0.6595095992088318, "rewards/rejected": -1.600822925567627, "step": 751 }, { "epoch": 0.71, "grad_norm": 18.367704391479492, "learning_rate": 4.240993354319692e-07, "logps/chosen": -47.46202087402344, "logps/rejected": -72.6573715209961, "loss": 0.3996, "losses/dpo": 0.2256067395210266, "losses/sft": 1.6897969245910645, "losses/total": 0.2256067395210266, "ref_logps/chosen": -39.82744216918945, "ref_logps/rejected": -55.34682083129883, "rewards/accuracies": 0.875, "rewards/chosen": -0.7634580135345459, "rewards/margins": 0.9675969481468201, "rewards/rejected": -1.7310549020767212, "step": 752 }, { "epoch": 0.71, "grad_norm": 19.741113662719727, "learning_rate": 4.2392444910807976e-07, "logps/chosen": -36.69940948486328, "logps/rejected": -46.498069763183594, "loss": 0.5511, "losses/dpo": 0.2733551859855652, "losses/sft": 0.9542773365974426, "losses/total": 0.2733551859855652, "ref_logps/chosen": -28.31500244140625, "ref_logps/rejected": -32.472503662109375, "rewards/accuracies": 0.75, "rewards/chosen": -0.8384406566619873, "rewards/margins": 0.5641160011291504, "rewards/rejected": -1.4025567770004272, "step": 753 }, { "epoch": 0.71, "grad_norm": 19.360973358154297, "learning_rate": 4.2374956278419027e-07, "logps/chosen": -50.95995330810547, "logps/rejected": -62.476322174072266, "loss": 0.5458, "losses/dpo": 0.3861052095890045, "losses/sft": 1.8141791820526123, "losses/total": 0.3861052095890045, "ref_logps/chosen": -42.563636779785156, "ref_logps/rejected": -48.6370849609375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8396315574645996, "rewards/margins": 0.5442919731140137, "rewards/rejected": -1.3839235305786133, "step": 754 }, { "epoch": 0.71, "grad_norm": 23.597888946533203, "learning_rate": 4.235746764603008e-07, "logps/chosen": -54.756996154785156, "logps/rejected": -68.79257202148438, "loss": 0.7102, "losses/dpo": 0.8296176195144653, "losses/sft": 1.999776005744934, "losses/total": 0.8296176195144653, "ref_logps/chosen": -44.24373245239258, "ref_logps/rejected": -54.635887145996094, "rewards/accuracies": 0.625, "rewards/chosen": -1.0513266324996948, "rewards/margins": 0.36434170603752136, "rewards/rejected": -1.415668249130249, "step": 755 }, { "epoch": 0.71, "grad_norm": 22.130552291870117, "learning_rate": 4.233997901364113e-07, "logps/chosen": -42.543399810791016, "logps/rejected": -53.249122619628906, "loss": 0.6306, "losses/dpo": 0.9714921712875366, "losses/sft": 2.300448179244995, "losses/total": 0.9714921712875366, "ref_logps/chosen": -34.651344299316406, "ref_logps/rejected": -42.10847473144531, "rewards/accuracies": 0.625, "rewards/chosen": -0.7892056703567505, "rewards/margins": 0.3248593211174011, "rewards/rejected": -1.1140649318695068, "step": 756 }, { "epoch": 0.71, "grad_norm": 20.522016525268555, "learning_rate": 4.2322490381252186e-07, "logps/chosen": -51.109954833984375, "logps/rejected": -72.20364379882812, "loss": 0.5195, "losses/dpo": 0.4485291540622711, "losses/sft": 1.6937845945358276, "losses/total": 0.4485291540622711, "ref_logps/chosen": -41.51011276245117, "ref_logps/rejected": -55.568756103515625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9599847197532654, "rewards/margins": 0.7035040855407715, "rewards/rejected": -1.6634888648986816, "step": 757 }, { "epoch": 0.72, "grad_norm": 20.583127975463867, "learning_rate": 4.2305001748863237e-07, "logps/chosen": -42.421775817871094, "logps/rejected": -63.52996826171875, "loss": 0.5248, "losses/dpo": 0.7386205196380615, "losses/sft": 1.3855749368667603, "losses/total": 0.7386205196380615, "ref_logps/chosen": -34.836395263671875, "ref_logps/rejected": -48.709537506103516, "rewards/accuracies": 0.75, "rewards/chosen": -0.758538007736206, "rewards/margins": 0.723504900932312, "rewards/rejected": -1.4820430278778076, "step": 758 }, { "epoch": 0.72, "grad_norm": 20.102645874023438, "learning_rate": 4.228751311647429e-07, "logps/chosen": -49.17989730834961, "logps/rejected": -57.92820358276367, "loss": 0.6536, "losses/dpo": 0.5786886811256409, "losses/sft": 1.9223612546920776, "losses/total": 0.5786886811256409, "ref_logps/chosen": -37.882286071777344, "ref_logps/rejected": -44.04322814941406, "rewards/accuracies": 0.6875, "rewards/chosen": -1.129760980606079, "rewards/margins": 0.2587363123893738, "rewards/rejected": -1.3884973526000977, "step": 759 }, { "epoch": 0.72, "grad_norm": 21.211217880249023, "learning_rate": 4.2270024484085345e-07, "logps/chosen": -54.49882507324219, "logps/rejected": -66.64996337890625, "loss": 0.5075, "losses/dpo": 0.6029857397079468, "losses/sft": 1.9275867938995361, "losses/total": 0.6029857397079468, "ref_logps/chosen": -43.71864318847656, "ref_logps/rejected": -48.92106628417969, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0780181884765625, "rewards/margins": 0.6948716640472412, "rewards/rejected": -1.7728897333145142, "step": 760 }, { "epoch": 0.72, "grad_norm": 21.209583282470703, "learning_rate": 4.2252535851696396e-07, "logps/chosen": -43.100486755371094, "logps/rejected": -46.91571807861328, "loss": 0.6642, "losses/dpo": 0.7757740616798401, "losses/sft": 1.8444371223449707, "losses/total": 0.7757740616798401, "ref_logps/chosen": -32.9022102355957, "ref_logps/rejected": -34.38334655761719, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0198278427124023, "rewards/margins": 0.23340973258018494, "rewards/rejected": -1.2532374858856201, "step": 761 }, { "epoch": 0.72, "grad_norm": 24.357664108276367, "learning_rate": 4.223504721930745e-07, "logps/chosen": -45.410179138183594, "logps/rejected": -46.113075256347656, "loss": 0.8068, "losses/dpo": 0.6233912706375122, "losses/sft": 1.0533487796783447, "losses/total": 0.6233912706375122, "ref_logps/chosen": -36.20232009887695, "ref_logps/rejected": -34.531959533691406, "rewards/accuracies": 0.5625, "rewards/chosen": -0.920785665512085, "rewards/margins": 0.2373262494802475, "rewards/rejected": -1.158111810684204, "step": 762 }, { "epoch": 0.72, "grad_norm": 15.830036163330078, "learning_rate": 4.22175585869185e-07, "logps/chosen": -42.101783752441406, "logps/rejected": -67.05045318603516, "loss": 0.3771, "losses/dpo": 0.4679235816001892, "losses/sft": 1.8049534559249878, "losses/total": 0.4679235816001892, "ref_logps/chosen": -36.04362487792969, "ref_logps/rejected": -50.57368469238281, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6058157682418823, "rewards/margins": 1.0418604612350464, "rewards/rejected": -1.6476762294769287, "step": 763 }, { "epoch": 0.72, "grad_norm": 20.92279052734375, "learning_rate": 4.2200069954529555e-07, "logps/chosen": -47.54430389404297, "logps/rejected": -56.36003875732422, "loss": 0.5923, "losses/dpo": 0.5802595615386963, "losses/sft": 1.6230095624923706, "losses/total": 0.5802595615386963, "ref_logps/chosen": -37.64332580566406, "ref_logps/rejected": -41.81571960449219, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9900981187820435, "rewards/margins": 0.464333713054657, "rewards/rejected": -1.4544317722320557, "step": 764 }, { "epoch": 0.72, "grad_norm": 23.231075286865234, "learning_rate": 4.2182581322140607e-07, "logps/chosen": -62.5670280456543, "logps/rejected": -71.05088806152344, "loss": 0.5894, "losses/dpo": 0.4304642975330353, "losses/sft": 1.9444787502288818, "losses/total": 0.4304642975330353, "ref_logps/chosen": -52.799781799316406, "ref_logps/rejected": -54.82190704345703, "rewards/accuracies": 0.625, "rewards/chosen": -0.9767248630523682, "rewards/margins": 0.6461727619171143, "rewards/rejected": -1.6228976249694824, "step": 765 }, { "epoch": 0.72, "grad_norm": 19.713966369628906, "learning_rate": 4.216509268975166e-07, "logps/chosen": -54.23445510864258, "logps/rejected": -56.89936447143555, "loss": 0.6581, "losses/dpo": 0.9080683588981628, "losses/sft": 1.9636963605880737, "losses/total": 0.9080683588981628, "ref_logps/chosen": -43.35801696777344, "ref_logps/rejected": -42.41692352294922, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0876442193984985, "rewards/margins": 0.3606003522872925, "rewards/rejected": -1.448244571685791, "step": 766 }, { "epoch": 0.72, "grad_norm": 17.78038215637207, "learning_rate": 4.2147604057362714e-07, "logps/chosen": -49.073158264160156, "logps/rejected": -52.753692626953125, "loss": 0.641, "losses/dpo": 0.5669728517532349, "losses/sft": 2.2225940227508545, "losses/total": 0.5669728517532349, "ref_logps/chosen": -38.89840316772461, "ref_logps/rejected": -40.61378479003906, "rewards/accuracies": 0.5, "rewards/chosen": -1.0174754858016968, "rewards/margins": 0.19651541113853455, "rewards/rejected": -1.2139909267425537, "step": 767 }, { "epoch": 0.73, "grad_norm": 20.39261245727539, "learning_rate": 4.2130115424973766e-07, "logps/chosen": -39.9666633605957, "logps/rejected": -42.861839294433594, "loss": 0.7395, "losses/dpo": 0.5248615145683289, "losses/sft": 1.6448071002960205, "losses/total": 0.5248615145683289, "ref_logps/chosen": -32.12987518310547, "ref_logps/rejected": -33.25669479370117, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7836792469024658, "rewards/margins": 0.17683541774749756, "rewards/rejected": -0.9605146646499634, "step": 768 }, { "epoch": 0.73, "grad_norm": 17.96187400817871, "learning_rate": 4.2112626792584817e-07, "logps/chosen": -47.99996566772461, "logps/rejected": -54.542686462402344, "loss": 0.5555, "losses/dpo": 0.438449889421463, "losses/sft": 1.5923808813095093, "losses/total": 0.438449889421463, "ref_logps/chosen": -40.29195022583008, "ref_logps/rejected": -42.82405090332031, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7708016633987427, "rewards/margins": 0.4010623097419739, "rewards/rejected": -1.1718639135360718, "step": 769 }, { "epoch": 0.73, "grad_norm": 25.057157516479492, "learning_rate": 4.209513816019587e-07, "logps/chosen": -61.90786361694336, "logps/rejected": -66.06634521484375, "loss": 0.6793, "losses/dpo": 0.339076966047287, "losses/sft": 2.2863316535949707, "losses/total": 0.339076966047287, "ref_logps/chosen": -49.08837127685547, "ref_logps/rejected": -48.471160888671875, "rewards/accuracies": 0.625, "rewards/chosen": -1.2819492816925049, "rewards/margins": 0.4775692820549011, "rewards/rejected": -1.7595185041427612, "step": 770 }, { "epoch": 0.73, "grad_norm": 20.30738067626953, "learning_rate": 4.2077649527806925e-07, "logps/chosen": -43.031578063964844, "logps/rejected": -49.231285095214844, "loss": 0.6316, "losses/dpo": 0.485808789730072, "losses/sft": 1.4857324361801147, "losses/total": 0.485808789730072, "ref_logps/chosen": -33.978431701660156, "ref_logps/rejected": -36.98942565917969, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9053147435188293, "rewards/margins": 0.31887149810791016, "rewards/rejected": -1.2241861820220947, "step": 771 }, { "epoch": 0.73, "grad_norm": 18.805618286132812, "learning_rate": 4.206016089541798e-07, "logps/chosen": -50.753421783447266, "logps/rejected": -58.30438995361328, "loss": 0.4742, "losses/dpo": 0.5090314149856567, "losses/sft": 1.7893503904342651, "losses/total": 0.5090314149856567, "ref_logps/chosen": -41.46656799316406, "ref_logps/rejected": -40.94102478027344, "rewards/accuracies": 0.75, "rewards/chosen": -0.9286853671073914, "rewards/margins": 0.8076509237289429, "rewards/rejected": -1.736336350440979, "step": 772 }, { "epoch": 0.73, "grad_norm": 17.45510482788086, "learning_rate": 4.2042672263029027e-07, "logps/chosen": -43.150169372558594, "logps/rejected": -58.464111328125, "loss": 0.5258, "losses/dpo": 0.4015842080116272, "losses/sft": 1.6756699085235596, "losses/total": 0.4015842080116272, "ref_logps/chosen": -35.26593780517578, "ref_logps/rejected": -43.1739387512207, "rewards/accuracies": 0.625, "rewards/chosen": -0.7884229421615601, "rewards/margins": 0.7405939102172852, "rewards/rejected": -1.5290168523788452, "step": 773 }, { "epoch": 0.73, "grad_norm": 22.22146987915039, "learning_rate": 4.2025183630640084e-07, "logps/chosen": -50.417083740234375, "logps/rejected": -69.91157531738281, "loss": 0.6617, "losses/dpo": 0.40283524990081787, "losses/sft": 1.894343614578247, "losses/total": 0.40283524990081787, "ref_logps/chosen": -39.54027557373047, "ref_logps/rejected": -55.56966781616211, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0876809358596802, "rewards/margins": 0.3465101718902588, "rewards/rejected": -1.4341912269592285, "step": 774 }, { "epoch": 0.73, "grad_norm": 22.246706008911133, "learning_rate": 4.2007694998251135e-07, "logps/chosen": -53.201717376708984, "logps/rejected": -61.34033966064453, "loss": 0.5247, "losses/dpo": 0.20747162401676178, "losses/sft": 1.3337289094924927, "losses/total": 0.20747162401676178, "ref_logps/chosen": -42.53508377075195, "ref_logps/rejected": -44.6602783203125, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0666635036468506, "rewards/margins": 0.6013425588607788, "rewards/rejected": -1.668006181716919, "step": 775 }, { "epoch": 0.73, "grad_norm": 24.267658233642578, "learning_rate": 4.1990206365862186e-07, "logps/chosen": -58.03358459472656, "logps/rejected": -55.07177734375, "loss": 0.6796, "losses/dpo": 0.9036049842834473, "losses/sft": 2.1022844314575195, "losses/total": 0.9036049842834473, "ref_logps/chosen": -47.973228454589844, "ref_logps/rejected": -42.999908447265625, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0060359239578247, "rewards/margins": 0.20115122199058533, "rewards/rejected": -1.2071871757507324, "step": 776 }, { "epoch": 0.73, "grad_norm": 21.436649322509766, "learning_rate": 4.197271773347324e-07, "logps/chosen": -53.396942138671875, "logps/rejected": -56.93098449707031, "loss": 0.5242, "losses/dpo": 0.43892043828964233, "losses/sft": 1.739390254020691, "losses/total": 0.43892043828964233, "ref_logps/chosen": -45.15161895751953, "ref_logps/rejected": -42.66443634033203, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8245325088500977, "rewards/margins": 0.6021220684051514, "rewards/rejected": -1.426654577255249, "step": 777 }, { "epoch": 0.73, "grad_norm": 19.981088638305664, "learning_rate": 4.1955229101084294e-07, "logps/chosen": -48.69720458984375, "logps/rejected": -52.137699127197266, "loss": 0.5863, "losses/dpo": 0.7162208557128906, "losses/sft": 1.709145426750183, "losses/total": 0.7162208557128906, "ref_logps/chosen": -36.80187225341797, "ref_logps/rejected": -36.985755920410156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.18953275680542, "rewards/margins": 0.3256617784500122, "rewards/rejected": -1.5151946544647217, "step": 778 }, { "epoch": 0.74, "grad_norm": 20.368045806884766, "learning_rate": 4.193774046869535e-07, "logps/chosen": -45.996665954589844, "logps/rejected": -59.24977493286133, "loss": 0.525, "losses/dpo": 0.7196766138076782, "losses/sft": 2.0520544052124023, "losses/total": 0.7196766138076782, "ref_logps/chosen": -36.62474060058594, "ref_logps/rejected": -44.03826141357422, "rewards/accuracies": 0.75, "rewards/chosen": -0.9371929168701172, "rewards/margins": 0.5839586853981018, "rewards/rejected": -1.5211515426635742, "step": 779 }, { "epoch": 0.74, "grad_norm": 17.651996612548828, "learning_rate": 4.1920251836306397e-07, "logps/chosen": -37.72838592529297, "logps/rejected": -61.68711853027344, "loss": 0.4566, "losses/dpo": 0.3346171975135803, "losses/sft": 1.868023157119751, "losses/total": 0.3346171975135803, "ref_logps/chosen": -30.137733459472656, "ref_logps/rejected": -46.17749786376953, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7590653896331787, "rewards/margins": 0.7918962240219116, "rewards/rejected": -1.5509614944458008, "step": 780 }, { "epoch": 0.74, "grad_norm": 25.09738540649414, "learning_rate": 4.1902763203917453e-07, "logps/chosen": -57.371253967285156, "logps/rejected": -47.14936828613281, "loss": 0.8164, "losses/dpo": 0.9405972957611084, "losses/sft": 1.6142300367355347, "losses/total": 0.9405972957611084, "ref_logps/chosen": -46.65610885620117, "ref_logps/rejected": -37.34865188598633, "rewards/accuracies": 0.375, "rewards/chosen": -1.0715144872665405, "rewards/margins": -0.0914430171251297, "rewards/rejected": -0.980071485042572, "step": 781 }, { "epoch": 0.74, "grad_norm": 17.679214477539062, "learning_rate": 4.1885274571528505e-07, "logps/chosen": -33.392459869384766, "logps/rejected": -55.045135498046875, "loss": 0.6448, "losses/dpo": 0.8382935523986816, "losses/sft": 1.943509578704834, "losses/total": 0.8382935523986816, "ref_logps/chosen": -24.722564697265625, "ref_logps/rejected": -42.780574798583984, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8669896125793457, "rewards/margins": 0.3594665229320526, "rewards/rejected": -1.2264561653137207, "step": 782 }, { "epoch": 0.74, "grad_norm": 17.070148468017578, "learning_rate": 4.1867785939139556e-07, "logps/chosen": -47.054222106933594, "logps/rejected": -59.38862991333008, "loss": 0.4827, "losses/dpo": 0.4504500925540924, "losses/sft": 1.7148276567459106, "losses/total": 0.4504500925540924, "ref_logps/chosen": -40.27749252319336, "ref_logps/rejected": -46.283416748046875, "rewards/accuracies": 0.75, "rewards/chosen": -0.6776725053787231, "rewards/margins": 0.6328486204147339, "rewards/rejected": -1.310521125793457, "step": 783 }, { "epoch": 0.74, "grad_norm": 23.34598731994629, "learning_rate": 4.1850297306750607e-07, "logps/chosen": -51.39959716796875, "logps/rejected": -54.16435241699219, "loss": 0.6386, "losses/dpo": 0.7482547760009766, "losses/sft": 1.876220703125, "losses/total": 0.7482547760009766, "ref_logps/chosen": -39.918174743652344, "ref_logps/rejected": -39.992759704589844, "rewards/accuracies": 0.625, "rewards/chosen": -1.1481423377990723, "rewards/margins": 0.2690170705318451, "rewards/rejected": -1.4171594381332397, "step": 784 }, { "epoch": 0.74, "grad_norm": 27.0095272064209, "learning_rate": 4.1832808674361664e-07, "logps/chosen": -59.550010681152344, "logps/rejected": -67.86109924316406, "loss": 0.7146, "losses/dpo": 0.5503778457641602, "losses/sft": 1.526914119720459, "losses/total": 0.5503778457641602, "ref_logps/chosen": -47.91358184814453, "ref_logps/rejected": -52.84379577636719, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1636431217193604, "rewards/margins": 0.33808720111846924, "rewards/rejected": -1.5017304420471191, "step": 785 }, { "epoch": 0.74, "grad_norm": 23.250091552734375, "learning_rate": 4.181532004197272e-07, "logps/chosen": -61.68149185180664, "logps/rejected": -84.31434631347656, "loss": 0.6161, "losses/dpo": 0.48296403884887695, "losses/sft": 2.0142619609832764, "losses/total": 0.48296403884887695, "ref_logps/chosen": -49.42213821411133, "ref_logps/rejected": -66.10447692871094, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2259352207183838, "rewards/margins": 0.5950525403022766, "rewards/rejected": -1.8209878206253052, "step": 786 }, { "epoch": 0.74, "grad_norm": 20.944580078125, "learning_rate": 4.1797831409583766e-07, "logps/chosen": -47.476417541503906, "logps/rejected": -53.08404541015625, "loss": 0.5704, "losses/dpo": 0.9001350402832031, "losses/sft": 1.9318976402282715, "losses/total": 0.9001350402832031, "ref_logps/chosen": -39.116355895996094, "ref_logps/rejected": -39.44659423828125, "rewards/accuracies": 0.75, "rewards/chosen": -0.8360065221786499, "rewards/margins": 0.5277382135391235, "rewards/rejected": -1.3637447357177734, "step": 787 }, { "epoch": 0.74, "grad_norm": 16.427867889404297, "learning_rate": 4.1780342777194823e-07, "logps/chosen": -49.147247314453125, "logps/rejected": -68.76141357421875, "loss": 0.4217, "losses/dpo": 0.43051594495773315, "losses/sft": 1.7895997762680054, "losses/total": 0.43051594495773315, "ref_logps/chosen": -40.01856231689453, "ref_logps/rejected": -49.69984436035156, "rewards/accuracies": 0.875, "rewards/chosen": -0.912868082523346, "rewards/margins": 0.9932891130447388, "rewards/rejected": -1.9061572551727295, "step": 788 }, { "epoch": 0.75, "grad_norm": 19.30293083190918, "learning_rate": 4.1762854144805874e-07, "logps/chosen": -38.080177307128906, "logps/rejected": -54.69189453125, "loss": 0.5678, "losses/dpo": 0.9089207649230957, "losses/sft": 2.139173746109009, "losses/total": 0.9089207649230957, "ref_logps/chosen": -30.37895965576172, "ref_logps/rejected": -39.110130310058594, "rewards/accuracies": 0.75, "rewards/chosen": -0.7701219320297241, "rewards/margins": 0.7880542278289795, "rewards/rejected": -1.5581762790679932, "step": 789 }, { "epoch": 0.75, "grad_norm": 19.495140075683594, "learning_rate": 4.1745365512416925e-07, "logps/chosen": -46.094276428222656, "logps/rejected": -57.03180694580078, "loss": 0.579, "losses/dpo": 0.9135038256645203, "losses/sft": 2.1158804893493652, "losses/total": 0.9135038256645203, "ref_logps/chosen": -36.45563507080078, "ref_logps/rejected": -41.92792510986328, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9638640284538269, "rewards/margins": 0.5465238690376282, "rewards/rejected": -1.510387897491455, "step": 790 }, { "epoch": 0.75, "grad_norm": 21.350933074951172, "learning_rate": 4.1727876880027977e-07, "logps/chosen": -46.796260833740234, "logps/rejected": -60.494384765625, "loss": 0.6413, "losses/dpo": 0.6952491998672485, "losses/sft": 2.5563645362854004, "losses/total": 0.6952491998672485, "ref_logps/chosen": -33.72479248046875, "ref_logps/rejected": -45.16609573364258, "rewards/accuracies": 0.6875, "rewards/chosen": -1.307146430015564, "rewards/margins": 0.22568242251873016, "rewards/rejected": -1.5328289270401, "step": 791 }, { "epoch": 0.75, "grad_norm": 20.2003173828125, "learning_rate": 4.1710388247639033e-07, "logps/chosen": -40.17908477783203, "logps/rejected": -55.57908630371094, "loss": 0.5794, "losses/dpo": 0.5370991230010986, "losses/sft": 1.3463118076324463, "losses/total": 0.5370991230010986, "ref_logps/chosen": -31.964265823364258, "ref_logps/rejected": -42.87556838989258, "rewards/accuracies": 0.6875, "rewards/chosen": -0.821482241153717, "rewards/margins": 0.4488694667816162, "rewards/rejected": -1.2703516483306885, "step": 792 }, { "epoch": 0.75, "grad_norm": 27.169261932373047, "learning_rate": 4.169289961525009e-07, "logps/chosen": -55.7948112487793, "logps/rejected": -55.48624038696289, "loss": 0.7654, "losses/dpo": 0.9784973859786987, "losses/sft": 2.267885208129883, "losses/total": 0.9784973859786987, "ref_logps/chosen": -42.779937744140625, "ref_logps/rejected": -40.891395568847656, "rewards/accuracies": 0.4375, "rewards/chosen": -1.301487684249878, "rewards/margins": 0.1579965054988861, "rewards/rejected": -1.4594842195510864, "step": 793 }, { "epoch": 0.75, "grad_norm": 15.89783763885498, "learning_rate": 4.1675410982861136e-07, "logps/chosen": -37.41274642944336, "logps/rejected": -58.109989166259766, "loss": 0.488, "losses/dpo": 0.4054686427116394, "losses/sft": 1.6878503561019897, "losses/total": 0.4054686427116394, "ref_logps/chosen": -28.857276916503906, "ref_logps/rejected": -44.28254699707031, "rewards/accuracies": 1.0, "rewards/chosen": -0.8555470108985901, "rewards/margins": 0.5271970629692078, "rewards/rejected": -1.3827440738677979, "step": 794 }, { "epoch": 0.75, "grad_norm": 20.05536651611328, "learning_rate": 4.165792235047219e-07, "logps/chosen": -38.05184555053711, "logps/rejected": -51.57506561279297, "loss": 0.5269, "losses/dpo": 0.6790121793746948, "losses/sft": 1.5501598119735718, "losses/total": 0.6790121793746948, "ref_logps/chosen": -27.786029815673828, "ref_logps/rejected": -36.00325012207031, "rewards/accuracies": 0.75, "rewards/chosen": -1.0265814065933228, "rewards/margins": 0.5306002497673035, "rewards/rejected": -1.557181715965271, "step": 795 }, { "epoch": 0.75, "grad_norm": 22.130945205688477, "learning_rate": 4.1640433718083243e-07, "logps/chosen": -54.87089538574219, "logps/rejected": -56.082725524902344, "loss": 0.6803, "losses/dpo": 0.7415988445281982, "losses/sft": 1.8232758045196533, "losses/total": 0.7415988445281982, "ref_logps/chosen": -42.646095275878906, "ref_logps/rejected": -41.11537170410156, "rewards/accuracies": 0.5, "rewards/chosen": -1.2224805355072021, "rewards/margins": 0.27425479888916016, "rewards/rejected": -1.4967353343963623, "step": 796 }, { "epoch": 0.75, "grad_norm": 22.653902053833008, "learning_rate": 4.1622945085694295e-07, "logps/chosen": -44.57743835449219, "logps/rejected": -57.335716247558594, "loss": 0.731, "losses/dpo": 0.5241410732269287, "losses/sft": 1.8190603256225586, "losses/total": 0.5241410732269287, "ref_logps/chosen": -33.62150192260742, "ref_logps/rejected": -45.14116668701172, "rewards/accuracies": 0.5, "rewards/chosen": -1.0955939292907715, "rewards/margins": 0.12386080622673035, "rewards/rejected": -1.2194546461105347, "step": 797 }, { "epoch": 0.75, "grad_norm": 17.54418182373047, "learning_rate": 4.160545645330535e-07, "logps/chosen": -45.42450714111328, "logps/rejected": -48.62165832519531, "loss": 0.5393, "losses/dpo": 0.43921250104904175, "losses/sft": 1.1786720752716064, "losses/total": 0.43921250104904175, "ref_logps/chosen": -37.135379791259766, "ref_logps/rejected": -35.45717239379883, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8289127349853516, "rewards/margins": 0.4875360131263733, "rewards/rejected": -1.31644868850708, "step": 798 }, { "epoch": 0.75, "grad_norm": 17.97772979736328, "learning_rate": 4.15879678209164e-07, "logps/chosen": -44.88686752319336, "logps/rejected": -54.74813461303711, "loss": 0.5674, "losses/dpo": 0.5229305624961853, "losses/sft": 1.2484173774719238, "losses/total": 0.5229305624961853, "ref_logps/chosen": -35.25, "ref_logps/rejected": -40.10423278808594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9636870622634888, "rewards/margins": 0.5007030367851257, "rewards/rejected": -1.4643900394439697, "step": 799 }, { "epoch": 0.76, "grad_norm": 13.845191955566406, "learning_rate": 4.157047918852746e-07, "logps/chosen": -39.998382568359375, "logps/rejected": -43.76007080078125, "loss": 0.4603, "losses/dpo": 0.4537975788116455, "losses/sft": 1.456099510192871, "losses/total": 0.4537975788116455, "ref_logps/chosen": -33.46385955810547, "ref_logps/rejected": -30.600872039794922, "rewards/accuracies": 0.875, "rewards/chosen": -0.6534522771835327, "rewards/margins": 0.6624678373336792, "rewards/rejected": -1.315920114517212, "step": 800 }, { "epoch": 0.76, "grad_norm": 24.161046981811523, "learning_rate": 4.1552990556138505e-07, "logps/chosen": -56.78883361816406, "logps/rejected": -69.69316101074219, "loss": 0.7032, "losses/dpo": 0.7013365030288696, "losses/sft": 1.7608017921447754, "losses/total": 0.7013365030288696, "ref_logps/chosen": -44.61452102661133, "ref_logps/rejected": -54.76441192626953, "rewards/accuracies": 0.5, "rewards/chosen": -1.2174313068389893, "rewards/margins": 0.2754439413547516, "rewards/rejected": -1.4928752183914185, "step": 801 }, { "epoch": 0.76, "grad_norm": 16.207292556762695, "learning_rate": 4.153550192374956e-07, "logps/chosen": -43.64310836791992, "logps/rejected": -55.11307907104492, "loss": 0.5325, "losses/dpo": 0.5881010293960571, "losses/sft": 1.5619310140609741, "losses/total": 0.5881010293960571, "ref_logps/chosen": -34.51484680175781, "ref_logps/rejected": -40.832275390625, "rewards/accuracies": 0.75, "rewards/chosen": -0.9128257036209106, "rewards/margins": 0.5152546167373657, "rewards/rejected": -1.4280803203582764, "step": 802 }, { "epoch": 0.76, "grad_norm": 21.621896743774414, "learning_rate": 4.1518013291360613e-07, "logps/chosen": -49.106651306152344, "logps/rejected": -57.2791633605957, "loss": 0.6013, "losses/dpo": 0.4520745575428009, "losses/sft": 2.139284133911133, "losses/total": 0.4520745575428009, "ref_logps/chosen": -38.272666931152344, "ref_logps/rejected": -43.19129180908203, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0833981037139893, "rewards/margins": 0.3253892660140991, "rewards/rejected": -1.408787488937378, "step": 803 }, { "epoch": 0.76, "grad_norm": 19.893678665161133, "learning_rate": 4.1500524658971664e-07, "logps/chosen": -44.95887756347656, "logps/rejected": -49.64577102661133, "loss": 0.6321, "losses/dpo": 0.5761253833770752, "losses/sft": 1.3944107294082642, "losses/total": 0.5761253833770752, "ref_logps/chosen": -34.86906433105469, "ref_logps/rejected": -36.0997314453125, "rewards/accuracies": 0.75, "rewards/chosen": -1.008981466293335, "rewards/margins": 0.3456220030784607, "rewards/rejected": -1.3546034097671509, "step": 804 }, { "epoch": 0.76, "grad_norm": 20.27569007873535, "learning_rate": 4.148303602658272e-07, "logps/chosen": -47.399662017822266, "logps/rejected": -50.15160369873047, "loss": 0.5316, "losses/dpo": 0.30595090985298157, "losses/sft": 1.7690634727478027, "losses/total": 0.30595090985298157, "ref_logps/chosen": -36.06136703491211, "ref_logps/rejected": -31.95119857788086, "rewards/accuracies": 0.75, "rewards/chosen": -1.1338295936584473, "rewards/margins": 0.6862108707427979, "rewards/rejected": -1.8200404644012451, "step": 805 }, { "epoch": 0.76, "grad_norm": 17.346487045288086, "learning_rate": 4.146554739419377e-07, "logps/chosen": -46.592437744140625, "logps/rejected": -59.75151443481445, "loss": 0.5321, "losses/dpo": 0.8430801630020142, "losses/sft": 2.247488021850586, "losses/total": 0.8430801630020142, "ref_logps/chosen": -33.222312927246094, "ref_logps/rejected": -38.73173904418945, "rewards/accuracies": 0.875, "rewards/chosen": -1.3370126485824585, "rewards/margins": 0.7649650573730469, "rewards/rejected": -2.1019773483276367, "step": 806 }, { "epoch": 0.76, "grad_norm": 19.774389266967773, "learning_rate": 4.144805876180483e-07, "logps/chosen": -46.54816436767578, "logps/rejected": -63.87855529785156, "loss": 0.462, "losses/dpo": 0.3038465976715088, "losses/sft": 1.6905699968338013, "losses/total": 0.3038465976715088, "ref_logps/chosen": -36.38085174560547, "ref_logps/rejected": -45.94116973876953, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0167306661605835, "rewards/margins": 0.7770076394081116, "rewards/rejected": -1.7937383651733398, "step": 807 }, { "epoch": 0.76, "grad_norm": 16.16718292236328, "learning_rate": 4.1430570129415875e-07, "logps/chosen": -45.99098587036133, "logps/rejected": -57.12336730957031, "loss": 0.4629, "losses/dpo": 0.32424142956733704, "losses/sft": 1.6986743211746216, "losses/total": 0.32424142956733704, "ref_logps/chosen": -38.268798828125, "ref_logps/rejected": -42.035465240478516, "rewards/accuracies": 0.875, "rewards/chosen": -0.7722187042236328, "rewards/margins": 0.7365716695785522, "rewards/rejected": -1.5087902545928955, "step": 808 }, { "epoch": 0.76, "grad_norm": 13.154738426208496, "learning_rate": 4.141308149702693e-07, "logps/chosen": -46.79335403442383, "logps/rejected": -64.21115112304688, "loss": 0.3422, "losses/dpo": 0.4130054712295532, "losses/sft": 1.7548103332519531, "losses/total": 0.4130054712295532, "ref_logps/chosen": -39.620872497558594, "ref_logps/rejected": -44.596710205078125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7172482013702393, "rewards/margins": 1.2441960573196411, "rewards/rejected": -1.96144437789917, "step": 809 }, { "epoch": 0.76, "grad_norm": 16.363344192504883, "learning_rate": 4.139559286463798e-07, "logps/chosen": -56.21271514892578, "logps/rejected": -84.21590423583984, "loss": 0.3861, "losses/dpo": 0.09063832461833954, "losses/sft": 1.6268547773361206, "losses/total": 0.09063832461833954, "ref_logps/chosen": -45.975486755371094, "ref_logps/rejected": -62.186920166015625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.023722767829895, "rewards/margins": 1.1791757345199585, "rewards/rejected": -2.2028985023498535, "step": 810 }, { "epoch": 0.77, "grad_norm": 18.855722427368164, "learning_rate": 4.1378104232249034e-07, "logps/chosen": -46.534881591796875, "logps/rejected": -64.19956970214844, "loss": 0.5512, "losses/dpo": 0.8763961791992188, "losses/sft": 2.28117036819458, "losses/total": 0.8763961791992188, "ref_logps/chosen": -36.1524658203125, "ref_logps/rejected": -48.884525299072266, "rewards/accuracies": 0.75, "rewards/chosen": -1.0382417440414429, "rewards/margins": 0.4932631850242615, "rewards/rejected": -1.5315048694610596, "step": 811 }, { "epoch": 0.77, "grad_norm": 17.897693634033203, "learning_rate": 4.136061559986009e-07, "logps/chosen": -51.246246337890625, "logps/rejected": -59.971458435058594, "loss": 0.4539, "losses/dpo": 0.4118199646472931, "losses/sft": 1.7263908386230469, "losses/total": 0.4118199646472931, "ref_logps/chosen": -41.09989929199219, "ref_logps/rejected": -41.491268157958984, "rewards/accuracies": 0.875, "rewards/chosen": -1.0146347284317017, "rewards/margins": 0.8333843946456909, "rewards/rejected": -1.8480191230773926, "step": 812 }, { "epoch": 0.77, "grad_norm": 21.614953994750977, "learning_rate": 4.134312696747114e-07, "logps/chosen": -60.36675262451172, "logps/rejected": -73.6515884399414, "loss": 0.5616, "losses/dpo": 0.37293195724487305, "losses/sft": 1.845620036125183, "losses/total": 0.37293195724487305, "ref_logps/chosen": -48.35881042480469, "ref_logps/rejected": -56.498138427734375, "rewards/accuracies": 0.75, "rewards/chosen": -1.2007945775985718, "rewards/margins": 0.5145505666732788, "rewards/rejected": -1.7153451442718506, "step": 813 }, { "epoch": 0.77, "grad_norm": 22.284120559692383, "learning_rate": 4.13256383350822e-07, "logps/chosen": -70.5172119140625, "logps/rejected": -73.63374328613281, "loss": 0.5859, "losses/dpo": 0.40943634510040283, "losses/sft": 2.5171494483947754, "losses/total": 0.40943634510040283, "ref_logps/chosen": -57.142147064208984, "ref_logps/rejected": -54.95692443847656, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3375064134597778, "rewards/margins": 0.5301756858825684, "rewards/rejected": -1.8676820993423462, "step": 814 }, { "epoch": 0.77, "grad_norm": 18.59164047241211, "learning_rate": 4.1308149702693244e-07, "logps/chosen": -51.64142990112305, "logps/rejected": -65.4079360961914, "loss": 0.4503, "losses/dpo": 0.3758418560028076, "losses/sft": 1.860427975654602, "losses/total": 0.3758418560028076, "ref_logps/chosen": -40.368900299072266, "ref_logps/rejected": -45.71522521972656, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1272530555725098, "rewards/margins": 0.8420186042785645, "rewards/rejected": -1.9692716598510742, "step": 815 }, { "epoch": 0.77, "grad_norm": 17.908594131469727, "learning_rate": 4.12906610703043e-07, "logps/chosen": -47.54798126220703, "logps/rejected": -67.79107666015625, "loss": 0.4823, "losses/dpo": 0.24897406995296478, "losses/sft": 1.7283592224121094, "losses/total": 0.24897406995296478, "ref_logps/chosen": -37.467628479003906, "ref_logps/rejected": -49.21507263183594, "rewards/accuracies": 0.75, "rewards/chosen": -1.0080349445343018, "rewards/margins": 0.8495657444000244, "rewards/rejected": -1.8576006889343262, "step": 816 }, { "epoch": 0.77, "grad_norm": 21.439563751220703, "learning_rate": 4.1273172437915357e-07, "logps/chosen": -48.15118408203125, "logps/rejected": -61.09138870239258, "loss": 0.5663, "losses/dpo": 0.2909359931945801, "losses/sft": 1.461403250694275, "losses/total": 0.2909359931945801, "ref_logps/chosen": -35.859962463378906, "ref_logps/rejected": -42.76637268066406, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2291221618652344, "rewards/margins": 0.6033795475959778, "rewards/rejected": -1.8325018882751465, "step": 817 }, { "epoch": 0.77, "grad_norm": 15.60698413848877, "learning_rate": 4.1255683805526403e-07, "logps/chosen": -51.07274627685547, "logps/rejected": -77.69608306884766, "loss": 0.3989, "losses/dpo": 0.5728371739387512, "losses/sft": 1.7782914638519287, "losses/total": 0.5728371739387512, "ref_logps/chosen": -39.94403076171875, "ref_logps/rejected": -56.00139617919922, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1128716468811035, "rewards/margins": 1.056597113609314, "rewards/rejected": -2.169468641281128, "step": 818 }, { "epoch": 0.77, "grad_norm": 16.548442840576172, "learning_rate": 4.123819517313746e-07, "logps/chosen": -40.735740661621094, "logps/rejected": -45.109466552734375, "loss": 0.5646, "losses/dpo": 0.7372438907623291, "losses/sft": 2.486968517303467, "losses/total": 0.7372438907623291, "ref_logps/chosen": -30.94430923461914, "ref_logps/rejected": -31.13395118713379, "rewards/accuracies": 0.625, "rewards/chosen": -0.9791431427001953, "rewards/margins": 0.4184086322784424, "rewards/rejected": -1.3975516557693481, "step": 819 }, { "epoch": 0.77, "grad_norm": 25.764150619506836, "learning_rate": 4.122070654074851e-07, "logps/chosen": -49.48291015625, "logps/rejected": -61.048072814941406, "loss": 0.5621, "losses/dpo": 0.9668756127357483, "losses/sft": 1.7554867267608643, "losses/total": 0.9668756127357483, "ref_logps/chosen": -38.410850524902344, "ref_logps/rejected": -41.62816619873047, "rewards/accuracies": 0.8125, "rewards/chosen": -1.107205867767334, "rewards/margins": 0.834784746170044, "rewards/rejected": -1.941990613937378, "step": 820 }, { "epoch": 0.78, "grad_norm": 18.528274536132812, "learning_rate": 4.120321790835957e-07, "logps/chosen": -39.327728271484375, "logps/rejected": -70.03535461425781, "loss": 0.4533, "losses/dpo": 0.3814285397529602, "losses/sft": 1.7690520286560059, "losses/total": 0.3814285397529602, "ref_logps/chosen": -31.195537567138672, "ref_logps/rejected": -53.608245849609375, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8132188320159912, "rewards/margins": 0.8294921517372131, "rewards/rejected": -1.6427109241485596, "step": 821 }, { "epoch": 0.78, "grad_norm": 21.447460174560547, "learning_rate": 4.1185729275970613e-07, "logps/chosen": -43.9781379699707, "logps/rejected": -56.883888244628906, "loss": 0.6067, "losses/dpo": 1.2185615301132202, "losses/sft": 1.8627616167068481, "losses/total": 1.2185615301132202, "ref_logps/chosen": -34.30780029296875, "ref_logps/rejected": -41.61172866821289, "rewards/accuracies": 0.875, "rewards/chosen": -0.9670339822769165, "rewards/margins": 0.5601822137832642, "rewards/rejected": -1.5272161960601807, "step": 822 }, { "epoch": 0.78, "grad_norm": 23.782699584960938, "learning_rate": 4.116824064358167e-07, "logps/chosen": -55.485130310058594, "logps/rejected": -69.16844177246094, "loss": 0.6105, "losses/dpo": 0.6353790163993835, "losses/sft": 1.612541913986206, "losses/total": 0.6353790163993835, "ref_logps/chosen": -42.11946487426758, "ref_logps/rejected": -52.072021484375, "rewards/accuracies": 0.6875, "rewards/chosen": -1.33656644821167, "rewards/margins": 0.37307578325271606, "rewards/rejected": -1.7096424102783203, "step": 823 }, { "epoch": 0.78, "grad_norm": 24.362964630126953, "learning_rate": 4.1150752011192727e-07, "logps/chosen": -49.45055389404297, "logps/rejected": -52.51344299316406, "loss": 0.5772, "losses/dpo": 0.7817726135253906, "losses/sft": 1.7117810249328613, "losses/total": 0.7817726135253906, "ref_logps/chosen": -39.750152587890625, "ref_logps/rejected": -37.19168472290039, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9700397253036499, "rewards/margins": 0.562136173248291, "rewards/rejected": -1.532175898551941, "step": 824 }, { "epoch": 0.78, "grad_norm": 17.661327362060547, "learning_rate": 4.113326337880377e-07, "logps/chosen": -39.17385482788086, "logps/rejected": -55.61888122558594, "loss": 0.5506, "losses/dpo": 0.430070161819458, "losses/sft": 1.939976453781128, "losses/total": 0.430070161819458, "ref_logps/chosen": -28.86272621154785, "ref_logps/rejected": -41.25264358520508, "rewards/accuracies": 0.625, "rewards/chosen": -1.0311129093170166, "rewards/margins": 0.4055110812187195, "rewards/rejected": -1.4366239309310913, "step": 825 }, { "epoch": 0.78, "grad_norm": 19.97083282470703, "learning_rate": 4.111577474641483e-07, "logps/chosen": -47.04802703857422, "logps/rejected": -46.20124435424805, "loss": 0.5741, "losses/dpo": 0.508751630783081, "losses/sft": 1.8096230030059814, "losses/total": 0.508751630783081, "ref_logps/chosen": -37.95310974121094, "ref_logps/rejected": -33.11418151855469, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9094916582107544, "rewards/margins": 0.39921486377716064, "rewards/rejected": -1.308706521987915, "step": 826 }, { "epoch": 0.78, "grad_norm": 24.062883377075195, "learning_rate": 4.109828611402588e-07, "logps/chosen": -56.68595886230469, "logps/rejected": -70.44078826904297, "loss": 0.5813, "losses/dpo": 0.7843765020370483, "losses/sft": 2.074343204498291, "losses/total": 0.7843765020370483, "ref_logps/chosen": -44.316707611083984, "ref_logps/rejected": -50.27688980102539, "rewards/accuracies": 0.6875, "rewards/chosen": -1.236924648284912, "rewards/margins": 0.779464840888977, "rewards/rejected": -2.0163896083831787, "step": 827 }, { "epoch": 0.78, "grad_norm": 18.22933578491211, "learning_rate": 4.1080797481636937e-07, "logps/chosen": -41.456085205078125, "logps/rejected": -47.322505950927734, "loss": 0.6461, "losses/dpo": 0.6511818170547485, "losses/sft": 1.433754801750183, "losses/total": 0.6511818170547485, "ref_logps/chosen": -30.937042236328125, "ref_logps/rejected": -34.29116439819336, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0519046783447266, "rewards/margins": 0.25122973322868347, "rewards/rejected": -1.3031344413757324, "step": 828 }, { "epoch": 0.78, "grad_norm": 23.702861785888672, "learning_rate": 4.1063308849247983e-07, "logps/chosen": -48.07335662841797, "logps/rejected": -52.76336669921875, "loss": 0.6673, "losses/dpo": 0.6375836133956909, "losses/sft": 1.4201908111572266, "losses/total": 0.6375836133956909, "ref_logps/chosen": -37.506710052490234, "ref_logps/rejected": -38.000221252441406, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0566641092300415, "rewards/margins": 0.41965004801750183, "rewards/rejected": -1.4763140678405762, "step": 829 }, { "epoch": 0.78, "grad_norm": 25.07790184020996, "learning_rate": 4.104582021685904e-07, "logps/chosen": -54.89522171020508, "logps/rejected": -59.961612701416016, "loss": 0.6327, "losses/dpo": 0.7537351250648499, "losses/sft": 1.9888134002685547, "losses/total": 0.7537351250648499, "ref_logps/chosen": -42.2194938659668, "ref_logps/rejected": -43.52435302734375, "rewards/accuracies": 0.75, "rewards/chosen": -1.2675732374191284, "rewards/margins": 0.3761524558067322, "rewards/rejected": -1.6437256336212158, "step": 830 }, { "epoch": 0.78, "grad_norm": 24.729991912841797, "learning_rate": 4.1028331584470096e-07, "logps/chosen": -51.150794982910156, "logps/rejected": -61.62116241455078, "loss": 0.6719, "losses/dpo": 0.5452122688293457, "losses/sft": 1.8438490629196167, "losses/total": 0.5452122688293457, "ref_logps/chosen": -38.29194641113281, "ref_logps/rejected": -45.78569030761719, "rewards/accuracies": 0.625, "rewards/chosen": -1.2858850955963135, "rewards/margins": 0.2976619601249695, "rewards/rejected": -1.5835471153259277, "step": 831 }, { "epoch": 0.79, "grad_norm": 21.44819450378418, "learning_rate": 4.101084295208114e-07, "logps/chosen": -46.09823989868164, "logps/rejected": -52.49946594238281, "loss": 0.6399, "losses/dpo": 0.5194214582443237, "losses/sft": 1.5613081455230713, "losses/total": 0.5194214582443237, "ref_logps/chosen": -36.419857025146484, "ref_logps/rejected": -38.36824035644531, "rewards/accuracies": 0.625, "rewards/chosen": -0.9678384065628052, "rewards/margins": 0.4452841579914093, "rewards/rejected": -1.4131226539611816, "step": 832 }, { "epoch": 0.79, "grad_norm": 19.220415115356445, "learning_rate": 4.09933543196922e-07, "logps/chosen": -42.4419059753418, "logps/rejected": -61.82816696166992, "loss": 0.4571, "losses/dpo": 0.4315023720264435, "losses/sft": 1.821016788482666, "losses/total": 0.4315023720264435, "ref_logps/chosen": -34.52996826171875, "ref_logps/rejected": -46.77499771118164, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7911937236785889, "rewards/margins": 0.7141229510307312, "rewards/rejected": -1.5053167343139648, "step": 833 }, { "epoch": 0.79, "grad_norm": 19.52722930908203, "learning_rate": 4.097586568730325e-07, "logps/chosen": -46.842567443847656, "logps/rejected": -65.82450866699219, "loss": 0.5215, "losses/dpo": 0.9241123795509338, "losses/sft": 1.4499462842941284, "losses/total": 0.9241123795509338, "ref_logps/chosen": -35.38399887084961, "ref_logps/rejected": -46.04804229736328, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1458566188812256, "rewards/margins": 0.8317896127700806, "rewards/rejected": -1.9776462316513062, "step": 834 }, { "epoch": 0.79, "grad_norm": 16.686166763305664, "learning_rate": 4.0958377054914306e-07, "logps/chosen": -48.39841842651367, "logps/rejected": -58.906097412109375, "loss": 0.482, "losses/dpo": 0.3820689916610718, "losses/sft": 2.4467155933380127, "losses/total": 0.3820689916610718, "ref_logps/chosen": -37.098514556884766, "ref_logps/rejected": -40.42327117919922, "rewards/accuracies": 0.75, "rewards/chosen": -1.1299902200698853, "rewards/margins": 0.7182927131652832, "rewards/rejected": -1.8482829332351685, "step": 835 }, { "epoch": 0.79, "grad_norm": 17.65949249267578, "learning_rate": 4.094088842252535e-07, "logps/chosen": -37.25263977050781, "logps/rejected": -55.28199005126953, "loss": 0.5436, "losses/dpo": 0.45034536719322205, "losses/sft": 1.5987011194229126, "losses/total": 0.45034536719322205, "ref_logps/chosen": -26.89162826538086, "ref_logps/rejected": -39.481117248535156, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0361013412475586, "rewards/margins": 0.5439859628677368, "rewards/rejected": -1.5800873041152954, "step": 836 }, { "epoch": 0.79, "grad_norm": 21.985748291015625, "learning_rate": 4.092339979013641e-07, "logps/chosen": -40.997798919677734, "logps/rejected": -67.35408020019531, "loss": 0.642, "losses/dpo": 0.8488292098045349, "losses/sft": 1.6512669324874878, "losses/total": 0.8488292098045349, "ref_logps/chosen": -28.734466552734375, "ref_logps/rejected": -50.182098388671875, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2263331413269043, "rewards/margins": 0.49086517095565796, "rewards/rejected": -1.717198371887207, "step": 837 }, { "epoch": 0.79, "grad_norm": 21.788148880004883, "learning_rate": 4.0905911157747465e-07, "logps/chosen": -46.52219772338867, "logps/rejected": -68.809326171875, "loss": 0.5504, "losses/dpo": 0.6097391843795776, "losses/sft": 1.4508566856384277, "losses/total": 0.6097391843795776, "ref_logps/chosen": -37.70521545410156, "ref_logps/rejected": -53.499549865722656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8816981315612793, "rewards/margins": 0.6492795348167419, "rewards/rejected": -1.5309778451919556, "step": 838 }, { "epoch": 0.79, "grad_norm": 27.178285598754883, "learning_rate": 4.088842252535851e-07, "logps/chosen": -54.411441802978516, "logps/rejected": -67.19982147216797, "loss": 0.7348, "losses/dpo": 1.534578561782837, "losses/sft": 2.0376055240631104, "losses/total": 1.534578561782837, "ref_logps/chosen": -41.59713363647461, "ref_logps/rejected": -49.88591003417969, "rewards/accuracies": 0.625, "rewards/chosen": -1.2814306020736694, "rewards/margins": 0.44996070861816406, "rewards/rejected": -1.7313913106918335, "step": 839 }, { "epoch": 0.79, "grad_norm": 20.112735748291016, "learning_rate": 4.087093389296957e-07, "logps/chosen": -50.57804489135742, "logps/rejected": -76.22239685058594, "loss": 0.4434, "losses/dpo": 0.42248982191085815, "losses/sft": 1.702013373374939, "losses/total": 0.42248982191085815, "ref_logps/chosen": -40.00060272216797, "ref_logps/rejected": -54.03583908081055, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0577441453933716, "rewards/margins": 1.1609119176864624, "rewards/rejected": -2.218656063079834, "step": 840 }, { "epoch": 0.79, "grad_norm": 18.04446029663086, "learning_rate": 4.085344526058062e-07, "logps/chosen": -46.6158447265625, "logps/rejected": -63.661006927490234, "loss": 0.4268, "losses/dpo": 0.6124235391616821, "losses/sft": 1.5183452367782593, "losses/total": 0.6124235391616821, "ref_logps/chosen": -37.38063430786133, "ref_logps/rejected": -45.835533142089844, "rewards/accuracies": 0.875, "rewards/chosen": -0.9235209226608276, "rewards/margins": 0.8590266704559326, "rewards/rejected": -1.7825474739074707, "step": 841 }, { "epoch": 0.8, "grad_norm": 21.226863861083984, "learning_rate": 4.0835956628191676e-07, "logps/chosen": -43.76727294921875, "logps/rejected": -68.08877563476562, "loss": 0.5675, "losses/dpo": 0.5349567532539368, "losses/sft": 1.2213505506515503, "losses/total": 0.5349567532539368, "ref_logps/chosen": -32.184818267822266, "ref_logps/rejected": -50.73428726196289, "rewards/accuracies": 0.625, "rewards/chosen": -1.1582458019256592, "rewards/margins": 0.5772035121917725, "rewards/rejected": -1.7354493141174316, "step": 842 }, { "epoch": 0.8, "grad_norm": 22.93755531311035, "learning_rate": 4.0818467995802727e-07, "logps/chosen": -46.00981903076172, "logps/rejected": -56.951881408691406, "loss": 0.6753, "losses/dpo": 0.8745790123939514, "losses/sft": 1.7632869482040405, "losses/total": 0.8745790123939514, "ref_logps/chosen": -32.30010986328125, "ref_logps/rejected": -41.57134246826172, "rewards/accuracies": 0.5625, "rewards/chosen": -1.370970606803894, "rewards/margins": 0.1670835018157959, "rewards/rejected": -1.5380539894104004, "step": 843 }, { "epoch": 0.8, "grad_norm": 25.24207305908203, "learning_rate": 4.080097936341378e-07, "logps/chosen": -42.13519287109375, "logps/rejected": -58.031620025634766, "loss": 0.6824, "losses/dpo": 0.8411730527877808, "losses/sft": 1.5144634246826172, "losses/total": 0.8411730527877808, "ref_logps/chosen": -31.530597686767578, "ref_logps/rejected": -43.723594665527344, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0604591369628906, "rewards/margins": 0.3703431487083435, "rewards/rejected": -1.430802345275879, "step": 844 }, { "epoch": 0.8, "grad_norm": 19.652835845947266, "learning_rate": 4.0783490731024835e-07, "logps/chosen": -40.53631591796875, "logps/rejected": -54.869380950927734, "loss": 0.5725, "losses/dpo": 0.7487130165100098, "losses/sft": 1.791237473487854, "losses/total": 0.7487130165100098, "ref_logps/chosen": -32.66266632080078, "ref_logps/rejected": -42.31922912597656, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7873647212982178, "rewards/margins": 0.4676506519317627, "rewards/rejected": -1.2550153732299805, "step": 845 }, { "epoch": 0.8, "grad_norm": 22.695154190063477, "learning_rate": 4.076600209863588e-07, "logps/chosen": -48.605037689208984, "logps/rejected": -67.17694091796875, "loss": 0.5637, "losses/dpo": 0.6091452836990356, "losses/sft": 1.739264726638794, "losses/total": 0.6091452836990356, "ref_logps/chosen": -35.46781921386719, "ref_logps/rejected": -48.201053619384766, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3137223720550537, "rewards/margins": 0.5838658213615417, "rewards/rejected": -1.8975882530212402, "step": 846 }, { "epoch": 0.8, "grad_norm": 27.827070236206055, "learning_rate": 4.074851346624694e-07, "logps/chosen": -59.363311767578125, "logps/rejected": -66.76353454589844, "loss": 0.6976, "losses/dpo": 0.26183071732521057, "losses/sft": 1.4789923429489136, "losses/total": 0.26183071732521057, "ref_logps/chosen": -45.20199966430664, "ref_logps/rejected": -49.7233772277832, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4161311388015747, "rewards/margins": 0.2878847122192383, "rewards/rejected": -1.7040159702301025, "step": 847 }, { "epoch": 0.8, "grad_norm": 16.140514373779297, "learning_rate": 4.073102483385799e-07, "logps/chosen": -42.16569519042969, "logps/rejected": -58.39725112915039, "loss": 0.4775, "losses/dpo": 0.43545204401016235, "losses/sft": 1.5353697538375854, "losses/total": 0.43545204401016235, "ref_logps/chosen": -32.512306213378906, "ref_logps/rejected": -40.56866455078125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9653388261795044, "rewards/margins": 0.8175197839736938, "rewards/rejected": -1.7828586101531982, "step": 848 }, { "epoch": 0.8, "grad_norm": 22.41170883178711, "learning_rate": 4.0713536201469045e-07, "logps/chosen": -47.756431579589844, "logps/rejected": -60.93867111206055, "loss": 0.6327, "losses/dpo": 0.3982849419116974, "losses/sft": 1.5695722103118896, "losses/total": 0.3982849419116974, "ref_logps/chosen": -37.33623504638672, "ref_logps/rejected": -45.953495025634766, "rewards/accuracies": 0.75, "rewards/chosen": -1.0420196056365967, "rewards/margins": 0.4564979076385498, "rewards/rejected": -1.4985175132751465, "step": 849 }, { "epoch": 0.8, "grad_norm": 22.37420082092285, "learning_rate": 4.0696047569080096e-07, "logps/chosen": -50.984737396240234, "logps/rejected": -59.799072265625, "loss": 0.547, "losses/dpo": 0.2710026502609253, "losses/sft": 1.1395957469940186, "losses/total": 0.2710026502609253, "ref_logps/chosen": -39.94063949584961, "ref_logps/rejected": -43.56227111816406, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1044096946716309, "rewards/margins": 0.5192702412605286, "rewards/rejected": -1.6236801147460938, "step": 850 }, { "epoch": 0.8, "grad_norm": 22.754549026489258, "learning_rate": 4.067855893669115e-07, "logps/chosen": -51.86137008666992, "logps/rejected": -49.23633575439453, "loss": 0.6634, "losses/dpo": 0.9052378535270691, "losses/sft": 1.8138394355773926, "losses/total": 0.9052378535270691, "ref_logps/chosen": -41.40962600708008, "ref_logps/rejected": -34.698333740234375, "rewards/accuracies": 0.5, "rewards/chosen": -1.0451745986938477, "rewards/margins": 0.40862545371055603, "rewards/rejected": -1.453800082206726, "step": 851 }, { "epoch": 0.8, "grad_norm": 16.493871688842773, "learning_rate": 4.0661070304302204e-07, "logps/chosen": -38.2988395690918, "logps/rejected": -64.69878387451172, "loss": 0.4974, "losses/dpo": 0.38129743933677673, "losses/sft": 1.8565140962600708, "losses/total": 0.38129743933677673, "ref_logps/chosen": -27.315887451171875, "ref_logps/rejected": -44.236541748046875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.098294973373413, "rewards/margins": 0.9479295015335083, "rewards/rejected": -2.046224355697632, "step": 852 }, { "epoch": 0.81, "grad_norm": 19.36992835998535, "learning_rate": 4.064358167191325e-07, "logps/chosen": -48.493099212646484, "logps/rejected": -57.895172119140625, "loss": 0.5137, "losses/dpo": 0.3286309242248535, "losses/sft": 1.859135389328003, "losses/total": 0.3286309242248535, "ref_logps/chosen": -36.25791931152344, "ref_logps/rejected": -39.356407165527344, "rewards/accuracies": 0.625, "rewards/chosen": -1.223517894744873, "rewards/margins": 0.6303588151931763, "rewards/rejected": -1.8538768291473389, "step": 853 }, { "epoch": 0.81, "grad_norm": 19.615402221679688, "learning_rate": 4.0626093039524307e-07, "logps/chosen": -57.19934844970703, "logps/rejected": -71.09638977050781, "loss": 0.5387, "losses/dpo": 0.7195569276809692, "losses/sft": 2.2594923973083496, "losses/total": 0.7195569276809692, "ref_logps/chosen": -47.57176971435547, "ref_logps/rejected": -55.40192413330078, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9627577066421509, "rewards/margins": 0.6066886782646179, "rewards/rejected": -1.5694464445114136, "step": 854 }, { "epoch": 0.81, "grad_norm": 19.97063636779785, "learning_rate": 4.060860440713536e-07, "logps/chosen": -45.506553649902344, "logps/rejected": -49.71862030029297, "loss": 0.5772, "losses/dpo": 0.4405631721019745, "losses/sft": 2.0534377098083496, "losses/total": 0.4405631721019745, "ref_logps/chosen": -35.478851318359375, "ref_logps/rejected": -35.66107940673828, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0027706623077393, "rewards/margins": 0.4029836058616638, "rewards/rejected": -1.4057542085647583, "step": 855 }, { "epoch": 0.81, "grad_norm": 22.616846084594727, "learning_rate": 4.0591115774746415e-07, "logps/chosen": -60.477508544921875, "logps/rejected": -68.45823669433594, "loss": 0.6113, "losses/dpo": 0.5044362545013428, "losses/sft": 1.688843011856079, "losses/total": 0.5044362545013428, "ref_logps/chosen": -47.33610916137695, "ref_logps/rejected": -50.687744140625, "rewards/accuracies": 0.625, "rewards/chosen": -1.3141398429870605, "rewards/margins": 0.4629090130329132, "rewards/rejected": -1.7770488262176514, "step": 856 }, { "epoch": 0.81, "grad_norm": 12.60323715209961, "learning_rate": 4.0573627142357466e-07, "logps/chosen": -38.97118377685547, "logps/rejected": -58.85884094238281, "loss": 0.3425, "losses/dpo": 0.33693045377731323, "losses/sft": 1.9000900983810425, "losses/total": 0.33693045377731323, "ref_logps/chosen": -31.284116744995117, "ref_logps/rejected": -40.602928161621094, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7687070965766907, "rewards/margins": 1.056883692741394, "rewards/rejected": -1.8255908489227295, "step": 857 }, { "epoch": 0.81, "grad_norm": 16.767677307128906, "learning_rate": 4.0556138509968517e-07, "logps/chosen": -50.87198257446289, "logps/rejected": -63.775840759277344, "loss": 0.4464, "losses/dpo": 0.17628905177116394, "losses/sft": 1.7027733325958252, "losses/total": 0.17628905177116394, "ref_logps/chosen": -39.1417350769043, "ref_logps/rejected": -40.746944427490234, "rewards/accuracies": 0.75, "rewards/chosen": -1.1730247735977173, "rewards/margins": 1.1298646926879883, "rewards/rejected": -2.302889347076416, "step": 858 }, { "epoch": 0.81, "grad_norm": 22.397029876708984, "learning_rate": 4.0538649877579574e-07, "logps/chosen": -58.40730285644531, "logps/rejected": -50.577537536621094, "loss": 0.6917, "losses/dpo": 0.7703583240509033, "losses/sft": 1.8212858438491821, "losses/total": 0.7703583240509033, "ref_logps/chosen": -47.790496826171875, "ref_logps/rejected": -37.80146789550781, "rewards/accuracies": 0.5625, "rewards/chosen": -1.061680555343628, "rewards/margins": 0.21592707931995392, "rewards/rejected": -1.2776075601577759, "step": 859 }, { "epoch": 0.81, "grad_norm": 25.436233520507812, "learning_rate": 4.052116124519062e-07, "logps/chosen": -57.184967041015625, "logps/rejected": -69.12425231933594, "loss": 0.6217, "losses/dpo": 0.4821227192878723, "losses/sft": 1.9631327390670776, "losses/total": 0.4821227192878723, "ref_logps/chosen": -43.59064483642578, "ref_logps/rejected": -48.96746063232422, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3594326972961426, "rewards/margins": 0.6562467813491821, "rewards/rejected": -2.015679359436035, "step": 860 }, { "epoch": 0.81, "grad_norm": 18.395221710205078, "learning_rate": 4.0503672612801676e-07, "logps/chosen": -44.1633186340332, "logps/rejected": -53.20210266113281, "loss": 0.5284, "losses/dpo": 0.6680523157119751, "losses/sft": 1.4781405925750732, "losses/total": 0.6680523157119751, "ref_logps/chosen": -35.332000732421875, "ref_logps/rejected": -38.637901306152344, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8831318616867065, "rewards/margins": 0.5732882022857666, "rewards/rejected": -1.4564200639724731, "step": 861 }, { "epoch": 0.81, "grad_norm": 23.400135040283203, "learning_rate": 4.0486183980412733e-07, "logps/chosen": -48.28411102294922, "logps/rejected": -57.12320327758789, "loss": 0.6643, "losses/dpo": 0.505574643611908, "losses/sft": 1.6830575466156006, "losses/total": 0.505574643611908, "ref_logps/chosen": -35.49404525756836, "ref_logps/rejected": -40.492889404296875, "rewards/accuracies": 0.625, "rewards/chosen": -1.2790067195892334, "rewards/margins": 0.38402464985847473, "rewards/rejected": -1.6630313396453857, "step": 862 }, { "epoch": 0.81, "grad_norm": 19.01360511779785, "learning_rate": 4.0468695348023784e-07, "logps/chosen": -38.048397064208984, "logps/rejected": -53.74854278564453, "loss": 0.5528, "losses/dpo": 0.6405391693115234, "losses/sft": 2.077653408050537, "losses/total": 0.6405391693115234, "ref_logps/chosen": -27.689138412475586, "ref_logps/rejected": -38.26967239379883, "rewards/accuracies": 0.625, "rewards/chosen": -1.0359257459640503, "rewards/margins": 0.5119612216949463, "rewards/rejected": -1.5478869676589966, "step": 863 }, { "epoch": 0.82, "grad_norm": 23.446998596191406, "learning_rate": 4.0451206715634835e-07, "logps/chosen": -40.65340805053711, "logps/rejected": -60.793148040771484, "loss": 0.5443, "losses/dpo": 0.7148911356925964, "losses/sft": 1.7339686155319214, "losses/total": 0.7148911356925964, "ref_logps/chosen": -31.940067291259766, "ref_logps/rejected": -46.303550720214844, "rewards/accuracies": 0.75, "rewards/chosen": -0.8713340759277344, "rewards/margins": 0.5776263475418091, "rewards/rejected": -1.4489604234695435, "step": 864 }, { "epoch": 0.82, "grad_norm": 27.00196075439453, "learning_rate": 4.0433718083245887e-07, "logps/chosen": -53.50159454345703, "logps/rejected": -57.526405334472656, "loss": 0.7365, "losses/dpo": 1.0049927234649658, "losses/sft": 2.0289671421051025, "losses/total": 1.0049927234649658, "ref_logps/chosen": -38.87628936767578, "ref_logps/rejected": -39.68628692626953, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4625301361083984, "rewards/margins": 0.3214820623397827, "rewards/rejected": -1.7840123176574707, "step": 865 }, { "epoch": 0.82, "grad_norm": 25.118370056152344, "learning_rate": 4.0416229450856943e-07, "logps/chosen": -61.04706954956055, "logps/rejected": -61.71116638183594, "loss": 0.5993, "losses/dpo": 0.7177576422691345, "losses/sft": 1.7501208782196045, "losses/total": 0.7177576422691345, "ref_logps/chosen": -45.91304016113281, "ref_logps/rejected": -41.8324089050293, "rewards/accuracies": 0.625, "rewards/chosen": -1.5134029388427734, "rewards/margins": 0.4744730293750763, "rewards/rejected": -1.9878759384155273, "step": 866 }, { "epoch": 0.82, "grad_norm": 21.401714324951172, "learning_rate": 4.039874081846799e-07, "logps/chosen": -59.45185470581055, "logps/rejected": -59.17026138305664, "loss": 0.5522, "losses/dpo": 0.5085623264312744, "losses/sft": 2.0390872955322266, "losses/total": 0.5085623264312744, "ref_logps/chosen": -49.54937744140625, "ref_logps/rejected": -44.580406188964844, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9902480840682983, "rewards/margins": 0.46873754262924194, "rewards/rejected": -1.458985686302185, "step": 867 }, { "epoch": 0.82, "grad_norm": 21.441139221191406, "learning_rate": 4.0381252186079046e-07, "logps/chosen": -46.783607482910156, "logps/rejected": -56.933860778808594, "loss": 0.569, "losses/dpo": 0.7660975456237793, "losses/sft": 1.5038211345672607, "losses/total": 0.7660975456237793, "ref_logps/chosen": -36.38450622558594, "ref_logps/rejected": -42.272499084472656, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0399103164672852, "rewards/margins": 0.4262256920337677, "rewards/rejected": -1.4661359786987305, "step": 868 }, { "epoch": 0.82, "grad_norm": 24.64531707763672, "learning_rate": 4.03637635536901e-07, "logps/chosen": -54.77189636230469, "logps/rejected": -60.194175720214844, "loss": 0.6953, "losses/dpo": 0.5726479291915894, "losses/sft": 1.5797961950302124, "losses/total": 0.5726479291915894, "ref_logps/chosen": -41.2158088684082, "ref_logps/rejected": -44.439491271972656, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3556084632873535, "rewards/margins": 0.2198602259159088, "rewards/rejected": -1.5754687786102295, "step": 869 }, { "epoch": 0.82, "grad_norm": 15.96130657196045, "learning_rate": 4.0346274921301154e-07, "logps/chosen": -46.201385498046875, "logps/rejected": -59.735992431640625, "loss": 0.4572, "losses/dpo": 0.42531144618988037, "losses/sft": 1.75662362575531, "losses/total": 0.42531144618988037, "ref_logps/chosen": -34.47917175292969, "ref_logps/rejected": -40.12859344482422, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1722216606140137, "rewards/margins": 0.7885184288024902, "rewards/rejected": -1.9607398509979248, "step": 870 }, { "epoch": 0.82, "grad_norm": 16.508520126342773, "learning_rate": 4.0328786288912205e-07, "logps/chosen": -41.911895751953125, "logps/rejected": -64.87440490722656, "loss": 0.4431, "losses/dpo": 0.5562084913253784, "losses/sft": 1.542809247970581, "losses/total": 0.5562084913253784, "ref_logps/chosen": -32.55203628540039, "ref_logps/rejected": -46.92086410522461, "rewards/accuracies": 0.8125, "rewards/chosen": -0.935985803604126, "rewards/margins": 0.8593685626983643, "rewards/rejected": -1.7953543663024902, "step": 871 }, { "epoch": 0.82, "grad_norm": 17.955007553100586, "learning_rate": 4.0311297656523256e-07, "logps/chosen": -45.63519287109375, "logps/rejected": -59.40199279785156, "loss": 0.5459, "losses/dpo": 0.5789581537246704, "losses/sft": 1.654513955116272, "losses/total": 0.5789581537246704, "ref_logps/chosen": -33.384796142578125, "ref_logps/rejected": -41.051612854003906, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2250394821166992, "rewards/margins": 0.6099984645843506, "rewards/rejected": -1.8350379467010498, "step": 872 }, { "epoch": 0.82, "grad_norm": 18.90740394592285, "learning_rate": 4.029380902413431e-07, "logps/chosen": -53.078369140625, "logps/rejected": -66.50953674316406, "loss": 0.4955, "losses/dpo": 0.2812321186065674, "losses/sft": 1.4978892803192139, "losses/total": 0.2812321186065674, "ref_logps/chosen": -41.2359619140625, "ref_logps/rejected": -48.61559295654297, "rewards/accuracies": 0.75, "rewards/chosen": -1.1842408180236816, "rewards/margins": 0.6051534414291382, "rewards/rejected": -1.7893941402435303, "step": 873 }, { "epoch": 0.83, "grad_norm": 18.567195892333984, "learning_rate": 4.0276320391745364e-07, "logps/chosen": -34.209442138671875, "logps/rejected": -44.80946350097656, "loss": 0.5305, "losses/dpo": 0.8185225129127502, "losses/sft": 2.060314178466797, "losses/total": 0.8185225129127502, "ref_logps/chosen": -24.25729751586914, "ref_logps/rejected": -27.58606719970703, "rewards/accuracies": 0.75, "rewards/chosen": -0.9952144622802734, "rewards/margins": 0.7271252870559692, "rewards/rejected": -1.7223396301269531, "step": 874 }, { "epoch": 0.83, "grad_norm": 20.771114349365234, "learning_rate": 4.0258831759356415e-07, "logps/chosen": -53.101951599121094, "logps/rejected": -56.18994140625, "loss": 0.5429, "losses/dpo": 0.3754958510398865, "losses/sft": 1.6476372480392456, "losses/total": 0.3754958510398865, "ref_logps/chosen": -40.65750503540039, "ref_logps/rejected": -38.36329650878906, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2444448471069336, "rewards/margins": 0.5382197499275208, "rewards/rejected": -1.7826645374298096, "step": 875 }, { "epoch": 0.83, "grad_norm": 24.008037567138672, "learning_rate": 4.024134312696747e-07, "logps/chosen": -54.33305740356445, "logps/rejected": -63.0389404296875, "loss": 0.5769, "losses/dpo": 0.32310086488723755, "losses/sft": 1.8714921474456787, "losses/total": 0.32310086488723755, "ref_logps/chosen": -40.23691940307617, "ref_logps/rejected": -43.30393981933594, "rewards/accuracies": 0.5625, "rewards/chosen": -1.409613847732544, "rewards/margins": 0.5638861656188965, "rewards/rejected": -1.9734998941421509, "step": 876 }, { "epoch": 0.83, "grad_norm": 22.259435653686523, "learning_rate": 4.0223854494578523e-07, "logps/chosen": -38.70526885986328, "logps/rejected": -46.02214431762695, "loss": 0.6679, "losses/dpo": 0.7911670804023743, "losses/sft": 1.2371726036071777, "losses/total": 0.7911670804023743, "ref_logps/chosen": -29.42253875732422, "ref_logps/rejected": -34.9599723815918, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9282734394073486, "rewards/margins": 0.17794373631477356, "rewards/rejected": -1.1062171459197998, "step": 877 }, { "epoch": 0.83, "grad_norm": 17.41746711730957, "learning_rate": 4.0206365862189574e-07, "logps/chosen": -47.60137176513672, "logps/rejected": -64.38847351074219, "loss": 0.4599, "losses/dpo": 0.4213147461414337, "losses/sft": 1.8955029249191284, "losses/total": 0.4213147461414337, "ref_logps/chosen": -33.97480773925781, "ref_logps/rejected": -41.838401794433594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3626561164855957, "rewards/margins": 0.8923516273498535, "rewards/rejected": -2.255007743835449, "step": 878 }, { "epoch": 0.83, "grad_norm": 19.2557315826416, "learning_rate": 4.0188877229800625e-07, "logps/chosen": -47.31446838378906, "logps/rejected": -62.8548698425293, "loss": 0.5233, "losses/dpo": 0.5200448036193848, "losses/sft": 1.705970048904419, "losses/total": 0.5200448036193848, "ref_logps/chosen": -35.320770263671875, "ref_logps/rejected": -43.62351989746094, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1993695497512817, "rewards/margins": 0.7237653136253357, "rewards/rejected": -1.9231348037719727, "step": 879 }, { "epoch": 0.83, "grad_norm": 21.179445266723633, "learning_rate": 4.017138859741168e-07, "logps/chosen": -45.69499969482422, "logps/rejected": -57.72776794433594, "loss": 0.5355, "losses/dpo": 0.3703647255897522, "losses/sft": 1.6293296813964844, "losses/total": 0.3703647255897522, "ref_logps/chosen": -34.82201385498047, "ref_logps/rejected": -41.07172393798828, "rewards/accuracies": 0.8125, "rewards/chosen": -1.08729887008667, "rewards/margins": 0.5783048868179321, "rewards/rejected": -1.6656038761138916, "step": 880 }, { "epoch": 0.83, "grad_norm": 20.20770835876465, "learning_rate": 4.015389996502274e-07, "logps/chosen": -47.463958740234375, "logps/rejected": -71.09518432617188, "loss": 0.5154, "losses/dpo": 0.6252133250236511, "losses/sft": 1.6834959983825684, "losses/total": 0.6252133250236511, "ref_logps/chosen": -35.67316436767578, "ref_logps/rejected": -52.39569854736328, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1790798902511597, "rewards/margins": 0.6908687949180603, "rewards/rejected": -1.8699486255645752, "step": 881 }, { "epoch": 0.83, "grad_norm": 22.328908920288086, "learning_rate": 4.0136411332633785e-07, "logps/chosen": -58.70216369628906, "logps/rejected": -63.54753494262695, "loss": 0.6014, "losses/dpo": 0.9843833446502686, "losses/sft": 2.578153610229492, "losses/total": 0.9843833446502686, "ref_logps/chosen": -43.267913818359375, "ref_logps/rejected": -42.69639205932617, "rewards/accuracies": 0.6875, "rewards/chosen": -1.54342520236969, "rewards/margins": 0.5416891574859619, "rewards/rejected": -2.0851144790649414, "step": 882 }, { "epoch": 0.83, "grad_norm": 22.3562068939209, "learning_rate": 4.011892270024484e-07, "logps/chosen": -59.6101188659668, "logps/rejected": -64.79010009765625, "loss": 0.6366, "losses/dpo": 0.7219882607460022, "losses/sft": 2.1527886390686035, "losses/total": 0.7219882607460022, "ref_logps/chosen": -44.10911560058594, "ref_logps/rejected": -45.55890655517578, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5501008033752441, "rewards/margins": 0.37301838397979736, "rewards/rejected": -1.923119306564331, "step": 883 }, { "epoch": 0.83, "grad_norm": 19.44447135925293, "learning_rate": 4.010143406785589e-07, "logps/chosen": -39.67781066894531, "logps/rejected": -46.88531494140625, "loss": 0.6339, "losses/dpo": 0.546170711517334, "losses/sft": 1.4967249631881714, "losses/total": 0.546170711517334, "ref_logps/chosen": -30.153152465820312, "ref_logps/rejected": -34.239295959472656, "rewards/accuracies": 0.625, "rewards/chosen": -0.9524660110473633, "rewards/margins": 0.31213611364364624, "rewards/rejected": -1.2646021842956543, "step": 884 }, { "epoch": 0.84, "grad_norm": 18.314401626586914, "learning_rate": 4.0083945435466944e-07, "logps/chosen": -51.6224365234375, "logps/rejected": -61.485015869140625, "loss": 0.5356, "losses/dpo": 0.30389899015426636, "losses/sft": 1.3712421655654907, "losses/total": 0.30389899015426636, "ref_logps/chosen": -38.87030792236328, "ref_logps/rejected": -42.986480712890625, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2752134799957275, "rewards/margins": 0.5746399164199829, "rewards/rejected": -1.849853277206421, "step": 885 }, { "epoch": 0.84, "grad_norm": 17.25050926208496, "learning_rate": 4.0066456803077995e-07, "logps/chosen": -47.176368713378906, "logps/rejected": -58.17384719848633, "loss": 0.4748, "losses/dpo": 0.417095422744751, "losses/sft": 1.666614294052124, "losses/total": 0.417095422744751, "ref_logps/chosen": -34.78626251220703, "ref_logps/rejected": -38.810752868652344, "rewards/accuracies": 0.875, "rewards/chosen": -1.2390108108520508, "rewards/margins": 0.6972986459732056, "rewards/rejected": -1.936309576034546, "step": 886 }, { "epoch": 0.84, "grad_norm": 21.474668502807617, "learning_rate": 4.004896817068905e-07, "logps/chosen": -57.298316955566406, "logps/rejected": -70.33256530761719, "loss": 0.4984, "losses/dpo": 0.5374317169189453, "losses/sft": 1.7164843082427979, "losses/total": 0.5374317169189453, "ref_logps/chosen": -43.94514083862305, "ref_logps/rejected": -50.434391021728516, "rewards/accuracies": 0.8125, "rewards/chosen": -1.335317611694336, "rewards/margins": 0.6544992327690125, "rewards/rejected": -1.9898170232772827, "step": 887 }, { "epoch": 0.84, "grad_norm": 25.858394622802734, "learning_rate": 4.003147953830011e-07, "logps/chosen": -63.030738830566406, "logps/rejected": -69.27155303955078, "loss": 0.7204, "losses/dpo": 0.9102529287338257, "losses/sft": 2.259722948074341, "losses/total": 0.9102529287338257, "ref_logps/chosen": -46.33272171020508, "ref_logps/rejected": -51.422882080078125, "rewards/accuracies": 0.625, "rewards/chosen": -1.6698018312454224, "rewards/margins": 0.11506487429141998, "rewards/rejected": -1.7848666906356812, "step": 888 }, { "epoch": 0.84, "grad_norm": 25.910511016845703, "learning_rate": 4.0013990905911154e-07, "logps/chosen": -63.817386627197266, "logps/rejected": -73.01847839355469, "loss": 0.6373, "losses/dpo": 0.5446485280990601, "losses/sft": 1.8927452564239502, "losses/total": 0.5446485280990601, "ref_logps/chosen": -46.975746154785156, "ref_logps/rejected": -53.32268142700195, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6841641664505005, "rewards/margins": 0.2854151129722595, "rewards/rejected": -1.9695793390274048, "step": 889 }, { "epoch": 0.84, "grad_norm": 23.929601669311523, "learning_rate": 3.999650227352221e-07, "logps/chosen": -49.52193069458008, "logps/rejected": -48.13573455810547, "loss": 0.7912, "losses/dpo": 0.3822995722293854, "losses/sft": 1.1467831134796143, "losses/total": 0.3822995722293854, "ref_logps/chosen": -37.495567321777344, "ref_logps/rejected": -35.206275939941406, "rewards/accuracies": 0.625, "rewards/chosen": -1.2026360034942627, "rewards/margins": 0.09031036496162415, "rewards/rejected": -1.2929463386535645, "step": 890 }, { "epoch": 0.84, "grad_norm": 17.41577911376953, "learning_rate": 3.997901364113326e-07, "logps/chosen": -41.08697509765625, "logps/rejected": -48.04826354980469, "loss": 0.5444, "losses/dpo": 0.44084930419921875, "losses/sft": 2.1039211750030518, "losses/total": 0.44084930419921875, "ref_logps/chosen": -30.391429901123047, "ref_logps/rejected": -31.823579788208008, "rewards/accuracies": 0.625, "rewards/chosen": -1.0695546865463257, "rewards/margins": 0.5529135465621948, "rewards/rejected": -1.6224682331085205, "step": 891 }, { "epoch": 0.84, "grad_norm": 18.696300506591797, "learning_rate": 3.9961525008744313e-07, "logps/chosen": -51.2611198425293, "logps/rejected": -69.18788146972656, "loss": 0.5201, "losses/dpo": 0.7692731618881226, "losses/sft": 1.5752068758010864, "losses/total": 0.7692731618881226, "ref_logps/chosen": -39.96553039550781, "ref_logps/rejected": -50.13174057006836, "rewards/accuracies": 0.625, "rewards/chosen": -1.1295592784881592, "rewards/margins": 0.7760545611381531, "rewards/rejected": -1.905613899230957, "step": 892 }, { "epoch": 0.84, "grad_norm": 19.385847091674805, "learning_rate": 3.9944036376355364e-07, "logps/chosen": -69.12168884277344, "logps/rejected": -68.44438171386719, "loss": 0.4876, "losses/dpo": 0.4618051052093506, "losses/sft": 1.9552251100540161, "losses/total": 0.4618051052093506, "ref_logps/chosen": -53.12922286987305, "ref_logps/rejected": -45.817264556884766, "rewards/accuracies": 0.875, "rewards/chosen": -1.5992467403411865, "rewards/margins": 0.6634646654129028, "rewards/rejected": -2.262711524963379, "step": 893 }, { "epoch": 0.84, "grad_norm": 16.251096725463867, "learning_rate": 3.992654774396642e-07, "logps/chosen": -52.292945861816406, "logps/rejected": -77.89862823486328, "loss": 0.3504, "losses/dpo": 0.3794952630996704, "losses/sft": 2.046430826187134, "losses/total": 0.3794952630996704, "ref_logps/chosen": -40.439971923828125, "ref_logps/rejected": -53.88254165649414, "rewards/accuracies": 0.875, "rewards/chosen": -1.1852972507476807, "rewards/margins": 1.216310977935791, "rewards/rejected": -2.4016079902648926, "step": 894 }, { "epoch": 0.85, "grad_norm": 21.71742820739746, "learning_rate": 3.990905911157748e-07, "logps/chosen": -40.72724151611328, "logps/rejected": -52.5378303527832, "loss": 0.7007, "losses/dpo": 0.9905320405960083, "losses/sft": 1.6795307397842407, "losses/total": 0.9905320405960083, "ref_logps/chosen": -28.036211013793945, "ref_logps/rejected": -37.149574279785156, "rewards/accuracies": 0.5, "rewards/chosen": -1.2691028118133545, "rewards/margins": 0.26972299814224243, "rewards/rejected": -1.5388257503509521, "step": 895 }, { "epoch": 0.85, "grad_norm": 18.174007415771484, "learning_rate": 3.9891570479188523e-07, "logps/chosen": -44.134765625, "logps/rejected": -59.41065216064453, "loss": 0.536, "losses/dpo": 0.5511208772659302, "losses/sft": 1.4953030347824097, "losses/total": 0.5511208772659302, "ref_logps/chosen": -30.967546463012695, "ref_logps/rejected": -40.59535217285156, "rewards/accuracies": 0.6875, "rewards/chosen": -1.31672203540802, "rewards/margins": 0.5648081302642822, "rewards/rejected": -1.8815300464630127, "step": 896 }, { "epoch": 0.85, "grad_norm": 17.954891204833984, "learning_rate": 3.987408184679958e-07, "logps/chosen": -41.72795486450195, "logps/rejected": -62.03364944458008, "loss": 0.4385, "losses/dpo": 0.34971052408218384, "losses/sft": 1.5322344303131104, "losses/total": 0.34971052408218384, "ref_logps/chosen": -30.000877380371094, "ref_logps/rejected": -41.904273986816406, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1727076768875122, "rewards/margins": 0.840229868888855, "rewards/rejected": -2.012937545776367, "step": 897 }, { "epoch": 0.85, "grad_norm": 22.424640655517578, "learning_rate": 3.985659321441063e-07, "logps/chosen": -58.33543395996094, "logps/rejected": -56.343326568603516, "loss": 0.674, "losses/dpo": 0.6729226112365723, "losses/sft": 1.7288624048233032, "losses/total": 0.6729226112365723, "ref_logps/chosen": -45.78932571411133, "ref_logps/rejected": -40.792091369628906, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2546104192733765, "rewards/margins": 0.3005130887031555, "rewards/rejected": -1.5551234483718872, "step": 898 }, { "epoch": 0.85, "grad_norm": 23.461875915527344, "learning_rate": 3.983910458202168e-07, "logps/chosen": -57.271263122558594, "logps/rejected": -62.131683349609375, "loss": 0.77, "losses/dpo": 1.2522003650665283, "losses/sft": 1.9639540910720825, "losses/total": 1.2522003650665283, "ref_logps/chosen": -43.60448455810547, "ref_logps/rejected": -45.90069580078125, "rewards/accuracies": 0.625, "rewards/chosen": -1.3666772842407227, "rewards/margins": 0.2564208507537842, "rewards/rejected": -1.623098373413086, "step": 899 }, { "epoch": 0.85, "grad_norm": 15.951128959655762, "learning_rate": 3.9821615949632734e-07, "logps/chosen": -46.20152282714844, "logps/rejected": -68.09762573242188, "loss": 0.4218, "losses/dpo": 0.5306084156036377, "losses/sft": 1.9610953330993652, "losses/total": 0.5306084156036377, "ref_logps/chosen": -33.677608489990234, "ref_logps/rejected": -47.117156982421875, "rewards/accuracies": 0.875, "rewards/chosen": -1.2523912191390991, "rewards/margins": 0.8456557393074036, "rewards/rejected": -2.0980470180511475, "step": 900 }, { "epoch": 0.85, "grad_norm": 25.189109802246094, "learning_rate": 3.980412731724379e-07, "logps/chosen": -56.62052917480469, "logps/rejected": -50.940757751464844, "loss": 0.6881, "losses/dpo": 0.4470304250717163, "losses/sft": 1.9205844402313232, "losses/total": 0.4470304250717163, "ref_logps/chosen": -43.021026611328125, "ref_logps/rejected": -35.41661071777344, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3599497079849243, "rewards/margins": 0.19246482849121094, "rewards/rejected": -1.5524145364761353, "step": 901 }, { "epoch": 0.85, "grad_norm": 20.101633071899414, "learning_rate": 3.9786638684854847e-07, "logps/chosen": -48.788063049316406, "logps/rejected": -61.658782958984375, "loss": 0.6498, "losses/dpo": 0.6262995004653931, "losses/sft": 1.9294172525405884, "losses/total": 0.6262995004653931, "ref_logps/chosen": -35.97642135620117, "ref_logps/rejected": -46.43687057495117, "rewards/accuracies": 0.5, "rewards/chosen": -1.2811641693115234, "rewards/margins": 0.24102668464183807, "rewards/rejected": -1.522190809249878, "step": 902 }, { "epoch": 0.85, "grad_norm": 20.468608856201172, "learning_rate": 3.9769150052465893e-07, "logps/chosen": -51.07587814331055, "logps/rejected": -60.33686065673828, "loss": 0.5267, "losses/dpo": 0.39729684591293335, "losses/sft": 1.6105037927627563, "losses/total": 0.39729684591293335, "ref_logps/chosen": -40.25775909423828, "ref_logps/rejected": -42.918670654296875, "rewards/accuracies": 0.625, "rewards/chosen": -1.0818116664886475, "rewards/margins": 0.6600068807601929, "rewards/rejected": -1.7418184280395508, "step": 903 }, { "epoch": 0.85, "grad_norm": 21.925613403320312, "learning_rate": 3.975166142007695e-07, "logps/chosen": -57.73406219482422, "logps/rejected": -67.70793151855469, "loss": 0.5483, "losses/dpo": 0.4529944360256195, "losses/sft": 1.730518102645874, "losses/total": 0.4529944360256195, "ref_logps/chosen": -46.38959884643555, "ref_logps/rejected": -50.24241638183594, "rewards/accuracies": 0.75, "rewards/chosen": -1.134446144104004, "rewards/margins": 0.6121060848236084, "rewards/rejected": -1.7465522289276123, "step": 904 }, { "epoch": 0.85, "grad_norm": 27.211524963378906, "learning_rate": 3.9734172787688e-07, "logps/chosen": -55.615142822265625, "logps/rejected": -52.123146057128906, "loss": 0.793, "losses/dpo": 0.9936805963516235, "losses/sft": 1.4861667156219482, "losses/total": 0.9936805963516235, "ref_logps/chosen": -42.5081787109375, "ref_logps/rejected": -39.06757354736328, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3106964826583862, "rewards/margins": -0.005139090120792389, "rewards/rejected": -1.305557370185852, "step": 905 }, { "epoch": 0.86, "grad_norm": 25.954444885253906, "learning_rate": 3.971668415529905e-07, "logps/chosen": -49.63874053955078, "logps/rejected": -64.95807647705078, "loss": 0.6194, "losses/dpo": 0.8540397882461548, "losses/sft": 1.63751220703125, "losses/total": 0.8540397882461548, "ref_logps/chosen": -34.46154022216797, "ref_logps/rejected": -45.160213470458984, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5177199840545654, "rewards/margins": 0.46206629276275635, "rewards/rejected": -1.9797862768173218, "step": 906 }, { "epoch": 0.86, "grad_norm": 22.458322525024414, "learning_rate": 3.969919552291011e-07, "logps/chosen": -55.3083610534668, "logps/rejected": -77.94244384765625, "loss": 0.6459, "losses/dpo": 0.540063202381134, "losses/sft": 2.054966449737549, "losses/total": 0.540063202381134, "ref_logps/chosen": -38.51998519897461, "ref_logps/rejected": -57.277915954589844, "rewards/accuracies": 0.625, "rewards/chosen": -1.678837537765503, "rewards/margins": 0.387615442276001, "rewards/rejected": -2.066452980041504, "step": 907 }, { "epoch": 0.86, "grad_norm": 24.14090919494629, "learning_rate": 3.968170689052116e-07, "logps/chosen": -58.927574157714844, "logps/rejected": -65.96034240722656, "loss": 0.6056, "losses/dpo": 0.9082849621772766, "losses/sft": 2.2859010696411133, "losses/total": 0.9082849621772766, "ref_logps/chosen": -46.313568115234375, "ref_logps/rejected": -48.550933837890625, "rewards/accuracies": 0.625, "rewards/chosen": -1.2614003419876099, "rewards/margins": 0.4795409142971039, "rewards/rejected": -1.7409412860870361, "step": 908 }, { "epoch": 0.86, "grad_norm": 14.95252799987793, "learning_rate": 3.9664218258132216e-07, "logps/chosen": -43.511505126953125, "logps/rejected": -58.01011276245117, "loss": 0.4388, "losses/dpo": 0.43878018856048584, "losses/sft": 1.9378046989440918, "losses/total": 0.43878018856048584, "ref_logps/chosen": -31.892057418823242, "ref_logps/rejected": -38.38436508178711, "rewards/accuracies": 0.875, "rewards/chosen": -1.16194486618042, "rewards/margins": 0.8006298542022705, "rewards/rejected": -1.9625747203826904, "step": 909 }, { "epoch": 0.86, "grad_norm": 20.923730850219727, "learning_rate": 3.964672962574326e-07, "logps/chosen": -52.31150436401367, "logps/rejected": -64.87698364257812, "loss": 0.535, "losses/dpo": 0.854328989982605, "losses/sft": 2.2774758338928223, "losses/total": 0.854328989982605, "ref_logps/chosen": -43.05564880371094, "ref_logps/rejected": -49.296844482421875, "rewards/accuracies": 0.75, "rewards/chosen": -0.9255850315093994, "rewards/margins": 0.6324282288551331, "rewards/rejected": -1.5580133199691772, "step": 910 }, { "epoch": 0.86, "grad_norm": 20.61965560913086, "learning_rate": 3.962924099335432e-07, "logps/chosen": -48.34567642211914, "logps/rejected": -71.5721664428711, "loss": 0.5809, "losses/dpo": 0.21209962666034698, "losses/sft": 1.9302908182144165, "losses/total": 0.21209962666034698, "ref_logps/chosen": -33.894256591796875, "ref_logps/rejected": -51.49092102050781, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4451422691345215, "rewards/margins": 0.5629817247390747, "rewards/rejected": -2.0081241130828857, "step": 911 }, { "epoch": 0.86, "grad_norm": 14.768598556518555, "learning_rate": 3.961175236096537e-07, "logps/chosen": -52.110084533691406, "logps/rejected": -69.14460754394531, "loss": 0.2961, "losses/dpo": 0.40846747159957886, "losses/sft": 1.9573677778244019, "losses/total": 0.40846747159957886, "ref_logps/chosen": -41.841468811035156, "ref_logps/rejected": -46.12614440917969, "rewards/accuracies": 1.0, "rewards/chosen": -1.0268621444702148, "rewards/margins": 1.2749836444854736, "rewards/rejected": -2.3018460273742676, "step": 912 }, { "epoch": 0.86, "grad_norm": 20.606040954589844, "learning_rate": 3.959426372857642e-07, "logps/chosen": -56.27659225463867, "logps/rejected": -56.47953796386719, "loss": 0.6488, "losses/dpo": 0.5586219429969788, "losses/sft": 1.7982591390609741, "losses/total": 0.5586219429969788, "ref_logps/chosen": -44.78296661376953, "ref_logps/rejected": -39.60145568847656, "rewards/accuracies": 0.5, "rewards/chosen": -1.149362564086914, "rewards/margins": 0.538446307182312, "rewards/rejected": -1.6878087520599365, "step": 913 }, { "epoch": 0.86, "grad_norm": 18.362518310546875, "learning_rate": 3.957677509618748e-07, "logps/chosen": -40.653472900390625, "logps/rejected": -60.30322265625, "loss": 0.4946, "losses/dpo": 0.6199674606323242, "losses/sft": 1.369446873664856, "losses/total": 0.6199674606323242, "ref_logps/chosen": -32.407623291015625, "ref_logps/rejected": -46.03407669067383, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8245849609375, "rewards/margins": 0.6023292541503906, "rewards/rejected": -1.4269142150878906, "step": 914 }, { "epoch": 0.86, "grad_norm": 17.280731201171875, "learning_rate": 3.955928646379853e-07, "logps/chosen": -48.192405700683594, "logps/rejected": -68.83238220214844, "loss": 0.418, "losses/dpo": 0.2858433723449707, "losses/sft": 1.3345253467559814, "losses/total": 0.2858433723449707, "ref_logps/chosen": -39.491615295410156, "ref_logps/rejected": -49.456016540527344, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8700789213180542, "rewards/margins": 1.0675580501556396, "rewards/rejected": -1.9376369714736938, "step": 915 }, { "epoch": 0.86, "grad_norm": 22.392044067382812, "learning_rate": 3.9541797831409586e-07, "logps/chosen": -45.73032760620117, "logps/rejected": -56.69221878051758, "loss": 0.6681, "losses/dpo": 0.5759015083312988, "losses/sft": 1.6123839616775513, "losses/total": 0.5759015083312988, "ref_logps/chosen": -32.025428771972656, "ref_logps/rejected": -39.44956588745117, "rewards/accuracies": 0.5, "rewards/chosen": -1.3704901933670044, "rewards/margins": 0.3537752032279968, "rewards/rejected": -1.7242653369903564, "step": 916 }, { "epoch": 0.87, "grad_norm": 29.403060913085938, "learning_rate": 3.952430919902063e-07, "logps/chosen": -69.3777847290039, "logps/rejected": -77.7389144897461, "loss": 0.7287, "losses/dpo": 0.9813758134841919, "losses/sft": 2.0115604400634766, "losses/total": 0.9813758134841919, "ref_logps/chosen": -53.83844757080078, "ref_logps/rejected": -58.71717834472656, "rewards/accuracies": 0.625, "rewards/chosen": -1.5539339780807495, "rewards/margins": 0.348239928483963, "rewards/rejected": -1.9021738767623901, "step": 917 }, { "epoch": 0.87, "grad_norm": 16.38727378845215, "learning_rate": 3.950682056663169e-07, "logps/chosen": -49.663917541503906, "logps/rejected": -66.61436462402344, "loss": 0.4548, "losses/dpo": 0.32091787457466125, "losses/sft": 1.9862086772918701, "losses/total": 0.32091787457466125, "ref_logps/chosen": -35.33073043823242, "ref_logps/rejected": -44.718589782714844, "rewards/accuracies": 0.75, "rewards/chosen": -1.4333181381225586, "rewards/margins": 0.7562595009803772, "rewards/rejected": -2.18957781791687, "step": 918 }, { "epoch": 0.87, "grad_norm": 14.43567180633545, "learning_rate": 3.948933193424274e-07, "logps/chosen": -47.30598449707031, "logps/rejected": -78.9987564086914, "loss": 0.3346, "losses/dpo": 0.41614753007888794, "losses/sft": 1.6911240816116333, "losses/total": 0.41614753007888794, "ref_logps/chosen": -35.60873031616211, "ref_logps/rejected": -55.5838508605957, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1697256565093994, "rewards/margins": 1.1717644929885864, "rewards/rejected": -2.3414902687072754, "step": 919 }, { "epoch": 0.87, "grad_norm": 17.731252670288086, "learning_rate": 3.947184330185379e-07, "logps/chosen": -46.636131286621094, "logps/rejected": -52.21043395996094, "loss": 0.558, "losses/dpo": 0.4975965917110443, "losses/sft": 1.7086056470870972, "losses/total": 0.4975965917110443, "ref_logps/chosen": -34.643272399902344, "ref_logps/rejected": -36.218162536621094, "rewards/accuracies": 0.75, "rewards/chosen": -1.1992857456207275, "rewards/margins": 0.3999415934085846, "rewards/rejected": -1.5992274284362793, "step": 920 }, { "epoch": 0.87, "grad_norm": 21.331438064575195, "learning_rate": 3.945435466946485e-07, "logps/chosen": -54.239837646484375, "logps/rejected": -65.28260803222656, "loss": 0.5102, "losses/dpo": 0.35546794533729553, "losses/sft": 1.9883983135223389, "losses/total": 0.35546794533729553, "ref_logps/chosen": -39.257781982421875, "ref_logps/rejected": -44.58476257324219, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4982060194015503, "rewards/margins": 0.5715781450271606, "rewards/rejected": -2.069784164428711, "step": 921 }, { "epoch": 0.87, "grad_norm": 23.890710830688477, "learning_rate": 3.94368660370759e-07, "logps/chosen": -52.246673583984375, "logps/rejected": -57.574310302734375, "loss": 0.6025, "losses/dpo": 0.4584146738052368, "losses/sft": 1.7509024143218994, "losses/total": 0.4584146738052368, "ref_logps/chosen": -39.73377227783203, "ref_logps/rejected": -40.20069885253906, "rewards/accuracies": 0.75, "rewards/chosen": -1.2512900829315186, "rewards/margins": 0.48607081174850464, "rewards/rejected": -1.7373608350753784, "step": 922 }, { "epoch": 0.87, "grad_norm": 24.46379852294922, "learning_rate": 3.9419377404686955e-07, "logps/chosen": -54.13951110839844, "logps/rejected": -62.03705596923828, "loss": 0.7075, "losses/dpo": 0.628227949142456, "losses/sft": 1.8977872133255005, "losses/total": 0.628227949142456, "ref_logps/chosen": -40.45734405517578, "ref_logps/rejected": -42.91888427734375, "rewards/accuracies": 0.625, "rewards/chosen": -1.368216633796692, "rewards/margins": 0.5436007380485535, "rewards/rejected": -1.9118174314498901, "step": 923 }, { "epoch": 0.87, "grad_norm": 16.505361557006836, "learning_rate": 3.9401888772298e-07, "logps/chosen": -43.91877746582031, "logps/rejected": -57.58205795288086, "loss": 0.415, "losses/dpo": 0.5588536858558655, "losses/sft": 1.7603886127471924, "losses/total": 0.5588536858558655, "ref_logps/chosen": -33.67098617553711, "ref_logps/rejected": -37.86896514892578, "rewards/accuracies": 0.875, "rewards/chosen": -1.0247793197631836, "rewards/margins": 0.9465299844741821, "rewards/rejected": -1.9713091850280762, "step": 924 }, { "epoch": 0.87, "grad_norm": 19.35439109802246, "learning_rate": 3.938440013990906e-07, "logps/chosen": -46.69012451171875, "logps/rejected": -71.21768951416016, "loss": 0.4487, "losses/dpo": 0.17130029201507568, "losses/sft": 1.5461541414260864, "losses/total": 0.17130029201507568, "ref_logps/chosen": -36.7366943359375, "ref_logps/rejected": -50.868370056152344, "rewards/accuracies": 0.75, "rewards/chosen": -0.9953426718711853, "rewards/margins": 1.0395894050598145, "rewards/rejected": -2.0349321365356445, "step": 925 }, { "epoch": 0.87, "grad_norm": 19.393033981323242, "learning_rate": 3.9366911507520114e-07, "logps/chosen": -46.82405090332031, "logps/rejected": -58.96955871582031, "loss": 0.5752, "losses/dpo": 0.4874016046524048, "losses/sft": 2.263362407684326, "losses/total": 0.4874016046524048, "ref_logps/chosen": -33.57951354980469, "ref_logps/rejected": -39.69049835205078, "rewards/accuracies": 0.75, "rewards/chosen": -1.3244540691375732, "rewards/margins": 0.6034518480300903, "rewards/rejected": -1.9279060363769531, "step": 926 }, { "epoch": 0.88, "grad_norm": 19.086570739746094, "learning_rate": 3.934942287513116e-07, "logps/chosen": -55.59938049316406, "logps/rejected": -62.348541259765625, "loss": 0.4738, "losses/dpo": 0.43744832277297974, "losses/sft": 2.0748870372772217, "losses/total": 0.43744832277297974, "ref_logps/chosen": -42.695716857910156, "ref_logps/rejected": -43.56956481933594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2903660535812378, "rewards/margins": 0.5875317454338074, "rewards/rejected": -1.8778977394104004, "step": 927 }, { "epoch": 0.88, "grad_norm": 30.810121536254883, "learning_rate": 3.9331934242742217e-07, "logps/chosen": -55.43946075439453, "logps/rejected": -57.36415100097656, "loss": 0.9686, "losses/dpo": 0.5155337452888489, "losses/sft": 1.5941705703735352, "losses/total": 0.5155337452888489, "ref_logps/chosen": -38.125701904296875, "ref_logps/rejected": -43.50593566894531, "rewards/accuracies": 0.375, "rewards/chosen": -1.7313754558563232, "rewards/margins": -0.34555378556251526, "rewards/rejected": -1.3858217000961304, "step": 928 }, { "epoch": 0.88, "grad_norm": 19.359027862548828, "learning_rate": 3.931444561035327e-07, "logps/chosen": -48.7660026550293, "logps/rejected": -57.48979949951172, "loss": 0.6006, "losses/dpo": 0.5694717764854431, "losses/sft": 1.886942982673645, "losses/total": 0.5694717764854431, "ref_logps/chosen": -37.55628967285156, "ref_logps/rejected": -41.28240966796875, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1209712028503418, "rewards/margins": 0.49976757168769836, "rewards/rejected": -1.6207387447357178, "step": 929 }, { "epoch": 0.88, "grad_norm": 25.701581954956055, "learning_rate": 3.9296956977964325e-07, "logps/chosen": -53.83027648925781, "logps/rejected": -59.25616455078125, "loss": 0.7557, "losses/dpo": 0.9149075746536255, "losses/sft": 1.8033795356750488, "losses/total": 0.9149075746536255, "ref_logps/chosen": -40.902042388916016, "ref_logps/rejected": -44.212833404541016, "rewards/accuracies": 0.5, "rewards/chosen": -1.2928234338760376, "rewards/margins": 0.21150939166545868, "rewards/rejected": -1.5043327808380127, "step": 930 }, { "epoch": 0.88, "grad_norm": 23.654983520507812, "learning_rate": 3.927946834557537e-07, "logps/chosen": -52.84912109375, "logps/rejected": -58.94500732421875, "loss": 0.6085, "losses/dpo": 0.6101242303848267, "losses/sft": 1.5752911567687988, "losses/total": 0.6101242303848267, "ref_logps/chosen": -38.9780158996582, "ref_logps/rejected": -41.359989166259766, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3871103525161743, "rewards/margins": 0.3713914155960083, "rewards/rejected": -1.7585017681121826, "step": 931 }, { "epoch": 0.88, "grad_norm": 18.34036636352539, "learning_rate": 3.9261979713186427e-07, "logps/chosen": -41.510948181152344, "logps/rejected": -50.07872772216797, "loss": 0.5924, "losses/dpo": 0.41716867685317993, "losses/sft": 1.6984672546386719, "losses/total": 0.41716867685317993, "ref_logps/chosen": -29.266643524169922, "ref_logps/rejected": -34.5325927734375, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2244302034378052, "rewards/margins": 0.33018285036087036, "rewards/rejected": -1.5546129941940308, "step": 932 }, { "epoch": 0.88, "grad_norm": 19.952167510986328, "learning_rate": 3.9244491080797484e-07, "logps/chosen": -47.16590881347656, "logps/rejected": -59.26890563964844, "loss": 0.5872, "losses/dpo": 0.6848894357681274, "losses/sft": 1.7096810340881348, "losses/total": 0.6848894357681274, "ref_logps/chosen": -39.62025833129883, "ref_logps/rejected": -45.42413330078125, "rewards/accuracies": 0.625, "rewards/chosen": -0.7545651793479919, "rewards/margins": 0.6299120783805847, "rewards/rejected": -1.384477138519287, "step": 933 }, { "epoch": 0.88, "grad_norm": 17.6008243560791, "learning_rate": 3.922700244840853e-07, "logps/chosen": -55.84395980834961, "logps/rejected": -68.52776336669922, "loss": 0.4359, "losses/dpo": 0.44452258944511414, "losses/sft": 1.6741211414337158, "losses/total": 0.44452258944511414, "ref_logps/chosen": -43.75857162475586, "ref_logps/rejected": -47.53761291503906, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2085391283035278, "rewards/margins": 0.8904761075973511, "rewards/rejected": -2.099015235900879, "step": 934 }, { "epoch": 0.88, "grad_norm": 24.569114685058594, "learning_rate": 3.9209513816019586e-07, "logps/chosen": -39.32587432861328, "logps/rejected": -51.08375549316406, "loss": 0.6675, "losses/dpo": 0.6287034749984741, "losses/sft": 0.9051434993743896, "losses/total": 0.6287034749984741, "ref_logps/chosen": -29.49465560913086, "ref_logps/rejected": -38.91350555419922, "rewards/accuracies": 0.5, "rewards/chosen": -0.9831218719482422, "rewards/margins": 0.23390275239944458, "rewards/rejected": -1.217024564743042, "step": 935 }, { "epoch": 0.88, "grad_norm": 22.57014274597168, "learning_rate": 3.919202518363064e-07, "logps/chosen": -47.48615264892578, "logps/rejected": -52.418453216552734, "loss": 0.6616, "losses/dpo": 0.6661105155944824, "losses/sft": 1.788139820098877, "losses/total": 0.6661105155944824, "ref_logps/chosen": -38.539764404296875, "ref_logps/rejected": -40.90968322753906, "rewards/accuracies": 0.625, "rewards/chosen": -0.8946388959884644, "rewards/margins": 0.25623804330825806, "rewards/rejected": -1.1508769989013672, "step": 936 }, { "epoch": 0.88, "grad_norm": 31.024436950683594, "learning_rate": 3.9174536551241694e-07, "logps/chosen": -69.43479919433594, "logps/rejected": -67.86477661132812, "loss": 0.8196, "losses/dpo": 0.806796133518219, "losses/sft": 2.2336392402648926, "losses/total": 0.806796133518219, "ref_logps/chosen": -52.997554779052734, "ref_logps/rejected": -51.07181167602539, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6437244415283203, "rewards/margins": 0.03557199239730835, "rewards/rejected": -1.6792964935302734, "step": 937 }, { "epoch": 0.89, "grad_norm": 18.274883270263672, "learning_rate": 3.915704791885274e-07, "logps/chosen": -33.531494140625, "logps/rejected": -50.1838493347168, "loss": 0.5802, "losses/dpo": 0.5538694262504578, "losses/sft": 1.8189845085144043, "losses/total": 0.5538694262504578, "ref_logps/chosen": -22.51466178894043, "ref_logps/rejected": -34.66609191894531, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1016830205917358, "rewards/margins": 0.45009273290634155, "rewards/rejected": -1.5517758131027222, "step": 938 }, { "epoch": 0.89, "grad_norm": 18.459444046020508, "learning_rate": 3.9139559286463797e-07, "logps/chosen": -45.984439849853516, "logps/rejected": -63.44975280761719, "loss": 0.4937, "losses/dpo": 0.4282464385032654, "losses/sft": 1.753808617591858, "losses/total": 0.4282464385032654, "ref_logps/chosen": -34.085784912109375, "ref_logps/rejected": -45.0334358215332, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1898653507232666, "rewards/margins": 0.651766300201416, "rewards/rejected": -1.8416316509246826, "step": 939 }, { "epoch": 0.89, "grad_norm": 22.217809677124023, "learning_rate": 3.9122070654074853e-07, "logps/chosen": -36.222145080566406, "logps/rejected": -54.83385467529297, "loss": 0.6612, "losses/dpo": 0.46967020630836487, "losses/sft": 1.7441169023513794, "losses/total": 0.46967020630836487, "ref_logps/chosen": -27.031352996826172, "ref_logps/rejected": -42.950531005859375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9190797805786133, "rewards/margins": 0.26925280690193176, "rewards/rejected": -1.1883325576782227, "step": 940 }, { "epoch": 0.89, "grad_norm": 13.500921249389648, "learning_rate": 3.91045820216859e-07, "logps/chosen": -44.33881759643555, "logps/rejected": -52.58893585205078, "loss": 0.4083, "losses/dpo": 0.3898429274559021, "losses/sft": 1.3914735317230225, "losses/total": 0.3898429274559021, "ref_logps/chosen": -35.23923110961914, "ref_logps/rejected": -34.500614166259766, "rewards/accuracies": 0.75, "rewards/chosen": -0.9099588990211487, "rewards/margins": 0.8988734483718872, "rewards/rejected": -1.8088324069976807, "step": 941 }, { "epoch": 0.89, "grad_norm": 19.755144119262695, "learning_rate": 3.9087093389296956e-07, "logps/chosen": -42.63121795654297, "logps/rejected": -56.15619659423828, "loss": 0.5452, "losses/dpo": 0.38568371534347534, "losses/sft": 1.2724207639694214, "losses/total": 0.38568371534347534, "ref_logps/chosen": -34.555423736572266, "ref_logps/rejected": -41.963417053222656, "rewards/accuracies": 0.875, "rewards/chosen": -0.8075796365737915, "rewards/margins": 0.6116986274719238, "rewards/rejected": -1.4192781448364258, "step": 942 }, { "epoch": 0.89, "grad_norm": 21.889968872070312, "learning_rate": 3.9069604756908007e-07, "logps/chosen": -49.28250503540039, "logps/rejected": -55.694854736328125, "loss": 0.6037, "losses/dpo": 0.5128437280654907, "losses/sft": 1.5409718751907349, "losses/total": 0.5128437280654907, "ref_logps/chosen": -38.92012023925781, "ref_logps/rejected": -40.67806625366211, "rewards/accuracies": 0.6875, "rewards/chosen": -1.036238193511963, "rewards/margins": 0.4654407501220703, "rewards/rejected": -1.5016790628433228, "step": 943 }, { "epoch": 0.89, "grad_norm": 23.30292510986328, "learning_rate": 3.9052116124519064e-07, "logps/chosen": -56.98743438720703, "logps/rejected": -84.62454223632812, "loss": 0.569, "losses/dpo": 0.30215567350387573, "losses/sft": 1.9144946336746216, "losses/total": 0.30215567350387573, "ref_logps/chosen": -41.867645263671875, "ref_logps/rejected": -64.49409484863281, "rewards/accuracies": 0.625, "rewards/chosen": -1.511979103088379, "rewards/margins": 0.501066267490387, "rewards/rejected": -2.013045310974121, "step": 944 }, { "epoch": 0.89, "grad_norm": 21.492578506469727, "learning_rate": 3.9034627492130115e-07, "logps/chosen": -52.53728485107422, "logps/rejected": -56.453453063964844, "loss": 0.6, "losses/dpo": 0.6856805086135864, "losses/sft": 1.5601537227630615, "losses/total": 0.6856805086135864, "ref_logps/chosen": -43.88945007324219, "ref_logps/rejected": -42.156883239746094, "rewards/accuracies": 0.625, "rewards/chosen": -0.8647836446762085, "rewards/margins": 0.5648730993270874, "rewards/rejected": -1.4296568632125854, "step": 945 }, { "epoch": 0.89, "grad_norm": 22.729167938232422, "learning_rate": 3.9017138859741166e-07, "logps/chosen": -61.96066665649414, "logps/rejected": -62.0217170715332, "loss": 0.5765, "losses/dpo": 0.34722572565078735, "losses/sft": 1.393270492553711, "losses/total": 0.34722572565078735, "ref_logps/chosen": -50.60178756713867, "ref_logps/rejected": -45.35589599609375, "rewards/accuracies": 0.6875, "rewards/chosen": -1.135887622833252, "rewards/margins": 0.5306945443153381, "rewards/rejected": -1.6665821075439453, "step": 946 }, { "epoch": 0.89, "grad_norm": 18.95105743408203, "learning_rate": 3.8999650227352223e-07, "logps/chosen": -40.4675178527832, "logps/rejected": -50.58290100097656, "loss": 0.5595, "losses/dpo": 0.8041644096374512, "losses/sft": 1.6902856826782227, "losses/total": 0.8041644096374512, "ref_logps/chosen": -31.34231185913086, "ref_logps/rejected": -34.31023406982422, "rewards/accuracies": 0.75, "rewards/chosen": -0.9125205278396606, "rewards/margins": 0.7147461771965027, "rewards/rejected": -1.627266764640808, "step": 947 }, { "epoch": 0.9, "grad_norm": 19.459688186645508, "learning_rate": 3.898216159496327e-07, "logps/chosen": -31.382644653320312, "logps/rejected": -36.57513427734375, "loss": 0.6887, "losses/dpo": 0.915522038936615, "losses/sft": 1.5414443016052246, "losses/total": 0.915522038936615, "ref_logps/chosen": -25.55060577392578, "ref_logps/rejected": -29.547283172607422, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5832041501998901, "rewards/margins": 0.11958077549934387, "rewards/rejected": -0.7027848958969116, "step": 948 }, { "epoch": 0.9, "grad_norm": 15.459049224853516, "learning_rate": 3.8964672962574325e-07, "logps/chosen": -35.39344024658203, "logps/rejected": -43.9502067565918, "loss": 0.4462, "losses/dpo": 0.3306221067905426, "losses/sft": 1.1747256517410278, "losses/total": 0.3306221067905426, "ref_logps/chosen": -30.467212677001953, "ref_logps/rejected": -30.87660026550293, "rewards/accuracies": 0.8125, "rewards/chosen": -0.49262285232543945, "rewards/margins": 0.8147377967834473, "rewards/rejected": -1.3073606491088867, "step": 949 }, { "epoch": 0.9, "grad_norm": 15.679216384887695, "learning_rate": 3.8947184330185376e-07, "logps/chosen": -43.178802490234375, "logps/rejected": -63.8748893737793, "loss": 0.392, "losses/dpo": 0.3804413378238678, "losses/sft": 2.044457197189331, "losses/total": 0.3804413378238678, "ref_logps/chosen": -33.71409225463867, "ref_logps/rejected": -42.914451599121094, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9464713931083679, "rewards/margins": 1.1495721340179443, "rewards/rejected": -2.096043586730957, "step": 950 }, { "epoch": 0.9, "grad_norm": 24.586549758911133, "learning_rate": 3.8929695697796433e-07, "logps/chosen": -57.81559753417969, "logps/rejected": -66.35832214355469, "loss": 0.7206, "losses/dpo": 1.0126290321350098, "losses/sft": 2.1405298709869385, "losses/total": 1.0126290321350098, "ref_logps/chosen": -43.024879455566406, "ref_logps/rejected": -49.4676513671875, "rewards/accuracies": 0.5, "rewards/chosen": -1.4790716171264648, "rewards/margins": 0.20999547839164734, "rewards/rejected": -1.6890668869018555, "step": 951 }, { "epoch": 0.9, "grad_norm": 17.131330490112305, "learning_rate": 3.8912207065407484e-07, "logps/chosen": -37.56055450439453, "logps/rejected": -64.56658172607422, "loss": 0.4258, "losses/dpo": 0.3325684666633606, "losses/sft": 1.6636767387390137, "losses/total": 0.3325684666633606, "ref_logps/chosen": -28.85824203491211, "ref_logps/rejected": -48.03290939331055, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8702311515808105, "rewards/margins": 0.7831363677978516, "rewards/rejected": -1.653367519378662, "step": 952 }, { "epoch": 0.9, "grad_norm": 15.110618591308594, "learning_rate": 3.8894718433018536e-07, "logps/chosen": -53.59522247314453, "logps/rejected": -67.45619201660156, "loss": 0.4449, "losses/dpo": 0.3363577425479889, "losses/sft": 2.001035213470459, "losses/total": 0.3363577425479889, "ref_logps/chosen": -43.685890197753906, "ref_logps/rejected": -47.81781005859375, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9909330606460571, "rewards/margins": 0.9729050397872925, "rewards/rejected": -1.9638381004333496, "step": 953 }, { "epoch": 0.9, "grad_norm": 20.63409996032715, "learning_rate": 3.887722980062959e-07, "logps/chosen": -45.954010009765625, "logps/rejected": -55.219051361083984, "loss": 0.5064, "losses/dpo": 0.45236095786094666, "losses/sft": 1.484810709953308, "losses/total": 0.45236095786094666, "ref_logps/chosen": -39.28350830078125, "ref_logps/rejected": -41.63930892944336, "rewards/accuracies": 0.75, "rewards/chosen": -0.6670501828193665, "rewards/margins": 0.6909241676330566, "rewards/rejected": -1.3579744100570679, "step": 954 }, { "epoch": 0.9, "grad_norm": 18.03986930847168, "learning_rate": 3.885974116824064e-07, "logps/chosen": -35.99790573120117, "logps/rejected": -55.131629943847656, "loss": 0.5489, "losses/dpo": 0.3931209444999695, "losses/sft": 1.5018911361694336, "losses/total": 0.3931209444999695, "ref_logps/chosen": -27.373233795166016, "ref_logps/rejected": -41.234100341796875, "rewards/accuracies": 0.625, "rewards/chosen": -0.8624672889709473, "rewards/margins": 0.5272859334945679, "rewards/rejected": -1.3897532224655151, "step": 955 }, { "epoch": 0.9, "grad_norm": 16.459518432617188, "learning_rate": 3.8842252535851695e-07, "logps/chosen": -49.29063415527344, "logps/rejected": -71.05470275878906, "loss": 0.3506, "losses/dpo": 0.25867295265197754, "losses/sft": 1.7450703382492065, "losses/total": 0.25867295265197754, "ref_logps/chosen": -39.947410583496094, "ref_logps/rejected": -51.3934326171875, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9343230128288269, "rewards/margins": 1.0318034887313843, "rewards/rejected": -1.966126561164856, "step": 956 }, { "epoch": 0.9, "grad_norm": 21.14252281188965, "learning_rate": 3.8824763903462746e-07, "logps/chosen": -41.167327880859375, "logps/rejected": -56.70052719116211, "loss": 0.6161, "losses/dpo": 0.24370966851711273, "losses/sft": 1.8089507818222046, "losses/total": 0.24370966851711273, "ref_logps/chosen": -29.911165237426758, "ref_logps/rejected": -40.54180908203125, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1256160736083984, "rewards/margins": 0.49025583267211914, "rewards/rejected": -1.6158719062805176, "step": 957 }, { "epoch": 0.9, "grad_norm": 15.646565437316895, "learning_rate": 3.88072752710738e-07, "logps/chosen": -43.166893005371094, "logps/rejected": -63.51106262207031, "loss": 0.4196, "losses/dpo": 0.31831222772598267, "losses/sft": 2.056562900543213, "losses/total": 0.31831222772598267, "ref_logps/chosen": -34.48486328125, "ref_logps/rejected": -44.45180130004883, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8682032823562622, "rewards/margins": 1.0377230644226074, "rewards/rejected": -1.9059263467788696, "step": 958 }, { "epoch": 0.91, "grad_norm": 23.05276870727539, "learning_rate": 3.8789786638684854e-07, "logps/chosen": -46.186981201171875, "logps/rejected": -56.274253845214844, "loss": 0.7001, "losses/dpo": 0.8616746664047241, "losses/sft": 1.8922234773635864, "losses/total": 0.8616746664047241, "ref_logps/chosen": -35.46388244628906, "ref_logps/rejected": -42.425453186035156, "rewards/accuracies": 0.5625, "rewards/chosen": -1.072309970855713, "rewards/margins": 0.3125705420970917, "rewards/rejected": -1.384880542755127, "step": 959 }, { "epoch": 0.91, "grad_norm": 28.662580490112305, "learning_rate": 3.8772298006295905e-07, "logps/chosen": -76.91574096679688, "logps/rejected": -69.48300170898438, "loss": 0.6311, "losses/dpo": 0.9258673787117004, "losses/sft": 2.213568687438965, "losses/total": 0.9258673787117004, "ref_logps/chosen": -63.26716613769531, "ref_logps/rejected": -49.8331413269043, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3648579120635986, "rewards/margins": 0.6001282334327698, "rewards/rejected": -1.9649860858917236, "step": 960 }, { "epoch": 0.91, "grad_norm": 22.746126174926758, "learning_rate": 3.875480937390696e-07, "logps/chosen": -50.369815826416016, "logps/rejected": -50.867897033691406, "loss": 0.6593, "losses/dpo": 0.4779524505138397, "losses/sft": 1.7878148555755615, "losses/total": 0.4779524505138397, "ref_logps/chosen": -38.644439697265625, "ref_logps/rejected": -36.70579147338867, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1725378036499023, "rewards/margins": 0.2436729520559311, "rewards/rejected": -1.4162108898162842, "step": 961 }, { "epoch": 0.91, "grad_norm": 19.56867218017578, "learning_rate": 3.873732074151801e-07, "logps/chosen": -42.459083557128906, "logps/rejected": -56.513710021972656, "loss": 0.6029, "losses/dpo": 0.6437307596206665, "losses/sft": 1.8656847476959229, "losses/total": 0.6437307596206665, "ref_logps/chosen": -31.495113372802734, "ref_logps/rejected": -39.55369186401367, "rewards/accuracies": 0.5, "rewards/chosen": -1.096397042274475, "rewards/margins": 0.5996049046516418, "rewards/rejected": -1.6960020065307617, "step": 962 }, { "epoch": 0.91, "grad_norm": 19.292837142944336, "learning_rate": 3.8719832109129064e-07, "logps/chosen": -50.66706466674805, "logps/rejected": -64.09747314453125, "loss": 0.491, "losses/dpo": 0.49953269958496094, "losses/sft": 2.0225589275360107, "losses/total": 0.49953269958496094, "ref_logps/chosen": -39.68737030029297, "ref_logps/rejected": -46.53889465332031, "rewards/accuracies": 0.75, "rewards/chosen": -1.0979692935943604, "rewards/margins": 0.6578893661499023, "rewards/rejected": -1.7558586597442627, "step": 963 }, { "epoch": 0.91, "grad_norm": 26.163936614990234, "learning_rate": 3.870234347674012e-07, "logps/chosen": -56.905460357666016, "logps/rejected": -64.78118896484375, "loss": 0.6471, "losses/dpo": 0.24652838706970215, "losses/sft": 1.862913966178894, "losses/total": 0.24652838706970215, "ref_logps/chosen": -43.511051177978516, "ref_logps/rejected": -47.04642868041992, "rewards/accuracies": 0.6875, "rewards/chosen": -1.339440941810608, "rewards/margins": 0.43403521180152893, "rewards/rejected": -1.7734761238098145, "step": 964 }, { "epoch": 0.91, "grad_norm": 24.758644104003906, "learning_rate": 3.868485484435117e-07, "logps/chosen": -50.36905288696289, "logps/rejected": -63.91368103027344, "loss": 0.7482, "losses/dpo": 0.9134958982467651, "losses/sft": 1.8413722515106201, "losses/total": 0.9134958982467651, "ref_logps/chosen": -38.325660705566406, "ref_logps/rejected": -49.209144592285156, "rewards/accuracies": 0.625, "rewards/chosen": -1.2043395042419434, "rewards/margins": 0.26611417531967163, "rewards/rejected": -1.4704537391662598, "step": 965 }, { "epoch": 0.91, "grad_norm": 14.799829483032227, "learning_rate": 3.8667366211962223e-07, "logps/chosen": -45.358970642089844, "logps/rejected": -62.32545471191406, "loss": 0.3398, "losses/dpo": 0.22617366909980774, "losses/sft": 1.3016481399536133, "losses/total": 0.22617366909980774, "ref_logps/chosen": -38.82315444946289, "ref_logps/rejected": -43.84934997558594, "rewards/accuracies": 0.9375, "rewards/chosen": -0.653581440448761, "rewards/margins": 1.1940287351608276, "rewards/rejected": -1.8476101160049438, "step": 966 }, { "epoch": 0.91, "grad_norm": 19.053054809570312, "learning_rate": 3.8649877579573274e-07, "logps/chosen": -41.64030456542969, "logps/rejected": -52.60406494140625, "loss": 0.5582, "losses/dpo": 0.8065115213394165, "losses/sft": 1.68719482421875, "losses/total": 0.8065115213394165, "ref_logps/chosen": -31.42445945739746, "ref_logps/rejected": -36.04149627685547, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0215842723846436, "rewards/margins": 0.6346728801727295, "rewards/rejected": -1.6562572717666626, "step": 967 }, { "epoch": 0.91, "grad_norm": 19.083852767944336, "learning_rate": 3.863238894718433e-07, "logps/chosen": -52.436588287353516, "logps/rejected": -74.2242431640625, "loss": 0.4103, "losses/dpo": 0.2037041187286377, "losses/sft": 1.3081860542297363, "losses/total": 0.2037041187286377, "ref_logps/chosen": -43.161067962646484, "ref_logps/rejected": -54.99870300292969, "rewards/accuracies": 0.875, "rewards/chosen": -0.9275518655776978, "rewards/margins": 0.9950025677680969, "rewards/rejected": -1.9225544929504395, "step": 968 }, { "epoch": 0.92, "grad_norm": 22.65544891357422, "learning_rate": 3.8614900314795377e-07, "logps/chosen": -50.19179153442383, "logps/rejected": -69.511474609375, "loss": 0.5592, "losses/dpo": 0.48498445749282837, "losses/sft": 1.9186389446258545, "losses/total": 0.48498445749282837, "ref_logps/chosen": -40.42304229736328, "ref_logps/rejected": -53.426124572753906, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9768748879432678, "rewards/margins": 0.6316596269607544, "rewards/rejected": -1.608534574508667, "step": 969 }, { "epoch": 0.92, "grad_norm": 24.638919830322266, "learning_rate": 3.8597411682406434e-07, "logps/chosen": -51.37481689453125, "logps/rejected": -57.98303985595703, "loss": 0.6773, "losses/dpo": 0.9160447716712952, "losses/sft": 1.5926841497421265, "losses/total": 0.9160447716712952, "ref_logps/chosen": -39.58479309082031, "ref_logps/rejected": -40.19683837890625, "rewards/accuracies": 0.75, "rewards/chosen": -1.179002285003662, "rewards/margins": 0.5996181964874268, "rewards/rejected": -1.7786204814910889, "step": 970 }, { "epoch": 0.92, "grad_norm": 19.828022003173828, "learning_rate": 3.857992305001749e-07, "logps/chosen": -40.58400344848633, "logps/rejected": -59.37562561035156, "loss": 0.5543, "losses/dpo": 0.495002806186676, "losses/sft": 1.6763334274291992, "losses/total": 0.495002806186676, "ref_logps/chosen": -31.079570770263672, "ref_logps/rejected": -43.05794906616211, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9504432678222656, "rewards/margins": 0.6813246607780457, "rewards/rejected": -1.631767988204956, "step": 971 }, { "epoch": 0.92, "grad_norm": 23.73946189880371, "learning_rate": 3.856243441762854e-07, "logps/chosen": -54.873634338378906, "logps/rejected": -51.76392364501953, "loss": 0.7246, "losses/dpo": 0.5085068941116333, "losses/sft": 1.5580775737762451, "losses/total": 0.5085068941116333, "ref_logps/chosen": -45.79169845581055, "ref_logps/rejected": -40.69203186035156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.908193826675415, "rewards/margins": 0.19899529218673706, "rewards/rejected": -1.1071891784667969, "step": 972 }, { "epoch": 0.92, "grad_norm": 18.36198616027832, "learning_rate": 3.854494578523959e-07, "logps/chosen": -50.29176330566406, "logps/rejected": -57.004920959472656, "loss": 0.5245, "losses/dpo": 0.6440709829330444, "losses/sft": 1.832862377166748, "losses/total": 0.6440709829330444, "ref_logps/chosen": -40.84815979003906, "ref_logps/rejected": -41.187034606933594, "rewards/accuracies": 0.75, "rewards/chosen": -0.9443599581718445, "rewards/margins": 0.6374289393424988, "rewards/rejected": -1.5817888975143433, "step": 973 }, { "epoch": 0.92, "grad_norm": 18.88251304626465, "learning_rate": 3.8527457152850644e-07, "logps/chosen": -43.22986602783203, "logps/rejected": -67.54998779296875, "loss": 0.5005, "losses/dpo": 0.561363697052002, "losses/sft": 1.8750559091567993, "losses/total": 0.561363697052002, "ref_logps/chosen": -33.15339279174805, "ref_logps/rejected": -50.49925994873047, "rewards/accuracies": 0.75, "rewards/chosen": -1.0076472759246826, "rewards/margins": 0.6974254250526428, "rewards/rejected": -1.7050727605819702, "step": 974 }, { "epoch": 0.92, "grad_norm": 18.624319076538086, "learning_rate": 3.85099685204617e-07, "logps/chosen": -45.34701919555664, "logps/rejected": -69.48483276367188, "loss": 0.4595, "losses/dpo": 0.7396693229675293, "losses/sft": 1.9366568326950073, "losses/total": 0.7396693229675293, "ref_logps/chosen": -36.88391876220703, "ref_logps/rejected": -52.47506332397461, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8463100790977478, "rewards/margins": 0.8546671867370605, "rewards/rejected": -1.7009772062301636, "step": 975 }, { "epoch": 0.92, "grad_norm": 23.694549560546875, "learning_rate": 3.8492479888072746e-07, "logps/chosen": -47.048587799072266, "logps/rejected": -56.86940383911133, "loss": 0.7133, "losses/dpo": 0.9186302423477173, "losses/sft": 1.9295350313186646, "losses/total": 0.9186302423477173, "ref_logps/chosen": -35.889808654785156, "ref_logps/rejected": -39.932003021240234, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1158777475357056, "rewards/margins": 0.5778622627258301, "rewards/rejected": -1.6937401294708252, "step": 976 }, { "epoch": 0.92, "grad_norm": 17.80436897277832, "learning_rate": 3.8474991255683803e-07, "logps/chosen": -42.967979431152344, "logps/rejected": -70.74673461914062, "loss": 0.4746, "losses/dpo": 0.3472397029399872, "losses/sft": 1.6480947732925415, "losses/total": 0.3472397029399872, "ref_logps/chosen": -35.133766174316406, "ref_logps/rejected": -52.64344024658203, "rewards/accuracies": 0.75, "rewards/chosen": -0.7834211587905884, "rewards/margins": 1.0269087553024292, "rewards/rejected": -1.8103299140930176, "step": 977 }, { "epoch": 0.92, "grad_norm": 20.923574447631836, "learning_rate": 3.845750262329486e-07, "logps/chosen": -52.09071350097656, "logps/rejected": -57.20856475830078, "loss": 0.48, "losses/dpo": 0.2674854099750519, "losses/sft": 1.99339759349823, "losses/total": 0.2674854099750519, "ref_logps/chosen": -42.38469696044922, "ref_logps/rejected": -40.23178482055664, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9706017374992371, "rewards/margins": 0.7270764708518982, "rewards/rejected": -1.6976782083511353, "step": 978 }, { "epoch": 0.92, "grad_norm": 23.62529182434082, "learning_rate": 3.844001399090591e-07, "logps/chosen": -56.328712463378906, "logps/rejected": -58.30531692504883, "loss": 0.6757, "losses/dpo": 0.5385838150978088, "losses/sft": 1.8027693033218384, "losses/total": 0.5385838150978088, "ref_logps/chosen": -42.6113395690918, "ref_logps/rejected": -41.7963752746582, "rewards/accuracies": 0.5, "rewards/chosen": -1.3717374801635742, "rewards/margins": 0.2791568636894226, "rewards/rejected": -1.6508941650390625, "step": 979 }, { "epoch": 0.93, "grad_norm": 18.919498443603516, "learning_rate": 3.842252535851696e-07, "logps/chosen": -48.087867736816406, "logps/rejected": -70.01002502441406, "loss": 0.5324, "losses/dpo": 0.4499993920326233, "losses/sft": 1.245666265487671, "losses/total": 0.4499993920326233, "ref_logps/chosen": -35.94928741455078, "ref_logps/rejected": -51.554725646972656, "rewards/accuracies": 0.75, "rewards/chosen": -1.2138580083847046, "rewards/margins": 0.6316715478897095, "rewards/rejected": -1.845529556274414, "step": 980 }, { "epoch": 0.93, "grad_norm": 19.970478057861328, "learning_rate": 3.8405036726128013e-07, "logps/chosen": -48.89336395263672, "logps/rejected": -60.02153015136719, "loss": 0.4721, "losses/dpo": 0.6085375547409058, "losses/sft": 1.633169412612915, "losses/total": 0.6085375547409058, "ref_logps/chosen": -40.027061462402344, "ref_logps/rejected": -42.2012825012207, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8866301774978638, "rewards/margins": 0.8953946828842163, "rewards/rejected": -1.78202486038208, "step": 981 }, { "epoch": 0.93, "grad_norm": 17.84539222717285, "learning_rate": 3.838754809373907e-07, "logps/chosen": -43.731719970703125, "logps/rejected": -61.472137451171875, "loss": 0.4654, "losses/dpo": 0.5906164646148682, "losses/sft": 1.684012532234192, "losses/total": 0.5906164646148682, "ref_logps/chosen": -33.648414611816406, "ref_logps/rejected": -44.06370544433594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0083309412002563, "rewards/margins": 0.7325129508972168, "rewards/rejected": -1.7408437728881836, "step": 982 }, { "epoch": 0.93, "grad_norm": 26.598716735839844, "learning_rate": 3.8370059461350116e-07, "logps/chosen": -51.013755798339844, "logps/rejected": -52.03260040283203, "loss": 0.8181, "losses/dpo": 0.9581042528152466, "losses/sft": 1.6930867433547974, "losses/total": 0.9581042528152466, "ref_logps/chosen": -37.647823333740234, "ref_logps/rejected": -37.89427185058594, "rewards/accuracies": 0.5, "rewards/chosen": -1.3365932703018188, "rewards/margins": 0.0772397592663765, "rewards/rejected": -1.4138330221176147, "step": 983 }, { "epoch": 0.93, "grad_norm": 21.65802001953125, "learning_rate": 3.835257082896117e-07, "logps/chosen": -51.42059326171875, "logps/rejected": -71.58717346191406, "loss": 0.5062, "losses/dpo": 0.7946414947509766, "losses/sft": 1.9316154718399048, "losses/total": 0.7946414947509766, "ref_logps/chosen": -40.99918746948242, "ref_logps/rejected": -51.60511779785156, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0421407222747803, "rewards/margins": 0.9560640454292297, "rewards/rejected": -1.9982047080993652, "step": 984 }, { "epoch": 0.93, "grad_norm": 16.202625274658203, "learning_rate": 3.833508219657223e-07, "logps/chosen": -39.13245391845703, "logps/rejected": -57.883689880371094, "loss": 0.4267, "losses/dpo": 0.38652050495147705, "losses/sft": 1.6998968124389648, "losses/total": 0.38652050495147705, "ref_logps/chosen": -32.44765090942383, "ref_logps/rejected": -41.958614349365234, "rewards/accuracies": 0.875, "rewards/chosen": -0.6684800386428833, "rewards/margins": 0.9240274429321289, "rewards/rejected": -1.5925076007843018, "step": 985 }, { "epoch": 0.93, "grad_norm": 27.589420318603516, "learning_rate": 3.831759356418328e-07, "logps/chosen": -51.51816940307617, "logps/rejected": -53.463623046875, "loss": 0.6767, "losses/dpo": 0.47124022245407104, "losses/sft": 1.4325437545776367, "losses/total": 0.47124022245407104, "ref_logps/chosen": -39.39977264404297, "ref_logps/rejected": -38.43071365356445, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2118401527404785, "rewards/margins": 0.291450560092926, "rewards/rejected": -1.5032906532287598, "step": 986 }, { "epoch": 0.93, "grad_norm": 21.131973266601562, "learning_rate": 3.830010493179433e-07, "logps/chosen": -45.02501678466797, "logps/rejected": -57.56442642211914, "loss": 0.5345, "losses/dpo": 0.3590014576911926, "losses/sft": 1.4518725872039795, "losses/total": 0.3590014576911926, "ref_logps/chosen": -35.90077209472656, "ref_logps/rejected": -41.12401580810547, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9124248027801514, "rewards/margins": 0.7316160202026367, "rewards/rejected": -1.644040822982788, "step": 987 }, { "epoch": 0.93, "grad_norm": 19.58783531188965, "learning_rate": 3.8282616299405383e-07, "logps/chosen": -48.63311004638672, "logps/rejected": -65.70501708984375, "loss": 0.5203, "losses/dpo": 0.5570815801620483, "losses/sft": 1.673298954963684, "losses/total": 0.5570815801620483, "ref_logps/chosen": -36.90353775024414, "ref_logps/rejected": -45.29994201660156, "rewards/accuracies": 0.6875, "rewards/chosen": -1.172957181930542, "rewards/margins": 0.867550790309906, "rewards/rejected": -2.0405077934265137, "step": 988 }, { "epoch": 0.93, "grad_norm": 29.893646240234375, "learning_rate": 3.826512766701644e-07, "logps/chosen": -53.61981201171875, "logps/rejected": -60.33222198486328, "loss": 0.8667, "losses/dpo": 0.7239810824394226, "losses/sft": 1.856675624847412, "losses/total": 0.7239810824394226, "ref_logps/chosen": -37.53385925292969, "ref_logps/rejected": -42.647804260253906, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6085954904556274, "rewards/margins": 0.1598462462425232, "rewards/rejected": -1.7684417963027954, "step": 989 }, { "epoch": 0.93, "grad_norm": 27.25286293029785, "learning_rate": 3.824763903462749e-07, "logps/chosen": -61.245811462402344, "logps/rejected": -75.42939758300781, "loss": 0.4865, "losses/dpo": 0.39196962118148804, "losses/sft": 1.2838695049285889, "losses/total": 0.39196962118148804, "ref_logps/chosen": -51.34972381591797, "ref_logps/rejected": -55.9708251953125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9896084666252136, "rewards/margins": 0.9562491178512573, "rewards/rejected": -1.9458576440811157, "step": 990 }, { "epoch": 0.94, "grad_norm": 20.264774322509766, "learning_rate": 3.823015040223854e-07, "logps/chosen": -48.33627700805664, "logps/rejected": -54.53680419921875, "loss": 0.6103, "losses/dpo": 0.865106463432312, "losses/sft": 2.1319375038146973, "losses/total": 0.865106463432312, "ref_logps/chosen": -37.25901794433594, "ref_logps/rejected": -38.98707580566406, "rewards/accuracies": 0.75, "rewards/chosen": -1.1077256202697754, "rewards/margins": 0.44724732637405396, "rewards/rejected": -1.5549730062484741, "step": 991 }, { "epoch": 0.94, "grad_norm": 17.77625846862793, "learning_rate": 3.82126617698496e-07, "logps/chosen": -45.6514892578125, "logps/rejected": -70.92652893066406, "loss": 0.4292, "losses/dpo": 0.348925918340683, "losses/sft": 1.6102349758148193, "losses/total": 0.348925918340683, "ref_logps/chosen": -37.006778717041016, "ref_logps/rejected": -54.52971267700195, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8644711375236511, "rewards/margins": 0.7752112150192261, "rewards/rejected": -1.6396822929382324, "step": 992 }, { "epoch": 0.94, "grad_norm": 29.45354461669922, "learning_rate": 3.819517313746065e-07, "logps/chosen": -52.84046936035156, "logps/rejected": -53.68536376953125, "loss": 0.8718, "losses/dpo": 0.8627600073814392, "losses/sft": 1.4657849073410034, "losses/total": 0.8627600073814392, "ref_logps/chosen": -40.24302291870117, "ref_logps/rejected": -42.56493377685547, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2597452402114868, "rewards/margins": -0.14770209789276123, "rewards/rejected": -1.1120431423187256, "step": 993 }, { "epoch": 0.94, "grad_norm": 30.172273635864258, "learning_rate": 3.81776845050717e-07, "logps/chosen": -46.63984680175781, "logps/rejected": -48.94502639770508, "loss": 0.8928, "losses/dpo": 0.9043549299240112, "losses/sft": 1.6924247741699219, "losses/total": 0.9043549299240112, "ref_logps/chosen": -36.92937469482422, "ref_logps/rejected": -39.463645935058594, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9710469841957092, "rewards/margins": -0.022908613085746765, "rewards/rejected": -0.9481383562088013, "step": 994 }, { "epoch": 0.94, "grad_norm": 26.310802459716797, "learning_rate": 3.816019587268275e-07, "logps/chosen": -64.32036590576172, "logps/rejected": -60.466400146484375, "loss": 0.5553, "losses/dpo": 0.43956348299980164, "losses/sft": 1.8819116353988647, "losses/total": 0.43956348299980164, "ref_logps/chosen": -51.93980407714844, "ref_logps/rejected": -41.98312759399414, "rewards/accuracies": 0.625, "rewards/chosen": -1.238055944442749, "rewards/margins": 0.6102713346481323, "rewards/rejected": -1.8483272790908813, "step": 995 }, { "epoch": 0.94, "grad_norm": 22.787324905395508, "learning_rate": 3.814270724029381e-07, "logps/chosen": -55.270965576171875, "logps/rejected": -53.940147399902344, "loss": 0.6598, "losses/dpo": 0.6217118501663208, "losses/sft": 2.482112169265747, "losses/total": 0.6217118501663208, "ref_logps/chosen": -44.208213806152344, "ref_logps/rejected": -39.184974670410156, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1062750816345215, "rewards/margins": 0.36924174427986145, "rewards/rejected": -1.47551691532135, "step": 996 }, { "epoch": 0.94, "grad_norm": 22.396766662597656, "learning_rate": 3.812521860790486e-07, "logps/chosen": -53.92234802246094, "logps/rejected": -63.89082336425781, "loss": 0.5739, "losses/dpo": 0.5646842122077942, "losses/sft": 1.2429379224777222, "losses/total": 0.5646842122077942, "ref_logps/chosen": -43.771934509277344, "ref_logps/rejected": -47.69895553588867, "rewards/accuracies": 0.6875, "rewards/chosen": -1.015041470527649, "rewards/margins": 0.6041450500488281, "rewards/rejected": -1.619186520576477, "step": 997 }, { "epoch": 0.94, "grad_norm": 17.27048683166504, "learning_rate": 3.810772997551591e-07, "logps/chosen": -50.35649871826172, "logps/rejected": -60.818870544433594, "loss": 0.4675, "losses/dpo": 0.3894166350364685, "losses/sft": 2.066436767578125, "losses/total": 0.3894166350364685, "ref_logps/chosen": -45.14312744140625, "ref_logps/rejected": -47.95522689819336, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5213367938995361, "rewards/margins": 0.7650270462036133, "rewards/rejected": -1.2863638401031494, "step": 998 }, { "epoch": 0.94, "grad_norm": 25.125530242919922, "learning_rate": 3.809024134312697e-07, "logps/chosen": -53.18193817138672, "logps/rejected": -58.44088363647461, "loss": 0.6611, "losses/dpo": 0.9224898815155029, "losses/sft": 1.8091856241226196, "losses/total": 0.9224898815155029, "ref_logps/chosen": -42.58659362792969, "ref_logps/rejected": -45.03008270263672, "rewards/accuracies": 0.4375, "rewards/chosen": -1.0595345497131348, "rewards/margins": 0.28154563903808594, "rewards/rejected": -1.3410801887512207, "step": 999 }, { "epoch": 0.94, "grad_norm": 27.306087493896484, "learning_rate": 3.807275271073802e-07, "logps/chosen": -63.88386535644531, "logps/rejected": -59.284210205078125, "loss": 0.7064, "losses/dpo": 0.8276010751724243, "losses/sft": 2.3495540618896484, "losses/total": 0.8276010751724243, "ref_logps/chosen": -52.349456787109375, "ref_logps/rejected": -45.523284912109375, "rewards/accuracies": 0.6875, "rewards/chosen": -1.153441309928894, "rewards/margins": 0.22265127301216125, "rewards/rejected": -1.3760924339294434, "step": 1000 }, { "epoch": 0.95, "grad_norm": 22.246232986450195, "learning_rate": 3.805526407834907e-07, "logps/chosen": -44.66143035888672, "logps/rejected": -55.53178405761719, "loss": 0.6809, "losses/dpo": 1.2138080596923828, "losses/sft": 2.2949931621551514, "losses/total": 1.2138080596923828, "ref_logps/chosen": -34.898231506347656, "ref_logps/rejected": -42.61272430419922, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9763199090957642, "rewards/margins": 0.3155861496925354, "rewards/rejected": -1.2919061183929443, "step": 1001 }, { "epoch": 0.95, "grad_norm": 11.103623390197754, "learning_rate": 3.803777544596012e-07, "logps/chosen": -46.28156661987305, "logps/rejected": -66.62234497070312, "loss": 0.3419, "losses/dpo": 0.46704888343811035, "losses/sft": 1.5207267999649048, "losses/total": 0.46704888343811035, "ref_logps/chosen": -39.715660095214844, "ref_logps/rejected": -48.267433166503906, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6565909385681152, "rewards/margins": 1.1789004802703857, "rewards/rejected": -1.835491418838501, "step": 1002 }, { "epoch": 0.95, "grad_norm": 24.383560180664062, "learning_rate": 3.802028681357118e-07, "logps/chosen": -50.68195343017578, "logps/rejected": -60.60013198852539, "loss": 0.6051, "losses/dpo": 0.4206312298774719, "losses/sft": 1.2067759037017822, "losses/total": 0.4206312298774719, "ref_logps/chosen": -41.747169494628906, "ref_logps/rejected": -46.576988220214844, "rewards/accuracies": 0.75, "rewards/chosen": -0.8934780955314636, "rewards/margins": 0.5088360905647278, "rewards/rejected": -1.4023141860961914, "step": 1003 }, { "epoch": 0.95, "grad_norm": 18.2020320892334, "learning_rate": 3.800279818118223e-07, "logps/chosen": -52.240333557128906, "logps/rejected": -64.44216918945312, "loss": 0.5414, "losses/dpo": 0.6033343076705933, "losses/sft": 1.76661217212677, "losses/total": 0.6033343076705933, "ref_logps/chosen": -41.72374725341797, "ref_logps/rejected": -48.181156158447266, "rewards/accuracies": 0.75, "rewards/chosen": -1.0516587495803833, "rewards/margins": 0.5744425058364868, "rewards/rejected": -1.6261012554168701, "step": 1004 }, { "epoch": 0.95, "grad_norm": 22.256027221679688, "learning_rate": 3.798530954879328e-07, "logps/chosen": -41.30514907836914, "logps/rejected": -46.581298828125, "loss": 0.6013, "losses/dpo": 0.5583899021148682, "losses/sft": 1.2865116596221924, "losses/total": 0.5583899021148682, "ref_logps/chosen": -34.4627571105957, "ref_logps/rejected": -35.83324432373047, "rewards/accuracies": 0.625, "rewards/chosen": -0.6842390894889832, "rewards/margins": 0.3905665874481201, "rewards/rejected": -1.074805736541748, "step": 1005 }, { "epoch": 0.95, "grad_norm": 25.01209259033203, "learning_rate": 3.7967820916404337e-07, "logps/chosen": -52.127925872802734, "logps/rejected": -71.54975891113281, "loss": 0.6054, "losses/dpo": 0.6716561913490295, "losses/sft": 1.758927822113037, "losses/total": 0.6716561913490295, "ref_logps/chosen": -39.68230438232422, "ref_logps/rejected": -53.215599060058594, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2445619106292725, "rewards/margins": 0.5888535976409912, "rewards/rejected": -1.8334155082702637, "step": 1006 }, { "epoch": 0.95, "grad_norm": 18.98240089416504, "learning_rate": 3.795033228401539e-07, "logps/chosen": -49.99851608276367, "logps/rejected": -73.98358917236328, "loss": 0.4659, "losses/dpo": 0.4777738153934479, "losses/sft": 1.444373607635498, "losses/total": 0.4777738153934479, "ref_logps/chosen": -41.225772857666016, "ref_logps/rejected": -57.0179443359375, "rewards/accuracies": 0.75, "rewards/chosen": -0.8772743940353394, "rewards/margins": 0.8192898035049438, "rewards/rejected": -1.6965641975402832, "step": 1007 }, { "epoch": 0.95, "grad_norm": 21.20924949645996, "learning_rate": 3.793284365162644e-07, "logps/chosen": -44.317840576171875, "logps/rejected": -50.83393096923828, "loss": 0.5868, "losses/dpo": 0.6321349143981934, "losses/sft": 1.7833943367004395, "losses/total": 0.6321349143981934, "ref_logps/chosen": -37.92741012573242, "ref_logps/rejected": -40.65975570678711, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6390430927276611, "rewards/margins": 0.37837448716163635, "rewards/rejected": -1.0174176692962646, "step": 1008 }, { "epoch": 0.95, "grad_norm": 17.293670654296875, "learning_rate": 3.7915355019237496e-07, "logps/chosen": -49.412513732910156, "logps/rejected": -69.95012664794922, "loss": 0.4331, "losses/dpo": 0.3595442771911621, "losses/sft": 1.9984853267669678, "losses/total": 0.3595442771911621, "ref_logps/chosen": -42.101043701171875, "ref_logps/rejected": -54.27330017089844, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7311465740203857, "rewards/margins": 0.8365360498428345, "rewards/rejected": -1.5676827430725098, "step": 1009 }, { "epoch": 0.95, "grad_norm": 19.22477149963379, "learning_rate": 3.789786638684855e-07, "logps/chosen": -47.13673400878906, "logps/rejected": -54.32954025268555, "loss": 0.5807, "losses/dpo": 0.6841297149658203, "losses/sft": 1.992552399635315, "losses/total": 0.6841297149658203, "ref_logps/chosen": -38.553260803222656, "ref_logps/rejected": -41.32449722290039, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8583472967147827, "rewards/margins": 0.4421570301055908, "rewards/rejected": -1.300504446029663, "step": 1010 }, { "epoch": 0.95, "grad_norm": 24.354263305664062, "learning_rate": 3.78803777544596e-07, "logps/chosen": -47.76940155029297, "logps/rejected": -51.45342254638672, "loss": 0.6668, "losses/dpo": 0.5718290209770203, "losses/sft": 1.7609448432922363, "losses/total": 0.5718290209770203, "ref_logps/chosen": -37.937049865722656, "ref_logps/rejected": -38.792938232421875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9832356572151184, "rewards/margins": 0.2828129529953003, "rewards/rejected": -1.266048550605774, "step": 1011 }, { "epoch": 0.96, "grad_norm": 23.792476654052734, "learning_rate": 3.786288912207065e-07, "logps/chosen": -50.83463668823242, "logps/rejected": -60.22372817993164, "loss": 0.6897, "losses/dpo": 0.6477515697479248, "losses/sft": 1.191267967224121, "losses/total": 0.6477515697479248, "ref_logps/chosen": -41.08557891845703, "ref_logps/rejected": -47.36979675292969, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9749058485031128, "rewards/margins": 0.3104873597621918, "rewards/rejected": -1.285393238067627, "step": 1012 }, { "epoch": 0.96, "grad_norm": 21.003948211669922, "learning_rate": 3.7845400489681707e-07, "logps/chosen": -56.907066345214844, "logps/rejected": -54.59217834472656, "loss": 0.7273, "losses/dpo": 0.49638450145721436, "losses/sft": 1.5870550870895386, "losses/total": 0.49638450145721436, "ref_logps/chosen": -44.87751770019531, "ref_logps/rejected": -39.635398864746094, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2029545307159424, "rewards/margins": 0.29272323846817017, "rewards/rejected": -1.4956775903701782, "step": 1013 }, { "epoch": 0.96, "grad_norm": 21.156476974487305, "learning_rate": 3.782791185729276e-07, "logps/chosen": -45.71857833862305, "logps/rejected": -52.760101318359375, "loss": 0.5529, "losses/dpo": 0.516573965549469, "losses/sft": 1.537147045135498, "losses/total": 0.516573965549469, "ref_logps/chosen": -36.65092468261719, "ref_logps/rejected": -38.323673248291016, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9067651033401489, "rewards/margins": 0.5368777513504028, "rewards/rejected": -1.4436429738998413, "step": 1014 }, { "epoch": 0.96, "grad_norm": 18.79005241394043, "learning_rate": 3.781042322490381e-07, "logps/chosen": -59.44023132324219, "logps/rejected": -81.28251647949219, "loss": 0.4426, "losses/dpo": 0.44827088713645935, "losses/sft": 1.653613805770874, "losses/total": 0.44827088713645935, "ref_logps/chosen": -49.774696350097656, "ref_logps/rejected": -62.330841064453125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9665535688400269, "rewards/margins": 0.9286140203475952, "rewards/rejected": -1.895167589187622, "step": 1015 }, { "epoch": 0.96, "grad_norm": 22.976865768432617, "learning_rate": 3.7792934592514866e-07, "logps/chosen": -45.89210510253906, "logps/rejected": -41.85190200805664, "loss": 0.7391, "losses/dpo": 1.1187453269958496, "losses/sft": 1.4958677291870117, "losses/total": 1.1187453269958496, "ref_logps/chosen": -37.236061096191406, "ref_logps/rejected": -31.13835334777832, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8656048774719238, "rewards/margins": 0.2057502567768097, "rewards/rejected": -1.0713551044464111, "step": 1016 }, { "epoch": 0.96, "grad_norm": 20.510265350341797, "learning_rate": 3.7775445960125917e-07, "logps/chosen": -45.14139938354492, "logps/rejected": -54.66725540161133, "loss": 0.6605, "losses/dpo": 0.8151442408561707, "losses/sft": 1.7279096841812134, "losses/total": 0.8151442408561707, "ref_logps/chosen": -36.190696716308594, "ref_logps/rejected": -40.86396026611328, "rewards/accuracies": 0.625, "rewards/chosen": -0.8950698971748352, "rewards/margins": 0.4852599501609802, "rewards/rejected": -1.3803297281265259, "step": 1017 }, { "epoch": 0.96, "grad_norm": 23.57343864440918, "learning_rate": 3.775795732773697e-07, "logps/chosen": -48.56698989868164, "logps/rejected": -50.53917694091797, "loss": 0.6687, "losses/dpo": 0.3203766644001007, "losses/sft": 1.8190281391143799, "losses/total": 0.3203766644001007, "ref_logps/chosen": -39.001007080078125, "ref_logps/rejected": -37.89703369140625, "rewards/accuracies": 0.625, "rewards/chosen": -0.9565984010696411, "rewards/margins": 0.30761587619781494, "rewards/rejected": -1.264214277267456, "step": 1018 }, { "epoch": 0.96, "grad_norm": 16.368715286254883, "learning_rate": 3.774046869534802e-07, "logps/chosen": -38.70006561279297, "logps/rejected": -47.883331298828125, "loss": 0.5139, "losses/dpo": 0.36832404136657715, "losses/sft": 1.65565824508667, "losses/total": 0.36832404136657715, "ref_logps/chosen": -35.24560546875, "ref_logps/rejected": -38.418861389160156, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3454464077949524, "rewards/margins": 0.6010006666183472, "rewards/rejected": -0.9464471340179443, "step": 1019 }, { "epoch": 0.96, "grad_norm": 20.903846740722656, "learning_rate": 3.7722980062959076e-07, "logps/chosen": -51.29652404785156, "logps/rejected": -65.01475524902344, "loss": 0.5941, "losses/dpo": 0.43693631887435913, "losses/sft": 1.4224003553390503, "losses/total": 0.43693631887435913, "ref_logps/chosen": -38.19739532470703, "ref_logps/rejected": -46.52694320678711, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3099132776260376, "rewards/margins": 0.5388673543930054, "rewards/rejected": -1.848780632019043, "step": 1020 }, { "epoch": 0.96, "grad_norm": 32.36252975463867, "learning_rate": 3.770549143057013e-07, "logps/chosen": -57.22710418701172, "logps/rejected": -72.98391723632812, "loss": 0.7051, "losses/dpo": 0.8327932357788086, "losses/sft": 1.5569915771484375, "losses/total": 0.8327932357788086, "ref_logps/chosen": -46.58798599243164, "ref_logps/rejected": -57.68727493286133, "rewards/accuracies": 0.5, "rewards/chosen": -1.0639116764068604, "rewards/margins": 0.46575188636779785, "rewards/rejected": -1.5296636819839478, "step": 1021 }, { "epoch": 0.97, "grad_norm": 19.232439041137695, "learning_rate": 3.768800279818118e-07, "logps/chosen": -47.70030212402344, "logps/rejected": -69.32796478271484, "loss": 0.4997, "losses/dpo": 0.3835393190383911, "losses/sft": 1.9313428401947021, "losses/total": 0.3835393190383911, "ref_logps/chosen": -37.832122802734375, "ref_logps/rejected": -53.7058219909668, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9868176579475403, "rewards/margins": 0.5753968954086304, "rewards/rejected": -1.5622146129608154, "step": 1022 }, { "epoch": 0.97, "grad_norm": 20.81832504272461, "learning_rate": 3.7670514165792235e-07, "logps/chosen": -47.133567810058594, "logps/rejected": -58.0272331237793, "loss": 0.6315, "losses/dpo": 0.39221709966659546, "losses/sft": 1.6592286825180054, "losses/total": 0.39221709966659546, "ref_logps/chosen": -39.48822021484375, "ref_logps/rejected": -46.671810150146484, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7645347714424133, "rewards/margins": 0.37100735306739807, "rewards/rejected": -1.1355421543121338, "step": 1023 }, { "epoch": 0.97, "grad_norm": 23.17807388305664, "learning_rate": 3.7653025533403287e-07, "logps/chosen": -56.657379150390625, "logps/rejected": -62.271324157714844, "loss": 0.5657, "losses/dpo": 1.0621778964996338, "losses/sft": 1.998990535736084, "losses/total": 1.0621778964996338, "ref_logps/chosen": -47.528175354003906, "ref_logps/rejected": -46.84623718261719, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9129210114479065, "rewards/margins": 0.6295875310897827, "rewards/rejected": -1.542508602142334, "step": 1024 }, { "epoch": 0.97, "grad_norm": 21.51382064819336, "learning_rate": 3.763553690101434e-07, "logps/chosen": -46.72819137573242, "logps/rejected": -69.21802520751953, "loss": 0.5117, "losses/dpo": 0.6582629680633545, "losses/sft": 1.9512324333190918, "losses/total": 0.6582629680633545, "ref_logps/chosen": -37.18133544921875, "ref_logps/rejected": -52.33158874511719, "rewards/accuracies": 0.625, "rewards/chosen": -0.9546855092048645, "rewards/margins": 0.7339580059051514, "rewards/rejected": -1.6886436939239502, "step": 1025 }, { "epoch": 0.97, "grad_norm": 22.774349212646484, "learning_rate": 3.761804826862539e-07, "logps/chosen": -45.344322204589844, "logps/rejected": -52.74687957763672, "loss": 0.5741, "losses/dpo": 0.49935778975486755, "losses/sft": 1.4545105695724487, "losses/total": 0.49935778975486755, "ref_logps/chosen": -36.59073257446289, "ref_logps/rejected": -39.85047912597656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8753587007522583, "rewards/margins": 0.4142812490463257, "rewards/rejected": -1.289639949798584, "step": 1026 }, { "epoch": 0.97, "grad_norm": 24.60048484802246, "learning_rate": 3.7600559636236446e-07, "logps/chosen": -46.23433303833008, "logps/rejected": -55.34899139404297, "loss": 0.7812, "losses/dpo": 0.6737127304077148, "losses/sft": 1.1214194297790527, "losses/total": 0.6737127304077148, "ref_logps/chosen": -36.20796585083008, "ref_logps/rejected": -44.365966796875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0026365518569946, "rewards/margins": 0.09566552937030792, "rewards/rejected": -1.0983021259307861, "step": 1027 }, { "epoch": 0.97, "grad_norm": 21.08169937133789, "learning_rate": 3.75830710038475e-07, "logps/chosen": -50.23443603515625, "logps/rejected": -52.77334213256836, "loss": 0.6507, "losses/dpo": 0.29170680046081543, "losses/sft": 1.2326050996780396, "losses/total": 0.29170680046081543, "ref_logps/chosen": -44.33406066894531, "ref_logps/rejected": -42.89617919921875, "rewards/accuracies": 0.625, "rewards/chosen": -0.5900372266769409, "rewards/margins": 0.39767909049987793, "rewards/rejected": -0.9877163171768188, "step": 1028 }, { "epoch": 0.97, "grad_norm": 22.025733947753906, "learning_rate": 3.756558237145855e-07, "logps/chosen": -48.47467041015625, "logps/rejected": -64.47343444824219, "loss": 0.6189, "losses/dpo": 0.5170712471008301, "losses/sft": 1.1247472763061523, "losses/total": 0.5170712471008301, "ref_logps/chosen": -38.449867248535156, "ref_logps/rejected": -51.1156005859375, "rewards/accuracies": 0.75, "rewards/chosen": -1.0024805068969727, "rewards/margins": 0.3333030939102173, "rewards/rejected": -1.33578360080719, "step": 1029 }, { "epoch": 0.97, "grad_norm": 14.860428810119629, "learning_rate": 3.7548093739069605e-07, "logps/chosen": -30.676557540893555, "logps/rejected": -62.83124923706055, "loss": 0.4224, "losses/dpo": 0.37315595149993896, "losses/sft": 1.9370832443237305, "losses/total": 0.37315595149993896, "ref_logps/chosen": -25.772823333740234, "ref_logps/rejected": -50.06254959106445, "rewards/accuracies": 0.875, "rewards/chosen": -0.49037328362464905, "rewards/margins": 0.7864969968795776, "rewards/rejected": -1.2768702507019043, "step": 1030 }, { "epoch": 0.97, "grad_norm": 19.947961807250977, "learning_rate": 3.7530605106680656e-07, "logps/chosen": -47.666038513183594, "logps/rejected": -51.949249267578125, "loss": 0.6094, "losses/dpo": 0.490215003490448, "losses/sft": 2.1000752449035645, "losses/total": 0.490215003490448, "ref_logps/chosen": -39.435821533203125, "ref_logps/rejected": -40.621337890625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8230217695236206, "rewards/margins": 0.30976951122283936, "rewards/rejected": -1.13279128074646, "step": 1031 }, { "epoch": 0.97, "grad_norm": 18.579145431518555, "learning_rate": 3.7513116474291707e-07, "logps/chosen": -43.39051818847656, "logps/rejected": -51.849422454833984, "loss": 0.5608, "losses/dpo": 0.6709878444671631, "losses/sft": 2.0446085929870605, "losses/total": 0.6709878444671631, "ref_logps/chosen": -35.785308837890625, "ref_logps/rejected": -39.19109344482422, "rewards/accuracies": 0.625, "rewards/chosen": -0.7605208158493042, "rewards/margins": 0.5053122043609619, "rewards/rejected": -1.2658330202102661, "step": 1032 }, { "epoch": 0.98, "grad_norm": 21.474868774414062, "learning_rate": 3.749562784190276e-07, "logps/chosen": -53.84595489501953, "logps/rejected": -66.29723358154297, "loss": 0.6453, "losses/dpo": 0.31144535541534424, "losses/sft": 1.8267700672149658, "losses/total": 0.31144535541534424, "ref_logps/chosen": -43.61677932739258, "ref_logps/rejected": -50.179439544677734, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0229175090789795, "rewards/margins": 0.5888623595237732, "rewards/rejected": -1.6117799282073975, "step": 1033 }, { "epoch": 0.98, "grad_norm": 21.029232025146484, "learning_rate": 3.7478139209513815e-07, "logps/chosen": -43.316226959228516, "logps/rejected": -56.854164123535156, "loss": 0.5448, "losses/dpo": 0.6631639003753662, "losses/sft": 1.811882495880127, "losses/total": 0.6631639003753662, "ref_logps/chosen": -35.18425750732422, "ref_logps/rejected": -42.97540283203125, "rewards/accuracies": 0.75, "rewards/chosen": -0.8131965398788452, "rewards/margins": 0.5746797919273376, "rewards/rejected": -1.3878765106201172, "step": 1034 }, { "epoch": 0.98, "grad_norm": 15.447344779968262, "learning_rate": 3.746065057712487e-07, "logps/chosen": -33.777618408203125, "logps/rejected": -51.820091247558594, "loss": 0.4812, "losses/dpo": 0.514406681060791, "losses/sft": 1.1542354822158813, "losses/total": 0.514406681060791, "ref_logps/chosen": -30.266817092895508, "ref_logps/rejected": -40.79583740234375, "rewards/accuracies": 0.8125, "rewards/chosen": -0.35108014941215515, "rewards/margins": 0.751345157623291, "rewards/rejected": -1.1024253368377686, "step": 1035 }, { "epoch": 0.98, "grad_norm": 22.523555755615234, "learning_rate": 3.744316194473592e-07, "logps/chosen": -56.821998596191406, "logps/rejected": -69.46830749511719, "loss": 0.5878, "losses/dpo": 0.28811097145080566, "losses/sft": 1.9191207885742188, "losses/total": 0.28811097145080566, "ref_logps/chosen": -43.662689208984375, "ref_logps/rejected": -50.66514205932617, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3159312009811401, "rewards/margins": 0.5643856525421143, "rewards/rejected": -1.8803167343139648, "step": 1036 }, { "epoch": 0.98, "grad_norm": 21.70399284362793, "learning_rate": 3.7425673312346974e-07, "logps/chosen": -54.694068908691406, "logps/rejected": -77.07733154296875, "loss": 0.5499, "losses/dpo": 0.372211754322052, "losses/sft": 2.259852886199951, "losses/total": 0.372211754322052, "ref_logps/chosen": -43.89327621459961, "ref_logps/rejected": -60.12117004394531, "rewards/accuracies": 0.75, "rewards/chosen": -1.0800797939300537, "rewards/margins": 0.6155374050140381, "rewards/rejected": -1.6956171989440918, "step": 1037 }, { "epoch": 0.98, "grad_norm": 17.327770233154297, "learning_rate": 3.7408184679958025e-07, "logps/chosen": -51.10768127441406, "logps/rejected": -67.70603942871094, "loss": 0.4945, "losses/dpo": 0.5754619240760803, "losses/sft": 1.5056612491607666, "losses/total": 0.5754619240760803, "ref_logps/chosen": -43.47105407714844, "ref_logps/rejected": -52.151615142822266, "rewards/accuracies": 0.75, "rewards/chosen": -0.7636622786521912, "rewards/margins": 0.7917799949645996, "rewards/rejected": -1.555442214012146, "step": 1038 }, { "epoch": 0.98, "grad_norm": 18.521121978759766, "learning_rate": 3.7390696047569077e-07, "logps/chosen": -43.31429672241211, "logps/rejected": -55.69745635986328, "loss": 0.5285, "losses/dpo": 0.37058568000793457, "losses/sft": 1.3831158876419067, "losses/total": 0.37058568000793457, "ref_logps/chosen": -34.684295654296875, "ref_logps/rejected": -40.088356018066406, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8629997968673706, "rewards/margins": 0.6979104280471802, "rewards/rejected": -1.5609102249145508, "step": 1039 }, { "epoch": 0.98, "grad_norm": 23.897001266479492, "learning_rate": 3.737320741518013e-07, "logps/chosen": -57.846435546875, "logps/rejected": -62.12091827392578, "loss": 0.5588, "losses/dpo": 0.8358293175697327, "losses/sft": 2.0152835845947266, "losses/total": 0.8358293175697327, "ref_logps/chosen": -45.81495666503906, "ref_logps/rejected": -44.22117614746094, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2031478881835938, "rewards/margins": 0.5868260860443115, "rewards/rejected": -1.7899739742279053, "step": 1040 }, { "epoch": 0.98, "grad_norm": 23.08881950378418, "learning_rate": 3.7355718782791184e-07, "logps/chosen": -51.02873229980469, "logps/rejected": -63.517189025878906, "loss": 0.604, "losses/dpo": 0.6009421348571777, "losses/sft": 1.8231730461120605, "losses/total": 0.6009421348571777, "ref_logps/chosen": -39.89787292480469, "ref_logps/rejected": -44.95099639892578, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1130861043930054, "rewards/margins": 0.7435336709022522, "rewards/rejected": -1.8566197156906128, "step": 1041 }, { "epoch": 0.98, "grad_norm": 25.16348648071289, "learning_rate": 3.733823015040224e-07, "logps/chosen": -47.1480598449707, "logps/rejected": -42.78644943237305, "loss": 0.8707, "losses/dpo": 1.097036361694336, "losses/sft": 1.966797947883606, "losses/total": 1.097036361694336, "ref_logps/chosen": -37.51509094238281, "ref_logps/rejected": -34.06185531616211, "rewards/accuracies": 0.5, "rewards/chosen": -0.9632970094680786, "rewards/margins": -0.09083791822195053, "rewards/rejected": -0.8724589943885803, "step": 1042 }, { "epoch": 0.98, "grad_norm": 16.07930564880371, "learning_rate": 3.7320741518013287e-07, "logps/chosen": -30.622501373291016, "logps/rejected": -54.29332733154297, "loss": 0.4737, "losses/dpo": 0.312989205121994, "losses/sft": 1.2640271186828613, "losses/total": 0.312989205121994, "ref_logps/chosen": -25.249773025512695, "ref_logps/rejected": -41.58097839355469, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5372729301452637, "rewards/margins": 0.7339622974395752, "rewards/rejected": -1.2712352275848389, "step": 1043 }, { "epoch": 0.99, "grad_norm": 18.39409828186035, "learning_rate": 3.7303252885624344e-07, "logps/chosen": -50.152862548828125, "logps/rejected": -62.194969177246094, "loss": 0.5529, "losses/dpo": 0.619890034198761, "losses/sft": 2.1275525093078613, "losses/total": 0.619890034198761, "ref_logps/chosen": -39.10810089111328, "ref_logps/rejected": -46.15287399291992, "rewards/accuracies": 0.75, "rewards/chosen": -1.104475975036621, "rewards/margins": 0.49973371624946594, "rewards/rejected": -1.6042097806930542, "step": 1044 }, { "epoch": 0.99, "grad_norm": 18.956058502197266, "learning_rate": 3.7285764253235395e-07, "logps/chosen": -36.96673583984375, "logps/rejected": -51.40975570678711, "loss": 0.6046, "losses/dpo": 0.45018085837364197, "losses/sft": 1.6682674884796143, "losses/total": 0.45018085837364197, "ref_logps/chosen": -28.295307159423828, "ref_logps/rejected": -39.06208038330078, "rewards/accuracies": 0.75, "rewards/chosen": -0.8671428561210632, "rewards/margins": 0.36762452125549316, "rewards/rejected": -1.2347674369812012, "step": 1045 }, { "epoch": 0.99, "grad_norm": 20.456775665283203, "learning_rate": 3.7268275620846446e-07, "logps/chosen": -43.20927429199219, "logps/rejected": -65.18101501464844, "loss": 0.5981, "losses/dpo": 0.17798872292041779, "losses/sft": 1.6047101020812988, "losses/total": 0.17798872292041779, "ref_logps/chosen": -35.581031799316406, "ref_logps/rejected": -50.5703125, "rewards/accuracies": 0.625, "rewards/chosen": -0.7628246545791626, "rewards/margins": 0.6982451677322388, "rewards/rejected": -1.4610698223114014, "step": 1046 }, { "epoch": 0.99, "grad_norm": 16.558841705322266, "learning_rate": 3.72507869884575e-07, "logps/chosen": -39.19849395751953, "logps/rejected": -59.20799255371094, "loss": 0.4089, "losses/dpo": 0.5027921199798584, "losses/sft": 1.5396077632904053, "losses/total": 0.5027921199798584, "ref_logps/chosen": -32.8937873840332, "ref_logps/rejected": -45.09962844848633, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6304706335067749, "rewards/margins": 0.7803655862808228, "rewards/rejected": -1.4108362197875977, "step": 1047 }, { "epoch": 0.99, "grad_norm": 20.622018814086914, "learning_rate": 3.7233298356068554e-07, "logps/chosen": -40.00585174560547, "logps/rejected": -58.10869216918945, "loss": 0.5211, "losses/dpo": 0.40653449296951294, "losses/sft": 1.5392721891403198, "losses/total": 0.40653449296951294, "ref_logps/chosen": -32.68403625488281, "ref_logps/rejected": -44.4426155090332, "rewards/accuracies": 0.75, "rewards/chosen": -0.7321815490722656, "rewards/margins": 0.6344262957572937, "rewards/rejected": -1.366607904434204, "step": 1048 }, { "epoch": 0.99, "grad_norm": 20.416519165039062, "learning_rate": 3.721580972367961e-07, "logps/chosen": -46.07855224609375, "logps/rejected": -62.601341247558594, "loss": 0.542, "losses/dpo": 0.45975667238235474, "losses/sft": 1.3041117191314697, "losses/total": 0.45975667238235474, "ref_logps/chosen": -36.73108673095703, "ref_logps/rejected": -48.306671142578125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9347469210624695, "rewards/margins": 0.4947202503681183, "rewards/rejected": -1.4294672012329102, "step": 1049 }, { "epoch": 0.99, "grad_norm": 18.21001625061035, "learning_rate": 3.7198321091290656e-07, "logps/chosen": -38.742042541503906, "logps/rejected": -50.60753631591797, "loss": 0.5939, "losses/dpo": 0.3910270929336548, "losses/sft": 1.2642822265625, "losses/total": 0.3910270929336548, "ref_logps/chosen": -32.67649459838867, "ref_logps/rejected": -41.21440887451172, "rewards/accuracies": 0.75, "rewards/chosen": -0.6065546274185181, "rewards/margins": 0.3327580690383911, "rewards/rejected": -0.939312756061554, "step": 1050 }, { "epoch": 0.99, "grad_norm": 17.968276977539062, "learning_rate": 3.7180832458901713e-07, "logps/chosen": -42.053462982177734, "logps/rejected": -49.964454650878906, "loss": 0.5479, "losses/dpo": 0.6227540969848633, "losses/sft": 1.782165288925171, "losses/total": 0.6227540969848633, "ref_logps/chosen": -34.993019104003906, "ref_logps/rejected": -38.62141418457031, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7060443162918091, "rewards/margins": 0.4282597303390503, "rewards/rejected": -1.1343040466308594, "step": 1051 }, { "epoch": 0.99, "grad_norm": 15.051322937011719, "learning_rate": 3.7163343826512764e-07, "logps/chosen": -44.43914794921875, "logps/rejected": -67.49777221679688, "loss": 0.3392, "losses/dpo": 0.23947615921497345, "losses/sft": 1.1581668853759766, "losses/total": 0.23947615921497345, "ref_logps/chosen": -37.87828063964844, "ref_logps/rejected": -48.68732452392578, "rewards/accuracies": 0.875, "rewards/chosen": -0.6560868620872498, "rewards/margins": 1.224957823753357, "rewards/rejected": -1.881044626235962, "step": 1052 }, { "epoch": 0.99, "grad_norm": 25.21403694152832, "learning_rate": 3.7145855194123816e-07, "logps/chosen": -54.13300323486328, "logps/rejected": -50.41889190673828, "loss": 0.7712, "losses/dpo": 0.6819635629653931, "losses/sft": 1.9272966384887695, "losses/total": 0.6819635629653931, "ref_logps/chosen": -43.46582794189453, "ref_logps/rejected": -39.4797248840332, "rewards/accuracies": 0.4375, "rewards/chosen": -1.0667178630828857, "rewards/margins": 0.02719871699810028, "rewards/rejected": -1.093916654586792, "step": 1053 }, { "epoch": 1.0, "grad_norm": 14.475399017333984, "learning_rate": 3.712836656173487e-07, "logps/chosen": -29.974699020385742, "logps/rejected": -50.017608642578125, "loss": 0.5052, "losses/dpo": 0.7304818630218506, "losses/sft": 1.7995580434799194, "losses/total": 0.7304818630218506, "ref_logps/chosen": -24.798912048339844, "ref_logps/rejected": -38.5832405090332, "rewards/accuracies": 0.75, "rewards/chosen": -0.5175787210464478, "rewards/margins": 0.6258580684661865, "rewards/rejected": -1.1434367895126343, "step": 1054 }, { "epoch": 1.0, "grad_norm": 26.792694091796875, "learning_rate": 3.7110877929345923e-07, "logps/chosen": -55.168270111083984, "logps/rejected": -64.51309967041016, "loss": 0.637, "losses/dpo": 1.0172982215881348, "losses/sft": 1.9349207878112793, "losses/total": 1.0172982215881348, "ref_logps/chosen": -45.040283203125, "ref_logps/rejected": -50.64051055908203, "rewards/accuracies": 0.625, "rewards/chosen": -1.0127986669540405, "rewards/margins": 0.3744605779647827, "rewards/rejected": -1.3872592449188232, "step": 1055 }, { "epoch": 1.0, "grad_norm": 19.410390853881836, "learning_rate": 3.709338929695698e-07, "logps/chosen": -39.06595993041992, "logps/rejected": -52.589717864990234, "loss": 0.6574, "losses/dpo": 0.7321785092353821, "losses/sft": 1.5721993446350098, "losses/total": 0.7321785092353821, "ref_logps/chosen": -31.124237060546875, "ref_logps/rejected": -40.465423583984375, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7941721677780151, "rewards/margins": 0.4182566702365875, "rewards/rejected": -1.2124288082122803, "step": 1056 }, { "epoch": 1.0, "grad_norm": 24.434770584106445, "learning_rate": 3.7075900664568026e-07, "logps/chosen": -53.610530853271484, "logps/rejected": -63.1193962097168, "loss": 0.602, "losses/dpo": 0.8597586154937744, "losses/sft": 1.8838454484939575, "losses/total": 0.8597586154937744, "ref_logps/chosen": -45.25860595703125, "ref_logps/rejected": -49.585853576660156, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8351923823356628, "rewards/margins": 0.518161416053772, "rewards/rejected": -1.3533538579940796, "step": 1057 }, { "epoch": 1.0, "grad_norm": 23.938945770263672, "learning_rate": 3.705841203217908e-07, "logps/chosen": -51.38863754272461, "logps/rejected": -56.15193176269531, "loss": 0.5816, "losses/dpo": 0.7956613898277283, "losses/sft": 1.733151912689209, "losses/total": 0.7956613898277283, "ref_logps/chosen": -42.19108581542969, "ref_logps/rejected": -42.952274322509766, "rewards/accuracies": 0.625, "rewards/chosen": -0.919755220413208, "rewards/margins": 0.4002106189727783, "rewards/rejected": -1.3199658393859863, "step": 1058 }, { "epoch": 1.0, "grad_norm": 16.61768913269043, "learning_rate": 3.7040923399790134e-07, "logps/chosen": -43.220340728759766, "logps/rejected": -65.68209838867188, "loss": 0.4328, "losses/dpo": 0.3953036963939667, "losses/sft": 1.71744966506958, "losses/total": 0.3953036963939667, "ref_logps/chosen": -35.62976837158203, "ref_logps/rejected": -48.545066833496094, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7590570449829102, "rewards/margins": 0.9546458721160889, "rewards/rejected": -1.713702917098999, "step": 1059 }, { "epoch": 1.0, "grad_norm": 19.569744110107422, "learning_rate": 3.7023434767401185e-07, "logps/chosen": -48.77082824707031, "logps/rejected": -68.16401672363281, "loss": 0.4348, "losses/dpo": 0.15455271303653717, "losses/sft": 1.7090686559677124, "losses/total": 0.15455271303653717, "ref_logps/chosen": -40.929649353027344, "ref_logps/rejected": -50.74110794067383, "rewards/accuracies": 0.875, "rewards/chosen": -0.7841184139251709, "rewards/margins": 0.9581716060638428, "rewards/rejected": -1.7422899007797241, "step": 1060 }, { "epoch": 1.0, "grad_norm": 15.181184768676758, "learning_rate": 3.700594613501224e-07, "logps/chosen": -39.98779296875, "logps/rejected": -55.82892990112305, "loss": 0.4619, "losses/dpo": 0.3095157742500305, "losses/sft": 1.5837920904159546, "losses/total": 0.3095157742500305, "ref_logps/chosen": -32.98744583129883, "ref_logps/rejected": -39.85797119140625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7000346779823303, "rewards/margins": 0.897061288356781, "rewards/rejected": -1.5970959663391113, "step": 1061 }, { "epoch": 1.0, "grad_norm": 14.743623733520508, "learning_rate": 3.6988457502623293e-07, "logps/chosen": -39.40172576904297, "logps/rejected": -63.34711837768555, "loss": 0.3648, "losses/dpo": 0.2464207261800766, "losses/sft": 1.2853291034698486, "losses/total": 0.2464207261800766, "ref_logps/chosen": -34.49665832519531, "ref_logps/rejected": -47.112762451171875, "rewards/accuracies": 0.9375, "rewards/chosen": -0.49050676822662354, "rewards/margins": 1.1329294443130493, "rewards/rejected": -1.6234362125396729, "step": 1062 }, { "epoch": 1.0, "grad_norm": 24.7276554107666, "learning_rate": 3.697096887023435e-07, "logps/chosen": -43.58515930175781, "logps/rejected": -50.95394515991211, "loss": 0.5362, "losses/dpo": 0.7975062131881714, "losses/sft": 1.7530547380447388, "losses/total": 0.7975062131881714, "ref_logps/chosen": -35.57408905029297, "ref_logps/rejected": -37.36211395263672, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8011068105697632, "rewards/margins": 0.5580763816833496, "rewards/rejected": -1.3591831922531128, "step": 1063 }, { "epoch": 1.0, "grad_norm": 18.076126098632812, "learning_rate": 3.6953480237845395e-07, "logps/chosen": -43.305152893066406, "logps/rejected": -54.1341667175293, "loss": 0.4355, "losses/dpo": 0.4449557662010193, "losses/sft": 1.8413487672805786, "losses/total": 0.4449557662010193, "ref_logps/chosen": -33.95836639404297, "ref_logps/rejected": -36.06989288330078, "rewards/accuracies": 0.75, "rewards/chosen": -0.9346787333488464, "rewards/margins": 0.871748685836792, "rewards/rejected": -1.8064274787902832, "step": 1064 }, { "epoch": 1.01, "grad_norm": 17.21904754638672, "learning_rate": 3.693599160545645e-07, "logps/chosen": -42.98076629638672, "logps/rejected": -60.32102584838867, "loss": 0.4183, "losses/dpo": 0.6023704409599304, "losses/sft": 1.759673833847046, "losses/total": 0.6023704409599304, "ref_logps/chosen": -33.99935531616211, "ref_logps/rejected": -40.51173400878906, "rewards/accuracies": 0.875, "rewards/chosen": -0.8981411457061768, "rewards/margins": 1.0827878713607788, "rewards/rejected": -1.980928897857666, "step": 1065 }, { "epoch": 1.01, "grad_norm": 16.149755477905273, "learning_rate": 3.6918502973067503e-07, "logps/chosen": -34.371150970458984, "logps/rejected": -66.52780151367188, "loss": 0.3845, "losses/dpo": 0.28156861662864685, "losses/sft": 1.2597063779830933, "losses/total": 0.28156861662864685, "ref_logps/chosen": -28.831809997558594, "ref_logps/rejected": -47.38080978393555, "rewards/accuracies": 0.875, "rewards/chosen": -0.5539339780807495, "rewards/margins": 1.360764980316162, "rewards/rejected": -1.9146989583969116, "step": 1066 }, { "epoch": 1.01, "grad_norm": 14.031977653503418, "learning_rate": 3.6901014340678554e-07, "logps/chosen": -37.38449478149414, "logps/rejected": -46.552528381347656, "loss": 0.3672, "losses/dpo": 0.5438218116760254, "losses/sft": 1.4593948125839233, "losses/total": 0.5438218116760254, "ref_logps/chosen": -30.516246795654297, "ref_logps/rejected": -29.92164421081543, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6868245601654053, "rewards/margins": 0.9762636423110962, "rewards/rejected": -1.6630882024765015, "step": 1067 }, { "epoch": 1.01, "grad_norm": 16.827117919921875, "learning_rate": 3.688352570828961e-07, "logps/chosen": -37.271148681640625, "logps/rejected": -60.42409896850586, "loss": 0.4615, "losses/dpo": 0.3220047354698181, "losses/sft": 1.2956514358520508, "losses/total": 0.3220047354698181, "ref_logps/chosen": -29.47098731994629, "ref_logps/rejected": -43.098026275634766, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7800158262252808, "rewards/margins": 0.9525912404060364, "rewards/rejected": -1.732607126235962, "step": 1068 }, { "epoch": 1.01, "grad_norm": 13.062885284423828, "learning_rate": 3.686603707590066e-07, "logps/chosen": -46.08294677734375, "logps/rejected": -61.942962646484375, "loss": 0.3078, "losses/dpo": 0.2849777340888977, "losses/sft": 1.4762276411056519, "losses/total": 0.2849777340888977, "ref_logps/chosen": -39.95171356201172, "ref_logps/rejected": -43.798824310302734, "rewards/accuracies": 1.0, "rewards/chosen": -0.6131234169006348, "rewards/margins": 1.2012903690338135, "rewards/rejected": -1.8144135475158691, "step": 1069 }, { "epoch": 1.01, "grad_norm": 18.53889274597168, "learning_rate": 3.684854844351172e-07, "logps/chosen": -49.929996490478516, "logps/rejected": -64.5808334350586, "loss": 0.4355, "losses/dpo": 0.6379729509353638, "losses/sft": 1.7307603359222412, "losses/total": 0.6379729509353638, "ref_logps/chosen": -40.72583770751953, "ref_logps/rejected": -45.09645080566406, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9204157590866089, "rewards/margins": 1.0280221700668335, "rewards/rejected": -1.9484379291534424, "step": 1070 }, { "epoch": 1.01, "grad_norm": 17.705724716186523, "learning_rate": 3.6831059811122765e-07, "logps/chosen": -57.26272201538086, "logps/rejected": -75.029296875, "loss": 0.4108, "losses/dpo": 0.5157050490379333, "losses/sft": 2.2226946353912354, "losses/total": 0.5157050490379333, "ref_logps/chosen": -46.039466857910156, "ref_logps/rejected": -53.679019927978516, "rewards/accuracies": 0.75, "rewards/chosen": -1.1223256587982178, "rewards/margins": 1.0127019882202148, "rewards/rejected": -2.1350276470184326, "step": 1071 }, { "epoch": 1.01, "grad_norm": 17.380020141601562, "learning_rate": 3.681357117873382e-07, "logps/chosen": -30.642452239990234, "logps/rejected": -39.821754455566406, "loss": 0.5762, "losses/dpo": 0.8206643462181091, "losses/sft": 1.267864465713501, "losses/total": 0.8206643462181091, "ref_logps/chosen": -25.10495948791504, "ref_logps/rejected": -29.695594787597656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5537492036819458, "rewards/margins": 0.45886677503585815, "rewards/rejected": -1.0126159191131592, "step": 1072 }, { "epoch": 1.01, "grad_norm": 18.155853271484375, "learning_rate": 3.679608254634488e-07, "logps/chosen": -44.522071838378906, "logps/rejected": -57.01021194458008, "loss": 0.3812, "losses/dpo": 0.49304908514022827, "losses/sft": 1.2953386306762695, "losses/total": 0.49304908514022827, "ref_logps/chosen": -38.756256103515625, "ref_logps/rejected": -40.62232208251953, "rewards/accuracies": 0.875, "rewards/chosen": -0.5765818357467651, "rewards/margins": 1.0622072219848633, "rewards/rejected": -1.6387890577316284, "step": 1073 }, { "epoch": 1.01, "grad_norm": 18.12737464904785, "learning_rate": 3.6778593913955924e-07, "logps/chosen": -46.933799743652344, "logps/rejected": -50.66566848754883, "loss": 0.4962, "losses/dpo": 0.2647829055786133, "losses/sft": 1.135524868965149, "losses/total": 0.2647829055786133, "ref_logps/chosen": -40.021026611328125, "ref_logps/rejected": -36.490535736083984, "rewards/accuracies": 0.75, "rewards/chosen": -0.6912773251533508, "rewards/margins": 0.7262359857559204, "rewards/rejected": -1.417513370513916, "step": 1074 }, { "epoch": 1.02, "grad_norm": 26.857921600341797, "learning_rate": 3.676110528156698e-07, "logps/chosen": -54.28265380859375, "logps/rejected": -64.04670715332031, "loss": 0.72, "losses/dpo": 0.4676631689071655, "losses/sft": 1.7014347314834595, "losses/total": 0.4676631689071655, "ref_logps/chosen": -42.96564483642578, "ref_logps/rejected": -51.200721740722656, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1317007541656494, "rewards/margins": 0.15289808809757233, "rewards/rejected": -1.2845988273620605, "step": 1075 }, { "epoch": 1.02, "grad_norm": 15.14867115020752, "learning_rate": 3.674361664917803e-07, "logps/chosen": -39.258968353271484, "logps/rejected": -63.0911979675293, "loss": 0.3872, "losses/dpo": 0.20462581515312195, "losses/sft": 1.478392481803894, "losses/total": 0.20462581515312195, "ref_logps/chosen": -32.907691955566406, "ref_logps/rejected": -43.177433013916016, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6351274251937866, "rewards/margins": 1.3562489748001099, "rewards/rejected": -1.9913763999938965, "step": 1076 }, { "epoch": 1.02, "grad_norm": 17.26877212524414, "learning_rate": 3.672612801678909e-07, "logps/chosen": -49.37654113769531, "logps/rejected": -69.18081665039062, "loss": 0.3712, "losses/dpo": 0.25452861189842224, "losses/sft": 2.0616605281829834, "losses/total": 0.25452861189842224, "ref_logps/chosen": -39.10771942138672, "ref_logps/rejected": -46.87623596191406, "rewards/accuracies": 0.875, "rewards/chosen": -1.0268824100494385, "rewards/margins": 1.203575611114502, "rewards/rejected": -2.2304580211639404, "step": 1077 }, { "epoch": 1.02, "grad_norm": 19.198070526123047, "learning_rate": 3.6708639384400134e-07, "logps/chosen": -42.97004318237305, "logps/rejected": -58.670021057128906, "loss": 0.5011, "losses/dpo": 0.4471086263656616, "losses/sft": 1.2140623331069946, "losses/total": 0.4471086263656616, "ref_logps/chosen": -31.619060516357422, "ref_logps/rejected": -38.09598159790039, "rewards/accuracies": 0.75, "rewards/chosen": -1.1350982189178467, "rewards/margins": 0.9223060607910156, "rewards/rejected": -2.0574045181274414, "step": 1078 }, { "epoch": 1.02, "grad_norm": 15.979565620422363, "learning_rate": 3.669115075201119e-07, "logps/chosen": -43.28668975830078, "logps/rejected": -55.396141052246094, "loss": 0.3708, "losses/dpo": 0.25019770860671997, "losses/sft": 1.5838481187820435, "losses/total": 0.25019770860671997, "ref_logps/chosen": -36.61464309692383, "ref_logps/rejected": -38.42881774902344, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6672050356864929, "rewards/margins": 1.029528021812439, "rewards/rejected": -1.696732997894287, "step": 1079 }, { "epoch": 1.02, "grad_norm": 12.003375053405762, "learning_rate": 3.6673662119622247e-07, "logps/chosen": -29.29328727722168, "logps/rejected": -61.710323333740234, "loss": 0.3018, "losses/dpo": 0.3685706555843353, "losses/sft": 1.3761706352233887, "losses/total": 0.3685706555843353, "ref_logps/chosen": -23.90287208557129, "ref_logps/rejected": -43.84029769897461, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5390416383743286, "rewards/margins": 1.2479610443115234, "rewards/rejected": -1.7870025634765625, "step": 1080 }, { "epoch": 1.02, "grad_norm": 19.205904006958008, "learning_rate": 3.6656173487233293e-07, "logps/chosen": -52.96255111694336, "logps/rejected": -71.59742736816406, "loss": 0.4664, "losses/dpo": 0.27110928297042847, "losses/sft": 1.3454355001449585, "losses/total": 0.27110928297042847, "ref_logps/chosen": -42.1851806640625, "ref_logps/rejected": -51.919124603271484, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0777368545532227, "rewards/margins": 0.8900934457778931, "rewards/rejected": -1.9678301811218262, "step": 1081 }, { "epoch": 1.02, "grad_norm": 20.57588768005371, "learning_rate": 3.663868485484435e-07, "logps/chosen": -46.54554748535156, "logps/rejected": -69.75184631347656, "loss": 0.5054, "losses/dpo": 0.5467911958694458, "losses/sft": 1.9871207475662231, "losses/total": 0.5467911958694458, "ref_logps/chosen": -35.361541748046875, "ref_logps/rejected": -49.73594665527344, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1184006929397583, "rewards/margins": 0.8831894397735596, "rewards/rejected": -2.0015902519226074, "step": 1082 }, { "epoch": 1.02, "grad_norm": 20.42867660522461, "learning_rate": 3.66211962224554e-07, "logps/chosen": -50.04201889038086, "logps/rejected": -57.776390075683594, "loss": 0.495, "losses/dpo": 0.2337605059146881, "losses/sft": 1.7641470432281494, "losses/total": 0.2337605059146881, "ref_logps/chosen": -41.76304244995117, "ref_logps/rejected": -42.703399658203125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8278974890708923, "rewards/margins": 0.6794015169143677, "rewards/rejected": -1.5072989463806152, "step": 1083 }, { "epoch": 1.02, "grad_norm": 24.451725006103516, "learning_rate": 3.660370759006646e-07, "logps/chosen": -50.32181167602539, "logps/rejected": -53.35579299926758, "loss": 0.6113, "losses/dpo": 0.5341934561729431, "losses/sft": 1.7669661045074463, "losses/total": 0.5341934561729431, "ref_logps/chosen": -38.576175689697266, "ref_logps/rejected": -36.4826774597168, "rewards/accuracies": 0.75, "rewards/chosen": -1.1745634078979492, "rewards/margins": 0.5127479434013367, "rewards/rejected": -1.6873114109039307, "step": 1084 }, { "epoch": 1.02, "grad_norm": 18.57804298400879, "learning_rate": 3.6586218957677504e-07, "logps/chosen": -51.10692596435547, "logps/rejected": -69.60911560058594, "loss": 0.4381, "losses/dpo": 0.21541637182235718, "losses/sft": 2.0709481239318848, "losses/total": 0.21541637182235718, "ref_logps/chosen": -38.682254791259766, "ref_logps/rejected": -46.849151611328125, "rewards/accuracies": 0.75, "rewards/chosen": -1.2424674034118652, "rewards/margins": 1.0335289239883423, "rewards/rejected": -2.275996208190918, "step": 1085 }, { "epoch": 1.03, "grad_norm": 14.761630058288574, "learning_rate": 3.656873032528856e-07, "logps/chosen": -42.015716552734375, "logps/rejected": -60.31280517578125, "loss": 0.3823, "losses/dpo": 0.38250288367271423, "losses/sft": 1.6169302463531494, "losses/total": 0.38250288367271423, "ref_logps/chosen": -33.85796356201172, "ref_logps/rejected": -39.86603927612305, "rewards/accuracies": 0.875, "rewards/chosen": -0.8157749176025391, "rewards/margins": 1.2289013862609863, "rewards/rejected": -2.0446763038635254, "step": 1086 }, { "epoch": 1.03, "grad_norm": 16.132125854492188, "learning_rate": 3.6551241692899617e-07, "logps/chosen": -43.13732147216797, "logps/rejected": -61.28578186035156, "loss": 0.4222, "losses/dpo": 0.6710204482078552, "losses/sft": 1.5309427976608276, "losses/total": 0.6710204482078552, "ref_logps/chosen": -34.049827575683594, "ref_logps/rejected": -41.381378173828125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9087490439414978, "rewards/margins": 1.0816915035247803, "rewards/rejected": -1.9904407262802124, "step": 1087 }, { "epoch": 1.03, "grad_norm": 17.13888931274414, "learning_rate": 3.653375306051067e-07, "logps/chosen": -50.40296173095703, "logps/rejected": -51.6271858215332, "loss": 0.4137, "losses/dpo": 0.48693373799324036, "losses/sft": 2.0377941131591797, "losses/total": 0.48693373799324036, "ref_logps/chosen": -42.096466064453125, "ref_logps/rejected": -34.91358947753906, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8306496739387512, "rewards/margins": 0.8407102823257446, "rewards/rejected": -1.671359896659851, "step": 1088 }, { "epoch": 1.03, "grad_norm": 14.04586410522461, "learning_rate": 3.651626442812172e-07, "logps/chosen": -60.53893280029297, "logps/rejected": -75.64192199707031, "loss": 0.2733, "losses/dpo": 0.2743315100669861, "losses/sft": 2.013023853302002, "losses/total": 0.2743315100669861, "ref_logps/chosen": -51.14175796508789, "ref_logps/rejected": -52.965911865234375, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9397180080413818, "rewards/margins": 1.3278836011886597, "rewards/rejected": -2.267601728439331, "step": 1089 }, { "epoch": 1.03, "grad_norm": 19.8659725189209, "learning_rate": 3.649877579573277e-07, "logps/chosen": -46.49089050292969, "logps/rejected": -73.6563491821289, "loss": 0.5047, "losses/dpo": 0.48692750930786133, "losses/sft": 1.5766716003417969, "losses/total": 0.48692750930786133, "ref_logps/chosen": -36.45451354980469, "ref_logps/rejected": -55.79833984375, "rewards/accuracies": 0.75, "rewards/chosen": -1.0036377906799316, "rewards/margins": 0.7821632027626038, "rewards/rejected": -1.7858009338378906, "step": 1090 }, { "epoch": 1.03, "grad_norm": 21.191913604736328, "learning_rate": 3.6481287163343827e-07, "logps/chosen": -54.523948669433594, "logps/rejected": -64.21807861328125, "loss": 0.5332, "losses/dpo": 0.3113740384578705, "losses/sft": 1.6445286273956299, "losses/total": 0.3113740384578705, "ref_logps/chosen": -43.71592712402344, "ref_logps/rejected": -44.88184356689453, "rewards/accuracies": 0.75, "rewards/chosen": -1.0808024406433105, "rewards/margins": 0.8528213500976562, "rewards/rejected": -1.9336237907409668, "step": 1091 }, { "epoch": 1.03, "grad_norm": 15.382048606872559, "learning_rate": 3.6463798530954873e-07, "logps/chosen": -47.13607406616211, "logps/rejected": -55.18809509277344, "loss": 0.3878, "losses/dpo": 0.408435583114624, "losses/sft": 1.995726227760315, "losses/total": 0.408435583114624, "ref_logps/chosen": -37.132164001464844, "ref_logps/rejected": -36.19349670410156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0003913640975952, "rewards/margins": 0.8990681767463684, "rewards/rejected": -1.8994596004486084, "step": 1092 }, { "epoch": 1.03, "grad_norm": 16.46430778503418, "learning_rate": 3.644630989856593e-07, "logps/chosen": -37.23838424682617, "logps/rejected": -55.96215057373047, "loss": 0.3795, "losses/dpo": 0.49285826086997986, "losses/sft": 1.6897306442260742, "losses/total": 0.49285826086997986, "ref_logps/chosen": -28.73897361755371, "ref_logps/rejected": -37.051513671875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.849941074848175, "rewards/margins": 1.0411226749420166, "rewards/rejected": -1.8910638093948364, "step": 1093 }, { "epoch": 1.03, "grad_norm": 21.956340789794922, "learning_rate": 3.6428821266176986e-07, "logps/chosen": -57.616729736328125, "logps/rejected": -68.11526489257812, "loss": 0.5237, "losses/dpo": 0.5827259421348572, "losses/sft": 1.3818707466125488, "losses/total": 0.5827259421348572, "ref_logps/chosen": -45.73115539550781, "ref_logps/rejected": -49.68520736694336, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1885576248168945, "rewards/margins": 0.6544477939605713, "rewards/rejected": -1.8430054187774658, "step": 1094 }, { "epoch": 1.03, "grad_norm": 18.08264923095703, "learning_rate": 3.641133263378804e-07, "logps/chosen": -54.895057678222656, "logps/rejected": -75.75250244140625, "loss": 0.429, "losses/dpo": 0.2474673092365265, "losses/sft": 2.306041955947876, "losses/total": 0.2474673092365265, "ref_logps/chosen": -41.94483947753906, "ref_logps/rejected": -50.75457763671875, "rewards/accuracies": 0.875, "rewards/chosen": -1.2950215339660645, "rewards/margins": 1.204770803451538, "rewards/rejected": -2.4997923374176025, "step": 1095 }, { "epoch": 1.03, "grad_norm": 18.44590187072754, "learning_rate": 3.639384400139909e-07, "logps/chosen": -41.84230422973633, "logps/rejected": -60.22125244140625, "loss": 0.3716, "losses/dpo": 0.5278711318969727, "losses/sft": 1.1868032217025757, "losses/total": 0.5278711318969727, "ref_logps/chosen": -33.745643615722656, "ref_logps/rejected": -39.61204528808594, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8096659183502197, "rewards/margins": 1.251254677772522, "rewards/rejected": -2.060920476913452, "step": 1096 }, { "epoch": 1.04, "grad_norm": 26.01068687438965, "learning_rate": 3.637635536901014e-07, "logps/chosen": -42.99476623535156, "logps/rejected": -51.496116638183594, "loss": 0.6381, "losses/dpo": 0.7461168169975281, "losses/sft": 1.5090261697769165, "losses/total": 0.7461168169975281, "ref_logps/chosen": -32.50988006591797, "ref_logps/rejected": -36.6175537109375, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0484886169433594, "rewards/margins": 0.43936800956726074, "rewards/rejected": -1.4878566265106201, "step": 1097 }, { "epoch": 1.04, "grad_norm": 15.729644775390625, "learning_rate": 3.6358866736621197e-07, "logps/chosen": -36.038970947265625, "logps/rejected": -47.00421142578125, "loss": 0.3699, "losses/dpo": 0.26515817642211914, "losses/sft": 1.4794013500213623, "losses/total": 0.26515817642211914, "ref_logps/chosen": -31.03093719482422, "ref_logps/rejected": -31.28116226196289, "rewards/accuracies": 0.875, "rewards/chosen": -0.5008033514022827, "rewards/margins": 1.0715014934539795, "rewards/rejected": -1.5723048448562622, "step": 1098 }, { "epoch": 1.04, "grad_norm": 19.49972915649414, "learning_rate": 3.634137810423225e-07, "logps/chosen": -46.45841979980469, "logps/rejected": -57.73186492919922, "loss": 0.4877, "losses/dpo": 0.48669862747192383, "losses/sft": 1.3054991960525513, "losses/total": 0.48669862747192383, "ref_logps/chosen": -37.5338134765625, "ref_logps/rejected": -40.144432067871094, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8924607038497925, "rewards/margins": 0.8662827014923096, "rewards/rejected": -1.7587432861328125, "step": 1099 }, { "epoch": 1.04, "grad_norm": 16.852275848388672, "learning_rate": 3.63238894718433e-07, "logps/chosen": -43.074493408203125, "logps/rejected": -51.77428436279297, "loss": 0.4386, "losses/dpo": 0.4206892251968384, "losses/sft": 1.713990330696106, "losses/total": 0.4206892251968384, "ref_logps/chosen": -33.638893127441406, "ref_logps/rejected": -33.846519470214844, "rewards/accuracies": 0.875, "rewards/chosen": -0.9435595273971558, "rewards/margins": 0.8492172956466675, "rewards/rejected": -1.7927768230438232, "step": 1100 }, { "epoch": 1.04, "grad_norm": 19.013151168823242, "learning_rate": 3.6306400839454356e-07, "logps/chosen": -50.996768951416016, "logps/rejected": -59.741737365722656, "loss": 0.4349, "losses/dpo": 0.4508028030395508, "losses/sft": 1.7069379091262817, "losses/total": 0.4508028030395508, "ref_logps/chosen": -40.384803771972656, "ref_logps/rejected": -39.33921432495117, "rewards/accuracies": 0.75, "rewards/chosen": -1.0611966848373413, "rewards/margins": 0.9790558218955994, "rewards/rejected": -2.040252447128296, "step": 1101 }, { "epoch": 1.04, "grad_norm": 16.264345169067383, "learning_rate": 3.6288912207065407e-07, "logps/chosen": -51.23906707763672, "logps/rejected": -59.40775680541992, "loss": 0.4368, "losses/dpo": 0.6014887094497681, "losses/sft": 2.029402256011963, "losses/total": 0.6014887094497681, "ref_logps/chosen": -42.39584732055664, "ref_logps/rejected": -41.84788513183594, "rewards/accuracies": 0.75, "rewards/chosen": -0.8843222260475159, "rewards/margins": 0.8716649413108826, "rewards/rejected": -1.7559871673583984, "step": 1102 }, { "epoch": 1.04, "grad_norm": 17.19400405883789, "learning_rate": 3.627142357467646e-07, "logps/chosen": -41.24311065673828, "logps/rejected": -64.39938354492188, "loss": 0.3703, "losses/dpo": 0.3022533655166626, "losses/sft": 1.4316505193710327, "losses/total": 0.3022533655166626, "ref_logps/chosen": -30.809202194213867, "ref_logps/rejected": -43.1936149597168, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0433909893035889, "rewards/margins": 1.0771856307983398, "rewards/rejected": -2.1205766201019287, "step": 1103 }, { "epoch": 1.04, "grad_norm": 14.232845306396484, "learning_rate": 3.625393494228751e-07, "logps/chosen": -53.44538116455078, "logps/rejected": -81.70079803466797, "loss": 0.3057, "losses/dpo": 0.35619908571243286, "losses/sft": 1.759982943534851, "losses/total": 0.35619908571243286, "ref_logps/chosen": -42.34745788574219, "ref_logps/rejected": -55.37383270263672, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1097925901412964, "rewards/margins": 1.5229041576385498, "rewards/rejected": -2.6326968669891357, "step": 1104 }, { "epoch": 1.04, "grad_norm": 16.24256706237793, "learning_rate": 3.6236446309898566e-07, "logps/chosen": -59.25031280517578, "logps/rejected": -76.15831756591797, "loss": 0.3057, "losses/dpo": 0.5977211594581604, "losses/sft": 1.8647884130477905, "losses/total": 0.5977211594581604, "ref_logps/chosen": -49.02546691894531, "ref_logps/rejected": -48.99300765991211, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0224847793579102, "rewards/margins": 1.6940460205078125, "rewards/rejected": -2.7165307998657227, "step": 1105 }, { "epoch": 1.04, "grad_norm": 14.55997371673584, "learning_rate": 3.6218957677509617e-07, "logps/chosen": -40.7120361328125, "logps/rejected": -59.31071472167969, "loss": 0.3353, "losses/dpo": 0.2439900040626526, "losses/sft": 1.3137151002883911, "losses/total": 0.2439900040626526, "ref_logps/chosen": -34.27995300292969, "ref_logps/rejected": -40.9670295715332, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6432080268859863, "rewards/margins": 1.1911606788635254, "rewards/rejected": -1.8343685865402222, "step": 1106 }, { "epoch": 1.05, "grad_norm": 16.666099548339844, "learning_rate": 3.620146904512067e-07, "logps/chosen": -47.12230682373047, "logps/rejected": -61.94658660888672, "loss": 0.4388, "losses/dpo": 0.33863022923469543, "losses/sft": 1.6335513591766357, "losses/total": 0.33863022923469543, "ref_logps/chosen": -38.68480682373047, "ref_logps/rejected": -42.507816314697266, "rewards/accuracies": 0.75, "rewards/chosen": -0.8437499403953552, "rewards/margins": 1.1001267433166504, "rewards/rejected": -1.9438767433166504, "step": 1107 }, { "epoch": 1.05, "grad_norm": 17.748262405395508, "learning_rate": 3.6183980412731725e-07, "logps/chosen": -51.22157287597656, "logps/rejected": -69.38434600830078, "loss": 0.423, "losses/dpo": 0.8381637930870056, "losses/sft": 2.3169407844543457, "losses/total": 0.8381637930870056, "ref_logps/chosen": -37.958824157714844, "ref_logps/rejected": -44.11109161376953, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3262743949890137, "rewards/margins": 1.2010509967803955, "rewards/rejected": -2.5273256301879883, "step": 1108 }, { "epoch": 1.05, "grad_norm": 22.701948165893555, "learning_rate": 3.6166491780342776e-07, "logps/chosen": -54.894561767578125, "logps/rejected": -66.18856811523438, "loss": 0.52, "losses/dpo": 0.6727304458618164, "losses/sft": 1.6012508869171143, "losses/total": 0.6727304458618164, "ref_logps/chosen": -41.45005798339844, "ref_logps/rejected": -45.44903564453125, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3444503545761108, "rewards/margins": 0.7295031547546387, "rewards/rejected": -2.073953628540039, "step": 1109 }, { "epoch": 1.05, "grad_norm": 18.720478057861328, "learning_rate": 3.614900314795383e-07, "logps/chosen": -59.015777587890625, "logps/rejected": -59.62892532348633, "loss": 0.4505, "losses/dpo": 0.53801429271698, "losses/sft": 1.892388105392456, "losses/total": 0.53801429271698, "ref_logps/chosen": -46.950958251953125, "ref_logps/rejected": -40.51457214355469, "rewards/accuracies": 0.875, "rewards/chosen": -1.20648193359375, "rewards/margins": 0.7049534320831299, "rewards/rejected": -1.9114353656768799, "step": 1110 }, { "epoch": 1.05, "grad_norm": 17.969141006469727, "learning_rate": 3.613151451556488e-07, "logps/chosen": -49.31396484375, "logps/rejected": -55.11629867553711, "loss": 0.447, "losses/dpo": 0.32817405462265015, "losses/sft": 1.3181028366088867, "losses/total": 0.32817405462265015, "ref_logps/chosen": -40.17245101928711, "ref_logps/rejected": -37.67129898071289, "rewards/accuracies": 0.875, "rewards/chosen": -0.9141513109207153, "rewards/margins": 0.8303487300872803, "rewards/rejected": -1.7445001602172852, "step": 1111 }, { "epoch": 1.05, "grad_norm": 25.278850555419922, "learning_rate": 3.6114025883175935e-07, "logps/chosen": -69.34782409667969, "logps/rejected": -71.66302490234375, "loss": 0.643, "losses/dpo": 0.6143825054168701, "losses/sft": 1.6051511764526367, "losses/total": 0.6143825054168701, "ref_logps/chosen": -49.629981994628906, "ref_logps/rejected": -47.354042053222656, "rewards/accuracies": 0.6875, "rewards/chosen": -1.971783995628357, "rewards/margins": 0.4591141939163208, "rewards/rejected": -2.4308981895446777, "step": 1112 }, { "epoch": 1.05, "grad_norm": 22.47568130493164, "learning_rate": 3.6096537250786987e-07, "logps/chosen": -37.21412658691406, "logps/rejected": -58.294795989990234, "loss": 0.5597, "losses/dpo": 0.17180033028125763, "losses/sft": 1.4381208419799805, "losses/total": 0.17180033028125763, "ref_logps/chosen": -27.49213409423828, "ref_logps/rejected": -40.21863555908203, "rewards/accuracies": 0.625, "rewards/chosen": -0.9721994996070862, "rewards/margins": 0.8354163765907288, "rewards/rejected": -1.8076159954071045, "step": 1113 }, { "epoch": 1.05, "grad_norm": 15.010885238647461, "learning_rate": 3.607904861839804e-07, "logps/chosen": -46.075538635253906, "logps/rejected": -66.32315063476562, "loss": 0.3255, "losses/dpo": 0.40915241837501526, "losses/sft": 1.4643089771270752, "losses/total": 0.40915241837501526, "ref_logps/chosen": -36.338890075683594, "ref_logps/rejected": -44.26709747314453, "rewards/accuracies": 0.875, "rewards/chosen": -0.9736647009849548, "rewards/margins": 1.2319402694702148, "rewards/rejected": -2.2056050300598145, "step": 1114 }, { "epoch": 1.05, "grad_norm": 19.576242446899414, "learning_rate": 3.6061559986009095e-07, "logps/chosen": -44.7883186340332, "logps/rejected": -59.95389175415039, "loss": 0.4244, "losses/dpo": 0.35913681983947754, "losses/sft": 1.27180814743042, "losses/total": 0.35913681983947754, "ref_logps/chosen": -35.445308685302734, "ref_logps/rejected": -38.98551940917969, "rewards/accuracies": 0.75, "rewards/chosen": -0.9343007206916809, "rewards/margins": 1.16253662109375, "rewards/rejected": -2.096837282180786, "step": 1115 }, { "epoch": 1.05, "grad_norm": 18.308237075805664, "learning_rate": 3.6044071353620146e-07, "logps/chosen": -36.147193908691406, "logps/rejected": -58.4028205871582, "loss": 0.4321, "losses/dpo": 0.4773769974708557, "losses/sft": 1.2280681133270264, "losses/total": 0.4773769974708557, "ref_logps/chosen": -28.9426326751709, "ref_logps/rejected": -40.52062225341797, "rewards/accuracies": 0.75, "rewards/chosen": -0.7204559445381165, "rewards/margins": 1.0677636861801147, "rewards/rejected": -1.788219690322876, "step": 1116 }, { "epoch": 1.05, "grad_norm": 16.233169555664062, "learning_rate": 3.6026582721231197e-07, "logps/chosen": -39.595314025878906, "logps/rejected": -53.48869705200195, "loss": 0.4073, "losses/dpo": 0.3475499749183655, "losses/sft": 1.2709814310073853, "losses/total": 0.3475499749183655, "ref_logps/chosen": -30.529605865478516, "ref_logps/rejected": -36.00749969482422, "rewards/accuracies": 0.875, "rewards/chosen": -0.9065707325935364, "rewards/margins": 0.8415488600730896, "rewards/rejected": -1.748119592666626, "step": 1117 }, { "epoch": 1.06, "grad_norm": 15.320701599121094, "learning_rate": 3.6009094088842254e-07, "logps/chosen": -44.485107421875, "logps/rejected": -54.91472625732422, "loss": 0.3597, "losses/dpo": 0.48282861709594727, "losses/sft": 1.5630582571029663, "losses/total": 0.48282861709594727, "ref_logps/chosen": -35.60723114013672, "ref_logps/rejected": -35.52288055419922, "rewards/accuracies": 0.875, "rewards/chosen": -0.8877873420715332, "rewards/margins": 1.0513970851898193, "rewards/rejected": -1.9391844272613525, "step": 1118 }, { "epoch": 1.06, "grad_norm": 14.03819751739502, "learning_rate": 3.5991605456453305e-07, "logps/chosen": -47.40251159667969, "logps/rejected": -75.83522033691406, "loss": 0.2648, "losses/dpo": 0.24401050806045532, "losses/sft": 1.9604064226150513, "losses/total": 0.24401050806045532, "ref_logps/chosen": -38.702205657958984, "ref_logps/rejected": -52.07846450805664, "rewards/accuracies": 1.0, "rewards/chosen": -0.8700307011604309, "rewards/margins": 1.5056447982788086, "rewards/rejected": -2.375675678253174, "step": 1119 }, { "epoch": 1.06, "grad_norm": 17.748851776123047, "learning_rate": 3.5974116824064356e-07, "logps/chosen": -47.6712532043457, "logps/rejected": -79.5129623413086, "loss": 0.3323, "losses/dpo": 0.25378283858299255, "losses/sft": 1.4068554639816284, "losses/total": 0.25378283858299255, "ref_logps/chosen": -37.408958435058594, "ref_logps/rejected": -55.53813934326172, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0262292623519897, "rewards/margins": 1.3712530136108398, "rewards/rejected": -2.39748215675354, "step": 1120 }, { "epoch": 1.06, "grad_norm": 19.006412506103516, "learning_rate": 3.595662819167541e-07, "logps/chosen": -49.565391540527344, "logps/rejected": -76.67436218261719, "loss": 0.3702, "losses/dpo": 0.50902259349823, "losses/sft": 2.1297295093536377, "losses/total": 0.50902259349823, "ref_logps/chosen": -38.21741485595703, "ref_logps/rejected": -54.113346099853516, "rewards/accuracies": 0.875, "rewards/chosen": -1.1347975730895996, "rewards/margins": 1.1213042736053467, "rewards/rejected": -2.2561018466949463, "step": 1121 }, { "epoch": 1.06, "grad_norm": 13.391525268554688, "learning_rate": 3.5939139559286464e-07, "logps/chosen": -45.734527587890625, "logps/rejected": -64.69873046875, "loss": 0.2798, "losses/dpo": 0.22471119463443756, "losses/sft": 1.4187021255493164, "losses/total": 0.22471119463443756, "ref_logps/chosen": -36.93622589111328, "ref_logps/rejected": -41.9991569519043, "rewards/accuracies": 0.875, "rewards/chosen": -0.8798298835754395, "rewards/margins": 1.3901268243789673, "rewards/rejected": -2.2699568271636963, "step": 1122 }, { "epoch": 1.06, "grad_norm": 15.596683502197266, "learning_rate": 3.5921650926897515e-07, "logps/chosen": -41.36247634887695, "logps/rejected": -71.56450653076172, "loss": 0.3539, "losses/dpo": 0.24520686268806458, "losses/sft": 1.4111450910568237, "losses/total": 0.24520686268806458, "ref_logps/chosen": -34.94282531738281, "ref_logps/rejected": -50.83561706542969, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6419654488563538, "rewards/margins": 1.4309229850769043, "rewards/rejected": -2.0728883743286133, "step": 1123 }, { "epoch": 1.06, "grad_norm": 19.83420753479004, "learning_rate": 3.5904162294508567e-07, "logps/chosen": -47.32707214355469, "logps/rejected": -72.98094940185547, "loss": 0.3454, "losses/dpo": 0.40817296504974365, "losses/sft": 1.5442166328430176, "losses/total": 0.40817296504974365, "ref_logps/chosen": -37.06970977783203, "ref_logps/rejected": -48.767147064208984, "rewards/accuracies": 0.75, "rewards/chosen": -1.0257360935211182, "rewards/margins": 1.395643949508667, "rewards/rejected": -2.421380043029785, "step": 1124 }, { "epoch": 1.06, "grad_norm": 21.95766830444336, "learning_rate": 3.5886673662119623e-07, "logps/chosen": -57.457759857177734, "logps/rejected": -59.646793365478516, "loss": 0.4503, "losses/dpo": 0.3703533411026001, "losses/sft": 1.6664035320281982, "losses/total": 0.3703533411026001, "ref_logps/chosen": -43.65606689453125, "ref_logps/rejected": -37.202117919921875, "rewards/accuracies": 0.875, "rewards/chosen": -1.3801689147949219, "rewards/margins": 0.8642987012863159, "rewards/rejected": -2.2444677352905273, "step": 1125 }, { "epoch": 1.06, "grad_norm": 21.429195404052734, "learning_rate": 3.5869185029730674e-07, "logps/chosen": -58.990962982177734, "logps/rejected": -64.17696380615234, "loss": 0.4925, "losses/dpo": 0.3632112741470337, "losses/sft": 1.3681801557540894, "losses/total": 0.3632112741470337, "ref_logps/chosen": -44.11982727050781, "ref_logps/rejected": -40.79416275024414, "rewards/accuracies": 0.875, "rewards/chosen": -1.4871137142181396, "rewards/margins": 0.8511661887168884, "rewards/rejected": -2.338279962539673, "step": 1126 }, { "epoch": 1.06, "grad_norm": 19.995677947998047, "learning_rate": 3.5851696397341726e-07, "logps/chosen": -37.011383056640625, "logps/rejected": -66.07659912109375, "loss": 0.4604, "losses/dpo": 0.21649906039237976, "losses/sft": 1.7158467769622803, "losses/total": 0.21649906039237976, "ref_logps/chosen": -26.816097259521484, "ref_logps/rejected": -43.77798080444336, "rewards/accuracies": 0.75, "rewards/chosen": -1.0195286273956299, "rewards/margins": 1.2103328704833984, "rewards/rejected": -2.229861259460449, "step": 1127 }, { "epoch": 1.07, "grad_norm": 23.656599044799805, "learning_rate": 3.5834207764952777e-07, "logps/chosen": -53.18360900878906, "logps/rejected": -63.63755416870117, "loss": 0.4436, "losses/dpo": 0.5402354598045349, "losses/sft": 1.850532054901123, "losses/total": 0.5402354598045349, "ref_logps/chosen": -39.15081787109375, "ref_logps/rejected": -40.1318359375, "rewards/accuracies": 0.75, "rewards/chosen": -1.4032790660858154, "rewards/margins": 0.9472929239273071, "rewards/rejected": -2.350571870803833, "step": 1128 }, { "epoch": 1.07, "grad_norm": 15.228174209594727, "learning_rate": 3.5816719132563833e-07, "logps/chosen": -57.978492736816406, "logps/rejected": -80.89125061035156, "loss": 0.296, "losses/dpo": 0.23466572165489197, "losses/sft": 2.539710521697998, "losses/total": 0.23466572165489197, "ref_logps/chosen": -45.61967086791992, "ref_logps/rejected": -53.44007873535156, "rewards/accuracies": 0.875, "rewards/chosen": -1.2358821630477905, "rewards/margins": 1.509235143661499, "rewards/rejected": -2.7451171875, "step": 1129 }, { "epoch": 1.07, "grad_norm": 27.180347442626953, "learning_rate": 3.579923050017489e-07, "logps/chosen": -50.36742401123047, "logps/rejected": -55.680397033691406, "loss": 0.6255, "losses/dpo": 0.6598228216171265, "losses/sft": 1.9887068271636963, "losses/total": 0.6598228216171265, "ref_logps/chosen": -33.17410659790039, "ref_logps/rejected": -34.39702606201172, "rewards/accuracies": 0.625, "rewards/chosen": -1.7193315029144287, "rewards/margins": 0.4090052843093872, "rewards/rejected": -2.1283369064331055, "step": 1130 }, { "epoch": 1.07, "grad_norm": 27.55199432373047, "learning_rate": 3.5781741867785936e-07, "logps/chosen": -60.86018753051758, "logps/rejected": -68.66188049316406, "loss": 0.6284, "losses/dpo": 0.8763452172279358, "losses/sft": 1.7240406274795532, "losses/total": 0.8763452172279358, "ref_logps/chosen": -45.129634857177734, "ref_logps/rejected": -48.709686279296875, "rewards/accuracies": 0.5, "rewards/chosen": -1.5730552673339844, "rewards/margins": 0.4221642017364502, "rewards/rejected": -1.9952194690704346, "step": 1131 }, { "epoch": 1.07, "grad_norm": 25.422346115112305, "learning_rate": 3.576425323539699e-07, "logps/chosen": -58.734981536865234, "logps/rejected": -54.84503173828125, "loss": 0.6635, "losses/dpo": 0.43343645334243774, "losses/sft": 1.9890453815460205, "losses/total": 0.43343645334243774, "ref_logps/chosen": -41.60216522216797, "ref_logps/rejected": -33.43422317504883, "rewards/accuracies": 0.5625, "rewards/chosen": -1.7132813930511475, "rewards/margins": 0.4277995228767395, "rewards/rejected": -2.141080856323242, "step": 1132 }, { "epoch": 1.07, "grad_norm": 18.690250396728516, "learning_rate": 3.5746764603008044e-07, "logps/chosen": -58.3661994934082, "logps/rejected": -69.38311767578125, "loss": 0.3678, "losses/dpo": 0.42590248584747314, "losses/sft": 1.55850088596344, "losses/total": 0.42590248584747314, "ref_logps/chosen": -45.18629455566406, "ref_logps/rejected": -44.711822509765625, "rewards/accuracies": 0.875, "rewards/chosen": -1.3179905414581299, "rewards/margins": 1.1491386890411377, "rewards/rejected": -2.4671292304992676, "step": 1133 }, { "epoch": 1.07, "grad_norm": 24.80685043334961, "learning_rate": 3.5729275970619095e-07, "logps/chosen": -48.22721481323242, "logps/rejected": -63.99211120605469, "loss": 0.4485, "losses/dpo": 0.45494773983955383, "losses/sft": 1.812914252281189, "losses/total": 0.45494773983955383, "ref_logps/chosen": -33.45372772216797, "ref_logps/rejected": -39.96595001220703, "rewards/accuracies": 0.8125, "rewards/chosen": -1.477348804473877, "rewards/margins": 0.9252671003341675, "rewards/rejected": -2.402616024017334, "step": 1134 }, { "epoch": 1.07, "grad_norm": 26.141324996948242, "learning_rate": 3.5711787338230146e-07, "logps/chosen": -50.24357604980469, "logps/rejected": -70.65209197998047, "loss": 0.4864, "losses/dpo": 0.8688951730728149, "losses/sft": 2.723207712173462, "losses/total": 0.8688951730728149, "ref_logps/chosen": -32.94593048095703, "ref_logps/rejected": -43.611351013183594, "rewards/accuracies": 0.75, "rewards/chosen": -1.729764699935913, "rewards/margins": 0.9743098020553589, "rewards/rejected": -2.7040746212005615, "step": 1135 }, { "epoch": 1.07, "grad_norm": 18.747983932495117, "learning_rate": 3.5694298705841203e-07, "logps/chosen": -49.9259033203125, "logps/rejected": -58.2346076965332, "loss": 0.4268, "losses/dpo": 0.30385822057724, "losses/sft": 1.8258968591690063, "losses/total": 0.30385822057724, "ref_logps/chosen": -39.310333251953125, "ref_logps/rejected": -37.35885238647461, "rewards/accuracies": 0.8125, "rewards/chosen": -1.061557412147522, "rewards/margins": 1.0260180234909058, "rewards/rejected": -2.0875754356384277, "step": 1136 }, { "epoch": 1.07, "grad_norm": 19.496219635009766, "learning_rate": 3.567681007345226e-07, "logps/chosen": -53.89305877685547, "logps/rejected": -78.14730072021484, "loss": 0.3448, "losses/dpo": 0.4381665289402008, "losses/sft": 1.6898956298828125, "losses/total": 0.4381665289402008, "ref_logps/chosen": -38.962066650390625, "ref_logps/rejected": -49.8504638671875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.493099331855774, "rewards/margins": 1.3365838527679443, "rewards/rejected": -2.8296830654144287, "step": 1137 }, { "epoch": 1.07, "grad_norm": 28.223752975463867, "learning_rate": 3.5659321441063305e-07, "logps/chosen": -69.51285552978516, "logps/rejected": -78.18511199951172, "loss": 0.4396, "losses/dpo": 0.8602421283721924, "losses/sft": 2.231619358062744, "losses/total": 0.8602421283721924, "ref_logps/chosen": -51.995994567871094, "ref_logps/rejected": -49.90565490722656, "rewards/accuracies": 0.875, "rewards/chosen": -1.7516865730285645, "rewards/margins": 1.0762584209442139, "rewards/rejected": -2.8279449939727783, "step": 1138 }, { "epoch": 1.08, "grad_norm": 22.851444244384766, "learning_rate": 3.564183280867436e-07, "logps/chosen": -44.15315246582031, "logps/rejected": -51.366722106933594, "loss": 0.4632, "losses/dpo": 0.5887936353683472, "losses/sft": 2.151808261871338, "losses/total": 0.5887936353683472, "ref_logps/chosen": -31.659286499023438, "ref_logps/rejected": -28.490835189819336, "rewards/accuracies": 0.75, "rewards/chosen": -1.2493866682052612, "rewards/margins": 1.0382020473480225, "rewards/rejected": -2.2875888347625732, "step": 1139 }, { "epoch": 1.08, "grad_norm": 25.950254440307617, "learning_rate": 3.5624344176285413e-07, "logps/chosen": -57.27245330810547, "logps/rejected": -66.78453063964844, "loss": 0.5041, "losses/dpo": 0.5709726810455322, "losses/sft": 2.028679609298706, "losses/total": 0.5709726810455322, "ref_logps/chosen": -44.13425827026367, "ref_logps/rejected": -45.715187072753906, "rewards/accuracies": 0.75, "rewards/chosen": -1.3138198852539062, "rewards/margins": 0.7931150794029236, "rewards/rejected": -2.1069350242614746, "step": 1140 }, { "epoch": 1.08, "grad_norm": 33.055450439453125, "learning_rate": 3.5606855543896464e-07, "logps/chosen": -60.55674743652344, "logps/rejected": -58.6107292175293, "loss": 0.7891, "losses/dpo": 0.7282435894012451, "losses/sft": 1.6608691215515137, "losses/total": 0.7282435894012451, "ref_logps/chosen": -41.75940704345703, "ref_logps/rejected": -37.362300872802734, "rewards/accuracies": 0.625, "rewards/chosen": -1.879733920097351, "rewards/margins": 0.24510888755321503, "rewards/rejected": -2.124842643737793, "step": 1141 }, { "epoch": 1.08, "grad_norm": 18.956497192382812, "learning_rate": 3.5589366911507516e-07, "logps/chosen": -46.049354553222656, "logps/rejected": -58.819793701171875, "loss": 0.3508, "losses/dpo": 0.2803114354610443, "losses/sft": 1.8871058225631714, "losses/total": 0.2803114354610443, "ref_logps/chosen": -35.16623306274414, "ref_logps/rejected": -36.46910095214844, "rewards/accuracies": 0.875, "rewards/chosen": -1.0883126258850098, "rewards/margins": 1.1467571258544922, "rewards/rejected": -2.235069751739502, "step": 1142 }, { "epoch": 1.08, "grad_norm": 17.948486328125, "learning_rate": 3.557187827911857e-07, "logps/chosen": -52.22205352783203, "logps/rejected": -73.42619323730469, "loss": 0.3478, "losses/dpo": 0.4428257346153259, "losses/sft": 1.4677045345306396, "losses/total": 0.4428257346153259, "ref_logps/chosen": -39.68170928955078, "ref_logps/rejected": -49.47446060180664, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2540345191955566, "rewards/margins": 1.1411387920379639, "rewards/rejected": -2.3951730728149414, "step": 1143 }, { "epoch": 1.08, "grad_norm": 25.768634796142578, "learning_rate": 3.555438964672963e-07, "logps/chosen": -54.37225341796875, "logps/rejected": -80.14595794677734, "loss": 0.4138, "losses/dpo": 0.23643368482589722, "losses/sft": 1.6120222806930542, "losses/total": 0.23643368482589722, "ref_logps/chosen": -41.12909698486328, "ref_logps/rejected": -52.570533752441406, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3243160247802734, "rewards/margins": 1.4332270622253418, "rewards/rejected": -2.7575430870056152, "step": 1144 }, { "epoch": 1.08, "grad_norm": 19.334613800048828, "learning_rate": 3.5536901014340675e-07, "logps/chosen": -44.830909729003906, "logps/rejected": -62.91521072387695, "loss": 0.4225, "losses/dpo": 0.22556906938552856, "losses/sft": 1.6811991930007935, "losses/total": 0.22556906938552856, "ref_logps/chosen": -32.09433364868164, "ref_logps/rejected": -38.02540588378906, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2736577987670898, "rewards/margins": 1.2153228521347046, "rewards/rejected": -2.488980770111084, "step": 1145 }, { "epoch": 1.08, "grad_norm": 14.628767967224121, "learning_rate": 3.551941238195173e-07, "logps/chosen": -42.94926071166992, "logps/rejected": -71.724853515625, "loss": 0.2858, "losses/dpo": 0.4847782254219055, "losses/sft": 1.6568411588668823, "losses/total": 0.4847782254219055, "ref_logps/chosen": -33.102718353271484, "ref_logps/rejected": -45.65483856201172, "rewards/accuracies": 1.0, "rewards/chosen": -0.9846542477607727, "rewards/margins": 1.6223468780517578, "rewards/rejected": -2.6070010662078857, "step": 1146 }, { "epoch": 1.08, "grad_norm": 18.073406219482422, "learning_rate": 3.5501923749562783e-07, "logps/chosen": -39.053016662597656, "logps/rejected": -57.59465026855469, "loss": 0.4177, "losses/dpo": 0.8261393308639526, "losses/sft": 1.3657069206237793, "losses/total": 0.8261393308639526, "ref_logps/chosen": -28.937376022338867, "ref_logps/rejected": -34.375274658203125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0115641355514526, "rewards/margins": 1.310373306274414, "rewards/rejected": -2.3219375610351562, "step": 1147 }, { "epoch": 1.08, "grad_norm": 21.49103355407715, "learning_rate": 3.5484435117173834e-07, "logps/chosen": -45.497642517089844, "logps/rejected": -65.38764190673828, "loss": 0.4269, "losses/dpo": 0.5173580646514893, "losses/sft": 1.3202495574951172, "losses/total": 0.5173580646514893, "ref_logps/chosen": -33.979156494140625, "ref_logps/rejected": -41.26577377319336, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1518484354019165, "rewards/margins": 1.260338544845581, "rewards/rejected": -2.412186861038208, "step": 1148 }, { "epoch": 1.08, "grad_norm": 23.00715446472168, "learning_rate": 3.5466946484784885e-07, "logps/chosen": -49.389007568359375, "logps/rejected": -73.2331771850586, "loss": 0.3978, "losses/dpo": 0.5119600296020508, "losses/sft": 1.5053342580795288, "losses/total": 0.5119600296020508, "ref_logps/chosen": -38.111602783203125, "ref_logps/rejected": -51.177330017089844, "rewards/accuracies": 0.875, "rewards/chosen": -1.1277403831481934, "rewards/margins": 1.0778441429138184, "rewards/rejected": -2.2055845260620117, "step": 1149 }, { "epoch": 1.09, "grad_norm": 15.713883399963379, "learning_rate": 3.544945785239594e-07, "logps/chosen": -41.73316955566406, "logps/rejected": -59.24371337890625, "loss": 0.4268, "losses/dpo": 0.5106825828552246, "losses/sft": 1.5899378061294556, "losses/total": 0.5106825828552246, "ref_logps/chosen": -32.25578308105469, "ref_logps/rejected": -39.29499053955078, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9477388858795166, "rewards/margins": 1.0471336841583252, "rewards/rejected": -1.9948724508285522, "step": 1150 }, { "epoch": 1.09, "grad_norm": 24.67866325378418, "learning_rate": 3.5431969220007e-07, "logps/chosen": -54.55917739868164, "logps/rejected": -71.77665710449219, "loss": 0.4008, "losses/dpo": 0.2664833962917328, "losses/sft": 1.702461838722229, "losses/total": 0.2664833962917328, "ref_logps/chosen": -41.261722564697266, "ref_logps/rejected": -47.21800994873047, "rewards/accuracies": 0.875, "rewards/chosen": -1.3297452926635742, "rewards/margins": 1.126119613647461, "rewards/rejected": -2.455864906311035, "step": 1151 }, { "epoch": 1.09, "grad_norm": 16.442657470703125, "learning_rate": 3.5414480587618044e-07, "logps/chosen": -49.97710037231445, "logps/rejected": -66.29367065429688, "loss": 0.3146, "losses/dpo": 0.26360827684402466, "losses/sft": 1.9229429960250854, "losses/total": 0.26360827684402466, "ref_logps/chosen": -39.35449981689453, "ref_logps/rejected": -43.36257553100586, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0622599124908447, "rewards/margins": 1.230850338935852, "rewards/rejected": -2.2931101322174072, "step": 1152 }, { "epoch": 1.09, "grad_norm": 17.925357818603516, "learning_rate": 3.53969919552291e-07, "logps/chosen": -54.834068298339844, "logps/rejected": -59.09382629394531, "loss": 0.3315, "losses/dpo": 0.35731256008148193, "losses/sft": 1.4937348365783691, "losses/total": 0.35731256008148193, "ref_logps/chosen": -47.17584991455078, "ref_logps/rejected": -37.09530258178711, "rewards/accuracies": 0.875, "rewards/chosen": -0.7658220529556274, "rewards/margins": 1.434030532836914, "rewards/rejected": -2.199852466583252, "step": 1153 }, { "epoch": 1.09, "grad_norm": 17.206207275390625, "learning_rate": 3.537950332284015e-07, "logps/chosen": -35.15504455566406, "logps/rejected": -58.040550231933594, "loss": 0.3286, "losses/dpo": 0.6031808257102966, "losses/sft": 1.793176293373108, "losses/total": 0.6031808257102966, "ref_logps/chosen": -27.431175231933594, "ref_logps/rejected": -37.54655838012695, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7723866701126099, "rewards/margins": 1.2770124673843384, "rewards/rejected": -2.0493993759155273, "step": 1154 }, { "epoch": 1.09, "grad_norm": 23.088726043701172, "learning_rate": 3.5362014690451203e-07, "logps/chosen": -56.150489807128906, "logps/rejected": -67.29031372070312, "loss": 0.5781, "losses/dpo": 0.9381304979324341, "losses/sft": 2.3748488426208496, "losses/total": 0.9381304979324341, "ref_logps/chosen": -42.44438171386719, "ref_logps/rejected": -46.41227340698242, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3706107139587402, "rewards/margins": 0.7171932458877563, "rewards/rejected": -2.087803840637207, "step": 1155 }, { "epoch": 1.09, "grad_norm": 14.592273712158203, "learning_rate": 3.534452605806226e-07, "logps/chosen": -43.50091552734375, "logps/rejected": -55.27341079711914, "loss": 0.3501, "losses/dpo": 0.4037971794605255, "losses/sft": 1.7588260173797607, "losses/total": 0.4037971794605255, "ref_logps/chosen": -33.55754470825195, "ref_logps/rejected": -34.06553649902344, "rewards/accuracies": 0.875, "rewards/chosen": -0.9943369030952454, "rewards/margins": 1.126450538635254, "rewards/rejected": -2.1207876205444336, "step": 1156 }, { "epoch": 1.09, "grad_norm": 14.295937538146973, "learning_rate": 3.532703742567331e-07, "logps/chosen": -46.77973556518555, "logps/rejected": -55.979454040527344, "loss": 0.3319, "losses/dpo": 0.25589895248413086, "losses/sft": 1.7189871072769165, "losses/total": 0.25589895248413086, "ref_logps/chosen": -37.207427978515625, "ref_logps/rejected": -33.743812561035156, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9572307467460632, "rewards/margins": 1.2663336992263794, "rewards/rejected": -2.223564386367798, "step": 1157 }, { "epoch": 1.09, "grad_norm": 19.403175354003906, "learning_rate": 3.530954879328437e-07, "logps/chosen": -38.16740417480469, "logps/rejected": -66.7232666015625, "loss": 0.41, "losses/dpo": 0.34852665662765503, "losses/sft": 1.9315986633300781, "losses/total": 0.34852665662765503, "ref_logps/chosen": -25.988985061645508, "ref_logps/rejected": -42.32234573364258, "rewards/accuracies": 0.875, "rewards/chosen": -1.2178417444229126, "rewards/margins": 1.22225022315979, "rewards/rejected": -2.440091848373413, "step": 1158 }, { "epoch": 1.09, "grad_norm": 22.430150985717773, "learning_rate": 3.5292060160895414e-07, "logps/chosen": -51.44413757324219, "logps/rejected": -69.00386047363281, "loss": 0.3955, "losses/dpo": 0.5232186913490295, "losses/sft": 1.9724373817443848, "losses/total": 0.5232186913490295, "ref_logps/chosen": -36.11259078979492, "ref_logps/rejected": -43.00016403198242, "rewards/accuracies": 0.875, "rewards/chosen": -1.533154845237732, "rewards/margins": 1.067214012145996, "rewards/rejected": -2.6003689765930176, "step": 1159 }, { "epoch": 1.1, "grad_norm": 22.362287521362305, "learning_rate": 3.527457152850647e-07, "logps/chosen": -55.34199905395508, "logps/rejected": -68.48287963867188, "loss": 0.454, "losses/dpo": 0.2427171915769577, "losses/sft": 1.4085420370101929, "losses/total": 0.2427171915769577, "ref_logps/chosen": -41.06510925292969, "ref_logps/rejected": -45.122982025146484, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4276885986328125, "rewards/margins": 0.9083008766174316, "rewards/rejected": -2.335989475250244, "step": 1160 }, { "epoch": 1.1, "grad_norm": 24.605642318725586, "learning_rate": 3.525708289611752e-07, "logps/chosen": -58.87159729003906, "logps/rejected": -63.40034484863281, "loss": 0.581, "losses/dpo": 0.33205026388168335, "losses/sft": 2.30277156829834, "losses/total": 0.33205026388168335, "ref_logps/chosen": -43.74916076660156, "ref_logps/rejected": -40.892364501953125, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5122437477111816, "rewards/margins": 0.7385542392730713, "rewards/rejected": -2.250798225402832, "step": 1161 }, { "epoch": 1.1, "grad_norm": 19.314685821533203, "learning_rate": 3.5239594263728573e-07, "logps/chosen": -42.61058807373047, "logps/rejected": -50.25887680053711, "loss": 0.4573, "losses/dpo": 0.7957479357719421, "losses/sft": 1.5312973260879517, "losses/total": 0.7957479357719421, "ref_logps/chosen": -36.30377960205078, "ref_logps/rejected": -33.12445831298828, "rewards/accuracies": 0.75, "rewards/chosen": -0.6306807994842529, "rewards/margins": 1.082761287689209, "rewards/rejected": -1.7134422063827515, "step": 1162 }, { "epoch": 1.1, "grad_norm": 24.99732208251953, "learning_rate": 3.522210563133963e-07, "logps/chosen": -53.78838348388672, "logps/rejected": -63.896514892578125, "loss": 0.544, "losses/dpo": 0.30477678775787354, "losses/sft": 1.4882583618164062, "losses/total": 0.30477678775787354, "ref_logps/chosen": -41.39977264404297, "ref_logps/rejected": -43.97334289550781, "rewards/accuracies": 0.6875, "rewards/chosen": -1.238861083984375, "rewards/margins": 0.7534563541412354, "rewards/rejected": -1.9923176765441895, "step": 1163 }, { "epoch": 1.1, "grad_norm": 19.90959930419922, "learning_rate": 3.520461699895068e-07, "logps/chosen": -50.8354606628418, "logps/rejected": -75.72209167480469, "loss": 0.3793, "losses/dpo": 0.5388731360435486, "losses/sft": 2.3876144886016846, "losses/total": 0.5388731360435486, "ref_logps/chosen": -38.40864562988281, "ref_logps/rejected": -49.724021911621094, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2426813840866089, "rewards/margins": 1.3571256399154663, "rewards/rejected": -2.599807024002075, "step": 1164 }, { "epoch": 1.1, "grad_norm": 19.020309448242188, "learning_rate": 3.5187128366561737e-07, "logps/chosen": -49.30963897705078, "logps/rejected": -77.75773620605469, "loss": 0.332, "losses/dpo": 0.39261364936828613, "losses/sft": 1.989349126815796, "losses/total": 0.39261364936828613, "ref_logps/chosen": -36.979759216308594, "ref_logps/rejected": -50.448089599609375, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2329885959625244, "rewards/margins": 1.497976303100586, "rewards/rejected": -2.7309646606445312, "step": 1165 }, { "epoch": 1.1, "grad_norm": 24.917984008789062, "learning_rate": 3.5169639734172783e-07, "logps/chosen": -51.5273323059082, "logps/rejected": -80.72734832763672, "loss": 0.5428, "losses/dpo": 0.2605714797973633, "losses/sft": 1.7832306623458862, "losses/total": 0.2605714797973633, "ref_logps/chosen": -35.4371337890625, "ref_logps/rejected": -55.13147735595703, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6090195178985596, "rewards/margins": 0.9505681991577148, "rewards/rejected": -2.5595874786376953, "step": 1166 }, { "epoch": 1.1, "grad_norm": 24.68726921081543, "learning_rate": 3.515215110178384e-07, "logps/chosen": -53.63042068481445, "logps/rejected": -64.61624145507812, "loss": 0.4266, "losses/dpo": 0.4296250343322754, "losses/sft": 2.0445499420166016, "losses/total": 0.4296250343322754, "ref_logps/chosen": -37.47551727294922, "ref_logps/rejected": -37.14946746826172, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6154899597167969, "rewards/margins": 1.1311869621276855, "rewards/rejected": -2.7466769218444824, "step": 1167 }, { "epoch": 1.1, "grad_norm": 24.4298095703125, "learning_rate": 3.513466246939489e-07, "logps/chosen": -58.022926330566406, "logps/rejected": -87.08427429199219, "loss": 0.4136, "losses/dpo": 0.5407834053039551, "losses/sft": 1.6361950635910034, "losses/total": 0.5407834053039551, "ref_logps/chosen": -41.00047302246094, "ref_logps/rejected": -55.60737991333008, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7022452354431152, "rewards/margins": 1.445443868637085, "rewards/rejected": -3.147688865661621, "step": 1168 }, { "epoch": 1.1, "grad_norm": 21.377750396728516, "learning_rate": 3.511717383700594e-07, "logps/chosen": -36.05640411376953, "logps/rejected": -51.381248474121094, "loss": 0.4771, "losses/dpo": 0.9195261001586914, "losses/sft": 2.144254684448242, "losses/total": 0.9195261001586914, "ref_logps/chosen": -27.85747528076172, "ref_logps/rejected": -35.07959747314453, "rewards/accuracies": 0.875, "rewards/chosen": -0.8198926448822021, "rewards/margins": 0.8102728128433228, "rewards/rejected": -1.6301653385162354, "step": 1169 }, { "epoch": 1.1, "grad_norm": 18.577001571655273, "learning_rate": 3.5099685204617e-07, "logps/chosen": -42.984107971191406, "logps/rejected": -57.425537109375, "loss": 0.427, "losses/dpo": 0.5384687185287476, "losses/sft": 1.9965769052505493, "losses/total": 0.5384687185287476, "ref_logps/chosen": -31.05020523071289, "ref_logps/rejected": -35.98308563232422, "rewards/accuracies": 0.75, "rewards/chosen": -1.1933903694152832, "rewards/margins": 0.9508547186851501, "rewards/rejected": -2.144245147705078, "step": 1170 }, { "epoch": 1.11, "grad_norm": 19.3896484375, "learning_rate": 3.508219657222805e-07, "logps/chosen": -55.75626754760742, "logps/rejected": -64.54580688476562, "loss": 0.4252, "losses/dpo": 0.1418294906616211, "losses/sft": 1.8937249183654785, "losses/total": 0.1418294906616211, "ref_logps/chosen": -43.520999908447266, "ref_logps/rejected": -42.44101333618164, "rewards/accuracies": 0.8125, "rewards/chosen": -1.223527193069458, "rewards/margins": 0.9869518280029297, "rewards/rejected": -2.2104790210723877, "step": 1171 }, { "epoch": 1.11, "grad_norm": 20.17457389831543, "learning_rate": 3.5064707939839107e-07, "logps/chosen": -47.316165924072266, "logps/rejected": -65.99116516113281, "loss": 0.4152, "losses/dpo": 0.4568825364112854, "losses/sft": 1.6591540575027466, "losses/total": 0.4568825364112854, "ref_logps/chosen": -35.279884338378906, "ref_logps/rejected": -43.724586486816406, "rewards/accuracies": 0.75, "rewards/chosen": -1.2036280632019043, "rewards/margins": 1.0230306386947632, "rewards/rejected": -2.226658821105957, "step": 1172 }, { "epoch": 1.11, "grad_norm": 17.68301010131836, "learning_rate": 3.504721930745015e-07, "logps/chosen": -50.62818145751953, "logps/rejected": -87.6301498413086, "loss": 0.2948, "losses/dpo": 0.3418300151824951, "losses/sft": 2.1226580142974854, "losses/total": 0.3418300151824951, "ref_logps/chosen": -36.131919860839844, "ref_logps/rejected": -54.819026947021484, "rewards/accuracies": 0.875, "rewards/chosen": -1.4496264457702637, "rewards/margins": 1.8314857482910156, "rewards/rejected": -3.2811119556427, "step": 1173 }, { "epoch": 1.11, "grad_norm": 27.2384090423584, "learning_rate": 3.502973067506121e-07, "logps/chosen": -53.364532470703125, "logps/rejected": -63.85072326660156, "loss": 0.6441, "losses/dpo": 0.8595701456069946, "losses/sft": 2.1518218517303467, "losses/total": 0.8595701456069946, "ref_logps/chosen": -38.69149398803711, "ref_logps/rejected": -44.97615051269531, "rewards/accuracies": 0.625, "rewards/chosen": -1.4673038721084595, "rewards/margins": 0.4201531410217285, "rewards/rejected": -1.887457013130188, "step": 1174 }, { "epoch": 1.11, "grad_norm": 22.04258155822754, "learning_rate": 3.5012242042672266e-07, "logps/chosen": -54.312103271484375, "logps/rejected": -67.18894958496094, "loss": 0.4164, "losses/dpo": 0.718026876449585, "losses/sft": 1.9256107807159424, "losses/total": 0.718026876449585, "ref_logps/chosen": -39.61787796020508, "ref_logps/rejected": -42.2315673828125, "rewards/accuracies": 0.75, "rewards/chosen": -1.4694225788116455, "rewards/margins": 1.0263155698776245, "rewards/rejected": -2.4957382678985596, "step": 1175 }, { "epoch": 1.11, "grad_norm": 17.261892318725586, "learning_rate": 3.499475341028331e-07, "logps/chosen": -38.29346466064453, "logps/rejected": -62.170143127441406, "loss": 0.3702, "losses/dpo": 0.1875734031200409, "losses/sft": 1.3133978843688965, "losses/total": 0.1875734031200409, "ref_logps/chosen": -29.878101348876953, "ref_logps/rejected": -39.332252502441406, "rewards/accuracies": 0.875, "rewards/chosen": -0.8415360450744629, "rewards/margins": 1.4422534704208374, "rewards/rejected": -2.28378963470459, "step": 1176 }, { "epoch": 1.11, "grad_norm": 21.05582046508789, "learning_rate": 3.497726477789437e-07, "logps/chosen": -44.98912048339844, "logps/rejected": -67.68132781982422, "loss": 0.4188, "losses/dpo": 0.8851693272590637, "losses/sft": 1.7933876514434814, "losses/total": 0.8851693272590637, "ref_logps/chosen": -36.263694763183594, "ref_logps/rejected": -42.989280700683594, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8725423216819763, "rewards/margins": 1.596662163734436, "rewards/rejected": -2.4692044258117676, "step": 1177 }, { "epoch": 1.11, "grad_norm": 32.253196716308594, "learning_rate": 3.495977614550542e-07, "logps/chosen": -58.107582092285156, "logps/rejected": -74.15830993652344, "loss": 0.6214, "losses/dpo": 0.49097663164138794, "losses/sft": 1.653022289276123, "losses/total": 0.49097663164138794, "ref_logps/chosen": -43.99420928955078, "ref_logps/rejected": -56.31801986694336, "rewards/accuracies": 0.625, "rewards/chosen": -1.41133713722229, "rewards/margins": 0.372691810131073, "rewards/rejected": -1.7840288877487183, "step": 1178 }, { "epoch": 1.11, "grad_norm": 26.46973419189453, "learning_rate": 3.4942287513116476e-07, "logps/chosen": -63.79710006713867, "logps/rejected": -69.51112365722656, "loss": 0.5685, "losses/dpo": 0.6211001873016357, "losses/sft": 1.787951111793518, "losses/total": 0.6211001873016357, "ref_logps/chosen": -49.32140350341797, "ref_logps/rejected": -46.48867416381836, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4475698471069336, "rewards/margins": 0.8546746373176575, "rewards/rejected": -2.3022444248199463, "step": 1179 }, { "epoch": 1.11, "grad_norm": 18.976913452148438, "learning_rate": 3.492479888072752e-07, "logps/chosen": -58.0233154296875, "logps/rejected": -75.26605224609375, "loss": 0.4405, "losses/dpo": 0.3172830641269684, "losses/sft": 1.5456390380859375, "losses/total": 0.3172830641269684, "ref_logps/chosen": -43.87802505493164, "ref_logps/rejected": -49.81204605102539, "rewards/accuracies": 0.75, "rewards/chosen": -1.4145286083221436, "rewards/margins": 1.130871295928955, "rewards/rejected": -2.5453999042510986, "step": 1180 }, { "epoch": 1.12, "grad_norm": 25.700510025024414, "learning_rate": 3.490731024833858e-07, "logps/chosen": -59.06761169433594, "logps/rejected": -68.7720947265625, "loss": 0.5264, "losses/dpo": 0.9217410683631897, "losses/sft": 1.8986196517944336, "losses/total": 0.9217410683631897, "ref_logps/chosen": -43.40380096435547, "ref_logps/rejected": -45.43873596191406, "rewards/accuracies": 0.75, "rewards/chosen": -1.5663809776306152, "rewards/margins": 0.7669545412063599, "rewards/rejected": -2.3333356380462646, "step": 1181 }, { "epoch": 1.12, "grad_norm": 22.932247161865234, "learning_rate": 3.4889821615949635e-07, "logps/chosen": -54.19947052001953, "logps/rejected": -69.83200073242188, "loss": 0.4193, "losses/dpo": 0.2667746841907501, "losses/sft": 1.887205958366394, "losses/total": 0.2667746841907501, "ref_logps/chosen": -40.734580993652344, "ref_logps/rejected": -46.85262680053711, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3464891910552979, "rewards/margins": 0.9514486789703369, "rewards/rejected": -2.2979378700256348, "step": 1182 }, { "epoch": 1.12, "grad_norm": 17.1081600189209, "learning_rate": 3.487233298356068e-07, "logps/chosen": -48.15372848510742, "logps/rejected": -75.6485595703125, "loss": 0.3246, "losses/dpo": 0.4840124845504761, "losses/sft": 1.9550937414169312, "losses/total": 0.4840124845504761, "ref_logps/chosen": -36.77861785888672, "ref_logps/rejected": -50.692771911621094, "rewards/accuracies": 0.875, "rewards/chosen": -1.1375110149383545, "rewards/margins": 1.3580682277679443, "rewards/rejected": -2.495579242706299, "step": 1183 }, { "epoch": 1.12, "grad_norm": 18.510454177856445, "learning_rate": 3.485484435117174e-07, "logps/chosen": -43.016136169433594, "logps/rejected": -58.0286865234375, "loss": 0.4385, "losses/dpo": 0.7479536533355713, "losses/sft": 2.54318904876709, "losses/total": 0.7479536533355713, "ref_logps/chosen": -31.054210662841797, "ref_logps/rejected": -35.893402099609375, "rewards/accuracies": 0.75, "rewards/chosen": -1.1961928606033325, "rewards/margins": 1.0173360109329224, "rewards/rejected": -2.213528871536255, "step": 1184 }, { "epoch": 1.12, "grad_norm": 17.244873046875, "learning_rate": 3.483735571878279e-07, "logps/chosen": -43.629615783691406, "logps/rejected": -54.11824035644531, "loss": 0.4856, "losses/dpo": 0.5764984488487244, "losses/sft": 1.954749584197998, "losses/total": 0.5764984488487244, "ref_logps/chosen": -31.12030601501465, "ref_logps/rejected": -32.72331619262695, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2509305477142334, "rewards/margins": 0.888561487197876, "rewards/rejected": -2.1394920349121094, "step": 1185 }, { "epoch": 1.12, "grad_norm": 28.519880294799805, "learning_rate": 3.4819867086393846e-07, "logps/chosen": -51.87816619873047, "logps/rejected": -62.20604705810547, "loss": 0.5812, "losses/dpo": 0.8336532711982727, "losses/sft": 1.679840087890625, "losses/total": 0.8336532711982727, "ref_logps/chosen": -37.35664367675781, "ref_logps/rejected": -42.5760498046875, "rewards/accuracies": 0.6875, "rewards/chosen": -1.452152967453003, "rewards/margins": 0.5108469724655151, "rewards/rejected": -1.962999939918518, "step": 1186 }, { "epoch": 1.12, "grad_norm": 21.679903030395508, "learning_rate": 3.480237845400489e-07, "logps/chosen": -49.95682144165039, "logps/rejected": -61.141510009765625, "loss": 0.4159, "losses/dpo": 0.5916539430618286, "losses/sft": 1.7467758655548096, "losses/total": 0.5916539430618286, "ref_logps/chosen": -37.57395553588867, "ref_logps/rejected": -37.24937057495117, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2382864952087402, "rewards/margins": 1.1509270668029785, "rewards/rejected": -2.3892135620117188, "step": 1187 }, { "epoch": 1.12, "grad_norm": 24.70008659362793, "learning_rate": 3.478488982161595e-07, "logps/chosen": -61.8380241394043, "logps/rejected": -66.40875244140625, "loss": 0.4498, "losses/dpo": 0.3130904734134674, "losses/sft": 1.7137500047683716, "losses/total": 0.3130904734134674, "ref_logps/chosen": -47.12581253051758, "ref_logps/rejected": -42.88390350341797, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4712209701538086, "rewards/margins": 0.8812636137008667, "rewards/rejected": -2.352484703063965, "step": 1188 }, { "epoch": 1.12, "grad_norm": 20.829978942871094, "learning_rate": 3.4767401189227005e-07, "logps/chosen": -39.790435791015625, "logps/rejected": -62.06210708618164, "loss": 0.4435, "losses/dpo": 0.6889925599098206, "losses/sft": 1.716752290725708, "losses/total": 0.6889925599098206, "ref_logps/chosen": -28.73815155029297, "ref_logps/rejected": -40.22623825073242, "rewards/accuracies": 0.875, "rewards/chosen": -1.1052287817001343, "rewards/margins": 1.0783581733703613, "rewards/rejected": -2.183587074279785, "step": 1189 }, { "epoch": 1.12, "grad_norm": 21.44162940979004, "learning_rate": 3.474991255683805e-07, "logps/chosen": -56.900238037109375, "logps/rejected": -64.34751892089844, "loss": 0.4279, "losses/dpo": 0.6053601503372192, "losses/sft": 1.8646621704101562, "losses/total": 0.6053601503372192, "ref_logps/chosen": -45.60890197753906, "ref_logps/rejected": -44.466705322265625, "rewards/accuracies": 0.875, "rewards/chosen": -1.129133701324463, "rewards/margins": 0.8589475154876709, "rewards/rejected": -1.9880813360214233, "step": 1190 }, { "epoch": 1.12, "grad_norm": 16.19450569152832, "learning_rate": 3.4732423924449107e-07, "logps/chosen": -39.03544616699219, "logps/rejected": -61.84416961669922, "loss": 0.3648, "losses/dpo": 0.2910487651824951, "losses/sft": 1.8084921836853027, "losses/total": 0.2910487651824951, "ref_logps/chosen": -29.14715576171875, "ref_logps/rejected": -40.108909606933594, "rewards/accuracies": 0.875, "rewards/chosen": -0.9888288974761963, "rewards/margins": 1.184696912765503, "rewards/rejected": -2.173525810241699, "step": 1191 }, { "epoch": 1.13, "grad_norm": 24.440706253051758, "learning_rate": 3.471493529206016e-07, "logps/chosen": -63.605384826660156, "logps/rejected": -75.43583679199219, "loss": 0.4601, "losses/dpo": 0.8292043209075928, "losses/sft": 1.2350585460662842, "losses/total": 0.8292043209075928, "ref_logps/chosen": -48.958065032958984, "ref_logps/rejected": -46.86841583251953, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4647314548492432, "rewards/margins": 1.39201021194458, "rewards/rejected": -2.8567416667938232, "step": 1192 }, { "epoch": 1.13, "grad_norm": 27.790037155151367, "learning_rate": 3.4697446659671215e-07, "logps/chosen": -42.98646545410156, "logps/rejected": -58.1151123046875, "loss": 0.6586, "losses/dpo": 0.873833417892456, "losses/sft": 2.1007161140441895, "losses/total": 0.873833417892456, "ref_logps/chosen": -29.699018478393555, "ref_logps/rejected": -37.354427337646484, "rewards/accuracies": 0.5, "rewards/chosen": -1.328744888305664, "rewards/margins": 0.7473236322402954, "rewards/rejected": -2.07606840133667, "step": 1193 }, { "epoch": 1.13, "grad_norm": 14.21298885345459, "learning_rate": 3.467995802728226e-07, "logps/chosen": -43.702049255371094, "logps/rejected": -59.763389587402344, "loss": 0.3356, "losses/dpo": 0.15674805641174316, "losses/sft": 1.9462584257125854, "losses/total": 0.15674805641174316, "ref_logps/chosen": -31.779237747192383, "ref_logps/rejected": -35.758663177490234, "rewards/accuracies": 1.0, "rewards/chosen": -1.1922811269760132, "rewards/margins": 1.2081913948059082, "rewards/rejected": -2.400472640991211, "step": 1194 }, { "epoch": 1.13, "grad_norm": 20.321813583374023, "learning_rate": 3.466246939489332e-07, "logps/chosen": -63.832942962646484, "logps/rejected": -88.71019744873047, "loss": 0.3626, "losses/dpo": 0.8511580228805542, "losses/sft": 1.8943411111831665, "losses/total": 0.8511580228805542, "ref_logps/chosen": -47.3338623046875, "ref_logps/rejected": -58.8153076171875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6499080657958984, "rewards/margins": 1.339580774307251, "rewards/rejected": -2.9894888401031494, "step": 1195 }, { "epoch": 1.13, "grad_norm": 25.849332809448242, "learning_rate": 3.4644980762504374e-07, "logps/chosen": -64.0870361328125, "logps/rejected": -87.64054870605469, "loss": 0.3906, "losses/dpo": 0.3396153748035431, "losses/sft": 2.0073556900024414, "losses/total": 0.3396153748035431, "ref_logps/chosen": -50.96879577636719, "ref_logps/rejected": -56.45702362060547, "rewards/accuracies": 0.75, "rewards/chosen": -1.3118246793746948, "rewards/margins": 1.8065276145935059, "rewards/rejected": -3.118352174758911, "step": 1196 }, { "epoch": 1.13, "grad_norm": 21.948686599731445, "learning_rate": 3.462749213011542e-07, "logps/chosen": -55.41584014892578, "logps/rejected": -85.0135498046875, "loss": 0.4285, "losses/dpo": 0.3888290524482727, "losses/sft": 1.7734923362731934, "losses/total": 0.3888290524482727, "ref_logps/chosen": -42.59334945678711, "ref_logps/rejected": -61.39726638793945, "rewards/accuracies": 0.75, "rewards/chosen": -1.2822489738464355, "rewards/margins": 1.0793800354003906, "rewards/rejected": -2.361629009246826, "step": 1197 }, { "epoch": 1.13, "grad_norm": 21.668668746948242, "learning_rate": 3.4610003497726477e-07, "logps/chosen": -47.953975677490234, "logps/rejected": -72.549072265625, "loss": 0.3893, "losses/dpo": 0.31027325987815857, "losses/sft": 2.0131776332855225, "losses/total": 0.31027325987815857, "ref_logps/chosen": -33.96575927734375, "ref_logps/rejected": -43.709205627441406, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3988215923309326, "rewards/margins": 1.4851644039154053, "rewards/rejected": -2.883985996246338, "step": 1198 }, { "epoch": 1.13, "grad_norm": 20.344911575317383, "learning_rate": 3.459251486533753e-07, "logps/chosen": -43.740814208984375, "logps/rejected": -76.95494079589844, "loss": 0.3314, "losses/dpo": 0.11246241629123688, "losses/sft": 1.1133618354797363, "losses/total": 0.11246241629123688, "ref_logps/chosen": -34.54724884033203, "ref_logps/rejected": -52.08617401123047, "rewards/accuracies": 0.875, "rewards/chosen": -0.919356107711792, "rewards/margins": 1.5675208568572998, "rewards/rejected": -2.486876964569092, "step": 1199 }, { "epoch": 1.13, "grad_norm": 20.741167068481445, "learning_rate": 3.4575026232948584e-07, "logps/chosen": -39.625831604003906, "logps/rejected": -62.57728576660156, "loss": 0.3801, "losses/dpo": 0.5188046097755432, "losses/sft": 1.4814120531082153, "losses/total": 0.5188046097755432, "ref_logps/chosen": -29.104610443115234, "ref_logps/rejected": -39.900455474853516, "rewards/accuracies": 0.875, "rewards/chosen": -1.0521219968795776, "rewards/margins": 1.2155613899230957, "rewards/rejected": -2.267683267593384, "step": 1200 }, { "epoch": 1.13, "grad_norm": 19.507539749145508, "learning_rate": 3.4557537600559636e-07, "logps/chosen": -48.30961227416992, "logps/rejected": -72.48915100097656, "loss": 0.4491, "losses/dpo": 0.6221827268600464, "losses/sft": 2.064242124557495, "losses/total": 0.6221827268600464, "ref_logps/chosen": -35.892147064208984, "ref_logps/rejected": -43.38697814941406, "rewards/accuracies": 0.75, "rewards/chosen": -1.2417469024658203, "rewards/margins": 1.6684703826904297, "rewards/rejected": -2.910217046737671, "step": 1201 }, { "epoch": 1.14, "grad_norm": 16.92333221435547, "learning_rate": 3.4540048968170687e-07, "logps/chosen": -35.59849548339844, "logps/rejected": -68.27458190917969, "loss": 0.2701, "losses/dpo": 0.5904721021652222, "losses/sft": 2.1532437801361084, "losses/total": 0.5904721021652222, "ref_logps/chosen": -28.967607498168945, "ref_logps/rejected": -40.41972351074219, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6630889773368835, "rewards/margins": 2.1223959922790527, "rewards/rejected": -2.78548526763916, "step": 1202 }, { "epoch": 1.14, "grad_norm": 17.5150146484375, "learning_rate": 3.4522560335781743e-07, "logps/chosen": -49.920013427734375, "logps/rejected": -69.733154296875, "loss": 0.2983, "losses/dpo": 0.41659703850746155, "losses/sft": 1.6118489503860474, "losses/total": 0.41659703850746155, "ref_logps/chosen": -41.028804779052734, "ref_logps/rejected": -44.3741455078125, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8891209363937378, "rewards/margins": 1.6467803716659546, "rewards/rejected": -2.5359013080596924, "step": 1203 }, { "epoch": 1.14, "grad_norm": 22.143356323242188, "learning_rate": 3.450507170339279e-07, "logps/chosen": -44.29490280151367, "logps/rejected": -54.45267105102539, "loss": 0.4696, "losses/dpo": 0.5185851454734802, "losses/sft": 1.8470827341079712, "losses/total": 0.5185851454734802, "ref_logps/chosen": -32.514862060546875, "ref_logps/rejected": -35.87690734863281, "rewards/accuracies": 0.75, "rewards/chosen": -1.1780040264129639, "rewards/margins": 0.6795724630355835, "rewards/rejected": -1.8575763702392578, "step": 1204 }, { "epoch": 1.14, "grad_norm": 26.703330993652344, "learning_rate": 3.4487583071003846e-07, "logps/chosen": -66.18683624267578, "logps/rejected": -79.56542205810547, "loss": 0.4226, "losses/dpo": 0.45258915424346924, "losses/sft": 1.4578217267990112, "losses/total": 0.45258915424346924, "ref_logps/chosen": -52.280574798583984, "ref_logps/rejected": -53.096160888671875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3906259536743164, "rewards/margins": 1.2563002109527588, "rewards/rejected": -2.646925926208496, "step": 1205 }, { "epoch": 1.14, "grad_norm": 22.803064346313477, "learning_rate": 3.4470094438614897e-07, "logps/chosen": -54.265052795410156, "logps/rejected": -62.260963439941406, "loss": 0.3688, "losses/dpo": 0.17984730005264282, "losses/sft": 1.8301756381988525, "losses/total": 0.17984730005264282, "ref_logps/chosen": -44.24738311767578, "ref_logps/rejected": -39.08616256713867, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0017666816711426, "rewards/margins": 1.3157135248184204, "rewards/rejected": -2.3174803256988525, "step": 1206 }, { "epoch": 1.14, "grad_norm": 24.572607040405273, "learning_rate": 3.4452605806225954e-07, "logps/chosen": -43.329105377197266, "logps/rejected": -63.29738998413086, "loss": 0.5151, "losses/dpo": 1.060744285583496, "losses/sft": 2.0242159366607666, "losses/total": 1.060744285583496, "ref_logps/chosen": -30.796119689941406, "ref_logps/rejected": -39.337188720703125, "rewards/accuracies": 0.75, "rewards/chosen": -1.2532985210418701, "rewards/margins": 1.1427216529846191, "rewards/rejected": -2.3960201740264893, "step": 1207 }, { "epoch": 1.14, "grad_norm": 19.429122924804688, "learning_rate": 3.4435117173837005e-07, "logps/chosen": -52.67515563964844, "logps/rejected": -56.843177795410156, "loss": 0.4331, "losses/dpo": 0.380759060382843, "losses/sft": 1.5727355480194092, "losses/total": 0.380759060382843, "ref_logps/chosen": -43.468894958496094, "ref_logps/rejected": -35.63218688964844, "rewards/accuracies": 0.75, "rewards/chosen": -0.920626163482666, "rewards/margins": 1.200473427772522, "rewards/rejected": -2.1210994720458984, "step": 1208 }, { "epoch": 1.14, "grad_norm": 18.721139907836914, "learning_rate": 3.4417628541448056e-07, "logps/chosen": -43.63262176513672, "logps/rejected": -62.44777297973633, "loss": 0.3952, "losses/dpo": 0.3956284523010254, "losses/sft": 1.261940598487854, "losses/total": 0.3956284523010254, "ref_logps/chosen": -32.311004638671875, "ref_logps/rejected": -41.452457427978516, "rewards/accuracies": 0.875, "rewards/chosen": -1.1321617364883423, "rewards/margins": 0.9673698544502258, "rewards/rejected": -2.099531650543213, "step": 1209 }, { "epoch": 1.14, "grad_norm": 20.891883850097656, "learning_rate": 3.4400139909059113e-07, "logps/chosen": -36.589542388916016, "logps/rejected": -52.03498077392578, "loss": 0.5049, "losses/dpo": 0.7367061376571655, "losses/sft": 1.6468583345413208, "losses/total": 0.7367061376571655, "ref_logps/chosen": -24.960180282592773, "ref_logps/rejected": -32.505863189697266, "rewards/accuracies": 0.75, "rewards/chosen": -1.1629362106323242, "rewards/margins": 0.7899754047393799, "rewards/rejected": -1.952911615371704, "step": 1210 }, { "epoch": 1.14, "grad_norm": 30.904935836791992, "learning_rate": 3.438265127667016e-07, "logps/chosen": -58.95918273925781, "logps/rejected": -67.94422149658203, "loss": 0.584, "losses/dpo": 0.3868604302406311, "losses/sft": 1.5404444932937622, "losses/total": 0.3868604302406311, "ref_logps/chosen": -47.20154571533203, "ref_logps/rejected": -49.71845245361328, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1757640838623047, "rewards/margins": 0.646812915802002, "rewards/rejected": -1.8225769996643066, "step": 1211 }, { "epoch": 1.14, "grad_norm": 23.595455169677734, "learning_rate": 3.4365162644281215e-07, "logps/chosen": -42.342552185058594, "logps/rejected": -61.444252014160156, "loss": 0.4705, "losses/dpo": 0.2707156538963318, "losses/sft": 1.5560718774795532, "losses/total": 0.2707156538963318, "ref_logps/chosen": -32.79653549194336, "ref_logps/rejected": -41.464778900146484, "rewards/accuracies": 0.75, "rewards/chosen": -0.9546019434928894, "rewards/margins": 1.0433454513549805, "rewards/rejected": -1.9979474544525146, "step": 1212 }, { "epoch": 1.15, "grad_norm": 27.435976028442383, "learning_rate": 3.4347674011892267e-07, "logps/chosen": -47.908782958984375, "logps/rejected": -46.43934631347656, "loss": 0.6044, "losses/dpo": 0.545130729675293, "losses/sft": 1.9446781873703003, "losses/total": 0.545130729675293, "ref_logps/chosen": -35.36150360107422, "ref_logps/rejected": -30.26004981994629, "rewards/accuracies": 0.6875, "rewards/chosen": -1.254727840423584, "rewards/margins": 0.36320173740386963, "rewards/rejected": -1.6179295778274536, "step": 1213 }, { "epoch": 1.15, "grad_norm": 25.55695915222168, "learning_rate": 3.4330185379503323e-07, "logps/chosen": -60.84926986694336, "logps/rejected": -63.439491271972656, "loss": 0.5527, "losses/dpo": 0.9513847827911377, "losses/sft": 1.7977392673492432, "losses/total": 0.9513847827911377, "ref_logps/chosen": -48.891082763671875, "ref_logps/rejected": -44.028541564941406, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1958189010620117, "rewards/margins": 0.7452763319015503, "rewards/rejected": -1.9410953521728516, "step": 1214 }, { "epoch": 1.15, "grad_norm": 21.73856544494629, "learning_rate": 3.4312696747114375e-07, "logps/chosen": -62.00055694580078, "logps/rejected": -74.59437561035156, "loss": 0.3918, "losses/dpo": 0.24058350920677185, "losses/sft": 1.6256749629974365, "losses/total": 0.24058350920677185, "ref_logps/chosen": -48.43283462524414, "ref_logps/rejected": -49.34600067138672, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3567726612091064, "rewards/margins": 1.168065071105957, "rewards/rejected": -2.5248374938964844, "step": 1215 }, { "epoch": 1.15, "grad_norm": 24.474241256713867, "learning_rate": 3.4295208114725426e-07, "logps/chosen": -50.98002624511719, "logps/rejected": -68.3804702758789, "loss": 0.4446, "losses/dpo": 0.1954561471939087, "losses/sft": 1.62507963180542, "losses/total": 0.1954561471939087, "ref_logps/chosen": -37.444149017333984, "ref_logps/rejected": -43.96714782714844, "rewards/accuracies": 0.75, "rewards/chosen": -1.3535881042480469, "rewards/margins": 1.0877442359924316, "rewards/rejected": -2.4413321018218994, "step": 1216 }, { "epoch": 1.15, "grad_norm": 19.39516258239746, "learning_rate": 3.427771948233648e-07, "logps/chosen": -49.55461120605469, "logps/rejected": -75.17345428466797, "loss": 0.3015, "losses/dpo": 0.3323233425617218, "losses/sft": 2.0071802139282227, "losses/total": 0.3323233425617218, "ref_logps/chosen": -40.910945892333984, "ref_logps/rejected": -53.50979995727539, "rewards/accuracies": 1.0, "rewards/chosen": -0.8643665313720703, "rewards/margins": 1.3019989728927612, "rewards/rejected": -2.166365623474121, "step": 1217 }, { "epoch": 1.15, "grad_norm": 22.340932846069336, "learning_rate": 3.426023084994753e-07, "logps/chosen": -50.58102798461914, "logps/rejected": -67.12346649169922, "loss": 0.4652, "losses/dpo": 0.7647037506103516, "losses/sft": 1.7319880723953247, "losses/total": 0.7647037506103516, "ref_logps/chosen": -38.5652961730957, "ref_logps/rejected": -46.59025573730469, "rewards/accuracies": 0.8125, "rewards/chosen": -1.201573371887207, "rewards/margins": 0.8517480492591858, "rewards/rejected": -2.053321361541748, "step": 1218 }, { "epoch": 1.15, "grad_norm": 25.574398040771484, "learning_rate": 3.4242742217558585e-07, "logps/chosen": -48.58213806152344, "logps/rejected": -71.12791442871094, "loss": 0.4387, "losses/dpo": 0.19296759366989136, "losses/sft": 1.4935076236724854, "losses/total": 0.19296759366989136, "ref_logps/chosen": -35.044158935546875, "ref_logps/rejected": -46.30248260498047, "rewards/accuracies": 0.75, "rewards/chosen": -1.3537980318069458, "rewards/margins": 1.1287453174591064, "rewards/rejected": -2.482543468475342, "step": 1219 }, { "epoch": 1.15, "grad_norm": 24.586471557617188, "learning_rate": 3.422525358516964e-07, "logps/chosen": -48.48291015625, "logps/rejected": -77.8056640625, "loss": 0.3786, "losses/dpo": 0.10799476504325867, "losses/sft": 1.7551960945129395, "losses/total": 0.10799476504325867, "ref_logps/chosen": -39.020484924316406, "ref_logps/rejected": -53.53319549560547, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9462423324584961, "rewards/margins": 1.4810049533843994, "rewards/rejected": -2.4272472858428955, "step": 1220 }, { "epoch": 1.15, "grad_norm": 22.86333465576172, "learning_rate": 3.4207764952780693e-07, "logps/chosen": -53.229713439941406, "logps/rejected": -67.08908081054688, "loss": 0.4879, "losses/dpo": 0.41523340344429016, "losses/sft": 2.0493810176849365, "losses/total": 0.41523340344429016, "ref_logps/chosen": -41.48245620727539, "ref_logps/rejected": -41.98067855834961, "rewards/accuracies": 0.75, "rewards/chosen": -1.174725890159607, "rewards/margins": 1.3361148834228516, "rewards/rejected": -2.510840654373169, "step": 1221 }, { "epoch": 1.15, "grad_norm": 20.58643913269043, "learning_rate": 3.4190276320391744e-07, "logps/chosen": -41.96229553222656, "logps/rejected": -65.61783599853516, "loss": 0.3209, "losses/dpo": 0.22559937834739685, "losses/sft": 1.1402311325073242, "losses/total": 0.22559937834739685, "ref_logps/chosen": -33.453243255615234, "ref_logps/rejected": -42.23716735839844, "rewards/accuracies": 0.875, "rewards/chosen": -0.850905179977417, "rewards/margins": 1.48716139793396, "rewards/rejected": -2.338066339492798, "step": 1222 }, { "epoch": 1.15, "grad_norm": 27.87337875366211, "learning_rate": 3.4172787688002795e-07, "logps/chosen": -48.875816345214844, "logps/rejected": -60.657806396484375, "loss": 0.5726, "losses/dpo": 0.2870476245880127, "losses/sft": 1.8599473237991333, "losses/total": 0.2870476245880127, "ref_logps/chosen": -36.14516067504883, "ref_logps/rejected": -42.29712677001953, "rewards/accuracies": 0.625, "rewards/chosen": -1.2730658054351807, "rewards/margins": 0.5630024671554565, "rewards/rejected": -1.8360682725906372, "step": 1223 }, { "epoch": 1.16, "grad_norm": 26.35036277770996, "learning_rate": 3.415529905561385e-07, "logps/chosen": -56.7800178527832, "logps/rejected": -77.70372009277344, "loss": 0.6803, "losses/dpo": 0.25999119877815247, "losses/sft": 1.5244412422180176, "losses/total": 0.25999119877815247, "ref_logps/chosen": -37.98125457763672, "ref_logps/rejected": -49.28694152832031, "rewards/accuracies": 0.75, "rewards/chosen": -1.8798760175704956, "rewards/margins": 0.9618018269538879, "rewards/rejected": -2.8416779041290283, "step": 1224 }, { "epoch": 1.16, "grad_norm": 24.638765335083008, "learning_rate": 3.41378104232249e-07, "logps/chosen": -53.67375946044922, "logps/rejected": -63.890159606933594, "loss": 0.5792, "losses/dpo": 0.45446139574050903, "losses/sft": 1.8416240215301514, "losses/total": 0.45446139574050903, "ref_logps/chosen": -35.34421157836914, "ref_logps/rejected": -41.183658599853516, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8329551219940186, "rewards/margins": 0.4376949965953827, "rewards/rejected": -2.2706503868103027, "step": 1225 }, { "epoch": 1.16, "grad_norm": 17.07170867919922, "learning_rate": 3.4120321790835954e-07, "logps/chosen": -44.629207611083984, "logps/rejected": -64.49826049804688, "loss": 0.3347, "losses/dpo": 0.41418763995170593, "losses/sft": 1.9355415105819702, "losses/total": 0.41418763995170593, "ref_logps/chosen": -35.71815490722656, "ref_logps/rejected": -42.28117370605469, "rewards/accuracies": 0.875, "rewards/chosen": -0.891105055809021, "rewards/margins": 1.3306039571762085, "rewards/rejected": -2.2217092514038086, "step": 1226 }, { "epoch": 1.16, "grad_norm": 21.43756866455078, "learning_rate": 3.410283315844701e-07, "logps/chosen": -49.661277770996094, "logps/rejected": -54.75538635253906, "loss": 0.4956, "losses/dpo": 0.5396450757980347, "losses/sft": 1.9451037645339966, "losses/total": 0.5396450757980347, "ref_logps/chosen": -39.7839469909668, "ref_logps/rejected": -37.6505241394043, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9877331852912903, "rewards/margins": 0.7227531671524048, "rewards/rejected": -1.7104862928390503, "step": 1227 }, { "epoch": 1.16, "grad_norm": 18.683822631835938, "learning_rate": 3.408534452605806e-07, "logps/chosen": -43.6932487487793, "logps/rejected": -66.22261047363281, "loss": 0.3186, "losses/dpo": 0.4445999264717102, "losses/sft": 1.171536922454834, "losses/total": 0.4445999264717102, "ref_logps/chosen": -35.19602966308594, "ref_logps/rejected": -43.317169189453125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8497219085693359, "rewards/margins": 1.4408226013183594, "rewards/rejected": -2.290544271469116, "step": 1228 }, { "epoch": 1.16, "grad_norm": 25.496213912963867, "learning_rate": 3.4067855893669113e-07, "logps/chosen": -55.74542999267578, "logps/rejected": -69.71067810058594, "loss": 0.4917, "losses/dpo": 0.5883376598358154, "losses/sft": 1.744455099105835, "losses/total": 0.5883376598358154, "ref_logps/chosen": -43.384056091308594, "ref_logps/rejected": -48.68836975097656, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2361371517181396, "rewards/margins": 0.8660933971405029, "rewards/rejected": -2.1022305488586426, "step": 1229 }, { "epoch": 1.16, "grad_norm": 23.95579719543457, "learning_rate": 3.4050367261280165e-07, "logps/chosen": -42.41576385498047, "logps/rejected": -55.842376708984375, "loss": 0.4732, "losses/dpo": 0.5958298444747925, "losses/sft": 1.346472144126892, "losses/total": 0.5958298444747925, "ref_logps/chosen": -34.13622283935547, "ref_logps/rejected": -39.37353515625, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8279539942741394, "rewards/margins": 0.8189303278923035, "rewards/rejected": -1.6468843221664429, "step": 1230 }, { "epoch": 1.16, "grad_norm": 23.343685150146484, "learning_rate": 3.403287862889122e-07, "logps/chosen": -55.46833038330078, "logps/rejected": -67.71742248535156, "loss": 0.4914, "losses/dpo": 0.282474547624588, "losses/sft": 2.198577642440796, "losses/total": 0.282474547624588, "ref_logps/chosen": -44.964542388916016, "ref_logps/rejected": -47.52948760986328, "rewards/accuracies": 0.75, "rewards/chosen": -1.0503791570663452, "rewards/margins": 0.9684140086174011, "rewards/rejected": -2.0187931060791016, "step": 1231 }, { "epoch": 1.16, "grad_norm": 23.989164352416992, "learning_rate": 3.4015389996502267e-07, "logps/chosen": -57.11771011352539, "logps/rejected": -66.72931671142578, "loss": 0.4929, "losses/dpo": 0.4271523058414459, "losses/sft": 2.0742557048797607, "losses/total": 0.4271523058414459, "ref_logps/chosen": -44.4672737121582, "ref_logps/rejected": -45.18669509887695, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2650437355041504, "rewards/margins": 0.8892186880111694, "rewards/rejected": -2.1542623043060303, "step": 1232 }, { "epoch": 1.16, "grad_norm": 16.49856948852539, "learning_rate": 3.3997901364113324e-07, "logps/chosen": -52.76203918457031, "logps/rejected": -64.7552490234375, "loss": 0.3319, "losses/dpo": 0.1526704579591751, "losses/sft": 1.746389389038086, "losses/total": 0.1526704579591751, "ref_logps/chosen": -43.64642333984375, "ref_logps/rejected": -42.858543395996094, "rewards/accuracies": 0.875, "rewards/chosen": -0.911561906337738, "rewards/margins": 1.2781083583831787, "rewards/rejected": -2.1896703243255615, "step": 1233 }, { "epoch": 1.17, "grad_norm": 26.359237670898438, "learning_rate": 3.398041273172438e-07, "logps/chosen": -42.836246490478516, "logps/rejected": -44.02879333496094, "loss": 0.596, "losses/dpo": 0.45190829038619995, "losses/sft": 1.4974873065948486, "losses/total": 0.45190829038619995, "ref_logps/chosen": -31.9793701171875, "ref_logps/rejected": -28.331953048706055, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0856877565383911, "rewards/margins": 0.4839964807033539, "rewards/rejected": -1.5696842670440674, "step": 1234 }, { "epoch": 1.17, "grad_norm": 22.135757446289062, "learning_rate": 3.396292409933543e-07, "logps/chosen": -46.788543701171875, "logps/rejected": -75.48374938964844, "loss": 0.3362, "losses/dpo": 0.43912792205810547, "losses/sft": 1.712257742881775, "losses/total": 0.43912792205810547, "ref_logps/chosen": -37.78060531616211, "ref_logps/rejected": -51.97429656982422, "rewards/accuracies": 0.875, "rewards/chosen": -0.900794267654419, "rewards/margins": 1.450150728225708, "rewards/rejected": -2.350944995880127, "step": 1235 }, { "epoch": 1.17, "grad_norm": 21.167810440063477, "learning_rate": 3.3945435466946483e-07, "logps/chosen": -62.74198532104492, "logps/rejected": -78.19017028808594, "loss": 0.3671, "losses/dpo": 0.26451680064201355, "losses/sft": 1.9652423858642578, "losses/total": 0.26451680064201355, "ref_logps/chosen": -46.7498779296875, "ref_logps/rejected": -49.8843994140625, "rewards/accuracies": 0.875, "rewards/chosen": -1.599210500717163, "rewards/margins": 1.2313660383224487, "rewards/rejected": -2.8305766582489014, "step": 1236 }, { "epoch": 1.17, "grad_norm": 24.189664840698242, "learning_rate": 3.3927946834557534e-07, "logps/chosen": -61.61280059814453, "logps/rejected": -63.16066360473633, "loss": 0.52, "losses/dpo": 1.0029711723327637, "losses/sft": 1.8602609634399414, "losses/total": 1.0029711723327637, "ref_logps/chosen": -49.45880126953125, "ref_logps/rejected": -41.41373062133789, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2154000997543335, "rewards/margins": 0.9592931866645813, "rewards/rejected": -2.1746931076049805, "step": 1237 }, { "epoch": 1.17, "grad_norm": 18.63071632385254, "learning_rate": 3.391045820216859e-07, "logps/chosen": -51.032657623291016, "logps/rejected": -67.61209106445312, "loss": 0.4245, "losses/dpo": 0.31612345576286316, "losses/sft": 1.7192654609680176, "losses/total": 0.31612345576286316, "ref_logps/chosen": -39.01557159423828, "ref_logps/rejected": -44.32916259765625, "rewards/accuracies": 0.75, "rewards/chosen": -1.2017087936401367, "rewards/margins": 1.1265840530395508, "rewards/rejected": -2.3282928466796875, "step": 1238 }, { "epoch": 1.17, "grad_norm": 17.873682022094727, "learning_rate": 3.3892969569779637e-07, "logps/chosen": -51.54199981689453, "logps/rejected": -69.31196594238281, "loss": 0.3382, "losses/dpo": 0.16271719336509705, "losses/sft": 1.8073300123214722, "losses/total": 0.16271719336509705, "ref_logps/chosen": -41.933921813964844, "ref_logps/rejected": -44.349037170410156, "rewards/accuracies": 0.875, "rewards/chosen": -0.9608081579208374, "rewards/margins": 1.535484790802002, "rewards/rejected": -2.49629282951355, "step": 1239 }, { "epoch": 1.17, "grad_norm": 25.454572677612305, "learning_rate": 3.3875480937390693e-07, "logps/chosen": -46.22962951660156, "logps/rejected": -56.698699951171875, "loss": 0.544, "losses/dpo": 0.482551634311676, "losses/sft": 1.4616835117340088, "losses/total": 0.482551634311676, "ref_logps/chosen": -36.28697967529297, "ref_logps/rejected": -39.95439910888672, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9942651987075806, "rewards/margins": 0.6801650524139404, "rewards/rejected": -1.674430251121521, "step": 1240 }, { "epoch": 1.17, "grad_norm": 15.92263126373291, "learning_rate": 3.385799230500175e-07, "logps/chosen": -41.26055145263672, "logps/rejected": -71.25456237792969, "loss": 0.2884, "losses/dpo": 0.3872529864311218, "losses/sft": 2.4158272743225098, "losses/total": 0.3872529864311218, "ref_logps/chosen": -32.057342529296875, "ref_logps/rejected": -44.658355712890625, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9203210473060608, "rewards/margins": 1.7392998933792114, "rewards/rejected": -2.659621000289917, "step": 1241 }, { "epoch": 1.17, "grad_norm": 18.061687469482422, "learning_rate": 3.38405036726128e-07, "logps/chosen": -41.618431091308594, "logps/rejected": -57.597293853759766, "loss": 0.4316, "losses/dpo": 0.7112546563148499, "losses/sft": 1.568984031677246, "losses/total": 0.7112546563148499, "ref_logps/chosen": -33.12033462524414, "ref_logps/rejected": -38.881324768066406, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8498092889785767, "rewards/margins": 1.0217880010604858, "rewards/rejected": -1.871597409248352, "step": 1242 }, { "epoch": 1.17, "grad_norm": 33.961753845214844, "learning_rate": 3.382301504022385e-07, "logps/chosen": -58.7257080078125, "logps/rejected": -63.61299133300781, "loss": 0.6259, "losses/dpo": 0.8247978687286377, "losses/sft": 1.817539930343628, "losses/total": 0.8247978687286377, "ref_logps/chosen": -45.45909881591797, "ref_logps/rejected": -42.36334228515625, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3266608715057373, "rewards/margins": 0.7983040809631348, "rewards/rejected": -2.124964952468872, "step": 1243 }, { "epoch": 1.17, "grad_norm": 19.390769958496094, "learning_rate": 3.3805526407834904e-07, "logps/chosen": -45.50224304199219, "logps/rejected": -69.91033172607422, "loss": 0.4912, "losses/dpo": 0.5417351126670837, "losses/sft": 1.3218685388565063, "losses/total": 0.5417351126670837, "ref_logps/chosen": -35.31776428222656, "ref_logps/rejected": -50.800045013427734, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0184478759765625, "rewards/margins": 0.8925809264183044, "rewards/rejected": -1.9110288619995117, "step": 1244 }, { "epoch": 1.18, "grad_norm": 26.61286163330078, "learning_rate": 3.378803777544596e-07, "logps/chosen": -49.75404357910156, "logps/rejected": -57.819114685058594, "loss": 0.5683, "losses/dpo": 0.47424811124801636, "losses/sft": 1.544967532157898, "losses/total": 0.47424811124801636, "ref_logps/chosen": -40.68010711669922, "ref_logps/rejected": -42.18236541748047, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9073936939239502, "rewards/margins": 0.6562814712524414, "rewards/rejected": -1.5636751651763916, "step": 1245 }, { "epoch": 1.18, "grad_norm": 20.838897705078125, "learning_rate": 3.377054914305701e-07, "logps/chosen": -55.13612747192383, "logps/rejected": -88.3150634765625, "loss": 0.3553, "losses/dpo": 0.16532008349895477, "losses/sft": 1.6109546422958374, "losses/total": 0.16532008349895477, "ref_logps/chosen": -41.640995025634766, "ref_logps/rejected": -63.374874114990234, "rewards/accuracies": 0.875, "rewards/chosen": -1.349513053894043, "rewards/margins": 1.1445056200027466, "rewards/rejected": -2.494018793106079, "step": 1246 }, { "epoch": 1.18, "grad_norm": 21.849136352539062, "learning_rate": 3.3753060510668063e-07, "logps/chosen": -37.76445770263672, "logps/rejected": -56.318603515625, "loss": 0.58, "losses/dpo": 0.556659996509552, "losses/sft": 1.354345679283142, "losses/total": 0.556659996509552, "ref_logps/chosen": -28.26213836669922, "ref_logps/rejected": -38.53440856933594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.950231671333313, "rewards/margins": 0.8281873464584351, "rewards/rejected": -1.7784191370010376, "step": 1247 }, { "epoch": 1.18, "grad_norm": 16.14711570739746, "learning_rate": 3.373557187827912e-07, "logps/chosen": -34.53186798095703, "logps/rejected": -58.24977111816406, "loss": 0.3753, "losses/dpo": 0.19008731842041016, "losses/sft": 1.8000578880310059, "losses/total": 0.19008731842041016, "ref_logps/chosen": -24.019250869750977, "ref_logps/rejected": -37.30735397338867, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0512619018554688, "rewards/margins": 1.0429797172546387, "rewards/rejected": -2.0942416191101074, "step": 1248 }, { "epoch": 1.18, "grad_norm": 19.2435245513916, "learning_rate": 3.371808324589017e-07, "logps/chosen": -49.262107849121094, "logps/rejected": -73.62611389160156, "loss": 0.3579, "losses/dpo": 0.49603015184402466, "losses/sft": 1.5602797269821167, "losses/total": 0.49603015184402466, "ref_logps/chosen": -35.401824951171875, "ref_logps/rejected": -48.3310661315918, "rewards/accuracies": 0.875, "rewards/chosen": -1.3860286474227905, "rewards/margins": 1.1434768438339233, "rewards/rejected": -2.529505491256714, "step": 1249 }, { "epoch": 1.18, "grad_norm": 19.644094467163086, "learning_rate": 3.370059461350122e-07, "logps/chosen": -39.05671691894531, "logps/rejected": -48.12267303466797, "loss": 0.4691, "losses/dpo": 0.4304007291793823, "losses/sft": 2.091778039932251, "losses/total": 0.4304007291793823, "ref_logps/chosen": -31.152490615844727, "ref_logps/rejected": -32.670867919921875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7904224395751953, "rewards/margins": 0.7547580003738403, "rewards/rejected": -1.545180320739746, "step": 1250 }, { "epoch": 1.18, "grad_norm": 24.426603317260742, "learning_rate": 3.3683105981112273e-07, "logps/chosen": -49.237548828125, "logps/rejected": -59.71967315673828, "loss": 0.5087, "losses/dpo": 0.8596978187561035, "losses/sft": 2.3221054077148438, "losses/total": 0.8596978187561035, "ref_logps/chosen": -36.67880630493164, "ref_logps/rejected": -39.929840087890625, "rewards/accuracies": 0.75, "rewards/chosen": -1.2558743953704834, "rewards/margins": 0.7231088280677795, "rewards/rejected": -1.9789831638336182, "step": 1251 }, { "epoch": 1.18, "grad_norm": 18.18250274658203, "learning_rate": 3.366561734872333e-07, "logps/chosen": -44.187374114990234, "logps/rejected": -61.64546203613281, "loss": 0.3814, "losses/dpo": 0.21781179308891296, "losses/sft": 1.3680617809295654, "losses/total": 0.21781179308891296, "ref_logps/chosen": -35.716651916503906, "ref_logps/rejected": -42.11402893066406, "rewards/accuracies": 0.875, "rewards/chosen": -0.8470723628997803, "rewards/margins": 1.1060707569122314, "rewards/rejected": -1.9531432390213013, "step": 1252 }, { "epoch": 1.18, "grad_norm": 21.00637435913086, "learning_rate": 3.364812871633438e-07, "logps/chosen": -41.177223205566406, "logps/rejected": -61.0933837890625, "loss": 0.4484, "losses/dpo": 0.7239801287651062, "losses/sft": 2.150071620941162, "losses/total": 0.7239801287651062, "ref_logps/chosen": -31.72400665283203, "ref_logps/rejected": -39.310707092285156, "rewards/accuracies": 0.75, "rewards/chosen": -0.9453219771385193, "rewards/margins": 1.2329456806182861, "rewards/rejected": -2.178267478942871, "step": 1253 }, { "epoch": 1.18, "grad_norm": 16.84493637084961, "learning_rate": 3.363064008394543e-07, "logps/chosen": -33.50115203857422, "logps/rejected": -46.99577331542969, "loss": 0.4482, "losses/dpo": 0.5594444274902344, "losses/sft": 1.4713568687438965, "losses/total": 0.5594444274902344, "ref_logps/chosen": -26.529706954956055, "ref_logps/rejected": -29.561607360839844, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6971445083618164, "rewards/margins": 1.0462722778320312, "rewards/rejected": -1.7434167861938477, "step": 1254 }, { "epoch": 1.19, "grad_norm": 24.219676971435547, "learning_rate": 3.361315145155649e-07, "logps/chosen": -51.231842041015625, "logps/rejected": -53.90184020996094, "loss": 0.5889, "losses/dpo": 0.892292857170105, "losses/sft": 1.986249566078186, "losses/total": 0.892292857170105, "ref_logps/chosen": -38.029048919677734, "ref_logps/rejected": -33.53123474121094, "rewards/accuracies": 0.6875, "rewards/chosen": -1.320279598236084, "rewards/margins": 0.7167806625366211, "rewards/rejected": -2.037060260772705, "step": 1255 }, { "epoch": 1.19, "grad_norm": 21.42949104309082, "learning_rate": 3.359566281916754e-07, "logps/chosen": -47.90787887573242, "logps/rejected": -60.8084716796875, "loss": 0.4334, "losses/dpo": 0.3060031831264496, "losses/sft": 1.823817491531372, "losses/total": 0.3060031831264496, "ref_logps/chosen": -38.683349609375, "ref_logps/rejected": -40.10588073730469, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9224532246589661, "rewards/margins": 1.1478058099746704, "rewards/rejected": -2.0702590942382812, "step": 1256 }, { "epoch": 1.19, "grad_norm": 15.512833595275879, "learning_rate": 3.357817418677859e-07, "logps/chosen": -43.582130432128906, "logps/rejected": -81.34471130371094, "loss": 0.246, "losses/dpo": 0.11325369775295258, "losses/sft": 1.1968235969543457, "losses/total": 0.11325369775295258, "ref_logps/chosen": -32.442893981933594, "ref_logps/rejected": -51.03504943847656, "rewards/accuracies": 0.875, "rewards/chosen": -1.1139239072799683, "rewards/margins": 1.9170420169830322, "rewards/rejected": -3.030965805053711, "step": 1257 }, { "epoch": 1.19, "grad_norm": 28.494823455810547, "learning_rate": 3.356068555438964e-07, "logps/chosen": -55.90460968017578, "logps/rejected": -67.21074676513672, "loss": 0.5795, "losses/dpo": 0.6859239339828491, "losses/sft": 1.5206575393676758, "losses/total": 0.6859239339828491, "ref_logps/chosen": -41.750732421875, "ref_logps/rejected": -45.588375091552734, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4153876304626465, "rewards/margins": 0.7468500137329102, "rewards/rejected": -2.1622376441955566, "step": 1258 }, { "epoch": 1.19, "grad_norm": 17.048093795776367, "learning_rate": 3.35431969220007e-07, "logps/chosen": -40.54545593261719, "logps/rejected": -58.8145751953125, "loss": 0.3684, "losses/dpo": 0.15894848108291626, "losses/sft": 1.6647748947143555, "losses/total": 0.15894848108291626, "ref_logps/chosen": -30.776290893554688, "ref_logps/rejected": -39.05743408203125, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9769164323806763, "rewards/margins": 0.9987971186637878, "rewards/rejected": -1.9757134914398193, "step": 1259 }, { "epoch": 1.19, "grad_norm": 19.45284080505371, "learning_rate": 3.352570828961175e-07, "logps/chosen": -37.42103958129883, "logps/rejected": -52.38259506225586, "loss": 0.4916, "losses/dpo": 0.505672812461853, "losses/sft": 1.741233468055725, "losses/total": 0.505672812461853, "ref_logps/chosen": -28.626670837402344, "ref_logps/rejected": -35.35721969604492, "rewards/accuracies": 0.75, "rewards/chosen": -0.8794369697570801, "rewards/margins": 0.8231004476547241, "rewards/rejected": -1.7025374174118042, "step": 1260 }, { "epoch": 1.19, "grad_norm": 28.87559700012207, "learning_rate": 3.35082196572228e-07, "logps/chosen": -51.134605407714844, "logps/rejected": -61.78376770019531, "loss": 0.5852, "losses/dpo": 1.0553419589996338, "losses/sft": 2.0093259811401367, "losses/total": 1.0553419589996338, "ref_logps/chosen": -37.38733673095703, "ref_logps/rejected": -41.00181198120117, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3747273683547974, "rewards/margins": 0.7034682035446167, "rewards/rejected": -2.078195571899414, "step": 1261 }, { "epoch": 1.19, "grad_norm": 25.339797973632812, "learning_rate": 3.349073102483386e-07, "logps/chosen": -46.318939208984375, "logps/rejected": -49.26006317138672, "loss": 0.6336, "losses/dpo": 0.5517014265060425, "losses/sft": 1.5738214254379272, "losses/total": 0.5517014265060425, "ref_logps/chosen": -35.05769348144531, "ref_logps/rejected": -32.71645736694336, "rewards/accuracies": 0.75, "rewards/chosen": -1.1261239051818848, "rewards/margins": 0.5282371044158936, "rewards/rejected": -1.6543610095977783, "step": 1262 }, { "epoch": 1.19, "grad_norm": 11.409197807312012, "learning_rate": 3.347324239244491e-07, "logps/chosen": -44.18559265136719, "logps/rejected": -73.9047622680664, "loss": 0.2993, "losses/dpo": 0.1533624827861786, "losses/sft": 1.511976957321167, "losses/total": 0.1533624827861786, "ref_logps/chosen": -36.915496826171875, "ref_logps/rejected": -49.74488067626953, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7270100116729736, "rewards/margins": 1.6889781951904297, "rewards/rejected": -2.4159882068634033, "step": 1263 }, { "epoch": 1.19, "grad_norm": 16.06197166442871, "learning_rate": 3.345575376005596e-07, "logps/chosen": -39.298484802246094, "logps/rejected": -72.40126037597656, "loss": 0.3503, "losses/dpo": 0.23409026861190796, "losses/sft": 1.4718996286392212, "losses/total": 0.23409026861190796, "ref_logps/chosen": -29.222522735595703, "ref_logps/rejected": -49.64337158203125, "rewards/accuracies": 0.875, "rewards/chosen": -1.0075957775115967, "rewards/margins": 1.2681934833526611, "rewards/rejected": -2.275789260864258, "step": 1264 }, { "epoch": 1.19, "grad_norm": 21.85924530029297, "learning_rate": 3.3438265127667017e-07, "logps/chosen": -52.99810028076172, "logps/rejected": -59.741310119628906, "loss": 0.4037, "losses/dpo": 0.6012160778045654, "losses/sft": 2.290933132171631, "losses/total": 0.6012160778045654, "ref_logps/chosen": -41.28215026855469, "ref_logps/rejected": -36.79560089111328, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1715948581695557, "rewards/margins": 1.1229760646820068, "rewards/rejected": -2.2945709228515625, "step": 1265 }, { "epoch": 1.2, "grad_norm": 17.8614501953125, "learning_rate": 3.342077649527807e-07, "logps/chosen": -41.201026916503906, "logps/rejected": -48.78249740600586, "loss": 0.4047, "losses/dpo": 0.5122185349464417, "losses/sft": 1.697983741760254, "losses/total": 0.5122185349464417, "ref_logps/chosen": -35.62199401855469, "ref_logps/rejected": -33.27435302734375, "rewards/accuracies": 0.875, "rewards/chosen": -0.5579028725624084, "rewards/margins": 0.992911696434021, "rewards/rejected": -1.5508146286010742, "step": 1266 }, { "epoch": 1.2, "grad_norm": 15.121383666992188, "learning_rate": 3.340328786288912e-07, "logps/chosen": -48.94519805908203, "logps/rejected": -60.90454864501953, "loss": 0.3843, "losses/dpo": 0.3824542760848999, "losses/sft": 1.8560597896575928, "losses/total": 0.3824542760848999, "ref_logps/chosen": -39.710853576660156, "ref_logps/rejected": -41.079078674316406, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9234342575073242, "rewards/margins": 1.059112787246704, "rewards/rejected": -1.9825471639633179, "step": 1267 }, { "epoch": 1.2, "grad_norm": 14.699516296386719, "learning_rate": 3.338579923050017e-07, "logps/chosen": -47.368682861328125, "logps/rejected": -55.56596374511719, "loss": 0.4032, "losses/dpo": 0.6939573287963867, "losses/sft": 2.2385778427124023, "losses/total": 0.6939573287963867, "ref_logps/chosen": -37.04937744140625, "ref_logps/rejected": -34.13915252685547, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0319303274154663, "rewards/margins": 1.110750436782837, "rewards/rejected": -2.1426808834075928, "step": 1268 }, { "epoch": 1.2, "grad_norm": 17.357484817504883, "learning_rate": 3.336831059811123e-07, "logps/chosen": -44.28623580932617, "logps/rejected": -64.76651763916016, "loss": 0.3356, "losses/dpo": 0.3124982416629791, "losses/sft": 1.9567397832870483, "losses/total": 0.3124982416629791, "ref_logps/chosen": -31.228628158569336, "ref_logps/rejected": -39.99836730957031, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3057608604431152, "rewards/margins": 1.1710538864135742, "rewards/rejected": -2.4768147468566895, "step": 1269 }, { "epoch": 1.2, "grad_norm": 19.059419631958008, "learning_rate": 3.335082196572228e-07, "logps/chosen": -52.90300750732422, "logps/rejected": -71.27776336669922, "loss": 0.3608, "losses/dpo": 0.24609430134296417, "losses/sft": 1.595199704170227, "losses/total": 0.24609430134296417, "ref_logps/chosen": -37.32032012939453, "ref_logps/rejected": -43.765377044677734, "rewards/accuracies": 0.9375, "rewards/chosen": -1.558268666267395, "rewards/margins": 1.1929700374603271, "rewards/rejected": -2.7512388229370117, "step": 1270 }, { "epoch": 1.2, "grad_norm": 25.79266929626465, "learning_rate": 3.333333333333333e-07, "logps/chosen": -64.91693115234375, "logps/rejected": -75.42364501953125, "loss": 0.4968, "losses/dpo": 0.34554123878479004, "losses/sft": 1.9924956560134888, "losses/total": 0.34554123878479004, "ref_logps/chosen": -47.75408935546875, "ref_logps/rejected": -50.0391731262207, "rewards/accuracies": 0.75, "rewards/chosen": -1.7162843942642212, "rewards/margins": 0.822162926197052, "rewards/rejected": -2.538447380065918, "step": 1271 }, { "epoch": 1.2, "grad_norm": 21.458112716674805, "learning_rate": 3.3315844700944387e-07, "logps/chosen": -52.42706298828125, "logps/rejected": -64.53559875488281, "loss": 0.4788, "losses/dpo": 0.2602001428604126, "losses/sft": 1.9368573427200317, "losses/total": 0.2602001428604126, "ref_logps/chosen": -39.55405044555664, "ref_logps/rejected": -43.710662841796875, "rewards/accuracies": 0.75, "rewards/chosen": -1.2873010635375977, "rewards/margins": 0.795192539691925, "rewards/rejected": -2.082493782043457, "step": 1272 }, { "epoch": 1.2, "grad_norm": 22.87200927734375, "learning_rate": 3.329835606855544e-07, "logps/chosen": -56.033592224121094, "logps/rejected": -70.26383209228516, "loss": 0.4821, "losses/dpo": 0.4073163866996765, "losses/sft": 1.952742576599121, "losses/total": 0.4073163866996765, "ref_logps/chosen": -40.43696212768555, "ref_logps/rejected": -44.102195739746094, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5596632957458496, "rewards/margins": 1.0565005540847778, "rewards/rejected": -2.616163730621338, "step": 1273 }, { "epoch": 1.2, "grad_norm": 20.100311279296875, "learning_rate": 3.328086743616649e-07, "logps/chosen": -52.1953125, "logps/rejected": -70.45608520507812, "loss": 0.3996, "losses/dpo": 0.4546908438205719, "losses/sft": 1.8390356302261353, "losses/total": 0.4546908438205719, "ref_logps/chosen": -38.603729248046875, "ref_logps/rejected": -45.85691833496094, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3591587543487549, "rewards/margins": 1.1007575988769531, "rewards/rejected": -2.459916591644287, "step": 1274 }, { "epoch": 1.2, "grad_norm": 25.579519271850586, "learning_rate": 3.326337880377754e-07, "logps/chosen": -54.416290283203125, "logps/rejected": -63.55059814453125, "loss": 0.4958, "losses/dpo": 0.85321044921875, "losses/sft": 1.9153268337249756, "losses/total": 0.85321044921875, "ref_logps/chosen": -41.786014556884766, "ref_logps/rejected": -44.38050079345703, "rewards/accuracies": 0.75, "rewards/chosen": -1.2630276679992676, "rewards/margins": 0.653982400894165, "rewards/rejected": -1.9170100688934326, "step": 1275 }, { "epoch": 1.2, "grad_norm": 17.778179168701172, "learning_rate": 3.3245890171388597e-07, "logps/chosen": -44.57392120361328, "logps/rejected": -61.3828125, "loss": 0.3441, "losses/dpo": 0.3692394495010376, "losses/sft": 1.4996060132980347, "losses/total": 0.3692394495010376, "ref_logps/chosen": -33.99726867675781, "ref_logps/rejected": -38.345863342285156, "rewards/accuracies": 0.875, "rewards/chosen": -1.0576651096343994, "rewards/margins": 1.2460298538208008, "rewards/rejected": -2.3036949634552, "step": 1276 }, { "epoch": 1.21, "grad_norm": 15.322132110595703, "learning_rate": 3.322840153899965e-07, "logps/chosen": -48.347389221191406, "logps/rejected": -68.76718139648438, "loss": 0.2668, "losses/dpo": 0.1739310324192047, "losses/sft": 1.8910424709320068, "losses/total": 0.1739310324192047, "ref_logps/chosen": -38.66949462890625, "ref_logps/rejected": -44.7320442199707, "rewards/accuracies": 1.0, "rewards/chosen": -0.9677895903587341, "rewards/margins": 1.4357243776321411, "rewards/rejected": -2.4035139083862305, "step": 1277 }, { "epoch": 1.21, "grad_norm": 24.36154556274414, "learning_rate": 3.32109129066107e-07, "logps/chosen": -48.66584777832031, "logps/rejected": -68.83869934082031, "loss": 0.4297, "losses/dpo": 0.3207012414932251, "losses/sft": 1.997493863105774, "losses/total": 0.3207012414932251, "ref_logps/chosen": -36.260589599609375, "ref_logps/rejected": -46.24098205566406, "rewards/accuracies": 0.875, "rewards/chosen": -1.2405259609222412, "rewards/margins": 1.0192451477050781, "rewards/rejected": -2.2597711086273193, "step": 1278 }, { "epoch": 1.21, "grad_norm": 14.60727596282959, "learning_rate": 3.3193424274221756e-07, "logps/chosen": -47.10565185546875, "logps/rejected": -75.04266357421875, "loss": 0.2979, "losses/dpo": 0.30064719915390015, "losses/sft": 1.6382322311401367, "losses/total": 0.30064719915390015, "ref_logps/chosen": -34.7554931640625, "ref_logps/rejected": -46.60670471191406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.235015869140625, "rewards/margins": 1.6085803508758545, "rewards/rejected": -2.8435959815979004, "step": 1279 }, { "epoch": 1.21, "grad_norm": 19.049394607543945, "learning_rate": 3.3175935641832807e-07, "logps/chosen": -42.42154312133789, "logps/rejected": -57.276573181152344, "loss": 0.3747, "losses/dpo": 0.4905742406845093, "losses/sft": 2.6760754585266113, "losses/total": 0.4905742406845093, "ref_logps/chosen": -31.650192260742188, "ref_logps/rejected": -35.149505615234375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0771350860595703, "rewards/margins": 1.1355715990066528, "rewards/rejected": -2.2127065658569336, "step": 1280 }, { "epoch": 1.21, "grad_norm": 17.385385513305664, "learning_rate": 3.315844700944386e-07, "logps/chosen": -38.581573486328125, "logps/rejected": -62.73283004760742, "loss": 0.3966, "losses/dpo": 0.558727502822876, "losses/sft": 2.257667064666748, "losses/total": 0.558727502822876, "ref_logps/chosen": -28.25278091430664, "ref_logps/rejected": -38.52047348022461, "rewards/accuracies": 0.75, "rewards/chosen": -1.0328795909881592, "rewards/margins": 1.3883559703826904, "rewards/rejected": -2.4212355613708496, "step": 1281 }, { "epoch": 1.21, "grad_norm": 21.758419036865234, "learning_rate": 3.314095837705491e-07, "logps/chosen": -43.095306396484375, "logps/rejected": -64.56269836425781, "loss": 0.3912, "losses/dpo": 0.4416448473930359, "losses/sft": 1.790996789932251, "losses/total": 0.4416448473930359, "ref_logps/chosen": -29.484451293945312, "ref_logps/rejected": -39.142303466796875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.361085295677185, "rewards/margins": 1.1809539794921875, "rewards/rejected": -2.542039394378662, "step": 1282 }, { "epoch": 1.21, "grad_norm": 15.446070671081543, "learning_rate": 3.3123469744665966e-07, "logps/chosen": -54.28368377685547, "logps/rejected": -66.2320556640625, "loss": 0.3684, "losses/dpo": 0.6027462482452393, "losses/sft": 2.1526315212249756, "losses/total": 0.6027462482452393, "ref_logps/chosen": -41.75803756713867, "ref_logps/rejected": -40.7514762878418, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2525646686553955, "rewards/margins": 1.2954933643341064, "rewards/rejected": -2.548058271408081, "step": 1283 }, { "epoch": 1.21, "grad_norm": 29.56391143798828, "learning_rate": 3.3105981112277023e-07, "logps/chosen": -55.56463623046875, "logps/rejected": -52.370445251464844, "loss": 0.6312, "losses/dpo": 0.940421462059021, "losses/sft": 2.232243537902832, "losses/total": 0.940421462059021, "ref_logps/chosen": -40.911827087402344, "ref_logps/rejected": -34.261634826660156, "rewards/accuracies": 0.75, "rewards/chosen": -1.4652807712554932, "rewards/margins": 0.34560006856918335, "rewards/rejected": -1.8108808994293213, "step": 1284 }, { "epoch": 1.21, "grad_norm": 17.717260360717773, "learning_rate": 3.308849247988807e-07, "logps/chosen": -48.872928619384766, "logps/rejected": -74.94236755371094, "loss": 0.2708, "losses/dpo": 0.27558350563049316, "losses/sft": 1.7259505987167358, "losses/total": 0.27558350563049316, "ref_logps/chosen": -36.185546875, "ref_logps/rejected": -46.036842346191406, "rewards/accuracies": 0.875, "rewards/chosen": -1.2687382698059082, "rewards/margins": 1.6218140125274658, "rewards/rejected": -2.890552282333374, "step": 1285 }, { "epoch": 1.21, "grad_norm": 22.396329879760742, "learning_rate": 3.3071003847499126e-07, "logps/chosen": -51.96961975097656, "logps/rejected": -74.1020278930664, "loss": 0.5146, "losses/dpo": 0.5004276633262634, "losses/sft": 2.29833984375, "losses/total": 0.5004276633262634, "ref_logps/chosen": -34.832855224609375, "ref_logps/rejected": -49.5932731628418, "rewards/accuracies": 0.75, "rewards/chosen": -1.7136763334274292, "rewards/margins": 0.7371993064880371, "rewards/rejected": -2.450875759124756, "step": 1286 }, { "epoch": 1.22, "grad_norm": 15.472954750061035, "learning_rate": 3.3053515215110177e-07, "logps/chosen": -47.909706115722656, "logps/rejected": -91.96464538574219, "loss": 0.2595, "losses/dpo": 0.10487969219684601, "losses/sft": 1.825188159942627, "losses/total": 0.10487969219684601, "ref_logps/chosen": -34.9535026550293, "ref_logps/rejected": -61.30604553222656, "rewards/accuracies": 0.875, "rewards/chosen": -1.295620083808899, "rewards/margins": 1.770240068435669, "rewards/rejected": -3.0658605098724365, "step": 1287 }, { "epoch": 1.22, "grad_norm": 22.184104919433594, "learning_rate": 3.303602658272123e-07, "logps/chosen": -50.46155548095703, "logps/rejected": -80.95024108886719, "loss": 0.3384, "losses/dpo": 0.18355077505111694, "losses/sft": 1.5707978010177612, "losses/total": 0.18355077505111694, "ref_logps/chosen": -39.31085968017578, "ref_logps/rejected": -54.36072540283203, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1150697469711304, "rewards/margins": 1.5438816547393799, "rewards/rejected": -2.6589515209198, "step": 1288 }, { "epoch": 1.22, "grad_norm": 21.306243896484375, "learning_rate": 3.301853795033228e-07, "logps/chosen": -43.325958251953125, "logps/rejected": -68.84542083740234, "loss": 0.3303, "losses/dpo": 0.24994681775569916, "losses/sft": 1.3414628505706787, "losses/total": 0.24994681775569916, "ref_logps/chosen": -30.669391632080078, "ref_logps/rejected": -40.66756820678711, "rewards/accuracies": 0.875, "rewards/chosen": -1.2656567096710205, "rewards/margins": 1.5521284341812134, "rewards/rejected": -2.8177850246429443, "step": 1289 }, { "epoch": 1.22, "grad_norm": 21.933183670043945, "learning_rate": 3.3001049317943336e-07, "logps/chosen": -49.80470275878906, "logps/rejected": -60.630489349365234, "loss": 0.5128, "losses/dpo": 0.545912504196167, "losses/sft": 1.558157205581665, "losses/total": 0.545912504196167, "ref_logps/chosen": -34.777557373046875, "ref_logps/rejected": -37.527740478515625, "rewards/accuracies": 0.6875, "rewards/chosen": -1.50271475315094, "rewards/margins": 0.8075599074363708, "rewards/rejected": -2.310274600982666, "step": 1290 }, { "epoch": 1.22, "grad_norm": 23.52597999572754, "learning_rate": 3.298356068555439e-07, "logps/chosen": -51.77589416503906, "logps/rejected": -69.64151763916016, "loss": 0.4809, "losses/dpo": 0.8141963481903076, "losses/sft": 1.747296690940857, "losses/total": 0.8141963481903076, "ref_logps/chosen": -33.86619186401367, "ref_logps/rejected": -42.238868713378906, "rewards/accuracies": 0.875, "rewards/chosen": -1.790969967842102, "rewards/margins": 0.949295163154602, "rewards/rejected": -2.740265369415283, "step": 1291 }, { "epoch": 1.22, "grad_norm": 30.15660285949707, "learning_rate": 3.296607205316544e-07, "logps/chosen": -51.819515228271484, "logps/rejected": -74.5934066772461, "loss": 0.4627, "losses/dpo": 0.306501179933548, "losses/sft": 1.456073522567749, "losses/total": 0.306501179933548, "ref_logps/chosen": -37.83601760864258, "ref_logps/rejected": -48.12083053588867, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3983497619628906, "rewards/margins": 1.2489078044891357, "rewards/rejected": -2.6472578048706055, "step": 1292 }, { "epoch": 1.22, "grad_norm": 30.59558868408203, "learning_rate": 3.2948583420776495e-07, "logps/chosen": -53.75914001464844, "logps/rejected": -62.064453125, "loss": 0.6725, "losses/dpo": 0.6996077299118042, "losses/sft": 1.8936095237731934, "losses/total": 0.6996077299118042, "ref_logps/chosen": -38.6866569519043, "ref_logps/rejected": -39.625144958496094, "rewards/accuracies": 0.75, "rewards/chosen": -1.5072486400604248, "rewards/margins": 0.7366818785667419, "rewards/rejected": -2.2439303398132324, "step": 1293 }, { "epoch": 1.22, "grad_norm": 19.79813575744629, "learning_rate": 3.2931094788387546e-07, "logps/chosen": -48.740421295166016, "logps/rejected": -67.98152160644531, "loss": 0.3642, "losses/dpo": 0.4135620594024658, "losses/sft": 2.6681344509124756, "losses/total": 0.4135620594024658, "ref_logps/chosen": -33.50664138793945, "ref_logps/rejected": -41.82233810424805, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5233776569366455, "rewards/margins": 1.092540979385376, "rewards/rejected": -2.6159186363220215, "step": 1294 }, { "epoch": 1.22, "grad_norm": 32.642818450927734, "learning_rate": 3.29136061559986e-07, "logps/chosen": -58.44609451293945, "logps/rejected": -64.89666748046875, "loss": 0.6469, "losses/dpo": 0.526237428188324, "losses/sft": 1.6568933725357056, "losses/total": 0.526237428188324, "ref_logps/chosen": -41.75444030761719, "ref_logps/rejected": -41.766197204589844, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6691652536392212, "rewards/margins": 0.6438819766044617, "rewards/rejected": -2.313047170639038, "step": 1295 }, { "epoch": 1.22, "grad_norm": 22.285186767578125, "learning_rate": 3.289611752360965e-07, "logps/chosen": -45.24211883544922, "logps/rejected": -68.41378021240234, "loss": 0.4107, "losses/dpo": 0.24445749819278717, "losses/sft": 1.6323164701461792, "losses/total": 0.24445749819278717, "ref_logps/chosen": -33.608001708984375, "ref_logps/rejected": -44.66828918457031, "rewards/accuracies": 0.8125, "rewards/chosen": -1.163412094116211, "rewards/margins": 1.211136817932129, "rewards/rejected": -2.37454891204834, "step": 1296 }, { "epoch": 1.22, "grad_norm": 19.355915069580078, "learning_rate": 3.2878628891220705e-07, "logps/chosen": -56.947296142578125, "logps/rejected": -81.64164733886719, "loss": 0.3288, "losses/dpo": 0.19625043869018555, "losses/sft": 1.7529845237731934, "losses/total": 0.19625043869018555, "ref_logps/chosen": -42.715492248535156, "ref_logps/rejected": -51.96495819091797, "rewards/accuracies": 0.875, "rewards/chosen": -1.4231808185577393, "rewards/margins": 1.544487714767456, "rewards/rejected": -2.9676685333251953, "step": 1297 }, { "epoch": 1.23, "grad_norm": 15.184709548950195, "learning_rate": 3.286114025883176e-07, "logps/chosen": -43.13945388793945, "logps/rejected": -76.0487289428711, "loss": 0.2396, "losses/dpo": 0.22111205756664276, "losses/sft": 1.25008225440979, "losses/total": 0.22111205756664276, "ref_logps/chosen": -33.227134704589844, "ref_logps/rejected": -49.567604064941406, "rewards/accuracies": 1.0, "rewards/chosen": -0.9912323951721191, "rewards/margins": 1.6568796634674072, "rewards/rejected": -2.6481118202209473, "step": 1298 }, { "epoch": 1.23, "grad_norm": 22.35556411743164, "learning_rate": 3.284365162644281e-07, "logps/chosen": -48.418701171875, "logps/rejected": -66.66232299804688, "loss": 0.4478, "losses/dpo": 0.5131890773773193, "losses/sft": 1.5699821710586548, "losses/total": 0.5131890773773193, "ref_logps/chosen": -36.57819747924805, "ref_logps/rejected": -43.835689544677734, "rewards/accuracies": 0.875, "rewards/chosen": -1.1840505599975586, "rewards/margins": 1.0986123085021973, "rewards/rejected": -2.282662868499756, "step": 1299 }, { "epoch": 1.23, "grad_norm": 18.5380916595459, "learning_rate": 3.2826162994053864e-07, "logps/chosen": -61.87382507324219, "logps/rejected": -77.09158325195312, "loss": 0.325, "losses/dpo": 0.24416860938072205, "losses/sft": 1.715246319770813, "losses/total": 0.24416860938072205, "ref_logps/chosen": -47.20370864868164, "ref_logps/rejected": -48.25052261352539, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4670116901397705, "rewards/margins": 1.4170947074890137, "rewards/rejected": -2.884106397628784, "step": 1300 }, { "epoch": 1.23, "grad_norm": 11.219528198242188, "learning_rate": 3.2808674361664916e-07, "logps/chosen": -37.850894927978516, "logps/rejected": -75.34614562988281, "loss": 0.2226, "losses/dpo": 0.19624850153923035, "losses/sft": 1.3619885444641113, "losses/total": 0.19624850153923035, "ref_logps/chosen": -29.011240005493164, "ref_logps/rejected": -44.524391174316406, "rewards/accuracies": 0.875, "rewards/chosen": -0.883965253829956, "rewards/margins": 2.1982107162475586, "rewards/rejected": -3.0821757316589355, "step": 1301 }, { "epoch": 1.23, "grad_norm": 18.190052032470703, "learning_rate": 3.2791185729275967e-07, "logps/chosen": -54.94478225708008, "logps/rejected": -64.18688201904297, "loss": 0.3208, "losses/dpo": 0.2329922318458557, "losses/sft": 2.344893455505371, "losses/total": 0.2329922318458557, "ref_logps/chosen": -42.08929443359375, "ref_logps/rejected": -39.92123031616211, "rewards/accuracies": 0.875, "rewards/chosen": -1.2855491638183594, "rewards/margins": 1.1410161256790161, "rewards/rejected": -2.426565170288086, "step": 1302 }, { "epoch": 1.23, "grad_norm": 23.517995834350586, "learning_rate": 3.277369709688702e-07, "logps/chosen": -46.17790985107422, "logps/rejected": -67.5840072631836, "loss": 0.471, "losses/dpo": 0.3811895549297333, "losses/sft": 1.5601413249969482, "losses/total": 0.3811895549297333, "ref_logps/chosen": -32.570960998535156, "ref_logps/rejected": -43.74147033691406, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3606951236724854, "rewards/margins": 1.0235587358474731, "rewards/rejected": -2.384253978729248, "step": 1303 }, { "epoch": 1.23, "grad_norm": 28.837244033813477, "learning_rate": 3.2756208464498075e-07, "logps/chosen": -51.181602478027344, "logps/rejected": -68.76593017578125, "loss": 0.5232, "losses/dpo": 0.3507500886917114, "losses/sft": 1.9810177087783813, "losses/total": 0.3507500886917114, "ref_logps/chosen": -36.52156066894531, "ref_logps/rejected": -45.390869140625, "rewards/accuracies": 0.625, "rewards/chosen": -1.4660041332244873, "rewards/margins": 0.8715018033981323, "rewards/rejected": -2.33750581741333, "step": 1304 }, { "epoch": 1.23, "grad_norm": 18.90814971923828, "learning_rate": 3.273871983210913e-07, "logps/chosen": -43.88724899291992, "logps/rejected": -69.35987854003906, "loss": 0.3235, "losses/dpo": 0.17113202810287476, "losses/sft": 1.858979344367981, "losses/total": 0.17113202810287476, "ref_logps/chosen": -32.038177490234375, "ref_logps/rejected": -43.49333190917969, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1849069595336914, "rewards/margins": 1.4017481803894043, "rewards/rejected": -2.5866551399230957, "step": 1305 }, { "epoch": 1.23, "grad_norm": 22.561180114746094, "learning_rate": 3.2721231199720177e-07, "logps/chosen": -53.358001708984375, "logps/rejected": -76.11177825927734, "loss": 0.4161, "losses/dpo": 0.31508854031562805, "losses/sft": 1.9650845527648926, "losses/total": 0.31508854031562805, "ref_logps/chosen": -40.74730682373047, "ref_logps/rejected": -48.347511291503906, "rewards/accuracies": 0.75, "rewards/chosen": -1.2610701322555542, "rewards/margins": 1.5153571367263794, "rewards/rejected": -2.7764272689819336, "step": 1306 }, { "epoch": 1.23, "grad_norm": 19.188674926757812, "learning_rate": 3.2703742567331234e-07, "logps/chosen": -48.32026672363281, "logps/rejected": -57.26163864135742, "loss": 0.444, "losses/dpo": 0.276664137840271, "losses/sft": 1.4580862522125244, "losses/total": 0.276664137840271, "ref_logps/chosen": -37.229698181152344, "ref_logps/rejected": -35.70116424560547, "rewards/accuracies": 0.875, "rewards/chosen": -1.1090574264526367, "rewards/margins": 1.0469906330108643, "rewards/rejected": -2.156047821044922, "step": 1307 }, { "epoch": 1.24, "grad_norm": 21.35573959350586, "learning_rate": 3.2686253934942285e-07, "logps/chosen": -50.68305206298828, "logps/rejected": -65.30364990234375, "loss": 0.486, "losses/dpo": 0.5735315084457397, "losses/sft": 1.9887821674346924, "losses/total": 0.5735315084457397, "ref_logps/chosen": -36.80727005004883, "ref_logps/rejected": -42.483665466308594, "rewards/accuracies": 0.625, "rewards/chosen": -1.3875783681869507, "rewards/margins": 0.8944202661514282, "rewards/rejected": -2.281998634338379, "step": 1308 }, { "epoch": 1.24, "grad_norm": 21.293556213378906, "learning_rate": 3.266876530255334e-07, "logps/chosen": -50.35670852661133, "logps/rejected": -81.25212097167969, "loss": 0.3172, "losses/dpo": 0.2058955579996109, "losses/sft": 1.4526883363723755, "losses/total": 0.2058955579996109, "ref_logps/chosen": -36.02672576904297, "ref_logps/rejected": -49.118896484375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4329981803894043, "rewards/margins": 1.7803233861923218, "rewards/rejected": -3.2133214473724365, "step": 1309 }, { "epoch": 1.24, "grad_norm": 32.47032165527344, "learning_rate": 3.2651276670164393e-07, "logps/chosen": -55.274818420410156, "logps/rejected": -68.23551940917969, "loss": 0.5644, "losses/dpo": 0.3510875105857849, "losses/sft": 1.2595924139022827, "losses/total": 0.3510875105857849, "ref_logps/chosen": -40.185157775878906, "ref_logps/rejected": -45.297996520996094, "rewards/accuracies": 0.625, "rewards/chosen": -1.5089654922485352, "rewards/margins": 0.784786581993103, "rewards/rejected": -2.2937521934509277, "step": 1310 }, { "epoch": 1.24, "grad_norm": 30.444149017333984, "learning_rate": 3.2633788037775444e-07, "logps/chosen": -55.1437873840332, "logps/rejected": -63.98257827758789, "loss": 0.6349, "losses/dpo": 0.4569612741470337, "losses/sft": 1.7529760599136353, "losses/total": 0.4569612741470337, "ref_logps/chosen": -41.296539306640625, "ref_logps/rejected": -43.07489013671875, "rewards/accuracies": 0.625, "rewards/chosen": -1.3847248554229736, "rewards/margins": 0.7060439586639404, "rewards/rejected": -2.090768814086914, "step": 1311 }, { "epoch": 1.24, "grad_norm": 15.60333251953125, "learning_rate": 3.26162994053865e-07, "logps/chosen": -44.36737823486328, "logps/rejected": -68.90261840820312, "loss": 0.3255, "losses/dpo": 0.36724573373794556, "losses/sft": 2.181623697280884, "losses/total": 0.36724573373794556, "ref_logps/chosen": -34.942996978759766, "ref_logps/rejected": -44.84746551513672, "rewards/accuracies": 0.875, "rewards/chosen": -0.9424382448196411, "rewards/margins": 1.4630770683288574, "rewards/rejected": -2.405515193939209, "step": 1312 }, { "epoch": 1.24, "grad_norm": 12.743868827819824, "learning_rate": 3.2598810772997547e-07, "logps/chosen": -48.645076751708984, "logps/rejected": -77.03884887695312, "loss": 0.234, "losses/dpo": 0.27354711294174194, "losses/sft": 1.7931358814239502, "losses/total": 0.27354711294174194, "ref_logps/chosen": -37.277740478515625, "ref_logps/rejected": -48.298004150390625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1367335319519043, "rewards/margins": 1.737350344657898, "rewards/rejected": -2.874083995819092, "step": 1313 }, { "epoch": 1.24, "grad_norm": 19.99807357788086, "learning_rate": 3.2581322140608603e-07, "logps/chosen": -50.415985107421875, "logps/rejected": -68.30654907226562, "loss": 0.428, "losses/dpo": 0.7504759430885315, "losses/sft": 1.4540705680847168, "losses/total": 0.7504759430885315, "ref_logps/chosen": -40.197364807128906, "ref_logps/rejected": -46.552452087402344, "rewards/accuracies": 0.75, "rewards/chosen": -1.0218619108200073, "rewards/margins": 1.153548240661621, "rewards/rejected": -2.175410270690918, "step": 1314 }, { "epoch": 1.24, "grad_norm": 20.18872833251953, "learning_rate": 3.2563833508219655e-07, "logps/chosen": -58.441917419433594, "logps/rejected": -72.74459838867188, "loss": 0.4563, "losses/dpo": 0.600630521774292, "losses/sft": 1.7910703420639038, "losses/total": 0.600630521774292, "ref_logps/chosen": -46.625999450683594, "ref_logps/rejected": -50.275848388671875, "rewards/accuracies": 0.75, "rewards/chosen": -1.1815918684005737, "rewards/margins": 1.0652837753295898, "rewards/rejected": -2.246875524520874, "step": 1315 }, { "epoch": 1.24, "grad_norm": 25.531356811523438, "learning_rate": 3.254634487583071e-07, "logps/chosen": -64.70285034179688, "logps/rejected": -60.91915512084961, "loss": 0.4955, "losses/dpo": 0.3505529761314392, "losses/sft": 1.782562017440796, "losses/total": 0.3505529761314392, "ref_logps/chosen": -50.220314025878906, "ref_logps/rejected": -36.196380615234375, "rewards/accuracies": 0.75, "rewards/chosen": -1.4482536315917969, "rewards/margins": 1.0240238904953003, "rewards/rejected": -2.4722776412963867, "step": 1316 }, { "epoch": 1.24, "grad_norm": 16.17140769958496, "learning_rate": 3.252885624344176e-07, "logps/chosen": -43.879234313964844, "logps/rejected": -75.45570373535156, "loss": 0.2886, "losses/dpo": 0.18516552448272705, "losses/sft": 1.9407551288604736, "losses/total": 0.18516552448272705, "ref_logps/chosen": -33.99541473388672, "ref_logps/rejected": -50.96930694580078, "rewards/accuracies": 0.875, "rewards/chosen": -0.9883820414543152, "rewards/margins": 1.4602574110031128, "rewards/rejected": -2.448639392852783, "step": 1317 }, { "epoch": 1.24, "grad_norm": 24.757556915283203, "learning_rate": 3.2511367611052814e-07, "logps/chosen": -60.0771369934082, "logps/rejected": -74.33728790283203, "loss": 0.5302, "losses/dpo": 0.23207569122314453, "losses/sft": 1.9666743278503418, "losses/total": 0.23207569122314453, "ref_logps/chosen": -45.66221237182617, "ref_logps/rejected": -50.77057647705078, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4414923191070557, "rewards/margins": 0.9151792526245117, "rewards/rejected": -2.3566718101501465, "step": 1318 }, { "epoch": 1.25, "grad_norm": 15.23419189453125, "learning_rate": 3.249387897866387e-07, "logps/chosen": -37.39873504638672, "logps/rejected": -70.27153015136719, "loss": 0.3191, "losses/dpo": 0.35991108417510986, "losses/sft": 1.3433964252471924, "losses/total": 0.35991108417510986, "ref_logps/chosen": -28.353347778320312, "ref_logps/rejected": -48.33560562133789, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9045386910438538, "rewards/margins": 1.2890541553497314, "rewards/rejected": -2.1935927867889404, "step": 1319 }, { "epoch": 1.25, "grad_norm": 22.617067337036133, "learning_rate": 3.2476390346274916e-07, "logps/chosen": -46.061668395996094, "logps/rejected": -51.542694091796875, "loss": 0.6039, "losses/dpo": 0.4379119277000427, "losses/sft": 1.9982529878616333, "losses/total": 0.4379119277000427, "ref_logps/chosen": -33.610599517822266, "ref_logps/rejected": -33.450477600097656, "rewards/accuracies": 0.875, "rewards/chosen": -1.2451066970825195, "rewards/margins": 0.5641152262687683, "rewards/rejected": -1.8092219829559326, "step": 1320 }, { "epoch": 1.25, "grad_norm": 21.929431915283203, "learning_rate": 3.2458901713885973e-07, "logps/chosen": -45.45535659790039, "logps/rejected": -55.51594543457031, "loss": 0.4203, "losses/dpo": 0.6898786425590515, "losses/sft": 2.0693986415863037, "losses/total": 0.6898786425590515, "ref_logps/chosen": -35.48495864868164, "ref_logps/rejected": -34.851654052734375, "rewards/accuracies": 0.875, "rewards/chosen": -0.9970399141311646, "rewards/margins": 1.0693888664245605, "rewards/rejected": -2.0664286613464355, "step": 1321 }, { "epoch": 1.25, "grad_norm": 22.96612548828125, "learning_rate": 3.2441413081497024e-07, "logps/chosen": -52.88302993774414, "logps/rejected": -73.43191528320312, "loss": 0.3806, "losses/dpo": 0.2667972445487976, "losses/sft": 1.705133080482483, "losses/total": 0.2667972445487976, "ref_logps/chosen": -41.457115173339844, "ref_logps/rejected": -48.10161209106445, "rewards/accuracies": 0.875, "rewards/chosen": -1.1425915956497192, "rewards/margins": 1.3904387950897217, "rewards/rejected": -2.5330302715301514, "step": 1322 }, { "epoch": 1.25, "grad_norm": 15.859878540039062, "learning_rate": 3.242392444910808e-07, "logps/chosen": -44.19548034667969, "logps/rejected": -60.982975006103516, "loss": 0.2992, "losses/dpo": 0.4226863384246826, "losses/sft": 1.8237709999084473, "losses/total": 0.4226863384246826, "ref_logps/chosen": -34.07893371582031, "ref_logps/rejected": -36.695064544677734, "rewards/accuracies": 0.875, "rewards/chosen": -1.0116544961929321, "rewards/margins": 1.4171364307403564, "rewards/rejected": -2.428791046142578, "step": 1323 }, { "epoch": 1.25, "grad_norm": 14.331469535827637, "learning_rate": 3.240643581671913e-07, "logps/chosen": -38.36085510253906, "logps/rejected": -58.140342712402344, "loss": 0.3015, "losses/dpo": 0.28820958733558655, "losses/sft": 1.4967849254608154, "losses/total": 0.28820958733558655, "ref_logps/chosen": -31.664566040039062, "ref_logps/rejected": -37.260887145996094, "rewards/accuracies": 0.875, "rewards/chosen": -0.6696290969848633, "rewards/margins": 1.41831636428833, "rewards/rejected": -2.0879454612731934, "step": 1324 }, { "epoch": 1.25, "grad_norm": 19.79657554626465, "learning_rate": 3.2388947184330183e-07, "logps/chosen": -45.195281982421875, "logps/rejected": -66.21366882324219, "loss": 0.3472, "losses/dpo": 0.1739564836025238, "losses/sft": 1.7063114643096924, "losses/total": 0.1739564836025238, "ref_logps/chosen": -33.74018859863281, "ref_logps/rejected": -41.121517181396484, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1455092430114746, "rewards/margins": 1.3637057542800903, "rewards/rejected": -2.5092148780822754, "step": 1325 }, { "epoch": 1.25, "grad_norm": 22.342103958129883, "learning_rate": 3.237145855194124e-07, "logps/chosen": -42.53139114379883, "logps/rejected": -48.85778045654297, "loss": 0.5487, "losses/dpo": 0.3733326494693756, "losses/sft": 1.5709648132324219, "losses/total": 0.3733326494693756, "ref_logps/chosen": -33.1717643737793, "ref_logps/rejected": -31.201541900634766, "rewards/accuracies": 0.625, "rewards/chosen": -0.935962975025177, "rewards/margins": 0.8296608924865723, "rewards/rejected": -1.7656238079071045, "step": 1326 }, { "epoch": 1.25, "grad_norm": 27.36860466003418, "learning_rate": 3.2353969919552286e-07, "logps/chosen": -45.33452224731445, "logps/rejected": -56.8319206237793, "loss": 0.638, "losses/dpo": 0.9248436093330383, "losses/sft": 1.8668464422225952, "losses/total": 0.9248436093330383, "ref_logps/chosen": -35.331947326660156, "ref_logps/rejected": -41.05805969238281, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0002573728561401, "rewards/margins": 0.5771288275718689, "rewards/rejected": -1.5773861408233643, "step": 1327 }, { "epoch": 1.25, "grad_norm": 27.762012481689453, "learning_rate": 3.233648128716334e-07, "logps/chosen": -51.885101318359375, "logps/rejected": -61.62514114379883, "loss": 0.5824, "losses/dpo": 0.7015052437782288, "losses/sft": 1.5709404945373535, "losses/total": 0.7015052437782288, "ref_logps/chosen": -36.931671142578125, "ref_logps/rejected": -39.759273529052734, "rewards/accuracies": 0.625, "rewards/chosen": -1.4953434467315674, "rewards/margins": 0.6912434101104736, "rewards/rejected": -2.186586856842041, "step": 1328 }, { "epoch": 1.25, "grad_norm": 23.887310028076172, "learning_rate": 3.23189926547744e-07, "logps/chosen": -61.18767547607422, "logps/rejected": -87.2144546508789, "loss": 0.4081, "losses/dpo": 0.3507990837097168, "losses/sft": 1.886390209197998, "losses/total": 0.3507990837097168, "ref_logps/chosen": -42.69068908691406, "ref_logps/rejected": -57.786170959472656, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8496990203857422, "rewards/margins": 1.093129277229309, "rewards/rejected": -2.942828416824341, "step": 1329 }, { "epoch": 1.26, "grad_norm": 20.27486228942871, "learning_rate": 3.230150402238545e-07, "logps/chosen": -47.38467788696289, "logps/rejected": -69.49496459960938, "loss": 0.3936, "losses/dpo": 0.28516754508018494, "losses/sft": 1.731813669204712, "losses/total": 0.28516754508018494, "ref_logps/chosen": -36.72315216064453, "ref_logps/rejected": -46.98150634765625, "rewards/accuracies": 0.875, "rewards/chosen": -1.066152572631836, "rewards/margins": 1.1851938962936401, "rewards/rejected": -2.2513465881347656, "step": 1330 }, { "epoch": 1.26, "grad_norm": 21.366527557373047, "learning_rate": 3.22840153899965e-07, "logps/chosen": -44.96656036376953, "logps/rejected": -62.3281135559082, "loss": 0.4295, "losses/dpo": 0.515168309211731, "losses/sft": 1.5524519681930542, "losses/total": 0.515168309211731, "ref_logps/chosen": -36.836669921875, "ref_logps/rejected": -43.83147430419922, "rewards/accuracies": 0.875, "rewards/chosen": -0.8129887580871582, "rewards/margins": 1.036675214767456, "rewards/rejected": -1.8496639728546143, "step": 1331 }, { "epoch": 1.26, "grad_norm": 17.08660888671875, "learning_rate": 3.226652675760755e-07, "logps/chosen": -41.66600799560547, "logps/rejected": -62.67748260498047, "loss": 0.4071, "losses/dpo": 0.164664626121521, "losses/sft": 2.1566896438598633, "losses/total": 0.164664626121521, "ref_logps/chosen": -28.76436996459961, "ref_logps/rejected": -37.91204071044922, "rewards/accuracies": 0.75, "rewards/chosen": -1.290163516998291, "rewards/margins": 1.1863805055618286, "rewards/rejected": -2.476544141769409, "step": 1332 }, { "epoch": 1.26, "grad_norm": 20.865203857421875, "learning_rate": 3.224903812521861e-07, "logps/chosen": -64.59005737304688, "logps/rejected": -69.01496124267578, "loss": 0.4417, "losses/dpo": 0.4876309037208557, "losses/sft": 1.9798153638839722, "losses/total": 0.4876309037208557, "ref_logps/chosen": -50.089942932128906, "ref_logps/rejected": -44.22984313964844, "rewards/accuracies": 0.75, "rewards/chosen": -1.4500117301940918, "rewards/margins": 1.0285006761550903, "rewards/rejected": -2.4785122871398926, "step": 1333 }, { "epoch": 1.26, "grad_norm": 21.650136947631836, "learning_rate": 3.2231549492829655e-07, "logps/chosen": -43.238407135009766, "logps/rejected": -61.714271545410156, "loss": 0.441, "losses/dpo": 0.9948136806488037, "losses/sft": 2.3222570419311523, "losses/total": 0.9948136806488037, "ref_logps/chosen": -32.03950881958008, "ref_logps/rejected": -38.622928619384766, "rewards/accuracies": 0.75, "rewards/chosen": -1.119889736175537, "rewards/margins": 1.1892445087432861, "rewards/rejected": -2.3091344833374023, "step": 1334 }, { "epoch": 1.26, "grad_norm": 18.566551208496094, "learning_rate": 3.221406086044071e-07, "logps/chosen": -60.351646423339844, "logps/rejected": -86.7037353515625, "loss": 0.2427, "losses/dpo": 0.09323880076408386, "losses/sft": 1.6382561922073364, "losses/total": 0.09323880076408386, "ref_logps/chosen": -47.07814025878906, "ref_logps/rejected": -54.51207733154297, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3273508548736572, "rewards/margins": 1.8918142318725586, "rewards/rejected": -3.2191648483276367, "step": 1335 }, { "epoch": 1.26, "grad_norm": 23.849151611328125, "learning_rate": 3.219657222805177e-07, "logps/chosen": -49.82647705078125, "logps/rejected": -65.72238159179688, "loss": 0.3579, "losses/dpo": 0.34007012844085693, "losses/sft": 2.0767719745635986, "losses/total": 0.34007012844085693, "ref_logps/chosen": -36.99418258666992, "ref_logps/rejected": -39.154380798339844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2832295894622803, "rewards/margins": 1.3735709190368652, "rewards/rejected": -2.6568005084991455, "step": 1336 }, { "epoch": 1.26, "grad_norm": 19.60880470275879, "learning_rate": 3.217908359566282e-07, "logps/chosen": -52.775386810302734, "logps/rejected": -66.595458984375, "loss": 0.3526, "losses/dpo": 0.31136977672576904, "losses/sft": 1.9005811214447021, "losses/total": 0.31136977672576904, "ref_logps/chosen": -39.96556091308594, "ref_logps/rejected": -41.529136657714844, "rewards/accuracies": 0.875, "rewards/chosen": -1.280982255935669, "rewards/margins": 1.2256507873535156, "rewards/rejected": -2.5066330432891846, "step": 1337 }, { "epoch": 1.26, "grad_norm": 24.112470626831055, "learning_rate": 3.216159496327387e-07, "logps/chosen": -57.16948699951172, "logps/rejected": -61.171268463134766, "loss": 0.4699, "losses/dpo": 0.4526618719100952, "losses/sft": 1.6857000589370728, "losses/total": 0.4526618719100952, "ref_logps/chosen": -41.94329833984375, "ref_logps/rejected": -35.8074836730957, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5226187705993652, "rewards/margins": 1.0137600898742676, "rewards/rejected": -2.536378860473633, "step": 1338 }, { "epoch": 1.26, "grad_norm": 28.725250244140625, "learning_rate": 3.214410633088492e-07, "logps/chosen": -56.87745666503906, "logps/rejected": -64.42515563964844, "loss": 0.5745, "losses/dpo": 0.33915460109710693, "losses/sft": 1.551383137702942, "losses/total": 0.33915460109710693, "ref_logps/chosen": -40.86872863769531, "ref_logps/rejected": -40.371002197265625, "rewards/accuracies": 0.625, "rewards/chosen": -1.6008729934692383, "rewards/margins": 0.8045421838760376, "rewards/rejected": -2.4054150581359863, "step": 1339 }, { "epoch": 1.27, "grad_norm": 14.856887817382812, "learning_rate": 3.212661769849598e-07, "logps/chosen": -60.09968566894531, "logps/rejected": -94.28604125976562, "loss": 0.195, "losses/dpo": 0.2769324481487274, "losses/sft": 2.1882357597351074, "losses/total": 0.2769324481487274, "ref_logps/chosen": -47.022762298583984, "ref_logps/rejected": -61.86514663696289, "rewards/accuracies": 1.0, "rewards/chosen": -1.307692050933838, "rewards/margins": 1.9343969821929932, "rewards/rejected": -3.242089033126831, "step": 1340 }, { "epoch": 1.27, "grad_norm": 30.66367530822754, "learning_rate": 3.2109129066107024e-07, "logps/chosen": -55.5614013671875, "logps/rejected": -69.0540542602539, "loss": 0.6662, "losses/dpo": 1.352977991104126, "losses/sft": 2.3313493728637695, "losses/total": 1.352977991104126, "ref_logps/chosen": -39.0507698059082, "ref_logps/rejected": -41.118221282958984, "rewards/accuracies": 0.625, "rewards/chosen": -1.6510632038116455, "rewards/margins": 1.142520546913147, "rewards/rejected": -2.793583869934082, "step": 1341 }, { "epoch": 1.27, "grad_norm": 38.804439544677734, "learning_rate": 3.209164043371808e-07, "logps/chosen": -72.30899047851562, "logps/rejected": -66.9932861328125, "loss": 0.5741, "losses/dpo": 0.19433581829071045, "losses/sft": 1.922597050666809, "losses/total": 0.19433581829071045, "ref_logps/chosen": -55.39595031738281, "ref_logps/rejected": -43.16709899902344, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6913042068481445, "rewards/margins": 0.6913138628005981, "rewards/rejected": -2.3826181888580322, "step": 1342 }, { "epoch": 1.27, "grad_norm": 19.082738876342773, "learning_rate": 3.207415180132914e-07, "logps/chosen": -49.338626861572266, "logps/rejected": -62.61579513549805, "loss": 0.3741, "losses/dpo": 0.7597951889038086, "losses/sft": 2.1300644874572754, "losses/total": 0.7597951889038086, "ref_logps/chosen": -39.40080261230469, "ref_logps/rejected": -40.771018981933594, "rewards/accuracies": 0.875, "rewards/chosen": -0.9937824010848999, "rewards/margins": 1.1906951665878296, "rewards/rejected": -2.1844775676727295, "step": 1343 }, { "epoch": 1.27, "grad_norm": 17.68570899963379, "learning_rate": 3.205666316894019e-07, "logps/chosen": -42.218475341796875, "logps/rejected": -62.51219177246094, "loss": 0.3369, "losses/dpo": 0.24026553332805634, "losses/sft": 0.9595241546630859, "losses/total": 0.24026553332805634, "ref_logps/chosen": -33.65283203125, "ref_logps/rejected": -41.172088623046875, "rewards/accuracies": 0.875, "rewards/chosen": -0.8565640449523926, "rewards/margins": 1.2774463891983032, "rewards/rejected": -2.1340105533599854, "step": 1344 }, { "epoch": 1.27, "grad_norm": 21.236780166625977, "learning_rate": 3.203917453655124e-07, "logps/chosen": -50.389320373535156, "logps/rejected": -55.241783142089844, "loss": 0.4897, "losses/dpo": 0.5423358678817749, "losses/sft": 1.8764293193817139, "losses/total": 0.5423358678817749, "ref_logps/chosen": -37.61534881591797, "ref_logps/rejected": -35.13709259033203, "rewards/accuracies": 0.875, "rewards/chosen": -1.2773971557617188, "rewards/margins": 0.733071506023407, "rewards/rejected": -2.0104687213897705, "step": 1345 }, { "epoch": 1.27, "grad_norm": 27.776611328125, "learning_rate": 3.202168590416229e-07, "logps/chosen": -55.493804931640625, "logps/rejected": -61.5127067565918, "loss": 0.4846, "losses/dpo": 0.6097148656845093, "losses/sft": 2.082397699356079, "losses/total": 0.6097148656845093, "ref_logps/chosen": -41.849891662597656, "ref_logps/rejected": -38.26130294799805, "rewards/accuracies": 0.75, "rewards/chosen": -1.3643914461135864, "rewards/margins": 0.9607487916946411, "rewards/rejected": -2.3251404762268066, "step": 1346 }, { "epoch": 1.27, "grad_norm": 21.580564498901367, "learning_rate": 3.200419727177335e-07, "logps/chosen": -60.56695556640625, "logps/rejected": -64.56971740722656, "loss": 0.4147, "losses/dpo": 0.6108167171478271, "losses/sft": 2.1351778507232666, "losses/total": 0.6108167171478271, "ref_logps/chosen": -44.02821350097656, "ref_logps/rejected": -38.69805145263672, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6538739204406738, "rewards/margins": 0.9332923889160156, "rewards/rejected": -2.5871663093566895, "step": 1347 }, { "epoch": 1.27, "grad_norm": 17.11998176574707, "learning_rate": 3.19867086393844e-07, "logps/chosen": -56.3470344543457, "logps/rejected": -83.40293884277344, "loss": 0.2419, "losses/dpo": 0.2898802161216736, "losses/sft": 2.107215642929077, "losses/total": 0.2898802161216736, "ref_logps/chosen": -41.44533920288086, "ref_logps/rejected": -51.06003189086914, "rewards/accuracies": 0.875, "rewards/chosen": -1.4901695251464844, "rewards/margins": 1.7441210746765137, "rewards/rejected": -3.234290599822998, "step": 1348 }, { "epoch": 1.27, "grad_norm": 24.633512496948242, "learning_rate": 3.196922000699545e-07, "logps/chosen": -60.06035232543945, "logps/rejected": -66.82292175292969, "loss": 0.4087, "losses/dpo": 0.5050278306007385, "losses/sft": 1.8513654470443726, "losses/total": 0.5050278306007385, "ref_logps/chosen": -44.01942443847656, "ref_logps/rejected": -39.494895935058594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6040929555892944, "rewards/margins": 1.1287102699279785, "rewards/rejected": -2.7328033447265625, "step": 1349 }, { "epoch": 1.27, "grad_norm": 24.701948165893555, "learning_rate": 3.1951731374606507e-07, "logps/chosen": -48.76957702636719, "logps/rejected": -66.14830780029297, "loss": 0.5414, "losses/dpo": 0.43534407019615173, "losses/sft": 2.3554751873016357, "losses/total": 0.43534407019615173, "ref_logps/chosen": -32.77943420410156, "ref_logps/rejected": -44.04889678955078, "rewards/accuracies": 0.75, "rewards/chosen": -1.5990142822265625, "rewards/margins": 0.6109272241592407, "rewards/rejected": -2.2099416255950928, "step": 1350 }, { "epoch": 1.28, "grad_norm": 22.263809204101562, "learning_rate": 3.193424274221756e-07, "logps/chosen": -47.04250717163086, "logps/rejected": -62.94432067871094, "loss": 0.4185, "losses/dpo": 0.5180111527442932, "losses/sft": 1.6933324337005615, "losses/total": 0.5180111527442932, "ref_logps/chosen": -34.896610260009766, "ref_logps/rejected": -37.53900909423828, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2145897150039673, "rewards/margins": 1.325941562652588, "rewards/rejected": -2.5405313968658447, "step": 1351 }, { "epoch": 1.28, "grad_norm": 23.63612937927246, "learning_rate": 3.191675410982861e-07, "logps/chosen": -43.21095275878906, "logps/rejected": -57.190086364746094, "loss": 0.5392, "losses/dpo": 0.35031503438949585, "losses/sft": 1.4813015460968018, "losses/total": 0.35031503438949585, "ref_logps/chosen": -30.437179565429688, "ref_logps/rejected": -38.99433898925781, "rewards/accuracies": 0.625, "rewards/chosen": -1.2773772478103638, "rewards/margins": 0.5421980023384094, "rewards/rejected": -1.8195751905441284, "step": 1352 }, { "epoch": 1.28, "grad_norm": 18.869770050048828, "learning_rate": 3.189926547743966e-07, "logps/chosen": -57.13106918334961, "logps/rejected": -87.74666595458984, "loss": 0.3007, "losses/dpo": 0.25921720266342163, "losses/sft": 2.8428573608398438, "losses/total": 0.25921720266342163, "ref_logps/chosen": -38.957374572753906, "ref_logps/rejected": -56.00676727294922, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8173695802688599, "rewards/margins": 1.3566205501556396, "rewards/rejected": -3.17399001121521, "step": 1353 }, { "epoch": 1.28, "grad_norm": 20.820863723754883, "learning_rate": 3.188177684505072e-07, "logps/chosen": -61.46854019165039, "logps/rejected": -75.09683227539062, "loss": 0.3858, "losses/dpo": 0.42118579149246216, "losses/sft": 2.2673394680023193, "losses/total": 0.42118579149246216, "ref_logps/chosen": -44.9959716796875, "ref_logps/rejected": -47.12028503417969, "rewards/accuracies": 0.875, "rewards/chosen": -1.6472563743591309, "rewards/margins": 1.150398850440979, "rewards/rejected": -2.7976553440093994, "step": 1354 }, { "epoch": 1.28, "grad_norm": 17.014755249023438, "learning_rate": 3.186428821266177e-07, "logps/chosen": -36.23109817504883, "logps/rejected": -64.0101547241211, "loss": 0.394, "losses/dpo": 0.7347977161407471, "losses/sft": 1.2118299007415771, "losses/total": 0.7347977161407471, "ref_logps/chosen": -28.18022346496582, "ref_logps/rejected": -45.771461486816406, "rewards/accuracies": 0.875, "rewards/chosen": -0.8050874471664429, "rewards/margins": 1.0187814235687256, "rewards/rejected": -1.823868989944458, "step": 1355 }, { "epoch": 1.28, "grad_norm": 20.06956672668457, "learning_rate": 3.184679958027282e-07, "logps/chosen": -60.052703857421875, "logps/rejected": -74.61544799804688, "loss": 0.3505, "losses/dpo": 0.35956865549087524, "losses/sft": 1.5750818252563477, "losses/total": 0.35956865549087524, "ref_logps/chosen": -44.549537658691406, "ref_logps/rejected": -47.12712097167969, "rewards/accuracies": 0.875, "rewards/chosen": -1.5503166913986206, "rewards/margins": 1.1985156536102295, "rewards/rejected": -2.7488322257995605, "step": 1356 }, { "epoch": 1.28, "grad_norm": 16.932592391967773, "learning_rate": 3.1829310947883876e-07, "logps/chosen": -47.99407958984375, "logps/rejected": -78.48458099365234, "loss": 0.2493, "losses/dpo": 0.12762005627155304, "losses/sft": 1.5339864492416382, "losses/total": 0.12762005627155304, "ref_logps/chosen": -35.01840591430664, "ref_logps/rejected": -48.10680389404297, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2975674867630005, "rewards/margins": 1.7402098178863525, "rewards/rejected": -3.0377771854400635, "step": 1357 }, { "epoch": 1.28, "grad_norm": 18.315305709838867, "learning_rate": 3.181182231549493e-07, "logps/chosen": -63.45601272583008, "logps/rejected": -85.56204986572266, "loss": 0.3312, "losses/dpo": 0.18121808767318726, "losses/sft": 1.7915785312652588, "losses/total": 0.18121808767318726, "ref_logps/chosen": -46.37159729003906, "ref_logps/rejected": -52.41903305053711, "rewards/accuracies": 0.875, "rewards/chosen": -1.7084417343139648, "rewards/margins": 1.605859637260437, "rewards/rejected": -3.3143014907836914, "step": 1358 }, { "epoch": 1.28, "grad_norm": 26.892333984375, "learning_rate": 3.179433368310598e-07, "logps/chosen": -51.82383728027344, "logps/rejected": -53.26301574707031, "loss": 0.5431, "losses/dpo": 0.6771758794784546, "losses/sft": 2.1927335262298584, "losses/total": 0.6771758794784546, "ref_logps/chosen": -37.56463623046875, "ref_logps/rejected": -33.25959777832031, "rewards/accuracies": 0.75, "rewards/chosen": -1.4259207248687744, "rewards/margins": 0.5744211673736572, "rewards/rejected": -2.0003418922424316, "step": 1359 }, { "epoch": 1.28, "grad_norm": 15.517579078674316, "learning_rate": 3.177684505071703e-07, "logps/chosen": -51.13165283203125, "logps/rejected": -86.49072265625, "loss": 0.2102, "losses/dpo": 0.32153305411338806, "losses/sft": 1.4930654764175415, "losses/total": 0.32153305411338806, "ref_logps/chosen": -39.39052963256836, "ref_logps/rejected": -56.91254425048828, "rewards/accuracies": 1.0, "rewards/chosen": -1.1741122007369995, "rewards/margins": 1.783705711364746, "rewards/rejected": -2.957818031311035, "step": 1360 }, { "epoch": 1.29, "grad_norm": 15.040142059326172, "learning_rate": 3.1759356418328087e-07, "logps/chosen": -51.7848014831543, "logps/rejected": -74.01992797851562, "loss": 0.247, "losses/dpo": 0.16273543238639832, "losses/sft": 1.7990646362304688, "losses/total": 0.16273543238639832, "ref_logps/chosen": -35.85475158691406, "ref_logps/rejected": -42.82506561279297, "rewards/accuracies": 1.0, "rewards/chosen": -1.5930051803588867, "rewards/margins": 1.5264816284179688, "rewards/rejected": -3.1194868087768555, "step": 1361 }, { "epoch": 1.29, "grad_norm": 22.02246856689453, "learning_rate": 3.174186778593914e-07, "logps/chosen": -52.05388641357422, "logps/rejected": -68.54545593261719, "loss": 0.3583, "losses/dpo": 0.1601119190454483, "losses/sft": 0.9818538427352905, "losses/total": 0.1601119190454483, "ref_logps/chosen": -39.668514251708984, "ref_logps/rejected": -40.752655029296875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.238537311553955, "rewards/margins": 1.5407426357269287, "rewards/rejected": -2.779280185699463, "step": 1362 }, { "epoch": 1.29, "grad_norm": 18.685449600219727, "learning_rate": 3.172437915355019e-07, "logps/chosen": -53.236289978027344, "logps/rejected": -81.12696838378906, "loss": 0.4615, "losses/dpo": 0.44704434275627136, "losses/sft": 2.2950265407562256, "losses/total": 0.44704434275627136, "ref_logps/chosen": -37.747615814208984, "ref_logps/rejected": -48.26818084716797, "rewards/accuracies": 0.8125, "rewards/chosen": -1.548867106437683, "rewards/margins": 1.7370113134384155, "rewards/rejected": -3.2858781814575195, "step": 1363 }, { "epoch": 1.29, "grad_norm": 26.25226402282715, "learning_rate": 3.1706890521161246e-07, "logps/chosen": -53.697139739990234, "logps/rejected": -52.0499267578125, "loss": 0.539, "losses/dpo": 0.40041327476501465, "losses/sft": 0.8750542402267456, "losses/total": 0.40041327476501465, "ref_logps/chosen": -39.54927062988281, "ref_logps/rejected": -32.445838928222656, "rewards/accuracies": 0.625, "rewards/chosen": -1.414786696434021, "rewards/margins": 0.5456223487854004, "rewards/rejected": -1.9604090452194214, "step": 1364 }, { "epoch": 1.29, "grad_norm": 25.24365234375, "learning_rate": 3.1689401888772297e-07, "logps/chosen": -49.30266571044922, "logps/rejected": -60.79222106933594, "loss": 0.5297, "losses/dpo": 0.4613967537879944, "losses/sft": 1.035914421081543, "losses/total": 0.4613967537879944, "ref_logps/chosen": -35.4945068359375, "ref_logps/rejected": -35.82539749145508, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3808157444000244, "rewards/margins": 1.115866780281067, "rewards/rejected": -2.4966824054718018, "step": 1365 }, { "epoch": 1.29, "grad_norm": 24.446874618530273, "learning_rate": 3.167191325638335e-07, "logps/chosen": -62.15544128417969, "logps/rejected": -69.06873321533203, "loss": 0.3823, "losses/dpo": 0.36644697189331055, "losses/sft": 1.7216076850891113, "losses/total": 0.36644697189331055, "ref_logps/chosen": -48.019168853759766, "ref_logps/rejected": -44.19029998779297, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4136276245117188, "rewards/margins": 1.0742160081863403, "rewards/rejected": -2.4878435134887695, "step": 1366 }, { "epoch": 1.29, "grad_norm": 18.419918060302734, "learning_rate": 3.1654424623994405e-07, "logps/chosen": -53.24803161621094, "logps/rejected": -74.8726806640625, "loss": 0.3131, "losses/dpo": 0.2178664654493332, "losses/sft": 2.127765655517578, "losses/total": 0.2178664654493332, "ref_logps/chosen": -41.76417922973633, "ref_logps/rejected": -48.734710693359375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1483851671218872, "rewards/margins": 1.465411901473999, "rewards/rejected": -2.613797187805176, "step": 1367 }, { "epoch": 1.29, "grad_norm": 21.006553649902344, "learning_rate": 3.1636935991605456e-07, "logps/chosen": -43.47159957885742, "logps/rejected": -67.40238189697266, "loss": 0.392, "losses/dpo": 0.4687536060810089, "losses/sft": 1.481898307800293, "losses/total": 0.4687536060810089, "ref_logps/chosen": -33.60332489013672, "ref_logps/rejected": -43.203651428222656, "rewards/accuracies": 0.75, "rewards/chosen": -0.9868277311325073, "rewards/margins": 1.4330453872680664, "rewards/rejected": -2.4198732376098633, "step": 1368 }, { "epoch": 1.29, "grad_norm": 28.368614196777344, "learning_rate": 3.161944735921651e-07, "logps/chosen": -52.62860870361328, "logps/rejected": -73.36225891113281, "loss": 0.4706, "losses/dpo": 0.5309086441993713, "losses/sft": 1.9961135387420654, "losses/total": 0.5309086441993713, "ref_logps/chosen": -35.80301284790039, "ref_logps/rejected": -45.82862854003906, "rewards/accuracies": 0.8125, "rewards/chosen": -1.682559609413147, "rewards/margins": 1.0708030462265015, "rewards/rejected": -2.7533626556396484, "step": 1369 }, { "epoch": 1.29, "grad_norm": 22.93353843688965, "learning_rate": 3.160195872682756e-07, "logps/chosen": -47.724754333496094, "logps/rejected": -70.8294906616211, "loss": 0.3462, "losses/dpo": 0.2852926254272461, "losses/sft": 1.5040239095687866, "losses/total": 0.2852926254272461, "ref_logps/chosen": -35.95964813232422, "ref_logps/rejected": -46.462371826171875, "rewards/accuracies": 0.875, "rewards/chosen": -1.1765105724334717, "rewards/margins": 1.2602012157440186, "rewards/rejected": -2.4367117881774902, "step": 1370 }, { "epoch": 1.29, "grad_norm": 23.429407119750977, "learning_rate": 3.1584470094438615e-07, "logps/chosen": -46.56145095825195, "logps/rejected": -73.58964538574219, "loss": 0.2879, "losses/dpo": 0.13537660241127014, "losses/sft": 1.4202475547790527, "losses/total": 0.13537660241127014, "ref_logps/chosen": -35.454078674316406, "ref_logps/rejected": -47.185768127441406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1107375621795654, "rewards/margins": 1.5296508073806763, "rewards/rejected": -2.6403884887695312, "step": 1371 }, { "epoch": 1.3, "grad_norm": 23.322397232055664, "learning_rate": 3.1566981462049667e-07, "logps/chosen": -37.70085906982422, "logps/rejected": -47.00138854980469, "loss": 0.5646, "losses/dpo": 0.30335646867752075, "losses/sft": 1.3127626180648804, "losses/total": 0.30335646867752075, "ref_logps/chosen": -24.70101547241211, "ref_logps/rejected": -26.147720336914062, "rewards/accuracies": 0.625, "rewards/chosen": -1.2999846935272217, "rewards/margins": 0.7853823304176331, "rewards/rejected": -2.08536696434021, "step": 1372 }, { "epoch": 1.3, "grad_norm": 21.259082794189453, "learning_rate": 3.154949282966072e-07, "logps/chosen": -47.568992614746094, "logps/rejected": -59.967716217041016, "loss": 0.424, "losses/dpo": 0.12321989238262177, "losses/sft": 2.2505016326904297, "losses/total": 0.12321989238262177, "ref_logps/chosen": -33.088985443115234, "ref_logps/rejected": -33.45567321777344, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4480009078979492, "rewards/margins": 1.2032034397125244, "rewards/rejected": -2.6512041091918945, "step": 1373 }, { "epoch": 1.3, "grad_norm": 23.981849670410156, "learning_rate": 3.1532004197271774e-07, "logps/chosen": -43.099525451660156, "logps/rejected": -77.929931640625, "loss": 0.4545, "losses/dpo": 0.20337752997875214, "losses/sft": 1.3605133295059204, "losses/total": 0.20337752997875214, "ref_logps/chosen": -32.27179718017578, "ref_logps/rejected": -53.765743255615234, "rewards/accuracies": 0.75, "rewards/chosen": -1.0827726125717163, "rewards/margins": 1.333645224571228, "rewards/rejected": -2.4164180755615234, "step": 1374 }, { "epoch": 1.3, "grad_norm": 23.155954360961914, "learning_rate": 3.1514515564882826e-07, "logps/chosen": -47.0855712890625, "logps/rejected": -75.8427963256836, "loss": 0.4053, "losses/dpo": 0.3050435781478882, "losses/sft": 2.213414430618286, "losses/total": 0.3050435781478882, "ref_logps/chosen": -34.05945587158203, "ref_logps/rejected": -50.41439437866211, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3026115894317627, "rewards/margins": 1.2402284145355225, "rewards/rejected": -2.542840003967285, "step": 1375 }, { "epoch": 1.3, "grad_norm": 30.488391876220703, "learning_rate": 3.1497026932493877e-07, "logps/chosen": -59.1077995300293, "logps/rejected": -55.779541015625, "loss": 0.5633, "losses/dpo": 0.2272609919309616, "losses/sft": 1.6881656646728516, "losses/total": 0.2272609919309616, "ref_logps/chosen": -45.06895065307617, "ref_logps/rejected": -34.236572265625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4038848876953125, "rewards/margins": 0.7504122257232666, "rewards/rejected": -2.154297113418579, "step": 1376 }, { "epoch": 1.3, "grad_norm": 19.647987365722656, "learning_rate": 3.147953830010493e-07, "logps/chosen": -52.400299072265625, "logps/rejected": -66.96345520019531, "loss": 0.3621, "losses/dpo": 0.5456865429878235, "losses/sft": 1.9704679250717163, "losses/total": 0.5456865429878235, "ref_logps/chosen": -39.86430358886719, "ref_logps/rejected": -41.286895751953125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2535994052886963, "rewards/margins": 1.3140560388565063, "rewards/rejected": -2.567655563354492, "step": 1377 }, { "epoch": 1.3, "grad_norm": 21.903074264526367, "learning_rate": 3.1462049667715985e-07, "logps/chosen": -45.04633331298828, "logps/rejected": -62.4086799621582, "loss": 0.4614, "losses/dpo": 0.383821040391922, "losses/sft": 1.47364342212677, "losses/total": 0.383821040391922, "ref_logps/chosen": -33.184043884277344, "ref_logps/rejected": -40.430763244628906, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1862293481826782, "rewards/margins": 1.0115625858306885, "rewards/rejected": -2.1977920532226562, "step": 1378 }, { "epoch": 1.3, "grad_norm": 28.027938842773438, "learning_rate": 3.1444561035327036e-07, "logps/chosen": -45.439125061035156, "logps/rejected": -58.0157356262207, "loss": 0.483, "losses/dpo": 0.9206573963165283, "losses/sft": 1.75630784034729, "losses/total": 0.9206573963165283, "ref_logps/chosen": -31.78700065612793, "ref_logps/rejected": -34.04890441894531, "rewards/accuracies": 0.875, "rewards/chosen": -1.3652124404907227, "rewards/margins": 1.0314708948135376, "rewards/rejected": -2.3966832160949707, "step": 1379 }, { "epoch": 1.3, "grad_norm": 25.44988250732422, "learning_rate": 3.1427072402938087e-07, "logps/chosen": -56.93769073486328, "logps/rejected": -57.96370315551758, "loss": 0.4546, "losses/dpo": 0.6200724840164185, "losses/sft": 2.593087911605835, "losses/total": 0.6200724840164185, "ref_logps/chosen": -42.7227783203125, "ref_logps/rejected": -34.85062789916992, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4214911460876465, "rewards/margins": 0.8898162245750427, "rewards/rejected": -2.311307430267334, "step": 1380 }, { "epoch": 1.3, "grad_norm": 15.13788890838623, "learning_rate": 3.1409583770549144e-07, "logps/chosen": -35.49049377441406, "logps/rejected": -72.80972290039062, "loss": 0.2828, "losses/dpo": 0.5170881152153015, "losses/sft": 1.7763038873672485, "losses/total": 0.5170881152153015, "ref_logps/chosen": -26.234588623046875, "ref_logps/rejected": -44.85145950317383, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9255908131599426, "rewards/margins": 1.8702361583709717, "rewards/rejected": -2.7958269119262695, "step": 1381 }, { "epoch": 1.31, "grad_norm": 27.159870147705078, "learning_rate": 3.1392095138160195e-07, "logps/chosen": -60.885982513427734, "logps/rejected": -68.66365051269531, "loss": 0.4944, "losses/dpo": 0.6054320335388184, "losses/sft": 2.1589083671569824, "losses/total": 0.6054320335388184, "ref_logps/chosen": -42.962127685546875, "ref_logps/rejected": -41.3615837097168, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7923858165740967, "rewards/margins": 0.9378213286399841, "rewards/rejected": -2.7302069664001465, "step": 1382 }, { "epoch": 1.31, "grad_norm": 30.14463233947754, "learning_rate": 3.1374606505771246e-07, "logps/chosen": -56.305328369140625, "logps/rejected": -61.138851165771484, "loss": 0.5178, "losses/dpo": 0.6132975220680237, "losses/sft": 1.7179806232452393, "losses/total": 0.6132975220680237, "ref_logps/chosen": -39.56067657470703, "ref_logps/rejected": -36.21038818359375, "rewards/accuracies": 0.75, "rewards/chosen": -1.6744654178619385, "rewards/margins": 0.8183808326721191, "rewards/rejected": -2.4928462505340576, "step": 1383 }, { "epoch": 1.31, "grad_norm": 18.163667678833008, "learning_rate": 3.13571178733823e-07, "logps/chosen": -53.994598388671875, "logps/rejected": -68.44343566894531, "loss": 0.3387, "losses/dpo": 0.42649635672569275, "losses/sft": 1.394381046295166, "losses/total": 0.42649635672569275, "ref_logps/chosen": -40.39307403564453, "ref_logps/rejected": -41.70367431640625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3601528406143188, "rewards/margins": 1.3138233423233032, "rewards/rejected": -2.673975944519043, "step": 1384 }, { "epoch": 1.31, "grad_norm": 28.407211303710938, "learning_rate": 3.1339629240993354e-07, "logps/chosen": -50.175987243652344, "logps/rejected": -71.09711456298828, "loss": 0.5418, "losses/dpo": 0.5175858736038208, "losses/sft": 2.164062261581421, "losses/total": 0.5175858736038208, "ref_logps/chosen": -36.302520751953125, "ref_logps/rejected": -47.9297981262207, "rewards/accuracies": 0.625, "rewards/chosen": -1.3873467445373535, "rewards/margins": 0.929384708404541, "rewards/rejected": -2.3167314529418945, "step": 1385 }, { "epoch": 1.31, "grad_norm": 22.01079750061035, "learning_rate": 3.132214060860441e-07, "logps/chosen": -43.22167205810547, "logps/rejected": -73.66404724121094, "loss": 0.3278, "losses/dpo": 0.2465500384569168, "losses/sft": 1.84312903881073, "losses/total": 0.2465500384569168, "ref_logps/chosen": -31.679141998291016, "ref_logps/rejected": -48.11632537841797, "rewards/accuracies": 0.875, "rewards/chosen": -1.1542531251907349, "rewards/margins": 1.4005193710327148, "rewards/rejected": -2.5547726154327393, "step": 1386 }, { "epoch": 1.31, "grad_norm": 20.3621883392334, "learning_rate": 3.1304651976215457e-07, "logps/chosen": -42.49702453613281, "logps/rejected": -68.95793151855469, "loss": 0.3815, "losses/dpo": 0.2193598747253418, "losses/sft": 1.3440654277801514, "losses/total": 0.2193598747253418, "ref_logps/chosen": -32.190826416015625, "ref_logps/rejected": -43.43907928466797, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0306202173233032, "rewards/margins": 1.5212647914886475, "rewards/rejected": -2.551884889602661, "step": 1387 }, { "epoch": 1.31, "grad_norm": 26.03891944885254, "learning_rate": 3.1287163343826513e-07, "logps/chosen": -52.6580810546875, "logps/rejected": -67.25297546386719, "loss": 0.5382, "losses/dpo": 0.6871962547302246, "losses/sft": 1.9108715057373047, "losses/total": 0.6871962547302246, "ref_logps/chosen": -36.66200256347656, "ref_logps/rejected": -43.18291091918945, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5996079444885254, "rewards/margins": 0.8073986172676086, "rewards/rejected": -2.4070067405700684, "step": 1388 }, { "epoch": 1.31, "grad_norm": 27.576459884643555, "learning_rate": 3.1269674711437565e-07, "logps/chosen": -58.50428771972656, "logps/rejected": -77.48928833007812, "loss": 0.4801, "losses/dpo": 0.13750389218330383, "losses/sft": 1.2845847606658936, "losses/total": 0.13750389218330383, "ref_logps/chosen": -45.2631721496582, "ref_logps/rejected": -51.28300476074219, "rewards/accuracies": 0.875, "rewards/chosen": -1.3241115808486938, "rewards/margins": 1.2965164184570312, "rewards/rejected": -2.6206281185150146, "step": 1389 }, { "epoch": 1.31, "grad_norm": 28.661888122558594, "learning_rate": 3.1252186079048616e-07, "logps/chosen": -47.94889831542969, "logps/rejected": -68.34281921386719, "loss": 0.6343, "losses/dpo": 1.873807668685913, "losses/sft": 2.6926257610321045, "losses/total": 1.873807668685913, "ref_logps/chosen": -35.423587799072266, "ref_logps/rejected": -46.64491271972656, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2525306940078735, "rewards/margins": 0.917259693145752, "rewards/rejected": -2.169790267944336, "step": 1390 }, { "epoch": 1.31, "grad_norm": 19.96749496459961, "learning_rate": 3.1234697446659667e-07, "logps/chosen": -47.88706970214844, "logps/rejected": -59.14627456665039, "loss": 0.478, "losses/dpo": 0.21075883507728577, "losses/sft": 1.6604812145233154, "losses/total": 0.21075883507728577, "ref_logps/chosen": -36.14986801147461, "ref_logps/rejected": -37.06296920776367, "rewards/accuracies": 0.875, "rewards/chosen": -1.173720359802246, "rewards/margins": 1.0346100330352783, "rewards/rejected": -2.2083301544189453, "step": 1391 }, { "epoch": 1.31, "grad_norm": 18.352394104003906, "learning_rate": 3.1217208814270724e-07, "logps/chosen": -48.96562957763672, "logps/rejected": -89.93772888183594, "loss": 0.3174, "losses/dpo": 0.6201391816139221, "losses/sft": 2.112382650375366, "losses/total": 0.6201391816139221, "ref_logps/chosen": -36.74701690673828, "ref_logps/rejected": -60.49577713012695, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2218611240386963, "rewards/margins": 1.7223336696624756, "rewards/rejected": -2.944194793701172, "step": 1392 }, { "epoch": 1.32, "grad_norm": 30.134258270263672, "learning_rate": 3.119972018188178e-07, "logps/chosen": -71.5577392578125, "logps/rejected": -68.81761169433594, "loss": 0.6765, "losses/dpo": 1.0463733673095703, "losses/sft": 2.4263932704925537, "losses/total": 1.0463733673095703, "ref_logps/chosen": -53.1054801940918, "ref_logps/rejected": -42.88414764404297, "rewards/accuracies": 0.625, "rewards/chosen": -1.8452259302139282, "rewards/margins": 0.7481206655502319, "rewards/rejected": -2.59334659576416, "step": 1393 }, { "epoch": 1.32, "grad_norm": 12.316241264343262, "learning_rate": 3.1182231549492826e-07, "logps/chosen": -58.682777404785156, "logps/rejected": -76.54867553710938, "loss": 0.2377, "losses/dpo": 0.2974410057067871, "losses/sft": 1.4883127212524414, "losses/total": 0.2974410057067871, "ref_logps/chosen": -49.17033386230469, "ref_logps/rejected": -50.62391662597656, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9512447118759155, "rewards/margins": 1.6412322521209717, "rewards/rejected": -2.5924768447875977, "step": 1394 }, { "epoch": 1.32, "grad_norm": 20.66306495666504, "learning_rate": 3.1164742917103883e-07, "logps/chosen": -49.01014709472656, "logps/rejected": -64.79468536376953, "loss": 0.3602, "losses/dpo": 0.33817166090011597, "losses/sft": 2.0505852699279785, "losses/total": 0.33817166090011597, "ref_logps/chosen": -39.86006546020508, "ref_logps/rejected": -43.593971252441406, "rewards/accuracies": 0.875, "rewards/chosen": -0.9150079488754272, "rewards/margins": 1.2050637006759644, "rewards/rejected": -2.1200718879699707, "step": 1395 }, { "epoch": 1.32, "grad_norm": 16.288755416870117, "learning_rate": 3.1147254284714934e-07, "logps/chosen": -52.439876556396484, "logps/rejected": -68.74030303955078, "loss": 0.3846, "losses/dpo": 0.46823808550834656, "losses/sft": 1.661376714706421, "losses/total": 0.46823808550834656, "ref_logps/chosen": -42.11474609375, "ref_logps/rejected": -45.83921813964844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0325132608413696, "rewards/margins": 1.2575949430465698, "rewards/rejected": -2.2901082038879395, "step": 1396 }, { "epoch": 1.32, "grad_norm": 17.891496658325195, "learning_rate": 3.1129765652325985e-07, "logps/chosen": -39.93794631958008, "logps/rejected": -59.983604431152344, "loss": 0.3558, "losses/dpo": 0.5310298204421997, "losses/sft": 1.0889902114868164, "losses/total": 0.5310298204421997, "ref_logps/chosen": -31.426830291748047, "ref_logps/rejected": -40.64684295654297, "rewards/accuracies": 1.0, "rewards/chosen": -0.8511111736297607, "rewards/margins": 1.0825653076171875, "rewards/rejected": -1.9336764812469482, "step": 1397 }, { "epoch": 1.32, "grad_norm": 17.31990623474121, "learning_rate": 3.1112277019937037e-07, "logps/chosen": -47.70417404174805, "logps/rejected": -65.0086669921875, "loss": 0.348, "losses/dpo": 0.32458168268203735, "losses/sft": 2.0135648250579834, "losses/total": 0.32458168268203735, "ref_logps/chosen": -35.5030632019043, "ref_logps/rejected": -41.95423126220703, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2201111316680908, "rewards/margins": 1.0853321552276611, "rewards/rejected": -2.305443286895752, "step": 1398 }, { "epoch": 1.32, "grad_norm": 13.60582447052002, "learning_rate": 3.1094788387548093e-07, "logps/chosen": -35.148406982421875, "logps/rejected": -60.91284942626953, "loss": 0.2909, "losses/dpo": 0.3278396725654602, "losses/sft": 1.5964231491088867, "losses/total": 0.3278396725654602, "ref_logps/chosen": -26.8546085357666, "ref_logps/rejected": -40.30609130859375, "rewards/accuracies": 1.0, "rewards/chosen": -0.8293799161911011, "rewards/margins": 1.2312958240509033, "rewards/rejected": -2.060675621032715, "step": 1399 }, { "epoch": 1.32, "grad_norm": 18.040922164916992, "learning_rate": 3.107729975515915e-07, "logps/chosen": -36.44855499267578, "logps/rejected": -61.811790466308594, "loss": 0.3144, "losses/dpo": 0.21064826846122742, "losses/sft": 1.0480576753616333, "losses/total": 0.21064826846122742, "ref_logps/chosen": -25.492290496826172, "ref_logps/rejected": -36.45253372192383, "rewards/accuracies": 1.0, "rewards/chosen": -1.0956261157989502, "rewards/margins": 1.4402997493743896, "rewards/rejected": -2.53592586517334, "step": 1400 }, { "epoch": 1.32, "grad_norm": 15.514551162719727, "learning_rate": 3.1059811122770196e-07, "logps/chosen": -37.759986877441406, "logps/rejected": -70.44426727294922, "loss": 0.3003, "losses/dpo": 0.13849696516990662, "losses/sft": 1.3929380178451538, "losses/total": 0.13849696516990662, "ref_logps/chosen": -29.04631233215332, "ref_logps/rejected": -47.98952865600586, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8713672161102295, "rewards/margins": 1.3741066455841064, "rewards/rejected": -2.245473861694336, "step": 1401 }, { "epoch": 1.32, "grad_norm": 24.216726303100586, "learning_rate": 3.104232249038125e-07, "logps/chosen": -39.83381271362305, "logps/rejected": -72.62610626220703, "loss": 0.5245, "losses/dpo": 0.3698168098926544, "losses/sft": 1.223478078842163, "losses/total": 0.3698168098926544, "ref_logps/chosen": -27.672334671020508, "ref_logps/rejected": -49.60409927368164, "rewards/accuracies": 0.75, "rewards/chosen": -1.2161478996276855, "rewards/margins": 1.0860525369644165, "rewards/rejected": -2.3022005558013916, "step": 1402 }, { "epoch": 1.32, "grad_norm": 20.74520492553711, "learning_rate": 3.1024833857992303e-07, "logps/chosen": -63.94044876098633, "logps/rejected": -74.19377136230469, "loss": 0.3808, "losses/dpo": 0.2923702001571655, "losses/sft": 2.277377128601074, "losses/total": 0.2923702001571655, "ref_logps/chosen": -50.2935791015625, "ref_logps/rejected": -46.86443328857422, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3646868467330933, "rewards/margins": 1.3682467937469482, "rewards/rejected": -2.732933521270752, "step": 1403 }, { "epoch": 1.33, "grad_norm": 17.602275848388672, "learning_rate": 3.1007345225603355e-07, "logps/chosen": -42.638694763183594, "logps/rejected": -63.673919677734375, "loss": 0.3501, "losses/dpo": 0.6859549283981323, "losses/sft": 1.8699051141738892, "losses/total": 0.6859549283981323, "ref_logps/chosen": -32.98509216308594, "ref_logps/rejected": -38.486568450927734, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9653604030609131, "rewards/margins": 1.5533747673034668, "rewards/rejected": -2.51873517036438, "step": 1404 }, { "epoch": 1.33, "grad_norm": 24.334733963012695, "learning_rate": 3.0989856593214406e-07, "logps/chosen": -44.016693115234375, "logps/rejected": -54.46356964111328, "loss": 0.5681, "losses/dpo": 0.5730810761451721, "losses/sft": 1.4357812404632568, "losses/total": 0.5730810761451721, "ref_logps/chosen": -30.908496856689453, "ref_logps/rejected": -32.35789489746094, "rewards/accuracies": 0.625, "rewards/chosen": -1.3108196258544922, "rewards/margins": 0.8997477889060974, "rewards/rejected": -2.2105674743652344, "step": 1405 }, { "epoch": 1.33, "grad_norm": 24.985742568969727, "learning_rate": 3.097236796082546e-07, "logps/chosen": -60.19560241699219, "logps/rejected": -63.56106185913086, "loss": 0.4605, "losses/dpo": 0.8694175481796265, "losses/sft": 2.2830049991607666, "losses/total": 0.8694175481796265, "ref_logps/chosen": -47.15262985229492, "ref_logps/rejected": -40.38645935058594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3042973279953003, "rewards/margins": 1.0131628513336182, "rewards/rejected": -2.317460060119629, "step": 1406 }, { "epoch": 1.33, "grad_norm": 19.60630989074707, "learning_rate": 3.095487932843652e-07, "logps/chosen": -39.14239501953125, "logps/rejected": -57.21702194213867, "loss": 0.3744, "losses/dpo": 0.5650507211685181, "losses/sft": 1.6307978630065918, "losses/total": 0.5650507211685181, "ref_logps/chosen": -31.956689834594727, "ref_logps/rejected": -36.87541198730469, "rewards/accuracies": 0.875, "rewards/chosen": -0.718570351600647, "rewards/margins": 1.3155903816223145, "rewards/rejected": -2.034160852432251, "step": 1407 }, { "epoch": 1.33, "grad_norm": 22.99588394165039, "learning_rate": 3.0937390696047565e-07, "logps/chosen": -62.28180694580078, "logps/rejected": -73.65724182128906, "loss": 0.4073, "losses/dpo": 0.6211172342300415, "losses/sft": 2.6106433868408203, "losses/total": 0.6211172342300415, "ref_logps/chosen": -50.00990295410156, "ref_logps/rejected": -50.14404296875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.227190613746643, "rewards/margins": 1.1241296529769897, "rewards/rejected": -2.351320266723633, "step": 1408 }, { "epoch": 1.33, "grad_norm": 18.683799743652344, "learning_rate": 3.091990206365862e-07, "logps/chosen": -42.29640197753906, "logps/rejected": -61.958717346191406, "loss": 0.3347, "losses/dpo": 0.45394790172576904, "losses/sft": 1.4220184087753296, "losses/total": 0.45394790172576904, "ref_logps/chosen": -35.667076110839844, "ref_logps/rejected": -40.28544616699219, "rewards/accuracies": 0.875, "rewards/chosen": -0.6629323959350586, "rewards/margins": 1.5043952465057373, "rewards/rejected": -2.167327642440796, "step": 1409 }, { "epoch": 1.33, "grad_norm": 23.03899574279785, "learning_rate": 3.0902413431269673e-07, "logps/chosen": -54.78341293334961, "logps/rejected": -68.1576919555664, "loss": 0.4087, "losses/dpo": 0.3472962975502014, "losses/sft": 1.6087357997894287, "losses/total": 0.3472962975502014, "ref_logps/chosen": -40.278770446777344, "ref_logps/rejected": -43.559349060058594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4504643678665161, "rewards/margins": 1.0093703269958496, "rewards/rejected": -2.459834575653076, "step": 1410 }, { "epoch": 1.33, "grad_norm": 26.325695037841797, "learning_rate": 3.0884924798880724e-07, "logps/chosen": -47.2593994140625, "logps/rejected": -60.71413803100586, "loss": 0.5073, "losses/dpo": 0.7224621772766113, "losses/sft": 2.0751097202301025, "losses/total": 0.7224621772766113, "ref_logps/chosen": -35.848514556884766, "ref_logps/rejected": -42.53636932373047, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1410884857177734, "rewards/margins": 0.676688551902771, "rewards/rejected": -1.8177769184112549, "step": 1411 }, { "epoch": 1.33, "grad_norm": 17.443208694458008, "learning_rate": 3.086743616649178e-07, "logps/chosen": -39.574119567871094, "logps/rejected": -66.24822235107422, "loss": 0.3471, "losses/dpo": 0.4222831130027771, "losses/sft": 1.7287424802780151, "losses/total": 0.4222831130027771, "ref_logps/chosen": -31.801837921142578, "ref_logps/rejected": -44.20322799682617, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7772287130355835, "rewards/margins": 1.4272706508636475, "rewards/rejected": -2.2044992446899414, "step": 1412 }, { "epoch": 1.33, "grad_norm": 19.533376693725586, "learning_rate": 3.084994753410283e-07, "logps/chosen": -49.30174255371094, "logps/rejected": -71.27017211914062, "loss": 0.2996, "losses/dpo": 0.17927199602127075, "losses/sft": 1.7610194683074951, "losses/total": 0.17927199602127075, "ref_logps/chosen": -38.06349563598633, "ref_logps/rejected": -45.71255874633789, "rewards/accuracies": 0.875, "rewards/chosen": -1.1238248348236084, "rewards/margins": 1.4319367408752441, "rewards/rejected": -2.5557613372802734, "step": 1413 }, { "epoch": 1.34, "grad_norm": 20.28409767150879, "learning_rate": 3.083245890171389e-07, "logps/chosen": -44.53003692626953, "logps/rejected": -58.603981018066406, "loss": 0.3487, "losses/dpo": 0.4930720329284668, "losses/sft": 1.788412094116211, "losses/total": 0.4930720329284668, "ref_logps/chosen": -34.771034240722656, "ref_logps/rejected": -37.61746597290039, "rewards/accuracies": 0.875, "rewards/chosen": -0.9759001731872559, "rewards/margins": 1.1227518320083618, "rewards/rejected": -2.098651885986328, "step": 1414 }, { "epoch": 1.34, "grad_norm": 24.667396545410156, "learning_rate": 3.0814970269324935e-07, "logps/chosen": -45.952110290527344, "logps/rejected": -62.221256256103516, "loss": 0.4718, "losses/dpo": 0.6820125579833984, "losses/sft": 2.0642268657684326, "losses/total": 0.6820125579833984, "ref_logps/chosen": -34.38420104980469, "ref_logps/rejected": -40.26169204711914, "rewards/accuracies": 0.8125, "rewards/chosen": -1.156791090965271, "rewards/margins": 1.0391653776168823, "rewards/rejected": -2.1959567070007324, "step": 1415 }, { "epoch": 1.34, "grad_norm": 15.919194221496582, "learning_rate": 3.079748163693599e-07, "logps/chosen": -42.049278259277344, "logps/rejected": -68.30839538574219, "loss": 0.3513, "losses/dpo": 0.21697184443473816, "losses/sft": 2.0377728939056396, "losses/total": 0.21697184443473816, "ref_logps/chosen": -32.20050811767578, "ref_logps/rejected": -44.524757385253906, "rewards/accuracies": 0.875, "rewards/chosen": -0.9848769903182983, "rewards/margins": 1.3934866189956665, "rewards/rejected": -2.378363609313965, "step": 1416 }, { "epoch": 1.34, "grad_norm": 22.401674270629883, "learning_rate": 3.077999300454704e-07, "logps/chosen": -53.42433166503906, "logps/rejected": -68.79466247558594, "loss": 0.4149, "losses/dpo": 0.3252949118614197, "losses/sft": 1.8426626920700073, "losses/total": 0.3252949118614197, "ref_logps/chosen": -40.75001525878906, "ref_logps/rejected": -44.611534118652344, "rewards/accuracies": 0.8125, "rewards/chosen": -1.267431616783142, "rewards/margins": 1.1508824825286865, "rewards/rejected": -2.418313980102539, "step": 1417 }, { "epoch": 1.34, "grad_norm": 23.449138641357422, "learning_rate": 3.0762504372158094e-07, "logps/chosen": -56.36857604980469, "logps/rejected": -58.75748062133789, "loss": 0.4542, "losses/dpo": 0.45658814907073975, "losses/sft": 1.5619760751724243, "losses/total": 0.45658814907073975, "ref_logps/chosen": -42.70182800292969, "ref_logps/rejected": -36.784507751464844, "rewards/accuracies": 0.75, "rewards/chosen": -1.3666751384735107, "rewards/margins": 0.8306224942207336, "rewards/rejected": -2.1972975730895996, "step": 1418 }, { "epoch": 1.34, "grad_norm": 24.037595748901367, "learning_rate": 3.074501573976915e-07, "logps/chosen": -56.9433708190918, "logps/rejected": -70.42808532714844, "loss": 0.3817, "losses/dpo": 0.2560586631298065, "losses/sft": 1.9976487159729004, "losses/total": 0.2560586631298065, "ref_logps/chosen": -41.48696517944336, "ref_logps/rejected": -40.96171188354492, "rewards/accuracies": 0.75, "rewards/chosen": -1.545640468597412, "rewards/margins": 1.4009971618652344, "rewards/rejected": -2.9466376304626465, "step": 1419 }, { "epoch": 1.34, "grad_norm": 38.60078430175781, "learning_rate": 3.07275271073802e-07, "logps/chosen": -57.37983703613281, "logps/rejected": -69.86796569824219, "loss": 0.578, "losses/dpo": 0.7916027307510376, "losses/sft": 2.2825205326080322, "losses/total": 0.7916027307510376, "ref_logps/chosen": -38.82680130004883, "ref_logps/rejected": -44.12309265136719, "rewards/accuracies": 0.625, "rewards/chosen": -1.8553032875061035, "rewards/margins": 0.719184160232544, "rewards/rejected": -2.5744876861572266, "step": 1420 }, { "epoch": 1.34, "grad_norm": 14.173458099365234, "learning_rate": 3.071003847499126e-07, "logps/chosen": -41.91478729248047, "logps/rejected": -71.73396301269531, "loss": 0.2214, "losses/dpo": 0.10589051246643066, "losses/sft": 1.3087167739868164, "losses/total": 0.10589051246643066, "ref_logps/chosen": -34.511016845703125, "ref_logps/rejected": -42.56721496582031, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7403774261474609, "rewards/margins": 2.1762967109680176, "rewards/rejected": -2.9166741371154785, "step": 1421 }, { "epoch": 1.34, "grad_norm": 40.23226547241211, "learning_rate": 3.0692549842602304e-07, "logps/chosen": -50.16059875488281, "logps/rejected": -45.11272430419922, "loss": 0.9759, "losses/dpo": 1.4183539152145386, "losses/sft": 2.296969413757324, "losses/total": 1.4183539152145386, "ref_logps/chosen": -31.263580322265625, "ref_logps/rejected": -27.629045486450195, "rewards/accuracies": 0.4375, "rewards/chosen": -1.8897016048431396, "rewards/margins": -0.14133356511592865, "rewards/rejected": -1.7483680248260498, "step": 1422 }, { "epoch": 1.34, "grad_norm": 25.430992126464844, "learning_rate": 3.067506121021336e-07, "logps/chosen": -56.25402069091797, "logps/rejected": -82.1207275390625, "loss": 0.372, "losses/dpo": 0.34306663274765015, "losses/sft": 1.7412848472595215, "losses/total": 0.34306663274765015, "ref_logps/chosen": -40.79608154296875, "ref_logps/rejected": -54.380760192871094, "rewards/accuracies": 0.75, "rewards/chosen": -1.545793890953064, "rewards/margins": 1.22820246219635, "rewards/rejected": -2.773996591567993, "step": 1423 }, { "epoch": 1.34, "grad_norm": 31.236656188964844, "learning_rate": 3.065757257782441e-07, "logps/chosen": -59.461517333984375, "logps/rejected": -70.31063842773438, "loss": 0.4914, "losses/dpo": 0.8357799649238586, "losses/sft": 2.1770999431610107, "losses/total": 0.8357799649238586, "ref_logps/chosen": -48.422119140625, "ref_logps/rejected": -46.93023681640625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1039400100708008, "rewards/margins": 1.2341006994247437, "rewards/rejected": -2.338040590286255, "step": 1424 }, { "epoch": 1.35, "grad_norm": 16.982175827026367, "learning_rate": 3.0640083945435463e-07, "logps/chosen": -50.047523498535156, "logps/rejected": -78.69482421875, "loss": 0.2798, "losses/dpo": 0.4208105802536011, "losses/sft": 1.4603816270828247, "losses/total": 0.4208105802536011, "ref_logps/chosen": -36.75709533691406, "ref_logps/rejected": -50.03679656982422, "rewards/accuracies": 1.0, "rewards/chosen": -1.3290431499481201, "rewards/margins": 1.5367591381072998, "rewards/rejected": -2.865802049636841, "step": 1425 }, { "epoch": 1.35, "grad_norm": 20.309284210205078, "learning_rate": 3.062259531304652e-07, "logps/chosen": -42.76316833496094, "logps/rejected": -71.7971420288086, "loss": 0.3374, "losses/dpo": 0.29878053069114685, "losses/sft": 1.239473819732666, "losses/total": 0.29878053069114685, "ref_logps/chosen": -30.08619499206543, "ref_logps/rejected": -45.589942932128906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2676975727081299, "rewards/margins": 1.3530223369598389, "rewards/rejected": -2.6207199096679688, "step": 1426 }, { "epoch": 1.35, "grad_norm": 24.290002822875977, "learning_rate": 3.060510668065757e-07, "logps/chosen": -48.008941650390625, "logps/rejected": -68.45893859863281, "loss": 0.4337, "losses/dpo": 0.7064761519432068, "losses/sft": 1.5562361478805542, "losses/total": 0.7064761519432068, "ref_logps/chosen": -33.58290100097656, "ref_logps/rejected": -42.375648498535156, "rewards/accuracies": 0.875, "rewards/chosen": -1.4426039457321167, "rewards/margins": 1.1657254695892334, "rewards/rejected": -2.6083292961120605, "step": 1427 }, { "epoch": 1.35, "grad_norm": 21.337886810302734, "learning_rate": 3.058761804826863e-07, "logps/chosen": -42.58956527709961, "logps/rejected": -69.51599884033203, "loss": 0.348, "losses/dpo": 0.37191101908683777, "losses/sft": 1.7934852838516235, "losses/total": 0.37191101908683777, "ref_logps/chosen": -30.46814727783203, "ref_logps/rejected": -41.44960403442383, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2121418714523315, "rewards/margins": 1.5944979190826416, "rewards/rejected": -2.8066396713256836, "step": 1428 }, { "epoch": 1.35, "grad_norm": 30.51302719116211, "learning_rate": 3.0570129415879673e-07, "logps/chosen": -57.298519134521484, "logps/rejected": -81.44483947753906, "loss": 0.5259, "losses/dpo": 0.6635831594467163, "losses/sft": 1.7359380722045898, "losses/total": 0.6635831594467163, "ref_logps/chosen": -40.06396484375, "ref_logps/rejected": -50.34728240966797, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7234556674957275, "rewards/margins": 1.386299967765808, "rewards/rejected": -3.109755754470825, "step": 1429 }, { "epoch": 1.35, "grad_norm": 28.02737045288086, "learning_rate": 3.055264078349073e-07, "logps/chosen": -41.67241668701172, "logps/rejected": -62.38465881347656, "loss": 0.5008, "losses/dpo": 0.2610743045806885, "losses/sft": 0.9479192495346069, "losses/total": 0.2610743045806885, "ref_logps/chosen": -29.95924186706543, "ref_logps/rejected": -41.242156982421875, "rewards/accuracies": 0.75, "rewards/chosen": -1.1713175773620605, "rewards/margins": 0.9429323673248291, "rewards/rejected": -2.1142499446868896, "step": 1430 }, { "epoch": 1.35, "grad_norm": 25.366226196289062, "learning_rate": 3.0535152151101787e-07, "logps/chosen": -56.33328628540039, "logps/rejected": -80.22727966308594, "loss": 0.4365, "losses/dpo": 0.8064418435096741, "losses/sft": 2.8340845108032227, "losses/total": 0.8064418435096741, "ref_logps/chosen": -38.87138366699219, "ref_logps/rejected": -50.27754211425781, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7461903095245361, "rewards/margins": 1.2487839460372925, "rewards/rejected": -2.994974374771118, "step": 1431 }, { "epoch": 1.35, "grad_norm": 26.362878799438477, "learning_rate": 3.051766351871283e-07, "logps/chosen": -56.68375015258789, "logps/rejected": -76.91268157958984, "loss": 0.4751, "losses/dpo": 0.31512925028800964, "losses/sft": 1.7146873474121094, "losses/total": 0.31512925028800964, "ref_logps/chosen": -39.540618896484375, "ref_logps/rejected": -48.338619232177734, "rewards/accuracies": 0.8125, "rewards/chosen": -1.71431303024292, "rewards/margins": 1.143093228340149, "rewards/rejected": -2.8574063777923584, "step": 1432 }, { "epoch": 1.35, "grad_norm": 14.078072547912598, "learning_rate": 3.050017488632389e-07, "logps/chosen": -42.78202819824219, "logps/rejected": -55.35765075683594, "loss": 0.2449, "losses/dpo": 0.2670789062976837, "losses/sft": 1.6147735118865967, "losses/total": 0.2670789062976837, "ref_logps/chosen": -32.7816276550293, "ref_logps/rejected": -29.931459426879883, "rewards/accuracies": 1.0, "rewards/chosen": -1.0000396966934204, "rewards/margins": 1.542580008506775, "rewards/rejected": -2.542619466781616, "step": 1433 }, { "epoch": 1.35, "grad_norm": 16.286357879638672, "learning_rate": 3.048268625393494e-07, "logps/chosen": -61.516666412353516, "logps/rejected": -71.70602416992188, "loss": 0.2722, "losses/dpo": 0.5190421938896179, "losses/sft": 1.8730074167251587, "losses/total": 0.5190421938896179, "ref_logps/chosen": -44.22786331176758, "ref_logps/rejected": -39.38705825805664, "rewards/accuracies": 0.875, "rewards/chosen": -1.7288801670074463, "rewards/margins": 1.503016710281372, "rewards/rejected": -3.2318968772888184, "step": 1434 }, { "epoch": 1.36, "grad_norm": 18.778362274169922, "learning_rate": 3.0465197621545997e-07, "logps/chosen": -51.56160354614258, "logps/rejected": -79.54507446289062, "loss": 0.2966, "losses/dpo": 0.3342849016189575, "losses/sft": 1.7648053169250488, "losses/total": 0.3342849016189575, "ref_logps/chosen": -40.895713806152344, "ref_logps/rejected": -54.30652618408203, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0665889978408813, "rewards/margins": 1.4572656154632568, "rewards/rejected": -2.5238547325134277, "step": 1435 }, { "epoch": 1.36, "grad_norm": 21.250564575195312, "learning_rate": 3.0447708989157043e-07, "logps/chosen": -52.267730712890625, "logps/rejected": -68.39076232910156, "loss": 0.3329, "losses/dpo": 0.338424414396286, "losses/sft": 1.6868879795074463, "losses/total": 0.338424414396286, "ref_logps/chosen": -37.6220588684082, "ref_logps/rejected": -38.4321403503418, "rewards/accuracies": 0.875, "rewards/chosen": -1.4645671844482422, "rewards/margins": 1.5312950611114502, "rewards/rejected": -2.9958624839782715, "step": 1436 }, { "epoch": 1.36, "grad_norm": 23.66788673400879, "learning_rate": 3.04302203567681e-07, "logps/chosen": -49.62910461425781, "logps/rejected": -62.755252838134766, "loss": 0.3298, "losses/dpo": 0.18312913179397583, "losses/sft": 1.5485693216323853, "losses/total": 0.18312913179397583, "ref_logps/chosen": -37.44635009765625, "ref_logps/rejected": -36.1099967956543, "rewards/accuracies": 0.875, "rewards/chosen": -1.2182754278182983, "rewards/margins": 1.4462497234344482, "rewards/rejected": -2.664525270462036, "step": 1437 }, { "epoch": 1.36, "grad_norm": 18.332170486450195, "learning_rate": 3.0412731724379156e-07, "logps/chosen": -51.35747528076172, "logps/rejected": -77.13438415527344, "loss": 0.2693, "losses/dpo": 0.5719197392463684, "losses/sft": 1.5391196012496948, "losses/total": 0.5719197392463684, "ref_logps/chosen": -41.13132858276367, "ref_logps/rejected": -46.666290283203125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.02261483669281, "rewards/margins": 2.0241949558258057, "rewards/rejected": -3.046809673309326, "step": 1438 }, { "epoch": 1.36, "grad_norm": 35.252159118652344, "learning_rate": 3.03952430919902e-07, "logps/chosen": -48.03419494628906, "logps/rejected": -80.33631134033203, "loss": 0.559, "losses/dpo": 0.896109938621521, "losses/sft": 3.0311272144317627, "losses/total": 0.896109938621521, "ref_logps/chosen": -30.956846237182617, "ref_logps/rejected": -51.07893371582031, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7077350616455078, "rewards/margins": 1.218003511428833, "rewards/rejected": -2.9257383346557617, "step": 1439 }, { "epoch": 1.36, "grad_norm": 23.185909271240234, "learning_rate": 3.037775445960126e-07, "logps/chosen": -53.763240814208984, "logps/rejected": -66.82176208496094, "loss": 0.471, "losses/dpo": 0.2621922492980957, "losses/sft": 1.472888469696045, "losses/total": 0.2621922492980957, "ref_logps/chosen": -37.556148529052734, "ref_logps/rejected": -42.45654296875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6207095384597778, "rewards/margins": 0.8158121109008789, "rewards/rejected": -2.436521530151367, "step": 1440 }, { "epoch": 1.36, "grad_norm": 21.757980346679688, "learning_rate": 3.036026582721231e-07, "logps/chosen": -57.9247932434082, "logps/rejected": -71.8721923828125, "loss": 0.3349, "losses/dpo": 0.2935296893119812, "losses/sft": 1.7554571628570557, "losses/total": 0.2935296893119812, "ref_logps/chosen": -43.217811584472656, "ref_logps/rejected": -42.12742614746094, "rewards/accuracies": 0.875, "rewards/chosen": -1.470698595046997, "rewards/margins": 1.5037784576416016, "rewards/rejected": -2.9744772911071777, "step": 1441 }, { "epoch": 1.36, "grad_norm": 22.38652801513672, "learning_rate": 3.0342777194823366e-07, "logps/chosen": -53.57727813720703, "logps/rejected": -87.47329711914062, "loss": 0.3578, "losses/dpo": 0.18158069252967834, "losses/sft": 1.5796535015106201, "losses/total": 0.18158069252967834, "ref_logps/chosen": -37.418819427490234, "ref_logps/rejected": -52.9796142578125, "rewards/accuracies": 0.875, "rewards/chosen": -1.615846037864685, "rewards/margins": 1.8335225582122803, "rewards/rejected": -3.449368715286255, "step": 1442 }, { "epoch": 1.36, "grad_norm": 21.699548721313477, "learning_rate": 3.032528856243441e-07, "logps/chosen": -64.91116333007812, "logps/rejected": -92.02908325195312, "loss": 0.3342, "losses/dpo": 0.3280050754547119, "losses/sft": 1.766418218612671, "losses/total": 0.3280050754547119, "ref_logps/chosen": -46.51850128173828, "ref_logps/rejected": -56.66664123535156, "rewards/accuracies": 0.875, "rewards/chosen": -1.839266061782837, "rewards/margins": 1.6969778537750244, "rewards/rejected": -3.5362439155578613, "step": 1443 }, { "epoch": 1.36, "grad_norm": 25.19713592529297, "learning_rate": 3.030779993004547e-07, "logps/chosen": -38.48619842529297, "logps/rejected": -51.78202819824219, "loss": 0.5595, "losses/dpo": 0.8129055500030518, "losses/sft": 3.3074753284454346, "losses/total": 0.8129055500030518, "ref_logps/chosen": -25.25677490234375, "ref_logps/rejected": -31.497846603393555, "rewards/accuracies": 0.75, "rewards/chosen": -1.3229424953460693, "rewards/margins": 0.7054756283760071, "rewards/rejected": -2.0284180641174316, "step": 1444 }, { "epoch": 1.36, "grad_norm": 24.862638473510742, "learning_rate": 3.0290311297656525e-07, "logps/chosen": -64.99393463134766, "logps/rejected": -86.62968444824219, "loss": 0.4145, "losses/dpo": 0.23780396580696106, "losses/sft": 2.1835410594940186, "losses/total": 0.23780396580696106, "ref_logps/chosen": -47.015663146972656, "ref_logps/rejected": -55.13080596923828, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7978276014328003, "rewards/margins": 1.352060317993164, "rewards/rejected": -3.149887800216675, "step": 1445 }, { "epoch": 1.37, "grad_norm": 26.846160888671875, "learning_rate": 3.027282266526757e-07, "logps/chosen": -48.17491912841797, "logps/rejected": -71.64388275146484, "loss": 0.5015, "losses/dpo": 0.41429728269577026, "losses/sft": 1.542240858078003, "losses/total": 0.41429728269577026, "ref_logps/chosen": -36.93295669555664, "ref_logps/rejected": -49.83993911743164, "rewards/accuracies": 0.75, "rewards/chosen": -1.1241968870162964, "rewards/margins": 1.0561978816986084, "rewards/rejected": -2.1803946495056152, "step": 1446 }, { "epoch": 1.37, "grad_norm": 17.389711380004883, "learning_rate": 3.025533403287863e-07, "logps/chosen": -64.23756408691406, "logps/rejected": -86.47703552246094, "loss": 0.2239, "losses/dpo": 0.20563524961471558, "losses/sft": 1.9172698259353638, "losses/total": 0.20563524961471558, "ref_logps/chosen": -44.17626953125, "ref_logps/rejected": -47.868324279785156, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0061302185058594, "rewards/margins": 1.8547405004501343, "rewards/rejected": -3.860870599746704, "step": 1447 }, { "epoch": 1.37, "grad_norm": 25.225473403930664, "learning_rate": 3.023784540048968e-07, "logps/chosen": -53.32830810546875, "logps/rejected": -63.27170944213867, "loss": 0.4693, "losses/dpo": 0.6495373845100403, "losses/sft": 1.8110337257385254, "losses/total": 0.6495373845100403, "ref_logps/chosen": -39.959617614746094, "ref_logps/rejected": -37.68694305419922, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3368688821792603, "rewards/margins": 1.22160804271698, "rewards/rejected": -2.5584769248962402, "step": 1448 }, { "epoch": 1.37, "grad_norm": 21.963321685791016, "learning_rate": 3.0220356768100736e-07, "logps/chosen": -61.472476959228516, "logps/rejected": -81.95194244384766, "loss": 0.3652, "losses/dpo": 0.244198277592659, "losses/sft": 1.7694854736328125, "losses/total": 0.244198277592659, "ref_logps/chosen": -42.86567687988281, "ref_logps/rejected": -47.24297332763672, "rewards/accuracies": 0.875, "rewards/chosen": -1.8606804609298706, "rewards/margins": 1.6102163791656494, "rewards/rejected": -3.4708969593048096, "step": 1449 }, { "epoch": 1.37, "grad_norm": 20.733707427978516, "learning_rate": 3.020286813571178e-07, "logps/chosen": -53.143592834472656, "logps/rejected": -80.09049224853516, "loss": 0.4151, "losses/dpo": 0.46113601326942444, "losses/sft": 1.6883480548858643, "losses/total": 0.46113601326942444, "ref_logps/chosen": -39.02067565917969, "ref_logps/rejected": -50.30956268310547, "rewards/accuracies": 0.75, "rewards/chosen": -1.412292242050171, "rewards/margins": 1.5658011436462402, "rewards/rejected": -2.978093385696411, "step": 1450 }, { "epoch": 1.37, "grad_norm": 15.747688293457031, "learning_rate": 3.018537950332284e-07, "logps/chosen": -35.794960021972656, "logps/rejected": -54.41717529296875, "loss": 0.3542, "losses/dpo": 0.4572751522064209, "losses/sft": 1.935394525527954, "losses/total": 0.4572751522064209, "ref_logps/chosen": -27.21696662902832, "ref_logps/rejected": -32.192317962646484, "rewards/accuracies": 0.875, "rewards/chosen": -0.8577994704246521, "rewards/margins": 1.3646862506866455, "rewards/rejected": -2.2224857807159424, "step": 1451 }, { "epoch": 1.37, "grad_norm": 27.470935821533203, "learning_rate": 3.0167890870933895e-07, "logps/chosen": -54.65156936645508, "logps/rejected": -64.08354187011719, "loss": 0.4168, "losses/dpo": 0.21041765809059143, "losses/sft": 1.51226007938385, "losses/total": 0.21041765809059143, "ref_logps/chosen": -41.28759765625, "ref_logps/rejected": -38.83619689941406, "rewards/accuracies": 0.75, "rewards/chosen": -1.336397409439087, "rewards/margins": 1.1883370876312256, "rewards/rejected": -2.5247344970703125, "step": 1452 }, { "epoch": 1.37, "grad_norm": 32.393550872802734, "learning_rate": 3.015040223854494e-07, "logps/chosen": -62.701499938964844, "logps/rejected": -89.2132568359375, "loss": 0.4346, "losses/dpo": 0.16079044342041016, "losses/sft": 1.9983454942703247, "losses/total": 0.16079044342041016, "ref_logps/chosen": -44.023834228515625, "ref_logps/rejected": -56.803733825683594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8677663803100586, "rewards/margins": 1.373186469078064, "rewards/rejected": -3.240952730178833, "step": 1453 }, { "epoch": 1.37, "grad_norm": 35.32954788208008, "learning_rate": 3.0132913606156e-07, "logps/chosen": -48.779273986816406, "logps/rejected": -63.26791763305664, "loss": 0.5552, "losses/dpo": 0.24887573719024658, "losses/sft": 2.008256196975708, "losses/total": 0.24887573719024658, "ref_logps/chosen": -34.924224853515625, "ref_logps/rejected": -38.76007080078125, "rewards/accuracies": 0.625, "rewards/chosen": -1.3855046033859253, "rewards/margins": 1.0652801990509033, "rewards/rejected": -2.450784683227539, "step": 1454 }, { "epoch": 1.37, "grad_norm": 18.07564926147461, "learning_rate": 3.011542497376705e-07, "logps/chosen": -50.26887512207031, "logps/rejected": -73.92079162597656, "loss": 0.3222, "losses/dpo": 0.3506878912448883, "losses/sft": 1.500986099243164, "losses/total": 0.3506878912448883, "ref_logps/chosen": -36.34586715698242, "ref_logps/rejected": -47.58843231201172, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3923009634017944, "rewards/margins": 1.2409355640411377, "rewards/rejected": -2.6332364082336426, "step": 1455 }, { "epoch": 1.37, "grad_norm": 29.577266693115234, "learning_rate": 3.0097936341378105e-07, "logps/chosen": -49.849891662597656, "logps/rejected": -57.964447021484375, "loss": 0.5189, "losses/dpo": 0.8094742298126221, "losses/sft": 2.1526012420654297, "losses/total": 0.8094742298126221, "ref_logps/chosen": -34.91853713989258, "ref_logps/rejected": -33.85536193847656, "rewards/accuracies": 0.75, "rewards/chosen": -1.4931353330612183, "rewards/margins": 0.9177733659744263, "rewards/rejected": -2.4109086990356445, "step": 1456 }, { "epoch": 1.38, "grad_norm": 18.888126373291016, "learning_rate": 3.0080447708989156e-07, "logps/chosen": -45.32219314575195, "logps/rejected": -58.74849319458008, "loss": 0.4271, "losses/dpo": 0.2755812406539917, "losses/sft": 1.6777021884918213, "losses/total": 0.2755812406539917, "ref_logps/chosen": -30.581050872802734, "ref_logps/rejected": -34.79866027832031, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4741144180297852, "rewards/margins": 0.9208693504333496, "rewards/rejected": -2.3949837684631348, "step": 1457 }, { "epoch": 1.38, "grad_norm": 17.708778381347656, "learning_rate": 3.006295907660021e-07, "logps/chosen": -35.59089279174805, "logps/rejected": -53.55289077758789, "loss": 0.3655, "losses/dpo": 0.2321612536907196, "losses/sft": 1.954699158668518, "losses/total": 0.2321612536907196, "ref_logps/chosen": -25.250795364379883, "ref_logps/rejected": -30.8167724609375, "rewards/accuracies": 0.875, "rewards/chosen": -1.0340098142623901, "rewards/margins": 1.239601731300354, "rewards/rejected": -2.273611545562744, "step": 1458 }, { "epoch": 1.38, "grad_norm": 27.67070198059082, "learning_rate": 3.0045470444211264e-07, "logps/chosen": -70.18630981445312, "logps/rejected": -73.20052337646484, "loss": 0.5085, "losses/dpo": 0.4248794615268707, "losses/sft": 1.938214659690857, "losses/total": 0.4248794615268707, "ref_logps/chosen": -53.95916748046875, "ref_logps/rejected": -47.098670959472656, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6227140426635742, "rewards/margins": 0.9874710440635681, "rewards/rejected": -2.610185146331787, "step": 1459 }, { "epoch": 1.38, "grad_norm": 32.969573974609375, "learning_rate": 3.002798181182231e-07, "logps/chosen": -54.15985870361328, "logps/rejected": -62.675636291503906, "loss": 0.6297, "losses/dpo": 0.2536635398864746, "losses/sft": 1.526346206665039, "losses/total": 0.2536635398864746, "ref_logps/chosen": -35.572959899902344, "ref_logps/rejected": -39.281192779541016, "rewards/accuracies": 0.625, "rewards/chosen": -1.858689546585083, "rewards/margins": 0.4807544946670532, "rewards/rejected": -2.339444160461426, "step": 1460 }, { "epoch": 1.38, "grad_norm": 27.672409057617188, "learning_rate": 3.0010493179433367e-07, "logps/chosen": -58.23644256591797, "logps/rejected": -85.10983276367188, "loss": 0.4249, "losses/dpo": 0.39252281188964844, "losses/sft": 1.9027915000915527, "losses/total": 0.39252281188964844, "ref_logps/chosen": -40.46833801269531, "ref_logps/rejected": -53.45152282714844, "rewards/accuracies": 0.75, "rewards/chosen": -1.7768104076385498, "rewards/margins": 1.389020323753357, "rewards/rejected": -3.1658310890197754, "step": 1461 }, { "epoch": 1.38, "grad_norm": 18.22044563293457, "learning_rate": 2.999300454704442e-07, "logps/chosen": -59.07020568847656, "logps/rejected": -64.58023071289062, "loss": 0.332, "losses/dpo": 0.5386080741882324, "losses/sft": 1.5160285234451294, "losses/total": 0.5386080741882324, "ref_logps/chosen": -42.942867279052734, "ref_logps/rejected": -35.57465362548828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6127336025238037, "rewards/margins": 1.2878247499465942, "rewards/rejected": -2.9005582332611084, "step": 1462 }, { "epoch": 1.38, "grad_norm": 17.094030380249023, "learning_rate": 2.9975515914655475e-07, "logps/chosen": -38.967994689941406, "logps/rejected": -64.69197082519531, "loss": 0.3281, "losses/dpo": 0.19840840995311737, "losses/sft": 1.487196445465088, "losses/total": 0.19840840995311737, "ref_logps/chosen": -30.22754669189453, "ref_logps/rejected": -40.487152099609375, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8740450143814087, "rewards/margins": 1.5464364290237427, "rewards/rejected": -2.4204814434051514, "step": 1463 }, { "epoch": 1.38, "grad_norm": 15.936551094055176, "learning_rate": 2.9958027282266526e-07, "logps/chosen": -43.809566497802734, "logps/rejected": -52.186256408691406, "loss": 0.3292, "losses/dpo": 0.1375245749950409, "losses/sft": 1.757676601409912, "losses/total": 0.1375245749950409, "ref_logps/chosen": -30.919689178466797, "ref_logps/rejected": -26.887386322021484, "rewards/accuracies": 0.875, "rewards/chosen": -1.2889879941940308, "rewards/margins": 1.2408989667892456, "rewards/rejected": -2.5298869609832764, "step": 1464 }, { "epoch": 1.38, "grad_norm": 17.255407333374023, "learning_rate": 2.9940538649877577e-07, "logps/chosen": -60.7708854675293, "logps/rejected": -95.71295928955078, "loss": 0.2841, "losses/dpo": 0.23410826921463013, "losses/sft": 2.4984688758850098, "losses/total": 0.23410826921463013, "ref_logps/chosen": -43.045684814453125, "ref_logps/rejected": -57.78691864013672, "rewards/accuracies": 0.875, "rewards/chosen": -1.772519826889038, "rewards/margins": 2.020085096359253, "rewards/rejected": -3.792604923248291, "step": 1465 }, { "epoch": 1.38, "grad_norm": 31.450180053710938, "learning_rate": 2.9923050017488634e-07, "logps/chosen": -47.75181579589844, "logps/rejected": -50.85969161987305, "loss": 0.673, "losses/dpo": 0.7621719241142273, "losses/sft": 2.1680119037628174, "losses/total": 0.7621719241142273, "ref_logps/chosen": -31.87733268737793, "ref_logps/rejected": -30.926931381225586, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5874487161636353, "rewards/margins": 0.40582719445228577, "rewards/rejected": -1.9932758808135986, "step": 1466 }, { "epoch": 1.39, "grad_norm": 19.149681091308594, "learning_rate": 2.990556138509968e-07, "logps/chosen": -57.261173248291016, "logps/rejected": -82.31720733642578, "loss": 0.2558, "losses/dpo": 0.191327303647995, "losses/sft": 1.7014291286468506, "losses/total": 0.191327303647995, "ref_logps/chosen": -38.61812973022461, "ref_logps/rejected": -47.27553176879883, "rewards/accuracies": 1.0, "rewards/chosen": -1.8643043041229248, "rewards/margins": 1.6398630142211914, "rewards/rejected": -3.504167318344116, "step": 1467 }, { "epoch": 1.39, "grad_norm": 28.514707565307617, "learning_rate": 2.9888072752710736e-07, "logps/chosen": -63.502899169921875, "logps/rejected": -64.31593322753906, "loss": 0.6183, "losses/dpo": 0.6624680161476135, "losses/sft": 2.3497395515441895, "losses/total": 0.6624680161476135, "ref_logps/chosen": -46.184349060058594, "ref_logps/rejected": -40.362693786621094, "rewards/accuracies": 0.625, "rewards/chosen": -1.7318546772003174, "rewards/margins": 0.6634698510169983, "rewards/rejected": -2.395324468612671, "step": 1468 }, { "epoch": 1.39, "grad_norm": 25.650829315185547, "learning_rate": 2.987058412032179e-07, "logps/chosen": -53.402183532714844, "logps/rejected": -61.13035583496094, "loss": 0.4607, "losses/dpo": 0.10315266996622086, "losses/sft": 1.7648452520370483, "losses/total": 0.10315266996622086, "ref_logps/chosen": -42.29344940185547, "ref_logps/rejected": -38.432212829589844, "rewards/accuracies": 0.75, "rewards/chosen": -1.1108734607696533, "rewards/margins": 1.1589409112930298, "rewards/rejected": -2.2698144912719727, "step": 1469 }, { "epoch": 1.39, "grad_norm": 25.22063446044922, "learning_rate": 2.9853095487932844e-07, "logps/chosen": -44.644264221191406, "logps/rejected": -57.26899719238281, "loss": 0.4648, "losses/dpo": 0.6071901917457581, "losses/sft": 2.1052732467651367, "losses/total": 0.6071901917457581, "ref_logps/chosen": -32.22168731689453, "ref_logps/rejected": -34.735984802246094, "rewards/accuracies": 0.75, "rewards/chosen": -1.2422577142715454, "rewards/margins": 1.0110435485839844, "rewards/rejected": -2.2533011436462402, "step": 1470 }, { "epoch": 1.39, "grad_norm": 24.80872344970703, "learning_rate": 2.9835606855543895e-07, "logps/chosen": -56.374412536621094, "logps/rejected": -71.41012573242188, "loss": 0.4466, "losses/dpo": 0.7869513630867004, "losses/sft": 2.471348285675049, "losses/total": 0.7869513630867004, "ref_logps/chosen": -43.42095184326172, "ref_logps/rejected": -49.27517318725586, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2953461408615112, "rewards/margins": 0.9181485772132874, "rewards/rejected": -2.2134947776794434, "step": 1471 }, { "epoch": 1.39, "grad_norm": 18.416790008544922, "learning_rate": 2.9818118223154947e-07, "logps/chosen": -61.26020812988281, "logps/rejected": -78.95443725585938, "loss": 0.3026, "losses/dpo": 0.23834973573684692, "losses/sft": 1.5854521989822388, "losses/total": 0.23834973573684692, "ref_logps/chosen": -49.4541015625, "ref_logps/rejected": -51.38745880126953, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1806107759475708, "rewards/margins": 1.5760869979858398, "rewards/rejected": -2.7566978931427, "step": 1472 }, { "epoch": 1.39, "grad_norm": 15.231879234313965, "learning_rate": 2.9800629590766003e-07, "logps/chosen": -44.0016975402832, "logps/rejected": -75.897705078125, "loss": 0.248, "losses/dpo": 0.1659824699163437, "losses/sft": 1.1497304439544678, "losses/total": 0.1659824699163437, "ref_logps/chosen": -32.45808792114258, "ref_logps/rejected": -46.080848693847656, "rewards/accuracies": 1.0, "rewards/chosen": -1.1543606519699097, "rewards/margins": 1.827324628829956, "rewards/rejected": -2.981685161590576, "step": 1473 }, { "epoch": 1.39, "grad_norm": 21.670198440551758, "learning_rate": 2.978314095837705e-07, "logps/chosen": -45.749427795410156, "logps/rejected": -63.82823181152344, "loss": 0.4681, "losses/dpo": 0.43698054552078247, "losses/sft": 1.4650715589523315, "losses/total": 0.43698054552078247, "ref_logps/chosen": -31.394006729125977, "ref_logps/rejected": -38.07575225830078, "rewards/accuracies": 0.6875, "rewards/chosen": -1.435542106628418, "rewards/margins": 1.139705777168274, "rewards/rejected": -2.5752477645874023, "step": 1474 }, { "epoch": 1.39, "grad_norm": 19.23586082458496, "learning_rate": 2.9765652325988106e-07, "logps/chosen": -64.34100341796875, "logps/rejected": -79.12982940673828, "loss": 0.3976, "losses/dpo": 0.64848393201828, "losses/sft": 1.8697618246078491, "losses/total": 0.64848393201828, "ref_logps/chosen": -50.03496551513672, "ref_logps/rejected": -53.19569778442383, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4306037425994873, "rewards/margins": 1.1628097295761108, "rewards/rejected": -2.5934133529663086, "step": 1475 }, { "epoch": 1.39, "grad_norm": 21.026540756225586, "learning_rate": 2.974816369359916e-07, "logps/chosen": -42.318729400634766, "logps/rejected": -58.60890579223633, "loss": 0.4032, "losses/dpo": 0.33661365509033203, "losses/sft": 1.560653805732727, "losses/total": 0.33661365509033203, "ref_logps/chosen": -31.113370895385742, "ref_logps/rejected": -36.7552490234375, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1205358505249023, "rewards/margins": 1.06482994556427, "rewards/rejected": -2.185365676879883, "step": 1476 }, { "epoch": 1.39, "grad_norm": 17.171052932739258, "learning_rate": 2.9730675061210214e-07, "logps/chosen": -45.365013122558594, "logps/rejected": -73.35462951660156, "loss": 0.3509, "losses/dpo": 0.44881758093833923, "losses/sft": 1.5224173069000244, "losses/total": 0.44881758093833923, "ref_logps/chosen": -31.196611404418945, "ref_logps/rejected": -45.02224349975586, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4168400764465332, "rewards/margins": 1.4163987636566162, "rewards/rejected": -2.8332388401031494, "step": 1477 }, { "epoch": 1.4, "grad_norm": 18.62702178955078, "learning_rate": 2.9713186428821265e-07, "logps/chosen": -48.18018341064453, "logps/rejected": -66.46004486083984, "loss": 0.3649, "losses/dpo": 0.37515488266944885, "losses/sft": 2.106121778488159, "losses/total": 0.37515488266944885, "ref_logps/chosen": -33.46634292602539, "ref_logps/rejected": -39.04322052001953, "rewards/accuracies": 0.75, "rewards/chosen": -1.4713842868804932, "rewards/margins": 1.2702980041503906, "rewards/rejected": -2.741682529449463, "step": 1478 }, { "epoch": 1.4, "grad_norm": 22.654354095458984, "learning_rate": 2.9695697796432316e-07, "logps/chosen": -44.82452392578125, "logps/rejected": -61.94343566894531, "loss": 0.4275, "losses/dpo": 0.40967732667922974, "losses/sft": 1.5727530717849731, "losses/total": 0.40967732667922974, "ref_logps/chosen": -30.690963745117188, "ref_logps/rejected": -37.72716522216797, "rewards/accuracies": 0.875, "rewards/chosen": -1.413355827331543, "rewards/margins": 1.0082708597183228, "rewards/rejected": -2.4216268062591553, "step": 1479 }, { "epoch": 1.4, "grad_norm": 28.83608627319336, "learning_rate": 2.967820916404337e-07, "logps/chosen": -53.062374114990234, "logps/rejected": -74.76139831542969, "loss": 0.4279, "losses/dpo": 0.2415730357170105, "losses/sft": 1.272810935974121, "losses/total": 0.2415730357170105, "ref_logps/chosen": -39.616756439208984, "ref_logps/rejected": -50.39042663574219, "rewards/accuracies": 0.75, "rewards/chosen": -1.3445615768432617, "rewards/margins": 1.092535376548767, "rewards/rejected": -2.4370968341827393, "step": 1480 }, { "epoch": 1.4, "grad_norm": 25.0146427154541, "learning_rate": 2.966072053165442e-07, "logps/chosen": -44.36920928955078, "logps/rejected": -57.59333038330078, "loss": 0.4528, "losses/dpo": 0.2863396406173706, "losses/sft": 1.8024622201919556, "losses/total": 0.2863396406173706, "ref_logps/chosen": -34.70337677001953, "ref_logps/rejected": -36.049468994140625, "rewards/accuracies": 0.875, "rewards/chosen": -0.9665828943252563, "rewards/margins": 1.1878032684326172, "rewards/rejected": -2.154386043548584, "step": 1481 }, { "epoch": 1.4, "grad_norm": 22.59343719482422, "learning_rate": 2.9643231899265475e-07, "logps/chosen": -54.02422332763672, "logps/rejected": -72.16030883789062, "loss": 0.3826, "losses/dpo": 0.44329535961151123, "losses/sft": 1.5579030513763428, "losses/total": 0.44329535961151123, "ref_logps/chosen": -41.91730499267578, "ref_logps/rejected": -46.37119674682617, "rewards/accuracies": 0.875, "rewards/chosen": -1.2106914520263672, "rewards/margins": 1.3682198524475098, "rewards/rejected": -2.578911304473877, "step": 1482 }, { "epoch": 1.4, "grad_norm": 13.133771896362305, "learning_rate": 2.962574326687653e-07, "logps/chosen": -31.992694854736328, "logps/rejected": -65.34564208984375, "loss": 0.2487, "losses/dpo": 0.1788356602191925, "losses/sft": 1.3065111637115479, "losses/total": 0.1788356602191925, "ref_logps/chosen": -24.564327239990234, "ref_logps/rejected": -38.47550964355469, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7428367137908936, "rewards/margins": 1.9441765546798706, "rewards/rejected": -2.6870133876800537, "step": 1483 }, { "epoch": 1.4, "grad_norm": 20.612218856811523, "learning_rate": 2.9608254634487583e-07, "logps/chosen": -48.381187438964844, "logps/rejected": -68.538818359375, "loss": 0.3926, "losses/dpo": 0.40987130999565125, "losses/sft": 1.2583063840866089, "losses/total": 0.40987130999565125, "ref_logps/chosen": -36.965511322021484, "ref_logps/rejected": -43.162193298339844, "rewards/accuracies": 0.75, "rewards/chosen": -1.1415679454803467, "rewards/margins": 1.3960951566696167, "rewards/rejected": -2.537662982940674, "step": 1484 }, { "epoch": 1.4, "grad_norm": 21.078365325927734, "learning_rate": 2.9590766002098634e-07, "logps/chosen": -57.50191116333008, "logps/rejected": -72.59945678710938, "loss": 0.3103, "losses/dpo": 0.3797740340232849, "losses/sft": 1.9937254190444946, "losses/total": 0.3797740340232849, "ref_logps/chosen": -42.138980865478516, "ref_logps/rejected": -43.61561965942383, "rewards/accuracies": 0.9375, "rewards/chosen": -1.536293387413025, "rewards/margins": 1.362090826034546, "rewards/rejected": -2.8983843326568604, "step": 1485 }, { "epoch": 1.4, "grad_norm": 26.72555160522461, "learning_rate": 2.9573277369709686e-07, "logps/chosen": -60.041465759277344, "logps/rejected": -71.0130386352539, "loss": 0.4958, "losses/dpo": 0.41573166847229004, "losses/sft": 2.1062893867492676, "losses/total": 0.41573166847229004, "ref_logps/chosen": -43.17316436767578, "ref_logps/rejected": -43.25284194946289, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6868302822113037, "rewards/margins": 1.089189887046814, "rewards/rejected": -2.776020050048828, "step": 1486 }, { "epoch": 1.4, "grad_norm": 20.682126998901367, "learning_rate": 2.955578873732074e-07, "logps/chosen": -45.147117614746094, "logps/rejected": -73.24772644042969, "loss": 0.3537, "losses/dpo": 0.20876505970954895, "losses/sft": 1.2642250061035156, "losses/total": 0.20876505970954895, "ref_logps/chosen": -32.60017395019531, "ref_logps/rejected": -46.0899772644043, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2546948194503784, "rewards/margins": 1.4610795974731445, "rewards/rejected": -2.7157745361328125, "step": 1487 }, { "epoch": 1.41, "grad_norm": 24.359195709228516, "learning_rate": 2.953830010493179e-07, "logps/chosen": -52.26470184326172, "logps/rejected": -60.49019241333008, "loss": 0.4173, "losses/dpo": 0.5849593877792358, "losses/sft": 1.9212629795074463, "losses/total": 0.5849593877792358, "ref_logps/chosen": -42.57030487060547, "ref_logps/rejected": -39.783363342285156, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9694399237632751, "rewards/margins": 1.1012427806854248, "rewards/rejected": -2.0706825256347656, "step": 1488 }, { "epoch": 1.41, "grad_norm": 30.23659896850586, "learning_rate": 2.9520811472542845e-07, "logps/chosen": -50.95896911621094, "logps/rejected": -55.51099395751953, "loss": 0.511, "losses/dpo": 0.6571680307388306, "losses/sft": 1.5848480463027954, "losses/total": 0.6571680307388306, "ref_logps/chosen": -37.404823303222656, "ref_logps/rejected": -31.563274383544922, "rewards/accuracies": 0.875, "rewards/chosen": -1.355414628982544, "rewards/margins": 1.0393571853637695, "rewards/rejected": -2.3947718143463135, "step": 1489 }, { "epoch": 1.41, "grad_norm": 22.450220108032227, "learning_rate": 2.95033228401539e-07, "logps/chosen": -45.04136657714844, "logps/rejected": -64.29212951660156, "loss": 0.4041, "losses/dpo": 0.5002979636192322, "losses/sft": 1.9214532375335693, "losses/total": 0.5002979636192322, "ref_logps/chosen": -34.148441314697266, "ref_logps/rejected": -40.619911193847656, "rewards/accuracies": 0.875, "rewards/chosen": -1.0892921686172485, "rewards/margins": 1.2779297828674316, "rewards/rejected": -2.3672218322753906, "step": 1490 }, { "epoch": 1.41, "grad_norm": 32.676029205322266, "learning_rate": 2.948583420776495e-07, "logps/chosen": -47.213138580322266, "logps/rejected": -56.650901794433594, "loss": 0.7712, "losses/dpo": 1.327339768409729, "losses/sft": 2.4795992374420166, "losses/total": 1.327339768409729, "ref_logps/chosen": -31.19516944885254, "ref_logps/rejected": -35.810768127441406, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6017969846725464, "rewards/margins": 0.48221641778945923, "rewards/rejected": -2.0840134620666504, "step": 1491 }, { "epoch": 1.41, "grad_norm": 28.814014434814453, "learning_rate": 2.9468345575376004e-07, "logps/chosen": -56.77680969238281, "logps/rejected": -82.70301055908203, "loss": 0.4759, "losses/dpo": 0.3210974335670471, "losses/sft": 2.3065695762634277, "losses/total": 0.3210974335670471, "ref_logps/chosen": -39.640220642089844, "ref_logps/rejected": -55.055564880371094, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7136590480804443, "rewards/margins": 1.0510858297348022, "rewards/rejected": -2.764744758605957, "step": 1492 }, { "epoch": 1.41, "grad_norm": 16.515443801879883, "learning_rate": 2.9450856942987055e-07, "logps/chosen": -52.44084167480469, "logps/rejected": -69.65238952636719, "loss": 0.3448, "losses/dpo": 0.5896556377410889, "losses/sft": 2.2405614852905273, "losses/total": 0.5896556377410889, "ref_logps/chosen": -39.882965087890625, "ref_logps/rejected": -43.62897872924805, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2557874917984009, "rewards/margins": 1.3465540409088135, "rewards/rejected": -2.602341651916504, "step": 1493 }, { "epoch": 1.41, "grad_norm": 21.969770431518555, "learning_rate": 2.943336831059811e-07, "logps/chosen": -46.63779830932617, "logps/rejected": -69.41429138183594, "loss": 0.3957, "losses/dpo": 0.4231574237346649, "losses/sft": 1.2778698205947876, "losses/total": 0.4231574237346649, "ref_logps/chosen": -32.38006591796875, "ref_logps/rejected": -43.47391891479492, "rewards/accuracies": 0.75, "rewards/chosen": -1.4257733821868896, "rewards/margins": 1.1682645082473755, "rewards/rejected": -2.5940377712249756, "step": 1494 }, { "epoch": 1.41, "grad_norm": 20.34341049194336, "learning_rate": 2.941587967820916e-07, "logps/chosen": -46.3596076965332, "logps/rejected": -70.40168762207031, "loss": 0.3418, "losses/dpo": 0.40635621547698975, "losses/sft": 1.805832862854004, "losses/total": 0.40635621547698975, "ref_logps/chosen": -32.84781265258789, "ref_logps/rejected": -43.06932830810547, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3511793613433838, "rewards/margins": 1.3820571899414062, "rewards/rejected": -2.73323655128479, "step": 1495 }, { "epoch": 1.41, "grad_norm": 12.552985191345215, "learning_rate": 2.9398391045820214e-07, "logps/chosen": -47.052276611328125, "logps/rejected": -82.99396514892578, "loss": 0.1853, "losses/dpo": 0.31122273206710815, "losses/sft": 1.8056796789169312, "losses/total": 0.31122273206710815, "ref_logps/chosen": -36.09916305541992, "ref_logps/rejected": -52.081565856933594, "rewards/accuracies": 1.0, "rewards/chosen": -1.0953112840652466, "rewards/margins": 1.9959288835525513, "rewards/rejected": -3.0912399291992188, "step": 1496 }, { "epoch": 1.41, "grad_norm": 19.483808517456055, "learning_rate": 2.938090241343127e-07, "logps/chosen": -39.39792251586914, "logps/rejected": -74.99205780029297, "loss": 0.3764, "losses/dpo": 0.1508772224187851, "losses/sft": 1.6205919981002808, "losses/total": 0.1508772224187851, "ref_logps/chosen": -28.510116577148438, "ref_logps/rejected": -46.25724411010742, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0887806415557861, "rewards/margins": 1.7847009897232056, "rewards/rejected": -2.8734817504882812, "step": 1497 }, { "epoch": 1.41, "grad_norm": 23.783533096313477, "learning_rate": 2.936341378104232e-07, "logps/chosen": -49.414306640625, "logps/rejected": -64.71704864501953, "loss": 0.4158, "losses/dpo": 0.6990461945533752, "losses/sft": 1.6989450454711914, "losses/total": 0.6990461945533752, "ref_logps/chosen": -36.04567337036133, "ref_logps/rejected": -39.98622131347656, "rewards/accuracies": 0.875, "rewards/chosen": -1.3368635177612305, "rewards/margins": 1.1362197399139404, "rewards/rejected": -2.473083257675171, "step": 1498 }, { "epoch": 1.42, "grad_norm": 22.386396408081055, "learning_rate": 2.9345925148653373e-07, "logps/chosen": -51.40376281738281, "logps/rejected": -64.31002807617188, "loss": 0.4081, "losses/dpo": 0.33907002210617065, "losses/sft": 1.927138090133667, "losses/total": 0.33907002210617065, "ref_logps/chosen": -40.528560638427734, "ref_logps/rejected": -41.02455139160156, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0875203609466553, "rewards/margins": 1.2410271167755127, "rewards/rejected": -2.328547477722168, "step": 1499 }, { "epoch": 1.42, "grad_norm": 26.581707000732422, "learning_rate": 2.9328436516264424e-07, "logps/chosen": -49.87763595581055, "logps/rejected": -64.03345489501953, "loss": 0.4726, "losses/dpo": 0.3879508376121521, "losses/sft": 1.7184689044952393, "losses/total": 0.3879508376121521, "ref_logps/chosen": -37.52323913574219, "ref_logps/rejected": -39.740928649902344, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2354395389556885, "rewards/margins": 1.1938132047653198, "rewards/rejected": -2.4292526245117188, "step": 1500 }, { "epoch": 1.42, "grad_norm": 24.524402618408203, "learning_rate": 2.931094788387548e-07, "logps/chosen": -54.667537689208984, "logps/rejected": -64.08084106445312, "loss": 0.4537, "losses/dpo": 0.1589667797088623, "losses/sft": 1.773787260055542, "losses/total": 0.1589667797088623, "ref_logps/chosen": -38.175533294677734, "ref_logps/rejected": -38.80839920043945, "rewards/accuracies": 0.8125, "rewards/chosen": -1.649200439453125, "rewards/margins": 0.8780432343482971, "rewards/rejected": -2.5272438526153564, "step": 1501 }, { "epoch": 1.42, "grad_norm": 15.457566261291504, "learning_rate": 2.929345925148653e-07, "logps/chosen": -52.253299713134766, "logps/rejected": -71.617919921875, "loss": 0.2313, "losses/dpo": 0.20826825499534607, "losses/sft": 1.8494374752044678, "losses/total": 0.20826825499534607, "ref_logps/chosen": -38.869773864746094, "ref_logps/rejected": -41.653194427490234, "rewards/accuracies": 1.0, "rewards/chosen": -1.3383526802062988, "rewards/margins": 1.6581199169158936, "rewards/rejected": -2.9964723587036133, "step": 1502 }, { "epoch": 1.42, "grad_norm": 20.69858169555664, "learning_rate": 2.9275970619097583e-07, "logps/chosen": -44.85877990722656, "logps/rejected": -70.6641845703125, "loss": 0.3753, "losses/dpo": 0.20970723032951355, "losses/sft": 1.2575109004974365, "losses/total": 0.20970723032951355, "ref_logps/chosen": -33.47350311279297, "ref_logps/rejected": -47.147457122802734, "rewards/accuracies": 0.75, "rewards/chosen": -1.1385278701782227, "rewards/margins": 1.2131446599960327, "rewards/rejected": -2.351672649383545, "step": 1503 }, { "epoch": 1.42, "grad_norm": 19.670150756835938, "learning_rate": 2.925848198670864e-07, "logps/chosen": -47.65375518798828, "logps/rejected": -76.619140625, "loss": 0.373, "losses/dpo": 0.4358763098716736, "losses/sft": 1.4477770328521729, "losses/total": 0.4358763098716736, "ref_logps/chosen": -36.318790435791016, "ref_logps/rejected": -51.11369323730469, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1334965229034424, "rewards/margins": 1.4170477390289307, "rewards/rejected": -2.550544261932373, "step": 1504 }, { "epoch": 1.42, "grad_norm": 37.48612594604492, "learning_rate": 2.924099335431969e-07, "logps/chosen": -53.99786376953125, "logps/rejected": -70.99600219726562, "loss": 0.6758, "losses/dpo": 0.4331752359867096, "losses/sft": 2.1719987392425537, "losses/total": 0.4331752359867096, "ref_logps/chosen": -36.0965576171875, "ref_logps/rejected": -47.481536865234375, "rewards/accuracies": 0.625, "rewards/chosen": -1.790130853652954, "rewards/margins": 0.5613151788711548, "rewards/rejected": -2.3514459133148193, "step": 1505 }, { "epoch": 1.42, "grad_norm": 30.493154525756836, "learning_rate": 2.922350472193074e-07, "logps/chosen": -57.61941146850586, "logps/rejected": -71.89325714111328, "loss": 0.672, "losses/dpo": 0.6709376573562622, "losses/sft": 2.321643829345703, "losses/total": 0.6709376573562622, "ref_logps/chosen": -36.45568084716797, "ref_logps/rejected": -44.72772979736328, "rewards/accuracies": 0.625, "rewards/chosen": -2.116373062133789, "rewards/margins": 0.6001795530319214, "rewards/rejected": -2.716552495956421, "step": 1506 }, { "epoch": 1.42, "grad_norm": 34.07032012939453, "learning_rate": 2.9206016089541794e-07, "logps/chosen": -45.518280029296875, "logps/rejected": -56.7679443359375, "loss": 0.613, "losses/dpo": 0.5298448801040649, "losses/sft": 1.6550202369689941, "losses/total": 0.5298448801040649, "ref_logps/chosen": -33.158653259277344, "ref_logps/rejected": -39.70037841796875, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2359625101089478, "rewards/margins": 0.4707942008972168, "rewards/rejected": -1.706756830215454, "step": 1507 }, { "epoch": 1.42, "grad_norm": 23.747217178344727, "learning_rate": 2.918852745715285e-07, "logps/chosen": -51.798919677734375, "logps/rejected": -66.12730407714844, "loss": 0.4607, "losses/dpo": 0.5760438442230225, "losses/sft": 1.8068910837173462, "losses/total": 0.5760438442230225, "ref_logps/chosen": -40.55227279663086, "ref_logps/rejected": -48.49240493774414, "rewards/accuracies": 0.8125, "rewards/chosen": -1.124664545059204, "rewards/margins": 0.6388249397277832, "rewards/rejected": -1.7634894847869873, "step": 1508 }, { "epoch": 1.42, "grad_norm": 15.932150840759277, "learning_rate": 2.91710388247639e-07, "logps/chosen": -34.748321533203125, "logps/rejected": -58.75572204589844, "loss": 0.2881, "losses/dpo": 0.24681426584720612, "losses/sft": 1.252745270729065, "losses/total": 0.24681426584720612, "ref_logps/chosen": -24.019683837890625, "ref_logps/rejected": -33.04027557373047, "rewards/accuracies": 0.875, "rewards/chosen": -1.0728639364242554, "rewards/margins": 1.4986803531646729, "rewards/rejected": -2.5715441703796387, "step": 1509 }, { "epoch": 1.43, "grad_norm": 23.119163513183594, "learning_rate": 2.9153550192374953e-07, "logps/chosen": -45.46012878417969, "logps/rejected": -58.938079833984375, "loss": 0.5108, "losses/dpo": 0.9658684730529785, "losses/sft": 2.4378509521484375, "losses/total": 0.9658684730529785, "ref_logps/chosen": -32.790374755859375, "ref_logps/rejected": -36.19810485839844, "rewards/accuracies": 0.75, "rewards/chosen": -1.2669752836227417, "rewards/margins": 1.007022500038147, "rewards/rejected": -2.2739977836608887, "step": 1510 }, { "epoch": 1.43, "grad_norm": 19.32199478149414, "learning_rate": 2.913606155998601e-07, "logps/chosen": -63.526588439941406, "logps/rejected": -73.48130798339844, "loss": 0.3019, "losses/dpo": 0.3907686173915863, "losses/sft": 1.6382976770401, "losses/total": 0.3907686173915863, "ref_logps/chosen": -49.78626251220703, "ref_logps/rejected": -44.628684997558594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3740321397781372, "rewards/margins": 1.51123046875, "rewards/rejected": -2.8852624893188477, "step": 1511 }, { "epoch": 1.43, "grad_norm": 19.381914138793945, "learning_rate": 2.911857292759706e-07, "logps/chosen": -63.94828414916992, "logps/rejected": -77.06166076660156, "loss": 0.4115, "losses/dpo": 0.4323621392250061, "losses/sft": 2.058326482772827, "losses/total": 0.4323621392250061, "ref_logps/chosen": -46.979366302490234, "ref_logps/rejected": -49.90327072143555, "rewards/accuracies": 0.875, "rewards/chosen": -1.6968919038772583, "rewards/margins": 1.0189476013183594, "rewards/rejected": -2.715839385986328, "step": 1512 }, { "epoch": 1.43, "grad_norm": 24.129758834838867, "learning_rate": 2.910108429520811e-07, "logps/chosen": -45.2930908203125, "logps/rejected": -66.28303527832031, "loss": 0.5376, "losses/dpo": 0.5452848076820374, "losses/sft": 1.6820714473724365, "losses/total": 0.5452848076820374, "ref_logps/chosen": -31.70247459411621, "ref_logps/rejected": -43.183048248291016, "rewards/accuracies": 0.75, "rewards/chosen": -1.3590612411499023, "rewards/margins": 0.9509379863739014, "rewards/rejected": -2.309999465942383, "step": 1513 }, { "epoch": 1.43, "grad_norm": 22.892858505249023, "learning_rate": 2.9083595662819163e-07, "logps/chosen": -52.986759185791016, "logps/rejected": -59.184425354003906, "loss": 0.3972, "losses/dpo": 0.5723955631256104, "losses/sft": 2.107868194580078, "losses/total": 0.5723955631256104, "ref_logps/chosen": -40.682411193847656, "ref_logps/rejected": -35.80892562866211, "rewards/accuracies": 0.875, "rewards/chosen": -1.2304348945617676, "rewards/margins": 1.1071151494979858, "rewards/rejected": -2.337550163269043, "step": 1514 }, { "epoch": 1.43, "grad_norm": 19.007612228393555, "learning_rate": 2.906610703043022e-07, "logps/chosen": -48.39806365966797, "logps/rejected": -75.96879577636719, "loss": 0.3577, "losses/dpo": 0.5371244549751282, "losses/sft": 1.5192457437515259, "losses/total": 0.5371244549751282, "ref_logps/chosen": -34.886680603027344, "ref_logps/rejected": -47.24713134765625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3511383533477783, "rewards/margins": 1.5210281610488892, "rewards/rejected": -2.872166633605957, "step": 1515 }, { "epoch": 1.43, "grad_norm": 25.711668014526367, "learning_rate": 2.904861839804127e-07, "logps/chosen": -51.43296813964844, "logps/rejected": -85.52340698242188, "loss": 0.4498, "losses/dpo": 0.3060365617275238, "losses/sft": 1.6344822645187378, "losses/total": 0.3060365617275238, "ref_logps/chosen": -37.094058990478516, "ref_logps/rejected": -60.42805480957031, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4338903427124023, "rewards/margins": 1.0756444931030273, "rewards/rejected": -2.5095348358154297, "step": 1516 }, { "epoch": 1.43, "grad_norm": 25.63076400756836, "learning_rate": 2.903112976565232e-07, "logps/chosen": -48.82794189453125, "logps/rejected": -67.44844818115234, "loss": 0.4389, "losses/dpo": 0.5563966631889343, "losses/sft": 2.7458395957946777, "losses/total": 0.5563966631889343, "ref_logps/chosen": -33.88334274291992, "ref_logps/rejected": -41.201087951660156, "rewards/accuracies": 0.6875, "rewards/chosen": -1.494459867477417, "rewards/margins": 1.1302764415740967, "rewards/rejected": -2.6247363090515137, "step": 1517 }, { "epoch": 1.43, "grad_norm": 20.57042121887207, "learning_rate": 2.901364113326338e-07, "logps/chosen": -47.47187423706055, "logps/rejected": -82.17427825927734, "loss": 0.352, "losses/dpo": 0.3270842730998993, "losses/sft": 1.8293434381484985, "losses/total": 0.3270842730998993, "ref_logps/chosen": -31.914215087890625, "ref_logps/rejected": -49.63578796386719, "rewards/accuracies": 0.875, "rewards/chosen": -1.555765986442566, "rewards/margins": 1.6980829238891602, "rewards/rejected": -3.2538490295410156, "step": 1518 }, { "epoch": 1.43, "grad_norm": 23.56537628173828, "learning_rate": 2.899615250087443e-07, "logps/chosen": -61.43927001953125, "logps/rejected": -64.43576049804688, "loss": 0.4044, "losses/dpo": 0.22194579243659973, "losses/sft": 2.3176565170288086, "losses/total": 0.22194579243659973, "ref_logps/chosen": -44.38973617553711, "ref_logps/rejected": -35.563785552978516, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7049531936645508, "rewards/margins": 1.1822437047958374, "rewards/rejected": -2.8871970176696777, "step": 1519 }, { "epoch": 1.44, "grad_norm": 25.64571762084961, "learning_rate": 2.897866386848548e-07, "logps/chosen": -51.46670913696289, "logps/rejected": -60.356689453125, "loss": 0.503, "losses/dpo": 0.4018670618534088, "losses/sft": 2.062720775604248, "losses/total": 0.4018670618534088, "ref_logps/chosen": -36.0409049987793, "ref_logps/rejected": -34.509521484375, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5425803661346436, "rewards/margins": 1.0421358346939087, "rewards/rejected": -2.584716320037842, "step": 1520 }, { "epoch": 1.44, "grad_norm": 32.24479293823242, "learning_rate": 2.896117523609654e-07, "logps/chosen": -54.112548828125, "logps/rejected": -66.57638549804688, "loss": 0.5873, "losses/dpo": 0.8029208183288574, "losses/sft": 2.2057156562805176, "losses/total": 0.8029208183288574, "ref_logps/chosen": -35.82487869262695, "ref_logps/rejected": -41.952247619628906, "rewards/accuracies": 0.75, "rewards/chosen": -1.8287665843963623, "rewards/margins": 0.6336476802825928, "rewards/rejected": -2.462414264678955, "step": 1521 }, { "epoch": 1.44, "grad_norm": 23.201295852661133, "learning_rate": 2.894368660370759e-07, "logps/chosen": -51.750732421875, "logps/rejected": -64.2300796508789, "loss": 0.4232, "losses/dpo": 0.38377732038497925, "losses/sft": 1.3923503160476685, "losses/total": 0.38377732038497925, "ref_logps/chosen": -39.22446823120117, "ref_logps/rejected": -42.31391143798828, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2526264190673828, "rewards/margins": 0.9389901161193848, "rewards/rejected": -2.1916165351867676, "step": 1522 }, { "epoch": 1.44, "grad_norm": 22.923320770263672, "learning_rate": 2.892619797131864e-07, "logps/chosen": -56.127052307128906, "logps/rejected": -66.88697052001953, "loss": 0.4187, "losses/dpo": 0.3056759536266327, "losses/sft": 2.107492446899414, "losses/total": 0.3056759536266327, "ref_logps/chosen": -42.2886848449707, "ref_logps/rejected": -41.96747589111328, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3838367462158203, "rewards/margins": 1.108112096786499, "rewards/rejected": -2.4919488430023193, "step": 1523 }, { "epoch": 1.44, "grad_norm": 31.246360778808594, "learning_rate": 2.890870933892969e-07, "logps/chosen": -59.89424514770508, "logps/rejected": -86.02073669433594, "loss": 0.4256, "losses/dpo": 0.5294551253318787, "losses/sft": 1.980635166168213, "losses/total": 0.5294551253318787, "ref_logps/chosen": -40.18248748779297, "ref_logps/rejected": -49.21497344970703, "rewards/accuracies": 0.875, "rewards/chosen": -1.9711757898330688, "rewards/margins": 1.7094006538391113, "rewards/rejected": -3.6805765628814697, "step": 1524 }, { "epoch": 1.44, "grad_norm": 16.346588134765625, "learning_rate": 2.889122070654075e-07, "logps/chosen": -61.171051025390625, "logps/rejected": -80.6778564453125, "loss": 0.2355, "losses/dpo": 0.2049711048603058, "losses/sft": 2.1592917442321777, "losses/total": 0.2049711048603058, "ref_logps/chosen": -41.30645751953125, "ref_logps/rejected": -43.77112579345703, "rewards/accuracies": 1.0, "rewards/chosen": -1.986459732055664, "rewards/margins": 1.7042136192321777, "rewards/rejected": -3.690673351287842, "step": 1525 }, { "epoch": 1.44, "grad_norm": 19.330259323120117, "learning_rate": 2.88737320741518e-07, "logps/chosen": -47.257225036621094, "logps/rejected": -61.823699951171875, "loss": 0.3129, "losses/dpo": 0.1936805546283722, "losses/sft": 1.9064816236495972, "losses/total": 0.1936805546283722, "ref_logps/chosen": -37.022098541259766, "ref_logps/rejected": -34.80775451660156, "rewards/accuracies": 0.875, "rewards/chosen": -1.0235130786895752, "rewards/margins": 1.6780810356140137, "rewards/rejected": -2.701594114303589, "step": 1526 }, { "epoch": 1.44, "grad_norm": 33.08138656616211, "learning_rate": 2.885624344176285e-07, "logps/chosen": -54.41649627685547, "logps/rejected": -55.74816131591797, "loss": 0.6738, "losses/dpo": 0.8506128191947937, "losses/sft": 1.9796555042266846, "losses/total": 0.8506128191947937, "ref_logps/chosen": -36.925262451171875, "ref_logps/rejected": -31.216808319091797, "rewards/accuracies": 0.625, "rewards/chosen": -1.749123454093933, "rewards/margins": 0.7040117979049683, "rewards/rejected": -2.4531352519989014, "step": 1527 }, { "epoch": 1.44, "grad_norm": 32.28748321533203, "learning_rate": 2.883875480937391e-07, "logps/chosen": -65.16987609863281, "logps/rejected": -69.23661804199219, "loss": 0.5265, "losses/dpo": 0.5600053668022156, "losses/sft": 1.978285312652588, "losses/total": 0.5600053668022156, "ref_logps/chosen": -46.6458854675293, "ref_logps/rejected": -45.33740234375, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8523993492126465, "rewards/margins": 0.537522554397583, "rewards/rejected": -2.3899219036102295, "step": 1528 }, { "epoch": 1.44, "grad_norm": 22.397335052490234, "learning_rate": 2.882126617698496e-07, "logps/chosen": -43.86811065673828, "logps/rejected": -61.027381896972656, "loss": 0.4417, "losses/dpo": 0.3098493218421936, "losses/sft": 1.9116106033325195, "losses/total": 0.3098493218421936, "ref_logps/chosen": -30.395709991455078, "ref_logps/rejected": -37.6104736328125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3472402095794678, "rewards/margins": 0.9944506883621216, "rewards/rejected": -2.3416907787323, "step": 1529 }, { "epoch": 1.44, "grad_norm": 19.612794876098633, "learning_rate": 2.8803777544596015e-07, "logps/chosen": -57.82060241699219, "logps/rejected": -75.35323333740234, "loss": 0.3016, "losses/dpo": 0.3849601447582245, "losses/sft": 2.273878812789917, "losses/total": 0.3849601447582245, "ref_logps/chosen": -43.81038284301758, "ref_logps/rejected": -45.89567184448242, "rewards/accuracies": 0.875, "rewards/chosen": -1.4010215997695923, "rewards/margins": 1.5447348356246948, "rewards/rejected": -2.945756196975708, "step": 1530 }, { "epoch": 1.45, "grad_norm": 29.8697509765625, "learning_rate": 2.878628891220706e-07, "logps/chosen": -61.79139709472656, "logps/rejected": -71.67982482910156, "loss": 0.4182, "losses/dpo": 0.6001218557357788, "losses/sft": 2.438936948776245, "losses/total": 0.6001218557357788, "ref_logps/chosen": -44.27633285522461, "ref_logps/rejected": -39.547454833984375, "rewards/accuracies": 0.875, "rewards/chosen": -1.7515063285827637, "rewards/margins": 1.4617310762405396, "rewards/rejected": -3.213237762451172, "step": 1531 }, { "epoch": 1.45, "grad_norm": 19.21063232421875, "learning_rate": 2.876880027981812e-07, "logps/chosen": -58.567779541015625, "logps/rejected": -81.89295196533203, "loss": 0.3401, "losses/dpo": 0.2961357831954956, "losses/sft": 2.013099193572998, "losses/total": 0.2961357831954956, "ref_logps/chosen": -41.91876983642578, "ref_logps/rejected": -51.12880325317383, "rewards/accuracies": 0.875, "rewards/chosen": -1.664900779724121, "rewards/margins": 1.4115142822265625, "rewards/rejected": -3.0764150619506836, "step": 1532 }, { "epoch": 1.45, "grad_norm": 22.92458152770996, "learning_rate": 2.875131164742917e-07, "logps/chosen": -58.74645233154297, "logps/rejected": -77.60418701171875, "loss": 0.3205, "losses/dpo": 0.7321463227272034, "losses/sft": 1.7700061798095703, "losses/total": 0.7321463227272034, "ref_logps/chosen": -44.00907897949219, "ref_logps/rejected": -48.02771759033203, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4737370014190674, "rewards/margins": 1.483910322189331, "rewards/rejected": -2.9576473236083984, "step": 1533 }, { "epoch": 1.45, "grad_norm": 26.128801345825195, "learning_rate": 2.873382301504022e-07, "logps/chosen": -50.60676956176758, "logps/rejected": -59.92283630371094, "loss": 0.4765, "losses/dpo": 0.5237456560134888, "losses/sft": 1.5703126192092896, "losses/total": 0.5237456560134888, "ref_logps/chosen": -38.51496124267578, "ref_logps/rejected": -38.8935661315918, "rewards/accuracies": 0.75, "rewards/chosen": -1.2091807126998901, "rewards/margins": 0.8937462568283081, "rewards/rejected": -2.1029269695281982, "step": 1534 }, { "epoch": 1.45, "grad_norm": 23.607364654541016, "learning_rate": 2.8716334382651277e-07, "logps/chosen": -56.33033752441406, "logps/rejected": -75.4780044555664, "loss": 0.4796, "losses/dpo": 0.688484787940979, "losses/sft": 2.1886415481567383, "losses/total": 0.688484787940979, "ref_logps/chosen": -42.12576675415039, "ref_logps/rejected": -53.68501663208008, "rewards/accuracies": 0.75, "rewards/chosen": -1.420457124710083, "rewards/margins": 0.7588416337966919, "rewards/rejected": -2.1792988777160645, "step": 1535 }, { "epoch": 1.45, "grad_norm": 20.63005256652832, "learning_rate": 2.869884575026233e-07, "logps/chosen": -55.593955993652344, "logps/rejected": -73.67984771728516, "loss": 0.3587, "losses/dpo": 0.12686285376548767, "losses/sft": 2.268369674682617, "losses/total": 0.12686285376548767, "ref_logps/chosen": -40.68044662475586, "ref_logps/rejected": -45.844207763671875, "rewards/accuracies": 0.875, "rewards/chosen": -1.4913510084152222, "rewards/margins": 1.2922132015228271, "rewards/rejected": -2.7835640907287598, "step": 1536 }, { "epoch": 1.45, "grad_norm": 22.66666603088379, "learning_rate": 2.8681357117873385e-07, "logps/chosen": -52.40771484375, "logps/rejected": -66.45508575439453, "loss": 0.4372, "losses/dpo": 0.35038965940475464, "losses/sft": 1.7115205526351929, "losses/total": 0.35038965940475464, "ref_logps/chosen": -38.07029342651367, "ref_logps/rejected": -40.9581184387207, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4337424039840698, "rewards/margins": 1.1159547567367554, "rewards/rejected": -2.549697160720825, "step": 1537 }, { "epoch": 1.45, "grad_norm": 18.346418380737305, "learning_rate": 2.866386848548443e-07, "logps/chosen": -66.79084777832031, "logps/rejected": -82.09329223632812, "loss": 0.2885, "losses/dpo": 0.2678743898868561, "losses/sft": 1.9576998949050903, "losses/total": 0.2678743898868561, "ref_logps/chosen": -51.47205352783203, "ref_logps/rejected": -52.53905487060547, "rewards/accuracies": 0.875, "rewards/chosen": -1.5318797826766968, "rewards/margins": 1.423543930053711, "rewards/rejected": -2.955423593521118, "step": 1538 }, { "epoch": 1.45, "grad_norm": 22.38614845275879, "learning_rate": 2.8646379853095487e-07, "logps/chosen": -52.541385650634766, "logps/rejected": -68.58695983886719, "loss": 0.3613, "losses/dpo": 0.3060445189476013, "losses/sft": 1.8688775300979614, "losses/total": 0.3060445189476013, "ref_logps/chosen": -38.203617095947266, "ref_logps/rejected": -40.814598083496094, "rewards/accuracies": 0.875, "rewards/chosen": -1.4337767362594604, "rewards/margins": 1.3434598445892334, "rewards/rejected": -2.7772367000579834, "step": 1539 }, { "epoch": 1.45, "grad_norm": 21.95256233215332, "learning_rate": 2.8628891220706544e-07, "logps/chosen": -52.6802978515625, "logps/rejected": -76.41194152832031, "loss": 0.3356, "losses/dpo": 0.4440712332725525, "losses/sft": 2.2653234004974365, "losses/total": 0.4440712332725525, "ref_logps/chosen": -36.10881805419922, "ref_logps/rejected": -43.95797348022461, "rewards/accuracies": 0.875, "rewards/chosen": -1.6571482419967651, "rewards/margins": 1.5882489681243896, "rewards/rejected": -3.2453970909118652, "step": 1540 }, { "epoch": 1.46, "grad_norm": 20.52392578125, "learning_rate": 2.861140258831759e-07, "logps/chosen": -49.52919387817383, "logps/rejected": -58.325340270996094, "loss": 0.4156, "losses/dpo": 0.4757683575153351, "losses/sft": 1.3999855518341064, "losses/total": 0.4757683575153351, "ref_logps/chosen": -38.35038757324219, "ref_logps/rejected": -38.191673278808594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1178803443908691, "rewards/margins": 0.8954866528511047, "rewards/rejected": -2.013367176055908, "step": 1541 }, { "epoch": 1.46, "grad_norm": 24.823984146118164, "learning_rate": 2.8593913955928646e-07, "logps/chosen": -59.465599060058594, "logps/rejected": -78.2006607055664, "loss": 0.4462, "losses/dpo": 0.23168253898620605, "losses/sft": 1.6652085781097412, "losses/total": 0.23168253898620605, "ref_logps/chosen": -41.725364685058594, "ref_logps/rejected": -49.47913360595703, "rewards/accuracies": 0.75, "rewards/chosen": -1.7740230560302734, "rewards/margins": 1.0981297492980957, "rewards/rejected": -2.872152805328369, "step": 1542 }, { "epoch": 1.46, "grad_norm": 28.500024795532227, "learning_rate": 2.85764253235397e-07, "logps/chosen": -49.46937942504883, "logps/rejected": -76.01848602294922, "loss": 0.5339, "losses/dpo": 1.129459261894226, "losses/sft": 2.185872793197632, "losses/total": 1.129459261894226, "ref_logps/chosen": -35.974552154541016, "ref_logps/rejected": -50.56367492675781, "rewards/accuracies": 0.75, "rewards/chosen": -1.3494828939437866, "rewards/margins": 1.1959989070892334, "rewards/rejected": -2.5454816818237305, "step": 1543 }, { "epoch": 1.46, "grad_norm": 27.28022003173828, "learning_rate": 2.8558936691150754e-07, "logps/chosen": -58.35121536254883, "logps/rejected": -73.19402313232422, "loss": 0.4723, "losses/dpo": 0.4089778661727905, "losses/sft": 1.9968239068984985, "losses/total": 0.4089778661727905, "ref_logps/chosen": -43.517208099365234, "ref_logps/rejected": -48.74143981933594, "rewards/accuracies": 0.75, "rewards/chosen": -1.4834007024765015, "rewards/margins": 0.9618576169013977, "rewards/rejected": -2.445258140563965, "step": 1544 }, { "epoch": 1.46, "grad_norm": 30.026042938232422, "learning_rate": 2.85414480587618e-07, "logps/chosen": -58.37890625, "logps/rejected": -76.18202209472656, "loss": 0.4926, "losses/dpo": 0.6482977867126465, "losses/sft": 2.542445659637451, "losses/total": 0.6482977867126465, "ref_logps/chosen": -38.98811340332031, "ref_logps/rejected": -45.21678924560547, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9390795230865479, "rewards/margins": 1.1574437618255615, "rewards/rejected": -3.0965235233306885, "step": 1545 }, { "epoch": 1.46, "grad_norm": 16.482799530029297, "learning_rate": 2.8523959426372857e-07, "logps/chosen": -51.419639587402344, "logps/rejected": -72.48983001708984, "loss": 0.2616, "losses/dpo": 0.17937470972537994, "losses/sft": 1.642836093902588, "losses/total": 0.17937470972537994, "ref_logps/chosen": -38.369903564453125, "ref_logps/rejected": -41.711090087890625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3049732446670532, "rewards/margins": 1.7729005813598633, "rewards/rejected": -3.077873706817627, "step": 1546 }, { "epoch": 1.46, "grad_norm": 25.0074405670166, "learning_rate": 2.8506470793983913e-07, "logps/chosen": -58.674293518066406, "logps/rejected": -80.90597534179688, "loss": 0.4267, "losses/dpo": 0.7177131175994873, "losses/sft": 2.143411636352539, "losses/total": 0.7177131175994873, "ref_logps/chosen": -43.082618713378906, "ref_logps/rejected": -51.89866256713867, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5591676235198975, "rewards/margins": 1.3415639400482178, "rewards/rejected": -2.9007315635681152, "step": 1547 }, { "epoch": 1.46, "grad_norm": 17.16482162475586, "learning_rate": 2.848898216159496e-07, "logps/chosen": -52.59001922607422, "logps/rejected": -62.16965866088867, "loss": 0.3492, "losses/dpo": 0.23446545004844666, "losses/sft": 1.5199135541915894, "losses/total": 0.23446545004844666, "ref_logps/chosen": -38.15099334716797, "ref_logps/rejected": -34.785945892333984, "rewards/accuracies": 0.875, "rewards/chosen": -1.4439024925231934, "rewards/margins": 1.294468879699707, "rewards/rejected": -2.7383713722229004, "step": 1548 }, { "epoch": 1.46, "grad_norm": 27.55540657043457, "learning_rate": 2.8471493529206016e-07, "logps/chosen": -49.91382598876953, "logps/rejected": -70.44082641601562, "loss": 0.5861, "losses/dpo": 0.7367491722106934, "losses/sft": 1.6084750890731812, "losses/total": 0.7367491722106934, "ref_logps/chosen": -32.57075500488281, "ref_logps/rejected": -45.83686828613281, "rewards/accuracies": 0.75, "rewards/chosen": -1.734306812286377, "rewards/margins": 0.7260898351669312, "rewards/rejected": -2.4603967666625977, "step": 1549 }, { "epoch": 1.46, "grad_norm": 26.385900497436523, "learning_rate": 2.8454004896817067e-07, "logps/chosen": -50.995521545410156, "logps/rejected": -68.32173156738281, "loss": 0.4243, "losses/dpo": 0.31904733180999756, "losses/sft": 1.5386825799942017, "losses/total": 0.31904733180999756, "ref_logps/chosen": -37.816734313964844, "ref_logps/rejected": -41.5258674621582, "rewards/accuracies": 0.75, "rewards/chosen": -1.3178787231445312, "rewards/margins": 1.3617067337036133, "rewards/rejected": -2.6795854568481445, "step": 1550 }, { "epoch": 1.46, "grad_norm": 27.311309814453125, "learning_rate": 2.8436516264428124e-07, "logps/chosen": -52.601322174072266, "logps/rejected": -76.10559844970703, "loss": 0.5335, "losses/dpo": 0.22984179854393005, "losses/sft": 2.0700111389160156, "losses/total": 0.22984179854393005, "ref_logps/chosen": -39.199684143066406, "ref_logps/rejected": -51.422935485839844, "rewards/accuracies": 0.625, "rewards/chosen": -1.3401637077331543, "rewards/margins": 1.1281023025512695, "rewards/rejected": -2.468266010284424, "step": 1551 }, { "epoch": 1.47, "grad_norm": 23.3348388671875, "learning_rate": 2.841902763203917e-07, "logps/chosen": -57.84280776977539, "logps/rejected": -86.03680419921875, "loss": 0.3552, "losses/dpo": 0.6518940925598145, "losses/sft": 1.9520008563995361, "losses/total": 0.6518940925598145, "ref_logps/chosen": -40.45856857299805, "ref_logps/rejected": -55.459381103515625, "rewards/accuracies": 0.875, "rewards/chosen": -1.7384238243103027, "rewards/margins": 1.3193187713623047, "rewards/rejected": -3.0577425956726074, "step": 1552 }, { "epoch": 1.47, "grad_norm": 17.599271774291992, "learning_rate": 2.8401538999650226e-07, "logps/chosen": -50.42523956298828, "logps/rejected": -85.38443756103516, "loss": 0.2557, "losses/dpo": 0.20078754425048828, "losses/sft": 1.3675918579101562, "losses/total": 0.20078754425048828, "ref_logps/chosen": -37.47771453857422, "ref_logps/rejected": -56.087860107421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.2947523593902588, "rewards/margins": 1.6349050998687744, "rewards/rejected": -2.929657459259033, "step": 1553 }, { "epoch": 1.47, "grad_norm": 24.149715423583984, "learning_rate": 2.8384050367261283e-07, "logps/chosen": -54.10523223876953, "logps/rejected": -60.35800552368164, "loss": 0.4512, "losses/dpo": 0.21545279026031494, "losses/sft": 2.0858535766601562, "losses/total": 0.21545279026031494, "ref_logps/chosen": -36.687259674072266, "ref_logps/rejected": -34.079776763916016, "rewards/accuracies": 0.875, "rewards/chosen": -1.7417970895767212, "rewards/margins": 0.8860259056091309, "rewards/rejected": -2.6278228759765625, "step": 1554 }, { "epoch": 1.47, "grad_norm": 19.41289520263672, "learning_rate": 2.836656173487233e-07, "logps/chosen": -34.860260009765625, "logps/rejected": -50.743934631347656, "loss": 0.46, "losses/dpo": 0.36985400319099426, "losses/sft": 1.7566661834716797, "losses/total": 0.36985400319099426, "ref_logps/chosen": -26.2508487701416, "ref_logps/rejected": -34.557228088378906, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8609414100646973, "rewards/margins": 0.7577290534973145, "rewards/rejected": -1.6186704635620117, "step": 1555 }, { "epoch": 1.47, "grad_norm": 23.737871170043945, "learning_rate": 2.8349073102483385e-07, "logps/chosen": -49.39778518676758, "logps/rejected": -60.15223693847656, "loss": 0.4537, "losses/dpo": 0.5179659128189087, "losses/sft": 1.335763692855835, "losses/total": 0.5179659128189087, "ref_logps/chosen": -36.8922119140625, "ref_logps/rejected": -34.520545959472656, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2505571842193604, "rewards/margins": 1.3126121759414673, "rewards/rejected": -2.563169479370117, "step": 1556 }, { "epoch": 1.47, "grad_norm": 14.922785758972168, "learning_rate": 2.8331584470094436e-07, "logps/chosen": -46.69976043701172, "logps/rejected": -57.63823699951172, "loss": 0.2497, "losses/dpo": 0.2243071049451828, "losses/sft": 1.6626936197280884, "losses/total": 0.2243071049451828, "ref_logps/chosen": -33.87782287597656, "ref_logps/rejected": -28.841154098510742, "rewards/accuracies": 1.0, "rewards/chosen": -1.2821935415267944, "rewards/margins": 1.5975148677825928, "rewards/rejected": -2.8797082901000977, "step": 1557 }, { "epoch": 1.47, "grad_norm": 18.02433967590332, "learning_rate": 2.8314095837705493e-07, "logps/chosen": -46.7186279296875, "logps/rejected": -71.32408905029297, "loss": 0.3577, "losses/dpo": 0.4005771577358246, "losses/sft": 1.6510266065597534, "losses/total": 0.4005771577358246, "ref_logps/chosen": -35.55266571044922, "ref_logps/rejected": -48.943687438964844, "rewards/accuracies": 0.875, "rewards/chosen": -1.1165958642959595, "rewards/margins": 1.1214449405670166, "rewards/rejected": -2.2380409240722656, "step": 1558 }, { "epoch": 1.47, "grad_norm": 32.218563079833984, "learning_rate": 2.8296607205316544e-07, "logps/chosen": -47.6123046875, "logps/rejected": -64.24397277832031, "loss": 0.5707, "losses/dpo": 0.591403603553772, "losses/sft": 1.5472232103347778, "losses/total": 0.591403603553772, "ref_logps/chosen": -35.12158203125, "ref_logps/rejected": -43.7344970703125, "rewards/accuracies": 0.75, "rewards/chosen": -1.249072551727295, "rewards/margins": 0.8018757104873657, "rewards/rejected": -2.05094838142395, "step": 1559 }, { "epoch": 1.47, "grad_norm": 18.46552848815918, "learning_rate": 2.8279118572927596e-07, "logps/chosen": -54.636051177978516, "logps/rejected": -80.05775451660156, "loss": 0.3058, "losses/dpo": 0.2528434991836548, "losses/sft": 2.1791927814483643, "losses/total": 0.2528434991836548, "ref_logps/chosen": -41.39192581176758, "ref_logps/rejected": -51.63854217529297, "rewards/accuracies": 0.875, "rewards/chosen": -1.3244123458862305, "rewards/margins": 1.5175093412399292, "rewards/rejected": -2.841921806335449, "step": 1560 }, { "epoch": 1.47, "grad_norm": 29.531017303466797, "learning_rate": 2.826162994053865e-07, "logps/chosen": -53.762428283691406, "logps/rejected": -54.455039978027344, "loss": 0.4727, "losses/dpo": 0.3079676032066345, "losses/sft": 1.5105602741241455, "losses/total": 0.3079676032066345, "ref_logps/chosen": -40.69398880004883, "ref_logps/rejected": -30.85037612915039, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3068442344665527, "rewards/margins": 1.0536220073699951, "rewards/rejected": -2.360466480255127, "step": 1561 }, { "epoch": 1.47, "grad_norm": 26.355924606323242, "learning_rate": 2.82441413081497e-07, "logps/chosen": -63.31415939331055, "logps/rejected": -83.74795532226562, "loss": 0.3683, "losses/dpo": 0.11343936622142792, "losses/sft": 1.4646058082580566, "losses/total": 0.11343936622142792, "ref_logps/chosen": -44.809471130371094, "ref_logps/rejected": -49.59527587890625, "rewards/accuracies": 0.75, "rewards/chosen": -1.850468397140503, "rewards/margins": 1.5647997856140137, "rewards/rejected": -3.4152684211730957, "step": 1562 }, { "epoch": 1.48, "grad_norm": 20.292633056640625, "learning_rate": 2.8226652675760755e-07, "logps/chosen": -53.957557678222656, "logps/rejected": -73.56104278564453, "loss": 0.3084, "losses/dpo": 0.5798536539077759, "losses/sft": 1.8515987396240234, "losses/total": 0.5798536539077759, "ref_logps/chosen": -40.87503433227539, "ref_logps/rejected": -44.657562255859375, "rewards/accuracies": 0.875, "rewards/chosen": -1.3082528114318848, "rewards/margins": 1.5820951461791992, "rewards/rejected": -2.890347957611084, "step": 1563 }, { "epoch": 1.48, "grad_norm": 19.71109962463379, "learning_rate": 2.8209164043371806e-07, "logps/chosen": -47.39905548095703, "logps/rejected": -75.1040267944336, "loss": 0.342, "losses/dpo": 0.25179505348205566, "losses/sft": 1.9480195045471191, "losses/total": 0.25179505348205566, "ref_logps/chosen": -32.6230583190918, "ref_logps/rejected": -43.57971954345703, "rewards/accuracies": 0.875, "rewards/chosen": -1.47760009765625, "rewards/margins": 1.674830436706543, "rewards/rejected": -3.152430534362793, "step": 1564 }, { "epoch": 1.48, "grad_norm": 23.368722915649414, "learning_rate": 2.819167541098286e-07, "logps/chosen": -42.84213638305664, "logps/rejected": -71.26492309570312, "loss": 0.3681, "losses/dpo": 0.3649923801422119, "losses/sft": 2.0411674976348877, "losses/total": 0.3649923801422119, "ref_logps/chosen": -27.057880401611328, "ref_logps/rejected": -42.94731140136719, "rewards/accuracies": 0.875, "rewards/chosen": -1.578425645828247, "rewards/margins": 1.2533351182937622, "rewards/rejected": -2.831760883331299, "step": 1565 }, { "epoch": 1.48, "grad_norm": 26.619714736938477, "learning_rate": 2.8174186778593914e-07, "logps/chosen": -54.24474334716797, "logps/rejected": -65.07306671142578, "loss": 0.4966, "losses/dpo": 0.3746645450592041, "losses/sft": 2.3145899772644043, "losses/total": 0.3746645450592041, "ref_logps/chosen": -35.97780990600586, "ref_logps/rejected": -36.86473846435547, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8266935348510742, "rewards/margins": 0.9941390156745911, "rewards/rejected": -2.8208324909210205, "step": 1566 }, { "epoch": 1.48, "grad_norm": 32.63786697387695, "learning_rate": 2.8156698146204965e-07, "logps/chosen": -48.44013977050781, "logps/rejected": -63.79279327392578, "loss": 0.6562, "losses/dpo": 0.6476283669471741, "losses/sft": 1.9668405055999756, "losses/total": 0.6476283669471741, "ref_logps/chosen": -32.326194763183594, "ref_logps/rejected": -43.733665466308594, "rewards/accuracies": 0.625, "rewards/chosen": -1.6113942861557007, "rewards/margins": 0.39451873302459717, "rewards/rejected": -2.005913019180298, "step": 1567 }, { "epoch": 1.48, "grad_norm": 23.762386322021484, "learning_rate": 2.813920951381602e-07, "logps/chosen": -55.13279342651367, "logps/rejected": -69.50912475585938, "loss": 0.4667, "losses/dpo": 0.49039798974990845, "losses/sft": 1.7319002151489258, "losses/total": 0.49039798974990845, "ref_logps/chosen": -41.90827178955078, "ref_logps/rejected": -43.75598907470703, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3224526643753052, "rewards/margins": 1.2528605461120605, "rewards/rejected": -2.575313091278076, "step": 1568 }, { "epoch": 1.48, "grad_norm": 22.09877586364746, "learning_rate": 2.812172088142707e-07, "logps/chosen": -51.13863754272461, "logps/rejected": -73.3948974609375, "loss": 0.3349, "losses/dpo": 0.2623365819454193, "losses/sft": 1.7351672649383545, "losses/total": 0.2623365819454193, "ref_logps/chosen": -36.487667083740234, "ref_logps/rejected": -45.0869255065918, "rewards/accuracies": 0.875, "rewards/chosen": -1.465097188949585, "rewards/margins": 1.3657000064849854, "rewards/rejected": -2.8307971954345703, "step": 1569 }, { "epoch": 1.48, "grad_norm": 24.633047103881836, "learning_rate": 2.8104232249038124e-07, "logps/chosen": -52.68830871582031, "logps/rejected": -89.28404235839844, "loss": 0.3344, "losses/dpo": 0.2976940870285034, "losses/sft": 1.8603380918502808, "losses/total": 0.2976940870285034, "ref_logps/chosen": -34.00613784790039, "ref_logps/rejected": -53.935401916503906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8682169914245605, "rewards/margins": 1.66664719581604, "rewards/rejected": -3.5348639488220215, "step": 1570 }, { "epoch": 1.48, "grad_norm": 31.272361755371094, "learning_rate": 2.8086743616649175e-07, "logps/chosen": -38.56789016723633, "logps/rejected": -66.273193359375, "loss": 0.5677, "losses/dpo": 0.32743704319000244, "losses/sft": 1.8801658153533936, "losses/total": 0.32743704319000244, "ref_logps/chosen": -27.371931076049805, "ref_logps/rejected": -47.53741455078125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1195958852767944, "rewards/margins": 0.7539824843406677, "rewards/rejected": -1.873578429222107, "step": 1571 }, { "epoch": 1.48, "grad_norm": 16.557924270629883, "learning_rate": 2.806925498426023e-07, "logps/chosen": -39.770877838134766, "logps/rejected": -64.97610473632812, "loss": 0.3123, "losses/dpo": 0.07179497182369232, "losses/sft": 1.649191975593567, "losses/total": 0.07179497182369232, "ref_logps/chosen": -30.08028793334961, "ref_logps/rejected": -39.861572265625, "rewards/accuracies": 0.875, "rewards/chosen": -0.9690591096878052, "rewards/margins": 1.542393684387207, "rewards/rejected": -2.5114529132843018, "step": 1572 }, { "epoch": 1.49, "grad_norm": 16.097074508666992, "learning_rate": 2.8051766351871283e-07, "logps/chosen": -38.5244255065918, "logps/rejected": -56.38629913330078, "loss": 0.3326, "losses/dpo": 0.3651847541332245, "losses/sft": 1.6834081411361694, "losses/total": 0.3651847541332245, "ref_logps/chosen": -29.10544776916504, "ref_logps/rejected": -31.85451889038086, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9418981075286865, "rewards/margins": 1.511279821395874, "rewards/rejected": -2.4531779289245605, "step": 1573 }, { "epoch": 1.49, "grad_norm": 18.000150680541992, "learning_rate": 2.8034277719482334e-07, "logps/chosen": -53.27288818359375, "logps/rejected": -86.36906433105469, "loss": 0.2641, "losses/dpo": 0.19226941466331482, "losses/sft": 2.676643133163452, "losses/total": 0.19226941466331482, "ref_logps/chosen": -35.221588134765625, "ref_logps/rejected": -52.99406433105469, "rewards/accuracies": 0.875, "rewards/chosen": -1.8051300048828125, "rewards/margins": 1.5323700904846191, "rewards/rejected": -3.3375000953674316, "step": 1574 }, { "epoch": 1.49, "grad_norm": 16.803436279296875, "learning_rate": 2.801678908709339e-07, "logps/chosen": -50.18499755859375, "logps/rejected": -70.54275512695312, "loss": 0.3553, "losses/dpo": 0.4526533782482147, "losses/sft": 1.590066909790039, "losses/total": 0.4526533782482147, "ref_logps/chosen": -35.84791564941406, "ref_logps/rejected": -42.39314270019531, "rewards/accuracies": 0.875, "rewards/chosen": -1.433708667755127, "rewards/margins": 1.3812527656555176, "rewards/rejected": -2.8149614334106445, "step": 1575 }, { "epoch": 1.49, "grad_norm": 23.92568588256836, "learning_rate": 2.7999300454704437e-07, "logps/chosen": -48.44969940185547, "logps/rejected": -63.28421401977539, "loss": 0.414, "losses/dpo": 0.35964441299438477, "losses/sft": 1.9736664295196533, "losses/total": 0.35964441299438477, "ref_logps/chosen": -33.312408447265625, "ref_logps/rejected": -39.735782623291016, "rewards/accuracies": 0.875, "rewards/chosen": -1.5137293338775635, "rewards/margins": 0.8411141633987427, "rewards/rejected": -2.3548436164855957, "step": 1576 }, { "epoch": 1.49, "grad_norm": 19.621519088745117, "learning_rate": 2.7981811822315494e-07, "logps/chosen": -45.52754211425781, "logps/rejected": -61.8480224609375, "loss": 0.4199, "losses/dpo": 0.34698039293289185, "losses/sft": 1.7505427598953247, "losses/total": 0.34698039293289185, "ref_logps/chosen": -34.88029861450195, "ref_logps/rejected": -38.03106689453125, "rewards/accuracies": 0.875, "rewards/chosen": -1.064724326133728, "rewards/margins": 1.3169710636138916, "rewards/rejected": -2.381695508956909, "step": 1577 }, { "epoch": 1.49, "grad_norm": 22.111160278320312, "learning_rate": 2.796432318992655e-07, "logps/chosen": -58.804664611816406, "logps/rejected": -77.11788177490234, "loss": 0.4165, "losses/dpo": 0.511218786239624, "losses/sft": 1.8728396892547607, "losses/total": 0.511218786239624, "ref_logps/chosen": -40.30754852294922, "ref_logps/rejected": -48.91407012939453, "rewards/accuracies": 0.8125, "rewards/chosen": -1.849711537361145, "rewards/margins": 0.9706699252128601, "rewards/rejected": -2.8203814029693604, "step": 1578 }, { "epoch": 1.49, "grad_norm": 19.60536003112793, "learning_rate": 2.79468345575376e-07, "logps/chosen": -46.77680206298828, "logps/rejected": -58.81962585449219, "loss": 0.3809, "losses/dpo": 0.42948460578918457, "losses/sft": 1.9717991352081299, "losses/total": 0.42948460578918457, "ref_logps/chosen": -33.579078674316406, "ref_logps/rejected": -35.79740524291992, "rewards/accuracies": 0.875, "rewards/chosen": -1.319772481918335, "rewards/margins": 0.9824498891830444, "rewards/rejected": -2.30222225189209, "step": 1579 }, { "epoch": 1.49, "grad_norm": 31.59912109375, "learning_rate": 2.792934592514865e-07, "logps/chosen": -52.323333740234375, "logps/rejected": -80.10462951660156, "loss": 0.4946, "losses/dpo": 0.305744469165802, "losses/sft": 1.9969563484191895, "losses/total": 0.305744469165802, "ref_logps/chosen": -36.05377197265625, "ref_logps/rejected": -51.194400787353516, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6269561052322388, "rewards/margins": 1.2640661001205444, "rewards/rejected": -2.891022205352783, "step": 1580 }, { "epoch": 1.49, "grad_norm": 26.642765045166016, "learning_rate": 2.7911857292759704e-07, "logps/chosen": -39.11742401123047, "logps/rejected": -50.89830780029297, "loss": 0.595, "losses/dpo": 1.009821891784668, "losses/sft": 2.328462600708008, "losses/total": 1.009821891784668, "ref_logps/chosen": -26.55136489868164, "ref_logps/rejected": -32.081886291503906, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2566059827804565, "rewards/margins": 0.6250360012054443, "rewards/rejected": -1.8816421031951904, "step": 1581 }, { "epoch": 1.49, "grad_norm": 17.78432273864746, "learning_rate": 2.789436866037076e-07, "logps/chosen": -49.73016357421875, "logps/rejected": -53.685997009277344, "loss": 0.4254, "losses/dpo": 0.4622114896774292, "losses/sft": 2.1299214363098145, "losses/total": 0.4622114896774292, "ref_logps/chosen": -37.85810852050781, "ref_logps/rejected": -31.274436950683594, "rewards/accuracies": 0.75, "rewards/chosen": -1.1872059106826782, "rewards/margins": 1.0539498329162598, "rewards/rejected": -2.2411556243896484, "step": 1582 }, { "epoch": 1.49, "grad_norm": 39.04930114746094, "learning_rate": 2.7876880027981806e-07, "logps/chosen": -58.037784576416016, "logps/rejected": -70.38892364501953, "loss": 0.6527, "losses/dpo": 0.2935391366481781, "losses/sft": 1.4305685758590698, "losses/total": 0.2935391366481781, "ref_logps/chosen": -41.22814178466797, "ref_logps/rejected": -46.40721893310547, "rewards/accuracies": 0.625, "rewards/chosen": -1.6809645891189575, "rewards/margins": 0.7172063589096069, "rewards/rejected": -2.3981709480285645, "step": 1583 }, { "epoch": 1.5, "grad_norm": 15.458442687988281, "learning_rate": 2.7859391395592863e-07, "logps/chosen": -42.144535064697266, "logps/rejected": -68.74707794189453, "loss": 0.2711, "losses/dpo": 0.3337159752845764, "losses/sft": 1.76071035861969, "losses/total": 0.3337159752845764, "ref_logps/chosen": -30.88461685180664, "ref_logps/rejected": -42.31597900390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.125991702079773, "rewards/margins": 1.5171184539794922, "rewards/rejected": -2.6431102752685547, "step": 1584 }, { "epoch": 1.5, "grad_norm": 17.06437873840332, "learning_rate": 2.784190276320392e-07, "logps/chosen": -42.097023010253906, "logps/rejected": -63.55470275878906, "loss": 0.3731, "losses/dpo": 0.447040855884552, "losses/sft": 1.8044205904006958, "losses/total": 0.447040855884552, "ref_logps/chosen": -33.403717041015625, "ref_logps/rejected": -42.10512924194336, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8693302869796753, "rewards/margins": 1.2756270170211792, "rewards/rejected": -2.1449573040008545, "step": 1585 }, { "epoch": 1.5, "grad_norm": 21.871400833129883, "learning_rate": 2.782441413081497e-07, "logps/chosen": -47.751651763916016, "logps/rejected": -65.92152404785156, "loss": 0.4306, "losses/dpo": 0.540984034538269, "losses/sft": 1.7272223234176636, "losses/total": 0.540984034538269, "ref_logps/chosen": -30.07518768310547, "ref_logps/rejected": -39.9300537109375, "rewards/accuracies": 0.8125, "rewards/chosen": -1.767646312713623, "rewards/margins": 0.8315005302429199, "rewards/rejected": -2.599146842956543, "step": 1586 }, { "epoch": 1.5, "grad_norm": 13.564169883728027, "learning_rate": 2.780692549842602e-07, "logps/chosen": -50.20806121826172, "logps/rejected": -80.58341979980469, "loss": 0.2536, "losses/dpo": 0.25744491815567017, "losses/sft": 2.364044427871704, "losses/total": 0.25744491815567017, "ref_logps/chosen": -37.42918395996094, "ref_logps/rejected": -49.507225036621094, "rewards/accuracies": 0.875, "rewards/chosen": -1.2778877019882202, "rewards/margins": 1.8297314643859863, "rewards/rejected": -3.107619047164917, "step": 1587 }, { "epoch": 1.5, "grad_norm": 21.28896713256836, "learning_rate": 2.7789436866037073e-07, "logps/chosen": -49.551422119140625, "logps/rejected": -72.1412353515625, "loss": 0.3334, "losses/dpo": 0.690658688545227, "losses/sft": 2.1990537643432617, "losses/total": 0.690658688545227, "ref_logps/chosen": -34.629905700683594, "ref_logps/rejected": -41.382835388183594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4921514987945557, "rewards/margins": 1.5836892127990723, "rewards/rejected": -3.075840473175049, "step": 1588 }, { "epoch": 1.5, "grad_norm": 35.028602600097656, "learning_rate": 2.777194823364813e-07, "logps/chosen": -51.10850524902344, "logps/rejected": -65.39685821533203, "loss": 0.5529, "losses/dpo": 0.848666787147522, "losses/sft": 1.9485321044921875, "losses/total": 0.848666787147522, "ref_logps/chosen": -33.31840133666992, "ref_logps/rejected": -39.65271759033203, "rewards/accuracies": 0.8125, "rewards/chosen": -1.779010534286499, "rewards/margins": 0.7954035997390747, "rewards/rejected": -2.5744142532348633, "step": 1589 }, { "epoch": 1.5, "grad_norm": 24.854259490966797, "learning_rate": 2.7754459601259176e-07, "logps/chosen": -45.76518630981445, "logps/rejected": -61.96608352661133, "loss": 0.3931, "losses/dpo": 0.2894047796726227, "losses/sft": 1.5795376300811768, "losses/total": 0.2894047796726227, "ref_logps/chosen": -32.27042007446289, "ref_logps/rejected": -35.597530364990234, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3494765758514404, "rewards/margins": 1.2873790264129639, "rewards/rejected": -2.636855363845825, "step": 1590 }, { "epoch": 1.5, "grad_norm": 30.77260971069336, "learning_rate": 2.773697096887023e-07, "logps/chosen": -66.782958984375, "logps/rejected": -73.30743408203125, "loss": 0.6087, "losses/dpo": 0.8013497591018677, "losses/sft": 1.7161396741867065, "losses/total": 0.8013497591018677, "ref_logps/chosen": -44.82712936401367, "ref_logps/rejected": -44.11414337158203, "rewards/accuracies": 0.75, "rewards/chosen": -2.195582866668701, "rewards/margins": 0.7237467169761658, "rewards/rejected": -2.9193296432495117, "step": 1591 }, { "epoch": 1.5, "grad_norm": 21.13929557800293, "learning_rate": 2.771948233648129e-07, "logps/chosen": -41.46681594848633, "logps/rejected": -68.57009887695312, "loss": 0.306, "losses/dpo": 0.15988892316818237, "losses/sft": 1.5330132246017456, "losses/total": 0.15988892316818237, "ref_logps/chosen": -29.467641830444336, "ref_logps/rejected": -41.549949645996094, "rewards/accuracies": 0.875, "rewards/chosen": -1.1999173164367676, "rewards/margins": 1.502097249031067, "rewards/rejected": -2.702014446258545, "step": 1592 }, { "epoch": 1.5, "grad_norm": 13.868195533752441, "learning_rate": 2.770199370409234e-07, "logps/chosen": -49.54917907714844, "logps/rejected": -89.79420471191406, "loss": 0.2149, "losses/dpo": 0.1659305840730667, "losses/sft": 2.0476858615875244, "losses/total": 0.1659305840730667, "ref_logps/chosen": -36.629356384277344, "ref_logps/rejected": -55.23522186279297, "rewards/accuracies": 1.0, "rewards/chosen": -1.291982650756836, "rewards/margins": 2.1639156341552734, "rewards/rejected": -3.4558980464935303, "step": 1593 }, { "epoch": 1.51, "grad_norm": 29.07862091064453, "learning_rate": 2.768450507170339e-07, "logps/chosen": -53.14488220214844, "logps/rejected": -69.19747161865234, "loss": 0.4449, "losses/dpo": 0.26309823989868164, "losses/sft": 2.200120687484741, "losses/total": 0.26309823989868164, "ref_logps/chosen": -38.730194091796875, "ref_logps/rejected": -44.16493225097656, "rewards/accuracies": 0.875, "rewards/chosen": -1.4414689540863037, "rewards/margins": 1.0617849826812744, "rewards/rejected": -2.503253936767578, "step": 1594 }, { "epoch": 1.51, "grad_norm": 15.981719970703125, "learning_rate": 2.7667016439314443e-07, "logps/chosen": -50.74475860595703, "logps/rejected": -72.94822692871094, "loss": 0.2646, "losses/dpo": 0.23204100131988525, "losses/sft": 1.665698766708374, "losses/total": 0.23204100131988525, "ref_logps/chosen": -35.56131362915039, "ref_logps/rejected": -44.1530876159668, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5183440446853638, "rewards/margins": 1.3611699342727661, "rewards/rejected": -2.87951397895813, "step": 1595 }, { "epoch": 1.51, "grad_norm": 21.13315773010254, "learning_rate": 2.76495278069255e-07, "logps/chosen": -42.29929733276367, "logps/rejected": -63.52539825439453, "loss": 0.3857, "losses/dpo": 0.37726065516471863, "losses/sft": 2.1821720600128174, "losses/total": 0.37726065516471863, "ref_logps/chosen": -29.5195255279541, "ref_logps/rejected": -38.37900924682617, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2779769897460938, "rewards/margins": 1.2366619110107422, "rewards/rejected": -2.514638662338257, "step": 1596 }, { "epoch": 1.51, "grad_norm": 14.99760627746582, "learning_rate": 2.7632039174536545e-07, "logps/chosen": -38.87886428833008, "logps/rejected": -70.91326141357422, "loss": 0.2672, "losses/dpo": 0.24154485762119293, "losses/sft": 2.2009530067443848, "losses/total": 0.24154485762119293, "ref_logps/chosen": -28.733173370361328, "ref_logps/rejected": -42.064971923828125, "rewards/accuracies": 0.875, "rewards/chosen": -1.0145691633224487, "rewards/margins": 1.8702595233917236, "rewards/rejected": -2.884828567504883, "step": 1597 }, { "epoch": 1.51, "grad_norm": 31.890321731567383, "learning_rate": 2.76145505421476e-07, "logps/chosen": -59.677406311035156, "logps/rejected": -83.85343933105469, "loss": 0.2931, "losses/dpo": 0.45264574885368347, "losses/sft": 1.9471641778945923, "losses/total": 0.45264574885368347, "ref_logps/chosen": -43.396541595458984, "ref_logps/rejected": -51.72345733642578, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6280864477157593, "rewards/margins": 1.5849120616912842, "rewards/rejected": -3.212998628616333, "step": 1598 }, { "epoch": 1.51, "grad_norm": 26.067333221435547, "learning_rate": 2.759706190975866e-07, "logps/chosen": -49.743629455566406, "logps/rejected": -63.852630615234375, "loss": 0.3928, "losses/dpo": 0.20356914401054382, "losses/sft": 1.668600082397461, "losses/total": 0.20356914401054382, "ref_logps/chosen": -36.41566848754883, "ref_logps/rejected": -36.86611557006836, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3327960968017578, "rewards/margins": 1.36585533618927, "rewards/rejected": -2.6986515522003174, "step": 1599 }, { "epoch": 1.51, "grad_norm": 27.520532608032227, "learning_rate": 2.757957327736971e-07, "logps/chosen": -54.62408447265625, "logps/rejected": -74.0721435546875, "loss": 0.3644, "losses/dpo": 0.3439791798591614, "losses/sft": 1.6578291654586792, "losses/total": 0.3439791798591614, "ref_logps/chosen": -40.1357536315918, "ref_logps/rejected": -42.285789489746094, "rewards/accuracies": 0.75, "rewards/chosen": -1.4488331079483032, "rewards/margins": 1.7298022508621216, "rewards/rejected": -3.178635358810425, "step": 1600 }, { "epoch": 1.51, "grad_norm": 36.437744140625, "learning_rate": 2.756208464498076e-07, "logps/chosen": -54.70110321044922, "logps/rejected": -62.645633697509766, "loss": 0.5619, "losses/dpo": 0.44855254888534546, "losses/sft": 1.701101303100586, "losses/total": 0.44855254888534546, "ref_logps/chosen": -35.591583251953125, "ref_logps/rejected": -33.868446350097656, "rewards/accuracies": 0.6875, "rewards/chosen": -1.910952091217041, "rewards/margins": 0.9667667746543884, "rewards/rejected": -2.877718925476074, "step": 1601 }, { "epoch": 1.51, "grad_norm": 37.80801010131836, "learning_rate": 2.754459601259181e-07, "logps/chosen": -58.61450958251953, "logps/rejected": -88.08565521240234, "loss": 0.6349, "losses/dpo": 0.6244620680809021, "losses/sft": 1.8752890825271606, "losses/total": 0.6244620680809021, "ref_logps/chosen": -38.55999755859375, "ref_logps/rejected": -56.104698181152344, "rewards/accuracies": 0.75, "rewards/chosen": -2.005451202392578, "rewards/margins": 1.1926450729370117, "rewards/rejected": -3.19809627532959, "step": 1602 }, { "epoch": 1.51, "grad_norm": 37.187156677246094, "learning_rate": 2.752710738020287e-07, "logps/chosen": -68.95501708984375, "logps/rejected": -74.0699691772461, "loss": 0.6101, "losses/dpo": 0.6041665077209473, "losses/sft": 1.6167887449264526, "losses/total": 0.6041665077209473, "ref_logps/chosen": -53.5874137878418, "ref_logps/rejected": -48.076377868652344, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5367608070373535, "rewards/margins": 1.0625981092453003, "rewards/rejected": -2.5993587970733643, "step": 1603 }, { "epoch": 1.51, "grad_norm": 24.92690086364746, "learning_rate": 2.750961874781392e-07, "logps/chosen": -48.456871032714844, "logps/rejected": -64.7651596069336, "loss": 0.406, "losses/dpo": 0.4590799808502197, "losses/sft": 2.4898979663848877, "losses/total": 0.4590799808502197, "ref_logps/chosen": -32.762123107910156, "ref_logps/rejected": -37.21025085449219, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5694751739501953, "rewards/margins": 1.1860153675079346, "rewards/rejected": -2.755490779876709, "step": 1604 }, { "epoch": 1.52, "grad_norm": 30.134885787963867, "learning_rate": 2.749213011542497e-07, "logps/chosen": -50.478050231933594, "logps/rejected": -75.0337142944336, "loss": 0.5033, "losses/dpo": 0.37894198298454285, "losses/sft": 2.071441650390625, "losses/total": 0.37894198298454285, "ref_logps/chosen": -33.355438232421875, "ref_logps/rejected": -46.22139358520508, "rewards/accuracies": 0.8125, "rewards/chosen": -1.712261438369751, "rewards/margins": 1.168969988822937, "rewards/rejected": -2.8812315464019775, "step": 1605 }, { "epoch": 1.52, "grad_norm": 26.25534439086914, "learning_rate": 2.747464148303603e-07, "logps/chosen": -46.702392578125, "logps/rejected": -62.1175651550293, "loss": 0.4033, "losses/dpo": 0.3480875790119171, "losses/sft": 1.6217899322509766, "losses/total": 0.3480875790119171, "ref_logps/chosen": -32.0451545715332, "ref_logps/rejected": -36.74039840698242, "rewards/accuracies": 0.75, "rewards/chosen": -1.4657235145568848, "rewards/margins": 1.0719932317733765, "rewards/rejected": -2.537716865539551, "step": 1606 }, { "epoch": 1.52, "grad_norm": 20.10713005065918, "learning_rate": 2.745715285064708e-07, "logps/chosen": -44.955055236816406, "logps/rejected": -77.20957946777344, "loss": 0.3435, "losses/dpo": 0.3660425543785095, "losses/sft": 1.945845365524292, "losses/total": 0.3660425543785095, "ref_logps/chosen": -32.77898025512695, "ref_logps/rejected": -50.41749572753906, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2176077365875244, "rewards/margins": 1.461601734161377, "rewards/rejected": -2.6792097091674805, "step": 1607 }, { "epoch": 1.52, "grad_norm": 20.93253517150879, "learning_rate": 2.743966421825813e-07, "logps/chosen": -52.94804382324219, "logps/rejected": -66.85520935058594, "loss": 0.378, "losses/dpo": 0.10774748027324677, "losses/sft": 1.7420648336410522, "losses/total": 0.10774748027324677, "ref_logps/chosen": -40.665367126464844, "ref_logps/rejected": -43.21400451660156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2282679080963135, "rewards/margins": 1.1358526945114136, "rewards/rejected": -2.3641204833984375, "step": 1608 }, { "epoch": 1.52, "grad_norm": 21.789220809936523, "learning_rate": 2.742217558586918e-07, "logps/chosen": -59.301334381103516, "logps/rejected": -65.62680053710938, "loss": 0.389, "losses/dpo": 0.24781490862369537, "losses/sft": 2.024040699005127, "losses/total": 0.24781490862369537, "ref_logps/chosen": -45.508140563964844, "ref_logps/rejected": -40.55152893066406, "rewards/accuracies": 0.8125, "rewards/chosen": -1.379319429397583, "rewards/margins": 1.1282069683074951, "rewards/rejected": -2.507526397705078, "step": 1609 }, { "epoch": 1.52, "grad_norm": 17.608909606933594, "learning_rate": 2.740468695348024e-07, "logps/chosen": -45.20891571044922, "logps/rejected": -88.14855194091797, "loss": 0.2216, "losses/dpo": 0.3600003123283386, "losses/sft": 2.1840012073516846, "losses/total": 0.3600003123283386, "ref_logps/chosen": -32.8494873046875, "ref_logps/rejected": -53.32741165161133, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2359428405761719, "rewards/margins": 2.246171474456787, "rewards/rejected": -3.482114315032959, "step": 1610 }, { "epoch": 1.52, "grad_norm": 27.919546127319336, "learning_rate": 2.738719832109129e-07, "logps/chosen": -41.939048767089844, "logps/rejected": -66.14915466308594, "loss": 0.4301, "losses/dpo": 0.5740477442741394, "losses/sft": 1.614914894104004, "losses/total": 0.5740477442741394, "ref_logps/chosen": -30.83543586730957, "ref_logps/rejected": -42.79560089111328, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1103609800338745, "rewards/margins": 1.224994421005249, "rewards/rejected": -2.335355520248413, "step": 1611 }, { "epoch": 1.52, "grad_norm": 25.810455322265625, "learning_rate": 2.736970968870234e-07, "logps/chosen": -52.598716735839844, "logps/rejected": -77.66197204589844, "loss": 0.335, "losses/dpo": 0.33409860730171204, "losses/sft": 1.7981247901916504, "losses/total": 0.33409860730171204, "ref_logps/chosen": -37.69508743286133, "ref_logps/rejected": -47.655296325683594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4903628826141357, "rewards/margins": 1.5103046894073486, "rewards/rejected": -3.0006678104400635, "step": 1612 }, { "epoch": 1.52, "grad_norm": 27.2560977935791, "learning_rate": 2.7352221056313397e-07, "logps/chosen": -50.28767395019531, "logps/rejected": -61.483577728271484, "loss": 0.5036, "losses/dpo": 0.6138076782226562, "losses/sft": 1.6396846771240234, "losses/total": 0.6138076782226562, "ref_logps/chosen": -37.69236755371094, "ref_logps/rejected": -36.72338104248047, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2595303058624268, "rewards/margins": 1.216489553451538, "rewards/rejected": -2.476019859313965, "step": 1613 }, { "epoch": 1.52, "grad_norm": 19.068532943725586, "learning_rate": 2.733473242392445e-07, "logps/chosen": -49.49738693237305, "logps/rejected": -75.90866088867188, "loss": 0.2989, "losses/dpo": 0.23595218360424042, "losses/sft": 2.039900779724121, "losses/total": 0.23595218360424042, "ref_logps/chosen": -34.36888885498047, "ref_logps/rejected": -45.201568603515625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.512850046157837, "rewards/margins": 1.5578593015670776, "rewards/rejected": -3.070709466934204, "step": 1614 }, { "epoch": 1.53, "grad_norm": 20.726409912109375, "learning_rate": 2.73172437915355e-07, "logps/chosen": -45.3119010925293, "logps/rejected": -67.6421127319336, "loss": 0.357, "losses/dpo": 0.5576993823051453, "losses/sft": 1.5033469200134277, "losses/total": 0.5576993823051453, "ref_logps/chosen": -30.938488006591797, "ref_logps/rejected": -38.47358703613281, "rewards/accuracies": 0.875, "rewards/chosen": -1.4373414516448975, "rewards/margins": 1.479511022567749, "rewards/rejected": -2.9168524742126465, "step": 1615 }, { "epoch": 1.53, "grad_norm": 14.721166610717773, "learning_rate": 2.729975515914655e-07, "logps/chosen": -54.51541519165039, "logps/rejected": -85.97838592529297, "loss": 0.1938, "losses/dpo": 0.15921086072921753, "losses/sft": 1.546753168106079, "losses/total": 0.15921086072921753, "ref_logps/chosen": -41.91796112060547, "ref_logps/rejected": -51.820594787597656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2597455978393555, "rewards/margins": 2.156033515930176, "rewards/rejected": -3.4157791137695312, "step": 1616 }, { "epoch": 1.53, "grad_norm": 14.837723731994629, "learning_rate": 2.728226652675761e-07, "logps/chosen": -49.117774963378906, "logps/rejected": -88.85385131835938, "loss": 0.2487, "losses/dpo": 0.2684962749481201, "losses/sft": 2.1303884983062744, "losses/total": 0.2684962749481201, "ref_logps/chosen": -37.05741882324219, "ref_logps/rejected": -53.95323944091797, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2060357332229614, "rewards/margins": 2.2840256690979004, "rewards/rejected": -3.4900612831115723, "step": 1617 }, { "epoch": 1.53, "grad_norm": 30.443340301513672, "learning_rate": 2.726477789436866e-07, "logps/chosen": -44.841835021972656, "logps/rejected": -62.176063537597656, "loss": 0.4849, "losses/dpo": 0.45843416452407837, "losses/sft": 1.3392354249954224, "losses/total": 0.45843416452407837, "ref_logps/chosen": -32.64509963989258, "ref_logps/rejected": -38.978004455566406, "rewards/accuracies": 0.75, "rewards/chosen": -1.2196736335754395, "rewards/margins": 1.100132942199707, "rewards/rejected": -2.3198065757751465, "step": 1618 }, { "epoch": 1.53, "grad_norm": 24.091711044311523, "learning_rate": 2.724728926197971e-07, "logps/chosen": -55.11222839355469, "logps/rejected": -65.99240112304688, "loss": 0.4108, "losses/dpo": 0.4813131093978882, "losses/sft": 1.751856803894043, "losses/total": 0.4813131093978882, "ref_logps/chosen": -39.802066802978516, "ref_logps/rejected": -42.212806701660156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.53101646900177, "rewards/margins": 0.8469429612159729, "rewards/rejected": -2.3779592514038086, "step": 1619 }, { "epoch": 1.53, "grad_norm": 31.9316349029541, "learning_rate": 2.7229800629590767e-07, "logps/chosen": -50.680057525634766, "logps/rejected": -77.05933380126953, "loss": 0.6088, "losses/dpo": 0.34035658836364746, "losses/sft": 1.5466454029083252, "losses/total": 0.34035658836364746, "ref_logps/chosen": -35.495094299316406, "ref_logps/rejected": -51.928260803222656, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5184965133666992, "rewards/margins": 0.9946112632751465, "rewards/rejected": -2.5131077766418457, "step": 1620 }, { "epoch": 1.53, "grad_norm": 25.840150833129883, "learning_rate": 2.721231199720182e-07, "logps/chosen": -53.757896423339844, "logps/rejected": -69.1905746459961, "loss": 0.4877, "losses/dpo": 0.4141101837158203, "losses/sft": 1.570658564567566, "losses/total": 0.4141101837158203, "ref_logps/chosen": -40.60039520263672, "ref_logps/rejected": -43.09496307373047, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3157503604888916, "rewards/margins": 1.2938103675842285, "rewards/rejected": -2.609560966491699, "step": 1621 }, { "epoch": 1.53, "grad_norm": 22.033201217651367, "learning_rate": 2.719482336481287e-07, "logps/chosen": -47.648292541503906, "logps/rejected": -65.65187072753906, "loss": 0.4527, "losses/dpo": 0.6945185661315918, "losses/sft": 1.9615315198898315, "losses/total": 0.6945185661315918, "ref_logps/chosen": -37.45869445800781, "ref_logps/rejected": -43.24253845214844, "rewards/accuracies": 0.75, "rewards/chosen": -1.0189597606658936, "rewards/margins": 1.2219738960266113, "rewards/rejected": -2.240933418273926, "step": 1622 }, { "epoch": 1.53, "grad_norm": 24.417654037475586, "learning_rate": 2.7177334732423926e-07, "logps/chosen": -57.761863708496094, "logps/rejected": -74.31767272949219, "loss": 0.3645, "losses/dpo": 0.14395514130592346, "losses/sft": 1.4621645212173462, "losses/total": 0.14395514130592346, "ref_logps/chosen": -42.51466751098633, "ref_logps/rejected": -48.25796127319336, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5247198343276978, "rewards/margins": 1.0812511444091797, "rewards/rejected": -2.605971336364746, "step": 1623 }, { "epoch": 1.53, "grad_norm": 18.523330688476562, "learning_rate": 2.7159846100034977e-07, "logps/chosen": -52.2335205078125, "logps/rejected": -81.66960144042969, "loss": 0.3326, "losses/dpo": 0.20169439911842346, "losses/sft": 1.5033025741577148, "losses/total": 0.20169439911842346, "ref_logps/chosen": -40.988014221191406, "ref_logps/rejected": -52.44248962402344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1245503425598145, "rewards/margins": 1.7981605529785156, "rewards/rejected": -2.92271089553833, "step": 1624 }, { "epoch": 1.53, "grad_norm": 21.227066040039062, "learning_rate": 2.714235746764603e-07, "logps/chosen": -44.20942306518555, "logps/rejected": -67.27243041992188, "loss": 0.3199, "losses/dpo": 0.3612087368965149, "losses/sft": 1.516041874885559, "losses/total": 0.3612087368965149, "ref_logps/chosen": -31.2517147064209, "ref_logps/rejected": -37.37519073486328, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2957711219787598, "rewards/margins": 1.693953037261963, "rewards/rejected": -2.9897239208221436, "step": 1625 }, { "epoch": 1.54, "grad_norm": 18.714885711669922, "learning_rate": 2.712486883525708e-07, "logps/chosen": -39.56517028808594, "logps/rejected": -75.67193603515625, "loss": 0.3049, "losses/dpo": 0.4981847405433655, "losses/sft": 1.671005129814148, "losses/total": 0.4981847405433655, "ref_logps/chosen": -29.99589729309082, "ref_logps/rejected": -50.809783935546875, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9569271206855774, "rewards/margins": 1.529287576675415, "rewards/rejected": -2.4862146377563477, "step": 1626 }, { "epoch": 1.54, "grad_norm": 18.798629760742188, "learning_rate": 2.7107380202868136e-07, "logps/chosen": -37.07057189941406, "logps/rejected": -50.39424133300781, "loss": 0.3917, "losses/dpo": 0.2724947929382324, "losses/sft": 1.7756692171096802, "losses/total": 0.2724947929382324, "ref_logps/chosen": -28.9957332611084, "ref_logps/rejected": -31.157318115234375, "rewards/accuracies": 0.875, "rewards/chosen": -0.8074840307235718, "rewards/margins": 1.1162086725234985, "rewards/rejected": -1.9236927032470703, "step": 1627 }, { "epoch": 1.54, "grad_norm": 26.653005599975586, "learning_rate": 2.708989157047919e-07, "logps/chosen": -60.65376281738281, "logps/rejected": -73.37484741210938, "loss": 0.4239, "losses/dpo": 0.45428338646888733, "losses/sft": 2.1997427940368652, "losses/total": 0.45428338646888733, "ref_logps/chosen": -43.859615325927734, "ref_logps/rejected": -44.42409896850586, "rewards/accuracies": 0.875, "rewards/chosen": -1.6794148683547974, "rewards/margins": 1.2156600952148438, "rewards/rejected": -2.8950748443603516, "step": 1628 }, { "epoch": 1.54, "grad_norm": 20.16916847229004, "learning_rate": 2.707240293809024e-07, "logps/chosen": -55.38658142089844, "logps/rejected": -60.91698455810547, "loss": 0.3616, "losses/dpo": 0.20826703310012817, "losses/sft": 1.8224296569824219, "losses/total": 0.20826703310012817, "ref_logps/chosen": -39.526954650878906, "ref_logps/rejected": -34.66309356689453, "rewards/accuracies": 0.875, "rewards/chosen": -1.5859628915786743, "rewards/margins": 1.0394259691238403, "rewards/rejected": -2.6253886222839355, "step": 1629 }, { "epoch": 1.54, "grad_norm": 20.150941848754883, "learning_rate": 2.7054914305701295e-07, "logps/chosen": -51.59336471557617, "logps/rejected": -88.48597717285156, "loss": 0.3071, "losses/dpo": 0.16902276873588562, "losses/sft": 2.2553014755249023, "losses/total": 0.16902276873588562, "ref_logps/chosen": -38.70440673828125, "ref_logps/rejected": -56.46617126464844, "rewards/accuracies": 0.875, "rewards/chosen": -1.288895606994629, "rewards/margins": 1.913084864616394, "rewards/rejected": -3.2019805908203125, "step": 1630 }, { "epoch": 1.54, "grad_norm": 32.25161361694336, "learning_rate": 2.7037425673312347e-07, "logps/chosen": -51.89531707763672, "logps/rejected": -66.19013977050781, "loss": 0.5398, "losses/dpo": 0.570817232131958, "losses/sft": 1.7185765504837036, "losses/total": 0.570817232131958, "ref_logps/chosen": -37.96915054321289, "ref_logps/rejected": -41.259674072265625, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3926165103912354, "rewards/margins": 1.1004307270050049, "rewards/rejected": -2.493046998977661, "step": 1631 }, { "epoch": 1.54, "grad_norm": 23.103656768798828, "learning_rate": 2.70199370409234e-07, "logps/chosen": -43.867820739746094, "logps/rejected": -60.219703674316406, "loss": 0.4104, "losses/dpo": 0.4848886728286743, "losses/sft": 1.5061545372009277, "losses/total": 0.4848886728286743, "ref_logps/chosen": -31.769641876220703, "ref_logps/rejected": -36.95215606689453, "rewards/accuracies": 0.875, "rewards/chosen": -1.2098181247711182, "rewards/margins": 1.1169370412826538, "rewards/rejected": -2.3267550468444824, "step": 1632 }, { "epoch": 1.54, "grad_norm": 23.330171585083008, "learning_rate": 2.700244840853445e-07, "logps/chosen": -77.50054168701172, "logps/rejected": -99.7005615234375, "loss": 0.3197, "losses/dpo": 0.1342008411884308, "losses/sft": 2.5591113567352295, "losses/total": 0.1342008411884308, "ref_logps/chosen": -57.61627197265625, "ref_logps/rejected": -63.15528869628906, "rewards/accuracies": 0.75, "rewards/chosen": -1.9884271621704102, "rewards/margins": 1.6661008596420288, "rewards/rejected": -3.6545281410217285, "step": 1633 }, { "epoch": 1.54, "grad_norm": 33.40924072265625, "learning_rate": 2.6984959776145506e-07, "logps/chosen": -62.27132797241211, "logps/rejected": -86.947509765625, "loss": 0.5329, "losses/dpo": 0.6631467342376709, "losses/sft": 2.3627607822418213, "losses/total": 0.6631467342376709, "ref_logps/chosen": -43.612518310546875, "ref_logps/rejected": -57.89971923828125, "rewards/accuracies": 0.6875, "rewards/chosen": -1.865881085395813, "rewards/margins": 1.0388987064361572, "rewards/rejected": -2.9047799110412598, "step": 1634 }, { "epoch": 1.54, "grad_norm": 28.160261154174805, "learning_rate": 2.6967471143756557e-07, "logps/chosen": -56.79981994628906, "logps/rejected": -83.97128295898438, "loss": 0.4519, "losses/dpo": 0.4497472047805786, "losses/sft": 2.3514842987060547, "losses/total": 0.4497472047805786, "ref_logps/chosen": -41.22658157348633, "ref_logps/rejected": -52.50547790527344, "rewards/accuracies": 0.75, "rewards/chosen": -1.557323932647705, "rewards/margins": 1.5892574787139893, "rewards/rejected": -3.1465816497802734, "step": 1635 }, { "epoch": 1.54, "grad_norm": 22.036100387573242, "learning_rate": 2.694998251136761e-07, "logps/chosen": -43.375343322753906, "logps/rejected": -76.14234924316406, "loss": 0.3429, "losses/dpo": 0.5489202737808228, "losses/sft": 1.8918821811676025, "losses/total": 0.5489202737808228, "ref_logps/chosen": -30.27825164794922, "ref_logps/rejected": -47.488224029541016, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3097093105316162, "rewards/margins": 1.5557036399841309, "rewards/rejected": -2.865412950515747, "step": 1636 }, { "epoch": 1.55, "grad_norm": 16.719146728515625, "learning_rate": 2.6932493878978665e-07, "logps/chosen": -54.67706298828125, "logps/rejected": -82.48965454101562, "loss": 0.2847, "losses/dpo": 0.2864099144935608, "losses/sft": 2.197626829147339, "losses/total": 0.2864099144935608, "ref_logps/chosen": -41.088531494140625, "ref_logps/rejected": -51.77882385253906, "rewards/accuracies": 0.875, "rewards/chosen": -1.3588535785675049, "rewards/margins": 1.7122292518615723, "rewards/rejected": -3.0710830688476562, "step": 1637 }, { "epoch": 1.55, "grad_norm": 22.877870559692383, "learning_rate": 2.6915005246589716e-07, "logps/chosen": -37.54486083984375, "logps/rejected": -51.50291442871094, "loss": 0.4067, "losses/dpo": 0.47823959589004517, "losses/sft": 2.042141914367676, "losses/total": 0.47823959589004517, "ref_logps/chosen": -26.308597564697266, "ref_logps/rejected": -30.071317672729492, "rewards/accuracies": 0.8125, "rewards/chosen": -1.123626470565796, "rewards/margins": 1.0195329189300537, "rewards/rejected": -2.1431596279144287, "step": 1638 }, { "epoch": 1.55, "grad_norm": 22.970375061035156, "learning_rate": 2.6897516614200767e-07, "logps/chosen": -48.15448760986328, "logps/rejected": -62.06604766845703, "loss": 0.4829, "losses/dpo": 0.6983827352523804, "losses/sft": 2.161217451095581, "losses/total": 0.6983827352523804, "ref_logps/chosen": -35.08259201049805, "ref_logps/rejected": -38.79847717285156, "rewards/accuracies": 0.75, "rewards/chosen": -1.30718994140625, "rewards/margins": 1.0195668935775757, "rewards/rejected": -2.3267569541931152, "step": 1639 }, { "epoch": 1.55, "grad_norm": 41.678348541259766, "learning_rate": 2.688002798181182e-07, "logps/chosen": -48.52545166015625, "logps/rejected": -63.07997512817383, "loss": 0.5523, "losses/dpo": 0.23472118377685547, "losses/sft": 1.5837814807891846, "losses/total": 0.23472118377685547, "ref_logps/chosen": -32.400516510009766, "ref_logps/rejected": -38.260520935058594, "rewards/accuracies": 0.75, "rewards/chosen": -1.6124935150146484, "rewards/margins": 0.8694522976875305, "rewards/rejected": -2.4819459915161133, "step": 1640 }, { "epoch": 1.55, "grad_norm": 28.9033145904541, "learning_rate": 2.6862539349422875e-07, "logps/chosen": -71.91659545898438, "logps/rejected": -96.31245422363281, "loss": 0.3554, "losses/dpo": 0.05449855327606201, "losses/sft": 1.3932918310165405, "losses/total": 0.05449855327606201, "ref_logps/chosen": -52.96250915527344, "ref_logps/rejected": -60.115394592285156, "rewards/accuracies": 0.875, "rewards/chosen": -1.8954088687896729, "rewards/margins": 1.7242975234985352, "rewards/rejected": -3.619706630706787, "step": 1641 }, { "epoch": 1.55, "grad_norm": 25.54038429260254, "learning_rate": 2.684505071703393e-07, "logps/chosen": -50.156150817871094, "logps/rejected": -72.30618286132812, "loss": 0.4543, "losses/dpo": 0.5111289024353027, "losses/sft": 2.578420400619507, "losses/total": 0.5111289024353027, "ref_logps/chosen": -34.52027893066406, "ref_logps/rejected": -43.94939422607422, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5635875463485718, "rewards/margins": 1.2720911502838135, "rewards/rejected": -2.835678815841675, "step": 1642 }, { "epoch": 1.55, "grad_norm": 21.259483337402344, "learning_rate": 2.682756208464498e-07, "logps/chosen": -49.59820556640625, "logps/rejected": -66.58502197265625, "loss": 0.369, "losses/dpo": 0.3234396278858185, "losses/sft": 1.5548697710037231, "losses/total": 0.3234396278858185, "ref_logps/chosen": -36.21709442138672, "ref_logps/rejected": -41.82677459716797, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3381109237670898, "rewards/margins": 1.13771390914917, "rewards/rejected": -2.4758248329162598, "step": 1643 }, { "epoch": 1.55, "grad_norm": 31.227542877197266, "learning_rate": 2.6810073452256034e-07, "logps/chosen": -45.606712341308594, "logps/rejected": -58.460350036621094, "loss": 0.4999, "losses/dpo": 0.5389057993888855, "losses/sft": 2.197465419769287, "losses/total": 0.5389057993888855, "ref_logps/chosen": -30.810718536376953, "ref_logps/rejected": -35.18593215942383, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4795989990234375, "rewards/margins": 0.8478430509567261, "rewards/rejected": -2.327441930770874, "step": 1644 }, { "epoch": 1.55, "grad_norm": 31.755725860595703, "learning_rate": 2.6792584819867085e-07, "logps/chosen": -54.021392822265625, "logps/rejected": -85.29512023925781, "loss": 0.4119, "losses/dpo": 0.22851862013339996, "losses/sft": 1.5314711332321167, "losses/total": 0.22851862013339996, "ref_logps/chosen": -37.89067840576172, "ref_logps/rejected": -53.333343505859375, "rewards/accuracies": 0.875, "rewards/chosen": -1.6130714416503906, "rewards/margins": 1.5831066370010376, "rewards/rejected": -3.1961779594421387, "step": 1645 }, { "epoch": 1.55, "grad_norm": 25.32545280456543, "learning_rate": 2.6775096187478137e-07, "logps/chosen": -51.02509689331055, "logps/rejected": -56.782466888427734, "loss": 0.5397, "losses/dpo": 0.5141239166259766, "losses/sft": 1.764752984046936, "losses/total": 0.5141239166259766, "ref_logps/chosen": -35.31363296508789, "ref_logps/rejected": -33.657630920410156, "rewards/accuracies": 0.75, "rewards/chosen": -1.5711463689804077, "rewards/margins": 0.741337776184082, "rewards/rejected": -2.3124840259552, "step": 1646 }, { "epoch": 1.56, "grad_norm": 20.92083168029785, "learning_rate": 2.675760755508919e-07, "logps/chosen": -53.08032989501953, "logps/rejected": -71.9874267578125, "loss": 0.2893, "losses/dpo": 0.13308952748775482, "losses/sft": 1.7331342697143555, "losses/total": 0.13308952748775482, "ref_logps/chosen": -39.94524383544922, "ref_logps/rejected": -41.948890686035156, "rewards/accuracies": 0.875, "rewards/chosen": -1.3135087490081787, "rewards/margins": 1.690345048904419, "rewards/rejected": -3.0038537979125977, "step": 1647 }, { "epoch": 1.56, "grad_norm": 24.365474700927734, "learning_rate": 2.6740118922700245e-07, "logps/chosen": -53.47990417480469, "logps/rejected": -75.38375854492188, "loss": 0.4943, "losses/dpo": 0.44575968384742737, "losses/sft": 2.0612235069274902, "losses/total": 0.44575968384742737, "ref_logps/chosen": -36.23787307739258, "ref_logps/rejected": -48.19116973876953, "rewards/accuracies": 0.6875, "rewards/chosen": -1.72420334815979, "rewards/margins": 0.995055079460144, "rewards/rejected": -2.7192583084106445, "step": 1648 }, { "epoch": 1.56, "grad_norm": 19.783899307250977, "learning_rate": 2.67226302903113e-07, "logps/chosen": -64.78406524658203, "logps/rejected": -82.91340637207031, "loss": 0.2551, "losses/dpo": 0.1174703985452652, "losses/sft": 1.5330079793930054, "losses/total": 0.1174703985452652, "ref_logps/chosen": -49.928321838378906, "ref_logps/rejected": -51.219703674316406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4855741262435913, "rewards/margins": 1.6837958097457886, "rewards/rejected": -3.16936993598938, "step": 1649 }, { "epoch": 1.56, "grad_norm": 18.13228988647461, "learning_rate": 2.6705141657922347e-07, "logps/chosen": -52.760826110839844, "logps/rejected": -77.49868774414062, "loss": 0.321, "losses/dpo": 0.5770677328109741, "losses/sft": 2.273347854614258, "losses/total": 0.5770677328109741, "ref_logps/chosen": -40.98771667480469, "ref_logps/rejected": -51.59224319458008, "rewards/accuracies": 0.875, "rewards/chosen": -1.1773107051849365, "rewards/margins": 1.413333535194397, "rewards/rejected": -2.590644359588623, "step": 1650 }, { "epoch": 1.56, "grad_norm": 18.541601181030273, "learning_rate": 2.6687653025533404e-07, "logps/chosen": -65.10570526123047, "logps/rejected": -84.50265502929688, "loss": 0.2826, "losses/dpo": 0.32084178924560547, "losses/sft": 2.3292148113250732, "losses/total": 0.32084178924560547, "ref_logps/chosen": -46.8994140625, "ref_logps/rejected": -51.53701400756836, "rewards/accuracies": 1.0, "rewards/chosen": -1.820629358291626, "rewards/margins": 1.475934624671936, "rewards/rejected": -3.2965638637542725, "step": 1651 }, { "epoch": 1.56, "grad_norm": 23.196929931640625, "learning_rate": 2.6670164393144455e-07, "logps/chosen": -62.20363998413086, "logps/rejected": -77.71561431884766, "loss": 0.4543, "losses/dpo": 0.6370358467102051, "losses/sft": 1.830292820930481, "losses/total": 0.6370358467102051, "ref_logps/chosen": -44.85289001464844, "ref_logps/rejected": -48.890987396240234, "rewards/accuracies": 0.6875, "rewards/chosen": -1.735074758529663, "rewards/margins": 1.1473876237869263, "rewards/rejected": -2.882462501525879, "step": 1652 }, { "epoch": 1.56, "grad_norm": 25.40617561340332, "learning_rate": 2.6652675760755506e-07, "logps/chosen": -50.79237365722656, "logps/rejected": -76.96620178222656, "loss": 0.4269, "losses/dpo": 0.5880008339881897, "losses/sft": 1.894627332687378, "losses/total": 0.5880008339881897, "ref_logps/chosen": -33.071617126464844, "ref_logps/rejected": -46.53327178955078, "rewards/accuracies": 0.875, "rewards/chosen": -1.772075891494751, "rewards/margins": 1.2712171077728271, "rewards/rejected": -3.043292999267578, "step": 1653 }, { "epoch": 1.56, "grad_norm": 18.516530990600586, "learning_rate": 2.663518712836656e-07, "logps/chosen": -53.31241226196289, "logps/rejected": -80.26522827148438, "loss": 0.2536, "losses/dpo": 0.2803356349468231, "losses/sft": 1.9406237602233887, "losses/total": 0.2803356349468231, "ref_logps/chosen": -41.60315704345703, "ref_logps/rejected": -51.36479568481445, "rewards/accuracies": 0.9375, "rewards/chosen": -1.170925259590149, "rewards/margins": 1.7191179990768433, "rewards/rejected": -2.890043258666992, "step": 1654 }, { "epoch": 1.56, "grad_norm": 19.091054916381836, "learning_rate": 2.6617698495977614e-07, "logps/chosen": -45.498844146728516, "logps/rejected": -71.2254638671875, "loss": 0.3605, "losses/dpo": 0.2558470070362091, "losses/sft": 1.3483762741088867, "losses/total": 0.2558470070362091, "ref_logps/chosen": -31.576784133911133, "ref_logps/rejected": -46.127159118652344, "rewards/accuracies": 0.875, "rewards/chosen": -1.392205834388733, "rewards/margins": 1.1176249980926514, "rewards/rejected": -2.5098307132720947, "step": 1655 }, { "epoch": 1.56, "grad_norm": 34.910762786865234, "learning_rate": 2.660020986358867e-07, "logps/chosen": -56.819129943847656, "logps/rejected": -62.712806701660156, "loss": 0.6916, "losses/dpo": 0.5331999659538269, "losses/sft": 1.8937010765075684, "losses/total": 0.5331999659538269, "ref_logps/chosen": -39.472007751464844, "ref_logps/rejected": -37.18384552001953, "rewards/accuracies": 0.5, "rewards/chosen": -1.7347123622894287, "rewards/margins": 0.818183958530426, "rewards/rejected": -2.552896499633789, "step": 1656 }, { "epoch": 1.56, "grad_norm": 26.515329360961914, "learning_rate": 2.6582721231199716e-07, "logps/chosen": -61.10515213012695, "logps/rejected": -85.40309143066406, "loss": 0.4208, "losses/dpo": 0.44674190878868103, "losses/sft": 1.8360155820846558, "losses/total": 0.44674190878868103, "ref_logps/chosen": -44.87999725341797, "ref_logps/rejected": -52.64459991455078, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6225149631500244, "rewards/margins": 1.653334140777588, "rewards/rejected": -3.2758491039276123, "step": 1657 }, { "epoch": 1.57, "grad_norm": 25.264232635498047, "learning_rate": 2.6565232598810773e-07, "logps/chosen": -44.700599670410156, "logps/rejected": -66.07460021972656, "loss": 0.4791, "losses/dpo": 0.3718392848968506, "losses/sft": 1.9944740533828735, "losses/total": 0.3718392848968506, "ref_logps/chosen": -28.533042907714844, "ref_logps/rejected": -39.207847595214844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.616755723953247, "rewards/margins": 1.0699198246002197, "rewards/rejected": -2.686675548553467, "step": 1658 }, { "epoch": 1.57, "grad_norm": 20.162647247314453, "learning_rate": 2.6547743966421824e-07, "logps/chosen": -51.169822692871094, "logps/rejected": -77.40350341796875, "loss": 0.36, "losses/dpo": 0.5586111545562744, "losses/sft": 1.6747393608093262, "losses/total": 0.5586111545562744, "ref_logps/chosen": -39.93372344970703, "ref_logps/rejected": -50.12847900390625, "rewards/accuracies": 0.75, "rewards/chosen": -1.123610019683838, "rewards/margins": 1.6038925647735596, "rewards/rejected": -2.7275025844573975, "step": 1659 }, { "epoch": 1.57, "grad_norm": 32.805057525634766, "learning_rate": 2.6530255334032876e-07, "logps/chosen": -61.154266357421875, "logps/rejected": -76.336181640625, "loss": 0.5574, "losses/dpo": 0.4673042297363281, "losses/sft": 1.8928769826889038, "losses/total": 0.4673042297363281, "ref_logps/chosen": -43.338523864746094, "ref_logps/rejected": -46.407073974609375, "rewards/accuracies": 0.5625, "rewards/chosen": -1.7815742492675781, "rewards/margins": 1.2113360166549683, "rewards/rejected": -2.992910385131836, "step": 1660 }, { "epoch": 1.57, "grad_norm": 28.616901397705078, "learning_rate": 2.6512766701643927e-07, "logps/chosen": -57.51754379272461, "logps/rejected": -72.57643127441406, "loss": 0.3595, "losses/dpo": 0.4811883568763733, "losses/sft": 1.6532338857650757, "losses/total": 0.4811883568763733, "ref_logps/chosen": -38.726295471191406, "ref_logps/rejected": -39.933048248291016, "rewards/accuracies": 0.875, "rewards/chosen": -1.8791247606277466, "rewards/margins": 1.3852133750915527, "rewards/rejected": -3.2643380165100098, "step": 1661 }, { "epoch": 1.57, "grad_norm": 14.62951374053955, "learning_rate": 2.6495278069254983e-07, "logps/chosen": -57.11421203613281, "logps/rejected": -89.01399993896484, "loss": 0.1998, "losses/dpo": 0.2647951543331146, "losses/sft": 2.361064910888672, "losses/total": 0.2647951543331146, "ref_logps/chosen": -44.09925079345703, "ref_logps/rejected": -52.31884002685547, "rewards/accuracies": 1.0, "rewards/chosen": -1.3014960289001465, "rewards/margins": 2.3680200576782227, "rewards/rejected": -3.669516086578369, "step": 1662 }, { "epoch": 1.57, "grad_norm": 22.190095901489258, "learning_rate": 2.647778943686604e-07, "logps/chosen": -41.692527770996094, "logps/rejected": -59.03276062011719, "loss": 0.4231, "losses/dpo": 0.24633584916591644, "losses/sft": 1.4761253595352173, "losses/total": 0.24633584916591644, "ref_logps/chosen": -29.166818618774414, "ref_logps/rejected": -34.794349670410156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2525708675384521, "rewards/margins": 1.171270728111267, "rewards/rejected": -2.4238414764404297, "step": 1663 }, { "epoch": 1.57, "grad_norm": 28.424457550048828, "learning_rate": 2.6460300804477086e-07, "logps/chosen": -59.99970245361328, "logps/rejected": -88.26316833496094, "loss": 0.4575, "losses/dpo": 0.7439466118812561, "losses/sft": 1.7997181415557861, "losses/total": 0.7439466118812561, "ref_logps/chosen": -39.7232551574707, "ref_logps/rejected": -54.9466667175293, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0276451110839844, "rewards/margins": 1.3040049076080322, "rewards/rejected": -3.3316500186920166, "step": 1664 }, { "epoch": 1.57, "grad_norm": 24.59447479248047, "learning_rate": 2.644281217208814e-07, "logps/chosen": -45.171573638916016, "logps/rejected": -69.65135192871094, "loss": 0.4677, "losses/dpo": 0.2128482609987259, "losses/sft": 1.6712005138397217, "losses/total": 0.2128482609987259, "ref_logps/chosen": -31.02631378173828, "ref_logps/rejected": -42.3289794921875, "rewards/accuracies": 0.75, "rewards/chosen": -1.4145258665084839, "rewards/margins": 1.317711591720581, "rewards/rejected": -2.7322373390197754, "step": 1665 }, { "epoch": 1.57, "grad_norm": 25.404090881347656, "learning_rate": 2.6425323539699194e-07, "logps/chosen": -57.798797607421875, "logps/rejected": -80.9564437866211, "loss": 0.3881, "losses/dpo": 0.3141580820083618, "losses/sft": 2.369131326675415, "losses/total": 0.3141580820083618, "ref_logps/chosen": -42.959590911865234, "ref_logps/rejected": -51.40434646606445, "rewards/accuracies": 0.9375, "rewards/chosen": -1.483920693397522, "rewards/margins": 1.4712886810302734, "rewards/rejected": -2.955209255218506, "step": 1666 }, { "epoch": 1.57, "grad_norm": 19.500904083251953, "learning_rate": 2.6407834907310245e-07, "logps/chosen": -50.6585693359375, "logps/rejected": -72.68571472167969, "loss": 0.2503, "losses/dpo": 0.250453382730484, "losses/sft": 2.021021842956543, "losses/total": 0.250453382730484, "ref_logps/chosen": -36.266387939453125, "ref_logps/rejected": -42.38813018798828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.439218282699585, "rewards/margins": 1.5905399322509766, "rewards/rejected": -3.0297582149505615, "step": 1667 }, { "epoch": 1.58, "grad_norm": 13.217467308044434, "learning_rate": 2.63903462749213e-07, "logps/chosen": -40.942283630371094, "logps/rejected": -55.480979919433594, "loss": 0.3213, "losses/dpo": 0.23293673992156982, "losses/sft": 2.005419969558716, "losses/total": 0.23293673992156982, "ref_logps/chosen": -31.410551071166992, "ref_logps/rejected": -32.45996856689453, "rewards/accuracies": 0.875, "rewards/chosen": -0.953173041343689, "rewards/margins": 1.3489274978637695, "rewards/rejected": -2.302100658416748, "step": 1668 }, { "epoch": 1.58, "grad_norm": 16.91592025756836, "learning_rate": 2.6372857642532353e-07, "logps/chosen": -40.017459869384766, "logps/rejected": -72.77808380126953, "loss": 0.306, "losses/dpo": 0.19263622164726257, "losses/sft": 1.4955772161483765, "losses/total": 0.19263622164726257, "ref_logps/chosen": -32.276084899902344, "ref_logps/rejected": -49.59219741821289, "rewards/accuracies": 0.875, "rewards/chosen": -0.7741374373435974, "rewards/margins": 1.5444512367248535, "rewards/rejected": -2.3185884952545166, "step": 1669 }, { "epoch": 1.58, "grad_norm": 31.049604415893555, "learning_rate": 2.635536901014341e-07, "logps/chosen": -54.75788879394531, "logps/rejected": -63.67683410644531, "loss": 0.5912, "losses/dpo": 0.6454762816429138, "losses/sft": 2.1500191688537598, "losses/total": 0.6454762816429138, "ref_logps/chosen": -37.41011047363281, "ref_logps/rejected": -37.62797546386719, "rewards/accuracies": 0.625, "rewards/chosen": -1.7347784042358398, "rewards/margins": 0.8701076507568359, "rewards/rejected": -2.604886054992676, "step": 1670 }, { "epoch": 1.58, "grad_norm": 22.56049919128418, "learning_rate": 2.6337880377754455e-07, "logps/chosen": -43.30486297607422, "logps/rejected": -70.1745834350586, "loss": 0.5217, "losses/dpo": 0.5436393022537231, "losses/sft": 1.8744052648544312, "losses/total": 0.5436393022537231, "ref_logps/chosen": -32.65325164794922, "ref_logps/rejected": -49.66826629638672, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0651612281799316, "rewards/margins": 0.9854701161384583, "rewards/rejected": -2.050631284713745, "step": 1671 }, { "epoch": 1.58, "grad_norm": 21.70183563232422, "learning_rate": 2.632039174536551e-07, "logps/chosen": -44.057373046875, "logps/rejected": -65.52379608154297, "loss": 0.3543, "losses/dpo": 0.5216940641403198, "losses/sft": 1.9431930780410767, "losses/total": 0.5216940641403198, "ref_logps/chosen": -31.411155700683594, "ref_logps/rejected": -37.00748825073242, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2646220922470093, "rewards/margins": 1.5870087146759033, "rewards/rejected": -2.851630687713623, "step": 1672 }, { "epoch": 1.58, "grad_norm": 27.148149490356445, "learning_rate": 2.6302903112976563e-07, "logps/chosen": -50.910274505615234, "logps/rejected": -69.40679931640625, "loss": 0.4255, "losses/dpo": 0.9526625871658325, "losses/sft": 2.315486192703247, "losses/total": 0.9526625871658325, "ref_logps/chosen": -34.414894104003906, "ref_logps/rejected": -37.466373443603516, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6495378017425537, "rewards/margins": 1.544505000114441, "rewards/rejected": -3.194042682647705, "step": 1673 }, { "epoch": 1.58, "grad_norm": 21.605085372924805, "learning_rate": 2.6285414480587614e-07, "logps/chosen": -56.68227005004883, "logps/rejected": -72.84638977050781, "loss": 0.3459, "losses/dpo": 0.34026867151260376, "losses/sft": 2.0914828777313232, "losses/total": 0.34026867151260376, "ref_logps/chosen": -40.51249694824219, "ref_logps/rejected": -44.0089111328125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6169772148132324, "rewards/margins": 1.266770362854004, "rewards/rejected": -2.8837475776672363, "step": 1674 }, { "epoch": 1.58, "grad_norm": 34.19541549682617, "learning_rate": 2.626792584819867e-07, "logps/chosen": -52.255653381347656, "logps/rejected": -67.04866027832031, "loss": 0.5724, "losses/dpo": 0.5703568458557129, "losses/sft": 1.9263702630996704, "losses/total": 0.5703568458557129, "ref_logps/chosen": -37.738319396972656, "ref_logps/rejected": -44.528045654296875, "rewards/accuracies": 0.75, "rewards/chosen": -1.4517334699630737, "rewards/margins": 0.8003284931182861, "rewards/rejected": -2.2520618438720703, "step": 1675 }, { "epoch": 1.58, "grad_norm": 22.78449058532715, "learning_rate": 2.625043721580972e-07, "logps/chosen": -49.107826232910156, "logps/rejected": -69.46508026123047, "loss": 0.3994, "losses/dpo": 0.26648449897766113, "losses/sft": 1.4606882333755493, "losses/total": 0.26648449897766113, "ref_logps/chosen": -37.83285903930664, "ref_logps/rejected": -45.494712829589844, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1274969577789307, "rewards/margins": 1.2695398330688477, "rewards/rejected": -2.397036552429199, "step": 1676 }, { "epoch": 1.58, "grad_norm": 18.713668823242188, "learning_rate": 2.623294858342078e-07, "logps/chosen": -45.18793869018555, "logps/rejected": -84.1526107788086, "loss": 0.317, "losses/dpo": 0.2993185520172119, "losses/sft": 1.3882665634155273, "losses/total": 0.2993185520172119, "ref_logps/chosen": -30.645986557006836, "ref_logps/rejected": -53.584232330322266, "rewards/accuracies": 0.9375, "rewards/chosen": -1.454195261001587, "rewards/margins": 1.6026426553726196, "rewards/rejected": -3.056837797164917, "step": 1677 }, { "epoch": 1.58, "grad_norm": 16.389495849609375, "learning_rate": 2.6215459951031825e-07, "logps/chosen": -36.11703872680664, "logps/rejected": -75.21994018554688, "loss": 0.2332, "losses/dpo": 0.28319504857063293, "losses/sft": 1.540827751159668, "losses/total": 0.28319504857063293, "ref_logps/chosen": -26.941329956054688, "ref_logps/rejected": -48.799434661865234, "rewards/accuracies": 1.0, "rewards/chosen": -0.9175708889961243, "rewards/margins": 1.7244800329208374, "rewards/rejected": -2.6420507431030273, "step": 1678 }, { "epoch": 1.59, "grad_norm": 19.882858276367188, "learning_rate": 2.619797131864288e-07, "logps/chosen": -49.78032302856445, "logps/rejected": -84.12419891357422, "loss": 0.2925, "losses/dpo": 0.1579287052154541, "losses/sft": 1.5552613735198975, "losses/total": 0.1579287052154541, "ref_logps/chosen": -35.98012924194336, "ref_logps/rejected": -52.107017517089844, "rewards/accuracies": 0.875, "rewards/chosen": -1.3800195455551147, "rewards/margins": 1.8216983079910278, "rewards/rejected": -3.2017178535461426, "step": 1679 }, { "epoch": 1.59, "grad_norm": 28.187620162963867, "learning_rate": 2.618048268625393e-07, "logps/chosen": -51.121551513671875, "logps/rejected": -71.55033111572266, "loss": 0.3887, "losses/dpo": 0.23457235097885132, "losses/sft": 2.2999722957611084, "losses/total": 0.23457235097885132, "ref_logps/chosen": -35.76118850708008, "ref_logps/rejected": -39.54771423339844, "rewards/accuracies": 0.875, "rewards/chosen": -1.5360362529754639, "rewards/margins": 1.6642255783081055, "rewards/rejected": -3.2002618312835693, "step": 1680 }, { "epoch": 1.59, "grad_norm": 26.547409057617188, "learning_rate": 2.6162994053864984e-07, "logps/chosen": -40.57977294921875, "logps/rejected": -59.65150451660156, "loss": 0.5212, "losses/dpo": 0.30037161707878113, "losses/sft": 1.3291321992874146, "losses/total": 0.30037161707878113, "ref_logps/chosen": -29.0831298828125, "ref_logps/rejected": -34.76066589355469, "rewards/accuracies": 0.75, "rewards/chosen": -1.1496641635894775, "rewards/margins": 1.339420199394226, "rewards/rejected": -2.489084243774414, "step": 1681 }, { "epoch": 1.59, "grad_norm": 16.880813598632812, "learning_rate": 2.614550542147604e-07, "logps/chosen": -41.66114807128906, "logps/rejected": -65.62359619140625, "loss": 0.3193, "losses/dpo": 0.536454439163208, "losses/sft": 1.594664454460144, "losses/total": 0.536454439163208, "ref_logps/chosen": -31.709991455078125, "ref_logps/rejected": -38.73860168457031, "rewards/accuracies": 0.875, "rewards/chosen": -0.9951153993606567, "rewards/margins": 1.6933845281600952, "rewards/rejected": -2.688499927520752, "step": 1682 }, { "epoch": 1.59, "grad_norm": 22.443984985351562, "learning_rate": 2.612801678908709e-07, "logps/chosen": -59.49350357055664, "logps/rejected": -79.27086639404297, "loss": 0.3134, "losses/dpo": 0.24396274983882904, "losses/sft": 2.112079620361328, "losses/total": 0.24396274983882904, "ref_logps/chosen": -43.74391174316406, "ref_logps/rejected": -49.0337028503418, "rewards/accuracies": 0.875, "rewards/chosen": -1.5749597549438477, "rewards/margins": 1.448757290840149, "rewards/rejected": -3.023717164993286, "step": 1683 }, { "epoch": 1.59, "grad_norm": 21.76535987854004, "learning_rate": 2.611052815669815e-07, "logps/chosen": -62.51936721801758, "logps/rejected": -90.518798828125, "loss": 0.3522, "losses/dpo": 0.1277112364768982, "losses/sft": 2.0342190265655518, "losses/total": 0.1277112364768982, "ref_logps/chosen": -44.36508560180664, "ref_logps/rejected": -56.39969253540039, "rewards/accuracies": 0.875, "rewards/chosen": -1.8154282569885254, "rewards/margins": 1.5964818000793457, "rewards/rejected": -3.411910057067871, "step": 1684 }, { "epoch": 1.59, "grad_norm": 20.25614356994629, "learning_rate": 2.6093039524309194e-07, "logps/chosen": -42.00189208984375, "logps/rejected": -63.54865264892578, "loss": 0.3939, "losses/dpo": 0.6064974665641785, "losses/sft": 1.8426246643066406, "losses/total": 0.6064974665641785, "ref_logps/chosen": -30.157485961914062, "ref_logps/rejected": -38.62895965576172, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1844403743743896, "rewards/margins": 1.3075294494628906, "rewards/rejected": -2.4919698238372803, "step": 1685 }, { "epoch": 1.59, "grad_norm": 24.392690658569336, "learning_rate": 2.607555089192025e-07, "logps/chosen": -55.11760330200195, "logps/rejected": -78.53960418701172, "loss": 0.4283, "losses/dpo": 0.4085690379142761, "losses/sft": 2.050025224685669, "losses/total": 0.4085690379142761, "ref_logps/chosen": -41.50851058959961, "ref_logps/rejected": -53.802818298339844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3609095811843872, "rewards/margins": 1.1127686500549316, "rewards/rejected": -2.4736781120300293, "step": 1686 }, { "epoch": 1.59, "grad_norm": 19.779436111450195, "learning_rate": 2.605806225953131e-07, "logps/chosen": -46.925655364990234, "logps/rejected": -76.889892578125, "loss": 0.2828, "losses/dpo": 0.3575228154659271, "losses/sft": 1.8481152057647705, "losses/total": 0.3575228154659271, "ref_logps/chosen": -35.66778564453125, "ref_logps/rejected": -49.306495666503906, "rewards/accuracies": 0.875, "rewards/chosen": -1.1257871389389038, "rewards/margins": 1.6325528621673584, "rewards/rejected": -2.7583398818969727, "step": 1687 }, { "epoch": 1.59, "grad_norm": 20.928817749023438, "learning_rate": 2.6040573627142353e-07, "logps/chosen": -42.58091354370117, "logps/rejected": -69.19400024414062, "loss": 0.4106, "losses/dpo": 0.26089322566986084, "losses/sft": 1.6918455362319946, "losses/total": 0.26089322566986084, "ref_logps/chosen": -31.497961044311523, "ref_logps/rejected": -46.74159240722656, "rewards/accuracies": 0.8125, "rewards/chosen": -1.108295202255249, "rewards/margins": 1.1369454860687256, "rewards/rejected": -2.2452406883239746, "step": 1688 }, { "epoch": 1.59, "grad_norm": 21.269784927368164, "learning_rate": 2.602308499475341e-07, "logps/chosen": -60.73674011230469, "logps/rejected": -78.62924194335938, "loss": 0.3499, "losses/dpo": 0.3991255760192871, "losses/sft": 2.6420505046844482, "losses/total": 0.3991255760192871, "ref_logps/chosen": -42.557193756103516, "ref_logps/rejected": -45.440250396728516, "rewards/accuracies": 0.875, "rewards/chosen": -1.8179550170898438, "rewards/margins": 1.5009442567825317, "rewards/rejected": -3.318899154663086, "step": 1689 }, { "epoch": 1.6, "grad_norm": 25.46879005432129, "learning_rate": 2.600559636236446e-07, "logps/chosen": -53.05203628540039, "logps/rejected": -69.90511322021484, "loss": 0.526, "losses/dpo": 0.5232917070388794, "losses/sft": 1.971091628074646, "losses/total": 0.5232917070388794, "ref_logps/chosen": -37.96042251586914, "ref_logps/rejected": -44.83235168457031, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5091618299484253, "rewards/margins": 0.9981141090393066, "rewards/rejected": -2.5072760581970215, "step": 1690 }, { "epoch": 1.6, "grad_norm": 19.873798370361328, "learning_rate": 2.598810772997552e-07, "logps/chosen": -37.886085510253906, "logps/rejected": -65.10121154785156, "loss": 0.3292, "losses/dpo": 0.15399359166622162, "losses/sft": 2.109825611114502, "losses/total": 0.15399359166622162, "ref_logps/chosen": -21.938209533691406, "ref_logps/rejected": -35.62663269042969, "rewards/accuracies": 0.875, "rewards/chosen": -1.59478759765625, "rewards/margins": 1.352670669555664, "rewards/rejected": -2.947458267211914, "step": 1691 }, { "epoch": 1.6, "grad_norm": 30.11886215209961, "learning_rate": 2.5970619097586564e-07, "logps/chosen": -61.93041229248047, "logps/rejected": -73.73487854003906, "loss": 0.5023, "losses/dpo": 0.4248866140842438, "losses/sft": 1.7431190013885498, "losses/total": 0.4248866140842438, "ref_logps/chosen": -44.71243667602539, "ref_logps/rejected": -45.64496994018555, "rewards/accuracies": 0.75, "rewards/chosen": -1.7217977046966553, "rewards/margins": 1.087193489074707, "rewards/rejected": -2.808990955352783, "step": 1692 }, { "epoch": 1.6, "grad_norm": 27.565887451171875, "learning_rate": 2.595313046519762e-07, "logps/chosen": -48.117340087890625, "logps/rejected": -63.55048751831055, "loss": 0.4001, "losses/dpo": 0.8755173087120056, "losses/sft": 1.6353271007537842, "losses/total": 0.8755173087120056, "ref_logps/chosen": -35.32022476196289, "ref_logps/rejected": -39.41569137573242, "rewards/accuracies": 0.875, "rewards/chosen": -1.2797114849090576, "rewards/margins": 1.1337684392929077, "rewards/rejected": -2.413480043411255, "step": 1693 }, { "epoch": 1.6, "grad_norm": 25.21326446533203, "learning_rate": 2.5935641832808677e-07, "logps/chosen": -46.920196533203125, "logps/rejected": -59.816200256347656, "loss": 0.4287, "losses/dpo": 0.43476080894470215, "losses/sft": 2.567338228225708, "losses/total": 0.43476080894470215, "ref_logps/chosen": -33.12450408935547, "ref_logps/rejected": -36.84740447998047, "rewards/accuracies": 0.875, "rewards/chosen": -1.37956964969635, "rewards/margins": 0.9173097610473633, "rewards/rejected": -2.296879291534424, "step": 1694 }, { "epoch": 1.6, "grad_norm": 27.18939971923828, "learning_rate": 2.5918153200419723e-07, "logps/chosen": -55.09325408935547, "logps/rejected": -64.12335968017578, "loss": 0.4843, "losses/dpo": 0.21715635061264038, "losses/sft": 1.8934741020202637, "losses/total": 0.21715635061264038, "ref_logps/chosen": -40.22957992553711, "ref_logps/rejected": -38.47683334350586, "rewards/accuracies": 0.75, "rewards/chosen": -1.4863673448562622, "rewards/margins": 1.0782852172851562, "rewards/rejected": -2.564652442932129, "step": 1695 }, { "epoch": 1.6, "grad_norm": 13.418107986450195, "learning_rate": 2.590066456803078e-07, "logps/chosen": -43.864192962646484, "logps/rejected": -69.68255615234375, "loss": 0.2231, "losses/dpo": 0.21122479438781738, "losses/sft": 1.6132981777191162, "losses/total": 0.21122479438781738, "ref_logps/chosen": -33.222068786621094, "ref_logps/rejected": -39.89115524291992, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0642125606536865, "rewards/margins": 1.9149274826049805, "rewards/rejected": -2.979140281677246, "step": 1696 }, { "epoch": 1.6, "grad_norm": 24.44149398803711, "learning_rate": 2.588317593564183e-07, "logps/chosen": -50.114990234375, "logps/rejected": -73.626220703125, "loss": 0.5198, "losses/dpo": 0.5436519980430603, "losses/sft": 1.7331504821777344, "losses/total": 0.5436519980430603, "ref_logps/chosen": -36.66084289550781, "ref_logps/rejected": -49.82818603515625, "rewards/accuracies": 0.75, "rewards/chosen": -1.3454147577285767, "rewards/margins": 1.0343890190124512, "rewards/rejected": -2.3798036575317383, "step": 1697 }, { "epoch": 1.6, "grad_norm": 14.38016128540039, "learning_rate": 2.5865687303252887e-07, "logps/chosen": -54.12076187133789, "logps/rejected": -78.1153564453125, "loss": 0.2204, "losses/dpo": 0.20779266953468323, "losses/sft": 2.3309385776519775, "losses/total": 0.20779266953468323, "ref_logps/chosen": -39.74736785888672, "ref_logps/rejected": -42.970306396484375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4373393058776855, "rewards/margins": 2.0771656036376953, "rewards/rejected": -3.51450514793396, "step": 1698 }, { "epoch": 1.6, "grad_norm": 23.805593490600586, "learning_rate": 2.5848198670863933e-07, "logps/chosen": -51.08177947998047, "logps/rejected": -76.69293975830078, "loss": 0.437, "losses/dpo": 0.5563952326774597, "losses/sft": 2.206334352493286, "losses/total": 0.5563952326774597, "ref_logps/chosen": -35.916412353515625, "ref_logps/rejected": -49.044219970703125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5165367126464844, "rewards/margins": 1.248335361480713, "rewards/rejected": -2.7648720741271973, "step": 1699 }, { "epoch": 1.61, "grad_norm": 29.400575637817383, "learning_rate": 2.583071003847499e-07, "logps/chosen": -61.05107879638672, "logps/rejected": -70.8686752319336, "loss": 0.5572, "losses/dpo": 0.2740618586540222, "losses/sft": 1.9630038738250732, "losses/total": 0.2740618586540222, "ref_logps/chosen": -45.12006378173828, "ref_logps/rejected": -43.93709182739258, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5931012630462646, "rewards/margins": 1.1000571250915527, "rewards/rejected": -2.6931581497192383, "step": 1700 }, { "epoch": 1.61, "grad_norm": 23.972097396850586, "learning_rate": 2.5813221406086046e-07, "logps/chosen": -58.94755172729492, "logps/rejected": -86.1441879272461, "loss": 0.3476, "losses/dpo": 0.3983757197856903, "losses/sft": 2.3936402797698975, "losses/total": 0.3983757197856903, "ref_logps/chosen": -43.79438781738281, "ref_logps/rejected": -56.778839111328125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5153162479400635, "rewards/margins": 1.4212180376052856, "rewards/rejected": -2.9365344047546387, "step": 1701 }, { "epoch": 1.61, "grad_norm": 31.20714569091797, "learning_rate": 2.579573277369709e-07, "logps/chosen": -51.1915283203125, "logps/rejected": -68.49287414550781, "loss": 0.5542, "losses/dpo": 0.6483299732208252, "losses/sft": 2.033721685409546, "losses/total": 0.6483299732208252, "ref_logps/chosen": -38.686920166015625, "ref_logps/rejected": -42.646610260009766, "rewards/accuracies": 0.75, "rewards/chosen": -1.2504607439041138, "rewards/margins": 1.3341654539108276, "rewards/rejected": -2.5846261978149414, "step": 1702 }, { "epoch": 1.61, "grad_norm": 21.918899536132812, "learning_rate": 2.577824414130815e-07, "logps/chosen": -47.61374282836914, "logps/rejected": -63.86582946777344, "loss": 0.4125, "losses/dpo": 0.46828797459602356, "losses/sft": 1.3913002014160156, "losses/total": 0.46828797459602356, "ref_logps/chosen": -32.2750358581543, "ref_logps/rejected": -37.893798828125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5338705778121948, "rewards/margins": 1.0633323192596436, "rewards/rejected": -2.597202777862549, "step": 1703 }, { "epoch": 1.61, "grad_norm": 19.991924285888672, "learning_rate": 2.57607555089192e-07, "logps/chosen": -50.15476989746094, "logps/rejected": -62.726287841796875, "loss": 0.3294, "losses/dpo": 0.21432074904441833, "losses/sft": 1.2985972166061401, "losses/total": 0.21432074904441833, "ref_logps/chosen": -36.133060455322266, "ref_logps/rejected": -34.09256362915039, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4021708965301514, "rewards/margins": 1.4612014293670654, "rewards/rejected": -2.863372325897217, "step": 1704 }, { "epoch": 1.61, "grad_norm": 20.620197296142578, "learning_rate": 2.5743266876530257e-07, "logps/chosen": -45.40034103393555, "logps/rejected": -56.69420623779297, "loss": 0.4422, "losses/dpo": 0.40340331196784973, "losses/sft": 1.82687509059906, "losses/total": 0.40340331196784973, "ref_logps/chosen": -33.02241897583008, "ref_logps/rejected": -34.886810302734375, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2377923727035522, "rewards/margins": 0.9429475665092468, "rewards/rejected": -2.1807398796081543, "step": 1705 }, { "epoch": 1.61, "grad_norm": 24.2882022857666, "learning_rate": 2.57257782441413e-07, "logps/chosen": -45.427650451660156, "logps/rejected": -69.88075256347656, "loss": 0.47, "losses/dpo": 0.3295285105705261, "losses/sft": 2.1115331649780273, "losses/total": 0.3295285105705261, "ref_logps/chosen": -30.624160766601562, "ref_logps/rejected": -43.11790466308594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.480348825454712, "rewards/margins": 1.1959362030029297, "rewards/rejected": -2.6762850284576416, "step": 1706 }, { "epoch": 1.61, "grad_norm": 21.574094772338867, "learning_rate": 2.570828961175236e-07, "logps/chosen": -42.071937561035156, "logps/rejected": -82.47891235351562, "loss": 0.3394, "losses/dpo": 0.1716458797454834, "losses/sft": 1.3351582288742065, "losses/total": 0.1716458797454834, "ref_logps/chosen": -32.68022918701172, "ref_logps/rejected": -55.13825988769531, "rewards/accuracies": 0.875, "rewards/chosen": -0.9391703605651855, "rewards/margins": 1.7948949337005615, "rewards/rejected": -2.734065294265747, "step": 1707 }, { "epoch": 1.61, "grad_norm": 21.97496223449707, "learning_rate": 2.5690800979363416e-07, "logps/chosen": -48.086212158203125, "logps/rejected": -77.50769805908203, "loss": 0.3312, "losses/dpo": 0.23380061984062195, "losses/sft": 1.7336677312850952, "losses/total": 0.23380061984062195, "ref_logps/chosen": -36.06480407714844, "ref_logps/rejected": -49.543487548828125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2021414041519165, "rewards/margins": 1.5942800045013428, "rewards/rejected": -2.7964212894439697, "step": 1708 }, { "epoch": 1.61, "grad_norm": 20.616004943847656, "learning_rate": 2.567331234697446e-07, "logps/chosen": -45.64478302001953, "logps/rejected": -59.34964370727539, "loss": 0.489, "losses/dpo": 0.5231375694274902, "losses/sft": 2.4794270992279053, "losses/total": 0.5231375694274902, "ref_logps/chosen": -31.097517013549805, "ref_logps/rejected": -33.70814514160156, "rewards/accuracies": 0.75, "rewards/chosen": -1.4547265768051147, "rewards/margins": 1.1094229221343994, "rewards/rejected": -2.5641493797302246, "step": 1709 }, { "epoch": 1.61, "grad_norm": 41.78421401977539, "learning_rate": 2.565582371458552e-07, "logps/chosen": -53.465431213378906, "logps/rejected": -66.07946014404297, "loss": 0.8232, "losses/dpo": 0.3527677059173584, "losses/sft": 1.5076885223388672, "losses/total": 0.3527677059173584, "ref_logps/chosen": -36.02337646484375, "ref_logps/rejected": -42.72295379638672, "rewards/accuracies": 0.5625, "rewards/chosen": -1.7442052364349365, "rewards/margins": 0.5914453864097595, "rewards/rejected": -2.3356504440307617, "step": 1710 }, { "epoch": 1.62, "grad_norm": 26.656536102294922, "learning_rate": 2.563833508219657e-07, "logps/chosen": -64.93763732910156, "logps/rejected": -102.15889739990234, "loss": 0.3415, "losses/dpo": 0.388261079788208, "losses/sft": 1.999680519104004, "losses/total": 0.388261079788208, "ref_logps/chosen": -47.505767822265625, "ref_logps/rejected": -68.66729736328125, "rewards/accuracies": 0.875, "rewards/chosen": -1.7431871891021729, "rewards/margins": 1.6059727668762207, "rewards/rejected": -3.3491601943969727, "step": 1711 }, { "epoch": 1.62, "grad_norm": 21.65997886657715, "learning_rate": 2.5620846449807626e-07, "logps/chosen": -50.296051025390625, "logps/rejected": -63.11650085449219, "loss": 0.4245, "losses/dpo": 0.29651695489883423, "losses/sft": 1.7380163669586182, "losses/total": 0.29651695489883423, "ref_logps/chosen": -36.176063537597656, "ref_logps/rejected": -36.67198944091797, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4119987487792969, "rewards/margins": 1.2324522733688354, "rewards/rejected": -2.644451141357422, "step": 1712 }, { "epoch": 1.62, "grad_norm": 30.8131160736084, "learning_rate": 2.5603357817418677e-07, "logps/chosen": -58.23980712890625, "logps/rejected": -76.20166778564453, "loss": 0.5437, "losses/dpo": 0.6521898508071899, "losses/sft": 2.064173936843872, "losses/total": 0.6521898508071899, "ref_logps/chosen": -41.63606262207031, "ref_logps/rejected": -51.62317657470703, "rewards/accuracies": 0.6875, "rewards/chosen": -1.660374641418457, "rewards/margins": 0.7974750399589539, "rewards/rejected": -2.4578497409820557, "step": 1713 }, { "epoch": 1.62, "grad_norm": 30.45929718017578, "learning_rate": 2.558586918502973e-07, "logps/chosen": -50.106475830078125, "logps/rejected": -60.24506378173828, "loss": 0.5065, "losses/dpo": 0.42407292127609253, "losses/sft": 1.9886914491653442, "losses/total": 0.42407292127609253, "ref_logps/chosen": -36.871726989746094, "ref_logps/rejected": -36.309322357177734, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3234748840332031, "rewards/margins": 1.0700993537902832, "rewards/rejected": -2.3935742378234863, "step": 1714 }, { "epoch": 1.62, "grad_norm": 22.826793670654297, "learning_rate": 2.5568380552640785e-07, "logps/chosen": -48.91212463378906, "logps/rejected": -75.21824645996094, "loss": 0.3376, "losses/dpo": 0.5897175669670105, "losses/sft": 2.2067716121673584, "losses/total": 0.5897175669670105, "ref_logps/chosen": -33.7586669921875, "ref_logps/rejected": -47.553443908691406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5153456926345825, "rewards/margins": 1.2511353492736816, "rewards/rejected": -2.7664811611175537, "step": 1715 }, { "epoch": 1.62, "grad_norm": 14.791064262390137, "learning_rate": 2.555089192025183e-07, "logps/chosen": -52.70648956298828, "logps/rejected": -71.68559265136719, "loss": 0.2707, "losses/dpo": 0.2725614607334137, "losses/sft": 1.9950833320617676, "losses/total": 0.2725614607334137, "ref_logps/chosen": -39.049720764160156, "ref_logps/rejected": -43.878639221191406, "rewards/accuracies": 1.0, "rewards/chosen": -1.3656775951385498, "rewards/margins": 1.41501784324646, "rewards/rejected": -2.7806954383850098, "step": 1716 }, { "epoch": 1.62, "grad_norm": 18.773130416870117, "learning_rate": 2.553340328786289e-07, "logps/chosen": -43.274932861328125, "logps/rejected": -56.1312370300293, "loss": 0.4545, "losses/dpo": 0.2814927101135254, "losses/sft": 1.5184011459350586, "losses/total": 0.2814927101135254, "ref_logps/chosen": -30.005794525146484, "ref_logps/rejected": -32.11858367919922, "rewards/accuracies": 0.8125, "rewards/chosen": -1.326913833618164, "rewards/margins": 1.0743515491485596, "rewards/rejected": -2.4012653827667236, "step": 1717 }, { "epoch": 1.62, "grad_norm": 24.9932918548584, "learning_rate": 2.551591465547394e-07, "logps/chosen": -51.914886474609375, "logps/rejected": -59.898231506347656, "loss": 0.5483, "losses/dpo": 0.7883964776992798, "losses/sft": 3.383769989013672, "losses/total": 0.7883964776992798, "ref_logps/chosen": -34.69055938720703, "ref_logps/rejected": -36.196292877197266, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7224326133728027, "rewards/margins": 0.6477614641189575, "rewards/rejected": -2.3701939582824707, "step": 1718 }, { "epoch": 1.62, "grad_norm": 22.577491760253906, "learning_rate": 2.5498426023084995e-07, "logps/chosen": -42.70201110839844, "logps/rejected": -65.31559753417969, "loss": 0.4347, "losses/dpo": 0.5611782073974609, "losses/sft": 2.379319667816162, "losses/total": 0.5611782073974609, "ref_logps/chosen": -31.390506744384766, "ref_logps/rejected": -40.67528533935547, "rewards/accuracies": 0.875, "rewards/chosen": -1.1311503648757935, "rewards/margins": 1.3328800201416016, "rewards/rejected": -2.4640302658081055, "step": 1719 }, { "epoch": 1.62, "grad_norm": 20.03665542602539, "learning_rate": 2.5480937390696047e-07, "logps/chosen": -42.88938522338867, "logps/rejected": -61.84242248535156, "loss": 0.3576, "losses/dpo": 0.3116549849510193, "losses/sft": 1.7542328834533691, "losses/total": 0.3116549849510193, "ref_logps/chosen": -31.158885955810547, "ref_logps/rejected": -38.53749084472656, "rewards/accuracies": 0.875, "rewards/chosen": -1.173049807548523, "rewards/margins": 1.157443642616272, "rewards/rejected": -2.330493450164795, "step": 1720 }, { "epoch": 1.63, "grad_norm": 18.225561141967773, "learning_rate": 2.54634487583071e-07, "logps/chosen": -48.095458984375, "logps/rejected": -86.5268783569336, "loss": 0.299, "losses/dpo": 0.16750237345695496, "losses/sft": 1.5937292575836182, "losses/total": 0.16750237345695496, "ref_logps/chosen": -35.68586730957031, "ref_logps/rejected": -59.16970443725586, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2409590482711792, "rewards/margins": 1.4947586059570312, "rewards/rejected": -2.735717535018921, "step": 1721 }, { "epoch": 1.63, "grad_norm": 33.022579193115234, "learning_rate": 2.5445960125918155e-07, "logps/chosen": -58.868778228759766, "logps/rejected": -59.140811920166016, "loss": 0.6316, "losses/dpo": 0.2693566679954529, "losses/sft": 1.7555820941925049, "losses/total": 0.2693566679954529, "ref_logps/chosen": -42.4088134765625, "ref_logps/rejected": -33.35874557495117, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6459965705871582, "rewards/margins": 0.9322102069854736, "rewards/rejected": -2.5782065391540527, "step": 1722 }, { "epoch": 1.63, "grad_norm": 25.231719970703125, "learning_rate": 2.54284714935292e-07, "logps/chosen": -53.8870849609375, "logps/rejected": -73.10115814208984, "loss": 0.5203, "losses/dpo": 0.18047204613685608, "losses/sft": 2.002629280090332, "losses/total": 0.18047204613685608, "ref_logps/chosen": -38.624595642089844, "ref_logps/rejected": -48.08985900878906, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5262489318847656, "rewards/margins": 0.9748811721801758, "rewards/rejected": -2.5011301040649414, "step": 1723 }, { "epoch": 1.63, "grad_norm": 20.354001998901367, "learning_rate": 2.5410982861140257e-07, "logps/chosen": -57.29927062988281, "logps/rejected": -60.07427215576172, "loss": 0.4153, "losses/dpo": 0.4300401508808136, "losses/sft": 1.3612242937088013, "losses/total": 0.4300401508808136, "ref_logps/chosen": -43.414794921875, "ref_logps/rejected": -34.77241516113281, "rewards/accuracies": 0.75, "rewards/chosen": -1.388447642326355, "rewards/margins": 1.1417381763458252, "rewards/rejected": -2.5301856994628906, "step": 1724 }, { "epoch": 1.63, "grad_norm": 17.782976150512695, "learning_rate": 2.539349422875131e-07, "logps/chosen": -42.95454788208008, "logps/rejected": -68.92463684082031, "loss": 0.305, "losses/dpo": 0.427029013633728, "losses/sft": 1.9913402795791626, "losses/total": 0.427029013633728, "ref_logps/chosen": -31.01287078857422, "ref_logps/rejected": -39.82344055175781, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1941677331924438, "rewards/margins": 1.7159514427185059, "rewards/rejected": -2.9101195335388184, "step": 1725 }, { "epoch": 1.63, "grad_norm": 24.13286018371582, "learning_rate": 2.5376005596362365e-07, "logps/chosen": -50.24007797241211, "logps/rejected": -69.79496765136719, "loss": 0.476, "losses/dpo": 0.5065603852272034, "losses/sft": 1.876365065574646, "losses/total": 0.5065603852272034, "ref_logps/chosen": -35.224449157714844, "ref_logps/rejected": -43.55268859863281, "rewards/accuracies": 0.75, "rewards/chosen": -1.5015625953674316, "rewards/margins": 1.1226651668548584, "rewards/rejected": -2.624228000640869, "step": 1726 }, { "epoch": 1.63, "grad_norm": 18.875690460205078, "learning_rate": 2.5358516963973416e-07, "logps/chosen": -51.90456771850586, "logps/rejected": -74.50015258789062, "loss": 0.3144, "losses/dpo": 0.3848721385002136, "losses/sft": 1.942346215248108, "losses/total": 0.3848721385002136, "ref_logps/chosen": -38.46068572998047, "ref_logps/rejected": -46.12248611450195, "rewards/accuracies": 0.875, "rewards/chosen": -1.3443881273269653, "rewards/margins": 1.493377923965454, "rewards/rejected": -2.837766170501709, "step": 1727 }, { "epoch": 1.63, "grad_norm": 28.37961769104004, "learning_rate": 2.534102833158447e-07, "logps/chosen": -48.577064514160156, "logps/rejected": -82.97050476074219, "loss": 0.425, "losses/dpo": 0.6688879132270813, "losses/sft": 1.9931738376617432, "losses/total": 0.6688879132270813, "ref_logps/chosen": -35.99244689941406, "ref_logps/rejected": -54.301673889160156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2584621906280518, "rewards/margins": 1.6084208488464355, "rewards/rejected": -2.8668832778930664, "step": 1728 }, { "epoch": 1.63, "grad_norm": 22.02532386779785, "learning_rate": 2.5323539699195524e-07, "logps/chosen": -56.200321197509766, "logps/rejected": -90.24491882324219, "loss": 0.3496, "losses/dpo": 0.3043467700481415, "losses/sft": 2.3032021522521973, "losses/total": 0.3043467700481415, "ref_logps/chosen": -43.64707946777344, "ref_logps/rejected": -58.17082214355469, "rewards/accuracies": 0.875, "rewards/chosen": -1.255324363708496, "rewards/margins": 1.9520853757858276, "rewards/rejected": -3.207409381866455, "step": 1729 }, { "epoch": 1.63, "grad_norm": 19.41638946533203, "learning_rate": 2.530605106680657e-07, "logps/chosen": -50.2297477722168, "logps/rejected": -71.56086730957031, "loss": 0.3032, "losses/dpo": 0.34405726194381714, "losses/sft": 1.7360780239105225, "losses/total": 0.34405726194381714, "ref_logps/chosen": -35.395469665527344, "ref_logps/rejected": -41.67345428466797, "rewards/accuracies": 0.875, "rewards/chosen": -1.4834280014038086, "rewards/margins": 1.505313515663147, "rewards/rejected": -2.988741397857666, "step": 1730 }, { "epoch": 1.63, "grad_norm": 22.186115264892578, "learning_rate": 2.5288562434417627e-07, "logps/chosen": -45.201332092285156, "logps/rejected": -68.27487182617188, "loss": 0.4001, "losses/dpo": 0.6906545162200928, "losses/sft": 2.455472707748413, "losses/total": 0.6906545162200928, "ref_logps/chosen": -29.865217208862305, "ref_logps/rejected": -40.72291564941406, "rewards/accuracies": 0.75, "rewards/chosen": -1.533611536026001, "rewards/margins": 1.2215843200683594, "rewards/rejected": -2.7551958560943604, "step": 1731 }, { "epoch": 1.64, "grad_norm": 27.745742797851562, "learning_rate": 2.5271073802028683e-07, "logps/chosen": -54.785179138183594, "logps/rejected": -77.94574737548828, "loss": 0.4921, "losses/dpo": 0.34052154421806335, "losses/sft": 1.7407050132751465, "losses/total": 0.34052154421806335, "ref_logps/chosen": -40.07048797607422, "ref_logps/rejected": -51.32563781738281, "rewards/accuracies": 0.75, "rewards/chosen": -1.4714692831039429, "rewards/margins": 1.190542221069336, "rewards/rejected": -2.6620116233825684, "step": 1732 }, { "epoch": 1.64, "grad_norm": 18.940242767333984, "learning_rate": 2.5253585169639734e-07, "logps/chosen": -32.22381591796875, "logps/rejected": -62.058349609375, "loss": 0.3785, "losses/dpo": 0.43926823139190674, "losses/sft": 1.482996940612793, "losses/total": 0.43926823139190674, "ref_logps/chosen": -23.436565399169922, "ref_logps/rejected": -40.592166900634766, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8787248134613037, "rewards/margins": 1.2678937911987305, "rewards/rejected": -2.146618604660034, "step": 1733 }, { "epoch": 1.64, "grad_norm": 22.96949005126953, "learning_rate": 2.5236096537250786e-07, "logps/chosen": -49.87434005737305, "logps/rejected": -64.28363037109375, "loss": 0.4364, "losses/dpo": 0.3585287034511566, "losses/sft": 2.1542539596557617, "losses/total": 0.3585287034511566, "ref_logps/chosen": -37.18541717529297, "ref_logps/rejected": -41.434173583984375, "rewards/accuracies": 0.75, "rewards/chosen": -1.268892526626587, "rewards/margins": 1.0160531997680664, "rewards/rejected": -2.2849457263946533, "step": 1734 }, { "epoch": 1.64, "grad_norm": 20.29282569885254, "learning_rate": 2.5218607904861837e-07, "logps/chosen": -47.359031677246094, "logps/rejected": -63.904075622558594, "loss": 0.3902, "losses/dpo": 0.5772249698638916, "losses/sft": 2.259298324584961, "losses/total": 0.5772249698638916, "ref_logps/chosen": -33.2038688659668, "ref_logps/rejected": -36.245792388916016, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4155160188674927, "rewards/margins": 1.3503127098083496, "rewards/rejected": -2.7658286094665527, "step": 1735 }, { "epoch": 1.64, "grad_norm": 20.01504135131836, "learning_rate": 2.5201119272472893e-07, "logps/chosen": -61.14841079711914, "logps/rejected": -79.07331848144531, "loss": 0.2827, "losses/dpo": 0.37559935450553894, "losses/sft": 1.6319624185562134, "losses/total": 0.37559935450553894, "ref_logps/chosen": -45.3342399597168, "ref_logps/rejected": -49.348487854003906, "rewards/accuracies": 1.0, "rewards/chosen": -1.5814169645309448, "rewards/margins": 1.391066074371338, "rewards/rejected": -2.9724831581115723, "step": 1736 }, { "epoch": 1.64, "grad_norm": 15.476624488830566, "learning_rate": 2.518363064008394e-07, "logps/chosen": -46.7214469909668, "logps/rejected": -66.35993194580078, "loss": 0.2801, "losses/dpo": 0.38920146226882935, "losses/sft": 1.9700355529785156, "losses/total": 0.38920146226882935, "ref_logps/chosen": -36.837059020996094, "ref_logps/rejected": -41.472259521484375, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9884388446807861, "rewards/margins": 1.5003283023834229, "rewards/rejected": -2.488767147064209, "step": 1737 }, { "epoch": 1.64, "grad_norm": 21.71673583984375, "learning_rate": 2.5166142007694996e-07, "logps/chosen": -53.2696533203125, "logps/rejected": -74.0630111694336, "loss": 0.341, "losses/dpo": 0.22557701170444489, "losses/sft": 2.205395221710205, "losses/total": 0.22557701170444489, "ref_logps/chosen": -35.446632385253906, "ref_logps/rejected": -43.976463317871094, "rewards/accuracies": 0.875, "rewards/chosen": -1.7823021411895752, "rewards/margins": 1.2263524532318115, "rewards/rejected": -3.008654832839966, "step": 1738 }, { "epoch": 1.64, "grad_norm": 18.874605178833008, "learning_rate": 2.514865337530605e-07, "logps/chosen": -49.429691314697266, "logps/rejected": -72.20341491699219, "loss": 0.2869, "losses/dpo": 0.1943396031856537, "losses/sft": 1.6803804636001587, "losses/total": 0.1943396031856537, "ref_logps/chosen": -34.68839645385742, "ref_logps/rejected": -40.69697952270508, "rewards/accuracies": 0.875, "rewards/chosen": -1.4741297960281372, "rewards/margins": 1.6765141487121582, "rewards/rejected": -3.150644063949585, "step": 1739 }, { "epoch": 1.64, "grad_norm": 21.336071014404297, "learning_rate": 2.5131164742917104e-07, "logps/chosen": -41.82895278930664, "logps/rejected": -83.10308074951172, "loss": 0.3066, "losses/dpo": 0.13714835047721863, "losses/sft": 1.9981573820114136, "losses/total": 0.13714835047721863, "ref_logps/chosen": -30.48394775390625, "ref_logps/rejected": -53.38444900512695, "rewards/accuracies": 0.875, "rewards/chosen": -1.134500503540039, "rewards/margins": 1.8373630046844482, "rewards/rejected": -2.9718635082244873, "step": 1740 }, { "epoch": 1.64, "grad_norm": 20.28631019592285, "learning_rate": 2.5113676110528155e-07, "logps/chosen": -45.91974639892578, "logps/rejected": -65.12422180175781, "loss": 0.3384, "losses/dpo": 0.2829352915287018, "losses/sft": 1.9542996883392334, "losses/total": 0.2829352915287018, "ref_logps/chosen": -33.374568939208984, "ref_logps/rejected": -38.56147003173828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2545177936553955, "rewards/margins": 1.4017577171325684, "rewards/rejected": -2.656275510787964, "step": 1741 }, { "epoch": 1.64, "grad_norm": 24.799890518188477, "learning_rate": 2.5096187478139206e-07, "logps/chosen": -50.457008361816406, "logps/rejected": -59.93446731567383, "loss": 0.4073, "losses/dpo": 0.26259562373161316, "losses/sft": 1.6940741539001465, "losses/total": 0.26259562373161316, "ref_logps/chosen": -37.06106185913086, "ref_logps/rejected": -37.168182373046875, "rewards/accuracies": 0.875, "rewards/chosen": -1.3395946025848389, "rewards/margins": 0.9370344877243042, "rewards/rejected": -2.2766289710998535, "step": 1742 }, { "epoch": 1.65, "grad_norm": 17.957420349121094, "learning_rate": 2.5078698845750263e-07, "logps/chosen": -40.55392074584961, "logps/rejected": -55.7088508605957, "loss": 0.3382, "losses/dpo": 0.5663778781890869, "losses/sft": 1.5110385417938232, "losses/total": 0.5663778781890869, "ref_logps/chosen": -29.02698516845703, "ref_logps/rejected": -31.44099235534668, "rewards/accuracies": 0.875, "rewards/chosen": -1.1526936292648315, "rewards/margins": 1.274092197418213, "rewards/rejected": -2.426785945892334, "step": 1743 }, { "epoch": 1.65, "grad_norm": 26.556346893310547, "learning_rate": 2.506121021336131e-07, "logps/chosen": -53.62139892578125, "logps/rejected": -65.5887451171875, "loss": 0.4345, "losses/dpo": 0.18863216042518616, "losses/sft": 1.981642723083496, "losses/total": 0.18863216042518616, "ref_logps/chosen": -39.930259704589844, "ref_logps/rejected": -38.34013748168945, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3691136837005615, "rewards/margins": 1.3557474613189697, "rewards/rejected": -2.7248611450195312, "step": 1744 }, { "epoch": 1.65, "grad_norm": 24.39154052734375, "learning_rate": 2.5043721580972365e-07, "logps/chosen": -45.27629852294922, "logps/rejected": -60.03491973876953, "loss": 0.4674, "losses/dpo": 0.4523618817329407, "losses/sft": 1.9528025388717651, "losses/total": 0.4523618817329407, "ref_logps/chosen": -30.986499786376953, "ref_logps/rejected": -36.605712890625, "rewards/accuracies": 0.75, "rewards/chosen": -1.4289798736572266, "rewards/margins": 0.913940966129303, "rewards/rejected": -2.3429207801818848, "step": 1745 }, { "epoch": 1.65, "grad_norm": 20.71787452697754, "learning_rate": 2.502623294858342e-07, "logps/chosen": -44.60021209716797, "logps/rejected": -66.5357666015625, "loss": 0.4076, "losses/dpo": 0.26822173595428467, "losses/sft": 1.697623372077942, "losses/total": 0.26822173595428467, "ref_logps/chosen": -31.552837371826172, "ref_logps/rejected": -39.9516487121582, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3047373294830322, "rewards/margins": 1.3536741733551025, "rewards/rejected": -2.6584115028381348, "step": 1746 }, { "epoch": 1.65, "grad_norm": 13.634176254272461, "learning_rate": 2.5008744316194473e-07, "logps/chosen": -61.19767379760742, "logps/rejected": -84.01016235351562, "loss": 0.2112, "losses/dpo": 0.1825573891401291, "losses/sft": 1.556693196296692, "losses/total": 0.1825573891401291, "ref_logps/chosen": -47.61400604248047, "ref_logps/rejected": -50.663639068603516, "rewards/accuracies": 1.0, "rewards/chosen": -1.3583669662475586, "rewards/margins": 1.976285457611084, "rewards/rejected": -3.3346526622772217, "step": 1747 }, { "epoch": 1.65, "grad_norm": 18.066137313842773, "learning_rate": 2.4991255683805525e-07, "logps/chosen": -45.3166389465332, "logps/rejected": -62.234588623046875, "loss": 0.2815, "losses/dpo": 0.2690800130367279, "losses/sft": 1.589748501777649, "losses/total": 0.2690800130367279, "ref_logps/chosen": -34.692745208740234, "ref_logps/rejected": -38.325172424316406, "rewards/accuracies": 1.0, "rewards/chosen": -1.0623897314071655, "rewards/margins": 1.3285517692565918, "rewards/rejected": -2.3909413814544678, "step": 1748 }, { "epoch": 1.65, "grad_norm": 27.016061782836914, "learning_rate": 2.4973767051416576e-07, "logps/chosen": -61.679107666015625, "logps/rejected": -79.778564453125, "loss": 0.3811, "losses/dpo": 1.023167610168457, "losses/sft": 2.07210636138916, "losses/total": 1.023167610168457, "ref_logps/chosen": -50.669517517089844, "ref_logps/rejected": -50.098106384277344, "rewards/accuracies": 0.75, "rewards/chosen": -1.1009588241577148, "rewards/margins": 1.8670872449874878, "rewards/rejected": -2.968045711517334, "step": 1749 }, { "epoch": 1.65, "grad_norm": 22.37818717956543, "learning_rate": 2.495627841902763e-07, "logps/chosen": -62.192989349365234, "logps/rejected": -83.9250717163086, "loss": 0.3155, "losses/dpo": 0.3115991950035095, "losses/sft": 2.187771797180176, "losses/total": 0.3115991950035095, "ref_logps/chosen": -44.19801712036133, "ref_logps/rejected": -50.246856689453125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7994972467422485, "rewards/margins": 1.5683242082595825, "rewards/rejected": -3.36782169342041, "step": 1750 }, { "epoch": 1.65, "grad_norm": 37.1116943359375, "learning_rate": 2.4938789786638684e-07, "logps/chosen": -61.98405075073242, "logps/rejected": -70.7381591796875, "loss": 0.6037, "losses/dpo": 0.713307797908783, "losses/sft": 2.509040594100952, "losses/total": 0.713307797908783, "ref_logps/chosen": -43.37413024902344, "ref_logps/rejected": -42.15391159057617, "rewards/accuracies": 0.8125, "rewards/chosen": -1.860992193222046, "rewards/margins": 0.9974318742752075, "rewards/rejected": -2.858423948287964, "step": 1751 }, { "epoch": 1.65, "grad_norm": 22.407917022705078, "learning_rate": 2.4921301154249735e-07, "logps/chosen": -47.93756866455078, "logps/rejected": -72.85267639160156, "loss": 0.3235, "losses/dpo": 0.15469565987586975, "losses/sft": 1.903242588043213, "losses/total": 0.15469565987586975, "ref_logps/chosen": -34.365089416503906, "ref_logps/rejected": -41.953590393066406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.357248067855835, "rewards/margins": 1.7326595783233643, "rewards/rejected": -3.089907646179199, "step": 1752 }, { "epoch": 1.66, "grad_norm": 26.822011947631836, "learning_rate": 2.490381252186079e-07, "logps/chosen": -45.707340240478516, "logps/rejected": -50.74597930908203, "loss": 0.5444, "losses/dpo": 0.6730902194976807, "losses/sft": 1.9205435514450073, "losses/total": 0.6730902194976807, "ref_logps/chosen": -31.910297393798828, "ref_logps/rejected": -30.508390426635742, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3797039985656738, "rewards/margins": 0.6440549492835999, "rewards/rejected": -2.023758888244629, "step": 1753 }, { "epoch": 1.66, "grad_norm": 27.741172790527344, "learning_rate": 2.4886323889471843e-07, "logps/chosen": -56.74720001220703, "logps/rejected": -73.21231079101562, "loss": 0.5465, "losses/dpo": 0.47176164388656616, "losses/sft": 1.7883340120315552, "losses/total": 0.47176164388656616, "ref_logps/chosen": -42.71295166015625, "ref_logps/rejected": -50.758323669433594, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4034249782562256, "rewards/margins": 0.841973602771759, "rewards/rejected": -2.24539852142334, "step": 1754 }, { "epoch": 1.66, "grad_norm": 17.044225692749023, "learning_rate": 2.4868835257082894e-07, "logps/chosen": -43.494998931884766, "logps/rejected": -74.65469360351562, "loss": 0.2601, "losses/dpo": 0.15053287148475647, "losses/sft": 1.662393569946289, "losses/total": 0.15053287148475647, "ref_logps/chosen": -32.523048400878906, "ref_logps/rejected": -46.49173355102539, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0971951484680176, "rewards/margins": 1.7191004753112793, "rewards/rejected": -2.816295623779297, "step": 1755 }, { "epoch": 1.66, "grad_norm": 16.60759925842285, "learning_rate": 2.4851346624693945e-07, "logps/chosen": -55.989253997802734, "logps/rejected": -74.5605239868164, "loss": 0.2856, "losses/dpo": 0.4219213128089905, "losses/sft": 1.8973370790481567, "losses/total": 0.4219213128089905, "ref_logps/chosen": -41.978431701660156, "ref_logps/rejected": -43.160606384277344, "rewards/accuracies": 0.875, "rewards/chosen": -1.4010822772979736, "rewards/margins": 1.7389092445373535, "rewards/rejected": -3.139991521835327, "step": 1756 }, { "epoch": 1.66, "grad_norm": 29.04009246826172, "learning_rate": 2.4833857992305e-07, "logps/chosen": -63.5703010559082, "logps/rejected": -71.96419525146484, "loss": 0.6452, "losses/dpo": 0.6840000152587891, "losses/sft": 1.913475751876831, "losses/total": 0.6840000152587891, "ref_logps/chosen": -40.3055534362793, "ref_logps/rejected": -42.871620178222656, "rewards/accuracies": 0.5, "rewards/chosen": -2.326474666595459, "rewards/margins": 0.5827829837799072, "rewards/rejected": -2.9092578887939453, "step": 1757 }, { "epoch": 1.66, "grad_norm": 28.937654495239258, "learning_rate": 2.4816369359916053e-07, "logps/chosen": -61.76683044433594, "logps/rejected": -68.48959350585938, "loss": 0.5078, "losses/dpo": 0.5240499973297119, "losses/sft": 2.167008638381958, "losses/total": 0.5240499973297119, "ref_logps/chosen": -40.633323669433594, "ref_logps/rejected": -39.12541961669922, "rewards/accuracies": 0.75, "rewards/chosen": -2.1133508682250977, "rewards/margins": 0.8230666518211365, "rewards/rejected": -2.936417579650879, "step": 1758 }, { "epoch": 1.66, "grad_norm": 30.774147033691406, "learning_rate": 2.4798880727527104e-07, "logps/chosen": -54.717796325683594, "logps/rejected": -81.8855209350586, "loss": 0.4419, "losses/dpo": 0.23568269610404968, "losses/sft": 2.6098146438598633, "losses/total": 0.23568269610404968, "ref_logps/chosen": -37.91327667236328, "ref_logps/rejected": -51.40283966064453, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6804518699645996, "rewards/margins": 1.3678154945373535, "rewards/rejected": -3.048267364501953, "step": 1759 }, { "epoch": 1.66, "grad_norm": 33.1243896484375, "learning_rate": 2.478139209513816e-07, "logps/chosen": -53.85862350463867, "logps/rejected": -70.88557434082031, "loss": 0.5587, "losses/dpo": 0.8125826120376587, "losses/sft": 2.3513290882110596, "losses/total": 0.8125826120376587, "ref_logps/chosen": -38.626739501953125, "ref_logps/rejected": -43.82019805908203, "rewards/accuracies": 0.6875, "rewards/chosen": -1.523188591003418, "rewards/margins": 1.1833494901657104, "rewards/rejected": -2.706537961959839, "step": 1760 }, { "epoch": 1.66, "grad_norm": 25.510570526123047, "learning_rate": 2.476390346274921e-07, "logps/chosen": -43.83403015136719, "logps/rejected": -66.24166107177734, "loss": 0.4548, "losses/dpo": 0.2545833885669708, "losses/sft": 2.516761064529419, "losses/total": 0.2545833885669708, "ref_logps/chosen": -33.473411560058594, "ref_logps/rejected": -42.44950485229492, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0360615253448486, "rewards/margins": 1.3431546688079834, "rewards/rejected": -2.379216194152832, "step": 1761 }, { "epoch": 1.66, "grad_norm": 19.211475372314453, "learning_rate": 2.4746414830360263e-07, "logps/chosen": -42.900978088378906, "logps/rejected": -78.2021484375, "loss": 0.2391, "losses/dpo": 0.23803257942199707, "losses/sft": 1.5480340719223022, "losses/total": 0.23803257942199707, "ref_logps/chosen": -30.61618423461914, "ref_logps/rejected": -44.52288055419922, "rewards/accuracies": 0.9375, "rewards/chosen": -1.228479266166687, "rewards/margins": 2.1394476890563965, "rewards/rejected": -3.367927074432373, "step": 1762 }, { "epoch": 1.66, "grad_norm": 27.00369644165039, "learning_rate": 2.4728926197971315e-07, "logps/chosen": -49.99366760253906, "logps/rejected": -70.42268371582031, "loss": 0.5345, "losses/dpo": 0.39466962218284607, "losses/sft": 1.1136701107025146, "losses/total": 0.39466962218284607, "ref_logps/chosen": -33.49297332763672, "ref_logps/rejected": -44.156715393066406, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6500697135925293, "rewards/margins": 0.9765269756317139, "rewards/rejected": -2.626596689224243, "step": 1763 }, { "epoch": 1.67, "grad_norm": 25.0563907623291, "learning_rate": 2.471143756558237e-07, "logps/chosen": -54.64145278930664, "logps/rejected": -72.87681579589844, "loss": 0.4251, "losses/dpo": 0.4190717339515686, "losses/sft": 2.3476083278656006, "losses/total": 0.4190717339515686, "ref_logps/chosen": -39.182796478271484, "ref_logps/rejected": -43.9580078125, "rewards/accuracies": 0.75, "rewards/chosen": -1.545865774154663, "rewards/margins": 1.3460148572921753, "rewards/rejected": -2.891880512237549, "step": 1764 }, { "epoch": 1.67, "grad_norm": 23.983139038085938, "learning_rate": 2.469394893319342e-07, "logps/chosen": -56.65501403808594, "logps/rejected": -71.4188461303711, "loss": 0.3988, "losses/dpo": 0.41120821237564087, "losses/sft": 2.1404871940612793, "losses/total": 0.41120821237564087, "ref_logps/chosen": -38.815696716308594, "ref_logps/rejected": -41.478057861328125, "rewards/accuracies": 0.875, "rewards/chosen": -1.783931851387024, "rewards/margins": 1.2101467847824097, "rewards/rejected": -2.9940786361694336, "step": 1765 }, { "epoch": 1.67, "grad_norm": 23.112445831298828, "learning_rate": 2.4676460300804474e-07, "logps/chosen": -47.889896392822266, "logps/rejected": -62.0364990234375, "loss": 0.4304, "losses/dpo": 0.3706853687763214, "losses/sft": 1.5867184400558472, "losses/total": 0.3706853687763214, "ref_logps/chosen": -33.91569519042969, "ref_logps/rejected": -39.546409606933594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3974204063415527, "rewards/margins": 0.8515884876251221, "rewards/rejected": -2.249008893966675, "step": 1766 }, { "epoch": 1.67, "grad_norm": 24.360191345214844, "learning_rate": 2.465897166841553e-07, "logps/chosen": -63.62725067138672, "logps/rejected": -86.77616882324219, "loss": 0.3422, "losses/dpo": 0.5358516573905945, "losses/sft": 1.811553716659546, "losses/total": 0.5358516573905945, "ref_logps/chosen": -45.95429992675781, "ref_logps/rejected": -51.953426361083984, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7672953605651855, "rewards/margins": 1.7149794101715088, "rewards/rejected": -3.4822750091552734, "step": 1767 }, { "epoch": 1.67, "grad_norm": 15.364474296569824, "learning_rate": 2.464148303602658e-07, "logps/chosen": -46.399410247802734, "logps/rejected": -83.53336334228516, "loss": 0.2582, "losses/dpo": 0.35614290833473206, "losses/sft": 1.4065749645233154, "losses/total": 0.35614290833473206, "ref_logps/chosen": -32.357181549072266, "ref_logps/rejected": -49.31428146362305, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4042229652404785, "rewards/margins": 2.0176854133605957, "rewards/rejected": -3.421908378601074, "step": 1768 }, { "epoch": 1.67, "grad_norm": 23.35055160522461, "learning_rate": 2.4623994403637633e-07, "logps/chosen": -58.016815185546875, "logps/rejected": -59.55890655517578, "loss": 0.4711, "losses/dpo": 0.3933718800544739, "losses/sft": 1.6000556945800781, "losses/total": 0.3933718800544739, "ref_logps/chosen": -41.461463928222656, "ref_logps/rejected": -34.128299713134766, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6555355787277222, "rewards/margins": 0.8875244855880737, "rewards/rejected": -2.543060302734375, "step": 1769 }, { "epoch": 1.67, "grad_norm": 18.778644561767578, "learning_rate": 2.460650577124869e-07, "logps/chosen": -45.72222900390625, "logps/rejected": -76.72969818115234, "loss": 0.2618, "losses/dpo": 0.06690526008605957, "losses/sft": 2.0953080654144287, "losses/total": 0.06690526008605957, "ref_logps/chosen": -30.757808685302734, "ref_logps/rejected": -44.511863708496094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.496442198753357, "rewards/margins": 1.7253413200378418, "rewards/rejected": -3.221783399581909, "step": 1770 }, { "epoch": 1.67, "grad_norm": 15.959135055541992, "learning_rate": 2.458901713885974e-07, "logps/chosen": -32.28383255004883, "logps/rejected": -55.138572692871094, "loss": 0.4931, "losses/dpo": 0.7304418087005615, "losses/sft": 1.6448898315429688, "losses/total": 0.7304418087005615, "ref_logps/chosen": -20.5487060546875, "ref_logps/rejected": -32.69231414794922, "rewards/accuracies": 0.75, "rewards/chosen": -1.173512578010559, "rewards/margins": 1.0711129903793335, "rewards/rejected": -2.2446253299713135, "step": 1771 }, { "epoch": 1.67, "grad_norm": 29.38340187072754, "learning_rate": 2.457152850647079e-07, "logps/chosen": -52.144081115722656, "logps/rejected": -65.97552490234375, "loss": 0.597, "losses/dpo": 0.8049647808074951, "losses/sft": 1.864599347114563, "losses/total": 0.8049647808074951, "ref_logps/chosen": -34.577274322509766, "ref_logps/rejected": -41.40080261230469, "rewards/accuracies": 0.625, "rewards/chosen": -1.7566807270050049, "rewards/margins": 0.7007919549942017, "rewards/rejected": -2.457472562789917, "step": 1772 }, { "epoch": 1.67, "grad_norm": 23.83207130432129, "learning_rate": 2.4554039874081843e-07, "logps/chosen": -59.02446746826172, "logps/rejected": -75.85979461669922, "loss": 0.3537, "losses/dpo": 0.5540534853935242, "losses/sft": 2.0470051765441895, "losses/total": 0.5540534853935242, "ref_logps/chosen": -41.64139175415039, "ref_logps/rejected": -43.01557540893555, "rewards/accuracies": 0.875, "rewards/chosen": -1.7383074760437012, "rewards/margins": 1.5461148023605347, "rewards/rejected": -3.2844221591949463, "step": 1773 }, { "epoch": 1.68, "grad_norm": 29.883790969848633, "learning_rate": 2.45365512416929e-07, "logps/chosen": -82.52509307861328, "logps/rejected": -81.92755126953125, "loss": 0.4266, "losses/dpo": 0.9312683343887329, "losses/sft": 2.6990251541137695, "losses/total": 0.9312683343887329, "ref_logps/chosen": -64.4759521484375, "ref_logps/rejected": -50.77394104003906, "rewards/accuracies": 0.875, "rewards/chosen": -1.8049136400222778, "rewards/margins": 1.3104476928710938, "rewards/rejected": -3.115361213684082, "step": 1774 }, { "epoch": 1.68, "grad_norm": 31.216472625732422, "learning_rate": 2.451906260930395e-07, "logps/chosen": -56.650150299072266, "logps/rejected": -72.89379119873047, "loss": 0.571, "losses/dpo": 0.6400648951530457, "losses/sft": 2.34333872795105, "losses/total": 0.6400648951530457, "ref_logps/chosen": -36.189632415771484, "ref_logps/rejected": -42.272918701171875, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0460517406463623, "rewards/margins": 1.0160362720489502, "rewards/rejected": -3.0620880126953125, "step": 1775 }, { "epoch": 1.68, "grad_norm": 22.13832664489746, "learning_rate": 2.4501573976915e-07, "logps/chosen": -54.67356491088867, "logps/rejected": -63.080604553222656, "loss": 0.4464, "losses/dpo": 0.5262210965156555, "losses/sft": 1.7890100479125977, "losses/total": 0.5262210965156555, "ref_logps/chosen": -39.76952362060547, "ref_logps/rejected": -37.833702087402344, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4904041290283203, "rewards/margins": 1.0342861413955688, "rewards/rejected": -2.524690628051758, "step": 1776 }, { "epoch": 1.68, "grad_norm": 21.75943374633789, "learning_rate": 2.448408534452606e-07, "logps/chosen": -46.36650085449219, "logps/rejected": -64.93172454833984, "loss": 0.3591, "losses/dpo": 0.40173521637916565, "losses/sft": 1.4916573762893677, "losses/total": 0.40173521637916565, "ref_logps/chosen": -32.63059616088867, "ref_logps/rejected": -37.73809051513672, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3735904693603516, "rewards/margins": 1.3457728624343872, "rewards/rejected": -2.719363212585449, "step": 1777 }, { "epoch": 1.68, "grad_norm": 26.07839012145996, "learning_rate": 2.446659671213711e-07, "logps/chosen": -52.337528228759766, "logps/rejected": -62.33589553833008, "loss": 0.487, "losses/dpo": 0.8857058882713318, "losses/sft": 1.9764080047607422, "losses/total": 0.8857058882713318, "ref_logps/chosen": -38.39671325683594, "ref_logps/rejected": -40.387664794921875, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3940818309783936, "rewards/margins": 0.8007413148880005, "rewards/rejected": -2.1948230266571045, "step": 1778 }, { "epoch": 1.68, "grad_norm": 22.114421844482422, "learning_rate": 2.444910807974816e-07, "logps/chosen": -56.41883850097656, "logps/rejected": -71.63005065917969, "loss": 0.386, "losses/dpo": 0.44728463888168335, "losses/sft": 1.8242136240005493, "losses/total": 0.44728463888168335, "ref_logps/chosen": -40.64788055419922, "ref_logps/rejected": -43.111515045166016, "rewards/accuracies": 0.875, "rewards/chosen": -1.5770957469940186, "rewards/margins": 1.2747578620910645, "rewards/rejected": -2.851853847503662, "step": 1779 }, { "epoch": 1.68, "grad_norm": 17.93875503540039, "learning_rate": 2.443161944735921e-07, "logps/chosen": -52.26482009887695, "logps/rejected": -69.38674926757812, "loss": 0.3061, "losses/dpo": 0.43446084856987, "losses/sft": 2.333432674407959, "losses/total": 0.43446084856987, "ref_logps/chosen": -37.2119026184082, "ref_logps/rejected": -37.800071716308594, "rewards/accuracies": 0.875, "rewards/chosen": -1.5052918195724487, "rewards/margins": 1.653376579284668, "rewards/rejected": -3.158668279647827, "step": 1780 }, { "epoch": 1.68, "grad_norm": 18.611656188964844, "learning_rate": 2.441413081497027e-07, "logps/chosen": -46.145103454589844, "logps/rejected": -76.09770965576172, "loss": 0.3005, "losses/dpo": 0.16587722301483154, "losses/sft": 1.7813081741333008, "losses/total": 0.16587722301483154, "ref_logps/chosen": -31.090377807617188, "ref_logps/rejected": -44.38664245605469, "rewards/accuracies": 0.875, "rewards/chosen": -1.5054725408554077, "rewards/margins": 1.665634036064148, "rewards/rejected": -3.1711065769195557, "step": 1781 }, { "epoch": 1.68, "grad_norm": 26.09534454345703, "learning_rate": 2.439664218258132e-07, "logps/chosen": -52.15563201904297, "logps/rejected": -80.50919342041016, "loss": 0.4189, "losses/dpo": 0.33090147376060486, "losses/sft": 1.8237643241882324, "losses/total": 0.33090147376060486, "ref_logps/chosen": -35.654212951660156, "ref_logps/rejected": -51.00825119018555, "rewards/accuracies": 0.8125, "rewards/chosen": -1.650141954421997, "rewards/margins": 1.2999521493911743, "rewards/rejected": -2.950094223022461, "step": 1782 }, { "epoch": 1.68, "grad_norm": 18.725366592407227, "learning_rate": 2.4379153550192377e-07, "logps/chosen": -56.691383361816406, "logps/rejected": -75.12642669677734, "loss": 0.3411, "losses/dpo": 0.416388601064682, "losses/sft": 1.8497592210769653, "losses/total": 0.416388601064682, "ref_logps/chosen": -41.28091812133789, "ref_logps/rejected": -44.83116149902344, "rewards/accuracies": 0.875, "rewards/chosen": -1.541046380996704, "rewards/margins": 1.4884802103042603, "rewards/rejected": -3.029526710510254, "step": 1783 }, { "epoch": 1.68, "grad_norm": 18.87398910522461, "learning_rate": 2.436166491780343e-07, "logps/chosen": -54.67528533935547, "logps/rejected": -76.38240814208984, "loss": 0.2635, "losses/dpo": 0.1386934071779251, "losses/sft": 1.6716876029968262, "losses/total": 0.1386934071779251, "ref_logps/chosen": -43.75564193725586, "ref_logps/rejected": -48.06443405151367, "rewards/accuracies": 0.875, "rewards/chosen": -1.0919640064239502, "rewards/margins": 1.7398333549499512, "rewards/rejected": -2.8317971229553223, "step": 1784 }, { "epoch": 1.69, "grad_norm": 18.562440872192383, "learning_rate": 2.434417628541448e-07, "logps/chosen": -60.26049041748047, "logps/rejected": -77.65402221679688, "loss": 0.2842, "losses/dpo": 0.2546020746231079, "losses/sft": 2.0679266452789307, "losses/total": 0.2546020746231079, "ref_logps/chosen": -44.27286911010742, "ref_logps/rejected": -45.871578216552734, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5987615585327148, "rewards/margins": 1.5794830322265625, "rewards/rejected": -3.1782445907592773, "step": 1785 }, { "epoch": 1.69, "grad_norm": 26.28315544128418, "learning_rate": 2.432668765302553e-07, "logps/chosen": -58.116798400878906, "logps/rejected": -72.88306427001953, "loss": 0.4248, "losses/dpo": 0.24877868592739105, "losses/sft": 1.925865650177002, "losses/total": 0.24877868592739105, "ref_logps/chosen": -40.48954391479492, "ref_logps/rejected": -42.69841766357422, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7627257108688354, "rewards/margins": 1.2557388544082642, "rewards/rejected": -3.0184645652770996, "step": 1786 }, { "epoch": 1.69, "grad_norm": 21.263511657714844, "learning_rate": 2.430919902063658e-07, "logps/chosen": -49.32164001464844, "logps/rejected": -70.8302993774414, "loss": 0.4289, "losses/dpo": 0.45973828434944153, "losses/sft": 1.9106676578521729, "losses/total": 0.45973828434944153, "ref_logps/chosen": -34.64125061035156, "ref_logps/rejected": -46.22222137451172, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4680389165878296, "rewards/margins": 0.9927690625190735, "rewards/rejected": -2.460808038711548, "step": 1787 }, { "epoch": 1.69, "grad_norm": 16.062267303466797, "learning_rate": 2.429171038824764e-07, "logps/chosen": -60.7998046875, "logps/rejected": -80.55281066894531, "loss": 0.2391, "losses/dpo": 0.3039817214012146, "losses/sft": 1.987055778503418, "losses/total": 0.3039817214012146, "ref_logps/chosen": -48.479469299316406, "ref_logps/rejected": -52.036865234375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2320337295532227, "rewards/margins": 1.6195610761642456, "rewards/rejected": -2.8515946865081787, "step": 1788 }, { "epoch": 1.69, "grad_norm": 21.14324188232422, "learning_rate": 2.427422175585869e-07, "logps/chosen": -48.07803726196289, "logps/rejected": -82.919189453125, "loss": 0.2616, "losses/dpo": 0.27730491757392883, "losses/sft": 1.4679170846939087, "losses/total": 0.27730491757392883, "ref_logps/chosen": -32.23628616333008, "ref_logps/rejected": -51.782989501953125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5841751098632812, "rewards/margins": 1.5294442176818848, "rewards/rejected": -3.113619565963745, "step": 1789 }, { "epoch": 1.69, "grad_norm": 20.190723419189453, "learning_rate": 2.4256733123469746e-07, "logps/chosen": -45.28055953979492, "logps/rejected": -67.60964965820312, "loss": 0.3713, "losses/dpo": 0.5195133686065674, "losses/sft": 1.902703046798706, "losses/total": 0.5195133686065674, "ref_logps/chosen": -30.42159080505371, "ref_logps/rejected": -40.092803955078125, "rewards/accuracies": 0.875, "rewards/chosen": -1.4858973026275635, "rewards/margins": 1.2657880783081055, "rewards/rejected": -2.75168514251709, "step": 1790 }, { "epoch": 1.69, "grad_norm": 29.569889068603516, "learning_rate": 2.42392444910808e-07, "logps/chosen": -56.91025924682617, "logps/rejected": -62.00165557861328, "loss": 0.5529, "losses/dpo": 0.6529324650764465, "losses/sft": 1.5989012718200684, "losses/total": 0.6529324650764465, "ref_logps/chosen": -39.31761169433594, "ref_logps/rejected": -37.57944869995117, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7592642307281494, "rewards/margins": 0.6829563975334167, "rewards/rejected": -2.442220687866211, "step": 1791 }, { "epoch": 1.69, "grad_norm": 21.72402000427246, "learning_rate": 2.422175585869185e-07, "logps/chosen": -44.800167083740234, "logps/rejected": -58.93064498901367, "loss": 0.4815, "losses/dpo": 0.8620520830154419, "losses/sft": 1.5104798078536987, "losses/total": 0.8620520830154419, "ref_logps/chosen": -32.5340461730957, "ref_logps/rejected": -36.773292541503906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.226611852645874, "rewards/margins": 0.9891231656074524, "rewards/rejected": -2.2157349586486816, "step": 1792 }, { "epoch": 1.69, "grad_norm": 20.296966552734375, "learning_rate": 2.42042672263029e-07, "logps/chosen": -70.43276977539062, "logps/rejected": -84.78248596191406, "loss": 0.3108, "losses/dpo": 0.308792382478714, "losses/sft": 2.2755091190338135, "losses/total": 0.308792382478714, "ref_logps/chosen": -50.647308349609375, "ref_logps/rejected": -51.45112609863281, "rewards/accuracies": 0.875, "rewards/chosen": -1.978546380996704, "rewards/margins": 1.354589581489563, "rewards/rejected": -3.3331360816955566, "step": 1793 }, { "epoch": 1.69, "grad_norm": 24.659976959228516, "learning_rate": 2.418677859391395e-07, "logps/chosen": -53.954463958740234, "logps/rejected": -70.03739929199219, "loss": 0.4518, "losses/dpo": 0.7430610656738281, "losses/sft": 2.3112823963165283, "losses/total": 0.7430610656738281, "ref_logps/chosen": -38.437843322753906, "ref_logps/rejected": -42.495086669921875, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5516622066497803, "rewards/margins": 1.202569603919983, "rewards/rejected": -2.7542319297790527, "step": 1794 }, { "epoch": 1.69, "grad_norm": 22.181167602539062, "learning_rate": 2.416928996152501e-07, "logps/chosen": -43.811004638671875, "logps/rejected": -59.92433166503906, "loss": 0.4551, "losses/dpo": 0.6735576391220093, "losses/sft": 1.6574383974075317, "losses/total": 0.6735576391220093, "ref_logps/chosen": -28.613182067871094, "ref_logps/rejected": -34.89199447631836, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5197821855545044, "rewards/margins": 0.983451783657074, "rewards/rejected": -2.5032339096069336, "step": 1795 }, { "epoch": 1.7, "grad_norm": 30.978641510009766, "learning_rate": 2.415180132913606e-07, "logps/chosen": -50.89833450317383, "logps/rejected": -57.69483947753906, "loss": 0.5876, "losses/dpo": 0.6637569665908813, "losses/sft": 1.7893388271331787, "losses/total": 0.6637569665908813, "ref_logps/chosen": -36.480552673339844, "ref_logps/rejected": -36.3235969543457, "rewards/accuracies": 0.625, "rewards/chosen": -1.4417781829833984, "rewards/margins": 0.6953457593917847, "rewards/rejected": -2.1371238231658936, "step": 1796 }, { "epoch": 1.7, "grad_norm": 19.025787353515625, "learning_rate": 2.4134312696747116e-07, "logps/chosen": -53.97319030761719, "logps/rejected": -72.19430541992188, "loss": 0.3084, "losses/dpo": 0.26776963472366333, "losses/sft": 1.8403483629226685, "losses/total": 0.26776963472366333, "ref_logps/chosen": -40.94664764404297, "ref_logps/rejected": -42.826873779296875, "rewards/accuracies": 0.875, "rewards/chosen": -1.3026540279388428, "rewards/margins": 1.6340895891189575, "rewards/rejected": -2.9367434978485107, "step": 1797 }, { "epoch": 1.7, "grad_norm": 13.881430625915527, "learning_rate": 2.4116824064358167e-07, "logps/chosen": -42.53218078613281, "logps/rejected": -77.91055297851562, "loss": 0.175, "losses/dpo": 0.23638613522052765, "losses/sft": 1.7883586883544922, "losses/total": 0.23638613522052765, "ref_logps/chosen": -32.17339324951172, "ref_logps/rejected": -46.677001953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.0358785390853882, "rewards/margins": 2.0874767303466797, "rewards/rejected": -3.1233553886413574, "step": 1798 }, { "epoch": 1.7, "grad_norm": 25.556428909301758, "learning_rate": 2.409933543196922e-07, "logps/chosen": -54.38060760498047, "logps/rejected": -74.64018249511719, "loss": 0.5343, "losses/dpo": 0.4501485526561737, "losses/sft": 2.3382349014282227, "losses/total": 0.4501485526561737, "ref_logps/chosen": -36.06700897216797, "ref_logps/rejected": -45.476924896240234, "rewards/accuracies": 0.75, "rewards/chosen": -1.8313599824905396, "rewards/margins": 1.0849659442901611, "rewards/rejected": -2.916325807571411, "step": 1799 }, { "epoch": 1.7, "grad_norm": 20.34830093383789, "learning_rate": 2.408184679958027e-07, "logps/chosen": -54.23207092285156, "logps/rejected": -78.97080993652344, "loss": 0.3925, "losses/dpo": 0.3393729627132416, "losses/sft": 1.8011932373046875, "losses/total": 0.3393729627132416, "ref_logps/chosen": -38.857635498046875, "ref_logps/rejected": -48.183135986328125, "rewards/accuracies": 0.875, "rewards/chosen": -1.5374433994293213, "rewards/margins": 1.5413247346878052, "rewards/rejected": -3.078768253326416, "step": 1800 }, { "epoch": 1.7, "grad_norm": 14.515795707702637, "learning_rate": 2.406435816719132e-07, "logps/chosen": -37.18846893310547, "logps/rejected": -69.02476501464844, "loss": 0.2884, "losses/dpo": 0.4819752871990204, "losses/sft": 1.9720385074615479, "losses/total": 0.4819752871990204, "ref_logps/chosen": -27.404996871948242, "ref_logps/rejected": -42.84521484375, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9783473610877991, "rewards/margins": 1.6396076679229736, "rewards/rejected": -2.617954969406128, "step": 1801 }, { "epoch": 1.7, "grad_norm": 26.68324851989746, "learning_rate": 2.404686953480238e-07, "logps/chosen": -71.21405029296875, "logps/rejected": -81.84751892089844, "loss": 0.4024, "losses/dpo": 0.29949310421943665, "losses/sft": 1.6894594430923462, "losses/total": 0.29949310421943665, "ref_logps/chosen": -49.875282287597656, "ref_logps/rejected": -49.16812515258789, "rewards/accuracies": 0.875, "rewards/chosen": -2.1338772773742676, "rewards/margins": 1.1340622901916504, "rewards/rejected": -3.267939567565918, "step": 1802 }, { "epoch": 1.7, "grad_norm": 27.291259765625, "learning_rate": 2.402938090241343e-07, "logps/chosen": -57.330352783203125, "logps/rejected": -70.62774658203125, "loss": 0.4482, "losses/dpo": 0.527060866355896, "losses/sft": 2.0806970596313477, "losses/total": 0.527060866355896, "ref_logps/chosen": -41.38750076293945, "ref_logps/rejected": -45.277442932128906, "rewards/accuracies": 0.75, "rewards/chosen": -1.594285249710083, "rewards/margins": 0.9407455921173096, "rewards/rejected": -2.5350308418273926, "step": 1803 }, { "epoch": 1.7, "grad_norm": 20.700218200683594, "learning_rate": 2.4011892270024485e-07, "logps/chosen": -49.646339416503906, "logps/rejected": -70.36936950683594, "loss": 0.3494, "losses/dpo": 0.31294238567352295, "losses/sft": 1.506961464881897, "losses/total": 0.31294238567352295, "ref_logps/chosen": -38.689517974853516, "ref_logps/rejected": -47.67042922973633, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0956823825836182, "rewards/margins": 1.1742115020751953, "rewards/rejected": -2.2698938846588135, "step": 1804 }, { "epoch": 1.7, "grad_norm": 25.93865394592285, "learning_rate": 2.3994403637635537e-07, "logps/chosen": -48.94581985473633, "logps/rejected": -72.06764221191406, "loss": 0.5139, "losses/dpo": 0.4126269519329071, "losses/sft": 1.966224193572998, "losses/total": 0.4126269519329071, "ref_logps/chosen": -35.31593704223633, "ref_logps/rejected": -50.644996643066406, "rewards/accuracies": 0.6875, "rewards/chosen": -1.362987995147705, "rewards/margins": 0.779276967048645, "rewards/rejected": -2.1422648429870605, "step": 1805 }, { "epoch": 1.71, "grad_norm": 21.34590721130371, "learning_rate": 2.397691500524659e-07, "logps/chosen": -50.95314025878906, "logps/rejected": -68.23500061035156, "loss": 0.3399, "losses/dpo": 0.3922712206840515, "losses/sft": 2.487985134124756, "losses/total": 0.3922712206840515, "ref_logps/chosen": -35.26377487182617, "ref_logps/rejected": -39.783836364746094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5689367055892944, "rewards/margins": 1.276179313659668, "rewards/rejected": -2.845115900039673, "step": 1806 }, { "epoch": 1.71, "grad_norm": 13.201313018798828, "learning_rate": 2.395942637285764e-07, "logps/chosen": -48.08342361450195, "logps/rejected": -62.19560241699219, "loss": 0.2348, "losses/dpo": 0.19551517069339752, "losses/sft": 1.8445029258728027, "losses/total": 0.19551517069339752, "ref_logps/chosen": -39.11737823486328, "ref_logps/rejected": -35.42494201660156, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8966045379638672, "rewards/margins": 1.7804611921310425, "rewards/rejected": -2.677065849304199, "step": 1807 }, { "epoch": 1.71, "grad_norm": 19.576393127441406, "learning_rate": 2.3941937740468696e-07, "logps/chosen": -62.76116943359375, "logps/rejected": -70.84951782226562, "loss": 0.3568, "losses/dpo": 0.3549603223800659, "losses/sft": 1.8085447549819946, "losses/total": 0.3549603223800659, "ref_logps/chosen": -45.12492752075195, "ref_logps/rejected": -43.808570861816406, "rewards/accuracies": 1.0, "rewards/chosen": -1.7636239528656006, "rewards/margins": 0.9404711127281189, "rewards/rejected": -2.7040953636169434, "step": 1808 }, { "epoch": 1.71, "grad_norm": 20.147262573242188, "learning_rate": 2.3924449108079747e-07, "logps/chosen": -52.591129302978516, "logps/rejected": -63.63362121582031, "loss": 0.3674, "losses/dpo": 0.4235965311527252, "losses/sft": 1.3365919589996338, "losses/total": 0.4235965311527252, "ref_logps/chosen": -38.298648834228516, "ref_logps/rejected": -35.4205436706543, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4292478561401367, "rewards/margins": 1.3920602798461914, "rewards/rejected": -2.821308135986328, "step": 1809 }, { "epoch": 1.71, "grad_norm": 25.43413734436035, "learning_rate": 2.39069604756908e-07, "logps/chosen": -45.40858840942383, "logps/rejected": -62.31193923950195, "loss": 0.4528, "losses/dpo": 0.768020510673523, "losses/sft": 1.4819835424423218, "losses/total": 0.768020510673523, "ref_logps/chosen": -28.7589111328125, "ref_logps/rejected": -34.95357131958008, "rewards/accuracies": 0.75, "rewards/chosen": -1.6649675369262695, "rewards/margins": 1.0708694458007812, "rewards/rejected": -2.735836982727051, "step": 1810 }, { "epoch": 1.71, "grad_norm": 23.087696075439453, "learning_rate": 2.3889471843301855e-07, "logps/chosen": -53.17631530761719, "logps/rejected": -74.31806945800781, "loss": 0.442, "losses/dpo": 0.7953958511352539, "losses/sft": 2.4478635787963867, "losses/total": 0.7953958511352539, "ref_logps/chosen": -36.81160354614258, "ref_logps/rejected": -43.09166717529297, "rewards/accuracies": 0.75, "rewards/chosen": -1.6364715099334717, "rewards/margins": 1.4861682653427124, "rewards/rejected": -3.1226396560668945, "step": 1811 }, { "epoch": 1.71, "grad_norm": 35.091495513916016, "learning_rate": 2.3871983210912906e-07, "logps/chosen": -50.18351745605469, "logps/rejected": -70.42749786376953, "loss": 0.5903, "losses/dpo": 0.6043030023574829, "losses/sft": 2.051079750061035, "losses/total": 0.6043030023574829, "ref_logps/chosen": -33.16597366333008, "ref_logps/rejected": -46.168888092041016, "rewards/accuracies": 0.625, "rewards/chosen": -1.701754093170166, "rewards/margins": 0.7241073846817017, "rewards/rejected": -2.4258618354797363, "step": 1812 }, { "epoch": 1.71, "grad_norm": 20.404088973999023, "learning_rate": 2.3854494578523957e-07, "logps/chosen": -51.82007598876953, "logps/rejected": -72.42236328125, "loss": 0.3263, "losses/dpo": 0.2008453756570816, "losses/sft": 2.5513551235198975, "losses/total": 0.2008453756570816, "ref_logps/chosen": -36.613616943359375, "ref_logps/rejected": -43.31108474731445, "rewards/accuracies": 0.9375, "rewards/chosen": -1.520646095275879, "rewards/margins": 1.39048171043396, "rewards/rejected": -2.911127805709839, "step": 1813 }, { "epoch": 1.71, "grad_norm": 22.575740814208984, "learning_rate": 2.383700594613501e-07, "logps/chosen": -41.14979934692383, "logps/rejected": -57.995208740234375, "loss": 0.4619, "losses/dpo": 0.6094987392425537, "losses/sft": 1.7752008438110352, "losses/total": 0.6094987392425537, "ref_logps/chosen": -28.672632217407227, "ref_logps/rejected": -35.17646789550781, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2477166652679443, "rewards/margins": 1.034157633781433, "rewards/rejected": -2.281874179840088, "step": 1814 }, { "epoch": 1.71, "grad_norm": 36.38962936401367, "learning_rate": 2.3819517313746065e-07, "logps/chosen": -60.74071502685547, "logps/rejected": -69.07544708251953, "loss": 0.8009, "losses/dpo": 1.2549885511398315, "losses/sft": 1.732917070388794, "losses/total": 1.2549885511398315, "ref_logps/chosen": -42.26467514038086, "ref_logps/rejected": -47.442230224609375, "rewards/accuracies": 0.75, "rewards/chosen": -1.8476041555404663, "rewards/margins": 0.3157173991203308, "rewards/rejected": -2.1633217334747314, "step": 1815 }, { "epoch": 1.71, "grad_norm": 24.11556053161621, "learning_rate": 2.3802028681357116e-07, "logps/chosen": -61.94044876098633, "logps/rejected": -86.11589813232422, "loss": 0.3338, "losses/dpo": 0.6760409474372864, "losses/sft": 2.08918833732605, "losses/total": 0.6760409474372864, "ref_logps/chosen": -41.16832733154297, "ref_logps/rejected": -50.388816833496094, "rewards/accuracies": 0.8125, "rewards/chosen": -2.077211856842041, "rewards/margins": 1.4954955577850342, "rewards/rejected": -3.572707414627075, "step": 1816 }, { "epoch": 1.72, "grad_norm": 14.405179023742676, "learning_rate": 2.3784540048968168e-07, "logps/chosen": -38.53070068359375, "logps/rejected": -71.51303100585938, "loss": 0.2274, "losses/dpo": 0.13569137454032898, "losses/sft": 1.565115213394165, "losses/total": 0.13569137454032898, "ref_logps/chosen": -28.008386611938477, "ref_logps/rejected": -41.23868942260742, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0522313117980957, "rewards/margins": 1.975203037261963, "rewards/rejected": -3.0274343490600586, "step": 1817 }, { "epoch": 1.72, "grad_norm": 28.90049934387207, "learning_rate": 2.3767051416579224e-07, "logps/chosen": -43.190391540527344, "logps/rejected": -56.24452209472656, "loss": 0.4207, "losses/dpo": 0.4833914637565613, "losses/sft": 2.234956979751587, "losses/total": 0.4833914637565613, "ref_logps/chosen": -30.222129821777344, "ref_logps/rejected": -30.783945083618164, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2968266010284424, "rewards/margins": 1.2492311000823975, "rewards/rejected": -2.54605770111084, "step": 1818 }, { "epoch": 1.72, "grad_norm": 27.203447341918945, "learning_rate": 2.3749562784190275e-07, "logps/chosen": -54.47657012939453, "logps/rejected": -90.49632263183594, "loss": 0.3377, "losses/dpo": 0.3628811240196228, "losses/sft": 1.9849289655685425, "losses/total": 0.3628811240196228, "ref_logps/chosen": -38.0615234375, "ref_logps/rejected": -56.2956428527832, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6415046453475952, "rewards/margins": 1.7785634994506836, "rewards/rejected": -3.4200680255889893, "step": 1819 }, { "epoch": 1.72, "grad_norm": 23.622251510620117, "learning_rate": 2.373207415180133e-07, "logps/chosen": -62.47504806518555, "logps/rejected": -79.90737915039062, "loss": 0.3726, "losses/dpo": 0.4988325238227844, "losses/sft": 1.9776464700698853, "losses/total": 0.4988325238227844, "ref_logps/chosen": -44.18340301513672, "ref_logps/rejected": -45.385475158691406, "rewards/accuracies": 0.75, "rewards/chosen": -1.8291640281677246, "rewards/margins": 1.6230263710021973, "rewards/rejected": -3.4521901607513428, "step": 1820 }, { "epoch": 1.72, "grad_norm": 32.48872756958008, "learning_rate": 2.371458551941238e-07, "logps/chosen": -56.083763122558594, "logps/rejected": -79.61029052734375, "loss": 0.5124, "losses/dpo": 0.7199397087097168, "losses/sft": 2.0983541011810303, "losses/total": 0.7199397087097168, "ref_logps/chosen": -35.626197814941406, "ref_logps/rejected": -48.16398620605469, "rewards/accuracies": 0.625, "rewards/chosen": -2.0457568168640137, "rewards/margins": 1.0988742113113403, "rewards/rejected": -3.1446309089660645, "step": 1821 }, { "epoch": 1.72, "grad_norm": 27.68033218383789, "learning_rate": 2.3697096887023435e-07, "logps/chosen": -59.46272277832031, "logps/rejected": -69.81183624267578, "loss": 0.4971, "losses/dpo": 0.8627828359603882, "losses/sft": 2.3563098907470703, "losses/total": 0.8627828359603882, "ref_logps/chosen": -39.85981750488281, "ref_logps/rejected": -39.03014373779297, "rewards/accuracies": 0.75, "rewards/chosen": -1.960290789604187, "rewards/margins": 1.1178789138793945, "rewards/rejected": -3.078169822692871, "step": 1822 }, { "epoch": 1.72, "grad_norm": 22.5125675201416, "learning_rate": 2.3679608254634486e-07, "logps/chosen": -57.32041931152344, "logps/rejected": -76.5188217163086, "loss": 0.4545, "losses/dpo": 0.425339937210083, "losses/sft": 1.8877969980239868, "losses/total": 0.425339937210083, "ref_logps/chosen": -42.733543395996094, "ref_logps/rejected": -46.36111068725586, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4586875438690186, "rewards/margins": 1.5570838451385498, "rewards/rejected": -3.0157713890075684, "step": 1823 }, { "epoch": 1.72, "grad_norm": 22.97613525390625, "learning_rate": 2.3662119622245537e-07, "logps/chosen": -63.38905334472656, "logps/rejected": -82.48291015625, "loss": 0.3041, "losses/dpo": 0.2486165463924408, "losses/sft": 1.890404224395752, "losses/total": 0.2486165463924408, "ref_logps/chosen": -47.25609588623047, "ref_logps/rejected": -48.80116271972656, "rewards/accuracies": 0.875, "rewards/chosen": -1.6132957935333252, "rewards/margins": 1.7548785209655762, "rewards/rejected": -3.3681745529174805, "step": 1824 }, { "epoch": 1.72, "grad_norm": 31.059612274169922, "learning_rate": 2.3644630989856594e-07, "logps/chosen": -52.537757873535156, "logps/rejected": -71.47386169433594, "loss": 0.5254, "losses/dpo": 0.19074425101280212, "losses/sft": 2.104780435562134, "losses/total": 0.19074425101280212, "ref_logps/chosen": -36.833351135253906, "ref_logps/rejected": -42.596961975097656, "rewards/accuracies": 0.875, "rewards/chosen": -1.5704405307769775, "rewards/margins": 1.3172492980957031, "rewards/rejected": -2.8876895904541016, "step": 1825 }, { "epoch": 1.72, "grad_norm": 15.249720573425293, "learning_rate": 2.3627142357467645e-07, "logps/chosen": -55.86733627319336, "logps/rejected": -93.32683563232422, "loss": 0.1699, "losses/dpo": 0.02240651845932007, "losses/sft": 1.7759265899658203, "losses/total": 0.02240651845932007, "ref_logps/chosen": -44.871089935302734, "ref_logps/rejected": -56.358184814453125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0996246337890625, "rewards/margins": 2.597240686416626, "rewards/rejected": -3.6968655586242676, "step": 1826 }, { "epoch": 1.73, "grad_norm": 24.925495147705078, "learning_rate": 2.36096537250787e-07, "logps/chosen": -55.65700149536133, "logps/rejected": -80.70500183105469, "loss": 0.3965, "losses/dpo": 1.0382367372512817, "losses/sft": 2.458404302597046, "losses/total": 1.0382367372512817, "ref_logps/chosen": -37.813629150390625, "ref_logps/rejected": -45.53327941894531, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7843371629714966, "rewards/margins": 1.7328346967697144, "rewards/rejected": -3.517171859741211, "step": 1827 }, { "epoch": 1.73, "grad_norm": 20.223031997680664, "learning_rate": 2.359216509268975e-07, "logps/chosen": -59.71818161010742, "logps/rejected": -75.98500061035156, "loss": 0.3323, "losses/dpo": 0.2974328398704529, "losses/sft": 2.027266263961792, "losses/total": 0.2974328398704529, "ref_logps/chosen": -44.14948272705078, "ref_logps/rejected": -45.65176773071289, "rewards/accuracies": 0.875, "rewards/chosen": -1.5568698644638062, "rewards/margins": 1.4764533042907715, "rewards/rejected": -3.033323287963867, "step": 1828 }, { "epoch": 1.73, "grad_norm": 30.462846755981445, "learning_rate": 2.3574676460300804e-07, "logps/chosen": -65.40301513671875, "logps/rejected": -89.72885131835938, "loss": 0.4634, "losses/dpo": 0.7341820001602173, "losses/sft": 2.5555121898651123, "losses/total": 0.7341820001602173, "ref_logps/chosen": -43.19352722167969, "ref_logps/rejected": -52.105194091796875, "rewards/accuracies": 0.875, "rewards/chosen": -2.2209486961364746, "rewards/margins": 1.5414172410964966, "rewards/rejected": -3.7623658180236816, "step": 1829 }, { "epoch": 1.73, "grad_norm": 16.414480209350586, "learning_rate": 2.3557187827911855e-07, "logps/chosen": -49.07080078125, "logps/rejected": -79.08071899414062, "loss": 0.2538, "losses/dpo": 0.0965033695101738, "losses/sft": 1.7428237199783325, "losses/total": 0.0965033695101738, "ref_logps/chosen": -36.794944763183594, "ref_logps/rejected": -48.857421875, "rewards/accuracies": 0.875, "rewards/chosen": -1.2275853157043457, "rewards/margins": 1.7947438955307007, "rewards/rejected": -3.022329330444336, "step": 1830 }, { "epoch": 1.73, "grad_norm": 28.070436477661133, "learning_rate": 2.353969919552291e-07, "logps/chosen": -59.75160217285156, "logps/rejected": -60.827232360839844, "loss": 0.5082, "losses/dpo": 0.4128730595111847, "losses/sft": 1.4651991128921509, "losses/total": 0.4128730595111847, "ref_logps/chosen": -39.54951477050781, "ref_logps/rejected": -33.90257263183594, "rewards/accuracies": 0.875, "rewards/chosen": -2.0202085971832275, "rewards/margins": 0.672257125377655, "rewards/rejected": -2.6924657821655273, "step": 1831 }, { "epoch": 1.73, "grad_norm": 25.752349853515625, "learning_rate": 2.3522210563133963e-07, "logps/chosen": -49.78515625, "logps/rejected": -65.71405792236328, "loss": 0.4287, "losses/dpo": 0.28923720121383667, "losses/sft": 1.5678766965866089, "losses/total": 0.28923720121383667, "ref_logps/chosen": -36.99530792236328, "ref_logps/rejected": -40.70927429199219, "rewards/accuracies": 0.8125, "rewards/chosen": -1.278984785079956, "rewards/margins": 1.2214933633804321, "rewards/rejected": -2.5004782676696777, "step": 1832 }, { "epoch": 1.73, "grad_norm": 21.68288803100586, "learning_rate": 2.3504721930745014e-07, "logps/chosen": -46.7327880859375, "logps/rejected": -76.64437866210938, "loss": 0.306, "losses/dpo": 0.20786938071250916, "losses/sft": 1.6630686521530151, "losses/total": 0.20786938071250916, "ref_logps/chosen": -36.01597213745117, "ref_logps/rejected": -48.47999954223633, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0716817378997803, "rewards/margins": 1.744756817817688, "rewards/rejected": -2.816438674926758, "step": 1833 }, { "epoch": 1.73, "grad_norm": 22.000045776367188, "learning_rate": 2.3487233298356068e-07, "logps/chosen": -48.887638092041016, "logps/rejected": -71.68235778808594, "loss": 0.3812, "losses/dpo": 0.35574308037757874, "losses/sft": 2.0993213653564453, "losses/total": 0.35574308037757874, "ref_logps/chosen": -33.489990234375, "ref_logps/rejected": -42.93003845214844, "rewards/accuracies": 0.875, "rewards/chosen": -1.539764642715454, "rewards/margins": 1.3354679346084595, "rewards/rejected": -2.875232458114624, "step": 1834 }, { "epoch": 1.73, "grad_norm": 23.827600479125977, "learning_rate": 2.346974466596712e-07, "logps/chosen": -51.1121940612793, "logps/rejected": -70.45458984375, "loss": 0.3488, "losses/dpo": 0.39547669887542725, "losses/sft": 2.0413146018981934, "losses/total": 0.39547669887542725, "ref_logps/chosen": -37.81718444824219, "ref_logps/rejected": -43.214599609375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3295011520385742, "rewards/margins": 1.3944984674453735, "rewards/rejected": -2.723999500274658, "step": 1835 }, { "epoch": 1.73, "grad_norm": 31.912322998046875, "learning_rate": 2.3452256033578173e-07, "logps/chosen": -63.27443313598633, "logps/rejected": -74.21241760253906, "loss": 0.5333, "losses/dpo": 0.7827017903327942, "losses/sft": 2.633052110671997, "losses/total": 0.7827017903327942, "ref_logps/chosen": -44.273338317871094, "ref_logps/rejected": -47.25540542602539, "rewards/accuracies": 0.75, "rewards/chosen": -1.9001094102859497, "rewards/margins": 0.7955919504165649, "rewards/rejected": -2.6957013607025146, "step": 1836 }, { "epoch": 1.73, "grad_norm": 24.76863670349121, "learning_rate": 2.3434767401189225e-07, "logps/chosen": -72.81598663330078, "logps/rejected": -78.52141571044922, "loss": 0.4588, "losses/dpo": 0.5126616358757019, "losses/sft": 2.036393880844116, "losses/total": 0.5126616358757019, "ref_logps/chosen": -48.431392669677734, "ref_logps/rejected": -45.08470153808594, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4384589195251465, "rewards/margins": 0.9052125215530396, "rewards/rejected": -3.3436715602874756, "step": 1837 }, { "epoch": 1.74, "grad_norm": 28.28955841064453, "learning_rate": 2.3417278768800279e-07, "logps/chosen": -53.83306121826172, "logps/rejected": -74.90179443359375, "loss": 0.4319, "losses/dpo": 0.41800588369369507, "losses/sft": 2.649404287338257, "losses/total": 0.41800588369369507, "ref_logps/chosen": -38.21922302246094, "ref_logps/rejected": -47.19895935058594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5613840818405151, "rewards/margins": 1.2088998556137085, "rewards/rejected": -2.7702839374542236, "step": 1838 }, { "epoch": 1.74, "grad_norm": 33.06009292602539, "learning_rate": 2.3399790136411333e-07, "logps/chosen": -56.79712677001953, "logps/rejected": -64.22746276855469, "loss": 0.5691, "losses/dpo": 0.3352791666984558, "losses/sft": 1.94242262840271, "losses/total": 0.3352791666984558, "ref_logps/chosen": -40.31622314453125, "ref_logps/rejected": -40.16978454589844, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6480904817581177, "rewards/margins": 0.757677435874939, "rewards/rejected": -2.4057679176330566, "step": 1839 }, { "epoch": 1.74, "grad_norm": 19.5128231048584, "learning_rate": 2.3382301504022384e-07, "logps/chosen": -46.164161682128906, "logps/rejected": -75.56565856933594, "loss": 0.2953, "losses/dpo": 0.6225314140319824, "losses/sft": 1.8770743608474731, "losses/total": 0.6225314140319824, "ref_logps/chosen": -27.919498443603516, "ref_logps/rejected": -39.514408111572266, "rewards/accuracies": 0.875, "rewards/chosen": -1.8244662284851074, "rewards/margins": 1.7806590795516968, "rewards/rejected": -3.6051251888275146, "step": 1840 }, { "epoch": 1.74, "grad_norm": 21.03430938720703, "learning_rate": 2.3364812871633438e-07, "logps/chosen": -60.95615768432617, "logps/rejected": -65.29434967041016, "loss": 0.4591, "losses/dpo": 0.2938690781593323, "losses/sft": 1.9467700719833374, "losses/total": 0.2938690781593323, "ref_logps/chosen": -45.059661865234375, "ref_logps/rejected": -38.9007568359375, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5896495580673218, "rewards/margins": 1.0497106313705444, "rewards/rejected": -2.639360189437866, "step": 1841 }, { "epoch": 1.74, "grad_norm": 24.941936492919922, "learning_rate": 2.334732423924449e-07, "logps/chosen": -53.41764831542969, "logps/rejected": -66.00311279296875, "loss": 0.396, "losses/dpo": 0.2544431984424591, "losses/sft": 1.4687739610671997, "losses/total": 0.2544431984424591, "ref_logps/chosen": -37.06456756591797, "ref_logps/rejected": -36.036800384521484, "rewards/accuracies": 0.875, "rewards/chosen": -1.635308027267456, "rewards/margins": 1.3613231182098389, "rewards/rejected": -2.996631145477295, "step": 1842 }, { "epoch": 1.74, "grad_norm": 29.51819610595703, "learning_rate": 2.3329835606855543e-07, "logps/chosen": -63.126502990722656, "logps/rejected": -66.65913391113281, "loss": 0.51, "losses/dpo": 0.37955766916275024, "losses/sft": 1.9559439420700073, "losses/total": 0.37955766916275024, "ref_logps/chosen": -46.511863708496094, "ref_logps/rejected": -41.78247833251953, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6614642143249512, "rewards/margins": 0.8262010216712952, "rewards/rejected": -2.4876651763916016, "step": 1843 }, { "epoch": 1.74, "grad_norm": 36.848724365234375, "learning_rate": 2.3312346974466597e-07, "logps/chosen": -42.624046325683594, "logps/rejected": -54.71479415893555, "loss": 0.4782, "losses/dpo": 0.4191596508026123, "losses/sft": 2.1579864025115967, "losses/total": 0.4191596508026123, "ref_logps/chosen": -29.775489807128906, "ref_logps/rejected": -30.61602210998535, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2848554849624634, "rewards/margins": 1.1250216960906982, "rewards/rejected": -2.409877300262451, "step": 1844 }, { "epoch": 1.74, "grad_norm": 23.67749786376953, "learning_rate": 2.3294858342077648e-07, "logps/chosen": -49.564029693603516, "logps/rejected": -71.5465087890625, "loss": 0.3629, "losses/dpo": 0.5933958292007446, "losses/sft": 2.0923280715942383, "losses/total": 0.5933958292007446, "ref_logps/chosen": -37.54664993286133, "ref_logps/rejected": -43.87751770019531, "rewards/accuracies": 0.8125, "rewards/chosen": -1.201737880706787, "rewards/margins": 1.565162181854248, "rewards/rejected": -2.766900062561035, "step": 1845 }, { "epoch": 1.74, "grad_norm": 20.85272216796875, "learning_rate": 2.3277369709688702e-07, "logps/chosen": -46.61496353149414, "logps/rejected": -62.098121643066406, "loss": 0.3089, "losses/dpo": 0.19887515902519226, "losses/sft": 1.8970520496368408, "losses/total": 0.19887515902519226, "ref_logps/chosen": -33.598785400390625, "ref_logps/rejected": -34.05204772949219, "rewards/accuracies": 0.875, "rewards/chosen": -1.3016180992126465, "rewards/margins": 1.5029888153076172, "rewards/rejected": -2.8046069145202637, "step": 1846 }, { "epoch": 1.74, "grad_norm": 33.110694885253906, "learning_rate": 2.3259881077299753e-07, "logps/chosen": -66.85472106933594, "logps/rejected": -73.36692810058594, "loss": 0.494, "losses/dpo": 0.4233478903770447, "losses/sft": 2.028045415878296, "losses/total": 0.4233478903770447, "ref_logps/chosen": -49.42625427246094, "ref_logps/rejected": -45.749610900878906, "rewards/accuracies": 0.8125, "rewards/chosen": -1.742846965789795, "rewards/margins": 1.0188846588134766, "rewards/rejected": -2.7617316246032715, "step": 1847 }, { "epoch": 1.75, "grad_norm": 22.06517791748047, "learning_rate": 2.3242392444910807e-07, "logps/chosen": -61.929054260253906, "logps/rejected": -74.1783447265625, "loss": 0.3416, "losses/dpo": 0.2992900013923645, "losses/sft": 2.08903431892395, "losses/total": 0.2992900013923645, "ref_logps/chosen": -46.72382736206055, "ref_logps/rejected": -44.66014862060547, "rewards/accuracies": 0.875, "rewards/chosen": -1.5205228328704834, "rewards/margins": 1.4312971830368042, "rewards/rejected": -2.951820135116577, "step": 1848 }, { "epoch": 1.75, "grad_norm": 24.4124755859375, "learning_rate": 2.3224903812521858e-07, "logps/chosen": -49.27168273925781, "logps/rejected": -66.37191772460938, "loss": 0.4083, "losses/dpo": 0.501542329788208, "losses/sft": 1.5513527393341064, "losses/total": 0.501542329788208, "ref_logps/chosen": -37.170188903808594, "ref_logps/rejected": -41.29085159301758, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2101495265960693, "rewards/margins": 1.297957181930542, "rewards/rejected": -2.5081067085266113, "step": 1849 }, { "epoch": 1.75, "grad_norm": 23.928943634033203, "learning_rate": 2.3207415180132915e-07, "logps/chosen": -49.31248474121094, "logps/rejected": -69.08738708496094, "loss": 0.3848, "losses/dpo": 0.411376416683197, "losses/sft": 1.747654914855957, "losses/total": 0.411376416683197, "ref_logps/chosen": -31.206275939941406, "ref_logps/rejected": -38.59733963012695, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8106211423873901, "rewards/margins": 1.2383838891983032, "rewards/rejected": -3.0490050315856934, "step": 1850 }, { "epoch": 1.75, "grad_norm": 27.012725830078125, "learning_rate": 2.3189926547743966e-07, "logps/chosen": -60.662445068359375, "logps/rejected": -81.42811584472656, "loss": 0.3731, "losses/dpo": 0.3123415410518646, "losses/sft": 2.4386916160583496, "losses/total": 0.3123415410518646, "ref_logps/chosen": -43.75713348388672, "ref_logps/rejected": -49.12187957763672, "rewards/accuracies": 0.875, "rewards/chosen": -1.6905311346054077, "rewards/margins": 1.5400934219360352, "rewards/rejected": -3.2306246757507324, "step": 1851 }, { "epoch": 1.75, "grad_norm": 30.193464279174805, "learning_rate": 2.3172437915355018e-07, "logps/chosen": -63.7210693359375, "logps/rejected": -66.50696563720703, "loss": 0.4058, "losses/dpo": 0.6251382231712341, "losses/sft": 2.281372547149658, "losses/total": 0.6251382231712341, "ref_logps/chosen": -44.03181457519531, "ref_logps/rejected": -36.02568435668945, "rewards/accuracies": 0.875, "rewards/chosen": -1.9689252376556396, "rewards/margins": 1.0792032480239868, "rewards/rejected": -3.048128604888916, "step": 1852 }, { "epoch": 1.75, "grad_norm": 25.303871154785156, "learning_rate": 2.3154949282966071e-07, "logps/chosen": -53.38203430175781, "logps/rejected": -78.54144287109375, "loss": 0.3124, "losses/dpo": 0.5744849443435669, "losses/sft": 2.2462775707244873, "losses/total": 0.5744849443435669, "ref_logps/chosen": -40.01557159423828, "ref_logps/rejected": -49.328025817871094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.336646556854248, "rewards/margins": 1.584695816040039, "rewards/rejected": -2.921342372894287, "step": 1853 }, { "epoch": 1.75, "grad_norm": 18.691640853881836, "learning_rate": 2.3137460650577123e-07, "logps/chosen": -44.97010803222656, "logps/rejected": -75.71627044677734, "loss": 0.321, "losses/dpo": 0.175616055727005, "losses/sft": 1.6014487743377686, "losses/total": 0.175616055727005, "ref_logps/chosen": -32.952781677246094, "ref_logps/rejected": -47.409908294677734, "rewards/accuracies": 0.875, "rewards/chosen": -1.201733112335205, "rewards/margins": 1.6289032697677612, "rewards/rejected": -2.8306362628936768, "step": 1854 }, { "epoch": 1.75, "grad_norm": 15.648659706115723, "learning_rate": 2.3119972018188177e-07, "logps/chosen": -34.525760650634766, "logps/rejected": -66.15505981445312, "loss": 0.276, "losses/dpo": 0.4646318554878235, "losses/sft": 1.5340651273727417, "losses/total": 0.4646318554878235, "ref_logps/chosen": -27.546180725097656, "ref_logps/rejected": -41.33037567138672, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6979577541351318, "rewards/margins": 1.7845112085342407, "rewards/rejected": -2.482469081878662, "step": 1855 }, { "epoch": 1.75, "grad_norm": 21.946725845336914, "learning_rate": 2.3102483385799228e-07, "logps/chosen": -53.30470275878906, "logps/rejected": -75.9454574584961, "loss": 0.3318, "losses/dpo": 0.5257679224014282, "losses/sft": 1.831146240234375, "losses/total": 0.5257679224014282, "ref_logps/chosen": -38.76506042480469, "ref_logps/rejected": -46.32561492919922, "rewards/accuracies": 0.875, "rewards/chosen": -1.4539637565612793, "rewards/margins": 1.5080205202102661, "rewards/rejected": -2.961984395980835, "step": 1856 }, { "epoch": 1.75, "grad_norm": 24.31056022644043, "learning_rate": 2.3084994753410284e-07, "logps/chosen": -61.453392028808594, "logps/rejected": -68.18570709228516, "loss": 0.459, "losses/dpo": 0.37328892946243286, "losses/sft": 2.5342836380004883, "losses/total": 0.37328892946243286, "ref_logps/chosen": -43.84550857543945, "ref_logps/rejected": -42.47376251220703, "rewards/accuracies": 0.75, "rewards/chosen": -1.7607884407043457, "rewards/margins": 0.810405969619751, "rewards/rejected": -2.5711944103240967, "step": 1857 }, { "epoch": 1.75, "grad_norm": 25.795124053955078, "learning_rate": 2.3067506121021336e-07, "logps/chosen": -58.45408630371094, "logps/rejected": -75.3165283203125, "loss": 0.4595, "losses/dpo": 0.39884889125823975, "losses/sft": 1.8553930521011353, "losses/total": 0.39884889125823975, "ref_logps/chosen": -39.70906448364258, "ref_logps/rejected": -43.39494705200195, "rewards/accuracies": 0.75, "rewards/chosen": -1.8745019435882568, "rewards/margins": 1.3176555633544922, "rewards/rejected": -3.192157506942749, "step": 1858 }, { "epoch": 1.76, "grad_norm": 15.022128105163574, "learning_rate": 2.3050017488632387e-07, "logps/chosen": -58.80267333984375, "logps/rejected": -86.05952453613281, "loss": 0.2289, "losses/dpo": 0.3939743638038635, "losses/sft": 2.242748737335205, "losses/total": 0.3939743638038635, "ref_logps/chosen": -44.415199279785156, "ref_logps/rejected": -53.97506332397461, "rewards/accuracies": 1.0, "rewards/chosen": -1.4387474060058594, "rewards/margins": 1.7696987390518188, "rewards/rejected": -3.2084462642669678, "step": 1859 }, { "epoch": 1.76, "grad_norm": 31.836544036865234, "learning_rate": 2.303252885624344e-07, "logps/chosen": -58.42820739746094, "logps/rejected": -78.09474182128906, "loss": 0.5332, "losses/dpo": 0.4501548111438751, "losses/sft": 1.7386133670806885, "losses/total": 0.4501548111438751, "ref_logps/chosen": -41.048240661621094, "ref_logps/rejected": -49.97856903076172, "rewards/accuracies": 0.75, "rewards/chosen": -1.7379968166351318, "rewards/margins": 1.073620319366455, "rewards/rejected": -2.811617136001587, "step": 1860 }, { "epoch": 1.76, "grad_norm": 29.845956802368164, "learning_rate": 2.3015040223854492e-07, "logps/chosen": -46.830650329589844, "logps/rejected": -53.74964141845703, "loss": 0.5937, "losses/dpo": 0.7010772824287415, "losses/sft": 1.4836887121200562, "losses/total": 0.7010772824287415, "ref_logps/chosen": -33.31001281738281, "ref_logps/rejected": -32.46253204345703, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3520640134811401, "rewards/margins": 0.7766469717025757, "rewards/rejected": -2.128710985183716, "step": 1861 }, { "epoch": 1.76, "grad_norm": 27.349044799804688, "learning_rate": 2.2997551591465546e-07, "logps/chosen": -61.350276947021484, "logps/rejected": -81.38579559326172, "loss": 0.4225, "losses/dpo": 0.19630475342273712, "losses/sft": 1.9088757038116455, "losses/total": 0.19630475342273712, "ref_logps/chosen": -41.3489990234375, "ref_logps/rejected": -50.82018280029297, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0001277923583984, "rewards/margins": 1.0564332008361816, "rewards/rejected": -3.056561231613159, "step": 1862 }, { "epoch": 1.76, "grad_norm": 23.62197494506836, "learning_rate": 2.29800629590766e-07, "logps/chosen": -49.401824951171875, "logps/rejected": -73.51557159423828, "loss": 0.4025, "losses/dpo": 0.6817601919174194, "losses/sft": 1.966322660446167, "losses/total": 0.6817601919174194, "ref_logps/chosen": -36.905059814453125, "ref_logps/rejected": -44.787437438964844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2496763467788696, "rewards/margins": 1.6231366395950317, "rewards/rejected": -2.8728129863739014, "step": 1863 }, { "epoch": 1.76, "grad_norm": 22.961708068847656, "learning_rate": 2.2962574326687654e-07, "logps/chosen": -46.41362762451172, "logps/rejected": -69.81623840332031, "loss": 0.4347, "losses/dpo": 0.19165581464767456, "losses/sft": 1.7985776662826538, "losses/total": 0.19165581464767456, "ref_logps/chosen": -32.241111755371094, "ref_logps/rejected": -44.07400131225586, "rewards/accuracies": 0.875, "rewards/chosen": -1.4172515869140625, "rewards/margins": 1.1569719314575195, "rewards/rejected": -2.574223518371582, "step": 1864 }, { "epoch": 1.76, "grad_norm": 32.46565246582031, "learning_rate": 2.2945085694298705e-07, "logps/chosen": -61.946746826171875, "logps/rejected": -59.74442672729492, "loss": 0.6418, "losses/dpo": 0.5373279452323914, "losses/sft": 2.1824910640716553, "losses/total": 0.5373279452323914, "ref_logps/chosen": -45.1789665222168, "ref_logps/rejected": -36.91200256347656, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6767781972885132, "rewards/margins": 0.6064640283584595, "rewards/rejected": -2.2832422256469727, "step": 1865 }, { "epoch": 1.76, "grad_norm": 25.684823989868164, "learning_rate": 2.2927597061909756e-07, "logps/chosen": -58.58709716796875, "logps/rejected": -70.03372192382812, "loss": 0.4293, "losses/dpo": 0.32440823316574097, "losses/sft": 1.9917763471603394, "losses/total": 0.32440823316574097, "ref_logps/chosen": -42.10264587402344, "ref_logps/rejected": -42.35670852661133, "rewards/accuracies": 0.75, "rewards/chosen": -1.6484451293945312, "rewards/margins": 1.1192561388015747, "rewards/rejected": -2.7677011489868164, "step": 1866 }, { "epoch": 1.76, "grad_norm": 21.3986759185791, "learning_rate": 2.291010842952081e-07, "logps/chosen": -48.57206344604492, "logps/rejected": -74.04078674316406, "loss": 0.3395, "losses/dpo": 0.37597930431365967, "losses/sft": 1.6027623414993286, "losses/total": 0.37597930431365967, "ref_logps/chosen": -34.158077239990234, "ref_logps/rejected": -44.176666259765625, "rewards/accuracies": 0.875, "rewards/chosen": -1.4413988590240479, "rewards/margins": 1.5450128316879272, "rewards/rejected": -2.9864115715026855, "step": 1867 }, { "epoch": 1.76, "grad_norm": 23.15447425842285, "learning_rate": 2.2892619797131862e-07, "logps/chosen": -62.379058837890625, "logps/rejected": -90.55410766601562, "loss": 0.4447, "losses/dpo": 0.4550441801548004, "losses/sft": 2.0679984092712402, "losses/total": 0.4550441801548004, "ref_logps/chosen": -45.16303634643555, "ref_logps/rejected": -58.38995361328125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7216026782989502, "rewards/margins": 1.4948129653930664, "rewards/rejected": -3.2164158821105957, "step": 1868 }, { "epoch": 1.76, "grad_norm": 24.27429962158203, "learning_rate": 2.2875131164742918e-07, "logps/chosen": -49.776588439941406, "logps/rejected": -69.47943115234375, "loss": 0.4242, "losses/dpo": 0.5417318940162659, "losses/sft": 2.0911362171173096, "losses/total": 0.5417318940162659, "ref_logps/chosen": -35.80033493041992, "ref_logps/rejected": -44.90788269042969, "rewards/accuracies": 0.8125, "rewards/chosen": -1.397625207901001, "rewards/margins": 1.0595295429229736, "rewards/rejected": -2.4571547508239746, "step": 1869 }, { "epoch": 1.77, "grad_norm": 23.325531005859375, "learning_rate": 2.285764253235397e-07, "logps/chosen": -47.95051574707031, "logps/rejected": -61.81595993041992, "loss": 0.4171, "losses/dpo": 0.7720044255256653, "losses/sft": 1.924033761024475, "losses/total": 0.7720044255256653, "ref_logps/chosen": -35.24822235107422, "ref_logps/rejected": -38.147430419921875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2702295780181885, "rewards/margins": 1.0966236591339111, "rewards/rejected": -2.3668532371520996, "step": 1870 }, { "epoch": 1.77, "grad_norm": 22.157634735107422, "learning_rate": 2.2840153899965023e-07, "logps/chosen": -40.46189880371094, "logps/rejected": -58.247581481933594, "loss": 0.4623, "losses/dpo": 0.4775947034358978, "losses/sft": 1.3777129650115967, "losses/total": 0.4775947034358978, "ref_logps/chosen": -26.85367774963379, "ref_logps/rejected": -37.701988220214844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3608219623565674, "rewards/margins": 0.6937375068664551, "rewards/rejected": -2.0545594692230225, "step": 1871 }, { "epoch": 1.77, "grad_norm": 20.059139251708984, "learning_rate": 2.2822665267576075e-07, "logps/chosen": -47.75612258911133, "logps/rejected": -69.51106262207031, "loss": 0.4207, "losses/dpo": 0.5288746356964111, "losses/sft": 1.7966135740280151, "losses/total": 0.5288746356964111, "ref_logps/chosen": -33.53255844116211, "ref_logps/rejected": -42.634315490722656, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4223566055297852, "rewards/margins": 1.2653181552886963, "rewards/rejected": -2.6876747608184814, "step": 1872 }, { "epoch": 1.77, "grad_norm": 32.08280944824219, "learning_rate": 2.2805176635187126e-07, "logps/chosen": -56.78495788574219, "logps/rejected": -76.56369018554688, "loss": 0.5646, "losses/dpo": 0.4444050192832947, "losses/sft": 2.156550407409668, "losses/total": 0.4444050192832947, "ref_logps/chosen": -38.83460998535156, "ref_logps/rejected": -50.37071990966797, "rewards/accuracies": 0.6875, "rewards/chosen": -1.795034408569336, "rewards/margins": 0.8242623805999756, "rewards/rejected": -2.6192967891693115, "step": 1873 }, { "epoch": 1.77, "grad_norm": 16.14358139038086, "learning_rate": 2.278768800279818e-07, "logps/chosen": -39.84698486328125, "logps/rejected": -73.8427734375, "loss": 0.2922, "losses/dpo": 0.3803018629550934, "losses/sft": 1.5653516054153442, "losses/total": 0.3803018629550934, "ref_logps/chosen": -29.583412170410156, "ref_logps/rejected": -42.70185852050781, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0263575315475464, "rewards/margins": 2.0877342224121094, "rewards/rejected": -3.114091634750366, "step": 1874 }, { "epoch": 1.77, "grad_norm": 22.408313751220703, "learning_rate": 2.277019937040923e-07, "logps/chosen": -54.31877899169922, "logps/rejected": -65.66378021240234, "loss": 0.3885, "losses/dpo": 0.44363775849342346, "losses/sft": 1.8331607580184937, "losses/total": 0.44363775849342346, "ref_logps/chosen": -37.31925964355469, "ref_logps/rejected": -39.318138122558594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6999518871307373, "rewards/margins": 0.9346121549606323, "rewards/rejected": -2.63456392288208, "step": 1875 }, { "epoch": 1.77, "grad_norm": 22.377567291259766, "learning_rate": 2.2752710738020288e-07, "logps/chosen": -38.93741989135742, "logps/rejected": -54.418758392333984, "loss": 0.4623, "losses/dpo": 0.25928351283073425, "losses/sft": 1.7262192964553833, "losses/total": 0.25928351283073425, "ref_logps/chosen": -30.986614227294922, "ref_logps/rejected": -32.59572982788086, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7950806617736816, "rewards/margins": 1.3872219324111938, "rewards/rejected": -2.182302474975586, "step": 1876 }, { "epoch": 1.77, "grad_norm": 23.78643035888672, "learning_rate": 2.273522210563134e-07, "logps/chosen": -40.39408493041992, "logps/rejected": -62.54866027832031, "loss": 0.4088, "losses/dpo": 0.5053045153617859, "losses/sft": 1.2312461137771606, "losses/total": 0.5053045153617859, "ref_logps/chosen": -29.41988754272461, "ref_logps/rejected": -38.173805236816406, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0974197387695312, "rewards/margins": 1.3400654792785645, "rewards/rejected": -2.4374852180480957, "step": 1877 }, { "epoch": 1.77, "grad_norm": 24.650856018066406, "learning_rate": 2.2717733473242393e-07, "logps/chosen": -59.539485931396484, "logps/rejected": -70.56353759765625, "loss": 0.4161, "losses/dpo": 0.5482534170150757, "losses/sft": 2.2541956901550293, "losses/total": 0.5482534170150757, "ref_logps/chosen": -42.4941520690918, "ref_logps/rejected": -42.365028381347656, "rewards/accuracies": 0.8125, "rewards/chosen": -1.704533576965332, "rewards/margins": 1.1153171062469482, "rewards/rejected": -2.8198506832122803, "step": 1878 }, { "epoch": 1.77, "grad_norm": 18.941280364990234, "learning_rate": 2.2700244840853444e-07, "logps/chosen": -46.671268463134766, "logps/rejected": -68.30020141601562, "loss": 0.2868, "losses/dpo": 0.24786376953125, "losses/sft": 1.248977780342102, "losses/total": 0.24786376953125, "ref_logps/chosen": -35.240299224853516, "ref_logps/rejected": -41.1260871887207, "rewards/accuracies": 0.875, "rewards/chosen": -1.143097162246704, "rewards/margins": 1.574313759803772, "rewards/rejected": -2.7174108028411865, "step": 1879 }, { "epoch": 1.78, "grad_norm": 21.882057189941406, "learning_rate": 2.2682756208464495e-07, "logps/chosen": -51.05032730102539, "logps/rejected": -79.54434204101562, "loss": 0.3798, "losses/dpo": 0.3674437999725342, "losses/sft": 1.7481441497802734, "losses/total": 0.3674437999725342, "ref_logps/chosen": -37.21192169189453, "ref_logps/rejected": -53.069183349609375, "rewards/accuracies": 0.75, "rewards/chosen": -1.3838404417037964, "rewards/margins": 1.263675332069397, "rewards/rejected": -2.6475157737731934, "step": 1880 }, { "epoch": 1.78, "grad_norm": 19.097293853759766, "learning_rate": 2.266526757607555e-07, "logps/chosen": -49.598358154296875, "logps/rejected": -68.81123352050781, "loss": 0.4006, "losses/dpo": 0.2633708417415619, "losses/sft": 1.7340539693832397, "losses/total": 0.2633708417415619, "ref_logps/chosen": -32.70411682128906, "ref_logps/rejected": -41.301082611083984, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6894242763519287, "rewards/margins": 1.0615906715393066, "rewards/rejected": -2.7510147094726562, "step": 1881 }, { "epoch": 1.78, "grad_norm": 27.385522842407227, "learning_rate": 2.2647778943686603e-07, "logps/chosen": -55.36388397216797, "logps/rejected": -72.80226135253906, "loss": 0.4856, "losses/dpo": 0.5916553139686584, "losses/sft": 1.4422340393066406, "losses/total": 0.5916553139686584, "ref_logps/chosen": -41.77886199951172, "ref_logps/rejected": -47.21708679199219, "rewards/accuracies": 0.75, "rewards/chosen": -1.358501672744751, "rewards/margins": 1.200016736984253, "rewards/rejected": -2.558518409729004, "step": 1882 }, { "epoch": 1.78, "grad_norm": 22.030410766601562, "learning_rate": 2.2630290311297657e-07, "logps/chosen": -49.24067687988281, "logps/rejected": -82.13379669189453, "loss": 0.3313, "losses/dpo": 0.2792872190475464, "losses/sft": 1.9741783142089844, "losses/total": 0.2792872190475464, "ref_logps/chosen": -33.81756591796875, "ref_logps/rejected": -51.035560607910156, "rewards/accuracies": 0.875, "rewards/chosen": -1.542311668395996, "rewards/margins": 1.567512035369873, "rewards/rejected": -3.10982346534729, "step": 1883 }, { "epoch": 1.78, "grad_norm": 19.523178100585938, "learning_rate": 2.2612801678908708e-07, "logps/chosen": -48.15979766845703, "logps/rejected": -64.07328796386719, "loss": 0.3526, "losses/dpo": 0.44792261719703674, "losses/sft": 1.7503501176834106, "losses/total": 0.44792261719703674, "ref_logps/chosen": -36.24867248535156, "ref_logps/rejected": -37.80896759033203, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1911125183105469, "rewards/margins": 1.435319423675537, "rewards/rejected": -2.626431941986084, "step": 1884 }, { "epoch": 1.78, "grad_norm": 28.6821346282959, "learning_rate": 2.2595313046519762e-07, "logps/chosen": -37.17652893066406, "logps/rejected": -51.31489562988281, "loss": 0.5727, "losses/dpo": 0.5008769631385803, "losses/sft": 1.2679917812347412, "losses/total": 0.5008769631385803, "ref_logps/chosen": -24.934823989868164, "ref_logps/rejected": -32.09640121459961, "rewards/accuracies": 0.75, "rewards/chosen": -1.224170446395874, "rewards/margins": 0.6976792216300964, "rewards/rejected": -1.9218497276306152, "step": 1885 }, { "epoch": 1.78, "grad_norm": 15.205330848693848, "learning_rate": 2.2577824414130813e-07, "logps/chosen": -54.19927215576172, "logps/rejected": -79.3831787109375, "loss": 0.2253, "losses/dpo": 0.2139306217432022, "losses/sft": 1.5638699531555176, "losses/total": 0.2139306217432022, "ref_logps/chosen": -40.40202331542969, "ref_logps/rejected": -50.48923873901367, "rewards/accuracies": 1.0, "rewards/chosen": -1.3797242641448975, "rewards/margins": 1.5096697807312012, "rewards/rejected": -2.8893942832946777, "step": 1886 }, { "epoch": 1.78, "grad_norm": 32.752071380615234, "learning_rate": 2.2560335781741865e-07, "logps/chosen": -61.3519172668457, "logps/rejected": -73.46245574951172, "loss": 0.6125, "losses/dpo": 0.27496880292892456, "losses/sft": 1.528637409210205, "losses/total": 0.27496880292892456, "ref_logps/chosen": -43.78230667114258, "ref_logps/rejected": -47.55207061767578, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7569612264633179, "rewards/margins": 0.8340772390365601, "rewards/rejected": -2.591038227081299, "step": 1887 }, { "epoch": 1.78, "grad_norm": 15.8610200881958, "learning_rate": 2.2542847149352919e-07, "logps/chosen": -50.55162811279297, "logps/rejected": -79.12870788574219, "loss": 0.2502, "losses/dpo": 0.138380765914917, "losses/sft": 2.214749574661255, "losses/total": 0.138380765914917, "ref_logps/chosen": -37.85062026977539, "ref_logps/rejected": -50.561622619628906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2701011896133423, "rewards/margins": 1.5866076946258545, "rewards/rejected": -2.8567090034484863, "step": 1888 }, { "epoch": 1.78, "grad_norm": 22.326772689819336, "learning_rate": 2.2525358516963973e-07, "logps/chosen": -55.27908706665039, "logps/rejected": -86.50544738769531, "loss": 0.3719, "losses/dpo": 0.1820342242717743, "losses/sft": 1.4802669286727905, "losses/total": 0.1820342242717743, "ref_logps/chosen": -37.24418258666992, "ref_logps/rejected": -55.28599166870117, "rewards/accuracies": 0.8125, "rewards/chosen": -1.803490161895752, "rewards/margins": 1.3184552192687988, "rewards/rejected": -3.121945381164551, "step": 1889 }, { "epoch": 1.78, "grad_norm": 21.98392677307129, "learning_rate": 2.2507869884575026e-07, "logps/chosen": -58.31983947753906, "logps/rejected": -82.4329833984375, "loss": 0.3394, "losses/dpo": 0.06195071339607239, "losses/sft": 1.7827625274658203, "losses/total": 0.06195071339607239, "ref_logps/chosen": -43.131805419921875, "ref_logps/rejected": -50.879005432128906, "rewards/accuracies": 0.8125, "rewards/chosen": -1.518803358078003, "rewards/margins": 1.6365940570831299, "rewards/rejected": -3.155397415161133, "step": 1890 }, { "epoch": 1.79, "grad_norm": 25.910991668701172, "learning_rate": 2.2490381252186078e-07, "logps/chosen": -61.02592468261719, "logps/rejected": -102.15164947509766, "loss": 0.4079, "losses/dpo": 0.2999795079231262, "losses/sft": 1.4605165719985962, "losses/total": 0.2999795079231262, "ref_logps/chosen": -40.20536422729492, "ref_logps/rejected": -63.65682601928711, "rewards/accuracies": 0.875, "rewards/chosen": -2.0820555686950684, "rewards/margins": 1.7674269676208496, "rewards/rejected": -3.849482774734497, "step": 1891 }, { "epoch": 1.79, "grad_norm": 27.1877384185791, "learning_rate": 2.2472892619797132e-07, "logps/chosen": -56.80752944946289, "logps/rejected": -65.45315551757812, "loss": 0.5826, "losses/dpo": 0.9997936487197876, "losses/sft": 2.1011176109313965, "losses/total": 0.9997936487197876, "ref_logps/chosen": -39.66331481933594, "ref_logps/rejected": -40.86894226074219, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7144218683242798, "rewards/margins": 0.7440000772476196, "rewards/rejected": -2.4584219455718994, "step": 1892 }, { "epoch": 1.79, "grad_norm": 21.7919921875, "learning_rate": 2.2455403987408183e-07, "logps/chosen": -54.760337829589844, "logps/rejected": -55.419273376464844, "loss": 0.3595, "losses/dpo": 0.6955257058143616, "losses/sft": 2.0487747192382812, "losses/total": 0.6955257058143616, "ref_logps/chosen": -44.6641845703125, "ref_logps/rejected": -34.3460693359375, "rewards/accuracies": 0.875, "rewards/chosen": -1.009615182876587, "rewards/margins": 1.097705364227295, "rewards/rejected": -2.1073203086853027, "step": 1893 }, { "epoch": 1.79, "grad_norm": 15.337175369262695, "learning_rate": 2.2437915355019234e-07, "logps/chosen": -51.50416564941406, "logps/rejected": -100.27093505859375, "loss": 0.174, "losses/dpo": 0.35395389795303345, "losses/sft": 2.671549081802368, "losses/total": 0.35395389795303345, "ref_logps/chosen": -34.17205047607422, "ref_logps/rejected": -58.02793884277344, "rewards/accuracies": 1.0, "rewards/chosen": -1.7332121133804321, "rewards/margins": 2.4910879135131836, "rewards/rejected": -4.224300384521484, "step": 1894 }, { "epoch": 1.79, "grad_norm": 17.456411361694336, "learning_rate": 2.242042672263029e-07, "logps/chosen": -47.11567687988281, "logps/rejected": -61.513282775878906, "loss": 0.3349, "losses/dpo": 0.255015105009079, "losses/sft": 1.9937318563461304, "losses/total": 0.255015105009079, "ref_logps/chosen": -35.42218780517578, "ref_logps/rejected": -36.297943115234375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1693490743637085, "rewards/margins": 1.3521851301193237, "rewards/rejected": -2.5215344429016113, "step": 1895 }, { "epoch": 1.79, "grad_norm": 21.69206428527832, "learning_rate": 2.2402938090241342e-07, "logps/chosen": -56.75226593017578, "logps/rejected": -63.572509765625, "loss": 0.421, "losses/dpo": 0.4266464412212372, "losses/sft": 2.0657413005828857, "losses/total": 0.4266464412212372, "ref_logps/chosen": -40.8559455871582, "ref_logps/rejected": -36.38270568847656, "rewards/accuracies": 0.75, "rewards/chosen": -1.5896315574645996, "rewards/margins": 1.1293489933013916, "rewards/rejected": -2.718980550765991, "step": 1896 }, { "epoch": 1.79, "grad_norm": 24.08820343017578, "learning_rate": 2.2385449457852396e-07, "logps/chosen": -52.32395935058594, "logps/rejected": -86.34336853027344, "loss": 0.3095, "losses/dpo": 0.132833331823349, "losses/sft": 1.6938538551330566, "losses/total": 0.132833331823349, "ref_logps/chosen": -37.865142822265625, "ref_logps/rejected": -53.67011642456055, "rewards/accuracies": 0.875, "rewards/chosen": -1.4458816051483154, "rewards/margins": 1.8214439153671265, "rewards/rejected": -3.2673256397247314, "step": 1897 }, { "epoch": 1.79, "grad_norm": 24.440874099731445, "learning_rate": 2.2367960825463447e-07, "logps/chosen": -48.53782272338867, "logps/rejected": -75.16322326660156, "loss": 0.3891, "losses/dpo": 0.7033171057701111, "losses/sft": 2.48795223236084, "losses/total": 0.7033171057701111, "ref_logps/chosen": -33.62128829956055, "ref_logps/rejected": -41.39613723754883, "rewards/accuracies": 0.75, "rewards/chosen": -1.491653561592102, "rewards/margins": 1.885054349899292, "rewards/rejected": -3.3767080307006836, "step": 1898 }, { "epoch": 1.79, "grad_norm": 28.448230743408203, "learning_rate": 2.23504721930745e-07, "logps/chosen": -51.7152099609375, "logps/rejected": -67.97134399414062, "loss": 0.4612, "losses/dpo": 0.4561270773410797, "losses/sft": 2.098419189453125, "losses/total": 0.4561270773410797, "ref_logps/chosen": -34.344764709472656, "ref_logps/rejected": -39.8526611328125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.737044334411621, "rewards/margins": 1.0748240947723389, "rewards/rejected": -2.811868190765381, "step": 1899 }, { "epoch": 1.79, "grad_norm": 27.743558883666992, "learning_rate": 2.2332983560685552e-07, "logps/chosen": -72.33219909667969, "logps/rejected": -100.0615005493164, "loss": 0.4026, "losses/dpo": 0.13542529940605164, "losses/sft": 2.0535051822662354, "losses/total": 0.13542529940605164, "ref_logps/chosen": -52.81183624267578, "ref_logps/rejected": -67.53926086425781, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9520361423492432, "rewards/margins": 1.3001880645751953, "rewards/rejected": -3.2522242069244385, "step": 1900 }, { "epoch": 1.8, "grad_norm": 23.906633377075195, "learning_rate": 2.2315494928296604e-07, "logps/chosen": -67.08051300048828, "logps/rejected": -79.08184814453125, "loss": 0.3348, "losses/dpo": 0.3648269772529602, "losses/sft": 1.8850147724151611, "losses/total": 0.3648269772529602, "ref_logps/chosen": -48.83037185668945, "ref_logps/rejected": -46.057395935058594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.825014591217041, "rewards/margins": 1.4774305820465088, "rewards/rejected": -3.302445411682129, "step": 1901 }, { "epoch": 1.8, "grad_norm": 28.172666549682617, "learning_rate": 2.229800629590766e-07, "logps/chosen": -51.03199005126953, "logps/rejected": -72.94522094726562, "loss": 0.4074, "losses/dpo": 0.616869330406189, "losses/sft": 1.9095754623413086, "losses/total": 0.616869330406189, "ref_logps/chosen": -33.90076446533203, "ref_logps/rejected": -44.803680419921875, "rewards/accuracies": 0.75, "rewards/chosen": -1.7131222486495972, "rewards/margins": 1.1010315418243408, "rewards/rejected": -2.8141536712646484, "step": 1902 }, { "epoch": 1.8, "grad_norm": 25.836366653442383, "learning_rate": 2.2280517663518711e-07, "logps/chosen": -51.030887603759766, "logps/rejected": -74.72064208984375, "loss": 0.4934, "losses/dpo": 0.9905093312263489, "losses/sft": 2.5590171813964844, "losses/total": 0.9905093312263489, "ref_logps/chosen": -33.1800537109375, "ref_logps/rejected": -44.894935607910156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7850837707519531, "rewards/margins": 1.1974868774414062, "rewards/rejected": -2.9825706481933594, "step": 1903 }, { "epoch": 1.8, "grad_norm": 26.964326858520508, "learning_rate": 2.2263029031129765e-07, "logps/chosen": -52.39605712890625, "logps/rejected": -62.85914993286133, "loss": 0.6155, "losses/dpo": 0.25205937027931213, "losses/sft": 1.5752063989639282, "losses/total": 0.25205937027931213, "ref_logps/chosen": -34.76396179199219, "ref_logps/rejected": -37.6327018737793, "rewards/accuracies": 0.6875, "rewards/chosen": -1.763209581375122, "rewards/margins": 0.7594348192214966, "rewards/rejected": -2.522644519805908, "step": 1904 }, { "epoch": 1.8, "grad_norm": 22.541709899902344, "learning_rate": 2.2245540398740817e-07, "logps/chosen": -42.903236389160156, "logps/rejected": -58.880409240722656, "loss": 0.484, "losses/dpo": 0.5868808627128601, "losses/sft": 1.7349990606307983, "losses/total": 0.5868808627128601, "ref_logps/chosen": -29.372034072875977, "ref_logps/rejected": -36.09337615966797, "rewards/accuracies": 0.75, "rewards/chosen": -1.3531203269958496, "rewards/margins": 0.9255828261375427, "rewards/rejected": -2.278702974319458, "step": 1905 }, { "epoch": 1.8, "grad_norm": 19.29807472229004, "learning_rate": 2.222805176635187e-07, "logps/chosen": -51.947608947753906, "logps/rejected": -80.24134063720703, "loss": 0.3714, "losses/dpo": 0.3035661280155182, "losses/sft": 2.176285743713379, "losses/total": 0.3035661280155182, "ref_logps/chosen": -37.67027282714844, "ref_logps/rejected": -48.94196701049805, "rewards/accuracies": 0.75, "rewards/chosen": -1.4277340173721313, "rewards/margins": 1.7022035121917725, "rewards/rejected": -3.1299376487731934, "step": 1906 }, { "epoch": 1.8, "grad_norm": 15.007548332214355, "learning_rate": 2.2210563133962922e-07, "logps/chosen": -49.67732238769531, "logps/rejected": -91.13972473144531, "loss": 0.184, "losses/dpo": 0.048366762697696686, "losses/sft": 1.3094068765640259, "losses/total": 0.048366762697696686, "ref_logps/chosen": -36.67182159423828, "ref_logps/rejected": -52.19459533691406, "rewards/accuracies": 0.875, "rewards/chosen": -1.3005497455596924, "rewards/margins": 2.593963146209717, "rewards/rejected": -3.8945131301879883, "step": 1907 }, { "epoch": 1.8, "grad_norm": 22.120275497436523, "learning_rate": 2.2193074501573976e-07, "logps/chosen": -42.411258697509766, "logps/rejected": -73.38336181640625, "loss": 0.3994, "losses/dpo": 0.4103068709373474, "losses/sft": 1.3370667695999146, "losses/total": 0.4103068709373474, "ref_logps/chosen": -28.12499237060547, "ref_logps/rejected": -46.879371643066406, "rewards/accuracies": 0.75, "rewards/chosen": -1.4286270141601562, "rewards/margins": 1.2217724323272705, "rewards/rejected": -2.6503994464874268, "step": 1908 }, { "epoch": 1.8, "grad_norm": 21.12205696105957, "learning_rate": 2.217558586918503e-07, "logps/chosen": -50.89750671386719, "logps/rejected": -72.23309326171875, "loss": 0.2866, "losses/dpo": 0.6609861850738525, "losses/sft": 1.9567790031433105, "losses/total": 0.6609861850738525, "ref_logps/chosen": -39.11732482910156, "ref_logps/rejected": -41.536781311035156, "rewards/accuracies": 0.875, "rewards/chosen": -1.1780179738998413, "rewards/margins": 1.8916127681732178, "rewards/rejected": -3.0696308612823486, "step": 1909 }, { "epoch": 1.8, "grad_norm": 27.44991683959961, "learning_rate": 2.215809723679608e-07, "logps/chosen": -58.320369720458984, "logps/rejected": -66.22344970703125, "loss": 0.6954, "losses/dpo": 1.1899237632751465, "losses/sft": 2.1805202960968018, "losses/total": 1.1899237632751465, "ref_logps/chosen": -42.65129089355469, "ref_logps/rejected": -42.52025604248047, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5669080018997192, "rewards/margins": 0.8034118413925171, "rewards/rejected": -2.3703198432922363, "step": 1910 }, { "epoch": 1.8, "grad_norm": 24.695268630981445, "learning_rate": 2.2140608604407135e-07, "logps/chosen": -50.96708679199219, "logps/rejected": -64.78813171386719, "loss": 0.5172, "losses/dpo": 0.35559552907943726, "losses/sft": 1.9054945707321167, "losses/total": 0.35559552907943726, "ref_logps/chosen": -39.908721923828125, "ref_logps/rejected": -42.99611282348633, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1058368682861328, "rewards/margins": 1.0733649730682373, "rewards/rejected": -2.17920184135437, "step": 1911 }, { "epoch": 1.81, "grad_norm": 28.08316993713379, "learning_rate": 2.2123119972018186e-07, "logps/chosen": -58.56736755371094, "logps/rejected": -68.26663970947266, "loss": 0.4828, "losses/dpo": 0.7495432496070862, "losses/sft": 2.0531625747680664, "losses/total": 0.7495432496070862, "ref_logps/chosen": -47.00447082519531, "ref_logps/rejected": -43.70998764038086, "rewards/accuracies": 0.625, "rewards/chosen": -1.1562894582748413, "rewards/margins": 1.2993755340576172, "rewards/rejected": -2.455665111541748, "step": 1912 }, { "epoch": 1.81, "grad_norm": 24.238311767578125, "learning_rate": 2.210563133962924e-07, "logps/chosen": -40.96937561035156, "logps/rejected": -79.12947845458984, "loss": 0.3677, "losses/dpo": 0.2563820481300354, "losses/sft": 1.4979825019836426, "losses/total": 0.2563820481300354, "ref_logps/chosen": -31.857439041137695, "ref_logps/rejected": -51.90716552734375, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9111933708190918, "rewards/margins": 1.8110376596450806, "rewards/rejected": -2.722230911254883, "step": 1913 }, { "epoch": 1.81, "grad_norm": 15.501830101013184, "learning_rate": 2.2088142707240294e-07, "logps/chosen": -56.0780029296875, "logps/rejected": -72.89102172851562, "loss": 0.2175, "losses/dpo": 0.11090554296970367, "losses/sft": 1.774894118309021, "losses/total": 0.11090554296970367, "ref_logps/chosen": -42.51144027709961, "ref_logps/rejected": -39.92144775390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3566561937332153, "rewards/margins": 1.9403011798858643, "rewards/rejected": -3.296957492828369, "step": 1914 }, { "epoch": 1.81, "grad_norm": 23.787960052490234, "learning_rate": 2.2070654074851348e-07, "logps/chosen": -48.06635284423828, "logps/rejected": -66.04922485351562, "loss": 0.4061, "losses/dpo": 0.4237924814224243, "losses/sft": 1.9574851989746094, "losses/total": 0.4237924814224243, "ref_logps/chosen": -35.15850067138672, "ref_logps/rejected": -41.25889205932617, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2907847166061401, "rewards/margins": 1.1882489919662476, "rewards/rejected": -2.4790337085723877, "step": 1915 }, { "epoch": 1.81, "grad_norm": 15.393126487731934, "learning_rate": 2.20531654424624e-07, "logps/chosen": -53.83302307128906, "logps/rejected": -91.80354309082031, "loss": 0.2203, "losses/dpo": 0.16131183505058289, "losses/sft": 1.8837714195251465, "losses/total": 0.16131183505058289, "ref_logps/chosen": -39.37416076660156, "ref_logps/rejected": -56.51863098144531, "rewards/accuracies": 0.875, "rewards/chosen": -1.445886492729187, "rewards/margins": 2.0826048851013184, "rewards/rejected": -3.528491497039795, "step": 1916 }, { "epoch": 1.81, "grad_norm": 21.387184143066406, "learning_rate": 2.203567681007345e-07, "logps/chosen": -60.99091339111328, "logps/rejected": -82.54985046386719, "loss": 0.4402, "losses/dpo": 0.2399728000164032, "losses/sft": 2.080650568008423, "losses/total": 0.2399728000164032, "ref_logps/chosen": -44.19255065917969, "ref_logps/rejected": -50.358707427978516, "rewards/accuracies": 0.9375, "rewards/chosen": -1.679836392402649, "rewards/margins": 1.539278507232666, "rewards/rejected": -3.2191147804260254, "step": 1917 }, { "epoch": 1.81, "grad_norm": 36.407676696777344, "learning_rate": 2.2018188177684504e-07, "logps/chosen": -66.58694458007812, "logps/rejected": -55.33207702636719, "loss": 0.7924, "losses/dpo": 1.0841635465621948, "losses/sft": 1.6932896375656128, "losses/total": 1.0841635465621948, "ref_logps/chosen": -52.79731750488281, "ref_logps/rejected": -36.23854064941406, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3789634704589844, "rewards/margins": 0.5303901433944702, "rewards/rejected": -1.9093537330627441, "step": 1918 }, { "epoch": 1.81, "grad_norm": 25.639163970947266, "learning_rate": 2.2000699545295555e-07, "logps/chosen": -73.86323547363281, "logps/rejected": -64.93158721923828, "loss": 0.5128, "losses/dpo": 0.8661154508590698, "losses/sft": 1.9866821765899658, "losses/total": 0.8661154508590698, "ref_logps/chosen": -57.46168518066406, "ref_logps/rejected": -38.84584045410156, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6401550769805908, "rewards/margins": 0.9684202075004578, "rewards/rejected": -2.6085753440856934, "step": 1919 }, { "epoch": 1.81, "grad_norm": 27.515398025512695, "learning_rate": 2.198321091290661e-07, "logps/chosen": -43.468780517578125, "logps/rejected": -51.47468948364258, "loss": 0.8092, "losses/dpo": 1.607559084892273, "losses/sft": 2.081766366958618, "losses/total": 1.607559084892273, "ref_logps/chosen": -31.525959014892578, "ref_logps/rejected": -33.119041442871094, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1942824125289917, "rewards/margins": 0.6412827968597412, "rewards/rejected": -1.835565209388733, "step": 1920 }, { "epoch": 1.81, "grad_norm": 25.673067092895508, "learning_rate": 2.1965722280517663e-07, "logps/chosen": -67.26519012451172, "logps/rejected": -68.62635803222656, "loss": 0.4484, "losses/dpo": 0.341810017824173, "losses/sft": 1.7387341260910034, "losses/total": 0.341810017824173, "ref_logps/chosen": -49.15861892700195, "ref_logps/rejected": -40.55122375488281, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8106573820114136, "rewards/margins": 0.9968562126159668, "rewards/rejected": -2.80751371383667, "step": 1921 }, { "epoch": 1.81, "grad_norm": 20.926837921142578, "learning_rate": 2.1948233648128717e-07, "logps/chosen": -48.886878967285156, "logps/rejected": -79.37821197509766, "loss": 0.3511, "losses/dpo": 0.1634785532951355, "losses/sft": 1.8016397953033447, "losses/total": 0.1634785532951355, "ref_logps/chosen": -34.55718231201172, "ref_logps/rejected": -49.34617614746094, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4329702854156494, "rewards/margins": 1.5702334642410278, "rewards/rejected": -3.003203868865967, "step": 1922 }, { "epoch": 1.82, "grad_norm": 19.447141647338867, "learning_rate": 2.1930745015739768e-07, "logps/chosen": -50.00474548339844, "logps/rejected": -65.13369750976562, "loss": 0.3732, "losses/dpo": 0.50125652551651, "losses/sft": 1.497388243675232, "losses/total": 0.50125652551651, "ref_logps/chosen": -39.18120574951172, "ref_logps/rejected": -41.847801208496094, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0823540687561035, "rewards/margins": 1.2462358474731445, "rewards/rejected": -2.328589916229248, "step": 1923 }, { "epoch": 1.82, "grad_norm": 23.20139503479004, "learning_rate": 2.191325638335082e-07, "logps/chosen": -54.2501220703125, "logps/rejected": -69.33674621582031, "loss": 0.4377, "losses/dpo": 0.40815237164497375, "losses/sft": 1.551851511001587, "losses/total": 0.40815237164497375, "ref_logps/chosen": -42.55793762207031, "ref_logps/rejected": -42.96839904785156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1692180633544922, "rewards/margins": 1.4676170349121094, "rewards/rejected": -2.6368350982666016, "step": 1924 }, { "epoch": 1.82, "grad_norm": 18.53638458251953, "learning_rate": 2.1895767750961874e-07, "logps/chosen": -46.935935974121094, "logps/rejected": -68.35185241699219, "loss": 0.3251, "losses/dpo": 0.5569038987159729, "losses/sft": 2.3086202144622803, "losses/total": 0.5569038987159729, "ref_logps/chosen": -34.66331481933594, "ref_logps/rejected": -40.83999252319336, "rewards/accuracies": 0.875, "rewards/chosen": -1.2272617816925049, "rewards/margins": 1.5239248275756836, "rewards/rejected": -2.7511866092681885, "step": 1925 }, { "epoch": 1.82, "grad_norm": 25.34766960144043, "learning_rate": 2.1878279118572925e-07, "logps/chosen": -47.13518524169922, "logps/rejected": -63.05792236328125, "loss": 0.5489, "losses/dpo": 1.0756182670593262, "losses/sft": 2.5425522327423096, "losses/total": 1.0756182670593262, "ref_logps/chosen": -32.12316131591797, "ref_logps/rejected": -38.68975067138672, "rewards/accuracies": 0.75, "rewards/chosen": -1.50120210647583, "rewards/margins": 0.9356152415275574, "rewards/rejected": -2.436817169189453, "step": 1926 }, { "epoch": 1.82, "grad_norm": 24.83631706237793, "learning_rate": 2.1860790486183981e-07, "logps/chosen": -54.74627685546875, "logps/rejected": -76.66191101074219, "loss": 0.3896, "losses/dpo": 0.22895357012748718, "losses/sft": 2.293583393096924, "losses/total": 0.22895357012748718, "ref_logps/chosen": -38.66905975341797, "ref_logps/rejected": -44.893951416015625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6077216863632202, "rewards/margins": 1.5690736770629883, "rewards/rejected": -3.176795244216919, "step": 1927 }, { "epoch": 1.82, "grad_norm": 14.02584171295166, "learning_rate": 2.1843301853795033e-07, "logps/chosen": -47.70808410644531, "logps/rejected": -81.25924682617188, "loss": 0.2523, "losses/dpo": 0.3759603202342987, "losses/sft": 2.0829734802246094, "losses/total": 0.3759603202342987, "ref_logps/chosen": -33.322837829589844, "ref_logps/rejected": -46.36687469482422, "rewards/accuracies": 0.875, "rewards/chosen": -1.4385244846343994, "rewards/margins": 2.050712823867798, "rewards/rejected": -3.4892373085021973, "step": 1928 }, { "epoch": 1.82, "grad_norm": 19.891864776611328, "learning_rate": 2.1825813221406087e-07, "logps/chosen": -49.841156005859375, "logps/rejected": -73.42411041259766, "loss": 0.3344, "losses/dpo": 0.23590995371341705, "losses/sft": 1.7683762311935425, "losses/total": 0.23590995371341705, "ref_logps/chosen": -34.62481689453125, "ref_logps/rejected": -43.10502624511719, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5216338634490967, "rewards/margins": 1.5102744102478027, "rewards/rejected": -3.0319082736968994, "step": 1929 }, { "epoch": 1.82, "grad_norm": 19.316604614257812, "learning_rate": 2.1808324589017138e-07, "logps/chosen": -47.80896759033203, "logps/rejected": -75.6249008178711, "loss": 0.3223, "losses/dpo": 0.2652219533920288, "losses/sft": 1.5195528268814087, "losses/total": 0.2652219533920288, "ref_logps/chosen": -34.58601760864258, "ref_logps/rejected": -48.44448471069336, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3222953081130981, "rewards/margins": 1.395746111869812, "rewards/rejected": -2.71804141998291, "step": 1930 }, { "epoch": 1.82, "grad_norm": 26.199796676635742, "learning_rate": 2.179083595662819e-07, "logps/chosen": -48.245792388916016, "logps/rejected": -78.83716583251953, "loss": 0.4373, "losses/dpo": 0.5776351690292358, "losses/sft": 1.9075573682785034, "losses/total": 0.5776351690292358, "ref_logps/chosen": -33.773094177246094, "ref_logps/rejected": -52.131649017333984, "rewards/accuracies": 0.75, "rewards/chosen": -1.447270154953003, "rewards/margins": 1.2232816219329834, "rewards/rejected": -2.6705517768859863, "step": 1931 }, { "epoch": 1.82, "grad_norm": 24.241119384765625, "learning_rate": 2.1773347324239243e-07, "logps/chosen": -51.85051727294922, "logps/rejected": -64.8966064453125, "loss": 0.316, "losses/dpo": 0.11948569118976593, "losses/sft": 1.355791687965393, "losses/total": 0.11948569118976593, "ref_logps/chosen": -41.326324462890625, "ref_logps/rejected": -37.862464904785156, "rewards/accuracies": 0.875, "rewards/chosen": -1.0524190664291382, "rewards/margins": 1.6509957313537598, "rewards/rejected": -2.7034149169921875, "step": 1932 }, { "epoch": 1.83, "grad_norm": 26.71185302734375, "learning_rate": 2.1755858691850294e-07, "logps/chosen": -43.790252685546875, "logps/rejected": -63.17491149902344, "loss": 0.5615, "losses/dpo": 0.666718602180481, "losses/sft": 2.239082098007202, "losses/total": 0.666718602180481, "ref_logps/chosen": -29.334993362426758, "ref_logps/rejected": -38.764217376708984, "rewards/accuracies": 0.625, "rewards/chosen": -1.4455257654190063, "rewards/margins": 0.9955437779426575, "rewards/rejected": -2.4410696029663086, "step": 1933 }, { "epoch": 1.83, "grad_norm": 18.04010772705078, "learning_rate": 2.173837005946135e-07, "logps/chosen": -48.863975524902344, "logps/rejected": -67.9365234375, "loss": 0.2917, "losses/dpo": 0.16975978016853333, "losses/sft": 1.395847201347351, "losses/total": 0.16975978016853333, "ref_logps/chosen": -37.93406295776367, "ref_logps/rejected": -39.69021224975586, "rewards/accuracies": 0.8125, "rewards/chosen": -1.092991590499878, "rewards/margins": 1.731640100479126, "rewards/rejected": -2.824631690979004, "step": 1934 }, { "epoch": 1.83, "grad_norm": 16.366809844970703, "learning_rate": 2.1720881427072402e-07, "logps/chosen": -48.70531463623047, "logps/rejected": -82.24748229980469, "loss": 0.2509, "losses/dpo": 0.45168188214302063, "losses/sft": 2.0609686374664307, "losses/total": 0.45168188214302063, "ref_logps/chosen": -35.18280792236328, "ref_logps/rejected": -51.04685974121094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3522506952285767, "rewards/margins": 1.7678117752075195, "rewards/rejected": -3.1200623512268066, "step": 1935 }, { "epoch": 1.83, "grad_norm": 21.454904556274414, "learning_rate": 2.1703392794683456e-07, "logps/chosen": -44.24781036376953, "logps/rejected": -55.08128356933594, "loss": 0.4836, "losses/dpo": 0.3794158697128296, "losses/sft": 1.919634461402893, "losses/total": 0.3794158697128296, "ref_logps/chosen": -33.00543975830078, "ref_logps/rejected": -37.07406997680664, "rewards/accuracies": 0.75, "rewards/chosen": -1.124237060546875, "rewards/margins": 0.6764841079711914, "rewards/rejected": -1.8007211685180664, "step": 1936 }, { "epoch": 1.83, "grad_norm": 21.988210678100586, "learning_rate": 2.1685904162294507e-07, "logps/chosen": -40.5090446472168, "logps/rejected": -56.91093444824219, "loss": 0.3527, "losses/dpo": 0.8546777963638306, "losses/sft": 1.8751733303070068, "losses/total": 0.8546777963638306, "ref_logps/chosen": -30.719322204589844, "ref_logps/rejected": -32.112998962402344, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9789724349975586, "rewards/margins": 1.5008208751678467, "rewards/rejected": -2.4797935485839844, "step": 1937 }, { "epoch": 1.83, "grad_norm": 21.11084747314453, "learning_rate": 2.1668415529905559e-07, "logps/chosen": -42.49399185180664, "logps/rejected": -57.91488265991211, "loss": 0.4605, "losses/dpo": 0.33433669805526733, "losses/sft": 1.4056564569473267, "losses/total": 0.33433669805526733, "ref_logps/chosen": -28.660724639892578, "ref_logps/rejected": -35.102294921875, "rewards/accuracies": 0.75, "rewards/chosen": -1.383326530456543, "rewards/margins": 0.8979320526123047, "rewards/rejected": -2.2812585830688477, "step": 1938 }, { "epoch": 1.83, "grad_norm": 22.22531509399414, "learning_rate": 2.1650926897516613e-07, "logps/chosen": -46.26589584350586, "logps/rejected": -59.38967514038086, "loss": 0.4035, "losses/dpo": 0.7754935026168823, "losses/sft": 2.324876070022583, "losses/total": 0.7754935026168823, "ref_logps/chosen": -31.906780242919922, "ref_logps/rejected": -33.66438293457031, "rewards/accuracies": 0.875, "rewards/chosen": -1.4359118938446045, "rewards/margins": 1.1366177797317505, "rewards/rejected": -2.5725297927856445, "step": 1939 }, { "epoch": 1.83, "grad_norm": 28.79388427734375, "learning_rate": 2.1633438265127666e-07, "logps/chosen": -55.67766571044922, "logps/rejected": -59.03409957885742, "loss": 0.5738, "losses/dpo": 0.6611247658729553, "losses/sft": 1.9232362508773804, "losses/total": 0.6611247658729553, "ref_logps/chosen": -38.92405700683594, "ref_logps/rejected": -36.29399490356445, "rewards/accuracies": 0.75, "rewards/chosen": -1.675360918045044, "rewards/margins": 0.5986493825912476, "rewards/rejected": -2.274010419845581, "step": 1940 }, { "epoch": 1.83, "grad_norm": 22.21906852722168, "learning_rate": 2.161594963273872e-07, "logps/chosen": -62.16954040527344, "logps/rejected": -79.95361328125, "loss": 0.3554, "losses/dpo": 0.3510781228542328, "losses/sft": 2.206805944442749, "losses/total": 0.3510781228542328, "ref_logps/chosen": -45.09418487548828, "ref_logps/rejected": -49.92413330078125, "rewards/accuracies": 0.875, "rewards/chosen": -1.707535982131958, "rewards/margins": 1.295412540435791, "rewards/rejected": -3.002948522567749, "step": 1941 }, { "epoch": 1.83, "grad_norm": 25.15227508544922, "learning_rate": 2.1598461000349772e-07, "logps/chosen": -62.75640869140625, "logps/rejected": -76.89555358886719, "loss": 0.4364, "losses/dpo": 0.23652169108390808, "losses/sft": 1.976174235343933, "losses/total": 0.23652169108390808, "ref_logps/chosen": -46.537086486816406, "ref_logps/rejected": -44.752655029296875, "rewards/accuracies": 0.75, "rewards/chosen": -1.621931791305542, "rewards/margins": 1.5923576354980469, "rewards/rejected": -3.214289426803589, "step": 1942 }, { "epoch": 1.83, "grad_norm": 27.23257064819336, "learning_rate": 2.1580972367960826e-07, "logps/chosen": -56.82284927368164, "logps/rejected": -80.75686645507812, "loss": 0.4509, "losses/dpo": 0.2272796928882599, "losses/sft": 1.745611548423767, "losses/total": 0.2272796928882599, "ref_logps/chosen": -42.16802978515625, "ref_logps/rejected": -52.64835739135742, "rewards/accuracies": 0.875, "rewards/chosen": -1.4654819965362549, "rewards/margins": 1.3453688621520996, "rewards/rejected": -2.8108506202697754, "step": 1943 }, { "epoch": 1.84, "grad_norm": 27.91211700439453, "learning_rate": 2.1563483735571877e-07, "logps/chosen": -57.24657440185547, "logps/rejected": -81.26807403564453, "loss": 0.4403, "losses/dpo": 0.4680829644203186, "losses/sft": 2.068673610687256, "losses/total": 0.4680829644203186, "ref_logps/chosen": -41.270355224609375, "ref_logps/rejected": -54.46880340576172, "rewards/accuracies": 0.75, "rewards/chosen": -1.5976223945617676, "rewards/margins": 1.0823047161102295, "rewards/rejected": -2.679926872253418, "step": 1944 }, { "epoch": 1.84, "grad_norm": 17.17719268798828, "learning_rate": 2.1545995103182928e-07, "logps/chosen": -52.65362548828125, "logps/rejected": -76.73627471923828, "loss": 0.268, "losses/dpo": 0.25786781311035156, "losses/sft": 2.301936626434326, "losses/total": 0.25786781311035156, "ref_logps/chosen": -39.370201110839844, "ref_logps/rejected": -48.470130920410156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3283424377441406, "rewards/margins": 1.4982726573944092, "rewards/rejected": -2.826615333557129, "step": 1945 }, { "epoch": 1.84, "grad_norm": 17.44093894958496, "learning_rate": 2.1528506470793985e-07, "logps/chosen": -46.42760467529297, "logps/rejected": -64.9055404663086, "loss": 0.2867, "losses/dpo": 0.15678484737873077, "losses/sft": 1.9859936237335205, "losses/total": 0.15678484737873077, "ref_logps/chosen": -34.89744567871094, "ref_logps/rejected": -38.69559860229492, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1530159711837769, "rewards/margins": 1.4679782390594482, "rewards/rejected": -2.6209943294525146, "step": 1946 }, { "epoch": 1.84, "grad_norm": 22.280567169189453, "learning_rate": 2.1511017838405036e-07, "logps/chosen": -47.924156188964844, "logps/rejected": -75.62379455566406, "loss": 0.4304, "losses/dpo": 0.24968941509723663, "losses/sft": 1.5886414051055908, "losses/total": 0.24968941509723663, "ref_logps/chosen": -32.75593948364258, "ref_logps/rejected": -47.03985595703125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.516822099685669, "rewards/margins": 1.3415718078613281, "rewards/rejected": -2.858393669128418, "step": 1947 }, { "epoch": 1.84, "grad_norm": 26.11994171142578, "learning_rate": 2.149352920601609e-07, "logps/chosen": -61.862754821777344, "logps/rejected": -74.32380676269531, "loss": 0.424, "losses/dpo": 0.24698521196842194, "losses/sft": 1.761254906654358, "losses/total": 0.24698521196842194, "ref_logps/chosen": -44.40204620361328, "ref_logps/rejected": -42.56153106689453, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7460711002349854, "rewards/margins": 1.4301559925079346, "rewards/rejected": -3.17622709274292, "step": 1948 }, { "epoch": 1.84, "grad_norm": 26.30391502380371, "learning_rate": 2.147604057362714e-07, "logps/chosen": -64.70724487304688, "logps/rejected": -87.25625610351562, "loss": 0.4695, "losses/dpo": 0.5290414690971375, "losses/sft": 2.2446036338806152, "losses/total": 0.5290414690971375, "ref_logps/chosen": -41.936336517333984, "ref_logps/rejected": -50.98778533935547, "rewards/accuracies": 0.75, "rewards/chosen": -2.2770910263061523, "rewards/margins": 1.3497557640075684, "rewards/rejected": -3.6268467903137207, "step": 1949 }, { "epoch": 1.84, "grad_norm": 21.30093765258789, "learning_rate": 2.1458551941238195e-07, "logps/chosen": -47.91936492919922, "logps/rejected": -73.26781463623047, "loss": 0.2849, "losses/dpo": 0.19459158182144165, "losses/sft": 1.247519850730896, "losses/total": 0.19459158182144165, "ref_logps/chosen": -34.45432662963867, "ref_logps/rejected": -43.84803771972656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3465039730072021, "rewards/margins": 1.5954742431640625, "rewards/rejected": -2.9419782161712646, "step": 1950 }, { "epoch": 1.84, "grad_norm": 24.17926788330078, "learning_rate": 2.1441063308849246e-07, "logps/chosen": -50.253482818603516, "logps/rejected": -58.501869201660156, "loss": 0.4728, "losses/dpo": 0.5476719737052917, "losses/sft": 2.300201654434204, "losses/total": 0.5476719737052917, "ref_logps/chosen": -34.257896423339844, "ref_logps/rejected": -33.900978088378906, "rewards/accuracies": 0.75, "rewards/chosen": -1.5995583534240723, "rewards/margins": 0.8605306148529053, "rewards/rejected": -2.4600892066955566, "step": 1951 }, { "epoch": 1.84, "grad_norm": 26.85236358642578, "learning_rate": 2.1423574676460298e-07, "logps/chosen": -44.034934997558594, "logps/rejected": -66.96251678466797, "loss": 0.5054, "losses/dpo": 0.23814263939857483, "losses/sft": 1.3473509550094604, "losses/total": 0.23814263939857483, "ref_logps/chosen": -29.715856552124023, "ref_logps/rejected": -41.30472946166992, "rewards/accuracies": 0.75, "rewards/chosen": -1.4319078922271729, "rewards/margins": 1.1338706016540527, "rewards/rejected": -2.5657787322998047, "step": 1952 }, { "epoch": 1.84, "grad_norm": 19.77062225341797, "learning_rate": 2.1406086044071354e-07, "logps/chosen": -52.25318145751953, "logps/rejected": -66.92807006835938, "loss": 0.3831, "losses/dpo": 0.5790616273880005, "losses/sft": 1.7756602764129639, "losses/total": 0.5790616273880005, "ref_logps/chosen": -39.21463394165039, "ref_logps/rejected": -39.28076934814453, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3038547039031982, "rewards/margins": 1.460875153541565, "rewards/rejected": -2.7647299766540527, "step": 1953 }, { "epoch": 1.85, "grad_norm": 22.926481246948242, "learning_rate": 2.1388597411682405e-07, "logps/chosen": -52.59754943847656, "logps/rejected": -73.46942138671875, "loss": 0.4746, "losses/dpo": 0.2514271140098572, "losses/sft": 1.4299970865249634, "losses/total": 0.2514271140098572, "ref_logps/chosen": -37.889503479003906, "ref_logps/rejected": -47.44058609008789, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4708044528961182, "rewards/margins": 1.1320796012878418, "rewards/rejected": -2.602884292602539, "step": 1954 }, { "epoch": 1.85, "grad_norm": 19.8757266998291, "learning_rate": 2.137110877929346e-07, "logps/chosen": -50.41484069824219, "logps/rejected": -72.51498413085938, "loss": 0.3462, "losses/dpo": 0.1194656640291214, "losses/sft": 2.324993848800659, "losses/total": 0.1194656640291214, "ref_logps/chosen": -36.714115142822266, "ref_logps/rejected": -44.33071517944336, "rewards/accuracies": 0.875, "rewards/chosen": -1.370072841644287, "rewards/margins": 1.4483534097671509, "rewards/rejected": -2.8184261322021484, "step": 1955 }, { "epoch": 1.85, "grad_norm": 28.71787452697754, "learning_rate": 2.135362014690451e-07, "logps/chosen": -48.01447296142578, "logps/rejected": -62.46440124511719, "loss": 0.6122, "losses/dpo": 1.064414143562317, "losses/sft": 1.727824330329895, "losses/total": 1.064414143562317, "ref_logps/chosen": -33.594505310058594, "ref_logps/rejected": -39.40873718261719, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4419970512390137, "rewards/margins": 0.8635697364807129, "rewards/rejected": -2.3055667877197266, "step": 1956 }, { "epoch": 1.85, "grad_norm": 23.21286964416504, "learning_rate": 2.1336131514515564e-07, "logps/chosen": -58.0375862121582, "logps/rejected": -70.63639831542969, "loss": 0.37, "losses/dpo": 0.5791358947753906, "losses/sft": 1.890541911125183, "losses/total": 0.5791358947753906, "ref_logps/chosen": -45.19242477416992, "ref_logps/rejected": -45.46302032470703, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2845160961151123, "rewards/margins": 1.2328214645385742, "rewards/rejected": -2.5173375606536865, "step": 1957 }, { "epoch": 1.85, "grad_norm": 15.949946403503418, "learning_rate": 2.1318642882126616e-07, "logps/chosen": -33.374114990234375, "logps/rejected": -72.46894836425781, "loss": 0.2829, "losses/dpo": 0.22898954153060913, "losses/sft": 1.339084267616272, "losses/total": 0.22898954153060913, "ref_logps/chosen": -24.883380889892578, "ref_logps/rejected": -48.905128479003906, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8490734100341797, "rewards/margins": 1.507308840751648, "rewards/rejected": -2.356382369995117, "step": 1958 }, { "epoch": 1.85, "grad_norm": 21.047752380371094, "learning_rate": 2.130115424973767e-07, "logps/chosen": -53.520713806152344, "logps/rejected": -69.3406982421875, "loss": 0.3651, "losses/dpo": 0.5611324906349182, "losses/sft": 1.5742043256759644, "losses/total": 0.5611324906349182, "ref_logps/chosen": -38.946136474609375, "ref_logps/rejected": -40.23695755004883, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4574576616287231, "rewards/margins": 1.4529160261154175, "rewards/rejected": -2.9103739261627197, "step": 1959 }, { "epoch": 1.85, "grad_norm": 14.455061912536621, "learning_rate": 2.1283665617348724e-07, "logps/chosen": -53.4486083984375, "logps/rejected": -89.52093505859375, "loss": 0.2338, "losses/dpo": 0.2984086573123932, "losses/sft": 1.398449182510376, "losses/total": 0.2984086573123932, "ref_logps/chosen": -42.681556701660156, "ref_logps/rejected": -58.29749298095703, "rewards/accuracies": 0.875, "rewards/chosen": -1.076704740524292, "rewards/margins": 2.0456392765045166, "rewards/rejected": -3.1223440170288086, "step": 1960 }, { "epoch": 1.85, "grad_norm": 22.436206817626953, "learning_rate": 2.1266176984959775e-07, "logps/chosen": -77.03755187988281, "logps/rejected": -88.14137268066406, "loss": 0.2678, "losses/dpo": 0.19570860266685486, "losses/sft": 2.1246986389160156, "losses/total": 0.19570860266685486, "ref_logps/chosen": -60.108638763427734, "ref_logps/rejected": -54.540679931640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6928906440734863, "rewards/margins": 1.667177677154541, "rewards/rejected": -3.3600685596466064, "step": 1961 }, { "epoch": 1.85, "grad_norm": 24.994171142578125, "learning_rate": 2.124868835257083e-07, "logps/chosen": -58.44688034057617, "logps/rejected": -67.09022521972656, "loss": 0.4604, "losses/dpo": 0.3520793914794922, "losses/sft": 1.989675760269165, "losses/total": 0.3520793914794922, "ref_logps/chosen": -43.235015869140625, "ref_logps/rejected": -42.84651184082031, "rewards/accuracies": 0.9375, "rewards/chosen": -1.521186113357544, "rewards/margins": 0.9031853675842285, "rewards/rejected": -2.4243717193603516, "step": 1962 }, { "epoch": 1.85, "grad_norm": 23.424358367919922, "learning_rate": 2.123119972018188e-07, "logps/chosen": -54.1328010559082, "logps/rejected": -72.67156982421875, "loss": 0.3887, "losses/dpo": 0.4511217474937439, "losses/sft": 1.9106628894805908, "losses/total": 0.4511217474937439, "ref_logps/chosen": -41.939208984375, "ref_logps/rejected": -46.20673370361328, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2193596363067627, "rewards/margins": 1.4271247386932373, "rewards/rejected": -2.646484375, "step": 1963 }, { "epoch": 1.85, "grad_norm": 15.990822792053223, "learning_rate": 2.1213711087792934e-07, "logps/chosen": -41.56805419921875, "logps/rejected": -67.26626586914062, "loss": 0.3243, "losses/dpo": 0.2783471345901489, "losses/sft": 1.7799925804138184, "losses/total": 0.2783471345901489, "ref_logps/chosen": -32.52330780029297, "ref_logps/rejected": -42.371612548828125, "rewards/accuracies": 0.875, "rewards/chosen": -0.9044744968414307, "rewards/margins": 1.5849908590316772, "rewards/rejected": -2.4894652366638184, "step": 1964 }, { "epoch": 1.86, "grad_norm": 31.50393295288086, "learning_rate": 2.1196222455403988e-07, "logps/chosen": -64.89741516113281, "logps/rejected": -79.29817199707031, "loss": 0.4631, "losses/dpo": 0.8046008944511414, "losses/sft": 2.1458544731140137, "losses/total": 0.8046008944511414, "ref_logps/chosen": -49.99016571044922, "ref_logps/rejected": -51.007633209228516, "rewards/accuracies": 0.8125, "rewards/chosen": -1.490724802017212, "rewards/margins": 1.3383294343948364, "rewards/rejected": -2.829054355621338, "step": 1965 }, { "epoch": 1.86, "grad_norm": 23.86702537536621, "learning_rate": 2.117873382301504e-07, "logps/chosen": -49.21942138671875, "logps/rejected": -75.91116333007812, "loss": 0.4431, "losses/dpo": 0.4857032299041748, "losses/sft": 1.9156571626663208, "losses/total": 0.4857032299041748, "ref_logps/chosen": -35.834869384765625, "ref_logps/rejected": -50.44806671142578, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3384552001953125, "rewards/margins": 1.2078537940979004, "rewards/rejected": -2.546308994293213, "step": 1966 }, { "epoch": 1.86, "grad_norm": 25.070886611938477, "learning_rate": 2.1161245190626093e-07, "logps/chosen": -50.53087615966797, "logps/rejected": -68.46553802490234, "loss": 0.467, "losses/dpo": 0.4369206428527832, "losses/sft": 1.8337554931640625, "losses/total": 0.4369206428527832, "ref_logps/chosen": -37.08546447753906, "ref_logps/rejected": -43.849159240722656, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3445405960083008, "rewards/margins": 1.1170969009399414, "rewards/rejected": -2.461637496948242, "step": 1967 }, { "epoch": 1.86, "grad_norm": 20.981592178344727, "learning_rate": 2.1143756558237144e-07, "logps/chosen": -48.76057434082031, "logps/rejected": -72.07445526123047, "loss": 0.3442, "losses/dpo": 0.6560444831848145, "losses/sft": 1.9757767915725708, "losses/total": 0.6560444831848145, "ref_logps/chosen": -33.7701530456543, "ref_logps/rejected": -43.66492462158203, "rewards/accuracies": 0.9375, "rewards/chosen": -1.49904203414917, "rewards/margins": 1.3419103622436523, "rewards/rejected": -2.8409523963928223, "step": 1968 }, { "epoch": 1.86, "grad_norm": 24.32599449157715, "learning_rate": 2.1126267925848198e-07, "logps/chosen": -53.34295654296875, "logps/rejected": -72.85565185546875, "loss": 0.364, "losses/dpo": 0.22470355033874512, "losses/sft": 1.8783470392227173, "losses/total": 0.22470355033874512, "ref_logps/chosen": -37.051368713378906, "ref_logps/rejected": -42.297752380371094, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6291587352752686, "rewards/margins": 1.426631212234497, "rewards/rejected": -3.0557899475097656, "step": 1969 }, { "epoch": 1.86, "grad_norm": 20.983388900756836, "learning_rate": 2.110877929345925e-07, "logps/chosen": -53.12214660644531, "logps/rejected": -67.75334167480469, "loss": 0.3623, "losses/dpo": 0.15745952725410461, "losses/sft": 1.4009560346603394, "losses/total": 0.15745952725410461, "ref_logps/chosen": -41.74241638183594, "ref_logps/rejected": -41.97954177856445, "rewards/accuracies": 0.75, "rewards/chosen": -1.1379724740982056, "rewards/margins": 1.4394073486328125, "rewards/rejected": -2.5773799419403076, "step": 1970 }, { "epoch": 1.86, "grad_norm": 20.522003173828125, "learning_rate": 2.1091290661070303e-07, "logps/chosen": -47.230072021484375, "logps/rejected": -71.14032745361328, "loss": 0.3682, "losses/dpo": 0.6743268370628357, "losses/sft": 1.6973276138305664, "losses/total": 0.6743268370628357, "ref_logps/chosen": -34.39399719238281, "ref_logps/rejected": -42.80510711669922, "rewards/accuracies": 0.875, "rewards/chosen": -1.283607840538025, "rewards/margins": 1.5499138832092285, "rewards/rejected": -2.833521604537964, "step": 1971 }, { "epoch": 1.86, "grad_norm": 17.255136489868164, "learning_rate": 2.1073802028681357e-07, "logps/chosen": -47.24890899658203, "logps/rejected": -74.16438293457031, "loss": 0.269, "losses/dpo": 0.42768216133117676, "losses/sft": 2.1388845443725586, "losses/total": 0.42768216133117676, "ref_logps/chosen": -35.4140739440918, "ref_logps/rejected": -44.785301208496094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1834834814071655, "rewards/margins": 1.7544245719909668, "rewards/rejected": -2.9379079341888428, "step": 1972 }, { "epoch": 1.86, "grad_norm": 21.254287719726562, "learning_rate": 2.1056313396292408e-07, "logps/chosen": -54.706207275390625, "logps/rejected": -81.81231689453125, "loss": 0.3673, "losses/dpo": 0.5571094155311584, "losses/sft": 2.0986173152923584, "losses/total": 0.5571094155311584, "ref_logps/chosen": -40.530128479003906, "ref_logps/rejected": -48.939971923828125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4176077842712402, "rewards/margins": 1.869626522064209, "rewards/rejected": -3.287234306335449, "step": 1973 }, { "epoch": 1.86, "grad_norm": 18.496536254882812, "learning_rate": 2.1038824763903462e-07, "logps/chosen": -54.24553680419922, "logps/rejected": -74.15406036376953, "loss": 0.3115, "losses/dpo": 0.4126966893672943, "losses/sft": 2.081036329269409, "losses/total": 0.4126966893672943, "ref_logps/chosen": -41.41372299194336, "ref_logps/rejected": -46.38686752319336, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2831816673278809, "rewards/margins": 1.4935376644134521, "rewards/rejected": -2.776719331741333, "step": 1974 }, { "epoch": 1.86, "grad_norm": 33.76393508911133, "learning_rate": 2.1021336131514514e-07, "logps/chosen": -55.534854888916016, "logps/rejected": -78.84536743164062, "loss": 0.6097, "losses/dpo": 0.2924114763736725, "losses/sft": 1.6932238340377808, "losses/total": 0.2924114763736725, "ref_logps/chosen": -38.251007080078125, "ref_logps/rejected": -48.34223175048828, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7283849716186523, "rewards/margins": 1.3219285011291504, "rewards/rejected": -3.0503134727478027, "step": 1975 }, { "epoch": 1.87, "grad_norm": 20.681589126586914, "learning_rate": 2.1003847499125568e-07, "logps/chosen": -65.04346466064453, "logps/rejected": -80.50343322753906, "loss": 0.3028, "losses/dpo": 0.29852724075317383, "losses/sft": 2.32308292388916, "losses/total": 0.29852724075317383, "ref_logps/chosen": -48.152496337890625, "ref_logps/rejected": -49.82281494140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6890971660614014, "rewards/margins": 1.3789646625518799, "rewards/rejected": -3.068061590194702, "step": 1976 }, { "epoch": 1.87, "grad_norm": 27.520835876464844, "learning_rate": 2.098635886673662e-07, "logps/chosen": -44.01696014404297, "logps/rejected": -71.24827575683594, "loss": 0.4254, "losses/dpo": 0.835005521774292, "losses/sft": 1.3879934549331665, "losses/total": 0.835005521774292, "ref_logps/chosen": -32.514488220214844, "ref_logps/rejected": -47.72511291503906, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1502470970153809, "rewards/margins": 1.2020692825317383, "rewards/rejected": -2.352316379547119, "step": 1977 }, { "epoch": 1.87, "grad_norm": 26.55544662475586, "learning_rate": 2.0968870234347675e-07, "logps/chosen": -51.0809440612793, "logps/rejected": -57.31834411621094, "loss": 0.5645, "losses/dpo": 0.3740193247795105, "losses/sft": 1.4272371530532837, "losses/total": 0.3740193247795105, "ref_logps/chosen": -37.47397994995117, "ref_logps/rejected": -36.893035888671875, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3606963157653809, "rewards/margins": 0.6818346977233887, "rewards/rejected": -2.0425310134887695, "step": 1978 }, { "epoch": 1.87, "grad_norm": 23.518918991088867, "learning_rate": 2.0951381601958727e-07, "logps/chosen": -51.69788360595703, "logps/rejected": -70.53106689453125, "loss": 0.4227, "losses/dpo": 0.46027320623397827, "losses/sft": 2.0539302825927734, "losses/total": 0.46027320623397827, "ref_logps/chosen": -38.222206115722656, "ref_logps/rejected": -46.31585693359375, "rewards/accuracies": 0.75, "rewards/chosen": -1.3475680351257324, "rewards/margins": 1.0739524364471436, "rewards/rejected": -2.421520709991455, "step": 1979 }, { "epoch": 1.87, "grad_norm": 19.48638343811035, "learning_rate": 2.0933892969569778e-07, "logps/chosen": -47.801025390625, "logps/rejected": -76.14320373535156, "loss": 0.2804, "losses/dpo": 0.06693714112043381, "losses/sft": 1.7252448797225952, "losses/total": 0.06693714112043381, "ref_logps/chosen": -37.69862365722656, "ref_logps/rejected": -43.908775329589844, "rewards/accuracies": 0.875, "rewards/chosen": -1.010240077972412, "rewards/margins": 2.213202476501465, "rewards/rejected": -3.223442554473877, "step": 1980 }, { "epoch": 1.87, "grad_norm": 19.88045310974121, "learning_rate": 2.0916404337180832e-07, "logps/chosen": -45.399925231933594, "logps/rejected": -62.11358642578125, "loss": 0.3748, "losses/dpo": 0.19484129548072815, "losses/sft": 1.7399189472198486, "losses/total": 0.19484129548072815, "ref_logps/chosen": -35.95106506347656, "ref_logps/rejected": -39.6606559753418, "rewards/accuracies": 0.75, "rewards/chosen": -0.9448865652084351, "rewards/margins": 1.3004069328308105, "rewards/rejected": -2.245293617248535, "step": 1981 }, { "epoch": 1.87, "grad_norm": 22.77387809753418, "learning_rate": 2.0898915704791883e-07, "logps/chosen": -49.070858001708984, "logps/rejected": -62.60783386230469, "loss": 0.399, "losses/dpo": 0.4592130184173584, "losses/sft": 1.679467797279358, "losses/total": 0.4592130184173584, "ref_logps/chosen": -36.97407531738281, "ref_logps/rejected": -37.21660232543945, "rewards/accuracies": 0.875, "rewards/chosen": -1.2096785306930542, "rewards/margins": 1.3294445276260376, "rewards/rejected": -2.539123058319092, "step": 1982 }, { "epoch": 1.87, "grad_norm": 31.689556121826172, "learning_rate": 2.0881427072402937e-07, "logps/chosen": -68.82721710205078, "logps/rejected": -70.45463562011719, "loss": 0.4845, "losses/dpo": 0.4984961152076721, "losses/sft": 2.4181315898895264, "losses/total": 0.4984961152076721, "ref_logps/chosen": -51.806785583496094, "ref_logps/rejected": -42.296478271484375, "rewards/accuracies": 0.6875, "rewards/chosen": -1.702043056488037, "rewards/margins": 1.1137725114822388, "rewards/rejected": -2.8158156871795654, "step": 1983 }, { "epoch": 1.87, "grad_norm": 21.13364028930664, "learning_rate": 2.0863938440013988e-07, "logps/chosen": -36.618003845214844, "logps/rejected": -72.97779846191406, "loss": 0.3157, "losses/dpo": 0.06762217730283737, "losses/sft": 2.127976655960083, "losses/total": 0.06762217730283737, "ref_logps/chosen": -25.291261672973633, "ref_logps/rejected": -45.15331268310547, "rewards/accuracies": 0.8125, "rewards/chosen": -1.132674217224121, "rewards/margins": 1.6497740745544434, "rewards/rejected": -2.7824482917785645, "step": 1984 }, { "epoch": 1.87, "grad_norm": 15.921688079833984, "learning_rate": 2.0846449807625045e-07, "logps/chosen": -54.70018005371094, "logps/rejected": -80.42591857910156, "loss": 0.2678, "losses/dpo": 0.3374122381210327, "losses/sft": 1.7384850978851318, "losses/total": 0.3374122381210327, "ref_logps/chosen": -39.920433044433594, "ref_logps/rejected": -50.794761657714844, "rewards/accuracies": 0.875, "rewards/chosen": -1.4779746532440186, "rewards/margins": 1.4851419925689697, "rewards/rejected": -2.9631166458129883, "step": 1985 }, { "epoch": 1.88, "grad_norm": 18.273723602294922, "learning_rate": 2.0828961175236096e-07, "logps/chosen": -61.510658264160156, "logps/rejected": -87.7957534790039, "loss": 0.2687, "losses/dpo": 0.21693351864814758, "losses/sft": 1.8314409255981445, "losses/total": 0.21693351864814758, "ref_logps/chosen": -46.84638595581055, "ref_logps/rejected": -53.92215347290039, "rewards/accuracies": 0.875, "rewards/chosen": -1.4664275646209717, "rewards/margins": 1.9209327697753906, "rewards/rejected": -3.3873603343963623, "step": 1986 }, { "epoch": 1.88, "grad_norm": 27.942218780517578, "learning_rate": 2.0811472542847147e-07, "logps/chosen": -41.060115814208984, "logps/rejected": -77.3597412109375, "loss": 0.4036, "losses/dpo": 0.17683270573616028, "losses/sft": 2.055321216583252, "losses/total": 0.17683270573616028, "ref_logps/chosen": -27.92916488647461, "ref_logps/rejected": -49.761436462402344, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3130953311920166, "rewards/margins": 1.4467356204986572, "rewards/rejected": -2.759830951690674, "step": 1987 }, { "epoch": 1.88, "grad_norm": 11.769414901733398, "learning_rate": 2.07939839104582e-07, "logps/chosen": -43.42335510253906, "logps/rejected": -74.48541259765625, "loss": 0.2253, "losses/dpo": 0.20665507018566132, "losses/sft": 1.1677836179733276, "losses/total": 0.20665507018566132, "ref_logps/chosen": -32.017337799072266, "ref_logps/rejected": -43.09758377075195, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1406021118164062, "rewards/margins": 1.9981807470321655, "rewards/rejected": -3.1387829780578613, "step": 1988 }, { "epoch": 1.88, "grad_norm": 23.761144638061523, "learning_rate": 2.0776495278069253e-07, "logps/chosen": -68.45650482177734, "logps/rejected": -88.13902282714844, "loss": 0.3371, "losses/dpo": 0.3239016532897949, "losses/sft": 2.3723411560058594, "losses/total": 0.3239016532897949, "ref_logps/chosen": -48.0863037109375, "ref_logps/rejected": -49.539794921875, "rewards/accuracies": 0.8125, "rewards/chosen": -2.037020206451416, "rewards/margins": 1.8229024410247803, "rewards/rejected": -3.8599228858947754, "step": 1989 }, { "epoch": 1.88, "grad_norm": 25.765422821044922, "learning_rate": 2.0759006645680306e-07, "logps/chosen": -58.637203216552734, "logps/rejected": -72.1034164428711, "loss": 0.571, "losses/dpo": 0.9255797863006592, "losses/sft": 1.4871019124984741, "losses/total": 0.9255797863006592, "ref_logps/chosen": -40.592498779296875, "ref_logps/rejected": -43.86729431152344, "rewards/accuracies": 0.8125, "rewards/chosen": -1.804470181465149, "rewards/margins": 1.0191423892974854, "rewards/rejected": -2.823612689971924, "step": 1990 }, { "epoch": 1.88, "grad_norm": 24.1730899810791, "learning_rate": 2.074151801329136e-07, "logps/chosen": -52.826576232910156, "logps/rejected": -78.01376342773438, "loss": 0.4196, "losses/dpo": 0.5060432553291321, "losses/sft": 2.1354801654815674, "losses/total": 0.5060432553291321, "ref_logps/chosen": -36.949371337890625, "ref_logps/rejected": -45.54932403564453, "rewards/accuracies": 0.75, "rewards/chosen": -1.5877199172973633, "rewards/margins": 1.6587238311767578, "rewards/rejected": -3.246443748474121, "step": 1991 }, { "epoch": 1.88, "grad_norm": 42.56746292114258, "learning_rate": 2.0724029380902414e-07, "logps/chosen": -58.30364990234375, "logps/rejected": -66.47650146484375, "loss": 0.688, "losses/dpo": 0.6619449853897095, "losses/sft": 2.4093542098999023, "losses/total": 0.6619449853897095, "ref_logps/chosen": -38.64124298095703, "ref_logps/rejected": -41.48748016357422, "rewards/accuracies": 0.625, "rewards/chosen": -1.966240644454956, "rewards/margins": 0.5326617360115051, "rewards/rejected": -2.4989023208618164, "step": 1992 }, { "epoch": 1.88, "grad_norm": 17.661104202270508, "learning_rate": 2.0706540748513466e-07, "logps/chosen": -47.92689895629883, "logps/rejected": -61.3677978515625, "loss": 0.3667, "losses/dpo": 0.6560609340667725, "losses/sft": 1.7580631971359253, "losses/total": 0.6560609340667725, "ref_logps/chosen": -35.225093841552734, "ref_logps/rejected": -33.523284912109375, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2701804637908936, "rewards/margins": 1.5142710208892822, "rewards/rejected": -2.784451484680176, "step": 1993 }, { "epoch": 1.88, "grad_norm": 22.866487503051758, "learning_rate": 2.0689052116124517e-07, "logps/chosen": -48.848724365234375, "logps/rejected": -68.41227722167969, "loss": 0.3555, "losses/dpo": 0.14448793232440948, "losses/sft": 1.8898380994796753, "losses/total": 0.14448793232440948, "ref_logps/chosen": -32.324951171875, "ref_logps/rejected": -38.35407638549805, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6523773670196533, "rewards/margins": 1.3534433841705322, "rewards/rejected": -3.0058207511901855, "step": 1994 }, { "epoch": 1.88, "grad_norm": 26.90878677368164, "learning_rate": 2.067156348373557e-07, "logps/chosen": -52.19657897949219, "logps/rejected": -80.27163696289062, "loss": 0.4643, "losses/dpo": 0.4463430345058441, "losses/sft": 2.5013411045074463, "losses/total": 0.4463430345058441, "ref_logps/chosen": -33.78961181640625, "ref_logps/rejected": -46.71401596069336, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8406972885131836, "rewards/margins": 1.5150644779205322, "rewards/rejected": -3.355762004852295, "step": 1995 }, { "epoch": 1.88, "grad_norm": 26.384733200073242, "learning_rate": 2.0654074851346622e-07, "logps/chosen": -68.16213989257812, "logps/rejected": -86.7209701538086, "loss": 0.4057, "losses/dpo": 0.24786734580993652, "losses/sft": 2.254863739013672, "losses/total": 0.24786734580993652, "ref_logps/chosen": -49.461891174316406, "ref_logps/rejected": -55.3970832824707, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8700259923934937, "rewards/margins": 1.2623631954193115, "rewards/rejected": -3.1323890686035156, "step": 1996 }, { "epoch": 1.89, "grad_norm": 23.183576583862305, "learning_rate": 2.0636586218957679e-07, "logps/chosen": -55.484580993652344, "logps/rejected": -76.90269470214844, "loss": 0.3383, "losses/dpo": 0.3613179922103882, "losses/sft": 1.9333280324935913, "losses/total": 0.3613179922103882, "ref_logps/chosen": -38.94062042236328, "ref_logps/rejected": -41.81243133544922, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6543956995010376, "rewards/margins": 1.854630708694458, "rewards/rejected": -3.509026288986206, "step": 1997 }, { "epoch": 1.89, "grad_norm": 25.89813804626465, "learning_rate": 2.061909758656873e-07, "logps/chosen": -41.725730895996094, "logps/rejected": -76.85957336425781, "loss": 0.4249, "losses/dpo": 0.5903616547584534, "losses/sft": 1.9699132442474365, "losses/total": 0.5903616547584534, "ref_logps/chosen": -27.31644630432129, "ref_logps/rejected": -48.239173889160156, "rewards/accuracies": 0.625, "rewards/chosen": -1.4409284591674805, "rewards/margins": 1.4211115837097168, "rewards/rejected": -2.8620400428771973, "step": 1998 }, { "epoch": 1.89, "grad_norm": 21.803895950317383, "learning_rate": 2.0601608954179784e-07, "logps/chosen": -57.87916564941406, "logps/rejected": -86.5878677368164, "loss": 0.3366, "losses/dpo": 0.16372033953666687, "losses/sft": 1.9345643520355225, "losses/total": 0.16372033953666687, "ref_logps/chosen": -39.35773468017578, "ref_logps/rejected": -54.01332092285156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8521428108215332, "rewards/margins": 1.4053120613098145, "rewards/rejected": -3.2574548721313477, "step": 1999 }, { "epoch": 1.89, "grad_norm": 15.345725059509277, "learning_rate": 2.0584120321790835e-07, "logps/chosen": -66.094482421875, "logps/rejected": -80.86222076416016, "loss": 0.2922, "losses/dpo": 0.48337864875793457, "losses/sft": 2.0383682250976562, "losses/total": 0.48337864875793457, "ref_logps/chosen": -48.794952392578125, "ref_logps/rejected": -45.4182243347168, "rewards/accuracies": 0.875, "rewards/chosen": -1.7299530506134033, "rewards/margins": 1.8144464492797852, "rewards/rejected": -3.5443994998931885, "step": 2000 }, { "epoch": 1.89, "grad_norm": 25.278440475463867, "learning_rate": 2.0566631689401886e-07, "logps/chosen": -56.98828887939453, "logps/rejected": -76.40863800048828, "loss": 0.4741, "losses/dpo": 0.2761775553226471, "losses/sft": 1.8638036251068115, "losses/total": 0.2761775553226471, "ref_logps/chosen": -42.440155029296875, "ref_logps/rejected": -50.553497314453125, "rewards/accuracies": 0.75, "rewards/chosen": -1.4548134803771973, "rewards/margins": 1.1307004690170288, "rewards/rejected": -2.5855140686035156, "step": 2001 }, { "epoch": 1.89, "grad_norm": 19.58589744567871, "learning_rate": 2.054914305701294e-07, "logps/chosen": -53.86283874511719, "logps/rejected": -68.95311737060547, "loss": 0.2785, "losses/dpo": 0.339206337928772, "losses/sft": 1.8253734111785889, "losses/total": 0.339206337928772, "ref_logps/chosen": -40.300052642822266, "ref_logps/rejected": -39.351966857910156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.356278657913208, "rewards/margins": 1.6038364171981812, "rewards/rejected": -2.9601149559020996, "step": 2002 }, { "epoch": 1.89, "grad_norm": 28.8936824798584, "learning_rate": 2.0531654424623991e-07, "logps/chosen": -60.06389617919922, "logps/rejected": -86.05029296875, "loss": 0.4235, "losses/dpo": 0.7493062019348145, "losses/sft": 2.340275287628174, "losses/total": 0.7493062019348145, "ref_logps/chosen": -41.57643127441406, "ref_logps/rejected": -52.40216827392578, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8487460613250732, "rewards/margins": 1.5160664319992065, "rewards/rejected": -3.3648126125335693, "step": 2003 }, { "epoch": 1.89, "grad_norm": 24.26761245727539, "learning_rate": 2.0514165792235048e-07, "logps/chosen": -56.69879913330078, "logps/rejected": -73.68850708007812, "loss": 0.4476, "losses/dpo": 0.8404563069343567, "losses/sft": 2.1810801029205322, "losses/total": 0.8404563069343567, "ref_logps/chosen": -38.946495056152344, "ref_logps/rejected": -41.723838806152344, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7752306461334229, "rewards/margins": 1.4212366342544556, "rewards/rejected": -3.196467399597168, "step": 2004 }, { "epoch": 1.89, "grad_norm": 21.48341178894043, "learning_rate": 2.04966771598461e-07, "logps/chosen": -46.665130615234375, "logps/rejected": -63.29737854003906, "loss": 0.3826, "losses/dpo": 0.19626843929290771, "losses/sft": 1.126360297203064, "losses/total": 0.19626843929290771, "ref_logps/chosen": -36.079742431640625, "ref_logps/rejected": -40.45587921142578, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0585389137268066, "rewards/margins": 1.2256114482879639, "rewards/rejected": -2.2841501235961914, "step": 2005 }, { "epoch": 1.89, "grad_norm": 20.516582489013672, "learning_rate": 2.0479188527457153e-07, "logps/chosen": -52.32270812988281, "logps/rejected": -69.18130493164062, "loss": 0.3467, "losses/dpo": 0.3455691933631897, "losses/sft": 1.6257274150848389, "losses/total": 0.3455691933631897, "ref_logps/chosen": -40.26002502441406, "ref_logps/rejected": -43.94281005859375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.206268310546875, "rewards/margins": 1.3175818920135498, "rewards/rejected": -2.523850202560425, "step": 2006 }, { "epoch": 1.9, "grad_norm": 27.236385345458984, "learning_rate": 2.0461699895068204e-07, "logps/chosen": -60.64058303833008, "logps/rejected": -82.799072265625, "loss": 0.451, "losses/dpo": 0.28404921293258667, "losses/sft": 1.7321925163269043, "losses/total": 0.28404921293258667, "ref_logps/chosen": -46.73709487915039, "ref_logps/rejected": -54.20802688598633, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3903485536575317, "rewards/margins": 1.4687564373016357, "rewards/rejected": -2.859104871749878, "step": 2007 }, { "epoch": 1.9, "grad_norm": 15.246626853942871, "learning_rate": 2.0444211262679256e-07, "logps/chosen": -51.04586410522461, "logps/rejected": -79.669921875, "loss": 0.2003, "losses/dpo": 0.24426805973052979, "losses/sft": 1.8733670711517334, "losses/total": 0.24426805973052979, "ref_logps/chosen": -36.87551498413086, "ref_logps/rejected": -46.27311706542969, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4170351028442383, "rewards/margins": 1.9226452112197876, "rewards/rejected": -3.3396801948547363, "step": 2008 }, { "epoch": 1.9, "grad_norm": 26.957326889038086, "learning_rate": 2.042672263029031e-07, "logps/chosen": -64.02755737304688, "logps/rejected": -87.56922912597656, "loss": 0.3679, "losses/dpo": 0.39934104681015015, "losses/sft": 2.0048184394836426, "losses/total": 0.39934104681015015, "ref_logps/chosen": -45.204986572265625, "ref_logps/rejected": -55.2584342956543, "rewards/accuracies": 0.875, "rewards/chosen": -1.8822574615478516, "rewards/margins": 1.3488216400146484, "rewards/rejected": -3.2310791015625, "step": 2009 }, { "epoch": 1.9, "grad_norm": 28.889867782592773, "learning_rate": 2.0409233997901364e-07, "logps/chosen": -45.40325927734375, "logps/rejected": -64.39208984375, "loss": 0.5301, "losses/dpo": 0.3618355393409729, "losses/sft": 1.6301692724227905, "losses/total": 0.3618355393409729, "ref_logps/chosen": -32.46398162841797, "ref_logps/rejected": -43.23768615722656, "rewards/accuracies": 0.6875, "rewards/chosen": -1.293927550315857, "rewards/margins": 0.8215130567550659, "rewards/rejected": -2.115440607070923, "step": 2010 }, { "epoch": 1.9, "grad_norm": 26.232376098632812, "learning_rate": 2.0391745365512417e-07, "logps/chosen": -58.55329132080078, "logps/rejected": -81.92396545410156, "loss": 0.3592, "losses/dpo": 0.4822644591331482, "losses/sft": 2.224480628967285, "losses/total": 0.4822644591331482, "ref_logps/chosen": -41.89851379394531, "ref_logps/rejected": -51.11354446411133, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6654775142669678, "rewards/margins": 1.4155640602111816, "rewards/rejected": -3.0810418128967285, "step": 2011 }, { "epoch": 1.9, "grad_norm": 18.64391326904297, "learning_rate": 2.037425673312347e-07, "logps/chosen": -57.3790283203125, "logps/rejected": -89.85823822021484, "loss": 0.321, "losses/dpo": 0.5200005769729614, "losses/sft": 1.751615285873413, "losses/total": 0.5200005769729614, "ref_logps/chosen": -38.839447021484375, "ref_logps/rejected": -53.599388122558594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8539587259292603, "rewards/margins": 1.7719268798828125, "rewards/rejected": -3.6258857250213623, "step": 2012 }, { "epoch": 1.9, "grad_norm": 26.9904727935791, "learning_rate": 2.0356768100734523e-07, "logps/chosen": -68.7083740234375, "logps/rejected": -76.96910095214844, "loss": 0.4193, "losses/dpo": 0.272158682346344, "losses/sft": 1.799604892730713, "losses/total": 0.272158682346344, "ref_logps/chosen": -50.99106979370117, "ref_logps/rejected": -49.56971740722656, "rewards/accuracies": 0.875, "rewards/chosen": -1.771729826927185, "rewards/margins": 0.9682086706161499, "rewards/rejected": -2.739938497543335, "step": 2013 }, { "epoch": 1.9, "grad_norm": 26.16299819946289, "learning_rate": 2.0339279468345574e-07, "logps/chosen": -45.748558044433594, "logps/rejected": -64.01423645019531, "loss": 0.5407, "losses/dpo": 0.621539294719696, "losses/sft": 1.330641508102417, "losses/total": 0.621539294719696, "ref_logps/chosen": -32.998722076416016, "ref_logps/rejected": -40.929996490478516, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2749831676483154, "rewards/margins": 1.0334405899047852, "rewards/rejected": -2.3084237575531006, "step": 2014 }, { "epoch": 1.9, "grad_norm": 33.948951721191406, "learning_rate": 2.0321790835956625e-07, "logps/chosen": -50.650291442871094, "logps/rejected": -73.06553649902344, "loss": 0.5064, "losses/dpo": 0.5304527878761292, "losses/sft": 2.369108200073242, "losses/total": 0.5304527878761292, "ref_logps/chosen": -35.19801330566406, "ref_logps/rejected": -44.235130310058594, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5452282428741455, "rewards/margins": 1.3378121852874756, "rewards/rejected": -2.883040428161621, "step": 2015 }, { "epoch": 1.9, "grad_norm": 20.02587890625, "learning_rate": 2.030430220356768e-07, "logps/chosen": -47.7255859375, "logps/rejected": -68.58493041992188, "loss": 0.3913, "losses/dpo": 0.43606191873550415, "losses/sft": 1.7529096603393555, "losses/total": 0.43606191873550415, "ref_logps/chosen": -34.093971252441406, "ref_logps/rejected": -42.21705627441406, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3631618022918701, "rewards/margins": 1.2736256122589111, "rewards/rejected": -2.636787176132202, "step": 2016 }, { "epoch": 1.9, "grad_norm": 28.85910415649414, "learning_rate": 2.0286813571178733e-07, "logps/chosen": -52.251495361328125, "logps/rejected": -71.4780044555664, "loss": 0.4898, "losses/dpo": 0.20923466980457306, "losses/sft": 1.697746992111206, "losses/total": 0.20923466980457306, "ref_logps/chosen": -35.547203063964844, "ref_logps/rejected": -44.104331970214844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.670428991317749, "rewards/margins": 1.0669379234313965, "rewards/rejected": -2.7373671531677246, "step": 2017 }, { "epoch": 1.91, "grad_norm": 20.186370849609375, "learning_rate": 2.0269324938789787e-07, "logps/chosen": -50.92329406738281, "logps/rejected": -73.6741714477539, "loss": 0.3167, "losses/dpo": 0.4310111701488495, "losses/sft": 1.2951059341430664, "losses/total": 0.4310111701488495, "ref_logps/chosen": -35.7137451171875, "ref_logps/rejected": -42.59636688232422, "rewards/accuracies": 0.875, "rewards/chosen": -1.5209550857543945, "rewards/margins": 1.5868254899978638, "rewards/rejected": -3.107780694961548, "step": 2018 }, { "epoch": 1.91, "grad_norm": 24.85271644592285, "learning_rate": 2.0251836306400838e-07, "logps/chosen": -48.684146881103516, "logps/rejected": -63.44342041015625, "loss": 0.3108, "losses/dpo": 0.4532914161682129, "losses/sft": 2.3797152042388916, "losses/total": 0.4532914161682129, "ref_logps/chosen": -36.48652648925781, "ref_logps/rejected": -36.149147033691406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.219761848449707, "rewards/margins": 1.5096651315689087, "rewards/rejected": -2.729426860809326, "step": 2019 }, { "epoch": 1.91, "grad_norm": 23.203615188598633, "learning_rate": 2.0234347674011892e-07, "logps/chosen": -55.359474182128906, "logps/rejected": -64.22511291503906, "loss": 0.4585, "losses/dpo": 0.5014011859893799, "losses/sft": 2.2300329208374023, "losses/total": 0.5014011859893799, "ref_logps/chosen": -38.677513122558594, "ref_logps/rejected": -36.93783950805664, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6681957244873047, "rewards/margins": 1.060531735420227, "rewards/rejected": -2.728727340698242, "step": 2020 }, { "epoch": 1.91, "grad_norm": 32.913204193115234, "learning_rate": 2.0216859041622943e-07, "logps/chosen": -59.758995056152344, "logps/rejected": -73.78396606445312, "loss": 0.5848, "losses/dpo": 0.42151883244514465, "losses/sft": 1.5064970254898071, "losses/total": 0.42151883244514465, "ref_logps/chosen": -45.86241912841797, "ref_logps/rejected": -53.4886474609375, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3896574974060059, "rewards/margins": 0.6398743391036987, "rewards/rejected": -2.029531955718994, "step": 2021 }, { "epoch": 1.91, "grad_norm": 16.41496467590332, "learning_rate": 2.0199370409233995e-07, "logps/chosen": -50.31555938720703, "logps/rejected": -63.7568359375, "loss": 0.3858, "losses/dpo": 0.5128999352455139, "losses/sft": 1.543710708618164, "losses/total": 0.5128999352455139, "ref_logps/chosen": -37.41289520263672, "ref_logps/rejected": -38.08341598510742, "rewards/accuracies": 0.875, "rewards/chosen": -1.290266752243042, "rewards/margins": 1.277075171470642, "rewards/rejected": -2.5673418045043945, "step": 2022 }, { "epoch": 1.91, "grad_norm": 24.01865005493164, "learning_rate": 2.018188177684505e-07, "logps/chosen": -45.16056823730469, "logps/rejected": -82.82772827148438, "loss": 0.2841, "losses/dpo": 0.35318487882614136, "losses/sft": 1.541651964187622, "losses/total": 0.35318487882614136, "ref_logps/chosen": -33.652015686035156, "ref_logps/rejected": -51.68000793457031, "rewards/accuracies": 0.875, "rewards/chosen": -1.1508554220199585, "rewards/margins": 1.9639167785644531, "rewards/rejected": -3.114772319793701, "step": 2023 }, { "epoch": 1.91, "grad_norm": 22.34075164794922, "learning_rate": 2.0164393144456102e-07, "logps/chosen": -48.46492004394531, "logps/rejected": -67.80860900878906, "loss": 0.4035, "losses/dpo": 0.3765292763710022, "losses/sft": 2.082228660583496, "losses/total": 0.3765292763710022, "ref_logps/chosen": -34.32884216308594, "ref_logps/rejected": -43.48926544189453, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4136078357696533, "rewards/margins": 1.0183261632919312, "rewards/rejected": -2.431934118270874, "step": 2024 }, { "epoch": 1.91, "grad_norm": 21.50850486755371, "learning_rate": 2.0146904512067156e-07, "logps/chosen": -45.23872375488281, "logps/rejected": -73.0084228515625, "loss": 0.3491, "losses/dpo": 0.17151105403900146, "losses/sft": 2.066784143447876, "losses/total": 0.17151105403900146, "ref_logps/chosen": -30.906810760498047, "ref_logps/rejected": -43.360382080078125, "rewards/accuracies": 0.875, "rewards/chosen": -1.4331917762756348, "rewards/margins": 1.5316121578216553, "rewards/rejected": -2.96480393409729, "step": 2025 }, { "epoch": 1.91, "grad_norm": 23.876951217651367, "learning_rate": 2.0129415879678208e-07, "logps/chosen": -43.144287109375, "logps/rejected": -57.90933609008789, "loss": 0.4229, "losses/dpo": 0.38650426268577576, "losses/sft": 1.7937887907028198, "losses/total": 0.38650426268577576, "ref_logps/chosen": -32.53365707397461, "ref_logps/rejected": -36.2231330871582, "rewards/accuracies": 0.75, "rewards/chosen": -1.0610628128051758, "rewards/margins": 1.1075578927993774, "rewards/rejected": -2.1686205863952637, "step": 2026 }, { "epoch": 1.91, "grad_norm": 31.716066360473633, "learning_rate": 2.0111927247289261e-07, "logps/chosen": -57.36821365356445, "logps/rejected": -70.847900390625, "loss": 0.5411, "losses/dpo": 0.8157985806465149, "losses/sft": 2.6244165897369385, "losses/total": 0.8157985806465149, "ref_logps/chosen": -41.74764633178711, "ref_logps/rejected": -46.83079528808594, "rewards/accuracies": 0.75, "rewards/chosen": -1.5620566606521606, "rewards/margins": 0.8396540284156799, "rewards/rejected": -2.4017105102539062, "step": 2027 }, { "epoch": 1.92, "grad_norm": 29.944028854370117, "learning_rate": 2.0094438614900313e-07, "logps/chosen": -54.41606521606445, "logps/rejected": -74.38653564453125, "loss": 0.4916, "losses/dpo": 0.35407572984695435, "losses/sft": 1.6522469520568848, "losses/total": 0.35407572984695435, "ref_logps/chosen": -38.94879150390625, "ref_logps/rejected": -46.490760803222656, "rewards/accuracies": 0.75, "rewards/chosen": -1.5467274188995361, "rewards/margins": 1.2428498268127441, "rewards/rejected": -2.7895772457122803, "step": 2028 }, { "epoch": 1.92, "grad_norm": 19.687559127807617, "learning_rate": 2.007694998251137e-07, "logps/chosen": -42.60370635986328, "logps/rejected": -69.28800964355469, "loss": 0.3099, "losses/dpo": 0.27244699001312256, "losses/sft": 1.6144404411315918, "losses/total": 0.27244699001312256, "ref_logps/chosen": -28.908899307250977, "ref_logps/rejected": -42.39570236206055, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3694807291030884, "rewards/margins": 1.3197505474090576, "rewards/rejected": -2.6892311573028564, "step": 2029 }, { "epoch": 1.92, "grad_norm": 18.690031051635742, "learning_rate": 2.005946135012242e-07, "logps/chosen": -43.9360466003418, "logps/rejected": -65.29751586914062, "loss": 0.2687, "losses/dpo": 0.14580021798610687, "losses/sft": 2.0838897228240967, "losses/total": 0.14580021798610687, "ref_logps/chosen": -31.547157287597656, "ref_logps/rejected": -34.753456115722656, "rewards/accuracies": 0.875, "rewards/chosen": -1.2388887405395508, "rewards/margins": 1.8155173063278198, "rewards/rejected": -3.05440616607666, "step": 2030 }, { "epoch": 1.92, "grad_norm": 19.959712982177734, "learning_rate": 2.0041972717733472e-07, "logps/chosen": -54.709930419921875, "logps/rejected": -75.29872131347656, "loss": 0.2827, "losses/dpo": 0.2894763648509979, "losses/sft": 1.991902232170105, "losses/total": 0.2894763648509979, "ref_logps/chosen": -41.17220687866211, "ref_logps/rejected": -46.030967712402344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3537721633911133, "rewards/margins": 1.5730029344558716, "rewards/rejected": -2.9267749786376953, "step": 2031 }, { "epoch": 1.92, "grad_norm": 23.293468475341797, "learning_rate": 2.0024484085344526e-07, "logps/chosen": -48.56928253173828, "logps/rejected": -69.32160186767578, "loss": 0.3616, "losses/dpo": 0.3449220061302185, "losses/sft": 1.977399468421936, "losses/total": 0.3449220061302185, "ref_logps/chosen": -34.26426315307617, "ref_logps/rejected": -43.83550262451172, "rewards/accuracies": 0.9375, "rewards/chosen": -1.43050217628479, "rewards/margins": 1.1181080341339111, "rewards/rejected": -2.548610210418701, "step": 2032 }, { "epoch": 1.92, "grad_norm": 26.006484985351562, "learning_rate": 2.0006995452955577e-07, "logps/chosen": -46.61804962158203, "logps/rejected": -50.099266052246094, "loss": 0.4767, "losses/dpo": 0.5687950849533081, "losses/sft": 1.6788700819015503, "losses/total": 0.5687950849533081, "ref_logps/chosen": -33.719215393066406, "ref_logps/rejected": -29.503496170043945, "rewards/accuracies": 0.875, "rewards/chosen": -1.2898836135864258, "rewards/margins": 0.7696936130523682, "rewards/rejected": -2.059576988220215, "step": 2033 }, { "epoch": 1.92, "grad_norm": 16.64080810546875, "learning_rate": 1.998950682056663e-07, "logps/chosen": -69.95222473144531, "logps/rejected": -103.65144348144531, "loss": 0.236, "losses/dpo": 0.3266555666923523, "losses/sft": 1.8553285598754883, "losses/total": 0.3266555666923523, "ref_logps/chosen": -50.67909622192383, "ref_logps/rejected": -62.793434143066406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9273128509521484, "rewards/margins": 2.1584882736206055, "rewards/rejected": -4.085801124572754, "step": 2034 }, { "epoch": 1.92, "grad_norm": 25.264097213745117, "learning_rate": 1.9972018188177682e-07, "logps/chosen": -48.751007080078125, "logps/rejected": -76.67971801757812, "loss": 0.4608, "losses/dpo": 0.21231618523597717, "losses/sft": 1.9976848363876343, "losses/total": 0.21231618523597717, "ref_logps/chosen": -35.7194709777832, "ref_logps/rejected": -49.99412536621094, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3031532764434814, "rewards/margins": 1.3654061555862427, "rewards/rejected": -2.6685593128204346, "step": 2035 }, { "epoch": 1.92, "grad_norm": 41.93486022949219, "learning_rate": 1.995452955578874e-07, "logps/chosen": -77.87896728515625, "logps/rejected": -84.36576843261719, "loss": 0.8346, "losses/dpo": 0.505761981010437, "losses/sft": 2.4345896244049072, "losses/total": 0.505761981010437, "ref_logps/chosen": -46.7550163269043, "ref_logps/rejected": -50.0225944519043, "rewards/accuracies": 0.625, "rewards/chosen": -3.1123950481414795, "rewards/margins": 0.32192283868789673, "rewards/rejected": -3.4343180656433105, "step": 2036 }, { "epoch": 1.92, "grad_norm": 19.129859924316406, "learning_rate": 1.993704092339979e-07, "logps/chosen": -44.94675064086914, "logps/rejected": -74.08683013916016, "loss": 0.2664, "losses/dpo": 0.23209071159362793, "losses/sft": 1.801204800605774, "losses/total": 0.23209071159362793, "ref_logps/chosen": -32.547752380371094, "ref_logps/rejected": -45.430458068847656, "rewards/accuracies": 1.0, "rewards/chosen": -1.2398995161056519, "rewards/margins": 1.6257374286651611, "rewards/rejected": -2.8656368255615234, "step": 2037 }, { "epoch": 1.92, "grad_norm": 19.115253448486328, "learning_rate": 1.991955229101084e-07, "logps/chosen": -45.65440368652344, "logps/rejected": -60.15361404418945, "loss": 0.3216, "losses/dpo": 0.2604219615459442, "losses/sft": 1.5795276165008545, "losses/total": 0.2604219615459442, "ref_logps/chosen": -34.540252685546875, "ref_logps/rejected": -36.18833923339844, "rewards/accuracies": 0.875, "rewards/chosen": -1.111415147781372, "rewards/margins": 1.2851128578186035, "rewards/rejected": -2.3965280055999756, "step": 2038 }, { "epoch": 1.93, "grad_norm": 29.1007137298584, "learning_rate": 1.9902063658621895e-07, "logps/chosen": -41.4874267578125, "logps/rejected": -60.68952178955078, "loss": 0.5793, "losses/dpo": 0.27138128876686096, "losses/sft": 1.2251626253128052, "losses/total": 0.27138128876686096, "ref_logps/chosen": -28.983592987060547, "ref_logps/rejected": -37.920616149902344, "rewards/accuracies": 0.625, "rewards/chosen": -1.2503833770751953, "rewards/margins": 1.0265073776245117, "rewards/rejected": -2.276890754699707, "step": 2039 }, { "epoch": 1.93, "grad_norm": 28.733631134033203, "learning_rate": 1.9884575026232946e-07, "logps/chosen": -52.36991500854492, "logps/rejected": -50.15070343017578, "loss": 0.5886, "losses/dpo": 0.5438276529312134, "losses/sft": 1.6661931276321411, "losses/total": 0.5438276529312134, "ref_logps/chosen": -36.1097412109375, "ref_logps/rejected": -29.813556671142578, "rewards/accuracies": 0.75, "rewards/chosen": -1.6260173320770264, "rewards/margins": 0.4076976776123047, "rewards/rejected": -2.033715009689331, "step": 2040 }, { "epoch": 1.93, "grad_norm": 15.451970100402832, "learning_rate": 1.9867086393844e-07, "logps/chosen": -49.75353240966797, "logps/rejected": -75.8673095703125, "loss": 0.246, "losses/dpo": 0.4540598392486572, "losses/sft": 2.013915538787842, "losses/total": 0.4540598392486572, "ref_logps/chosen": -36.06848907470703, "ref_logps/rejected": -42.807682037353516, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3685040473937988, "rewards/margins": 1.9374585151672363, "rewards/rejected": -3.305962562561035, "step": 2041 }, { "epoch": 1.93, "grad_norm": 18.874589920043945, "learning_rate": 1.9849597761455054e-07, "logps/chosen": -47.55992126464844, "logps/rejected": -69.62950897216797, "loss": 0.3843, "losses/dpo": 0.4438169598579407, "losses/sft": 1.8290280103683472, "losses/total": 0.4438169598579407, "ref_logps/chosen": -34.679588317871094, "ref_logps/rejected": -41.335845947265625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2880332469940186, "rewards/margins": 1.5413331985473633, "rewards/rejected": -2.829366445541382, "step": 2042 }, { "epoch": 1.93, "grad_norm": 36.710052490234375, "learning_rate": 1.9832109129066108e-07, "logps/chosen": -60.273834228515625, "logps/rejected": -69.26152801513672, "loss": 0.5931, "losses/dpo": 0.38301146030426025, "losses/sft": 1.7894673347473145, "losses/total": 0.38301146030426025, "ref_logps/chosen": -41.36894989013672, "ref_logps/rejected": -41.500797271728516, "rewards/accuracies": 0.75, "rewards/chosen": -1.890488624572754, "rewards/margins": 0.8855847716331482, "rewards/rejected": -2.776073455810547, "step": 2043 }, { "epoch": 1.93, "grad_norm": 20.80672836303711, "learning_rate": 1.981462049667716e-07, "logps/chosen": -54.35045623779297, "logps/rejected": -74.58406066894531, "loss": 0.379, "losses/dpo": 0.47416791319847107, "losses/sft": 3.190709352493286, "losses/total": 0.47416791319847107, "ref_logps/chosen": -36.59331130981445, "ref_logps/rejected": -40.45694351196289, "rewards/accuracies": 0.75, "rewards/chosen": -1.7757145166397095, "rewards/margins": 1.6369965076446533, "rewards/rejected": -3.4127109050750732, "step": 2044 }, { "epoch": 1.93, "grad_norm": 28.5202693939209, "learning_rate": 1.979713186428821e-07, "logps/chosen": -65.97419738769531, "logps/rejected": -87.1558837890625, "loss": 0.3507, "losses/dpo": 0.40661272406578064, "losses/sft": 2.788334369659424, "losses/total": 0.40661272406578064, "ref_logps/chosen": -45.286468505859375, "ref_logps/rejected": -47.588104248046875, "rewards/accuracies": 0.875, "rewards/chosen": -2.068773031234741, "rewards/margins": 1.8880045413970947, "rewards/rejected": -3.956777811050415, "step": 2045 }, { "epoch": 1.93, "grad_norm": 28.022443771362305, "learning_rate": 1.9779643231899265e-07, "logps/chosen": -51.276031494140625, "logps/rejected": -62.86093521118164, "loss": 0.4797, "losses/dpo": 0.5778504610061646, "losses/sft": 1.8465807437896729, "losses/total": 0.5778504610061646, "ref_logps/chosen": -34.459999084472656, "ref_logps/rejected": -37.06925582885742, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6816034317016602, "rewards/margins": 0.8975642919540405, "rewards/rejected": -2.5791680812835693, "step": 2046 }, { "epoch": 1.93, "grad_norm": 20.93613624572754, "learning_rate": 1.9762154599510316e-07, "logps/chosen": -52.076847076416016, "logps/rejected": -69.5322036743164, "loss": 0.2863, "losses/dpo": 0.21077489852905273, "losses/sft": 1.5609585046768188, "losses/total": 0.21077489852905273, "ref_logps/chosen": -38.66948318481445, "ref_logps/rejected": -40.144004821777344, "rewards/accuracies": 0.875, "rewards/chosen": -1.3407361507415771, "rewards/margins": 1.5980840921401978, "rewards/rejected": -2.9388203620910645, "step": 2047 }, { "epoch": 1.93, "grad_norm": 22.441720962524414, "learning_rate": 1.974466596712137e-07, "logps/chosen": -45.73637771606445, "logps/rejected": -67.10670471191406, "loss": 0.4889, "losses/dpo": 0.36217594146728516, "losses/sft": 1.6394838094711304, "losses/total": 0.36217594146728516, "ref_logps/chosen": -34.477806091308594, "ref_logps/rejected": -44.96718215942383, "rewards/accuracies": 0.75, "rewards/chosen": -1.1258573532104492, "rewards/margins": 1.08809494972229, "rewards/rejected": -2.2139523029327393, "step": 2048 }, { "epoch": 1.93, "grad_norm": 20.267269134521484, "learning_rate": 1.9727177334732424e-07, "logps/chosen": -46.09368896484375, "logps/rejected": -75.48944091796875, "loss": 0.3961, "losses/dpo": 0.6624510884284973, "losses/sft": 1.5255367755889893, "losses/total": 0.6624510884284973, "ref_logps/chosen": -33.08018112182617, "ref_logps/rejected": -46.07804870605469, "rewards/accuracies": 0.625, "rewards/chosen": -1.3013508319854736, "rewards/margins": 1.6397885084152222, "rewards/rejected": -2.9411392211914062, "step": 2049 }, { "epoch": 1.94, "grad_norm": 29.125268936157227, "learning_rate": 1.9709688702343478e-07, "logps/chosen": -55.18638610839844, "logps/rejected": -75.74964141845703, "loss": 0.4844, "losses/dpo": 0.1501118689775467, "losses/sft": 1.7127957344055176, "losses/total": 0.1501118689775467, "ref_logps/chosen": -36.874794006347656, "ref_logps/rejected": -43.76411056518555, "rewards/accuracies": 0.875, "rewards/chosen": -1.8311591148376465, "rewards/margins": 1.3673936128616333, "rewards/rejected": -3.1985528469085693, "step": 2050 }, { "epoch": 1.94, "grad_norm": 24.710039138793945, "learning_rate": 1.969220006995453e-07, "logps/chosen": -54.13361358642578, "logps/rejected": -61.68814468383789, "loss": 0.4008, "losses/dpo": 0.29393234848976135, "losses/sft": 1.48407781124115, "losses/total": 0.29393234848976135, "ref_logps/chosen": -40.47198486328125, "ref_logps/rejected": -36.94309997558594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.366162657737732, "rewards/margins": 1.1083418130874634, "rewards/rejected": -2.4745047092437744, "step": 2051 }, { "epoch": 1.94, "grad_norm": 21.571746826171875, "learning_rate": 1.967471143756558e-07, "logps/chosen": -57.80361557006836, "logps/rejected": -79.19660949707031, "loss": 0.2927, "losses/dpo": 0.33083444833755493, "losses/sft": 2.3784596920013428, "losses/total": 0.33083444833755493, "ref_logps/chosen": -39.48957443237305, "ref_logps/rejected": -44.828399658203125, "rewards/accuracies": 0.875, "rewards/chosen": -1.8314040899276733, "rewards/margins": 1.605417013168335, "rewards/rejected": -3.4368209838867188, "step": 2052 }, { "epoch": 1.94, "grad_norm": 20.24934196472168, "learning_rate": 1.9657222805176634e-07, "logps/chosen": -47.747398376464844, "logps/rejected": -67.80306243896484, "loss": 0.3692, "losses/dpo": 0.6854740381240845, "losses/sft": 1.8549668788909912, "losses/total": 0.6854740381240845, "ref_logps/chosen": -34.80888366699219, "ref_logps/rejected": -39.37554931640625, "rewards/accuracies": 0.8125, "rewards/chosen": -1.293851375579834, "rewards/margins": 1.5489002466201782, "rewards/rejected": -2.8427515029907227, "step": 2053 }, { "epoch": 1.94, "grad_norm": 25.91911506652832, "learning_rate": 1.9639734172787685e-07, "logps/chosen": -59.826622009277344, "logps/rejected": -93.56526184082031, "loss": 0.3855, "losses/dpo": 0.1819000095129013, "losses/sft": 2.266031265258789, "losses/total": 0.1819000095129013, "ref_logps/chosen": -40.761009216308594, "ref_logps/rejected": -59.559513092041016, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9065617322921753, "rewards/margins": 1.4940130710601807, "rewards/rejected": -3.4005746841430664, "step": 2054 }, { "epoch": 1.94, "grad_norm": 23.563228607177734, "learning_rate": 1.9622245540398742e-07, "logps/chosen": -52.501014709472656, "logps/rejected": -77.13671875, "loss": 0.4442, "losses/dpo": 0.277631938457489, "losses/sft": 2.213420867919922, "losses/total": 0.277631938457489, "ref_logps/chosen": -36.81894302368164, "ref_logps/rejected": -48.306800842285156, "rewards/accuracies": 0.875, "rewards/chosen": -1.5682072639465332, "rewards/margins": 1.314785361289978, "rewards/rejected": -2.882992744445801, "step": 2055 }, { "epoch": 1.94, "grad_norm": 28.440704345703125, "learning_rate": 1.9604756908009793e-07, "logps/chosen": -61.124229431152344, "logps/rejected": -82.30738067626953, "loss": 0.4886, "losses/dpo": 0.276885986328125, "losses/sft": 1.9950237274169922, "losses/total": 0.276885986328125, "ref_logps/chosen": -40.2645263671875, "ref_logps/rejected": -49.9613151550293, "rewards/accuracies": 0.6875, "rewards/chosen": -2.085969924926758, "rewards/margins": 1.1486362218856812, "rewards/rejected": -3.2346062660217285, "step": 2056 }, { "epoch": 1.94, "grad_norm": 16.89289093017578, "learning_rate": 1.9587268275620847e-07, "logps/chosen": -57.330413818359375, "logps/rejected": -78.9949722290039, "loss": 0.2761, "losses/dpo": 0.2522817850112915, "losses/sft": 1.7089128494262695, "losses/total": 0.2522817850112915, "ref_logps/chosen": -42.53447723388672, "ref_logps/rejected": -49.11338806152344, "rewards/accuracies": 0.875, "rewards/chosen": -1.4795938730239868, "rewards/margins": 1.5085647106170654, "rewards/rejected": -2.9881584644317627, "step": 2057 }, { "epoch": 1.94, "grad_norm": 23.73087501525879, "learning_rate": 1.9569779643231898e-07, "logps/chosen": -56.779701232910156, "logps/rejected": -72.22625732421875, "loss": 0.3604, "losses/dpo": 0.1695684939622879, "losses/sft": 1.6324191093444824, "losses/total": 0.1695684939622879, "ref_logps/chosen": -39.02884292602539, "ref_logps/rejected": -40.934547424316406, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7750858068466187, "rewards/margins": 1.3540852069854736, "rewards/rejected": -3.129171133041382, "step": 2058 }, { "epoch": 1.94, "grad_norm": 19.350387573242188, "learning_rate": 1.955229101084295e-07, "logps/chosen": -50.196685791015625, "logps/rejected": -69.0809097290039, "loss": 0.2892, "losses/dpo": 0.282158762216568, "losses/sft": 1.8599509000778198, "losses/total": 0.282158762216568, "ref_logps/chosen": -38.94902801513672, "ref_logps/rejected": -40.440338134765625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1247658729553223, "rewards/margins": 1.7392911911010742, "rewards/rejected": -2.8640570640563965, "step": 2059 }, { "epoch": 1.95, "grad_norm": 20.693626403808594, "learning_rate": 1.9534802378454004e-07, "logps/chosen": -48.73835754394531, "logps/rejected": -63.906105041503906, "loss": 0.416, "losses/dpo": 0.2316254824399948, "losses/sft": 1.894715428352356, "losses/total": 0.2316254824399948, "ref_logps/chosen": -37.42266082763672, "ref_logps/rejected": -42.40538024902344, "rewards/accuracies": 0.75, "rewards/chosen": -1.1315703392028809, "rewards/margins": 1.0185023546218872, "rewards/rejected": -2.1500725746154785, "step": 2060 }, { "epoch": 1.95, "grad_norm": 30.93864631652832, "learning_rate": 1.9517313746065057e-07, "logps/chosen": -58.22099685668945, "logps/rejected": -66.14900970458984, "loss": 0.5575, "losses/dpo": 0.45409464836120605, "losses/sft": 1.9882922172546387, "losses/total": 0.45409464836120605, "ref_logps/chosen": -40.06817626953125, "ref_logps/rejected": -40.44312286376953, "rewards/accuracies": 0.75, "rewards/chosen": -1.8152823448181152, "rewards/margins": 0.7553058862686157, "rewards/rejected": -2.5705881118774414, "step": 2061 }, { "epoch": 1.95, "grad_norm": 27.666088104248047, "learning_rate": 1.9499825113676111e-07, "logps/chosen": -60.009605407714844, "logps/rejected": -72.30021667480469, "loss": 0.4208, "losses/dpo": 0.6343302130699158, "losses/sft": 2.0791141986846924, "losses/total": 0.6343302130699158, "ref_logps/chosen": -40.573551177978516, "ref_logps/rejected": -39.11536407470703, "rewards/accuracies": 0.75, "rewards/chosen": -1.9436051845550537, "rewards/margins": 1.3748797178268433, "rewards/rejected": -3.3184847831726074, "step": 2062 }, { "epoch": 1.95, "grad_norm": 34.38661193847656, "learning_rate": 1.9482336481287163e-07, "logps/chosen": -49.46623229980469, "logps/rejected": -57.707603454589844, "loss": 0.6619, "losses/dpo": 0.7517745494842529, "losses/sft": 1.9962821006774902, "losses/total": 0.7517745494842529, "ref_logps/chosen": -33.148826599121094, "ref_logps/rejected": -34.319637298583984, "rewards/accuracies": 0.75, "rewards/chosen": -1.6317403316497803, "rewards/margins": 0.7070561647415161, "rewards/rejected": -2.338796615600586, "step": 2063 }, { "epoch": 1.95, "grad_norm": 21.083423614501953, "learning_rate": 1.9464847848898217e-07, "logps/chosen": -56.04933547973633, "logps/rejected": -65.13681030273438, "loss": 0.37, "losses/dpo": 0.23565126955509186, "losses/sft": 1.7713210582733154, "losses/total": 0.23565126955509186, "ref_logps/chosen": -44.885215759277344, "ref_logps/rejected": -38.86481475830078, "rewards/accuracies": 0.875, "rewards/chosen": -1.1164121627807617, "rewards/margins": 1.5107877254486084, "rewards/rejected": -2.627200126647949, "step": 2064 }, { "epoch": 1.95, "grad_norm": 20.880590438842773, "learning_rate": 1.9447359216509268e-07, "logps/chosen": -49.9166259765625, "logps/rejected": -69.47760009765625, "loss": 0.4008, "losses/dpo": 0.26993870735168457, "losses/sft": 1.7678112983703613, "losses/total": 0.26993870735168457, "ref_logps/chosen": -35.69708251953125, "ref_logps/rejected": -41.96161651611328, "rewards/accuracies": 0.875, "rewards/chosen": -1.4219542741775513, "rewards/margins": 1.3296444416046143, "rewards/rejected": -2.751598834991455, "step": 2065 }, { "epoch": 1.95, "grad_norm": 17.91613006591797, "learning_rate": 1.942987058412032e-07, "logps/chosen": -66.38330841064453, "logps/rejected": -88.3729476928711, "loss": 0.2564, "losses/dpo": 0.15901872515678406, "losses/sft": 1.9070210456848145, "losses/total": 0.15901872515678406, "ref_logps/chosen": -48.571468353271484, "ref_logps/rejected": -52.37348937988281, "rewards/accuracies": 0.9375, "rewards/chosen": -1.781184434890747, "rewards/margins": 1.8187617063522339, "rewards/rejected": -3.5999460220336914, "step": 2066 }, { "epoch": 1.95, "grad_norm": 27.85806655883789, "learning_rate": 1.9412381951731373e-07, "logps/chosen": -61.115875244140625, "logps/rejected": -78.0401840209961, "loss": 0.4132, "losses/dpo": 0.8243873119354248, "losses/sft": 1.2907639741897583, "losses/total": 0.8243873119354248, "ref_logps/chosen": -44.757720947265625, "ref_logps/rejected": -47.41073989868164, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6358152627944946, "rewards/margins": 1.427128553390503, "rewards/rejected": -3.062943935394287, "step": 2067 }, { "epoch": 1.95, "grad_norm": 20.662166595458984, "learning_rate": 1.9394893319342427e-07, "logps/chosen": -55.93009567260742, "logps/rejected": -70.70684814453125, "loss": 0.3395, "losses/dpo": 0.2904793620109558, "losses/sft": 2.2393345832824707, "losses/total": 0.2904793620109558, "ref_logps/chosen": -40.470787048339844, "ref_logps/rejected": -43.35810089111328, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5459303855895996, "rewards/margins": 1.1889444589614868, "rewards/rejected": -2.734874963760376, "step": 2068 }, { "epoch": 1.95, "grad_norm": 28.276548385620117, "learning_rate": 1.937740468695348e-07, "logps/chosen": -44.18762969970703, "logps/rejected": -60.83019256591797, "loss": 0.5104, "losses/dpo": 0.8664460182189941, "losses/sft": 1.5060625076293945, "losses/total": 0.8664460182189941, "ref_logps/chosen": -29.414011001586914, "ref_logps/rejected": -36.614662170410156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4773621559143066, "rewards/margins": 0.9441909193992615, "rewards/rejected": -2.421552896499634, "step": 2069 }, { "epoch": 1.95, "grad_norm": 24.872291564941406, "learning_rate": 1.9359916054564532e-07, "logps/chosen": -55.625770568847656, "logps/rejected": -78.26005554199219, "loss": 0.3554, "losses/dpo": 0.5299415588378906, "losses/sft": 1.4370397329330444, "losses/total": 0.5299415588378906, "ref_logps/chosen": -43.666263580322266, "ref_logps/rejected": -52.23921203613281, "rewards/accuracies": 0.875, "rewards/chosen": -1.1959506273269653, "rewards/margins": 1.406134009361267, "rewards/rejected": -2.6020846366882324, "step": 2070 }, { "epoch": 1.96, "grad_norm": 34.20013427734375, "learning_rate": 1.9342427422175586e-07, "logps/chosen": -63.907798767089844, "logps/rejected": -61.06121063232422, "loss": 0.6904, "losses/dpo": 0.7830640077590942, "losses/sft": 2.0493459701538086, "losses/total": 0.7830640077590942, "ref_logps/chosen": -43.97303009033203, "ref_logps/rejected": -38.165138244628906, "rewards/accuracies": 0.625, "rewards/chosen": -1.9934767484664917, "rewards/margins": 0.2961304485797882, "rewards/rejected": -2.289607048034668, "step": 2071 }, { "epoch": 1.96, "grad_norm": 22.545995712280273, "learning_rate": 1.9324938789786637e-07, "logps/chosen": -45.397300720214844, "logps/rejected": -64.16326141357422, "loss": 0.4307, "losses/dpo": 0.2381553053855896, "losses/sft": 1.8798092603683472, "losses/total": 0.2381553053855896, "ref_logps/chosen": -34.55494689941406, "ref_logps/rejected": -40.66218566894531, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0842351913452148, "rewards/margins": 1.2658724784851074, "rewards/rejected": -2.3501076698303223, "step": 2072 }, { "epoch": 1.96, "grad_norm": 23.276906967163086, "learning_rate": 1.9307450157397688e-07, "logps/chosen": -58.593563079833984, "logps/rejected": -74.5292739868164, "loss": 0.392, "losses/dpo": 0.3592626452445984, "losses/sft": 1.6102502346038818, "losses/total": 0.3592626452445984, "ref_logps/chosen": -46.083213806152344, "ref_logps/rejected": -46.22300720214844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2510348558425903, "rewards/margins": 1.579592227935791, "rewards/rejected": -2.830626964569092, "step": 2073 }, { "epoch": 1.96, "grad_norm": 24.0294132232666, "learning_rate": 1.9289961525008745e-07, "logps/chosen": -55.75016784667969, "logps/rejected": -65.87483978271484, "loss": 0.5777, "losses/dpo": 1.2023175954818726, "losses/sft": 2.2473299503326416, "losses/total": 1.2023175954818726, "ref_logps/chosen": -37.31719207763672, "ref_logps/rejected": -38.586856842041016, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8432972431182861, "rewards/margins": 0.8855009078979492, "rewards/rejected": -2.7287983894348145, "step": 2074 }, { "epoch": 1.96, "grad_norm": 25.856199264526367, "learning_rate": 1.9272472892619796e-07, "logps/chosen": -48.15317153930664, "logps/rejected": -69.64471435546875, "loss": 0.3503, "losses/dpo": 0.45898646116256714, "losses/sft": 1.573825716972351, "losses/total": 0.45898646116256714, "ref_logps/chosen": -33.36845397949219, "ref_logps/rejected": -39.878990173339844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4784717559814453, "rewards/margins": 1.498100996017456, "rewards/rejected": -2.9765727519989014, "step": 2075 }, { "epoch": 1.96, "grad_norm": 27.411643981933594, "learning_rate": 1.925498426023085e-07, "logps/chosen": -53.39967346191406, "logps/rejected": -71.10379791259766, "loss": 0.4396, "losses/dpo": 0.21336789429187775, "losses/sft": 1.7873870134353638, "losses/total": 0.21336789429187775, "ref_logps/chosen": -37.71217727661133, "ref_logps/rejected": -44.63653564453125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5687496662139893, "rewards/margins": 1.0779763460159302, "rewards/rejected": -2.646726131439209, "step": 2076 }, { "epoch": 1.96, "grad_norm": 26.992263793945312, "learning_rate": 1.9237495627841901e-07, "logps/chosen": -58.543304443359375, "logps/rejected": -61.3272705078125, "loss": 0.4392, "losses/dpo": 0.49617934226989746, "losses/sft": 1.79050874710083, "losses/total": 0.49617934226989746, "ref_logps/chosen": -43.81988525390625, "ref_logps/rejected": -37.0499267578125, "rewards/accuracies": 0.875, "rewards/chosen": -1.4723420143127441, "rewards/margins": 0.9553926587104797, "rewards/rejected": -2.427734613418579, "step": 2077 }, { "epoch": 1.96, "grad_norm": 29.41219711303711, "learning_rate": 1.9220006995452955e-07, "logps/chosen": -51.70094299316406, "logps/rejected": -58.129615783691406, "loss": 0.489, "losses/dpo": 0.48410069942474365, "losses/sft": 1.8122549057006836, "losses/total": 0.48410069942474365, "ref_logps/chosen": -37.28096389770508, "ref_logps/rejected": -34.442848205566406, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4419982433319092, "rewards/margins": 0.9266785383224487, "rewards/rejected": -2.3686766624450684, "step": 2078 }, { "epoch": 1.96, "grad_norm": 30.24034309387207, "learning_rate": 1.9202518363064007e-07, "logps/chosen": -65.14990997314453, "logps/rejected": -65.8658218383789, "loss": 0.5403, "losses/dpo": 0.5930138826370239, "losses/sft": 1.9618332386016846, "losses/total": 0.5930138826370239, "ref_logps/chosen": -49.0767822265625, "ref_logps/rejected": -39.36961364746094, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6073129177093506, "rewards/margins": 1.0423080921173096, "rewards/rejected": -2.64962100982666, "step": 2079 }, { "epoch": 1.96, "grad_norm": 21.096139907836914, "learning_rate": 1.9185029730675058e-07, "logps/chosen": -54.708377838134766, "logps/rejected": -84.69854736328125, "loss": 0.358, "losses/dpo": 0.3152535855770111, "losses/sft": 2.2408792972564697, "losses/total": 0.3152535855770111, "ref_logps/chosen": -38.508750915527344, "ref_logps/rejected": -52.90130615234375, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6199626922607422, "rewards/margins": 1.5597611665725708, "rewards/rejected": -3.1797237396240234, "step": 2080 }, { "epoch": 1.97, "grad_norm": 17.17258644104004, "learning_rate": 1.9167541098286114e-07, "logps/chosen": -47.74473190307617, "logps/rejected": -74.9290542602539, "loss": 0.3151, "losses/dpo": 0.2950161099433899, "losses/sft": 1.737703800201416, "losses/total": 0.2950161099433899, "ref_logps/chosen": -35.02825164794922, "ref_logps/rejected": -48.01458740234375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2716484069824219, "rewards/margins": 1.4197978973388672, "rewards/rejected": -2.691446304321289, "step": 2081 }, { "epoch": 1.97, "grad_norm": 29.31932258605957, "learning_rate": 1.9150052465897166e-07, "logps/chosen": -53.92249298095703, "logps/rejected": -71.12378692626953, "loss": 0.5637, "losses/dpo": 0.7377325296401978, "losses/sft": 1.9557472467422485, "losses/total": 0.7377325296401978, "ref_logps/chosen": -38.39622497558594, "ref_logps/rejected": -45.32025146484375, "rewards/accuracies": 0.6875, "rewards/chosen": -1.552626609802246, "rewards/margins": 1.0277271270751953, "rewards/rejected": -2.5803537368774414, "step": 2082 }, { "epoch": 1.97, "grad_norm": 13.754678726196289, "learning_rate": 1.913256383350822e-07, "logps/chosen": -47.44029998779297, "logps/rejected": -83.66126251220703, "loss": 0.2069, "losses/dpo": 0.10512280464172363, "losses/sft": 1.3472740650177002, "losses/total": 0.10512280464172363, "ref_logps/chosen": -36.48375701904297, "ref_logps/rejected": -51.67913055419922, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0956543684005737, "rewards/margins": 2.1025586128234863, "rewards/rejected": -3.1982131004333496, "step": 2083 }, { "epoch": 1.97, "grad_norm": 19.123947143554688, "learning_rate": 1.911507520111927e-07, "logps/chosen": -40.42908477783203, "logps/rejected": -65.13935089111328, "loss": 0.4052, "losses/dpo": 0.3695049285888672, "losses/sft": 1.5850168466567993, "losses/total": 0.3695049285888672, "ref_logps/chosen": -31.06500244140625, "ref_logps/rejected": -42.063072204589844, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9364081621170044, "rewards/margins": 1.371219515800476, "rewards/rejected": -2.3076276779174805, "step": 2084 }, { "epoch": 1.97, "grad_norm": 19.283388137817383, "learning_rate": 1.9097586568730325e-07, "logps/chosen": -53.13050842285156, "logps/rejected": -76.23269653320312, "loss": 0.3761, "losses/dpo": 0.26982805132865906, "losses/sft": 2.076272964477539, "losses/total": 0.26982805132865906, "ref_logps/chosen": -34.79920196533203, "ref_logps/rejected": -43.05308532714844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8331308364868164, "rewards/margins": 1.4848307371139526, "rewards/rejected": -3.3179616928100586, "step": 2085 }, { "epoch": 1.97, "grad_norm": 22.932968139648438, "learning_rate": 1.9080097936341376e-07, "logps/chosen": -51.976802825927734, "logps/rejected": -66.0677261352539, "loss": 0.3818, "losses/dpo": 0.4107745885848999, "losses/sft": 2.1810503005981445, "losses/total": 0.4107745885848999, "ref_logps/chosen": -37.13493347167969, "ref_logps/rejected": -40.08832550048828, "rewards/accuracies": 0.875, "rewards/chosen": -1.4841868877410889, "rewards/margins": 1.113753080368042, "rewards/rejected": -2.597939968109131, "step": 2086 }, { "epoch": 1.97, "grad_norm": 25.720481872558594, "learning_rate": 1.906260930395243e-07, "logps/chosen": -55.9951057434082, "logps/rejected": -74.78225708007812, "loss": 0.4095, "losses/dpo": 0.7387577295303345, "losses/sft": 2.833451986312866, "losses/total": 0.7387577295303345, "ref_logps/chosen": -39.956939697265625, "ref_logps/rejected": -46.588706970214844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6038166284561157, "rewards/margins": 1.2155382633209229, "rewards/rejected": -2.819355010986328, "step": 2087 }, { "epoch": 1.97, "grad_norm": 13.170822143554688, "learning_rate": 1.9045120671563484e-07, "logps/chosen": -52.139076232910156, "logps/rejected": -82.11978912353516, "loss": 0.2314, "losses/dpo": 0.15874797105789185, "losses/sft": 1.5309741497039795, "losses/total": 0.15874797105789185, "ref_logps/chosen": -40.98955535888672, "ref_logps/rejected": -51.77434158325195, "rewards/accuracies": 0.875, "rewards/chosen": -1.1149518489837646, "rewards/margins": 1.9195929765701294, "rewards/rejected": -3.0345449447631836, "step": 2088 }, { "epoch": 1.97, "grad_norm": 18.979862213134766, "learning_rate": 1.9027632039174535e-07, "logps/chosen": -49.205665588378906, "logps/rejected": -60.659915924072266, "loss": 0.4428, "losses/dpo": 0.36194419860839844, "losses/sft": 1.6680033206939697, "losses/total": 0.36194419860839844, "ref_logps/chosen": -32.610618591308594, "ref_logps/rejected": -34.23227310180664, "rewards/accuracies": 0.625, "rewards/chosen": -1.6595051288604736, "rewards/margins": 0.9832589626312256, "rewards/rejected": -2.642764091491699, "step": 2089 }, { "epoch": 1.97, "grad_norm": 27.069564819335938, "learning_rate": 1.901014340678559e-07, "logps/chosen": -46.990013122558594, "logps/rejected": -64.36250305175781, "loss": 0.4694, "losses/dpo": 0.8395521640777588, "losses/sft": 1.9547569751739502, "losses/total": 0.8395521640777588, "ref_logps/chosen": -32.83867645263672, "ref_logps/rejected": -37.38503646850586, "rewards/accuracies": 0.625, "rewards/chosen": -1.4151335954666138, "rewards/margins": 1.2826130390167236, "rewards/rejected": -2.697746753692627, "step": 2090 }, { "epoch": 1.97, "grad_norm": 22.355756759643555, "learning_rate": 1.899265477439664e-07, "logps/chosen": -56.93726348876953, "logps/rejected": -73.70066833496094, "loss": 0.3696, "losses/dpo": 0.34817594289779663, "losses/sft": 1.9821481704711914, "losses/total": 0.34817594289779663, "ref_logps/chosen": -46.03300476074219, "ref_logps/rejected": -48.89353561401367, "rewards/accuracies": 0.8125, "rewards/chosen": -1.090425968170166, "rewards/margins": 1.3902873992919922, "rewards/rejected": -2.480713367462158, "step": 2091 }, { "epoch": 1.98, "grad_norm": 23.368337631225586, "learning_rate": 1.8975166142007694e-07, "logps/chosen": -52.14710235595703, "logps/rejected": -65.72732543945312, "loss": 0.3966, "losses/dpo": 0.4847814738750458, "losses/sft": 1.4096633195877075, "losses/total": 0.4847814738750458, "ref_logps/chosen": -38.912132263183594, "ref_logps/rejected": -39.4206657409668, "rewards/accuracies": 0.75, "rewards/chosen": -1.3234974145889282, "rewards/margins": 1.3071682453155518, "rewards/rejected": -2.6306657791137695, "step": 2092 }, { "epoch": 1.98, "grad_norm": 15.476760864257812, "learning_rate": 1.8957677509618748e-07, "logps/chosen": -40.99615478515625, "logps/rejected": -61.18220520019531, "loss": 0.2826, "losses/dpo": 0.48680445551872253, "losses/sft": 1.4409599304199219, "losses/total": 0.48680445551872253, "ref_logps/chosen": -30.948347091674805, "ref_logps/rejected": -36.278907775878906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.004780888557434, "rewards/margins": 1.485548973083496, "rewards/rejected": -2.4903297424316406, "step": 2093 }, { "epoch": 1.98, "grad_norm": 19.184370040893555, "learning_rate": 1.89401888772298e-07, "logps/chosen": -54.1899528503418, "logps/rejected": -67.83609771728516, "loss": 0.3333, "losses/dpo": 0.15602180361747742, "losses/sft": 1.990742564201355, "losses/total": 0.15602180361747742, "ref_logps/chosen": -43.29187774658203, "ref_logps/rejected": -42.60942840576172, "rewards/accuracies": 0.875, "rewards/chosen": -1.0898075103759766, "rewards/margins": 1.4328596591949463, "rewards/rejected": -2.522667169570923, "step": 2094 }, { "epoch": 1.98, "grad_norm": 22.313430786132812, "learning_rate": 1.8922700244840853e-07, "logps/chosen": -48.01663589477539, "logps/rejected": -75.1309814453125, "loss": 0.3928, "losses/dpo": 0.09186310321092606, "losses/sft": 1.7343701124191284, "losses/total": 0.09186310321092606, "ref_logps/chosen": -36.94061279296875, "ref_logps/rejected": -50.28507995605469, "rewards/accuracies": 0.875, "rewards/chosen": -1.1076024770736694, "rewards/margins": 1.3769872188568115, "rewards/rejected": -2.4845895767211914, "step": 2095 }, { "epoch": 1.98, "grad_norm": 21.59351348876953, "learning_rate": 1.8905211612451905e-07, "logps/chosen": -58.79646301269531, "logps/rejected": -79.04988098144531, "loss": 0.3616, "losses/dpo": 0.47118711471557617, "losses/sft": 1.982682466506958, "losses/total": 0.47118711471557617, "ref_logps/chosen": -44.02228546142578, "ref_logps/rejected": -45.99781036376953, "rewards/accuracies": 0.875, "rewards/chosen": -1.4774174690246582, "rewards/margins": 1.8277900218963623, "rewards/rejected": -3.3052072525024414, "step": 2096 }, { "epoch": 1.98, "grad_norm": 19.19754981994629, "learning_rate": 1.8887722980062959e-07, "logps/chosen": -49.19921875, "logps/rejected": -77.63159942626953, "loss": 0.309, "losses/dpo": 0.16407868266105652, "losses/sft": 1.052739143371582, "losses/total": 0.16407868266105652, "ref_logps/chosen": -35.50464630126953, "ref_logps/rejected": -50.42033767700195, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3694572448730469, "rewards/margins": 1.351669192314148, "rewards/rejected": -2.7211265563964844, "step": 2097 }, { "epoch": 1.98, "grad_norm": 19.97472381591797, "learning_rate": 1.887023434767401e-07, "logps/chosen": -55.17011260986328, "logps/rejected": -62.014442443847656, "loss": 0.3012, "losses/dpo": 0.2652393579483032, "losses/sft": 1.9611105918884277, "losses/total": 0.2652393579483032, "ref_logps/chosen": -44.76509475708008, "ref_logps/rejected": -36.59451675415039, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0405018329620361, "rewards/margins": 1.5014909505844116, "rewards/rejected": -2.541992664337158, "step": 2098 }, { "epoch": 1.98, "grad_norm": 19.623929977416992, "learning_rate": 1.8852745715285064e-07, "logps/chosen": -47.913177490234375, "logps/rejected": -59.980003356933594, "loss": 0.4096, "losses/dpo": 0.4777611494064331, "losses/sft": 1.5093517303466797, "losses/total": 0.4777611494064331, "ref_logps/chosen": -34.349632263183594, "ref_logps/rejected": -36.96465301513672, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3563544750213623, "rewards/margins": 0.945180356502533, "rewards/rejected": -2.301534652709961, "step": 2099 }, { "epoch": 1.98, "grad_norm": 23.5931339263916, "learning_rate": 1.8835257082896118e-07, "logps/chosen": -55.432037353515625, "logps/rejected": -84.23030090332031, "loss": 0.4203, "losses/dpo": 0.47841909527778625, "losses/sft": 2.050821542739868, "losses/total": 0.47841909527778625, "ref_logps/chosen": -42.95378112792969, "ref_logps/rejected": -57.02083206176758, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2478258609771729, "rewards/margins": 1.4731216430664062, "rewards/rejected": -2.720947265625, "step": 2100 }, { "epoch": 1.98, "grad_norm": 17.061511993408203, "learning_rate": 1.881776845050717e-07, "logps/chosen": -52.63528823852539, "logps/rejected": -78.40866088867188, "loss": 0.306, "losses/dpo": 0.6880185604095459, "losses/sft": 1.5328595638275146, "losses/total": 0.6880185604095459, "ref_logps/chosen": -39.48440933227539, "ref_logps/rejected": -47.792110443115234, "rewards/accuracies": 0.875, "rewards/chosen": -1.3150876760482788, "rewards/margins": 1.746567726135254, "rewards/rejected": -3.0616555213928223, "step": 2101 }, { "epoch": 1.98, "grad_norm": 24.723827362060547, "learning_rate": 1.8800279818118223e-07, "logps/chosen": -50.295284271240234, "logps/rejected": -59.07971954345703, "loss": 0.4753, "losses/dpo": 0.6563800573348999, "losses/sft": 2.3130757808685303, "losses/total": 0.6563800573348999, "ref_logps/chosen": -37.324432373046875, "ref_logps/rejected": -37.981285095214844, "rewards/accuracies": 0.75, "rewards/chosen": -1.2970848083496094, "rewards/margins": 0.8127583265304565, "rewards/rejected": -2.1098432540893555, "step": 2102 }, { "epoch": 1.99, "grad_norm": 29.858367919921875, "learning_rate": 1.8782791185729274e-07, "logps/chosen": -52.0495719909668, "logps/rejected": -59.01878356933594, "loss": 0.5749, "losses/dpo": 1.0192652940750122, "losses/sft": 2.6816837787628174, "losses/total": 1.0192652940750122, "ref_logps/chosen": -36.212440490722656, "ref_logps/rejected": -33.46721649169922, "rewards/accuracies": 0.625, "rewards/chosen": -1.5837132930755615, "rewards/margins": 0.9714435338973999, "rewards/rejected": -2.555156707763672, "step": 2103 }, { "epoch": 1.99, "grad_norm": 17.373641967773438, "learning_rate": 1.8765302553340328e-07, "logps/chosen": -55.62825012207031, "logps/rejected": -65.21308898925781, "loss": 0.3592, "losses/dpo": 0.35648399591445923, "losses/sft": 1.9869621992111206, "losses/total": 0.35648399591445923, "ref_logps/chosen": -40.22018814086914, "ref_logps/rejected": -37.518821716308594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.540806531906128, "rewards/margins": 1.2286204099655151, "rewards/rejected": -2.7694268226623535, "step": 2104 }, { "epoch": 1.99, "grad_norm": 16.910202026367188, "learning_rate": 1.874781392095138e-07, "logps/chosen": -54.608497619628906, "logps/rejected": -80.50454711914062, "loss": 0.2619, "losses/dpo": 0.21352556347846985, "losses/sft": 1.5494791269302368, "losses/total": 0.21352556347846985, "ref_logps/chosen": -40.56264877319336, "ref_logps/rejected": -50.489662170410156, "rewards/accuracies": 1.0, "rewards/chosen": -1.4045848846435547, "rewards/margins": 1.5969035625457764, "rewards/rejected": -3.001488447189331, "step": 2105 }, { "epoch": 1.99, "grad_norm": 30.094318389892578, "learning_rate": 1.8730325288562436e-07, "logps/chosen": -58.43780517578125, "logps/rejected": -79.49662780761719, "loss": 0.5818, "losses/dpo": 0.9518269896507263, "losses/sft": 1.4864927530288696, "losses/total": 0.9518269896507263, "ref_logps/chosen": -44.42962646484375, "ref_logps/rejected": -53.615257263183594, "rewards/accuracies": 0.625, "rewards/chosen": -1.4008177518844604, "rewards/margins": 1.1873186826705933, "rewards/rejected": -2.5881364345550537, "step": 2106 }, { "epoch": 1.99, "grad_norm": 24.94447898864746, "learning_rate": 1.8712836656173487e-07, "logps/chosen": -57.46205139160156, "logps/rejected": -69.63911437988281, "loss": 0.4224, "losses/dpo": 0.7341169118881226, "losses/sft": 1.85963773727417, "losses/total": 0.7341169118881226, "ref_logps/chosen": -42.052757263183594, "ref_logps/rejected": -43.21053695678711, "rewards/accuracies": 0.75, "rewards/chosen": -1.5409293174743652, "rewards/margins": 1.1019287109375, "rewards/rejected": -2.6428580284118652, "step": 2107 }, { "epoch": 1.99, "grad_norm": 13.211400985717773, "learning_rate": 1.8695348023784538e-07, "logps/chosen": -49.99925231933594, "logps/rejected": -67.35913848876953, "loss": 0.2116, "losses/dpo": 0.27112096548080444, "losses/sft": 2.0193769931793213, "losses/total": 0.27112096548080444, "ref_logps/chosen": -37.098487854003906, "ref_logps/rejected": -36.51006317138672, "rewards/accuracies": 1.0, "rewards/chosen": -1.290076494216919, "rewards/margins": 1.7948315143585205, "rewards/rejected": -3.0849080085754395, "step": 2108 }, { "epoch": 1.99, "grad_norm": 16.180646896362305, "learning_rate": 1.8677859391395592e-07, "logps/chosen": -51.18617248535156, "logps/rejected": -88.51129913330078, "loss": 0.2823, "losses/dpo": 0.27311259508132935, "losses/sft": 1.8351768255233765, "losses/total": 0.27311259508132935, "ref_logps/chosen": -37.81904983520508, "ref_logps/rejected": -59.578643798828125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3367122411727905, "rewards/margins": 1.5565526485443115, "rewards/rejected": -2.8932647705078125, "step": 2109 }, { "epoch": 1.99, "grad_norm": 19.404233932495117, "learning_rate": 1.8660370759006644e-07, "logps/chosen": -46.71330261230469, "logps/rejected": -58.63636779785156, "loss": 0.4132, "losses/dpo": 0.39510804414749146, "losses/sft": 1.4505623579025269, "losses/total": 0.39510804414749146, "ref_logps/chosen": -37.53622055053711, "ref_logps/rejected": -36.431819915771484, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9177082180976868, "rewards/margins": 1.3027459383010864, "rewards/rejected": -2.220454216003418, "step": 2110 }, { "epoch": 1.99, "grad_norm": 26.47079086303711, "learning_rate": 1.8642882126617697e-07, "logps/chosen": -57.6226806640625, "logps/rejected": -80.47708129882812, "loss": 0.408, "losses/dpo": 0.5040897130966187, "losses/sft": 2.2119622230529785, "losses/total": 0.5040897130966187, "ref_logps/chosen": -41.14360809326172, "ref_logps/rejected": -48.865867614746094, "rewards/accuracies": 0.75, "rewards/chosen": -1.6479074954986572, "rewards/margins": 1.5132136344909668, "rewards/rejected": -3.161121368408203, "step": 2111 }, { "epoch": 1.99, "grad_norm": 17.380420684814453, "learning_rate": 1.862539349422875e-07, "logps/chosen": -48.38188934326172, "logps/rejected": -70.9681625366211, "loss": 0.26, "losses/dpo": 0.30021780729293823, "losses/sft": 1.6236364841461182, "losses/total": 0.30021780729293823, "ref_logps/chosen": -36.0531005859375, "ref_logps/rejected": -40.83494567871094, "rewards/accuracies": 0.875, "rewards/chosen": -1.2328790426254272, "rewards/margins": 1.7804433107376099, "rewards/rejected": -3.013322353363037, "step": 2112 }, { "epoch": 2.0, "grad_norm": 18.04389762878418, "learning_rate": 1.8607904861839805e-07, "logps/chosen": -60.78657531738281, "logps/rejected": -67.65718078613281, "loss": 0.3145, "losses/dpo": 0.5886095762252808, "losses/sft": 1.701097011566162, "losses/total": 0.5886095762252808, "ref_logps/chosen": -50.240760803222656, "ref_logps/rejected": -41.739891052246094, "rewards/accuracies": 0.875, "rewards/chosen": -1.0545814037322998, "rewards/margins": 1.5371477603912354, "rewards/rejected": -2.591729164123535, "step": 2113 }, { "epoch": 2.0, "grad_norm": 25.632863998413086, "learning_rate": 1.8590416229450857e-07, "logps/chosen": -50.28253936767578, "logps/rejected": -65.70915222167969, "loss": 0.5052, "losses/dpo": 0.36404773592948914, "losses/sft": 1.6944377422332764, "losses/total": 0.36404773592948914, "ref_logps/chosen": -34.70207977294922, "ref_logps/rejected": -39.907432556152344, "rewards/accuracies": 0.75, "rewards/chosen": -1.5580458641052246, "rewards/margins": 1.0221258401870728, "rewards/rejected": -2.580171823501587, "step": 2114 }, { "epoch": 2.0, "grad_norm": 19.195222854614258, "learning_rate": 1.8572927597061908e-07, "logps/chosen": -53.40088653564453, "logps/rejected": -58.343116760253906, "loss": 0.3877, "losses/dpo": 0.39997634291648865, "losses/sft": 1.6913156509399414, "losses/total": 0.39997634291648865, "ref_logps/chosen": -39.987152099609375, "ref_logps/rejected": -35.145591735839844, "rewards/accuracies": 0.875, "rewards/chosen": -1.3413735628128052, "rewards/margins": 0.9783786535263062, "rewards/rejected": -2.3197522163391113, "step": 2115 }, { "epoch": 2.0, "grad_norm": 24.312637329101562, "learning_rate": 1.8555438964672962e-07, "logps/chosen": -47.844505310058594, "logps/rejected": -61.53788757324219, "loss": 0.4046, "losses/dpo": 0.4707256555557251, "losses/sft": 1.9867011308670044, "losses/total": 0.4707256555557251, "ref_logps/chosen": -36.274505615234375, "ref_logps/rejected": -40.71784973144531, "rewards/accuracies": 0.8125, "rewards/chosen": -1.156999945640564, "rewards/margins": 0.9250036478042603, "rewards/rejected": -2.082003593444824, "step": 2116 }, { "epoch": 2.0, "grad_norm": 20.24012565612793, "learning_rate": 1.8537950332284013e-07, "logps/chosen": -53.673866271972656, "logps/rejected": -69.09025573730469, "loss": 0.3986, "losses/dpo": 0.5724376440048218, "losses/sft": 1.5727691650390625, "losses/total": 0.5724376440048218, "ref_logps/chosen": -38.131690979003906, "ref_logps/rejected": -42.203712463378906, "rewards/accuracies": 0.875, "rewards/chosen": -1.5542174577713013, "rewards/margins": 1.1344361305236816, "rewards/rejected": -2.6886537075042725, "step": 2117 }, { "epoch": 2.0, "grad_norm": 23.998689651489258, "learning_rate": 1.8520461699895067e-07, "logps/chosen": -35.5611572265625, "logps/rejected": -53.264007568359375, "loss": 0.5241, "losses/dpo": 0.7268857955932617, "losses/sft": 2.0944571495056152, "losses/total": 0.7268857955932617, "ref_logps/chosen": -26.67676544189453, "ref_logps/rejected": -33.97700500488281, "rewards/accuracies": 0.625, "rewards/chosen": -0.8884388208389282, "rewards/margins": 1.0402616262435913, "rewards/rejected": -1.9287004470825195, "step": 2118 }, { "epoch": 2.0, "grad_norm": 14.622540473937988, "learning_rate": 1.850297306750612e-07, "logps/chosen": -54.903717041015625, "logps/rejected": -72.09414672851562, "loss": 0.2194, "losses/dpo": 0.27196836471557617, "losses/sft": 1.6568127870559692, "losses/total": 0.27196836471557617, "ref_logps/chosen": -45.634002685546875, "ref_logps/rejected": -46.37932586669922, "rewards/accuracies": 1.0, "rewards/chosen": -0.9269713759422302, "rewards/margins": 1.6445108652114868, "rewards/rejected": -2.5714821815490723, "step": 2119 }, { "epoch": 2.0, "grad_norm": 15.972982406616211, "learning_rate": 1.8485484435117175e-07, "logps/chosen": -43.17649841308594, "logps/rejected": -59.56483840942383, "loss": 0.2986, "losses/dpo": 0.48384755849838257, "losses/sft": 1.9165600538253784, "losses/total": 0.48384755849838257, "ref_logps/chosen": -33.55433654785156, "ref_logps/rejected": -35.56121063232422, "rewards/accuracies": 0.875, "rewards/chosen": -0.9622160196304321, "rewards/margins": 1.4381465911865234, "rewards/rejected": -2.400362730026245, "step": 2120 }, { "epoch": 2.0, "grad_norm": 11.680267333984375, "learning_rate": 1.8467995802728226e-07, "logps/chosen": -35.333065032958984, "logps/rejected": -59.890071868896484, "loss": 0.2523, "losses/dpo": 0.1491500437259674, "losses/sft": 1.8207087516784668, "losses/total": 0.1491500437259674, "ref_logps/chosen": -29.129497528076172, "ref_logps/rejected": -34.53155517578125, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6203566193580627, "rewards/margins": 1.9154951572418213, "rewards/rejected": -2.5358517169952393, "step": 2121 }, { "epoch": 2.0, "grad_norm": 18.440881729125977, "learning_rate": 1.8450507170339277e-07, "logps/chosen": -50.24641418457031, "logps/rejected": -69.30216979980469, "loss": 0.3082, "losses/dpo": 0.06355299055576324, "losses/sft": 1.8499056100845337, "losses/total": 0.06355299055576324, "ref_logps/chosen": -38.864601135253906, "ref_logps/rejected": -40.225372314453125, "rewards/accuracies": 0.875, "rewards/chosen": -1.138181447982788, "rewards/margins": 1.7694981098175049, "rewards/rejected": -2.907679796218872, "step": 2122 }, { "epoch": 2.0, "grad_norm": 12.123652458190918, "learning_rate": 1.843301853795033e-07, "logps/chosen": -47.44609451293945, "logps/rejected": -98.56100463867188, "loss": 0.1402, "losses/dpo": 0.2347048819065094, "losses/sft": 1.9066039323806763, "losses/total": 0.2347048819065094, "ref_logps/chosen": -36.865753173828125, "ref_logps/rejected": -62.88535690307617, "rewards/accuracies": 1.0, "rewards/chosen": -1.058034062385559, "rewards/margins": 2.509530544281006, "rewards/rejected": -3.5675647258758545, "step": 2123 }, { "epoch": 2.01, "grad_norm": 10.315705299377441, "learning_rate": 1.8415529905561382e-07, "logps/chosen": -49.76079559326172, "logps/rejected": -82.68477630615234, "loss": 0.2081, "losses/dpo": 0.13203921914100647, "losses/sft": 1.5670490264892578, "losses/total": 0.13203921914100647, "ref_logps/chosen": -38.33154296875, "ref_logps/rejected": -49.986724853515625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1429252624511719, "rewards/margins": 2.1268796920776367, "rewards/rejected": -3.2698049545288086, "step": 2124 }, { "epoch": 2.01, "grad_norm": 25.119314193725586, "learning_rate": 1.839804127317244e-07, "logps/chosen": -50.736183166503906, "logps/rejected": -63.65363693237305, "loss": 0.476, "losses/dpo": 0.7009780406951904, "losses/sft": 1.9088724851608276, "losses/total": 0.7009780406951904, "ref_logps/chosen": -34.78553771972656, "ref_logps/rejected": -39.48884582519531, "rewards/accuracies": 0.75, "rewards/chosen": -1.595064401626587, "rewards/margins": 0.8214143514633179, "rewards/rejected": -2.4164786338806152, "step": 2125 }, { "epoch": 2.01, "grad_norm": 17.129047393798828, "learning_rate": 1.838055264078349e-07, "logps/chosen": -46.27547836303711, "logps/rejected": -69.67829132080078, "loss": 0.3594, "losses/dpo": 0.6327574849128723, "losses/sft": 1.9159936904907227, "losses/total": 0.6327574849128723, "ref_logps/chosen": -31.954605102539062, "ref_logps/rejected": -41.980838775634766, "rewards/accuracies": 0.875, "rewards/chosen": -1.4320874214172363, "rewards/margins": 1.3376576900482178, "rewards/rejected": -2.769745349884033, "step": 2126 }, { "epoch": 2.01, "grad_norm": 9.828198432922363, "learning_rate": 1.8363064008394544e-07, "logps/chosen": -47.61317443847656, "logps/rejected": -81.50820922851562, "loss": 0.1344, "losses/dpo": 0.10161048173904419, "losses/sft": 1.8394410610198975, "losses/total": 0.10161048173904419, "ref_logps/chosen": -37.03186798095703, "ref_logps/rejected": -45.87834167480469, "rewards/accuracies": 1.0, "rewards/chosen": -1.0581305027008057, "rewards/margins": 2.5048563480377197, "rewards/rejected": -3.5629868507385254, "step": 2127 }, { "epoch": 2.01, "grad_norm": 14.764139175415039, "learning_rate": 1.8345575376005595e-07, "logps/chosen": -57.10142517089844, "logps/rejected": -86.08834075927734, "loss": 0.2137, "losses/dpo": 0.19319021701812744, "losses/sft": 2.0294692516326904, "losses/total": 0.19319021701812744, "ref_logps/chosen": -42.53826904296875, "ref_logps/rejected": -51.38894271850586, "rewards/accuracies": 1.0, "rewards/chosen": -1.456315517425537, "rewards/margins": 2.0136241912841797, "rewards/rejected": -3.4699394702911377, "step": 2128 }, { "epoch": 2.01, "grad_norm": 13.709147453308105, "learning_rate": 1.8328086743616647e-07, "logps/chosen": -51.933265686035156, "logps/rejected": -71.26970672607422, "loss": 0.2108, "losses/dpo": 0.36864811182022095, "losses/sft": 1.5049283504486084, "losses/total": 0.36864811182022095, "ref_logps/chosen": -43.06371307373047, "ref_logps/rejected": -41.1356086730957, "rewards/accuracies": 1.0, "rewards/chosen": -0.886955201625824, "rewards/margins": 2.1264548301696777, "rewards/rejected": -3.0134103298187256, "step": 2129 }, { "epoch": 2.01, "grad_norm": 9.689056396484375, "learning_rate": 1.83105981112277e-07, "logps/chosen": -55.64209747314453, "logps/rejected": -85.40646362304688, "loss": 0.1437, "losses/dpo": 0.190652996301651, "losses/sft": 1.7045769691467285, "losses/total": 0.190652996301651, "ref_logps/chosen": -43.312255859375, "ref_logps/rejected": -47.68393325805664, "rewards/accuracies": 1.0, "rewards/chosen": -1.2329840660095215, "rewards/margins": 2.539268970489502, "rewards/rejected": -3.7722532749176025, "step": 2130 }, { "epoch": 2.01, "grad_norm": 22.669729232788086, "learning_rate": 1.8293109478838752e-07, "logps/chosen": -56.211883544921875, "logps/rejected": -68.6495361328125, "loss": 0.4304, "losses/dpo": 0.40455734729766846, "losses/sft": 2.158808469772339, "losses/total": 0.40455734729766846, "ref_logps/chosen": -40.2336311340332, "ref_logps/rejected": -40.402957916259766, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5978256464004517, "rewards/margins": 1.2268320322036743, "rewards/rejected": -2.824657678604126, "step": 2131 }, { "epoch": 2.01, "grad_norm": 19.535429000854492, "learning_rate": 1.8275620846449808e-07, "logps/chosen": -57.2041015625, "logps/rejected": -61.618003845214844, "loss": 0.2841, "losses/dpo": 0.3089786767959595, "losses/sft": 1.9661866426467896, "losses/total": 0.3089786767959595, "ref_logps/chosen": -42.45734405517578, "ref_logps/rejected": -32.064273834228516, "rewards/accuracies": 0.875, "rewards/chosen": -1.4746756553649902, "rewards/margins": 1.4806971549987793, "rewards/rejected": -2.9553728103637695, "step": 2132 }, { "epoch": 2.01, "grad_norm": 14.503403663635254, "learning_rate": 1.825813221406086e-07, "logps/chosen": -41.284278869628906, "logps/rejected": -65.96885681152344, "loss": 0.2964, "losses/dpo": 0.4060315489768982, "losses/sft": 1.5218085050582886, "losses/total": 0.4060315489768982, "ref_logps/chosen": -33.32521438598633, "ref_logps/rejected": -43.205238342285156, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7959065437316895, "rewards/margins": 1.4804561138153076, "rewards/rejected": -2.276362895965576, "step": 2133 }, { "epoch": 2.02, "grad_norm": 14.652665138244629, "learning_rate": 1.8240643581671914e-07, "logps/chosen": -52.136600494384766, "logps/rejected": -74.16392517089844, "loss": 0.2013, "losses/dpo": 0.1409742534160614, "losses/sft": 1.9089434146881104, "losses/total": 0.1409742534160614, "ref_logps/chosen": -40.16832733154297, "ref_logps/rejected": -43.59336853027344, "rewards/accuracies": 1.0, "rewards/chosen": -1.1968274116516113, "rewards/margins": 1.860228419303894, "rewards/rejected": -3.057055950164795, "step": 2134 }, { "epoch": 2.02, "grad_norm": 20.786081314086914, "learning_rate": 1.8223154949282965e-07, "logps/chosen": -56.99998474121094, "logps/rejected": -78.87982940673828, "loss": 0.255, "losses/dpo": 0.13895924389362335, "losses/sft": 2.1962175369262695, "losses/total": 0.13895924389362335, "ref_logps/chosen": -41.885986328125, "ref_logps/rejected": -46.557987213134766, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5114002227783203, "rewards/margins": 1.7207839488983154, "rewards/rejected": -3.2321841716766357, "step": 2135 }, { "epoch": 2.02, "grad_norm": 20.18816375732422, "learning_rate": 1.820566631689402e-07, "logps/chosen": -53.30295944213867, "logps/rejected": -76.03685760498047, "loss": 0.2507, "losses/dpo": 0.23952838778495789, "losses/sft": 1.6842877864837646, "losses/total": 0.23952838778495789, "ref_logps/chosen": -37.580692291259766, "ref_logps/rejected": -43.69908905029297, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5722270011901855, "rewards/margins": 1.6615500450134277, "rewards/rejected": -3.2337770462036133, "step": 2136 }, { "epoch": 2.02, "grad_norm": 16.824499130249023, "learning_rate": 1.818817768450507e-07, "logps/chosen": -50.670997619628906, "logps/rejected": -68.41815185546875, "loss": 0.272, "losses/dpo": 0.13589981198310852, "losses/sft": 1.4028607606887817, "losses/total": 0.13589981198310852, "ref_logps/chosen": -38.59898376464844, "ref_logps/rejected": -37.87329864501953, "rewards/accuracies": 0.875, "rewards/chosen": -1.2072012424468994, "rewards/margins": 1.847284197807312, "rewards/rejected": -3.054485321044922, "step": 2137 }, { "epoch": 2.02, "grad_norm": 10.899084091186523, "learning_rate": 1.8170689052116124e-07, "logps/chosen": -51.74102020263672, "logps/rejected": -79.1762466430664, "loss": 0.1997, "losses/dpo": 0.2777785062789917, "losses/sft": 2.1594667434692383, "losses/total": 0.2777785062789917, "ref_logps/chosen": -37.96143341064453, "ref_logps/rejected": -43.343299865722656, "rewards/accuracies": 1.0, "rewards/chosen": -1.3779584169387817, "rewards/margins": 2.205335855484009, "rewards/rejected": -3.583294153213501, "step": 2138 }, { "epoch": 2.02, "grad_norm": 18.899595260620117, "learning_rate": 1.8153200419727178e-07, "logps/chosen": -65.54669189453125, "logps/rejected": -69.53129577636719, "loss": 0.2631, "losses/dpo": 0.20167754590511322, "losses/sft": 1.8834009170532227, "losses/total": 0.20167754590511322, "ref_logps/chosen": -51.37293243408203, "ref_logps/rejected": -39.78549575805664, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4173754453659058, "rewards/margins": 1.5572043657302856, "rewards/rejected": -2.9745798110961914, "step": 2139 }, { "epoch": 2.02, "grad_norm": 18.19244384765625, "learning_rate": 1.813571178733823e-07, "logps/chosen": -60.87751388549805, "logps/rejected": -95.88334655761719, "loss": 0.2329, "losses/dpo": 0.21829812228679657, "losses/sft": 2.4819176197052, "losses/total": 0.21829812228679657, "ref_logps/chosen": -42.347862243652344, "ref_logps/rejected": -58.603275299072266, "rewards/accuracies": 1.0, "rewards/chosen": -1.8529653549194336, "rewards/margins": 1.8750416040420532, "rewards/rejected": -3.7280068397521973, "step": 2140 }, { "epoch": 2.02, "grad_norm": 19.8094539642334, "learning_rate": 1.8118223154949283e-07, "logps/chosen": -49.105247497558594, "logps/rejected": -79.30419921875, "loss": 0.3108, "losses/dpo": 0.5008683800697327, "losses/sft": 2.0632882118225098, "losses/total": 0.5008683800697327, "ref_logps/chosen": -32.95293426513672, "ref_logps/rejected": -48.30475616455078, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6152312755584717, "rewards/margins": 1.4847133159637451, "rewards/rejected": -3.099944591522217, "step": 2141 }, { "epoch": 2.02, "grad_norm": 22.921703338623047, "learning_rate": 1.8100734522560334e-07, "logps/chosen": -51.896026611328125, "logps/rejected": -66.95953369140625, "loss": 0.318, "losses/dpo": 0.3040695786476135, "losses/sft": 2.3770389556884766, "losses/total": 0.3040695786476135, "ref_logps/chosen": -37.69776916503906, "ref_logps/rejected": -36.614402770996094, "rewards/accuracies": 0.875, "rewards/chosen": -1.419825792312622, "rewards/margins": 1.614687204360962, "rewards/rejected": -3.034513235092163, "step": 2142 }, { "epoch": 2.02, "grad_norm": 17.001493453979492, "learning_rate": 1.8083245890171388e-07, "logps/chosen": -43.463356018066406, "logps/rejected": -73.8758544921875, "loss": 0.2646, "losses/dpo": 0.2626388669013977, "losses/sft": 1.924068570137024, "losses/total": 0.2626388669013977, "ref_logps/chosen": -28.334320068359375, "ref_logps/rejected": -43.8366813659668, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5129038095474243, "rewards/margins": 1.491014003753662, "rewards/rejected": -3.003917694091797, "step": 2143 }, { "epoch": 2.02, "grad_norm": 15.26026725769043, "learning_rate": 1.806575725778244e-07, "logps/chosen": -51.992671966552734, "logps/rejected": -87.82942199707031, "loss": 0.2071, "losses/dpo": 0.13501951098442078, "losses/sft": 2.489450693130493, "losses/total": 0.13501951098442078, "ref_logps/chosen": -35.501312255859375, "ref_logps/rejected": -49.98847961425781, "rewards/accuracies": 1.0, "rewards/chosen": -1.6491355895996094, "rewards/margins": 2.134958267211914, "rewards/rejected": -3.7840940952301025, "step": 2144 }, { "epoch": 2.03, "grad_norm": 18.45328140258789, "learning_rate": 1.8048268625393493e-07, "logps/chosen": -55.810699462890625, "logps/rejected": -96.5757064819336, "loss": 0.2232, "losses/dpo": 0.2247193455696106, "losses/sft": 1.5977500677108765, "losses/total": 0.2247193455696106, "ref_logps/chosen": -36.68391418457031, "ref_logps/rejected": -55.360992431640625, "rewards/accuracies": 0.875, "rewards/chosen": -1.9126784801483154, "rewards/margins": 2.208792209625244, "rewards/rejected": -4.121470928192139, "step": 2145 }, { "epoch": 2.03, "grad_norm": 21.106897354125977, "learning_rate": 1.8030779993004547e-07, "logps/chosen": -56.17461013793945, "logps/rejected": -68.20716857910156, "loss": 0.3971, "losses/dpo": 0.15008476376533508, "losses/sft": 2.1184966564178467, "losses/total": 0.15008476376533508, "ref_logps/chosen": -36.314857482910156, "ref_logps/rejected": -34.541873931884766, "rewards/accuracies": 0.75, "rewards/chosen": -1.9859752655029297, "rewards/margins": 1.3805540800094604, "rewards/rejected": -3.3665294647216797, "step": 2146 }, { "epoch": 2.03, "grad_norm": 22.801605224609375, "learning_rate": 1.8013291360615599e-07, "logps/chosen": -54.46410369873047, "logps/rejected": -81.90934753417969, "loss": 0.3284, "losses/dpo": 0.43667668104171753, "losses/sft": 2.5421383380889893, "losses/total": 0.43667668104171753, "ref_logps/chosen": -37.55340576171875, "ref_logps/rejected": -50.67803192138672, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6910698413848877, "rewards/margins": 1.4320616722106934, "rewards/rejected": -3.123131275177002, "step": 2147 }, { "epoch": 2.03, "grad_norm": 22.096450805664062, "learning_rate": 1.7995802728226652e-07, "logps/chosen": -60.6259765625, "logps/rejected": -68.10839080810547, "loss": 0.2962, "losses/dpo": 0.1869063675403595, "losses/sft": 2.394782304763794, "losses/total": 0.1869063675403595, "ref_logps/chosen": -41.70875549316406, "ref_logps/rejected": -34.14208221435547, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8917219638824463, "rewards/margins": 1.504908561706543, "rewards/rejected": -3.3966307640075684, "step": 2148 }, { "epoch": 2.03, "grad_norm": 15.411166191101074, "learning_rate": 1.7978314095837704e-07, "logps/chosen": -59.88125991821289, "logps/rejected": -90.34550476074219, "loss": 0.1929, "losses/dpo": 0.11514449119567871, "losses/sft": 1.7957857847213745, "losses/total": 0.11514449119567871, "ref_logps/chosen": -44.15142822265625, "ref_logps/rejected": -51.37074279785156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5729832649230957, "rewards/margins": 2.324493169784546, "rewards/rejected": -3.8974764347076416, "step": 2149 }, { "epoch": 2.03, "grad_norm": 15.002555847167969, "learning_rate": 1.7960825463448758e-07, "logps/chosen": -54.14069747924805, "logps/rejected": -77.42867279052734, "loss": 0.3054, "losses/dpo": 0.29670092463493347, "losses/sft": 2.0884249210357666, "losses/total": 0.29670092463493347, "ref_logps/chosen": -36.3447265625, "ref_logps/rejected": -43.1048469543457, "rewards/accuracies": 0.875, "rewards/chosen": -1.7795970439910889, "rewards/margins": 1.6527860164642334, "rewards/rejected": -3.4323830604553223, "step": 2150 }, { "epoch": 2.03, "grad_norm": 18.32962417602539, "learning_rate": 1.7943336831059812e-07, "logps/chosen": -61.80403137207031, "logps/rejected": -96.60762023925781, "loss": 0.2212, "losses/dpo": 0.5263907313346863, "losses/sft": 2.3820464611053467, "losses/total": 0.5263907313346863, "ref_logps/chosen": -43.27747344970703, "ref_logps/rejected": -55.29908752441406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8526558876037598, "rewards/margins": 2.278197765350342, "rewards/rejected": -4.130853652954102, "step": 2151 }, { "epoch": 2.03, "grad_norm": 25.970869064331055, "learning_rate": 1.7925848198670863e-07, "logps/chosen": -58.544456481933594, "logps/rejected": -80.18570709228516, "loss": 0.3449, "losses/dpo": 0.4173913598060608, "losses/sft": 2.4565041065216064, "losses/total": 0.4173913598060608, "ref_logps/chosen": -38.74884033203125, "ref_logps/rejected": -47.55836486816406, "rewards/accuracies": 0.875, "rewards/chosen": -1.9795615673065186, "rewards/margins": 1.2831730842590332, "rewards/rejected": -3.2627346515655518, "step": 2152 }, { "epoch": 2.03, "grad_norm": 23.25876808166504, "learning_rate": 1.7908359566281917e-07, "logps/chosen": -49.929603576660156, "logps/rejected": -59.478755950927734, "loss": 0.3903, "losses/dpo": 0.45677483081817627, "losses/sft": 1.8607100248336792, "losses/total": 0.45677483081817627, "ref_logps/chosen": -35.993507385253906, "ref_logps/rejected": -31.64887237548828, "rewards/accuracies": 0.875, "rewards/chosen": -1.3936094045639038, "rewards/margins": 1.3893787860870361, "rewards/rejected": -2.7829880714416504, "step": 2153 }, { "epoch": 2.03, "grad_norm": 16.28001594543457, "learning_rate": 1.7890870933892968e-07, "logps/chosen": -57.345603942871094, "logps/rejected": -71.04620361328125, "loss": 0.2178, "losses/dpo": 0.14801138639450073, "losses/sft": 1.5319006443023682, "losses/total": 0.14801138639450073, "ref_logps/chosen": -42.26116180419922, "ref_logps/rejected": -38.639469146728516, "rewards/accuracies": 1.0, "rewards/chosen": -1.508443832397461, "rewards/margins": 1.7322297096252441, "rewards/rejected": -3.240673542022705, "step": 2154 }, { "epoch": 2.03, "grad_norm": 16.706239700317383, "learning_rate": 1.7873382301504022e-07, "logps/chosen": -59.98960876464844, "logps/rejected": -82.62277221679688, "loss": 0.3025, "losses/dpo": 0.26598334312438965, "losses/sft": 2.105565071105957, "losses/total": 0.26598334312438965, "ref_logps/chosen": -41.27506637573242, "ref_logps/rejected": -49.014793395996094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8714544773101807, "rewards/margins": 1.4893431663513184, "rewards/rejected": -3.360797643661499, "step": 2155 }, { "epoch": 2.04, "grad_norm": 10.767970085144043, "learning_rate": 1.7855893669115073e-07, "logps/chosen": -52.65675354003906, "logps/rejected": -82.47964477539062, "loss": 0.146, "losses/dpo": 0.11722319573163986, "losses/sft": 1.8464301824569702, "losses/total": 0.11722319573163986, "ref_logps/chosen": -38.047515869140625, "ref_logps/rejected": -45.859710693359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4609239101409912, "rewards/margins": 2.2010693550109863, "rewards/rejected": -3.6619935035705566, "step": 2156 }, { "epoch": 2.04, "grad_norm": 12.871431350708008, "learning_rate": 1.783840503672613e-07, "logps/chosen": -64.88345336914062, "logps/rejected": -89.14042663574219, "loss": 0.169, "losses/dpo": 0.2960633635520935, "losses/sft": 1.85858154296875, "losses/total": 0.2960633635520935, "ref_logps/chosen": -47.82654571533203, "ref_logps/rejected": -50.038429260253906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7056903839111328, "rewards/margins": 2.204509973526001, "rewards/rejected": -3.910200595855713, "step": 2157 }, { "epoch": 2.04, "grad_norm": 24.01546287536621, "learning_rate": 1.782091640433718e-07, "logps/chosen": -75.90113067626953, "logps/rejected": -83.4007568359375, "loss": 0.2923, "losses/dpo": 0.31727007031440735, "losses/sft": 2.601719856262207, "losses/total": 0.31727007031440735, "ref_logps/chosen": -53.00567626953125, "ref_logps/rejected": -45.32004928588867, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2895455360412598, "rewards/margins": 1.5185257196426392, "rewards/rejected": -3.8080711364746094, "step": 2158 }, { "epoch": 2.04, "grad_norm": 24.344449996948242, "learning_rate": 1.7803427771948232e-07, "logps/chosen": -60.256370544433594, "logps/rejected": -76.394287109375, "loss": 0.3597, "losses/dpo": 0.22860893607139587, "losses/sft": 1.3986338376998901, "losses/total": 0.22860893607139587, "ref_logps/chosen": -41.57615661621094, "ref_logps/rejected": -44.063560485839844, "rewards/accuracies": 0.875, "rewards/chosen": -1.8680219650268555, "rewards/margins": 1.365051507949829, "rewards/rejected": -3.2330734729766846, "step": 2159 }, { "epoch": 2.04, "grad_norm": 15.661075592041016, "learning_rate": 1.7785939139559286e-07, "logps/chosen": -48.448280334472656, "logps/rejected": -78.65946197509766, "loss": 0.1818, "losses/dpo": 0.18213243782520294, "losses/sft": 1.4669668674468994, "losses/total": 0.18213243782520294, "ref_logps/chosen": -34.93203353881836, "ref_logps/rejected": -44.397987365722656, "rewards/accuracies": 1.0, "rewards/chosen": -1.3516244888305664, "rewards/margins": 2.074523448944092, "rewards/rejected": -3.426147937774658, "step": 2160 }, { "epoch": 2.04, "grad_norm": 20.772863388061523, "learning_rate": 1.7768450507170337e-07, "logps/chosen": -63.82762145996094, "logps/rejected": -91.49402618408203, "loss": 0.2212, "losses/dpo": 0.3095247745513916, "losses/sft": 1.7047816514968872, "losses/total": 0.3095247745513916, "ref_logps/chosen": -43.230716705322266, "ref_logps/rejected": -50.3248405456543, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0596907138824463, "rewards/margins": 2.057227611541748, "rewards/rejected": -4.116918087005615, "step": 2161 }, { "epoch": 2.04, "grad_norm": 27.207130432128906, "learning_rate": 1.7750961874781391e-07, "logps/chosen": -59.497711181640625, "logps/rejected": -70.29346466064453, "loss": 0.4447, "losses/dpo": 0.7661054134368896, "losses/sft": 2.3019745349884033, "losses/total": 0.7661054134368896, "ref_logps/chosen": -38.863189697265625, "ref_logps/rejected": -35.97392272949219, "rewards/accuracies": 0.875, "rewards/chosen": -2.0634524822235107, "rewards/margins": 1.3685017824172974, "rewards/rejected": -3.4319543838500977, "step": 2162 }, { "epoch": 2.04, "grad_norm": 17.928508758544922, "learning_rate": 1.7733473242392443e-07, "logps/chosen": -47.27583312988281, "logps/rejected": -63.323204040527344, "loss": 0.2901, "losses/dpo": 0.27205145359039307, "losses/sft": 2.077524185180664, "losses/total": 0.27205145359039307, "ref_logps/chosen": -29.81635284423828, "ref_logps/rejected": -30.439565658569336, "rewards/accuracies": 0.875, "rewards/chosen": -1.7459475994110107, "rewards/margins": 1.5424165725708008, "rewards/rejected": -3.2883639335632324, "step": 2163 }, { "epoch": 2.04, "grad_norm": 20.832271575927734, "learning_rate": 1.77159846100035e-07, "logps/chosen": -58.44321823120117, "logps/rejected": -92.69669342041016, "loss": 0.2528, "losses/dpo": 0.5324923992156982, "losses/sft": 2.9317092895507812, "losses/total": 0.5324923992156982, "ref_logps/chosen": -38.508975982666016, "ref_logps/rejected": -52.49767303466797, "rewards/accuracies": 0.9375, "rewards/chosen": -1.993424415588379, "rewards/margins": 2.026477336883545, "rewards/rejected": -4.019901752471924, "step": 2164 }, { "epoch": 2.04, "grad_norm": 18.230239868164062, "learning_rate": 1.769849597761455e-07, "logps/chosen": -56.35995864868164, "logps/rejected": -76.60353088378906, "loss": 0.2634, "losses/dpo": 0.2897110879421234, "losses/sft": 1.3430176973342896, "losses/total": 0.2897110879421234, "ref_logps/chosen": -46.06782531738281, "ref_logps/rejected": -47.96770095825195, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0292131900787354, "rewards/margins": 1.8343701362609863, "rewards/rejected": -2.8635830879211426, "step": 2165 }, { "epoch": 2.05, "grad_norm": 17.788854598999023, "learning_rate": 1.7681007345225602e-07, "logps/chosen": -61.61143493652344, "logps/rejected": -70.13768005371094, "loss": 0.2113, "losses/dpo": 0.0938451886177063, "losses/sft": 1.7500905990600586, "losses/total": 0.0938451886177063, "ref_logps/chosen": -45.088768005371094, "ref_logps/rejected": -35.49890899658203, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6522667407989502, "rewards/margins": 1.8116098642349243, "rewards/rejected": -3.463876724243164, "step": 2166 }, { "epoch": 2.05, "grad_norm": 17.123401641845703, "learning_rate": 1.7663518712836656e-07, "logps/chosen": -65.25563049316406, "logps/rejected": -88.04917907714844, "loss": 0.2419, "losses/dpo": 0.1335296332836151, "losses/sft": 2.229015827178955, "losses/total": 0.1335296332836151, "ref_logps/chosen": -47.218082427978516, "ref_logps/rejected": -50.403404235839844, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8037550449371338, "rewards/margins": 1.960822582244873, "rewards/rejected": -3.764577865600586, "step": 2167 }, { "epoch": 2.05, "grad_norm": 15.332640647888184, "learning_rate": 1.7646030080447707e-07, "logps/chosen": -56.073631286621094, "logps/rejected": -86.1395263671875, "loss": 0.1611, "losses/dpo": 0.3132442831993103, "losses/sft": 2.0467052459716797, "losses/total": 0.3132442831993103, "ref_logps/chosen": -41.04229736328125, "ref_logps/rejected": -48.011940002441406, "rewards/accuracies": 1.0, "rewards/chosen": -1.5031332969665527, "rewards/margins": 2.3096251487731934, "rewards/rejected": -3.8127589225769043, "step": 2168 }, { "epoch": 2.05, "grad_norm": 17.61914825439453, "learning_rate": 1.762854144805876e-07, "logps/chosen": -53.263023376464844, "logps/rejected": -79.85975646972656, "loss": 0.2624, "losses/dpo": 0.1332562267780304, "losses/sft": 2.0333592891693115, "losses/total": 0.1332562267780304, "ref_logps/chosen": -36.185855865478516, "ref_logps/rejected": -45.8450813293457, "rewards/accuracies": 0.875, "rewards/chosen": -1.707716941833496, "rewards/margins": 1.6937508583068848, "rewards/rejected": -3.401467800140381, "step": 2169 }, { "epoch": 2.05, "grad_norm": 14.759425163269043, "learning_rate": 1.7611052815669815e-07, "logps/chosen": -53.97357940673828, "logps/rejected": -79.67366027832031, "loss": 0.1691, "losses/dpo": 0.2415875792503357, "losses/sft": 1.868658423423767, "losses/total": 0.2415875792503357, "ref_logps/chosen": -38.85692596435547, "ref_logps/rejected": -42.45916748046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5116651058197021, "rewards/margins": 2.209784507751465, "rewards/rejected": -3.721449613571167, "step": 2170 }, { "epoch": 2.05, "grad_norm": 19.53839111328125, "learning_rate": 1.7593564183280869e-07, "logps/chosen": -53.13072967529297, "logps/rejected": -66.05118560791016, "loss": 0.2711, "losses/dpo": 0.1980120837688446, "losses/sft": 1.6273363828659058, "losses/total": 0.1980120837688446, "ref_logps/chosen": -34.85951614379883, "ref_logps/rejected": -32.383243560791016, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8271217346191406, "rewards/margins": 1.5396722555160522, "rewards/rejected": -3.3667941093444824, "step": 2171 }, { "epoch": 2.05, "grad_norm": 19.09229850769043, "learning_rate": 1.757607555089192e-07, "logps/chosen": -52.424198150634766, "logps/rejected": -73.0693359375, "loss": 0.2773, "losses/dpo": 0.15910747647285461, "losses/sft": 1.816064476966858, "losses/total": 0.15910747647285461, "ref_logps/chosen": -35.33361053466797, "ref_logps/rejected": -38.2588996887207, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7090588808059692, "rewards/margins": 1.771984577178955, "rewards/rejected": -3.481043577194214, "step": 2172 }, { "epoch": 2.05, "grad_norm": 14.572972297668457, "learning_rate": 1.755858691850297e-07, "logps/chosen": -52.50975799560547, "logps/rejected": -74.72636413574219, "loss": 0.2217, "losses/dpo": 0.1505233347415924, "losses/sft": 1.9595788717269897, "losses/total": 0.1505233347415924, "ref_logps/chosen": -35.77153015136719, "ref_logps/rejected": -40.46814727783203, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6738229990005493, "rewards/margins": 1.7519986629486084, "rewards/rejected": -3.4258217811584473, "step": 2173 }, { "epoch": 2.05, "grad_norm": 27.026321411132812, "learning_rate": 1.7541098286114025e-07, "logps/chosen": -45.954315185546875, "logps/rejected": -65.09857940673828, "loss": 0.3316, "losses/dpo": 0.600679337978363, "losses/sft": 1.7634345293045044, "losses/total": 0.600679337978363, "ref_logps/chosen": -33.33518600463867, "ref_logps/rejected": -37.236328125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2619129419326782, "rewards/margins": 1.524311900138855, "rewards/rejected": -2.786224842071533, "step": 2174 }, { "epoch": 2.05, "grad_norm": 18.01077651977539, "learning_rate": 1.7523609653725076e-07, "logps/chosen": -48.266197204589844, "logps/rejected": -92.97135162353516, "loss": 0.1879, "losses/dpo": 0.11259466409683228, "losses/sft": 1.7341797351837158, "losses/total": 0.11259466409683228, "ref_logps/chosen": -33.4748649597168, "ref_logps/rejected": -53.5361328125, "rewards/accuracies": 0.875, "rewards/chosen": -1.4791333675384521, "rewards/margins": 2.464388370513916, "rewards/rejected": -3.9435219764709473, "step": 2175 }, { "epoch": 2.05, "grad_norm": 27.163389205932617, "learning_rate": 1.7506121021336133e-07, "logps/chosen": -59.63398742675781, "logps/rejected": -82.30131530761719, "loss": 0.2348, "losses/dpo": 0.13854947686195374, "losses/sft": 1.8816542625427246, "losses/total": 0.13854947686195374, "ref_logps/chosen": -39.3703727722168, "ref_logps/rejected": -42.33588409423828, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0263609886169434, "rewards/margins": 1.970181941986084, "rewards/rejected": -3.9965429306030273, "step": 2176 }, { "epoch": 2.06, "grad_norm": 13.131577491760254, "learning_rate": 1.7488632388947184e-07, "logps/chosen": -65.90166473388672, "logps/rejected": -77.4763412475586, "loss": 0.151, "losses/dpo": 0.18051180243492126, "losses/sft": 2.112351417541504, "losses/total": 0.18051180243492126, "ref_logps/chosen": -54.28382873535156, "ref_logps/rejected": -45.44095230102539, "rewards/accuracies": 1.0, "rewards/chosen": -1.1617836952209473, "rewards/margins": 2.041755199432373, "rewards/rejected": -3.203538656234741, "step": 2177 }, { "epoch": 2.06, "grad_norm": 14.968267440795898, "learning_rate": 1.7471143756558238e-07, "logps/chosen": -70.57845306396484, "logps/rejected": -102.93708801269531, "loss": 0.1333, "losses/dpo": 0.15224751830101013, "losses/sft": 1.9976760149002075, "losses/total": 0.15224751830101013, "ref_logps/chosen": -49.856834411621094, "ref_logps/rejected": -55.65885925292969, "rewards/accuracies": 1.0, "rewards/chosen": -2.0721611976623535, "rewards/margins": 2.6556618213653564, "rewards/rejected": -4.727823257446289, "step": 2178 }, { "epoch": 2.06, "grad_norm": 27.20005989074707, "learning_rate": 1.745365512416929e-07, "logps/chosen": -49.359806060791016, "logps/rejected": -64.03297424316406, "loss": 0.3851, "losses/dpo": 0.2683928906917572, "losses/sft": 1.762873649597168, "losses/total": 0.2683928906917572, "ref_logps/chosen": -32.485660552978516, "ref_logps/rejected": -34.089698791503906, "rewards/accuracies": 0.75, "rewards/chosen": -1.687414526939392, "rewards/margins": 1.3069125413894653, "rewards/rejected": -2.9943268299102783, "step": 2179 }, { "epoch": 2.06, "grad_norm": 28.27048683166504, "learning_rate": 1.743616649178034e-07, "logps/chosen": -66.8674087524414, "logps/rejected": -74.0071792602539, "loss": 0.3318, "losses/dpo": 0.17064473032951355, "losses/sft": 1.7419644594192505, "losses/total": 0.17064473032951355, "ref_logps/chosen": -46.5545654296875, "ref_logps/rejected": -38.44732666015625, "rewards/accuracies": 0.875, "rewards/chosen": -2.0312840938568115, "rewards/margins": 1.5247018337249756, "rewards/rejected": -3.555985927581787, "step": 2180 }, { "epoch": 2.06, "grad_norm": 22.70830535888672, "learning_rate": 1.7418677859391394e-07, "logps/chosen": -53.06361770629883, "logps/rejected": -75.39220428466797, "loss": 0.2433, "losses/dpo": 0.2415391504764557, "losses/sft": 1.8355822563171387, "losses/total": 0.2415391504764557, "ref_logps/chosen": -36.476383209228516, "ref_logps/rejected": -40.51774215698242, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6587237119674683, "rewards/margins": 1.8287230730056763, "rewards/rejected": -3.4874467849731445, "step": 2181 }, { "epoch": 2.06, "grad_norm": 19.944292068481445, "learning_rate": 1.7401189227002446e-07, "logps/chosen": -58.89207077026367, "logps/rejected": -87.72540283203125, "loss": 0.2242, "losses/dpo": 0.32031840085983276, "losses/sft": 1.9566829204559326, "losses/total": 0.32031840085983276, "ref_logps/chosen": -41.21376037597656, "ref_logps/rejected": -48.45860290527344, "rewards/accuracies": 0.875, "rewards/chosen": -1.767831563949585, "rewards/margins": 2.158848762512207, "rewards/rejected": -3.926680088043213, "step": 2182 }, { "epoch": 2.06, "grad_norm": 16.069318771362305, "learning_rate": 1.7383700594613502e-07, "logps/chosen": -47.496742248535156, "logps/rejected": -70.46304321289062, "loss": 0.2476, "losses/dpo": 0.2841242253780365, "losses/sft": 2.312047243118286, "losses/total": 0.2841242253780365, "ref_logps/chosen": -32.34196472167969, "ref_logps/rejected": -36.13676452636719, "rewards/accuracies": 0.875, "rewards/chosen": -1.5154778957366943, "rewards/margins": 1.9171501398086548, "rewards/rejected": -3.4326279163360596, "step": 2183 }, { "epoch": 2.06, "grad_norm": 22.96808433532715, "learning_rate": 1.7366211962224554e-07, "logps/chosen": -59.409332275390625, "logps/rejected": -86.48368835449219, "loss": 0.3256, "losses/dpo": 0.2149156630039215, "losses/sft": 1.9455467462539673, "losses/total": 0.2149156630039215, "ref_logps/chosen": -39.371360778808594, "ref_logps/rejected": -43.310508728027344, "rewards/accuracies": 0.875, "rewards/chosen": -2.0037968158721924, "rewards/margins": 2.313520669937134, "rewards/rejected": -4.317317962646484, "step": 2184 }, { "epoch": 2.06, "grad_norm": 20.13286590576172, "learning_rate": 1.7348723329835607e-07, "logps/chosen": -54.029640197753906, "logps/rejected": -86.71771240234375, "loss": 0.1987, "losses/dpo": 0.06851854175329208, "losses/sft": 1.7582980394363403, "losses/total": 0.06851854175329208, "ref_logps/chosen": -33.388877868652344, "ref_logps/rejected": -44.847572326660156, "rewards/accuracies": 1.0, "rewards/chosen": -2.0640759468078613, "rewards/margins": 2.1229381561279297, "rewards/rejected": -4.187013626098633, "step": 2185 }, { "epoch": 2.06, "grad_norm": 12.484709739685059, "learning_rate": 1.733123469744666e-07, "logps/chosen": -69.00020599365234, "logps/rejected": -92.46781921386719, "loss": 0.1367, "losses/dpo": 0.04129623621702194, "losses/sft": 1.7258503437042236, "losses/total": 0.04129623621702194, "ref_logps/chosen": -46.61714172363281, "ref_logps/rejected": -46.315940856933594, "rewards/accuracies": 1.0, "rewards/chosen": -2.2383062839508057, "rewards/margins": 2.3768811225891113, "rewards/rejected": -4.615187644958496, "step": 2186 }, { "epoch": 2.07, "grad_norm": 21.23772430419922, "learning_rate": 1.731374606505771e-07, "logps/chosen": -53.44989776611328, "logps/rejected": -92.73365783691406, "loss": 0.2705, "losses/dpo": 0.08861647546291351, "losses/sft": 2.0424864292144775, "losses/total": 0.08861647546291351, "ref_logps/chosen": -36.1646728515625, "ref_logps/rejected": -48.25510025024414, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7285223007202148, "rewards/margins": 2.7193336486816406, "rewards/rejected": -4.4478559494018555, "step": 2187 }, { "epoch": 2.07, "grad_norm": 33.41069793701172, "learning_rate": 1.7296257432668764e-07, "logps/chosen": -60.85960006713867, "logps/rejected": -75.24000549316406, "loss": 0.3661, "losses/dpo": 0.09842763841152191, "losses/sft": 1.9827691316604614, "losses/total": 0.09842763841152191, "ref_logps/chosen": -38.05207824707031, "ref_logps/rejected": -38.01402282714844, "rewards/accuracies": 0.875, "rewards/chosen": -2.280752182006836, "rewards/margins": 1.4418456554412842, "rewards/rejected": -3.722598075866699, "step": 2188 }, { "epoch": 2.07, "grad_norm": 13.806150436401367, "learning_rate": 1.7278768800279818e-07, "logps/chosen": -55.86123275756836, "logps/rejected": -85.80702209472656, "loss": 0.1695, "losses/dpo": 0.24726992845535278, "losses/sft": 1.9747188091278076, "losses/total": 0.24726992845535278, "ref_logps/chosen": -35.86094665527344, "ref_logps/rejected": -44.97712326049805, "rewards/accuracies": 1.0, "rewards/chosen": -2.000028133392334, "rewards/margins": 2.0829615592956543, "rewards/rejected": -4.0829901695251465, "step": 2189 }, { "epoch": 2.07, "grad_norm": 14.162352561950684, "learning_rate": 1.7261280167890872e-07, "logps/chosen": -58.23813247680664, "logps/rejected": -93.10269165039062, "loss": 0.1762, "losses/dpo": 0.07838228344917297, "losses/sft": 2.18127703666687, "losses/total": 0.07838228344917297, "ref_logps/chosen": -40.983055114746094, "ref_logps/rejected": -51.26152801513672, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7255079746246338, "rewards/margins": 2.4586076736450195, "rewards/rejected": -4.184115886688232, "step": 2190 }, { "epoch": 2.07, "grad_norm": 26.535694122314453, "learning_rate": 1.7243791535501923e-07, "logps/chosen": -55.426780700683594, "logps/rejected": -79.62908935546875, "loss": 0.3449, "losses/dpo": 0.19528482854366302, "losses/sft": 1.7993446588516235, "losses/total": 0.19528482854366302, "ref_logps/chosen": -37.25511169433594, "ref_logps/rejected": -43.912296295166016, "rewards/accuracies": 0.875, "rewards/chosen": -1.8171671628952026, "rewards/margins": 1.7545119524002075, "rewards/rejected": -3.57167911529541, "step": 2191 }, { "epoch": 2.07, "grad_norm": 24.744495391845703, "learning_rate": 1.7226302903112977e-07, "logps/chosen": -53.65546417236328, "logps/rejected": -85.93426513671875, "loss": 0.3072, "losses/dpo": 0.4467526972293854, "losses/sft": 2.5786640644073486, "losses/total": 0.4467526972293854, "ref_logps/chosen": -33.4858283996582, "ref_logps/rejected": -45.773956298828125, "rewards/accuracies": 0.875, "rewards/chosen": -2.016963481903076, "rewards/margins": 1.999066948890686, "rewards/rejected": -4.016030311584473, "step": 2192 }, { "epoch": 2.07, "grad_norm": 23.44725227355957, "learning_rate": 1.7208814270724028e-07, "logps/chosen": -53.96575927734375, "logps/rejected": -87.75813293457031, "loss": 0.2393, "losses/dpo": 0.07793407142162323, "losses/sft": 1.6732468605041504, "losses/total": 0.07793407142162323, "ref_logps/chosen": -37.87188720703125, "ref_logps/rejected": -51.79331970214844, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6093873977661133, "rewards/margins": 1.9870942831039429, "rewards/rejected": -3.5964813232421875, "step": 2193 }, { "epoch": 2.07, "grad_norm": 24.691877365112305, "learning_rate": 1.719132563833508e-07, "logps/chosen": -55.128334045410156, "logps/rejected": -85.45429992675781, "loss": 0.3788, "losses/dpo": 0.9169511795043945, "losses/sft": 2.684892177581787, "losses/total": 0.9169511795043945, "ref_logps/chosen": -33.98554229736328, "ref_logps/rejected": -42.699195861816406, "rewards/accuracies": 0.9375, "rewards/chosen": -2.114279270172119, "rewards/margins": 2.1612305641174316, "rewards/rejected": -4.275509834289551, "step": 2194 }, { "epoch": 2.07, "grad_norm": 18.590097427368164, "learning_rate": 1.7173837005946133e-07, "logps/chosen": -42.8539924621582, "logps/rejected": -72.0229721069336, "loss": 0.2931, "losses/dpo": 0.15254491567611694, "losses/sft": 2.111268997192383, "losses/total": 0.15254491567611694, "ref_logps/chosen": -27.071752548217773, "ref_logps/rejected": -38.403770446777344, "rewards/accuracies": 0.875, "rewards/chosen": -1.5782239437103271, "rewards/margins": 1.783695936203003, "rewards/rejected": -3.36191987991333, "step": 2195 }, { "epoch": 2.07, "grad_norm": 21.303430557250977, "learning_rate": 1.7156348373557187e-07, "logps/chosen": -56.064388275146484, "logps/rejected": -73.31047058105469, "loss": 0.2569, "losses/dpo": 0.05567019432783127, "losses/sft": 1.4970711469650269, "losses/total": 0.05567019432783127, "ref_logps/chosen": -41.055992126464844, "ref_logps/rejected": -38.608367919921875, "rewards/accuracies": 0.875, "rewards/chosen": -1.5008397102355957, "rewards/margins": 1.9693713188171387, "rewards/rejected": -3.4702110290527344, "step": 2196 }, { "epoch": 2.07, "grad_norm": 24.07655143737793, "learning_rate": 1.713885974116824e-07, "logps/chosen": -52.34514617919922, "logps/rejected": -80.76176452636719, "loss": 0.2951, "losses/dpo": 0.19542747735977173, "losses/sft": 1.5525329113006592, "losses/total": 0.19542747735977173, "ref_logps/chosen": -34.239925384521484, "ref_logps/rejected": -45.56114196777344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8105223178863525, "rewards/margins": 1.7095403671264648, "rewards/rejected": -3.5200626850128174, "step": 2197 }, { "epoch": 2.08, "grad_norm": 22.42549705505371, "learning_rate": 1.7121371108779292e-07, "logps/chosen": -54.095611572265625, "logps/rejected": -80.62397766113281, "loss": 0.234, "losses/dpo": 0.21732285618782043, "losses/sft": 2.2127344608306885, "losses/total": 0.21732285618782043, "ref_logps/chosen": -37.54641342163086, "ref_logps/rejected": -45.050636291503906, "rewards/accuracies": 0.875, "rewards/chosen": -1.6549193859100342, "rewards/margins": 1.9024146795272827, "rewards/rejected": -3.5573339462280273, "step": 2198 }, { "epoch": 2.08, "grad_norm": 20.114450454711914, "learning_rate": 1.7103882476390346e-07, "logps/chosen": -53.59613037109375, "logps/rejected": -94.48097229003906, "loss": 0.1915, "losses/dpo": 0.1815064698457718, "losses/sft": 2.0497143268585205, "losses/total": 0.1815064698457718, "ref_logps/chosen": -37.92587661743164, "ref_logps/rejected": -53.89605712890625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.567025065422058, "rewards/margins": 2.4914662837982178, "rewards/rejected": -4.058491230010986, "step": 2199 }, { "epoch": 2.08, "grad_norm": 13.921630859375, "learning_rate": 1.7086393844001398e-07, "logps/chosen": -65.02278900146484, "logps/rejected": -94.15165710449219, "loss": 0.1687, "losses/dpo": 0.11342591047286987, "losses/sft": 1.8795005083084106, "losses/total": 0.11342591047286987, "ref_logps/chosen": -47.23157501220703, "ref_logps/rejected": -52.740516662597656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.779120922088623, "rewards/margins": 2.361992835998535, "rewards/rejected": -4.141113758087158, "step": 2200 }, { "epoch": 2.08, "grad_norm": 21.20457649230957, "learning_rate": 1.706890521161245e-07, "logps/chosen": -59.54458236694336, "logps/rejected": -89.32866668701172, "loss": 0.2722, "losses/dpo": 0.10039615631103516, "losses/sft": 2.3329107761383057, "losses/total": 0.10039615631103516, "ref_logps/chosen": -38.11788558959961, "ref_logps/rejected": -48.12678909301758, "rewards/accuracies": 0.875, "rewards/chosen": -2.142669916152954, "rewards/margins": 1.97751784324646, "rewards/rejected": -4.120187759399414, "step": 2201 }, { "epoch": 2.08, "grad_norm": 13.924773216247559, "learning_rate": 1.7051416579223505e-07, "logps/chosen": -53.46916198730469, "logps/rejected": -84.70620727539062, "loss": 0.1943, "losses/dpo": 0.16763654351234436, "losses/sft": 2.394869327545166, "losses/total": 0.16763654351234436, "ref_logps/chosen": -31.43370819091797, "ref_logps/rejected": -40.78105926513672, "rewards/accuracies": 1.0, "rewards/chosen": -2.203545093536377, "rewards/margins": 2.1889700889587402, "rewards/rejected": -4.392515182495117, "step": 2202 }, { "epoch": 2.08, "grad_norm": 23.022014617919922, "learning_rate": 1.7033927946834557e-07, "logps/chosen": -57.59918212890625, "logps/rejected": -76.6629409790039, "loss": 0.2921, "losses/dpo": 0.47700318694114685, "losses/sft": 1.7552341222763062, "losses/total": 0.47700318694114685, "ref_logps/chosen": -38.358924865722656, "ref_logps/rejected": -40.03480911254883, "rewards/accuracies": 0.875, "rewards/chosen": -1.9240257740020752, "rewards/margins": 1.7387876510620117, "rewards/rejected": -3.662813425064087, "step": 2203 }, { "epoch": 2.08, "grad_norm": 19.173295974731445, "learning_rate": 1.701643931444561e-07, "logps/chosen": -57.08106994628906, "logps/rejected": -85.2384033203125, "loss": 0.2732, "losses/dpo": 0.09806828200817108, "losses/sft": 2.107323169708252, "losses/total": 0.09806828200817108, "ref_logps/chosen": -35.34751892089844, "ref_logps/rejected": -44.77887725830078, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1733551025390625, "rewards/margins": 1.8725978136062622, "rewards/rejected": -4.045952796936035, "step": 2204 }, { "epoch": 2.08, "grad_norm": 28.111894607543945, "learning_rate": 1.6998950682056662e-07, "logps/chosen": -55.997066497802734, "logps/rejected": -73.00859069824219, "loss": 0.3705, "losses/dpo": 0.20976343750953674, "losses/sft": 2.3299331665039062, "losses/total": 0.20976343750953674, "ref_logps/chosen": -36.010276794433594, "ref_logps/rejected": -41.71709442138672, "rewards/accuracies": 0.875, "rewards/chosen": -1.9986789226531982, "rewards/margins": 1.1304705142974854, "rewards/rejected": -3.1291494369506836, "step": 2205 }, { "epoch": 2.08, "grad_norm": 27.258811950683594, "learning_rate": 1.6981462049667716e-07, "logps/chosen": -63.16986083984375, "logps/rejected": -98.24568939208984, "loss": 0.222, "losses/dpo": 0.19202932715415955, "losses/sft": 2.4920687675476074, "losses/total": 0.19202932715415955, "ref_logps/chosen": -39.95744323730469, "ref_logps/rejected": -51.04297637939453, "rewards/accuracies": 0.9375, "rewards/chosen": -2.321241855621338, "rewards/margins": 2.39902925491333, "rewards/rejected": -4.720271110534668, "step": 2206 }, { "epoch": 2.08, "grad_norm": 19.09151268005371, "learning_rate": 1.6963973417278767e-07, "logps/chosen": -62.666236877441406, "logps/rejected": -93.16124725341797, "loss": 0.186, "losses/dpo": 0.10941722989082336, "losses/sft": 1.7997444868087769, "losses/total": 0.10941722989082336, "ref_logps/chosen": -40.200965881347656, "ref_logps/rejected": -48.780181884765625, "rewards/accuracies": 1.0, "rewards/chosen": -2.2465267181396484, "rewards/margins": 2.191579818725586, "rewards/rejected": -4.438106536865234, "step": 2207 }, { "epoch": 2.08, "grad_norm": 16.559520721435547, "learning_rate": 1.6946484784889818e-07, "logps/chosen": -56.985877990722656, "logps/rejected": -87.59817504882812, "loss": 0.1791, "losses/dpo": 0.21834111213684082, "losses/sft": 1.5174407958984375, "losses/total": 0.21834111213684082, "ref_logps/chosen": -35.917991638183594, "ref_logps/rejected": -43.31269073486328, "rewards/accuracies": 1.0, "rewards/chosen": -2.106788396835327, "rewards/margins": 2.321760654449463, "rewards/rejected": -4.428549289703369, "step": 2208 }, { "epoch": 2.09, "grad_norm": 17.215805053710938, "learning_rate": 1.6928996152500875e-07, "logps/chosen": -45.57110595703125, "logps/rejected": -71.7991943359375, "loss": 0.2052, "losses/dpo": 0.3538817763328552, "losses/sft": 1.5875787734985352, "losses/total": 0.3538817763328552, "ref_logps/chosen": -30.913644790649414, "ref_logps/rejected": -34.5597038269043, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4657464027404785, "rewards/margins": 2.258202314376831, "rewards/rejected": -3.7239487171173096, "step": 2209 }, { "epoch": 2.09, "grad_norm": 8.342625617980957, "learning_rate": 1.6911507520111926e-07, "logps/chosen": -61.34052658081055, "logps/rejected": -111.30182647705078, "loss": 0.083, "losses/dpo": 0.08819499611854553, "losses/sft": 1.736279010772705, "losses/total": 0.08819499611854553, "ref_logps/chosen": -41.58523178100586, "ref_logps/rejected": -60.833126068115234, "rewards/accuracies": 1.0, "rewards/chosen": -1.975529670715332, "rewards/margins": 3.071340560913086, "rewards/rejected": -5.04686975479126, "step": 2210 }, { "epoch": 2.09, "grad_norm": 14.353584289550781, "learning_rate": 1.689401888772298e-07, "logps/chosen": -57.32757568359375, "logps/rejected": -83.46308898925781, "loss": 0.1248, "losses/dpo": 0.10038065910339355, "losses/sft": 1.7414997816085815, "losses/total": 0.10038065910339355, "ref_logps/chosen": -38.945091247558594, "ref_logps/rejected": -41.35407257080078, "rewards/accuracies": 1.0, "rewards/chosen": -1.8382482528686523, "rewards/margins": 2.372653007507324, "rewards/rejected": -4.210901260375977, "step": 2211 }, { "epoch": 2.09, "grad_norm": 15.132885932922363, "learning_rate": 1.6876530255334031e-07, "logps/chosen": -48.53065490722656, "logps/rejected": -79.5280990600586, "loss": 0.1678, "losses/dpo": 0.06708650290966034, "losses/sft": 2.090761661529541, "losses/total": 0.06708650290966034, "ref_logps/chosen": -32.15243148803711, "ref_logps/rejected": -40.45890808105469, "rewards/accuracies": 1.0, "rewards/chosen": -1.6378223896026611, "rewards/margins": 2.269096612930298, "rewards/rejected": -3.906919002532959, "step": 2212 }, { "epoch": 2.09, "grad_norm": 24.768259048461914, "learning_rate": 1.6859041622945085e-07, "logps/chosen": -53.58648681640625, "logps/rejected": -76.79092407226562, "loss": 0.2226, "losses/dpo": 0.31338661909103394, "losses/sft": 2.219318389892578, "losses/total": 0.31338661909103394, "ref_logps/chosen": -32.16387176513672, "ref_logps/rejected": -37.08013153076172, "rewards/accuracies": 1.0, "rewards/chosen": -2.1422619819641113, "rewards/margins": 1.8288161754608154, "rewards/rejected": -3.9710781574249268, "step": 2213 }, { "epoch": 2.09, "grad_norm": 18.416728973388672, "learning_rate": 1.6841552990556137e-07, "logps/chosen": -55.35419845581055, "logps/rejected": -78.583984375, "loss": 0.2559, "losses/dpo": 0.12346776574850082, "losses/sft": 2.094236373901367, "losses/total": 0.12346776574850082, "ref_logps/chosen": -37.566490173339844, "ref_logps/rejected": -40.12315368652344, "rewards/accuracies": 0.875, "rewards/chosen": -1.7787704467773438, "rewards/margins": 2.067312717437744, "rewards/rejected": -3.846083164215088, "step": 2214 }, { "epoch": 2.09, "grad_norm": 20.469013214111328, "learning_rate": 1.682406435816719e-07, "logps/chosen": -72.93456268310547, "logps/rejected": -98.66339874267578, "loss": 0.2145, "losses/dpo": 0.22982379794120789, "losses/sft": 2.3891358375549316, "losses/total": 0.22982379794120789, "ref_logps/chosen": -45.680511474609375, "ref_logps/rejected": -49.3587646484375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.72540545463562, "rewards/margins": 2.2050580978393555, "rewards/rejected": -4.9304633140563965, "step": 2215 }, { "epoch": 2.09, "grad_norm": 21.647539138793945, "learning_rate": 1.6806575725778244e-07, "logps/chosen": -49.575714111328125, "logps/rejected": -80.34425354003906, "loss": 0.239, "losses/dpo": 0.13555070757865906, "losses/sft": 2.7519843578338623, "losses/total": 0.13555070757865906, "ref_logps/chosen": -31.77227783203125, "ref_logps/rejected": -41.82489776611328, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7803436517715454, "rewards/margins": 2.071591854095459, "rewards/rejected": -3.851935863494873, "step": 2216 }, { "epoch": 2.09, "grad_norm": 18.783519744873047, "learning_rate": 1.6789087093389296e-07, "logps/chosen": -42.88027572631836, "logps/rejected": -84.32542419433594, "loss": 0.2282, "losses/dpo": 0.16614983975887299, "losses/sft": 1.6410008668899536, "losses/total": 0.16614983975887299, "ref_logps/chosen": -26.285659790039062, "ref_logps/rejected": -43.653663635253906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.659461498260498, "rewards/margins": 2.4077138900756836, "rewards/rejected": -4.06717586517334, "step": 2217 }, { "epoch": 2.09, "grad_norm": 27.009485244750977, "learning_rate": 1.677159846100035e-07, "logps/chosen": -59.341453552246094, "logps/rejected": -77.44983673095703, "loss": 0.3232, "losses/dpo": 0.41764286160469055, "losses/sft": 2.077141761779785, "losses/total": 0.41764286160469055, "ref_logps/chosen": -37.908653259277344, "ref_logps/rejected": -39.44845199584961, "rewards/accuracies": 0.8125, "rewards/chosen": -2.143280029296875, "rewards/margins": 1.6568586826324463, "rewards/rejected": -3.8001389503479004, "step": 2218 }, { "epoch": 2.1, "grad_norm": 26.723899841308594, "learning_rate": 1.67541098286114e-07, "logps/chosen": -69.6722640991211, "logps/rejected": -91.57075500488281, "loss": 0.3215, "losses/dpo": 0.11519400030374527, "losses/sft": 2.6588547229766846, "losses/total": 0.11519400030374527, "ref_logps/chosen": -41.187618255615234, "ref_logps/rejected": -47.43374252319336, "rewards/accuracies": 0.875, "rewards/chosen": -2.8484644889831543, "rewards/margins": 1.5652365684509277, "rewards/rejected": -4.413701057434082, "step": 2219 }, { "epoch": 2.1, "grad_norm": 18.43507194519043, "learning_rate": 1.6736621196222455e-07, "logps/chosen": -63.70801544189453, "logps/rejected": -100.65912628173828, "loss": 0.1566, "losses/dpo": 0.28203755617141724, "losses/sft": 1.9518189430236816, "losses/total": 0.28203755617141724, "ref_logps/chosen": -43.11907196044922, "ref_logps/rejected": -53.56328582763672, "rewards/accuracies": 0.9375, "rewards/chosen": -2.058894395828247, "rewards/margins": 2.6506900787353516, "rewards/rejected": -4.709585189819336, "step": 2220 }, { "epoch": 2.1, "grad_norm": 27.70952796936035, "learning_rate": 1.6719132563833509e-07, "logps/chosen": -55.762664794921875, "logps/rejected": -93.76734924316406, "loss": 0.3236, "losses/dpo": 0.7263743281364441, "losses/sft": 3.2946126461029053, "losses/total": 0.7263743281364441, "ref_logps/chosen": -32.23370361328125, "ref_logps/rejected": -50.9365348815918, "rewards/accuracies": 0.875, "rewards/chosen": -2.352896213531494, "rewards/margins": 1.9301854372024536, "rewards/rejected": -4.283081531524658, "step": 2221 }, { "epoch": 2.1, "grad_norm": 19.451852798461914, "learning_rate": 1.670164393144456e-07, "logps/chosen": -53.164573669433594, "logps/rejected": -83.31100463867188, "loss": 0.2098, "losses/dpo": 0.13225436210632324, "losses/sft": 1.889532208442688, "losses/total": 0.13225436210632324, "ref_logps/chosen": -35.533294677734375, "ref_logps/rejected": -42.04703140258789, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7631275653839111, "rewards/margins": 2.3632702827453613, "rewards/rejected": -4.126398086547852, "step": 2222 }, { "epoch": 2.1, "grad_norm": 25.091676712036133, "learning_rate": 1.6684155299055614e-07, "logps/chosen": -53.45108413696289, "logps/rejected": -78.944580078125, "loss": 0.317, "losses/dpo": 0.30559539794921875, "losses/sft": 2.2687339782714844, "losses/total": 0.30559539794921875, "ref_logps/chosen": -34.379608154296875, "ref_logps/rejected": -40.762237548828125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9071475267410278, "rewards/margins": 1.9110870361328125, "rewards/rejected": -3.81823468208313, "step": 2223 }, { "epoch": 2.1, "grad_norm": 15.996298789978027, "learning_rate": 1.6666666666666665e-07, "logps/chosen": -53.738426208496094, "logps/rejected": -82.813720703125, "loss": 0.1778, "losses/dpo": 0.152542382478714, "losses/sft": 1.60818612575531, "losses/total": 0.152542382478714, "ref_logps/chosen": -34.05722427368164, "ref_logps/rejected": -41.73652648925781, "rewards/accuracies": 1.0, "rewards/chosen": -1.9681199789047241, "rewards/margins": 2.1395998001098633, "rewards/rejected": -4.107719421386719, "step": 2224 }, { "epoch": 2.1, "grad_norm": 36.061119079589844, "learning_rate": 1.664917803427772e-07, "logps/chosen": -61.44285202026367, "logps/rejected": -101.45341491699219, "loss": 0.3586, "losses/dpo": 0.6123057007789612, "losses/sft": 2.1946911811828613, "losses/total": 0.6123057007789612, "ref_logps/chosen": -39.097747802734375, "ref_logps/rejected": -60.448699951171875, "rewards/accuracies": 0.8125, "rewards/chosen": -2.234510660171509, "rewards/margins": 1.8659616708755493, "rewards/rejected": -4.100472450256348, "step": 2225 }, { "epoch": 2.1, "grad_norm": 33.13676071166992, "learning_rate": 1.663168940188877e-07, "logps/chosen": -65.36542510986328, "logps/rejected": -83.72738647460938, "loss": 0.2638, "losses/dpo": 0.5983446836471558, "losses/sft": 1.8281232118606567, "losses/total": 0.5983446836471558, "ref_logps/chosen": -45.387176513671875, "ref_logps/rejected": -42.81666564941406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9978246688842773, "rewards/margins": 2.093247413635254, "rewards/rejected": -4.091072082519531, "step": 2226 }, { "epoch": 2.1, "grad_norm": 14.86474609375, "learning_rate": 1.6614200769499824e-07, "logps/chosen": -67.95303344726562, "logps/rejected": -97.45545959472656, "loss": 0.1262, "losses/dpo": 0.13807989656925201, "losses/sft": 2.1071507930755615, "losses/total": 0.13807989656925201, "ref_logps/chosen": -43.59563446044922, "ref_logps/rejected": -46.4262580871582, "rewards/accuracies": 1.0, "rewards/chosen": -2.435739517211914, "rewards/margins": 2.667180299758911, "rewards/rejected": -5.102920055389404, "step": 2227 }, { "epoch": 2.1, "grad_norm": 23.756662368774414, "learning_rate": 1.6596712137110878e-07, "logps/chosen": -56.06706237792969, "logps/rejected": -78.91806030273438, "loss": 0.2815, "losses/dpo": 0.2714054584503174, "losses/sft": 1.6447714567184448, "losses/total": 0.2714054584503174, "ref_logps/chosen": -38.203189849853516, "ref_logps/rejected": -41.72379684448242, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7863867282867432, "rewards/margins": 1.9330400228500366, "rewards/rejected": -3.7194266319274902, "step": 2228 }, { "epoch": 2.1, "grad_norm": 40.492034912109375, "learning_rate": 1.657922350472193e-07, "logps/chosen": -68.50590515136719, "logps/rejected": -88.46195220947266, "loss": 0.541, "losses/dpo": 0.8684618473052979, "losses/sft": 3.044482946395874, "losses/total": 0.8684618473052979, "ref_logps/chosen": -44.93730926513672, "ref_logps/rejected": -46.64177322387695, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3568596839904785, "rewards/margins": 1.8251579999923706, "rewards/rejected": -4.182017803192139, "step": 2229 }, { "epoch": 2.11, "grad_norm": 30.27342414855957, "learning_rate": 1.6561734872332983e-07, "logps/chosen": -54.731815338134766, "logps/rejected": -78.69754028320312, "loss": 0.2927, "losses/dpo": 0.08893805742263794, "losses/sft": 2.030031442642212, "losses/total": 0.08893805742263794, "ref_logps/chosen": -36.54650115966797, "ref_logps/rejected": -40.523658752441406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8185312747955322, "rewards/margins": 1.9988573789596558, "rewards/rejected": -3.8173885345458984, "step": 2230 }, { "epoch": 2.11, "grad_norm": 18.653667449951172, "learning_rate": 1.6544246239944034e-07, "logps/chosen": -57.56175994873047, "logps/rejected": -84.8160400390625, "loss": 0.2574, "losses/dpo": 0.560514509677887, "losses/sft": 1.7918487787246704, "losses/total": 0.560514509677887, "ref_logps/chosen": -39.770137786865234, "ref_logps/rejected": -47.24848937988281, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7791621685028076, "rewards/margins": 1.977592945098877, "rewards/rejected": -3.7567548751831055, "step": 2231 }, { "epoch": 2.11, "grad_norm": 14.561543464660645, "learning_rate": 1.6526757607555088e-07, "logps/chosen": -42.29571533203125, "logps/rejected": -69.97169494628906, "loss": 0.1708, "losses/dpo": 0.30205655097961426, "losses/sft": 2.2472383975982666, "losses/total": 0.30205655097961426, "ref_logps/chosen": -29.076812744140625, "ref_logps/rejected": -33.68364715576172, "rewards/accuracies": 0.9375, "rewards/chosen": -1.321890115737915, "rewards/margins": 2.306915044784546, "rewards/rejected": -3.628805160522461, "step": 2232 }, { "epoch": 2.11, "grad_norm": 20.32581901550293, "learning_rate": 1.650926897516614e-07, "logps/chosen": -49.80039978027344, "logps/rejected": -70.1690902709961, "loss": 0.2397, "losses/dpo": 0.2494000494480133, "losses/sft": 1.9949127435684204, "losses/total": 0.2494000494480133, "ref_logps/chosen": -34.77843475341797, "ref_logps/rejected": -38.26068878173828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5021963119506836, "rewards/margins": 1.6886440515518188, "rewards/rejected": -3.190840482711792, "step": 2233 }, { "epoch": 2.11, "grad_norm": 18.533443450927734, "learning_rate": 1.6491780342777196e-07, "logps/chosen": -75.72285461425781, "logps/rejected": -93.07483673095703, "loss": 0.1894, "losses/dpo": 0.14606128633022308, "losses/sft": 2.5543980598449707, "losses/total": 0.14606128633022308, "ref_logps/chosen": -51.62957000732422, "ref_logps/rejected": -47.83635711669922, "rewards/accuracies": 0.9375, "rewards/chosen": -2.409327983856201, "rewards/margins": 2.11452054977417, "rewards/rejected": -4.523848533630371, "step": 2234 }, { "epoch": 2.11, "grad_norm": 17.314498901367188, "learning_rate": 1.6474291710388247e-07, "logps/chosen": -54.58754348754883, "logps/rejected": -100.722900390625, "loss": 0.1374, "losses/dpo": 0.12343157827854156, "losses/sft": 2.27484130859375, "losses/total": 0.12343157827854156, "ref_logps/chosen": -34.945777893066406, "ref_logps/rejected": -53.31879425048828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9641767740249634, "rewards/margins": 2.7762346267700195, "rewards/rejected": -4.740411758422852, "step": 2235 }, { "epoch": 2.11, "grad_norm": 23.696046829223633, "learning_rate": 1.64568030779993e-07, "logps/chosen": -49.654815673828125, "logps/rejected": -80.75139617919922, "loss": 0.2483, "losses/dpo": 0.24056974053382874, "losses/sft": 1.8114200830459595, "losses/total": 0.24056974053382874, "ref_logps/chosen": -30.18017578125, "ref_logps/rejected": -42.11722946166992, "rewards/accuracies": 0.9375, "rewards/chosen": -1.947464108467102, "rewards/margins": 1.9159528017044067, "rewards/rejected": -3.8634166717529297, "step": 2236 }, { "epoch": 2.11, "grad_norm": 18.896560668945312, "learning_rate": 1.6439314445610353e-07, "logps/chosen": -55.08682632446289, "logps/rejected": -81.15058898925781, "loss": 0.2281, "losses/dpo": 0.13519428670406342, "losses/sft": 1.9493145942687988, "losses/total": 0.13519428670406342, "ref_logps/chosen": -36.99998092651367, "ref_logps/rejected": -43.503700256347656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8086843490600586, "rewards/margins": 1.956005334854126, "rewards/rejected": -3.7646899223327637, "step": 2237 }, { "epoch": 2.11, "grad_norm": 21.233287811279297, "learning_rate": 1.6421825813221404e-07, "logps/chosen": -67.05801391601562, "logps/rejected": -84.22810363769531, "loss": 0.2588, "losses/dpo": 0.3462636172771454, "losses/sft": 2.207815170288086, "losses/total": 0.3462636172771454, "ref_logps/chosen": -47.680606842041016, "ref_logps/rejected": -43.420562744140625, "rewards/accuracies": 0.875, "rewards/chosen": -1.9377408027648926, "rewards/margins": 2.143012523651123, "rewards/rejected": -4.080753326416016, "step": 2238 }, { "epoch": 2.11, "grad_norm": 26.304241180419922, "learning_rate": 1.6404337180832458e-07, "logps/chosen": -50.78749084472656, "logps/rejected": -84.84367370605469, "loss": 0.2854, "losses/dpo": 0.4090076684951782, "losses/sft": 1.5576115846633911, "losses/total": 0.4090076684951782, "ref_logps/chosen": -33.83174514770508, "ref_logps/rejected": -45.708221435546875, "rewards/accuracies": 0.875, "rewards/chosen": -1.6955742835998535, "rewards/margins": 2.2179718017578125, "rewards/rejected": -3.913546085357666, "step": 2239 }, { "epoch": 2.12, "grad_norm": 19.78229522705078, "learning_rate": 1.638684854844351e-07, "logps/chosen": -60.204002380371094, "logps/rejected": -91.46977233886719, "loss": 0.2501, "losses/dpo": 0.09898929297924042, "losses/sft": 2.0589511394500732, "losses/total": 0.09898929297924042, "ref_logps/chosen": -39.20741653442383, "ref_logps/rejected": -47.30803680419922, "rewards/accuracies": 0.875, "rewards/chosen": -2.099658250808716, "rewards/margins": 2.3165149688720703, "rewards/rejected": -4.416172981262207, "step": 2240 }, { "epoch": 2.12, "grad_norm": 21.187986373901367, "learning_rate": 1.6369359916054566e-07, "logps/chosen": -51.58329772949219, "logps/rejected": -97.73701477050781, "loss": 0.266, "losses/dpo": 0.1460612267255783, "losses/sft": 2.5049760341644287, "losses/total": 0.1460612267255783, "ref_logps/chosen": -35.03326416015625, "ref_logps/rejected": -55.8988037109375, "rewards/accuracies": 0.875, "rewards/chosen": -1.655003309249878, "rewards/margins": 2.528817892074585, "rewards/rejected": -4.183821201324463, "step": 2241 }, { "epoch": 2.12, "grad_norm": 14.329180717468262, "learning_rate": 1.6351871283665617e-07, "logps/chosen": -44.3607177734375, "logps/rejected": -79.14237213134766, "loss": 0.1686, "losses/dpo": 0.26841437816619873, "losses/sft": 1.3265010118484497, "losses/total": 0.26841437816619873, "ref_logps/chosen": -27.848522186279297, "ref_logps/rejected": -39.2000732421875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6512196063995361, "rewards/margins": 2.343010425567627, "rewards/rejected": -3.994230270385742, "step": 2242 }, { "epoch": 2.12, "grad_norm": 22.371397018432617, "learning_rate": 1.633438265127667e-07, "logps/chosen": -45.22007751464844, "logps/rejected": -79.30278778076172, "loss": 0.2533, "losses/dpo": 0.1291871964931488, "losses/sft": 1.2261956930160522, "losses/total": 0.1291871964931488, "ref_logps/chosen": -31.777246475219727, "ref_logps/rejected": -44.50048065185547, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3442833423614502, "rewards/margins": 2.1359472274780273, "rewards/rejected": -3.4802303314208984, "step": 2243 }, { "epoch": 2.12, "grad_norm": 23.07261085510254, "learning_rate": 1.6316894018887722e-07, "logps/chosen": -51.61261749267578, "logps/rejected": -77.51898193359375, "loss": 0.2759, "losses/dpo": 0.06007974594831467, "losses/sft": 1.7587203979492188, "losses/total": 0.06007974594831467, "ref_logps/chosen": -35.39130401611328, "ref_logps/rejected": -44.41795349121094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6221314668655396, "rewards/margins": 1.6879708766937256, "rewards/rejected": -3.3101024627685547, "step": 2244 }, { "epoch": 2.12, "grad_norm": 19.950515747070312, "learning_rate": 1.6299405386498773e-07, "logps/chosen": -63.1268310546875, "logps/rejected": -98.84368896484375, "loss": 0.2335, "losses/dpo": 0.08571727573871613, "losses/sft": 1.9807275533676147, "losses/total": 0.08571727573871613, "ref_logps/chosen": -41.314613342285156, "ref_logps/rejected": -56.26796340942383, "rewards/accuracies": 0.875, "rewards/chosen": -2.1812214851379395, "rewards/margins": 2.0763511657714844, "rewards/rejected": -4.257572650909424, "step": 2245 }, { "epoch": 2.12, "grad_norm": 16.623170852661133, "learning_rate": 1.6281916754109827e-07, "logps/chosen": -44.955604553222656, "logps/rejected": -86.53844451904297, "loss": 0.1753, "losses/dpo": 0.20481280982494354, "losses/sft": 1.9196656942367554, "losses/total": 0.20481280982494354, "ref_logps/chosen": -29.323665618896484, "ref_logps/rejected": -46.16213607788086, "rewards/accuracies": 0.9375, "rewards/chosen": -1.563193917274475, "rewards/margins": 2.4744367599487305, "rewards/rejected": -4.037631034851074, "step": 2246 }, { "epoch": 2.12, "grad_norm": 26.565893173217773, "learning_rate": 1.626442812172088e-07, "logps/chosen": -56.64362716674805, "logps/rejected": -81.39777374267578, "loss": 0.3296, "losses/dpo": 0.10279586911201477, "losses/sft": 2.5590388774871826, "losses/total": 0.10279586911201477, "ref_logps/chosen": -36.619625091552734, "ref_logps/rejected": -41.5498046875, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0024003982543945, "rewards/margins": 1.9823968410491943, "rewards/rejected": -3.984797239303589, "step": 2247 }, { "epoch": 2.12, "grad_norm": 27.30438232421875, "learning_rate": 1.6246939489331935e-07, "logps/chosen": -52.283355712890625, "logps/rejected": -92.72384643554688, "loss": 0.2662, "losses/dpo": 0.31035855412483215, "losses/sft": 1.8905383348464966, "losses/total": 0.31035855412483215, "ref_logps/chosen": -37.64241027832031, "ref_logps/rejected": -55.542701721191406, "rewards/accuracies": 0.875, "rewards/chosen": -1.464094638824463, "rewards/margins": 2.2540202140808105, "rewards/rejected": -3.7181148529052734, "step": 2248 }, { "epoch": 2.12, "grad_norm": 30.367029190063477, "learning_rate": 1.6229450856942986e-07, "logps/chosen": -60.2539176940918, "logps/rejected": -81.2086181640625, "loss": 0.2924, "losses/dpo": 0.27630990743637085, "losses/sft": 2.526815891265869, "losses/total": 0.27630990743637085, "ref_logps/chosen": -41.47901153564453, "ref_logps/rejected": -44.62622833251953, "rewards/accuracies": 0.9375, "rewards/chosen": -1.877490520477295, "rewards/margins": 1.7807488441467285, "rewards/rejected": -3.6582391262054443, "step": 2249 }, { "epoch": 2.12, "grad_norm": 19.49413299560547, "learning_rate": 1.621196222455404e-07, "logps/chosen": -44.23246765136719, "logps/rejected": -82.84221649169922, "loss": 0.2119, "losses/dpo": 0.2684689164161682, "losses/sft": 2.4392786026000977, "losses/total": 0.2684689164161682, "ref_logps/chosen": -28.507184982299805, "ref_logps/rejected": -42.323036193847656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5725280046463013, "rewards/margins": 2.4793901443481445, "rewards/rejected": -4.0519185066223145, "step": 2250 }, { "epoch": 2.13, "grad_norm": 23.003732681274414, "learning_rate": 1.6194473592165092e-07, "logps/chosen": -55.03348159790039, "logps/rejected": -86.842041015625, "loss": 0.2321, "losses/dpo": 0.07002858817577362, "losses/sft": 2.1327624320983887, "losses/total": 0.07002858817577362, "ref_logps/chosen": -37.50593948364258, "ref_logps/rejected": -46.40522003173828, "rewards/accuracies": 0.875, "rewards/chosen": -1.752753734588623, "rewards/margins": 2.290928363800049, "rewards/rejected": -4.043682098388672, "step": 2251 }, { "epoch": 2.13, "grad_norm": 17.487110137939453, "learning_rate": 1.6176984959776143e-07, "logps/chosen": -55.22237014770508, "logps/rejected": -87.433837890625, "loss": 0.1756, "losses/dpo": 0.3281095623970032, "losses/sft": 2.2306060791015625, "losses/total": 0.3281095623970032, "ref_logps/chosen": -37.505985260009766, "ref_logps/rejected": -44.09942626953125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7716386318206787, "rewards/margins": 2.561802864074707, "rewards/rejected": -4.333441734313965, "step": 2252 }, { "epoch": 2.13, "grad_norm": 22.838956832885742, "learning_rate": 1.61594963273872e-07, "logps/chosen": -56.50987243652344, "logps/rejected": -90.54911804199219, "loss": 0.2184, "losses/dpo": 0.3154241740703583, "losses/sft": 2.2732532024383545, "losses/total": 0.3154241740703583, "ref_logps/chosen": -36.47102737426758, "ref_logps/rejected": -44.3578987121582, "rewards/accuracies": 0.875, "rewards/chosen": -2.003884792327881, "rewards/margins": 2.615237236022949, "rewards/rejected": -4.61912202835083, "step": 2253 }, { "epoch": 2.13, "grad_norm": 25.773296356201172, "learning_rate": 1.614200769499825e-07, "logps/chosen": -58.50932312011719, "logps/rejected": -88.87077331542969, "loss": 0.2326, "losses/dpo": 0.19233441352844238, "losses/sft": 1.700931429862976, "losses/total": 0.19233441352844238, "ref_logps/chosen": -42.74064636230469, "ref_logps/rejected": -52.24690628051758, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5768675804138184, "rewards/margins": 2.085519790649414, "rewards/rejected": -3.6623873710632324, "step": 2254 }, { "epoch": 2.13, "grad_norm": 21.362497329711914, "learning_rate": 1.6124519062609305e-07, "logps/chosen": -55.05992889404297, "logps/rejected": -71.38827514648438, "loss": 0.2201, "losses/dpo": 0.2599198520183563, "losses/sft": 1.693109393119812, "losses/total": 0.2599198520183563, "ref_logps/chosen": -38.30255889892578, "ref_logps/rejected": -35.69434356689453, "rewards/accuracies": 1.0, "rewards/chosen": -1.6757371425628662, "rewards/margins": 1.8936560153961182, "rewards/rejected": -3.5693931579589844, "step": 2255 }, { "epoch": 2.13, "grad_norm": 23.30718421936035, "learning_rate": 1.6107030430220356e-07, "logps/chosen": -65.42486572265625, "logps/rejected": -80.0082778930664, "loss": 0.3433, "losses/dpo": 0.48402729630470276, "losses/sft": 1.9216808080673218, "losses/total": 0.48402729630470276, "ref_logps/chosen": -44.13976287841797, "ref_logps/rejected": -42.1544189453125, "rewards/accuracies": 0.875, "rewards/chosen": -2.1285104751586914, "rewards/margins": 1.6568758487701416, "rewards/rejected": -3.785386562347412, "step": 2256 }, { "epoch": 2.13, "grad_norm": 42.49635314941406, "learning_rate": 1.608954179783141e-07, "logps/chosen": -68.43677520751953, "logps/rejected": -88.00084686279297, "loss": 0.5192, "losses/dpo": 0.1933937668800354, "losses/sft": 2.2337167263031006, "losses/total": 0.1933937668800354, "ref_logps/chosen": -45.55704879760742, "ref_logps/rejected": -53.633323669433594, "rewards/accuracies": 0.75, "rewards/chosen": -2.2879724502563477, "rewards/margins": 1.1487798690795898, "rewards/rejected": -3.4367523193359375, "step": 2257 }, { "epoch": 2.13, "grad_norm": 25.39251708984375, "learning_rate": 1.607205316544246e-07, "logps/chosen": -42.743568420410156, "logps/rejected": -71.08970642089844, "loss": 0.3061, "losses/dpo": 0.3308146595954895, "losses/sft": 1.471168041229248, "losses/total": 0.3308146595954895, "ref_logps/chosen": -31.67058753967285, "ref_logps/rejected": -40.524105072021484, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1072981357574463, "rewards/margins": 1.9492619037628174, "rewards/rejected": -3.0565600395202637, "step": 2258 }, { "epoch": 2.13, "grad_norm": 28.404516220092773, "learning_rate": 1.6054564533053512e-07, "logps/chosen": -56.04657745361328, "logps/rejected": -76.16471862792969, "loss": 0.3164, "losses/dpo": 0.20249655842781067, "losses/sft": 2.0203540325164795, "losses/total": 0.20249655842781067, "ref_logps/chosen": -40.08431625366211, "ref_logps/rejected": -42.63153076171875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5962259769439697, "rewards/margins": 1.7570924758911133, "rewards/rejected": -3.353318691253662, "step": 2259 }, { "epoch": 2.13, "grad_norm": 20.589021682739258, "learning_rate": 1.603707590066457e-07, "logps/chosen": -50.516075134277344, "logps/rejected": -76.57281494140625, "loss": 0.2487, "losses/dpo": 0.12869106233119965, "losses/sft": 2.1255719661712646, "losses/total": 0.12869106233119965, "ref_logps/chosen": -31.348102569580078, "ref_logps/rejected": -39.09846496582031, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9167975187301636, "rewards/margins": 1.83063805103302, "rewards/rejected": -3.7474355697631836, "step": 2260 }, { "epoch": 2.14, "grad_norm": 22.93833351135254, "learning_rate": 1.601958726827562e-07, "logps/chosen": -70.66360473632812, "logps/rejected": -97.74938201904297, "loss": 0.2547, "losses/dpo": 0.40152662992477417, "losses/sft": 2.035598039627075, "losses/total": 0.40152662992477417, "ref_logps/chosen": -46.9935302734375, "ref_logps/rejected": -54.04365539550781, "rewards/accuracies": 0.875, "rewards/chosen": -2.3670077323913574, "rewards/margins": 2.0035650730133057, "rewards/rejected": -4.370573043823242, "step": 2261 }, { "epoch": 2.14, "grad_norm": 25.003437042236328, "learning_rate": 1.6002098635886674e-07, "logps/chosen": -52.92638397216797, "logps/rejected": -78.58967590332031, "loss": 0.2604, "losses/dpo": 0.11700643599033356, "losses/sft": 1.5392807722091675, "losses/total": 0.11700643599033356, "ref_logps/chosen": -40.420047760009766, "ref_logps/rejected": -44.13471603393555, "rewards/accuracies": 0.875, "rewards/chosen": -1.2506334781646729, "rewards/margins": 2.1948623657226562, "rewards/rejected": -3.445496082305908, "step": 2262 }, { "epoch": 2.14, "grad_norm": 28.93943214416504, "learning_rate": 1.5984610003497725e-07, "logps/chosen": -50.6925048828125, "logps/rejected": -64.04227447509766, "loss": 0.374, "losses/dpo": 0.17544978857040405, "losses/sft": 2.4645628929138184, "losses/total": 0.17544978857040405, "ref_logps/chosen": -31.019001007080078, "ref_logps/rejected": -31.54002571105957, "rewards/accuracies": 0.875, "rewards/chosen": -1.9673502445220947, "rewards/margins": 1.2828750610351562, "rewards/rejected": -3.250225305557251, "step": 2263 }, { "epoch": 2.14, "grad_norm": 24.567184448242188, "learning_rate": 1.596712137110878e-07, "logps/chosen": -71.976318359375, "logps/rejected": -95.36058044433594, "loss": 0.2598, "losses/dpo": 0.327330619096756, "losses/sft": 1.4970035552978516, "losses/total": 0.327330619096756, "ref_logps/chosen": -53.477752685546875, "ref_logps/rejected": -55.2637939453125, "rewards/accuracies": 0.875, "rewards/chosen": -1.8498564958572388, "rewards/margins": 2.1598219871520996, "rewards/rejected": -4.009678840637207, "step": 2264 }, { "epoch": 2.14, "grad_norm": 21.226322174072266, "learning_rate": 1.594963273871983e-07, "logps/chosen": -40.626991271972656, "logps/rejected": -77.2824935913086, "loss": 0.2938, "losses/dpo": 0.13791534304618835, "losses/sft": 1.3078718185424805, "losses/total": 0.13791534304618835, "ref_logps/chosen": -24.102535247802734, "ref_logps/rejected": -38.710750579833984, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6524457931518555, "rewards/margins": 2.204728126525879, "rewards/rejected": -3.8571739196777344, "step": 2265 }, { "epoch": 2.14, "grad_norm": 23.8439884185791, "learning_rate": 1.5932144106330884e-07, "logps/chosen": -53.080467224121094, "logps/rejected": -75.99609375, "loss": 0.2822, "losses/dpo": 0.33289408683776855, "losses/sft": 2.283602237701416, "losses/total": 0.33289408683776855, "ref_logps/chosen": -33.83545684814453, "ref_logps/rejected": -36.57042694091797, "rewards/accuracies": 0.875, "rewards/chosen": -1.9245011806488037, "rewards/margins": 2.0180652141571045, "rewards/rejected": -3.94256591796875, "step": 2266 }, { "epoch": 2.14, "grad_norm": 26.943132400512695, "learning_rate": 1.5914655473941938e-07, "logps/chosen": -62.90143966674805, "logps/rejected": -75.03677368164062, "loss": 0.3197, "losses/dpo": 0.2554304599761963, "losses/sft": 1.8869599103927612, "losses/total": 0.2554304599761963, "ref_logps/chosen": -47.20684051513672, "ref_logps/rejected": -44.09348678588867, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5694596767425537, "rewards/margins": 1.5248687267303467, "rewards/rejected": -3.0943284034729004, "step": 2267 }, { "epoch": 2.14, "grad_norm": 18.618547439575195, "learning_rate": 1.589716684155299e-07, "logps/chosen": -61.20868682861328, "logps/rejected": -79.85617065429688, "loss": 0.1833, "losses/dpo": 0.13341785967350006, "losses/sft": 2.1886203289031982, "losses/total": 0.13341785967350006, "ref_logps/chosen": -40.288238525390625, "ref_logps/rejected": -38.06578063964844, "rewards/accuracies": 1.0, "rewards/chosen": -2.0920448303222656, "rewards/margins": 2.0869946479797363, "rewards/rejected": -4.17903995513916, "step": 2268 }, { "epoch": 2.14, "grad_norm": 22.201250076293945, "learning_rate": 1.5879678209164043e-07, "logps/chosen": -55.31974411010742, "logps/rejected": -76.48590087890625, "loss": 0.2601, "losses/dpo": 0.23466461896896362, "losses/sft": 1.6558467149734497, "losses/total": 0.23466461896896362, "ref_logps/chosen": -39.18632507324219, "ref_logps/rejected": -42.365272521972656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.613342046737671, "rewards/margins": 1.7987215518951416, "rewards/rejected": -3.4120635986328125, "step": 2269 }, { "epoch": 2.14, "grad_norm": 15.449275970458984, "learning_rate": 1.5862189576775095e-07, "logps/chosen": -44.29328536987305, "logps/rejected": -90.58779907226562, "loss": 0.1799, "losses/dpo": 0.27283793687820435, "losses/sft": 2.3422796726226807, "losses/total": 0.27283793687820435, "ref_logps/chosen": -30.93407440185547, "ref_logps/rejected": -53.70518493652344, "rewards/accuracies": 1.0, "rewards/chosen": -1.335921049118042, "rewards/margins": 2.3523406982421875, "rewards/rejected": -3.6882619857788086, "step": 2270 }, { "epoch": 2.14, "grad_norm": 17.470685958862305, "learning_rate": 1.5844700944386149e-07, "logps/chosen": -50.318607330322266, "logps/rejected": -78.00550079345703, "loss": 0.2057, "losses/dpo": 0.35125651955604553, "losses/sft": 1.649762749671936, "losses/total": 0.35125651955604553, "ref_logps/chosen": -34.96794128417969, "ref_logps/rejected": -42.396392822265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.535066843032837, "rewards/margins": 2.025843858718872, "rewards/rejected": -3.560910701751709, "step": 2271 }, { "epoch": 2.15, "grad_norm": 26.762725830078125, "learning_rate": 1.5827212311997203e-07, "logps/chosen": -58.390323638916016, "logps/rejected": -72.4156265258789, "loss": 0.3026, "losses/dpo": 0.1839708685874939, "losses/sft": 1.2107266187667847, "losses/total": 0.1839708685874939, "ref_logps/chosen": -40.38214874267578, "ref_logps/rejected": -38.73516845703125, "rewards/accuracies": 0.875, "rewards/chosen": -1.8008177280426025, "rewards/margins": 1.567227840423584, "rewards/rejected": -3.3680455684661865, "step": 2272 }, { "epoch": 2.15, "grad_norm": 34.8433952331543, "learning_rate": 1.5809723679608254e-07, "logps/chosen": -57.27212142944336, "logps/rejected": -68.38531494140625, "loss": 0.4535, "losses/dpo": 0.6827790141105652, "losses/sft": 2.280339241027832, "losses/total": 0.6827790141105652, "ref_logps/chosen": -35.589073181152344, "ref_logps/rejected": -33.419410705566406, "rewards/accuracies": 0.8125, "rewards/chosen": -2.168304443359375, "rewards/margins": 1.3282856941223145, "rewards/rejected": -3.4965901374816895, "step": 2273 }, { "epoch": 2.15, "grad_norm": 19.23264503479004, "learning_rate": 1.5792235047219308e-07, "logps/chosen": -50.28895568847656, "logps/rejected": -67.99697875976562, "loss": 0.2277, "losses/dpo": 0.14969190955162048, "losses/sft": 2.3513457775115967, "losses/total": 0.14969190955162048, "ref_logps/chosen": -33.68083953857422, "ref_logps/rejected": -35.101356506347656, "rewards/accuracies": 1.0, "rewards/chosen": -1.6608121395111084, "rewards/margins": 1.6287498474121094, "rewards/rejected": -3.2895619869232178, "step": 2274 }, { "epoch": 2.15, "grad_norm": 18.94283676147461, "learning_rate": 1.577474641483036e-07, "logps/chosen": -63.68626403808594, "logps/rejected": -87.64292907714844, "loss": 0.2229, "losses/dpo": 0.16320599615573883, "losses/sft": 2.310617685317993, "losses/total": 0.16320599615573883, "ref_logps/chosen": -46.33479309082031, "ref_logps/rejected": -48.5651741027832, "rewards/accuracies": 0.9375, "rewards/chosen": -1.73514723777771, "rewards/margins": 2.172628402709961, "rewards/rejected": -3.907775640487671, "step": 2275 }, { "epoch": 2.15, "grad_norm": 20.575897216796875, "learning_rate": 1.5757257782441413e-07, "logps/chosen": -38.73346710205078, "logps/rejected": -79.06417846679688, "loss": 0.2254, "losses/dpo": 0.18299530446529388, "losses/sft": 1.768555760383606, "losses/total": 0.18299530446529388, "ref_logps/chosen": -25.789077758789062, "ref_logps/rejected": -43.497650146484375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2944390773773193, "rewards/margins": 2.2622132301330566, "rewards/rejected": -3.556652069091797, "step": 2276 }, { "epoch": 2.15, "grad_norm": 31.062278747558594, "learning_rate": 1.5739769150052464e-07, "logps/chosen": -47.12458801269531, "logps/rejected": -69.7198486328125, "loss": 0.44, "losses/dpo": 0.2919631004333496, "losses/sft": 1.9819501638412476, "losses/total": 0.2919631004333496, "ref_logps/chosen": -27.849842071533203, "ref_logps/rejected": -35.79384231567383, "rewards/accuracies": 0.75, "rewards/chosen": -1.9274744987487793, "rewards/margins": 1.4651265144348145, "rewards/rejected": -3.392601251602173, "step": 2277 }, { "epoch": 2.15, "grad_norm": 29.310911178588867, "learning_rate": 1.5722280517663518e-07, "logps/chosen": -45.972023010253906, "logps/rejected": -67.21701049804688, "loss": 0.3201, "losses/dpo": 0.2422136515378952, "losses/sft": 1.3804807662963867, "losses/total": 0.2422136515378952, "ref_logps/chosen": -30.895526885986328, "ref_logps/rejected": -33.24668884277344, "rewards/accuracies": 0.75, "rewards/chosen": -1.5076490640640259, "rewards/margins": 1.8893829584121704, "rewards/rejected": -3.3970322608947754, "step": 2278 }, { "epoch": 2.15, "grad_norm": 21.477489471435547, "learning_rate": 1.5704791885274572e-07, "logps/chosen": -58.60751724243164, "logps/rejected": -84.25736999511719, "loss": 0.2386, "losses/dpo": 0.3093772530555725, "losses/sft": 1.692646861076355, "losses/total": 0.3093772530555725, "ref_logps/chosen": -38.81452560424805, "ref_logps/rejected": -44.74578094482422, "rewards/accuracies": 0.9375, "rewards/chosen": -1.979299545288086, "rewards/margins": 1.971860408782959, "rewards/rejected": -3.951159715652466, "step": 2279 }, { "epoch": 2.15, "grad_norm": 15.371859550476074, "learning_rate": 1.5687303252885623e-07, "logps/chosen": -43.326560974121094, "logps/rejected": -79.91537475585938, "loss": 0.1764, "losses/dpo": 0.07203559577465057, "losses/sft": 1.0480899810791016, "losses/total": 0.07203559577465057, "ref_logps/chosen": -27.940738677978516, "ref_logps/rejected": -42.891754150390625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5385822057724, "rewards/margins": 2.1637799739837646, "rewards/rejected": -3.702362060546875, "step": 2280 }, { "epoch": 2.15, "grad_norm": 21.000402450561523, "learning_rate": 1.5669814620496677e-07, "logps/chosen": -66.86967468261719, "logps/rejected": -110.08772277832031, "loss": 0.2189, "losses/dpo": 0.3041834831237793, "losses/sft": 2.226001024246216, "losses/total": 0.3041834831237793, "ref_logps/chosen": -46.83509063720703, "ref_logps/rejected": -66.58212280273438, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0034589767456055, "rewards/margins": 2.347100257873535, "rewards/rejected": -4.350558757781982, "step": 2281 }, { "epoch": 2.15, "grad_norm": 26.565950393676758, "learning_rate": 1.5652325988107728e-07, "logps/chosen": -61.623512268066406, "logps/rejected": -82.12686157226562, "loss": 0.2483, "losses/dpo": 0.4800388216972351, "losses/sft": 2.0719287395477295, "losses/total": 0.4800388216972351, "ref_logps/chosen": -42.07328414916992, "ref_logps/rejected": -42.147159576416016, "rewards/accuracies": 0.875, "rewards/chosen": -1.9550228118896484, "rewards/margins": 2.042947292327881, "rewards/rejected": -3.9979701042175293, "step": 2282 }, { "epoch": 2.16, "grad_norm": 14.342329025268555, "learning_rate": 1.5634837355718782e-07, "logps/chosen": -52.11302947998047, "logps/rejected": -91.74695587158203, "loss": 0.1433, "losses/dpo": 0.2717965245246887, "losses/sft": 2.016220808029175, "losses/total": 0.2717965245246887, "ref_logps/chosen": -35.9696044921875, "ref_logps/rejected": -49.544063568115234, "rewards/accuracies": 1.0, "rewards/chosen": -1.6143429279327393, "rewards/margins": 2.6059460639953613, "rewards/rejected": -4.2202887535095215, "step": 2283 }, { "epoch": 2.16, "grad_norm": 11.542847633361816, "learning_rate": 1.5617348723329834e-07, "logps/chosen": -69.41544342041016, "logps/rejected": -99.43051147460938, "loss": 0.1041, "losses/dpo": 0.06336472183465958, "losses/sft": 2.059053897857666, "losses/total": 0.06336472183465958, "ref_logps/chosen": -46.148780822753906, "ref_logps/rejected": -48.774330139160156, "rewards/accuracies": 1.0, "rewards/chosen": -2.3266663551330566, "rewards/margins": 2.7389512062072754, "rewards/rejected": -5.065617561340332, "step": 2284 }, { "epoch": 2.16, "grad_norm": 19.886808395385742, "learning_rate": 1.559986009094089e-07, "logps/chosen": -47.85782241821289, "logps/rejected": -90.40819549560547, "loss": 0.1926, "losses/dpo": 0.3982144594192505, "losses/sft": 2.302834987640381, "losses/total": 0.3982144594192505, "ref_logps/chosen": -28.37134552001953, "ref_logps/rejected": -47.10148239135742, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9486477375030518, "rewards/margins": 2.382023572921753, "rewards/rejected": -4.330671310424805, "step": 2285 }, { "epoch": 2.16, "grad_norm": 12.676795959472656, "learning_rate": 1.5582371458551941e-07, "logps/chosen": -56.64424514770508, "logps/rejected": -94.72074890136719, "loss": 0.1119, "losses/dpo": 0.1810685694217682, "losses/sft": 1.5159043073654175, "losses/total": 0.1810685694217682, "ref_logps/chosen": -40.41544723510742, "ref_logps/rejected": -49.2695198059082, "rewards/accuracies": 1.0, "rewards/chosen": -1.6228796243667603, "rewards/margins": 2.9222428798675537, "rewards/rejected": -4.5451226234436035, "step": 2286 }, { "epoch": 2.16, "grad_norm": 28.70562744140625, "learning_rate": 1.5564882826162993e-07, "logps/chosen": -54.82651901245117, "logps/rejected": -84.83499145507812, "loss": 0.3099, "losses/dpo": 0.09100537747144699, "losses/sft": 1.8288042545318604, "losses/total": 0.09100537747144699, "ref_logps/chosen": -33.36064910888672, "ref_logps/rejected": -43.051212310791016, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1465868949890137, "rewards/margins": 2.0317912101745605, "rewards/rejected": -4.178378105163574, "step": 2287 }, { "epoch": 2.16, "grad_norm": 26.49138832092285, "learning_rate": 1.5547394193774047e-07, "logps/chosen": -65.35050964355469, "logps/rejected": -86.64130401611328, "loss": 0.2187, "losses/dpo": 0.055660516023635864, "losses/sft": 2.095733165740967, "losses/total": 0.055660516023635864, "ref_logps/chosen": -45.377235412597656, "ref_logps/rejected": -45.432273864746094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9973276853561401, "rewards/margins": 2.123575448989868, "rewards/rejected": -4.120903015136719, "step": 2288 }, { "epoch": 2.16, "grad_norm": 22.42824935913086, "learning_rate": 1.5529905561385098e-07, "logps/chosen": -66.8343276977539, "logps/rejected": -81.79641723632812, "loss": 0.2325, "losses/dpo": 0.15890973806381226, "losses/sft": 1.6447639465332031, "losses/total": 0.15890973806381226, "ref_logps/chosen": -46.97641372680664, "ref_logps/rejected": -41.37504577636719, "rewards/accuracies": 1.0, "rewards/chosen": -1.985791563987732, "rewards/margins": 2.0563454627990723, "rewards/rejected": -4.0421366691589355, "step": 2289 }, { "epoch": 2.16, "grad_norm": 20.946821212768555, "learning_rate": 1.5512416928996152e-07, "logps/chosen": -55.57846450805664, "logps/rejected": -80.79353332519531, "loss": 0.2134, "losses/dpo": 0.09192565083503723, "losses/sft": 1.7822967767715454, "losses/total": 0.09192565083503723, "ref_logps/chosen": -34.669883728027344, "ref_logps/rejected": -38.41657257080078, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0908584594726562, "rewards/margins": 2.1468381881713867, "rewards/rejected": -4.237696647644043, "step": 2290 }, { "epoch": 2.16, "grad_norm": 18.632497787475586, "learning_rate": 1.5494928296607203e-07, "logps/chosen": -55.583988189697266, "logps/rejected": -84.01304626464844, "loss": 0.1876, "losses/dpo": 0.1862010955810547, "losses/sft": 1.525383710861206, "losses/total": 0.1862010955810547, "ref_logps/chosen": -36.119598388671875, "ref_logps/rejected": -43.54994201660156, "rewards/accuracies": 1.0, "rewards/chosen": -1.9464391469955444, "rewards/margins": 2.0998711585998535, "rewards/rejected": -4.0463104248046875, "step": 2291 }, { "epoch": 2.16, "grad_norm": 34.278316497802734, "learning_rate": 1.547743966421826e-07, "logps/chosen": -69.1971435546875, "logps/rejected": -104.88549041748047, "loss": 0.2795, "losses/dpo": 0.1957712173461914, "losses/sft": 1.8775060176849365, "losses/total": 0.1957712173461914, "ref_logps/chosen": -42.132774353027344, "ref_logps/rejected": -55.01546096801758, "rewards/accuracies": 0.875, "rewards/chosen": -2.7064366340637207, "rewards/margins": 2.280566453933716, "rewards/rejected": -4.987003326416016, "step": 2292 }, { "epoch": 2.17, "grad_norm": 25.116737365722656, "learning_rate": 1.545995103182931e-07, "logps/chosen": -54.52226257324219, "logps/rejected": -87.70243835449219, "loss": 0.2278, "losses/dpo": 0.30233073234558105, "losses/sft": 2.790884017944336, "losses/total": 0.30233073234558105, "ref_logps/chosen": -33.725563049316406, "ref_logps/rejected": -46.07609558105469, "rewards/accuracies": 0.875, "rewards/chosen": -2.079669952392578, "rewards/margins": 2.0829648971557617, "rewards/rejected": -4.16263484954834, "step": 2293 }, { "epoch": 2.17, "grad_norm": 15.063234329223633, "learning_rate": 1.5442462399440362e-07, "logps/chosen": -57.47674560546875, "logps/rejected": -90.12452697753906, "loss": 0.1634, "losses/dpo": 0.2507159113883972, "losses/sft": 2.0140225887298584, "losses/total": 0.2507159113883972, "ref_logps/chosen": -35.16191101074219, "ref_logps/rejected": -45.17369842529297, "rewards/accuracies": 1.0, "rewards/chosen": -2.2314834594726562, "rewards/margins": 2.263598918914795, "rewards/rejected": -4.495082855224609, "step": 2294 }, { "epoch": 2.17, "grad_norm": 13.159616470336914, "learning_rate": 1.5424973767051416e-07, "logps/chosen": -54.27346420288086, "logps/rejected": -81.01445007324219, "loss": 0.1805, "losses/dpo": 0.2564374506473541, "losses/sft": 1.9482932090759277, "losses/total": 0.2564374506473541, "ref_logps/chosen": -36.50727081298828, "ref_logps/rejected": -41.780303955078125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7766196727752686, "rewards/margins": 2.1467955112457275, "rewards/rejected": -3.923415184020996, "step": 2295 }, { "epoch": 2.17, "grad_norm": 12.745832443237305, "learning_rate": 1.5407485134662467e-07, "logps/chosen": -68.29960632324219, "logps/rejected": -98.3267822265625, "loss": 0.1033, "losses/dpo": 0.14031359553337097, "losses/sft": 2.2993533611297607, "losses/total": 0.14031359553337097, "ref_logps/chosen": -49.136695861816406, "ref_logps/rejected": -52.492549896240234, "rewards/accuracies": 1.0, "rewards/chosen": -1.9162909984588623, "rewards/margins": 2.6671321392059326, "rewards/rejected": -4.583423137664795, "step": 2296 }, { "epoch": 2.17, "grad_norm": 31.119064331054688, "learning_rate": 1.538999650227352e-07, "logps/chosen": -56.37617492675781, "logps/rejected": -82.12806701660156, "loss": 0.3443, "losses/dpo": 0.18578927218914032, "losses/sft": 2.2522242069244385, "losses/total": 0.18578927218914032, "ref_logps/chosen": -35.081905364990234, "ref_logps/rejected": -43.395755767822266, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1294267177581787, "rewards/margins": 1.7438042163848877, "rewards/rejected": -3.8732309341430664, "step": 2297 }, { "epoch": 2.17, "grad_norm": 25.81895637512207, "learning_rate": 1.5372507869884575e-07, "logps/chosen": -50.75843048095703, "logps/rejected": -70.24896240234375, "loss": 0.2726, "losses/dpo": 0.7891396284103394, "losses/sft": 2.258631944656372, "losses/total": 0.7891396284103394, "ref_logps/chosen": -38.64788055419922, "ref_logps/rejected": -35.67439270019531, "rewards/accuracies": 0.875, "rewards/chosen": -1.211055040359497, "rewards/margins": 2.2464025020599365, "rewards/rejected": -3.4574575424194336, "step": 2298 }, { "epoch": 2.17, "grad_norm": 21.256067276000977, "learning_rate": 1.535501923749563e-07, "logps/chosen": -62.33543395996094, "logps/rejected": -79.2490463256836, "loss": 0.2035, "losses/dpo": 0.13268226385116577, "losses/sft": 1.95466148853302, "losses/total": 0.13268226385116577, "ref_logps/chosen": -42.913124084472656, "ref_logps/rejected": -37.233734130859375, "rewards/accuracies": 0.875, "rewards/chosen": -1.9422309398651123, "rewards/margins": 2.25930118560791, "rewards/rejected": -4.201531410217285, "step": 2299 }, { "epoch": 2.17, "grad_norm": 25.383174896240234, "learning_rate": 1.533753060510668e-07, "logps/chosen": -52.142601013183594, "logps/rejected": -67.98534393310547, "loss": 0.3036, "losses/dpo": 0.09045708924531937, "losses/sft": 2.255509853363037, "losses/total": 0.09045708924531937, "ref_logps/chosen": -32.18153381347656, "ref_logps/rejected": -32.76335906982422, "rewards/accuracies": 0.875, "rewards/chosen": -1.9961068630218506, "rewards/margins": 1.5260915756225586, "rewards/rejected": -3.522198438644409, "step": 2300 }, { "epoch": 2.17, "grad_norm": 20.05816650390625, "learning_rate": 1.5320041972717732e-07, "logps/chosen": -67.32510375976562, "logps/rejected": -99.2700424194336, "loss": 0.1995, "losses/dpo": 0.29701942205429077, "losses/sft": 2.7833328247070312, "losses/total": 0.29701942205429077, "ref_logps/chosen": -44.362266540527344, "ref_logps/rejected": -53.67930603027344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2962844371795654, "rewards/margins": 2.262789249420166, "rewards/rejected": -4.5590739250183105, "step": 2301 }, { "epoch": 2.17, "grad_norm": 23.345857620239258, "learning_rate": 1.5302553340328785e-07, "logps/chosen": -65.83466339111328, "logps/rejected": -79.39578247070312, "loss": 0.2934, "losses/dpo": 0.2013949155807495, "losses/sft": 2.1690316200256348, "losses/total": 0.2013949155807495, "ref_logps/chosen": -45.309478759765625, "ref_logps/rejected": -41.431556701660156, "rewards/accuracies": 0.875, "rewards/chosen": -2.0525190830230713, "rewards/margins": 1.743903636932373, "rewards/rejected": -3.7964227199554443, "step": 2302 }, { "epoch": 2.17, "grad_norm": 29.08433723449707, "learning_rate": 1.5285064707939837e-07, "logps/chosen": -53.37352752685547, "logps/rejected": -73.21650695800781, "loss": 0.3333, "losses/dpo": 0.2476210743188858, "losses/sft": 1.8561046123504639, "losses/total": 0.2476210743188858, "ref_logps/chosen": -33.61847686767578, "ref_logps/rejected": -36.27830505371094, "rewards/accuracies": 0.875, "rewards/chosen": -1.9755051136016846, "rewards/margins": 1.718315839767456, "rewards/rejected": -3.6938209533691406, "step": 2303 }, { "epoch": 2.18, "grad_norm": 22.244579315185547, "learning_rate": 1.5267576075550893e-07, "logps/chosen": -56.64630889892578, "logps/rejected": -78.9195556640625, "loss": 0.2829, "losses/dpo": 0.75873863697052, "losses/sft": 2.722460985183716, "losses/total": 0.75873863697052, "ref_logps/chosen": -39.257652282714844, "ref_logps/rejected": -40.22932434082031, "rewards/accuracies": 0.9375, "rewards/chosen": -1.738865613937378, "rewards/margins": 2.130156993865967, "rewards/rejected": -3.8690226078033447, "step": 2304 }, { "epoch": 2.18, "grad_norm": 16.203853607177734, "learning_rate": 1.5250087443161945e-07, "logps/chosen": -55.15812301635742, "logps/rejected": -80.3203125, "loss": 0.1578, "losses/dpo": 0.15829437971115112, "losses/sft": 2.101869821548462, "losses/total": 0.15829437971115112, "ref_logps/chosen": -40.08538818359375, "ref_logps/rejected": -42.42623519897461, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5072734355926514, "rewards/margins": 2.2821338176727295, "rewards/rejected": -3.78940749168396, "step": 2305 }, { "epoch": 2.18, "grad_norm": 23.39341163635254, "learning_rate": 1.5232598810772998e-07, "logps/chosen": -57.079349517822266, "logps/rejected": -79.1810531616211, "loss": 0.2492, "losses/dpo": 0.20406943559646606, "losses/sft": 1.9828931093215942, "losses/total": 0.20406943559646606, "ref_logps/chosen": -35.075340270996094, "ref_logps/rejected": -40.44786071777344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2004010677337646, "rewards/margins": 1.6729178428649902, "rewards/rejected": -3.873319149017334, "step": 2306 }, { "epoch": 2.18, "grad_norm": 27.756380081176758, "learning_rate": 1.521511017838405e-07, "logps/chosen": -56.9161376953125, "logps/rejected": -84.33606719970703, "loss": 0.2193, "losses/dpo": 0.2271655797958374, "losses/sft": 1.7130274772644043, "losses/total": 0.2271655797958374, "ref_logps/chosen": -37.48028564453125, "ref_logps/rejected": -43.27394485473633, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9435847997665405, "rewards/margins": 2.162627696990967, "rewards/rejected": -4.106212615966797, "step": 2307 }, { "epoch": 2.18, "grad_norm": 21.934593200683594, "learning_rate": 1.51976215459951e-07, "logps/chosen": -49.73581314086914, "logps/rejected": -78.15467834472656, "loss": 0.1955, "losses/dpo": 0.25588667392730713, "losses/sft": 2.7339768409729004, "losses/total": 0.25588667392730713, "ref_logps/chosen": -32.321189880371094, "ref_logps/rejected": -37.73955535888672, "rewards/accuracies": 1.0, "rewards/chosen": -1.7414628267288208, "rewards/margins": 2.3000504970550537, "rewards/rejected": -4.041513442993164, "step": 2308 }, { "epoch": 2.18, "grad_norm": 29.851484298706055, "learning_rate": 1.5180132913606155e-07, "logps/chosen": -64.11297607421875, "logps/rejected": -87.09363555908203, "loss": 0.2467, "losses/dpo": 0.11850249767303467, "losses/sft": 2.186098575592041, "losses/total": 0.11850249767303467, "ref_logps/chosen": -41.07861328125, "ref_logps/rejected": -43.50823974609375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3034369945526123, "rewards/margins": 2.055102825164795, "rewards/rejected": -4.358539581298828, "step": 2309 }, { "epoch": 2.18, "grad_norm": 40.93378448486328, "learning_rate": 1.5162644281217206e-07, "logps/chosen": -49.8791618347168, "logps/rejected": -77.52201843261719, "loss": 0.4609, "losses/dpo": 1.0263172388076782, "losses/sft": 2.1066715717315674, "losses/total": 1.0263172388076782, "ref_logps/chosen": -27.901844024658203, "ref_logps/rejected": -38.298492431640625, "rewards/accuracies": 0.875, "rewards/chosen": -2.1977317333221436, "rewards/margins": 1.724621295928955, "rewards/rejected": -3.9223532676696777, "step": 2310 }, { "epoch": 2.18, "grad_norm": 18.958892822265625, "learning_rate": 1.5145155648828263e-07, "logps/chosen": -48.95719909667969, "logps/rejected": -90.12373352050781, "loss": 0.1953, "losses/dpo": 0.17861512303352356, "losses/sft": 1.2062950134277344, "losses/total": 0.17861512303352356, "ref_logps/chosen": -34.45989990234375, "ref_logps/rejected": -47.54639434814453, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4497298002243042, "rewards/margins": 2.8080039024353027, "rewards/rejected": -4.257733345031738, "step": 2311 }, { "epoch": 2.18, "grad_norm": 30.231172561645508, "learning_rate": 1.5127667016439314e-07, "logps/chosen": -73.6708755493164, "logps/rejected": -94.07293701171875, "loss": 0.3743, "losses/dpo": 0.8147287964820862, "losses/sft": 2.1452088356018066, "losses/total": 0.8147287964820862, "ref_logps/chosen": -49.843849182128906, "ref_logps/rejected": -49.35814666748047, "rewards/accuracies": 0.8125, "rewards/chosen": -2.382702350616455, "rewards/margins": 2.088776111602783, "rewards/rejected": -4.471478462219238, "step": 2312 }, { "epoch": 2.18, "grad_norm": 18.43680763244629, "learning_rate": 1.5110178384050368e-07, "logps/chosen": -58.45404052734375, "logps/rejected": -72.94013977050781, "loss": 0.2686, "losses/dpo": 0.482033908367157, "losses/sft": 2.4288012981414795, "losses/total": 0.482033908367157, "ref_logps/chosen": -38.907203674316406, "ref_logps/rejected": -37.18342590332031, "rewards/accuracies": 0.875, "rewards/chosen": -1.954683780670166, "rewards/margins": 1.6209882497787476, "rewards/rejected": -3.575672149658203, "step": 2313 }, { "epoch": 2.19, "grad_norm": 16.642179489135742, "learning_rate": 1.509268975166142e-07, "logps/chosen": -55.08019256591797, "logps/rejected": -93.86299896240234, "loss": 0.1586, "losses/dpo": 0.08929822593927383, "losses/sft": 2.444059371948242, "losses/total": 0.08929822593927383, "ref_logps/chosen": -36.147682189941406, "ref_logps/rejected": -48.686256408691406, "rewards/accuracies": 1.0, "rewards/chosen": -1.8932510614395142, "rewards/margins": 2.6244235038757324, "rewards/rejected": -4.517674922943115, "step": 2314 }, { "epoch": 2.19, "grad_norm": 23.275371551513672, "learning_rate": 1.507520111927247e-07, "logps/chosen": -66.2259750366211, "logps/rejected": -101.51622772216797, "loss": 0.187, "losses/dpo": 0.17488284409046173, "losses/sft": 1.972433090209961, "losses/total": 0.17488284409046173, "ref_logps/chosen": -41.19746398925781, "ref_logps/rejected": -51.9842529296875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5028512477874756, "rewards/margins": 2.4503462314605713, "rewards/rejected": -4.953197479248047, "step": 2315 }, { "epoch": 2.19, "grad_norm": 22.9316349029541, "learning_rate": 1.5057712486883524e-07, "logps/chosen": -56.81782531738281, "logps/rejected": -85.59149169921875, "loss": 0.2141, "losses/dpo": 0.27787432074546814, "losses/sft": 2.329253673553467, "losses/total": 0.27787432074546814, "ref_logps/chosen": -39.2373161315918, "ref_logps/rejected": -47.52972412109375, "rewards/accuracies": 0.875, "rewards/chosen": -1.7580509185791016, "rewards/margins": 2.048125743865967, "rewards/rejected": -3.8061766624450684, "step": 2316 }, { "epoch": 2.19, "grad_norm": 22.330493927001953, "learning_rate": 1.5040223854494578e-07, "logps/chosen": -58.29429244995117, "logps/rejected": -87.17301177978516, "loss": 0.199, "losses/dpo": 0.48983046412467957, "losses/sft": 2.282857656478882, "losses/total": 0.48983046412467957, "ref_logps/chosen": -38.17634201049805, "ref_logps/rejected": -46.41453552246094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0117950439453125, "rewards/margins": 2.0640530586242676, "rewards/rejected": -4.075847625732422, "step": 2317 }, { "epoch": 2.19, "grad_norm": 16.657691955566406, "learning_rate": 1.5022735222105632e-07, "logps/chosen": -57.7027587890625, "logps/rejected": -90.79986572265625, "loss": 0.1598, "losses/dpo": 0.19808822870254517, "losses/sft": 1.4461119174957275, "losses/total": 0.19808822870254517, "ref_logps/chosen": -39.55608367919922, "ref_logps/rejected": -47.212581634521484, "rewards/accuracies": 1.0, "rewards/chosen": -1.8146674633026123, "rewards/margins": 2.544060468673706, "rewards/rejected": -4.358727931976318, "step": 2318 }, { "epoch": 2.19, "grad_norm": 19.468774795532227, "learning_rate": 1.5005246589716683e-07, "logps/chosen": -61.18832015991211, "logps/rejected": -84.00276947021484, "loss": 0.2029, "losses/dpo": 0.11266574263572693, "losses/sft": 1.8466558456420898, "losses/total": 0.11266574263572693, "ref_logps/chosen": -40.79590606689453, "ref_logps/rejected": -38.97797393798828, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0392415523529053, "rewards/margins": 2.463238477706909, "rewards/rejected": -4.5024800300598145, "step": 2319 }, { "epoch": 2.19, "grad_norm": 23.325109481811523, "learning_rate": 1.4987757957327737e-07, "logps/chosen": -62.5383415222168, "logps/rejected": -99.259033203125, "loss": 0.1531, "losses/dpo": 0.12385757267475128, "losses/sft": 2.454174518585205, "losses/total": 0.12385757267475128, "ref_logps/chosen": -38.148841857910156, "ref_logps/rejected": -49.837745666503906, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4389500617980957, "rewards/margins": 2.503178596496582, "rewards/rejected": -4.942128658294678, "step": 2320 }, { "epoch": 2.19, "grad_norm": 22.025278091430664, "learning_rate": 1.4970269324938789e-07, "logps/chosen": -53.976966857910156, "logps/rejected": -79.06675720214844, "loss": 0.1789, "losses/dpo": 0.37636545300483704, "losses/sft": 2.159593343734741, "losses/total": 0.37636545300483704, "ref_logps/chosen": -35.98680114746094, "ref_logps/rejected": -38.93898010253906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7990168333053589, "rewards/margins": 2.2137606143951416, "rewards/rejected": -4.012777328491211, "step": 2321 }, { "epoch": 2.19, "grad_norm": 27.695343017578125, "learning_rate": 1.495278069254984e-07, "logps/chosen": -58.63128662109375, "logps/rejected": -84.05570220947266, "loss": 0.2667, "losses/dpo": 0.187092125415802, "losses/sft": 2.039384603500366, "losses/total": 0.187092125415802, "ref_logps/chosen": -38.57199478149414, "ref_logps/rejected": -41.511451721191406, "rewards/accuracies": 0.875, "rewards/chosen": -2.0059292316436768, "rewards/margins": 2.2484960556030273, "rewards/rejected": -4.254425525665283, "step": 2322 }, { "epoch": 2.19, "grad_norm": 33.03314208984375, "learning_rate": 1.4935292060160894e-07, "logps/chosen": -52.55841064453125, "logps/rejected": -89.72498321533203, "loss": 0.3684, "losses/dpo": 0.05208424851298332, "losses/sft": 1.2749131917953491, "losses/total": 0.05208424851298332, "ref_logps/chosen": -39.29067611694336, "ref_logps/rejected": -52.81770324707031, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3267734050750732, "rewards/margins": 2.363955020904541, "rewards/rejected": -3.690728187561035, "step": 2323 }, { "epoch": 2.19, "grad_norm": 36.880924224853516, "learning_rate": 1.4917803427771948e-07, "logps/chosen": -65.55006408691406, "logps/rejected": -76.8304672241211, "loss": 0.4278, "losses/dpo": 0.6489154696464539, "losses/sft": 2.4154653549194336, "losses/total": 0.6489154696464539, "ref_logps/chosen": -44.63751220703125, "ref_logps/rejected": -39.63740921020508, "rewards/accuracies": 0.875, "rewards/chosen": -2.0912556648254395, "rewards/margins": 1.628050684928894, "rewards/rejected": -3.719306230545044, "step": 2324 }, { "epoch": 2.2, "grad_norm": 26.76791000366211, "learning_rate": 1.4900314795383002e-07, "logps/chosen": -43.804527282714844, "logps/rejected": -57.542274475097656, "loss": 0.4118, "losses/dpo": 0.37537091970443726, "losses/sft": 1.997344732284546, "losses/total": 0.37537091970443726, "ref_logps/chosen": -29.272216796875, "ref_logps/rejected": -33.132381439208984, "rewards/accuracies": 0.875, "rewards/chosen": -1.4532310962677002, "rewards/margins": 0.9877585172653198, "rewards/rejected": -2.4409894943237305, "step": 2325 }, { "epoch": 2.2, "grad_norm": 27.466733932495117, "learning_rate": 1.4882826162994053e-07, "logps/chosen": -44.07056427001953, "logps/rejected": -63.016815185546875, "loss": 0.3469, "losses/dpo": 0.39591869711875916, "losses/sft": 1.7456421852111816, "losses/total": 0.39591869711875916, "ref_logps/chosen": -26.791919708251953, "ref_logps/rejected": -32.60139465332031, "rewards/accuracies": 0.875, "rewards/chosen": -1.7278647422790527, "rewards/margins": 1.3136770725250244, "rewards/rejected": -3.041541576385498, "step": 2326 }, { "epoch": 2.2, "grad_norm": 22.862457275390625, "learning_rate": 1.4865337530605107e-07, "logps/chosen": -66.59981536865234, "logps/rejected": -80.0736083984375, "loss": 0.2588, "losses/dpo": 0.19616463780403137, "losses/sft": 2.4209187030792236, "losses/total": 0.19616463780403137, "ref_logps/chosen": -45.21485900878906, "ref_logps/rejected": -40.36358642578125, "rewards/accuracies": 0.875, "rewards/chosen": -2.138495445251465, "rewards/margins": 1.8325070142745972, "rewards/rejected": -3.9710023403167725, "step": 2327 }, { "epoch": 2.2, "grad_norm": 16.2176570892334, "learning_rate": 1.4847848898216158e-07, "logps/chosen": -45.099464416503906, "logps/rejected": -69.90950775146484, "loss": 0.1891, "losses/dpo": 0.08385226875543594, "losses/sft": 1.7653790712356567, "losses/total": 0.08385226875543594, "ref_logps/chosen": -30.71889305114746, "ref_logps/rejected": -34.14701843261719, "rewards/accuracies": 1.0, "rewards/chosen": -1.4380574226379395, "rewards/margins": 2.1381921768188477, "rewards/rejected": -3.576249599456787, "step": 2328 }, { "epoch": 2.2, "grad_norm": 12.499853134155273, "learning_rate": 1.483036026582721e-07, "logps/chosen": -48.76898956298828, "logps/rejected": -87.23249053955078, "loss": 0.1081, "losses/dpo": 0.11485821008682251, "losses/sft": 2.2997212409973145, "losses/total": 0.11485821008682251, "ref_logps/chosen": -34.328887939453125, "ref_logps/rejected": -44.967140197753906, "rewards/accuracies": 1.0, "rewards/chosen": -1.4440099000930786, "rewards/margins": 2.782524585723877, "rewards/rejected": -4.226534843444824, "step": 2329 }, { "epoch": 2.2, "grad_norm": 15.476034164428711, "learning_rate": 1.4812871633438266e-07, "logps/chosen": -56.94268035888672, "logps/rejected": -84.33056640625, "loss": 0.148, "losses/dpo": 0.1531893014907837, "losses/sft": 1.8435088396072388, "losses/total": 0.1531893014907837, "ref_logps/chosen": -37.59951400756836, "ref_logps/rejected": -44.7179069519043, "rewards/accuracies": 1.0, "rewards/chosen": -1.9343163967132568, "rewards/margins": 2.0269503593444824, "rewards/rejected": -3.9612669944763184, "step": 2330 }, { "epoch": 2.2, "grad_norm": 19.033321380615234, "learning_rate": 1.4795383001049317e-07, "logps/chosen": -55.84258270263672, "logps/rejected": -97.86953735351562, "loss": 0.2389, "losses/dpo": 0.44569632411003113, "losses/sft": 2.1921885013580322, "losses/total": 0.44569632411003113, "ref_logps/chosen": -36.10467529296875, "ref_logps/rejected": -53.268577575683594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9737910032272339, "rewards/margins": 2.486304521560669, "rewards/rejected": -4.460095405578613, "step": 2331 }, { "epoch": 2.2, "grad_norm": 26.962873458862305, "learning_rate": 1.477789436866037e-07, "logps/chosen": -53.0608024597168, "logps/rejected": -69.47300720214844, "loss": 0.3438, "losses/dpo": 0.6293489933013916, "losses/sft": 2.8936233520507812, "losses/total": 0.6293489933013916, "ref_logps/chosen": -35.521522521972656, "ref_logps/rejected": -37.66744613647461, "rewards/accuracies": 0.75, "rewards/chosen": -1.7539280652999878, "rewards/margins": 1.4266278743743896, "rewards/rejected": -3.180555820465088, "step": 2332 }, { "epoch": 2.2, "grad_norm": 17.8859806060791, "learning_rate": 1.4760405736271422e-07, "logps/chosen": -49.18883514404297, "logps/rejected": -88.13423156738281, "loss": 0.202, "losses/dpo": 0.19137877225875854, "losses/sft": 2.218813180923462, "losses/total": 0.19137877225875854, "ref_logps/chosen": -27.740039825439453, "ref_logps/rejected": -39.538360595703125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1448793411254883, "rewards/margins": 2.714707851409912, "rewards/rejected": -4.8595871925354, "step": 2333 }, { "epoch": 2.2, "grad_norm": 23.893566131591797, "learning_rate": 1.4742917103882476e-07, "logps/chosen": -57.67502975463867, "logps/rejected": -82.77272033691406, "loss": 0.2789, "losses/dpo": 0.17580771446228027, "losses/sft": 2.105759382247925, "losses/total": 0.17580771446228027, "ref_logps/chosen": -38.39775085449219, "ref_logps/rejected": -43.19354248046875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9277279376983643, "rewards/margins": 2.0301897525787354, "rewards/rejected": -3.9579176902770996, "step": 2334 }, { "epoch": 2.2, "grad_norm": 16.83787727355957, "learning_rate": 1.4725428471493527e-07, "logps/chosen": -60.886390686035156, "logps/rejected": -92.79360961914062, "loss": 0.1671, "losses/dpo": 0.2394506186246872, "losses/sft": 1.7804149389266968, "losses/total": 0.2394506186246872, "ref_logps/chosen": -40.23788070678711, "ref_logps/rejected": -47.62163543701172, "rewards/accuracies": 1.0, "rewards/chosen": -2.0648512840270996, "rewards/margins": 2.4523463249206543, "rewards/rejected": -4.517197608947754, "step": 2335 }, { "epoch": 2.21, "grad_norm": 22.526973724365234, "learning_rate": 1.470793983910458e-07, "logps/chosen": -62.29762268066406, "logps/rejected": -85.05091094970703, "loss": 0.2611, "losses/dpo": 0.11467424035072327, "losses/sft": 1.7647268772125244, "losses/total": 0.11467424035072327, "ref_logps/chosen": -40.084190368652344, "ref_logps/rejected": -41.87248992919922, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2213430404663086, "rewards/margins": 2.096498966217041, "rewards/rejected": -4.317842483520508, "step": 2336 }, { "epoch": 2.21, "grad_norm": 12.491043090820312, "learning_rate": 1.4690451206715635e-07, "logps/chosen": -52.1961669921875, "logps/rejected": -92.15484619140625, "loss": 0.0957, "losses/dpo": 0.10348942875862122, "losses/sft": 2.396301507949829, "losses/total": 0.10348942875862122, "ref_logps/chosen": -31.215700149536133, "ref_logps/rejected": -45.03858947753906, "rewards/accuracies": 1.0, "rewards/chosen": -2.09804630279541, "rewards/margins": 2.613579511642456, "rewards/rejected": -4.711626052856445, "step": 2337 }, { "epoch": 2.21, "grad_norm": 18.863603591918945, "learning_rate": 1.4672962574326687e-07, "logps/chosen": -46.332733154296875, "logps/rejected": -79.79254150390625, "loss": 0.2002, "losses/dpo": 0.18961940705776215, "losses/sft": 2.0179660320281982, "losses/total": 0.18961940705776215, "ref_logps/chosen": -31.654754638671875, "ref_logps/rejected": -40.82588195800781, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4677982330322266, "rewards/margins": 2.4288666248321533, "rewards/rejected": -3.896665096282959, "step": 2338 }, { "epoch": 2.21, "grad_norm": 25.21454620361328, "learning_rate": 1.465547394193774e-07, "logps/chosen": -65.52156066894531, "logps/rejected": -100.70330047607422, "loss": 0.2241, "losses/dpo": 0.26555219292640686, "losses/sft": 1.5325287580490112, "losses/total": 0.26555219292640686, "ref_logps/chosen": -36.212493896484375, "ref_logps/rejected": -48.46067810058594, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9309067726135254, "rewards/margins": 2.293355703353882, "rewards/rejected": -5.224262237548828, "step": 2339 }, { "epoch": 2.21, "grad_norm": 20.153215408325195, "learning_rate": 1.4637985309548792e-07, "logps/chosen": -44.94344711303711, "logps/rejected": -74.34300231933594, "loss": 0.2596, "losses/dpo": 0.20611099898815155, "losses/sft": 1.7320812940597534, "losses/total": 0.20611099898815155, "ref_logps/chosen": -30.651456832885742, "ref_logps/rejected": -37.26879119873047, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4291990995407104, "rewards/margins": 2.27822208404541, "rewards/rejected": -3.70742130279541, "step": 2340 }, { "epoch": 2.21, "grad_norm": 20.724573135375977, "learning_rate": 1.4620496677159846e-07, "logps/chosen": -61.433876037597656, "logps/rejected": -94.84131622314453, "loss": 0.2727, "losses/dpo": 0.18200325965881348, "losses/sft": 2.288672924041748, "losses/total": 0.18200325965881348, "ref_logps/chosen": -39.890411376953125, "ref_logps/rejected": -47.58531188964844, "rewards/accuracies": 0.9375, "rewards/chosen": -2.154345989227295, "rewards/margins": 2.5712552070617676, "rewards/rejected": -4.725600719451904, "step": 2341 }, { "epoch": 2.21, "grad_norm": 17.30550765991211, "learning_rate": 1.4603008044770897e-07, "logps/chosen": -43.660831451416016, "logps/rejected": -93.08619689941406, "loss": 0.204, "losses/dpo": 0.06266417354345322, "losses/sft": 1.5552688837051392, "losses/total": 0.06266417354345322, "ref_logps/chosen": -27.48443031311035, "ref_logps/rejected": -51.1119270324707, "rewards/accuracies": 0.875, "rewards/chosen": -1.6176401376724243, "rewards/margins": 2.5797863006591797, "rewards/rejected": -4.197426795959473, "step": 2342 }, { "epoch": 2.21, "grad_norm": 18.430200576782227, "learning_rate": 1.458551941238195e-07, "logps/chosen": -49.0999755859375, "logps/rejected": -91.00870513916016, "loss": 0.1564, "losses/dpo": 0.10836179554462433, "losses/sft": 1.6327718496322632, "losses/total": 0.10836179554462433, "ref_logps/chosen": -32.151344299316406, "ref_logps/rejected": -47.25897216796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.6948630809783936, "rewards/margins": 2.680109977722168, "rewards/rejected": -4.374973297119141, "step": 2343 }, { "epoch": 2.21, "grad_norm": 22.52871322631836, "learning_rate": 1.4568030779993005e-07, "logps/chosen": -63.45011520385742, "logps/rejected": -87.78423309326172, "loss": 0.2481, "losses/dpo": 0.36345982551574707, "losses/sft": 3.078723669052124, "losses/total": 0.36345982551574707, "ref_logps/chosen": -43.00407791137695, "ref_logps/rejected": -46.24573516845703, "rewards/accuracies": 0.875, "rewards/chosen": -2.0446040630340576, "rewards/margins": 2.109245777130127, "rewards/rejected": -4.1538496017456055, "step": 2344 }, { "epoch": 2.21, "grad_norm": 24.400203704833984, "learning_rate": 1.4550542147604056e-07, "logps/chosen": -53.13116455078125, "logps/rejected": -88.30104064941406, "loss": 0.22, "losses/dpo": 0.12786290049552917, "losses/sft": 2.163546562194824, "losses/total": 0.12786290049552917, "ref_logps/chosen": -34.274723052978516, "ref_logps/rejected": -46.45326614379883, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8856441974639893, "rewards/margins": 2.29913330078125, "rewards/rejected": -4.18477725982666, "step": 2345 }, { "epoch": 2.22, "grad_norm": 18.022647857666016, "learning_rate": 1.453305351521511e-07, "logps/chosen": -43.55058288574219, "logps/rejected": -78.72930908203125, "loss": 0.171, "losses/dpo": 0.03637959808111191, "losses/sft": 2.0351409912109375, "losses/total": 0.03637959808111191, "ref_logps/chosen": -30.565048217773438, "ref_logps/rejected": -40.70380401611328, "rewards/accuracies": 1.0, "rewards/chosen": -1.298553228378296, "rewards/margins": 2.503997564315796, "rewards/rejected": -3.8025505542755127, "step": 2346 }, { "epoch": 2.22, "grad_norm": 32.07356643676758, "learning_rate": 1.451556488282616e-07, "logps/chosen": -61.853782653808594, "logps/rejected": -83.10594177246094, "loss": 0.3792, "losses/dpo": 0.44469523429870605, "losses/sft": 2.6296372413635254, "losses/total": 0.44469523429870605, "ref_logps/chosen": -36.15470504760742, "ref_logps/rejected": -42.85236740112305, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5699076652526855, "rewards/margins": 1.455450177192688, "rewards/rejected": -4.025358200073242, "step": 2347 }, { "epoch": 2.22, "grad_norm": 16.023487091064453, "learning_rate": 1.4498076250437215e-07, "logps/chosen": -55.09426498413086, "logps/rejected": -93.76437377929688, "loss": 0.1763, "losses/dpo": 0.07367486506700516, "losses/sft": 2.2977771759033203, "losses/total": 0.07367486506700516, "ref_logps/chosen": -34.59999084472656, "ref_logps/rejected": -49.805213928222656, "rewards/accuracies": 0.9375, "rewards/chosen": -2.049427032470703, "rewards/margins": 2.3464887142181396, "rewards/rejected": -4.395915508270264, "step": 2348 }, { "epoch": 2.22, "grad_norm": 16.10032844543457, "learning_rate": 1.448058761804827e-07, "logps/chosen": -58.19569396972656, "logps/rejected": -78.85516357421875, "loss": 0.2249, "losses/dpo": 0.41694408655166626, "losses/sft": 1.5890512466430664, "losses/total": 0.41694408655166626, "ref_logps/chosen": -43.76419448852539, "ref_logps/rejected": -40.66889572143555, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4431498050689697, "rewards/margins": 2.3754773139953613, "rewards/rejected": -3.818627119064331, "step": 2349 }, { "epoch": 2.22, "grad_norm": 17.756059646606445, "learning_rate": 1.446309898565932e-07, "logps/chosen": -58.808738708496094, "logps/rejected": -93.82316589355469, "loss": 0.1624, "losses/dpo": 0.36172622442245483, "losses/sft": 2.4447214603424072, "losses/total": 0.36172622442245483, "ref_logps/chosen": -40.85578536987305, "ref_logps/rejected": -51.608863830566406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7952957153320312, "rewards/margins": 2.4261350631713867, "rewards/rejected": -4.221430778503418, "step": 2350 }, { "epoch": 2.22, "grad_norm": 22.392757415771484, "learning_rate": 1.4445610353270374e-07, "logps/chosen": -39.621299743652344, "logps/rejected": -76.97991943359375, "loss": 0.271, "losses/dpo": 0.08346450328826904, "losses/sft": 1.575729489326477, "losses/total": 0.08346450328826904, "ref_logps/chosen": -23.799175262451172, "ref_logps/rejected": -36.59522247314453, "rewards/accuracies": 0.875, "rewards/chosen": -1.582212209701538, "rewards/margins": 2.4562578201293945, "rewards/rejected": -4.038470268249512, "step": 2351 }, { "epoch": 2.22, "grad_norm": 11.101428031921387, "learning_rate": 1.4428121720881425e-07, "logps/chosen": -46.938114166259766, "logps/rejected": -87.33726501464844, "loss": 0.0978, "losses/dpo": 0.07818686217069626, "losses/sft": 2.0432565212249756, "losses/total": 0.07818686217069626, "ref_logps/chosen": -34.40842819213867, "ref_logps/rejected": -44.21738815307617, "rewards/accuracies": 1.0, "rewards/chosen": -1.2529685497283936, "rewards/margins": 3.059019088745117, "rewards/rejected": -4.31198787689209, "step": 2352 }, { "epoch": 2.22, "grad_norm": 18.5439395904541, "learning_rate": 1.441063308849248e-07, "logps/chosen": -54.774200439453125, "logps/rejected": -93.36119079589844, "loss": 0.1625, "losses/dpo": 0.08087873458862305, "losses/sft": 2.0139360427856445, "losses/total": 0.08087873458862305, "ref_logps/chosen": -33.428253173828125, "ref_logps/rejected": -44.51591110229492, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1345949172973633, "rewards/margins": 2.7499332427978516, "rewards/rejected": -4.884528160095215, "step": 2353 }, { "epoch": 2.22, "grad_norm": 29.636119842529297, "learning_rate": 1.439314445610353e-07, "logps/chosen": -71.17277526855469, "logps/rejected": -88.97742462158203, "loss": 0.341, "losses/dpo": 0.40482181310653687, "losses/sft": 2.618036985397339, "losses/total": 0.40482181310653687, "ref_logps/chosen": -44.85466766357422, "ref_logps/rejected": -45.272884368896484, "rewards/accuracies": 0.8125, "rewards/chosen": -2.631810188293457, "rewards/margins": 1.7386436462402344, "rewards/rejected": -4.370453834533691, "step": 2354 }, { "epoch": 2.22, "grad_norm": 31.361713409423828, "learning_rate": 1.4375655823714585e-07, "logps/chosen": -63.979164123535156, "logps/rejected": -75.9061050415039, "loss": 0.3415, "losses/dpo": 0.20038992166519165, "losses/sft": 2.0793874263763428, "losses/total": 0.20038992166519165, "ref_logps/chosen": -43.39042282104492, "ref_logps/rejected": -38.34117889404297, "rewards/accuracies": 0.875, "rewards/chosen": -2.0588741302490234, "rewards/margins": 1.6976183652877808, "rewards/rejected": -3.7564926147460938, "step": 2355 }, { "epoch": 2.22, "grad_norm": 28.45371437072754, "learning_rate": 1.4358167191325638e-07, "logps/chosen": -51.93365478515625, "logps/rejected": -75.81988525390625, "loss": 0.3713, "losses/dpo": 0.4351145029067993, "losses/sft": 2.2038331031799316, "losses/total": 0.4351145029067993, "ref_logps/chosen": -32.08757019042969, "ref_logps/rejected": -40.00370788574219, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9846086502075195, "rewards/margins": 1.5970089435577393, "rewards/rejected": -3.581617593765259, "step": 2356 }, { "epoch": 2.23, "grad_norm": 15.803486824035645, "learning_rate": 1.4340678558936692e-07, "logps/chosen": -48.984432220458984, "logps/rejected": -86.11408233642578, "loss": 0.1101, "losses/dpo": 0.014929844997823238, "losses/sft": 1.5590956211090088, "losses/total": 0.014929844997823238, "ref_logps/chosen": -37.49921417236328, "ref_logps/rejected": -46.27743148803711, "rewards/accuracies": 1.0, "rewards/chosen": -1.148522138595581, "rewards/margins": 2.8351430892944336, "rewards/rejected": -3.9836652278900146, "step": 2357 }, { "epoch": 2.23, "grad_norm": 22.702421188354492, "learning_rate": 1.4323189926547744e-07, "logps/chosen": -51.1621208190918, "logps/rejected": -63.68791961669922, "loss": 0.315, "losses/dpo": 0.33401402831077576, "losses/sft": 1.608689308166504, "losses/total": 0.33401402831077576, "ref_logps/chosen": -36.11867904663086, "ref_logps/rejected": -29.403697967529297, "rewards/accuracies": 0.875, "rewards/chosen": -1.5043443441390991, "rewards/margins": 1.9240778684616089, "rewards/rejected": -3.428422451019287, "step": 2358 }, { "epoch": 2.23, "grad_norm": 20.503326416015625, "learning_rate": 1.4305701294158795e-07, "logps/chosen": -56.55320739746094, "logps/rejected": -74.6923599243164, "loss": 0.1809, "losses/dpo": 0.2541070878505707, "losses/sft": 1.267479658126831, "losses/total": 0.2541070878505707, "ref_logps/chosen": -37.958709716796875, "ref_logps/rejected": -35.907806396484375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.859449863433838, "rewards/margins": 2.01900577545166, "rewards/rejected": -3.878455638885498, "step": 2359 }, { "epoch": 2.23, "grad_norm": 24.33203887939453, "learning_rate": 1.428821266176985e-07, "logps/chosen": -61.42462158203125, "logps/rejected": -82.93911743164062, "loss": 0.2563, "losses/dpo": 0.2705605626106262, "losses/sft": 2.0370235443115234, "losses/total": 0.2705605626106262, "ref_logps/chosen": -42.64265823364258, "ref_logps/rejected": -45.80293273925781, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8781962394714355, "rewards/margins": 1.8354219198226929, "rewards/rejected": -3.713618278503418, "step": 2360 }, { "epoch": 2.23, "grad_norm": 25.733675003051758, "learning_rate": 1.42707240293809e-07, "logps/chosen": -53.19091033935547, "logps/rejected": -80.37325286865234, "loss": 0.3386, "losses/dpo": 0.33262988924980164, "losses/sft": 2.1511356830596924, "losses/total": 0.33262988924980164, "ref_logps/chosen": -36.03321075439453, "ref_logps/rejected": -45.60382843017578, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7157700061798096, "rewards/margins": 1.7611720561981201, "rewards/rejected": -3.4769420623779297, "step": 2361 }, { "epoch": 2.23, "grad_norm": 21.893434524536133, "learning_rate": 1.4253235396991957e-07, "logps/chosen": -46.79673767089844, "logps/rejected": -77.75418090820312, "loss": 0.2763, "losses/dpo": 0.23092472553253174, "losses/sft": 2.5948309898376465, "losses/total": 0.23092472553253174, "ref_logps/chosen": -31.810062408447266, "ref_logps/rejected": -43.440086364746094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4986672401428223, "rewards/margins": 1.932742953300476, "rewards/rejected": -3.431410312652588, "step": 2362 }, { "epoch": 2.23, "grad_norm": 31.391372680664062, "learning_rate": 1.4235746764603008e-07, "logps/chosen": -51.307395935058594, "logps/rejected": -78.97651672363281, "loss": 0.289, "losses/dpo": 0.30398237705230713, "losses/sft": 1.74521803855896, "losses/total": 0.30398237705230713, "ref_logps/chosen": -35.805572509765625, "ref_logps/rejected": -42.899356842041016, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5501821041107178, "rewards/margins": 2.057533025741577, "rewards/rejected": -3.607715368270874, "step": 2363 }, { "epoch": 2.23, "grad_norm": 9.573360443115234, "learning_rate": 1.4218258132214062e-07, "logps/chosen": -56.36796569824219, "logps/rejected": -96.30144500732422, "loss": 0.1066, "losses/dpo": 0.18614837527275085, "losses/sft": 2.1442477703094482, "losses/total": 0.18614837527275085, "ref_logps/chosen": -38.60167694091797, "ref_logps/rejected": -52.30154037475586, "rewards/accuracies": 1.0, "rewards/chosen": -1.7766289710998535, "rewards/margins": 2.6233623027801514, "rewards/rejected": -4.399991035461426, "step": 2364 }, { "epoch": 2.23, "grad_norm": 12.25535774230957, "learning_rate": 1.4200769499825113e-07, "logps/chosen": -57.207275390625, "logps/rejected": -87.8372802734375, "loss": 0.1169, "losses/dpo": 0.1619815230369568, "losses/sft": 2.0699262619018555, "losses/total": 0.1619815230369568, "ref_logps/chosen": -42.47515106201172, "ref_logps/rejected": -45.31751251220703, "rewards/accuracies": 1.0, "rewards/chosen": -1.473212718963623, "rewards/margins": 2.778763771057129, "rewards/rejected": -4.251976490020752, "step": 2365 }, { "epoch": 2.23, "grad_norm": 21.626710891723633, "learning_rate": 1.4183280867436164e-07, "logps/chosen": -50.645103454589844, "logps/rejected": -66.97547912597656, "loss": 0.2146, "losses/dpo": 0.21027088165283203, "losses/sft": 1.540096640586853, "losses/total": 0.21027088165283203, "ref_logps/chosen": -31.896881103515625, "ref_logps/rejected": -30.79705238342285, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8748222589492798, "rewards/margins": 1.7430202960968018, "rewards/rejected": -3.617842674255371, "step": 2366 }, { "epoch": 2.24, "grad_norm": 31.71278190612793, "learning_rate": 1.4165792235047218e-07, "logps/chosen": -53.707672119140625, "logps/rejected": -86.32237243652344, "loss": 0.3126, "losses/dpo": 0.06811884790658951, "losses/sft": 2.075042486190796, "losses/total": 0.06811884790658951, "ref_logps/chosen": -35.07553482055664, "ref_logps/rejected": -44.44194793701172, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8632137775421143, "rewards/margins": 2.3248291015625, "rewards/rejected": -4.188042640686035, "step": 2367 }, { "epoch": 2.24, "grad_norm": 36.270774841308594, "learning_rate": 1.4148303602658272e-07, "logps/chosen": -59.30921173095703, "logps/rejected": -73.14798736572266, "loss": 0.491, "losses/dpo": 0.2740963399410248, "losses/sft": 2.3004026412963867, "losses/total": 0.2740963399410248, "ref_logps/chosen": -34.68144607543945, "ref_logps/rejected": -36.50506591796875, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4627766609191895, "rewards/margins": 1.2015153169631958, "rewards/rejected": -3.6642918586730957, "step": 2368 }, { "epoch": 2.24, "grad_norm": 25.790407180786133, "learning_rate": 1.4130814970269326e-07, "logps/chosen": -50.60401916503906, "logps/rejected": -75.64742279052734, "loss": 0.2233, "losses/dpo": 0.2719334661960602, "losses/sft": 2.431874990463257, "losses/total": 0.2719334661960602, "ref_logps/chosen": -33.43584442138672, "ref_logps/rejected": -35.54055404663086, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7168177366256714, "rewards/margins": 2.2938692569732666, "rewards/rejected": -4.010686874389648, "step": 2369 }, { "epoch": 2.24, "grad_norm": 21.304140090942383, "learning_rate": 1.4113326337880377e-07, "logps/chosen": -48.10052490234375, "logps/rejected": -89.85772705078125, "loss": 0.1887, "losses/dpo": 0.12991724908351898, "losses/sft": 1.800195574760437, "losses/total": 0.12991724908351898, "ref_logps/chosen": -32.903175354003906, "ref_logps/rejected": -51.25528335571289, "rewards/accuracies": 1.0, "rewards/chosen": -1.519735336303711, "rewards/margins": 2.3405098915100098, "rewards/rejected": -3.8602449893951416, "step": 2370 }, { "epoch": 2.24, "grad_norm": 21.90868377685547, "learning_rate": 1.409583770549143e-07, "logps/chosen": -55.55950164794922, "logps/rejected": -85.34005737304688, "loss": 0.3056, "losses/dpo": 0.6460038423538208, "losses/sft": 1.9490422010421753, "losses/total": 0.6460038423538208, "ref_logps/chosen": -33.34410095214844, "ref_logps/rejected": -44.41395568847656, "rewards/accuracies": 0.875, "rewards/chosen": -2.2215399742126465, "rewards/margins": 1.871070384979248, "rewards/rejected": -4.0926103591918945, "step": 2371 }, { "epoch": 2.24, "grad_norm": 35.088531494140625, "learning_rate": 1.4078349073102483e-07, "logps/chosen": -60.84961700439453, "logps/rejected": -65.9256591796875, "loss": 0.4156, "losses/dpo": 0.5359123945236206, "losses/sft": 3.1174604892730713, "losses/total": 0.5359123945236206, "ref_logps/chosen": -38.57980728149414, "ref_logps/rejected": -30.240612030029297, "rewards/accuracies": 0.875, "rewards/chosen": -2.226980686187744, "rewards/margins": 1.3415242433547974, "rewards/rejected": -3.568504810333252, "step": 2372 }, { "epoch": 2.24, "grad_norm": 24.721359252929688, "learning_rate": 1.4060860440713534e-07, "logps/chosen": -74.81805419921875, "logps/rejected": -91.27822875976562, "loss": 0.2603, "losses/dpo": 0.43100637197494507, "losses/sft": 2.326233148574829, "losses/total": 0.43100637197494507, "ref_logps/chosen": -49.39601135253906, "ref_logps/rejected": -43.19419860839844, "rewards/accuracies": 0.875, "rewards/chosen": -2.5422043800354004, "rewards/margins": 2.2661991119384766, "rewards/rejected": -4.808403491973877, "step": 2373 }, { "epoch": 2.24, "grad_norm": 33.49697494506836, "learning_rate": 1.4043371808324588e-07, "logps/chosen": -46.592891693115234, "logps/rejected": -66.65484619140625, "loss": 0.412, "losses/dpo": 0.2121744453907013, "losses/sft": 1.344349980354309, "losses/total": 0.2121744453907013, "ref_logps/chosen": -31.860713958740234, "ref_logps/rejected": -37.57476043701172, "rewards/accuracies": 0.75, "rewards/chosen": -1.4732177257537842, "rewards/margins": 1.4347916841506958, "rewards/rejected": -2.9080092906951904, "step": 2374 }, { "epoch": 2.24, "grad_norm": 18.769556045532227, "learning_rate": 1.4025883175935642e-07, "logps/chosen": -58.47972869873047, "logps/rejected": -79.76788330078125, "loss": 0.1919, "losses/dpo": 0.21788561344146729, "losses/sft": 2.000558376312256, "losses/total": 0.21788561344146729, "ref_logps/chosen": -42.12378692626953, "ref_logps/rejected": -41.602020263671875, "rewards/accuracies": 1.0, "rewards/chosen": -1.635594129562378, "rewards/margins": 2.1809921264648438, "rewards/rejected": -3.8165862560272217, "step": 2375 }, { "epoch": 2.24, "grad_norm": 22.678112030029297, "learning_rate": 1.4008394543546696e-07, "logps/chosen": -46.83934020996094, "logps/rejected": -74.54325866699219, "loss": 0.2744, "losses/dpo": 0.034999292343854904, "losses/sft": 1.8867835998535156, "losses/total": 0.034999292343854904, "ref_logps/chosen": -30.624225616455078, "ref_logps/rejected": -38.76499938964844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.621511697769165, "rewards/margins": 1.9563138484954834, "rewards/rejected": -3.5778255462646484, "step": 2376 }, { "epoch": 2.24, "grad_norm": 21.544294357299805, "learning_rate": 1.3990905911157747e-07, "logps/chosen": -58.8060302734375, "logps/rejected": -82.62645721435547, "loss": 0.2119, "losses/dpo": 0.2927456498146057, "losses/sft": 2.033172607421875, "losses/total": 0.2927456498146057, "ref_logps/chosen": -37.91547393798828, "ref_logps/rejected": -43.22669219970703, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0890557765960693, "rewards/margins": 1.850920557975769, "rewards/rejected": -3.939976453781128, "step": 2377 }, { "epoch": 2.25, "grad_norm": 22.206777572631836, "learning_rate": 1.39734172787688e-07, "logps/chosen": -55.01869201660156, "logps/rejected": -75.8746109008789, "loss": 0.2801, "losses/dpo": 0.16016580164432526, "losses/sft": 2.433382034301758, "losses/total": 0.16016580164432526, "ref_logps/chosen": -32.20459747314453, "ref_logps/rejected": -33.72629165649414, "rewards/accuracies": 0.875, "rewards/chosen": -2.281409740447998, "rewards/margins": 1.9334220886230469, "rewards/rejected": -4.214831829071045, "step": 2378 }, { "epoch": 2.25, "grad_norm": 16.628137588500977, "learning_rate": 1.3955928646379852e-07, "logps/chosen": -52.125919342041016, "logps/rejected": -88.80117797851562, "loss": 0.1552, "losses/dpo": 0.12228988856077194, "losses/sft": 1.8678429126739502, "losses/total": 0.12228988856077194, "ref_logps/chosen": -30.66082000732422, "ref_logps/rejected": -42.06886291503906, "rewards/accuracies": 1.0, "rewards/chosen": -2.146510124206543, "rewards/margins": 2.5267210006713867, "rewards/rejected": -4.6732306480407715, "step": 2379 }, { "epoch": 2.25, "grad_norm": 28.080888748168945, "learning_rate": 1.3938440013990903e-07, "logps/chosen": -56.184120178222656, "logps/rejected": -83.13200378417969, "loss": 0.3831, "losses/dpo": 0.04662206023931503, "losses/sft": 2.3052868843078613, "losses/total": 0.04662206023931503, "ref_logps/chosen": -35.02573776245117, "ref_logps/rejected": -39.637577056884766, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1158385276794434, "rewards/margins": 2.2336044311523438, "rewards/rejected": -4.349442481994629, "step": 2380 }, { "epoch": 2.25, "grad_norm": 19.871089935302734, "learning_rate": 1.392095138160196e-07, "logps/chosen": -62.149200439453125, "logps/rejected": -101.46636962890625, "loss": 0.1846, "losses/dpo": 0.22456835210323334, "losses/sft": 2.095123767852783, "losses/total": 0.22456835210323334, "ref_logps/chosen": -41.12150192260742, "ref_logps/rejected": -53.73661422729492, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1027703285217285, "rewards/margins": 2.6702051162719727, "rewards/rejected": -4.772974967956543, "step": 2381 }, { "epoch": 2.25, "grad_norm": 32.6857795715332, "learning_rate": 1.390346274921301e-07, "logps/chosen": -49.76288604736328, "logps/rejected": -80.27776336669922, "loss": 0.3087, "losses/dpo": 0.052292875945568085, "losses/sft": 1.8107963800430298, "losses/total": 0.052292875945568085, "ref_logps/chosen": -31.52740478515625, "ref_logps/rejected": -39.43199920654297, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8235479593276978, "rewards/margins": 2.261028528213501, "rewards/rejected": -4.084576606750488, "step": 2382 }, { "epoch": 2.25, "grad_norm": 11.212374687194824, "learning_rate": 1.3885974116824065e-07, "logps/chosen": -48.82699966430664, "logps/rejected": -77.65071105957031, "loss": 0.1289, "losses/dpo": 0.1088331937789917, "losses/sft": 1.497929573059082, "losses/total": 0.1088331937789917, "ref_logps/chosen": -35.838314056396484, "ref_logps/rejected": -37.895538330078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.2988685369491577, "rewards/margins": 2.676649570465088, "rewards/rejected": -3.975518226623535, "step": 2383 }, { "epoch": 2.25, "grad_norm": 24.601694107055664, "learning_rate": 1.3868485484435116e-07, "logps/chosen": -56.765804290771484, "logps/rejected": -81.39866638183594, "loss": 0.2784, "losses/dpo": 0.2070329636335373, "losses/sft": 2.3070905208587646, "losses/total": 0.2070329636335373, "ref_logps/chosen": -37.4853515625, "ref_logps/rejected": -43.59917449951172, "rewards/accuracies": 0.875, "rewards/chosen": -1.9280447959899902, "rewards/margins": 1.851904273033142, "rewards/rejected": -3.779949188232422, "step": 2384 }, { "epoch": 2.25, "grad_norm": 22.381103515625, "learning_rate": 1.385099685204617e-07, "logps/chosen": -51.57588195800781, "logps/rejected": -84.54014587402344, "loss": 0.2206, "losses/dpo": 0.3153095245361328, "losses/sft": 2.236558437347412, "losses/total": 0.3153095245361328, "ref_logps/chosen": -31.67160415649414, "ref_logps/rejected": -43.47623062133789, "rewards/accuracies": 0.9375, "rewards/chosen": -1.990427851676941, "rewards/margins": 2.115964412689209, "rewards/rejected": -4.106391906738281, "step": 2385 }, { "epoch": 2.25, "grad_norm": 12.015800476074219, "learning_rate": 1.3833508219657221e-07, "logps/chosen": -64.45236206054688, "logps/rejected": -99.07717895507812, "loss": 0.1092, "losses/dpo": 0.09402421116828918, "losses/sft": 2.278538465499878, "losses/total": 0.09402421116828918, "ref_logps/chosen": -44.67734146118164, "ref_logps/rejected": -53.46605682373047, "rewards/accuracies": 1.0, "rewards/chosen": -1.9775025844573975, "rewards/margins": 2.5836098194122314, "rewards/rejected": -4.561112403869629, "step": 2386 }, { "epoch": 2.25, "grad_norm": 29.570575714111328, "learning_rate": 1.3816019587268273e-07, "logps/chosen": -59.985172271728516, "logps/rejected": -72.76171112060547, "loss": 0.3698, "losses/dpo": 0.46376681327819824, "losses/sft": 2.2356157302856445, "losses/total": 0.46376681327819824, "ref_logps/chosen": -41.023773193359375, "ref_logps/rejected": -40.67151641845703, "rewards/accuracies": 0.875, "rewards/chosen": -1.8961400985717773, "rewards/margins": 1.3128788471221924, "rewards/rejected": -3.2090187072753906, "step": 2387 }, { "epoch": 2.25, "grad_norm": 25.337995529174805, "learning_rate": 1.379853095487933e-07, "logps/chosen": -60.974884033203125, "logps/rejected": -80.356689453125, "loss": 0.2601, "losses/dpo": 0.15893080830574036, "losses/sft": 2.2057037353515625, "losses/total": 0.15893080830574036, "ref_logps/chosen": -43.78391647338867, "ref_logps/rejected": -43.60730743408203, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7190968990325928, "rewards/margins": 1.9558415412902832, "rewards/rejected": -3.674938440322876, "step": 2388 }, { "epoch": 2.26, "grad_norm": 37.78849411010742, "learning_rate": 1.378104232249038e-07, "logps/chosen": -61.546714782714844, "logps/rejected": -69.55838775634766, "loss": 0.5174, "losses/dpo": 0.7308023571968079, "losses/sft": 2.1762781143188477, "losses/total": 0.7308023571968079, "ref_logps/chosen": -35.81056594848633, "ref_logps/rejected": -31.574665069580078, "rewards/accuracies": 0.8125, "rewards/chosen": -2.573615074157715, "rewards/margins": 1.2247569561004639, "rewards/rejected": -3.7983717918395996, "step": 2389 }, { "epoch": 2.26, "grad_norm": 32.76581954956055, "learning_rate": 1.3763553690101434e-07, "logps/chosen": -62.23737716674805, "logps/rejected": -102.82528686523438, "loss": 0.2494, "losses/dpo": 0.10765311121940613, "losses/sft": 2.1158807277679443, "losses/total": 0.10765311121940613, "ref_logps/chosen": -40.06000518798828, "ref_logps/rejected": -56.96181869506836, "rewards/accuracies": 0.875, "rewards/chosen": -2.2177374362945557, "rewards/margins": 2.3686089515686035, "rewards/rejected": -4.586346626281738, "step": 2390 }, { "epoch": 2.26, "grad_norm": 17.851558685302734, "learning_rate": 1.3746065057712486e-07, "logps/chosen": -54.43299102783203, "logps/rejected": -95.2369613647461, "loss": 0.1583, "losses/dpo": 0.08966552466154099, "losses/sft": 1.845613956451416, "losses/total": 0.08966552466154099, "ref_logps/chosen": -34.57926940917969, "ref_logps/rejected": -48.206199645996094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9853723049163818, "rewards/margins": 2.7177042961120605, "rewards/rejected": -4.7030768394470215, "step": 2391 }, { "epoch": 2.26, "grad_norm": 23.088077545166016, "learning_rate": 1.372857642532354e-07, "logps/chosen": -51.7119026184082, "logps/rejected": -79.51332092285156, "loss": 0.3184, "losses/dpo": 0.5947718024253845, "losses/sft": 1.8849107027053833, "losses/total": 0.5947718024253845, "ref_logps/chosen": -34.84232711791992, "ref_logps/rejected": -42.829750061035156, "rewards/accuracies": 0.875, "rewards/chosen": -1.6869573593139648, "rewards/margins": 1.9813995361328125, "rewards/rejected": -3.6683568954467773, "step": 2392 }, { "epoch": 2.26, "grad_norm": 27.97321128845215, "learning_rate": 1.371108779293459e-07, "logps/chosen": -60.52677917480469, "logps/rejected": -85.35513305664062, "loss": 0.2969, "losses/dpo": 0.27566102147102356, "losses/sft": 1.8344444036483765, "losses/total": 0.27566102147102356, "ref_logps/chosen": -41.73499298095703, "ref_logps/rejected": -47.36833953857422, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8791784048080444, "rewards/margins": 1.9195013046264648, "rewards/rejected": -3.798679828643799, "step": 2393 }, { "epoch": 2.26, "grad_norm": 24.059640884399414, "learning_rate": 1.3693599160545645e-07, "logps/chosen": -64.58656311035156, "logps/rejected": -81.40239715576172, "loss": 0.2808, "losses/dpo": 0.1934591382741928, "losses/sft": 1.9751237630844116, "losses/total": 0.1934591382741928, "ref_logps/chosen": -43.479068756103516, "ref_logps/rejected": -39.68476104736328, "rewards/accuracies": 0.875, "rewards/chosen": -2.1107492446899414, "rewards/margins": 2.06101393699646, "rewards/rejected": -4.1717634201049805, "step": 2394 }, { "epoch": 2.26, "grad_norm": 32.36083221435547, "learning_rate": 1.3676110528156699e-07, "logps/chosen": -58.72663497924805, "logps/rejected": -70.34164428710938, "loss": 0.3824, "losses/dpo": 0.33511149883270264, "losses/sft": 1.4628716707229614, "losses/total": 0.33511149883270264, "ref_logps/chosen": -39.878639221191406, "ref_logps/rejected": -37.69023895263672, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8847999572753906, "rewards/margins": 1.3803400993347168, "rewards/rejected": -3.2651400566101074, "step": 2395 }, { "epoch": 2.26, "grad_norm": 24.17458152770996, "learning_rate": 1.365862189576775e-07, "logps/chosen": -49.89300537109375, "logps/rejected": -68.76500701904297, "loss": 0.3302, "losses/dpo": 0.3179434835910797, "losses/sft": 1.9142531156539917, "losses/total": 0.3179434835910797, "ref_logps/chosen": -33.66688919067383, "ref_logps/rejected": -36.99160385131836, "rewards/accuracies": 0.75, "rewards/chosen": -1.6226119995117188, "rewards/margins": 1.5547285079956055, "rewards/rejected": -3.177340507507324, "step": 2396 }, { "epoch": 2.26, "grad_norm": 14.464322090148926, "learning_rate": 1.3641133263378804e-07, "logps/chosen": -54.700801849365234, "logps/rejected": -98.97001647949219, "loss": 0.1051, "losses/dpo": 0.13569232821464539, "losses/sft": 2.107184886932373, "losses/total": 0.13569232821464539, "ref_logps/chosen": -35.661781311035156, "ref_logps/rejected": -47.221923828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.903902292251587, "rewards/margins": 3.270906448364258, "rewards/rejected": -5.174808979034424, "step": 2397 }, { "epoch": 2.26, "grad_norm": 24.608543395996094, "learning_rate": 1.3623644630989855e-07, "logps/chosen": -56.23943328857422, "logps/rejected": -78.3583755493164, "loss": 0.264, "losses/dpo": 0.2555888593196869, "losses/sft": 2.392543315887451, "losses/total": 0.2555888593196869, "ref_logps/chosen": -36.91911315917969, "ref_logps/rejected": -44.044044494628906, "rewards/accuracies": 1.0, "rewards/chosen": -1.9320323467254639, "rewards/margins": 1.4994008541107178, "rewards/rejected": -3.4314332008361816, "step": 2398 }, { "epoch": 2.27, "grad_norm": 22.063926696777344, "learning_rate": 1.360615599860091e-07, "logps/chosen": -59.61087417602539, "logps/rejected": -84.21319580078125, "loss": 0.2006, "losses/dpo": 0.30503690242767334, "losses/sft": 2.4765117168426514, "losses/total": 0.30503690242767334, "ref_logps/chosen": -41.401397705078125, "ref_logps/rejected": -44.19184875488281, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8209476470947266, "rewards/margins": 2.1811866760253906, "rewards/rejected": -4.002134323120117, "step": 2399 }, { "epoch": 2.27, "grad_norm": 29.38550567626953, "learning_rate": 1.3588667366211963e-07, "logps/chosen": -71.275146484375, "logps/rejected": -89.43241882324219, "loss": 0.2534, "losses/dpo": 0.06601622700691223, "losses/sft": 2.1628527641296387, "losses/total": 0.06601622700691223, "ref_logps/chosen": -51.447181701660156, "ref_logps/rejected": -47.4654426574707, "rewards/accuracies": 0.875, "rewards/chosen": -1.9827961921691895, "rewards/margins": 2.213902473449707, "rewards/rejected": -4.196698188781738, "step": 2400 }, { "epoch": 2.27, "grad_norm": 16.175073623657227, "learning_rate": 1.3571178733823014e-07, "logps/chosen": -59.07041931152344, "logps/rejected": -92.49278259277344, "loss": 0.1621, "losses/dpo": 0.3248475193977356, "losses/sft": 1.802823781967163, "losses/total": 0.3248475193977356, "ref_logps/chosen": -40.613304138183594, "ref_logps/rejected": -49.56654357910156, "rewards/accuracies": 1.0, "rewards/chosen": -1.8457117080688477, "rewards/margins": 2.4469127655029297, "rewards/rejected": -4.292624473571777, "step": 2401 }, { "epoch": 2.27, "grad_norm": 20.280170440673828, "learning_rate": 1.3553690101434068e-07, "logps/chosen": -32.33776092529297, "logps/rejected": -64.39653015136719, "loss": 0.2531, "losses/dpo": 0.3441128134727478, "losses/sft": 1.8245842456817627, "losses/total": 0.3441128134727478, "ref_logps/chosen": -23.20067024230957, "ref_logps/rejected": -34.498069763183594, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9137091040611267, "rewards/margins": 2.076137065887451, "rewards/rejected": -2.9898462295532227, "step": 2402 }, { "epoch": 2.27, "grad_norm": 18.15727424621582, "learning_rate": 1.353620146904512e-07, "logps/chosen": -64.0350341796875, "logps/rejected": -92.22100830078125, "loss": 0.184, "losses/dpo": 0.1796242594718933, "losses/sft": 2.6987340450286865, "losses/total": 0.1796242594718933, "ref_logps/chosen": -40.579315185546875, "ref_logps/rejected": -46.809898376464844, "rewards/accuracies": 0.9375, "rewards/chosen": -2.345571517944336, "rewards/margins": 2.195540428161621, "rewards/rejected": -4.541111946105957, "step": 2403 }, { "epoch": 2.27, "grad_norm": 16.40632438659668, "learning_rate": 1.3518712836656173e-07, "logps/chosen": -54.407630920410156, "logps/rejected": -84.37345886230469, "loss": 0.1674, "losses/dpo": 0.29198169708251953, "losses/sft": 1.8292515277862549, "losses/total": 0.29198169708251953, "ref_logps/chosen": -43.080963134765625, "ref_logps/rejected": -48.06843566894531, "rewards/accuracies": 1.0, "rewards/chosen": -1.132666826248169, "rewards/margins": 2.497835159301758, "rewards/rejected": -3.630502223968506, "step": 2404 }, { "epoch": 2.27, "grad_norm": 15.126834869384766, "learning_rate": 1.3501224204267225e-07, "logps/chosen": -51.35548400878906, "logps/rejected": -89.8807373046875, "loss": 0.17, "losses/dpo": 0.10265373438596725, "losses/sft": 2.852729320526123, "losses/total": 0.10265373438596725, "ref_logps/chosen": -35.860774993896484, "ref_logps/rejected": -51.35160446166992, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5494709014892578, "rewards/margins": 2.303443193435669, "rewards/rejected": -3.8529140949249268, "step": 2405 }, { "epoch": 2.27, "grad_norm": 26.159523010253906, "learning_rate": 1.3483735571878278e-07, "logps/chosen": -58.79133605957031, "logps/rejected": -61.3942985534668, "loss": 0.3852, "losses/dpo": 0.44203728437423706, "losses/sft": 2.013800859451294, "losses/total": 0.44203728437423706, "ref_logps/chosen": -35.162986755371094, "ref_logps/rejected": -27.51243019104004, "rewards/accuracies": 0.875, "rewards/chosen": -2.362834930419922, "rewards/margins": 1.0253522396087646, "rewards/rejected": -3.3881869316101074, "step": 2406 }, { "epoch": 2.27, "grad_norm": 12.882963180541992, "learning_rate": 1.3466246939489332e-07, "logps/chosen": -47.99653244018555, "logps/rejected": -77.51039123535156, "loss": 0.1679, "losses/dpo": 0.18950128555297852, "losses/sft": 1.9544509649276733, "losses/total": 0.18950128555297852, "ref_logps/chosen": -30.21261215209961, "ref_logps/rejected": -35.950096130371094, "rewards/accuracies": 1.0, "rewards/chosen": -1.7783918380737305, "rewards/margins": 2.3776371479034424, "rewards/rejected": -4.156029224395752, "step": 2407 }, { "epoch": 2.27, "grad_norm": 22.991592407226562, "learning_rate": 1.3448758307100384e-07, "logps/chosen": -55.77557373046875, "logps/rejected": -82.40957641601562, "loss": 0.4157, "losses/dpo": 0.35862863063812256, "losses/sft": 2.105030059814453, "losses/total": 0.35862863063812256, "ref_logps/chosen": -35.95896911621094, "ref_logps/rejected": -44.39405059814453, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9816608428955078, "rewards/margins": 1.8198909759521484, "rewards/rejected": -3.8015518188476562, "step": 2408 }, { "epoch": 2.27, "grad_norm": 19.473819732666016, "learning_rate": 1.3431269674711438e-07, "logps/chosen": -66.45231628417969, "logps/rejected": -86.8429946899414, "loss": 0.1994, "losses/dpo": 0.23618318140506744, "losses/sft": 2.5551085472106934, "losses/total": 0.23618318140506744, "ref_logps/chosen": -46.22987365722656, "ref_logps/rejected": -47.191810607910156, "rewards/accuracies": 1.0, "rewards/chosen": -2.022244453430176, "rewards/margins": 1.9428741931915283, "rewards/rejected": -3.965118646621704, "step": 2409 }, { "epoch": 2.28, "grad_norm": 22.52762794494629, "learning_rate": 1.341378104232249e-07, "logps/chosen": -54.11133575439453, "logps/rejected": -70.26641845703125, "loss": 0.2801, "losses/dpo": 0.39310312271118164, "losses/sft": 2.870839834213257, "losses/total": 0.39310312271118164, "ref_logps/chosen": -36.081295013427734, "ref_logps/rejected": -33.66963195800781, "rewards/accuracies": 0.875, "rewards/chosen": -1.8030041456222534, "rewards/margins": 1.8566750288009644, "rewards/rejected": -3.6596791744232178, "step": 2410 }, { "epoch": 2.28, "grad_norm": 24.872575759887695, "learning_rate": 1.3396292409933543e-07, "logps/chosen": -49.60558319091797, "logps/rejected": -70.62821960449219, "loss": 0.3292, "losses/dpo": 0.5849834084510803, "losses/sft": 2.177232027053833, "losses/total": 0.5849834084510803, "ref_logps/chosen": -29.09119987487793, "ref_logps/rejected": -37.09852600097656, "rewards/accuracies": 0.8125, "rewards/chosen": -2.051438808441162, "rewards/margins": 1.301531195640564, "rewards/rejected": -3.3529698848724365, "step": 2411 }, { "epoch": 2.28, "grad_norm": 17.264997482299805, "learning_rate": 1.3378803777544594e-07, "logps/chosen": -60.85906219482422, "logps/rejected": -89.72657012939453, "loss": 0.1469, "losses/dpo": 0.2169148176908493, "losses/sft": 2.3806140422821045, "losses/total": 0.2169148176908493, "ref_logps/chosen": -37.93870544433594, "ref_logps/rejected": -45.111961364746094, "rewards/accuracies": 1.0, "rewards/chosen": -2.2920358180999756, "rewards/margins": 2.169424533843994, "rewards/rejected": -4.461460590362549, "step": 2412 }, { "epoch": 2.28, "grad_norm": 23.59300422668457, "learning_rate": 1.336131514515565e-07, "logps/chosen": -47.2181396484375, "logps/rejected": -77.3600082397461, "loss": 0.3153, "losses/dpo": 0.057991232722997665, "losses/sft": 1.221850872039795, "losses/total": 0.057991232722997665, "ref_logps/chosen": -30.850021362304688, "ref_logps/rejected": -41.15699005126953, "rewards/accuracies": 0.875, "rewards/chosen": -1.6368123292922974, "rewards/margins": 1.9834893941879272, "rewards/rejected": -3.6203017234802246, "step": 2413 }, { "epoch": 2.28, "grad_norm": 12.763352394104004, "learning_rate": 1.3343826512766702e-07, "logps/chosen": -55.571128845214844, "logps/rejected": -85.33956146240234, "loss": 0.1619, "losses/dpo": 0.10864780843257904, "losses/sft": 2.114583730697632, "losses/total": 0.10864780843257904, "ref_logps/chosen": -36.87445068359375, "ref_logps/rejected": -42.569793701171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8696682453155518, "rewards/margins": 2.4073095321655273, "rewards/rejected": -4.2769775390625, "step": 2414 }, { "epoch": 2.28, "grad_norm": 16.972850799560547, "learning_rate": 1.3326337880377753e-07, "logps/chosen": -63.30195617675781, "logps/rejected": -95.20246124267578, "loss": 0.1504, "losses/dpo": 0.12854431569576263, "losses/sft": 1.3681620359420776, "losses/total": 0.12854431569576263, "ref_logps/chosen": -43.029457092285156, "ref_logps/rejected": -50.25859069824219, "rewards/accuracies": 1.0, "rewards/chosen": -2.027250051498413, "rewards/margins": 2.467137336730957, "rewards/rejected": -4.494387626647949, "step": 2415 }, { "epoch": 2.28, "grad_norm": 18.43023681640625, "learning_rate": 1.3308849247988807e-07, "logps/chosen": -53.49689483642578, "logps/rejected": -77.71681213378906, "loss": 0.2667, "losses/dpo": 0.3872087895870209, "losses/sft": 1.7538005113601685, "losses/total": 0.3872087895870209, "ref_logps/chosen": -35.99681091308594, "ref_logps/rejected": -41.74824523925781, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7500083446502686, "rewards/margins": 1.8468482494354248, "rewards/rejected": -3.5968565940856934, "step": 2416 }, { "epoch": 2.28, "grad_norm": 30.0272159576416, "learning_rate": 1.3291360615599858e-07, "logps/chosen": -50.79872131347656, "logps/rejected": -79.16336822509766, "loss": 0.4276, "losses/dpo": 0.20233452320098877, "losses/sft": 1.5577571392059326, "losses/total": 0.20233452320098877, "ref_logps/chosen": -33.007076263427734, "ref_logps/rejected": -43.126792907714844, "rewards/accuracies": 0.875, "rewards/chosen": -1.779164433479309, "rewards/margins": 1.824493169784546, "rewards/rejected": -3.6036574840545654, "step": 2417 }, { "epoch": 2.28, "grad_norm": 23.389415740966797, "learning_rate": 1.3273871983210912e-07, "logps/chosen": -63.302425384521484, "logps/rejected": -95.86807250976562, "loss": 0.2034, "losses/dpo": 0.2668130695819855, "losses/sft": 3.084998369216919, "losses/total": 0.2668130695819855, "ref_logps/chosen": -39.01983642578125, "ref_logps/rejected": -48.74082946777344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4282588958740234, "rewards/margins": 2.2844650745391846, "rewards/rejected": -4.712724208831787, "step": 2418 }, { "epoch": 2.28, "grad_norm": 29.843721389770508, "learning_rate": 1.3256383350821963e-07, "logps/chosen": -70.4579086303711, "logps/rejected": -96.57888793945312, "loss": 0.2756, "losses/dpo": 0.23706570267677307, "losses/sft": 2.3109374046325684, "losses/total": 0.23706570267677307, "ref_logps/chosen": -51.540157318115234, "ref_logps/rejected": -55.11073303222656, "rewards/accuracies": 0.875, "rewards/chosen": -1.8917758464813232, "rewards/margins": 2.2550394535064697, "rewards/rejected": -4.146815299987793, "step": 2419 }, { "epoch": 2.29, "grad_norm": 26.076583862304688, "learning_rate": 1.323889471843302e-07, "logps/chosen": -59.901817321777344, "logps/rejected": -90.6585464477539, "loss": 0.4032, "losses/dpo": 0.1096503883600235, "losses/sft": 1.7867217063903809, "losses/total": 0.1096503883600235, "ref_logps/chosen": -35.881282806396484, "ref_logps/rejected": -43.73113250732422, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4020538330078125, "rewards/margins": 2.290687322616577, "rewards/rejected": -4.692741394042969, "step": 2420 }, { "epoch": 2.29, "grad_norm": 24.19892692565918, "learning_rate": 1.322140608604407e-07, "logps/chosen": -46.25697326660156, "logps/rejected": -83.18038940429688, "loss": 0.2321, "losses/dpo": 0.3132858872413635, "losses/sft": 2.0489869117736816, "losses/total": 0.3132858872413635, "ref_logps/chosen": -27.728519439697266, "ref_logps/rejected": -42.803306579589844, "rewards/accuracies": 0.875, "rewards/chosen": -1.8528456687927246, "rewards/margins": 2.1848621368408203, "rewards/rejected": -4.037708282470703, "step": 2421 }, { "epoch": 2.29, "grad_norm": 15.319928169250488, "learning_rate": 1.3203917453655123e-07, "logps/chosen": -42.185203552246094, "logps/rejected": -74.96870422363281, "loss": 0.2152, "losses/dpo": 0.1868710219860077, "losses/sft": 1.4592739343643188, "losses/total": 0.1868710219860077, "ref_logps/chosen": -27.96023178100586, "ref_logps/rejected": -39.26026916503906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.422497034072876, "rewards/margins": 2.148346185684204, "rewards/rejected": -3.570843458175659, "step": 2422 }, { "epoch": 2.29, "grad_norm": 18.8427677154541, "learning_rate": 1.3186428821266176e-07, "logps/chosen": -54.79179382324219, "logps/rejected": -75.80045318603516, "loss": 0.2228, "losses/dpo": 0.2674533426761627, "losses/sft": 2.0315146446228027, "losses/total": 0.2674533426761627, "ref_logps/chosen": -40.59490966796875, "ref_logps/rejected": -39.550743103027344, "rewards/accuracies": 0.875, "rewards/chosen": -1.4196884632110596, "rewards/margins": 2.2052831649780273, "rewards/rejected": -3.624971389770508, "step": 2423 }, { "epoch": 2.29, "grad_norm": 24.370243072509766, "learning_rate": 1.3168940188877228e-07, "logps/chosen": -65.93751525878906, "logps/rejected": -82.40924072265625, "loss": 0.2725, "losses/dpo": 0.1942099779844284, "losses/sft": 1.8575749397277832, "losses/total": 0.1942099779844284, "ref_logps/chosen": -43.61890411376953, "ref_logps/rejected": -39.99061584472656, "rewards/accuracies": 0.875, "rewards/chosen": -2.231860637664795, "rewards/margins": 2.0100016593933105, "rewards/rejected": -4.2418622970581055, "step": 2424 }, { "epoch": 2.29, "grad_norm": 12.209527015686035, "learning_rate": 1.3151451556488282e-07, "logps/chosen": -52.96317672729492, "logps/rejected": -84.84678649902344, "loss": 0.1069, "losses/dpo": 0.06721833348274231, "losses/sft": 1.5265836715698242, "losses/total": 0.06721833348274231, "ref_logps/chosen": -39.00221252441406, "ref_logps/rejected": -44.65850830078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.3960963487625122, "rewards/margins": 2.6227309703826904, "rewards/rejected": -4.018827438354492, "step": 2425 }, { "epoch": 2.29, "grad_norm": 13.12696647644043, "learning_rate": 1.3133962924099336e-07, "logps/chosen": -54.62923049926758, "logps/rejected": -87.98492431640625, "loss": 0.1217, "losses/dpo": 0.11109773069620132, "losses/sft": 1.8667696714401245, "losses/total": 0.11109773069620132, "ref_logps/chosen": -38.25688934326172, "ref_logps/rejected": -47.481788635253906, "rewards/accuracies": 1.0, "rewards/chosen": -1.6372339725494385, "rewards/margins": 2.4130802154541016, "rewards/rejected": -4.050313949584961, "step": 2426 }, { "epoch": 2.29, "grad_norm": 13.050585746765137, "learning_rate": 1.311647429171039e-07, "logps/chosen": -52.75262451171875, "logps/rejected": -86.94068908691406, "loss": 0.1198, "losses/dpo": 0.1460084170103073, "losses/sft": 2.224973440170288, "losses/total": 0.1460084170103073, "ref_logps/chosen": -35.61548614501953, "ref_logps/rejected": -42.66188049316406, "rewards/accuracies": 1.0, "rewards/chosen": -1.7137138843536377, "rewards/margins": 2.7141664028167725, "rewards/rejected": -4.42788028717041, "step": 2427 }, { "epoch": 2.29, "grad_norm": 36.96118927001953, "learning_rate": 1.309898565932144e-07, "logps/chosen": -71.69375610351562, "logps/rejected": -80.9984359741211, "loss": 0.4443, "losses/dpo": 0.6484484076499939, "losses/sft": 1.8623161315917969, "losses/total": 0.6484484076499939, "ref_logps/chosen": -52.135955810546875, "ref_logps/rejected": -45.18701934814453, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9557801485061646, "rewards/margins": 1.6253615617752075, "rewards/rejected": -3.581141948699951, "step": 2428 }, { "epoch": 2.29, "grad_norm": 21.450525283813477, "learning_rate": 1.3081497026932492e-07, "logps/chosen": -46.33113098144531, "logps/rejected": -80.69264221191406, "loss": 0.2107, "losses/dpo": 0.37929439544677734, "losses/sft": 1.9647760391235352, "losses/total": 0.37929439544677734, "ref_logps/chosen": -31.963848114013672, "ref_logps/rejected": -43.37226104736328, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4367282390594482, "rewards/margins": 2.295309543609619, "rewards/rejected": -3.7320377826690674, "step": 2429 }, { "epoch": 2.29, "grad_norm": 27.140365600585938, "learning_rate": 1.3064008394543546e-07, "logps/chosen": -61.80070495605469, "logps/rejected": -75.57101440429688, "loss": 0.2846, "losses/dpo": 0.18821236491203308, "losses/sft": 2.1133620738983154, "losses/total": 0.18821236491203308, "ref_logps/chosen": -40.08721160888672, "ref_logps/rejected": -39.165260314941406, "rewards/accuracies": 0.8125, "rewards/chosen": -2.171349287033081, "rewards/margins": 1.469226598739624, "rewards/rejected": -3.640575885772705, "step": 2430 }, { "epoch": 2.3, "grad_norm": 22.137954711914062, "learning_rate": 1.3046519762154597e-07, "logps/chosen": -65.85840606689453, "logps/rejected": -113.12633514404297, "loss": 0.1886, "losses/dpo": 0.03326120227575302, "losses/sft": 2.66357421875, "losses/total": 0.03326120227575302, "ref_logps/chosen": -45.154388427734375, "ref_logps/rejected": -63.15598678588867, "rewards/accuracies": 0.875, "rewards/chosen": -2.070401668548584, "rewards/margins": 2.926633358001709, "rewards/rejected": -4.997035026550293, "step": 2431 }, { "epoch": 2.3, "grad_norm": 17.694807052612305, "learning_rate": 1.3029031129765654e-07, "logps/chosen": -64.27177429199219, "logps/rejected": -83.77233123779297, "loss": 0.2356, "losses/dpo": 0.09184356778860092, "losses/sft": 1.846799373626709, "losses/total": 0.09184356778860092, "ref_logps/chosen": -45.82080841064453, "ref_logps/rejected": -42.62569808959961, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8450958728790283, "rewards/margins": 2.2695670127868652, "rewards/rejected": -4.114663124084473, "step": 2432 }, { "epoch": 2.3, "grad_norm": 16.929241180419922, "learning_rate": 1.3011542497376705e-07, "logps/chosen": -66.10371398925781, "logps/rejected": -96.54380798339844, "loss": 0.1577, "losses/dpo": 0.0324687696993351, "losses/sft": 1.9238481521606445, "losses/total": 0.0324687696993351, "ref_logps/chosen": -47.739498138427734, "ref_logps/rejected": -52.19666290283203, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8364222049713135, "rewards/margins": 2.598292112350464, "rewards/rejected": -4.434714317321777, "step": 2433 }, { "epoch": 2.3, "grad_norm": 21.235267639160156, "learning_rate": 1.299405386498776e-07, "logps/chosen": -58.0411376953125, "logps/rejected": -95.41883087158203, "loss": 0.2436, "losses/dpo": 0.2698054909706116, "losses/sft": 2.0384953022003174, "losses/total": 0.2698054909706116, "ref_logps/chosen": -36.53667449951172, "ref_logps/rejected": -47.482093811035156, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1504464149475098, "rewards/margins": 2.6432273387908936, "rewards/rejected": -4.793673515319824, "step": 2434 }, { "epoch": 2.3, "grad_norm": 32.499664306640625, "learning_rate": 1.297656523259881e-07, "logps/chosen": -51.987548828125, "logps/rejected": -71.0145034790039, "loss": 0.3761, "losses/dpo": 0.8973588943481445, "losses/sft": 2.3473894596099854, "losses/total": 0.8973588943481445, "ref_logps/chosen": -34.76100158691406, "ref_logps/rejected": -36.838016510009766, "rewards/accuracies": 0.875, "rewards/chosen": -1.7226548194885254, "rewards/margins": 1.6949940919876099, "rewards/rejected": -3.4176487922668457, "step": 2435 }, { "epoch": 2.3, "grad_norm": 24.999605178833008, "learning_rate": 1.2959076600209861e-07, "logps/chosen": -60.05938720703125, "logps/rejected": -89.8451919555664, "loss": 0.2166, "losses/dpo": 0.13032609224319458, "losses/sft": 2.4050872325897217, "losses/total": 0.13032609224319458, "ref_logps/chosen": -39.30598449707031, "ref_logps/rejected": -45.87055587768555, "rewards/accuracies": 0.875, "rewards/chosen": -2.0753402709960938, "rewards/margins": 2.3221235275268555, "rewards/rejected": -4.397463798522949, "step": 2436 }, { "epoch": 2.3, "grad_norm": 17.191051483154297, "learning_rate": 1.2941587967820915e-07, "logps/chosen": -53.162628173828125, "logps/rejected": -86.98213958740234, "loss": 0.1366, "losses/dpo": 0.09298518300056458, "losses/sft": 2.3359131813049316, "losses/total": 0.09298518300056458, "ref_logps/chosen": -36.64933395385742, "ref_logps/rejected": -43.11638641357422, "rewards/accuracies": 1.0, "rewards/chosen": -1.6513291597366333, "rewards/margins": 2.735246181488037, "rewards/rejected": -4.386575222015381, "step": 2437 }, { "epoch": 2.3, "grad_norm": 22.428136825561523, "learning_rate": 1.2924099335431967e-07, "logps/chosen": -57.398590087890625, "logps/rejected": -96.95325469970703, "loss": 0.196, "losses/dpo": 0.14919836819171906, "losses/sft": 2.094348192214966, "losses/total": 0.14919836819171906, "ref_logps/chosen": -29.801013946533203, "ref_logps/rejected": -48.45726013183594, "rewards/accuracies": 1.0, "rewards/chosen": -2.7597575187683105, "rewards/margins": 2.089841842651367, "rewards/rejected": -4.849599361419678, "step": 2438 }, { "epoch": 2.3, "grad_norm": 16.206666946411133, "learning_rate": 1.2906610703043023e-07, "logps/chosen": -48.533172607421875, "logps/rejected": -82.30545806884766, "loss": 0.21, "losses/dpo": 0.15454255044460297, "losses/sft": 1.960620641708374, "losses/total": 0.15454255044460297, "ref_logps/chosen": -29.086952209472656, "ref_logps/rejected": -43.64354705810547, "rewards/accuracies": 1.0, "rewards/chosen": -1.9446223974227905, "rewards/margins": 1.9215692281723022, "rewards/rejected": -3.8661916255950928, "step": 2439 }, { "epoch": 2.3, "grad_norm": 17.442277908325195, "learning_rate": 1.2889122070654074e-07, "logps/chosen": -56.63894271850586, "logps/rejected": -85.51051330566406, "loss": 0.168, "losses/dpo": 0.13820341229438782, "losses/sft": 1.964359164237976, "losses/total": 0.13820341229438782, "ref_logps/chosen": -39.571189880371094, "ref_logps/rejected": -46.58123779296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.706775426864624, "rewards/margins": 2.1861517429351807, "rewards/rejected": -3.8929271697998047, "step": 2440 }, { "epoch": 2.31, "grad_norm": 18.458139419555664, "learning_rate": 1.2871633438265128e-07, "logps/chosen": -56.450103759765625, "logps/rejected": -84.12791442871094, "loss": 0.1951, "losses/dpo": 0.12744677066802979, "losses/sft": 1.800889492034912, "losses/total": 0.12744677066802979, "ref_logps/chosen": -38.28672790527344, "ref_logps/rejected": -47.266780853271484, "rewards/accuracies": 1.0, "rewards/chosen": -1.8163377046585083, "rewards/margins": 1.8697752952575684, "rewards/rejected": -3.686112880706787, "step": 2441 }, { "epoch": 2.31, "grad_norm": 28.57596206665039, "learning_rate": 1.285414480587618e-07, "logps/chosen": -67.91671752929688, "logps/rejected": -78.28376770019531, "loss": 0.382, "losses/dpo": 0.8102523684501648, "losses/sft": 2.4306821823120117, "losses/total": 0.8102523684501648, "ref_logps/chosen": -42.26472473144531, "ref_logps/rejected": -38.39464569091797, "rewards/accuracies": 0.9375, "rewards/chosen": -2.565199851989746, "rewards/margins": 1.4237124919891357, "rewards/rejected": -3.988912582397461, "step": 2442 }, { "epoch": 2.31, "grad_norm": 24.497249603271484, "learning_rate": 1.283665617348723e-07, "logps/chosen": -57.40618133544922, "logps/rejected": -76.92585754394531, "loss": 0.2653, "losses/dpo": 0.3706018626689911, "losses/sft": 2.0360872745513916, "losses/total": 0.3706018626689911, "ref_logps/chosen": -34.803768157958984, "ref_logps/rejected": -35.438758850097656, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2602412700653076, "rewards/margins": 1.8884680271148682, "rewards/rejected": -4.148709297180176, "step": 2443 }, { "epoch": 2.31, "grad_norm": 23.49471092224121, "learning_rate": 1.2819167541098285e-07, "logps/chosen": -43.90263366699219, "logps/rejected": -105.9566421508789, "loss": 0.1804, "losses/dpo": 0.03583944961428642, "losses/sft": 1.913006067276001, "losses/total": 0.03583944961428642, "ref_logps/chosen": -26.37704849243164, "ref_logps/rejected": -59.380035400390625, "rewards/accuracies": 0.875, "rewards/chosen": -1.7525585889816284, "rewards/margins": 2.905101776123047, "rewards/rejected": -4.657660484313965, "step": 2444 }, { "epoch": 2.31, "grad_norm": 19.033655166625977, "learning_rate": 1.2801678908709339e-07, "logps/chosen": -57.934593200683594, "logps/rejected": -95.11457824707031, "loss": 0.1616, "losses/dpo": 0.3845723867416382, "losses/sft": 2.000866651535034, "losses/total": 0.3845723867416382, "ref_logps/chosen": -35.448612213134766, "ref_logps/rejected": -45.77138900756836, "rewards/accuracies": 0.9375, "rewards/chosen": -2.248598575592041, "rewards/margins": 2.6857199668884277, "rewards/rejected": -4.934318542480469, "step": 2445 }, { "epoch": 2.31, "grad_norm": 21.187564849853516, "learning_rate": 1.2784190276320393e-07, "logps/chosen": -41.606624603271484, "logps/rejected": -76.82884216308594, "loss": 0.2662, "losses/dpo": 0.26558205485343933, "losses/sft": 1.6570028066635132, "losses/total": 0.26558205485343933, "ref_logps/chosen": -26.780683517456055, "ref_logps/rejected": -36.45180892944336, "rewards/accuracies": 0.875, "rewards/chosen": -1.4825940132141113, "rewards/margins": 2.5551092624664307, "rewards/rejected": -4.037703514099121, "step": 2446 }, { "epoch": 2.31, "grad_norm": 20.586835861206055, "learning_rate": 1.2766701643931444e-07, "logps/chosen": -52.53807830810547, "logps/rejected": -95.30747985839844, "loss": 0.178, "losses/dpo": 0.025985386222600937, "losses/sft": 1.456533670425415, "losses/total": 0.025985386222600937, "ref_logps/chosen": -37.67286682128906, "ref_logps/rejected": -50.75, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4865206480026245, "rewards/margins": 2.9692273139953613, "rewards/rejected": -4.455747604370117, "step": 2447 }, { "epoch": 2.31, "grad_norm": 20.420146942138672, "learning_rate": 1.2749213011542498e-07, "logps/chosen": -64.54875183105469, "logps/rejected": -94.70415496826172, "loss": 0.2212, "losses/dpo": 0.1658926010131836, "losses/sft": 1.9655648469924927, "losses/total": 0.1658926010131836, "ref_logps/chosen": -41.79730224609375, "ref_logps/rejected": -47.0723991394043, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2751455307006836, "rewards/margins": 2.488029718399048, "rewards/rejected": -4.7631754875183105, "step": 2448 }, { "epoch": 2.31, "grad_norm": 31.835233688354492, "learning_rate": 1.273172437915355e-07, "logps/chosen": -76.6187744140625, "logps/rejected": -98.84121704101562, "loss": 0.3508, "losses/dpo": 0.2180909961462021, "losses/sft": 2.0946733951568604, "losses/total": 0.2180909961462021, "ref_logps/chosen": -51.33055114746094, "ref_logps/rejected": -54.965362548828125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5288217067718506, "rewards/margins": 1.8587641716003418, "rewards/rejected": -4.387585639953613, "step": 2449 }, { "epoch": 2.31, "grad_norm": 18.53763198852539, "learning_rate": 1.27142357467646e-07, "logps/chosen": -54.00996017456055, "logps/rejected": -84.0213623046875, "loss": 0.2441, "losses/dpo": 0.48685282468795776, "losses/sft": 2.1103055477142334, "losses/total": 0.48685282468795776, "ref_logps/chosen": -34.60890197753906, "ref_logps/rejected": -45.56768798828125, "rewards/accuracies": 0.875, "rewards/chosen": -1.940105676651001, "rewards/margins": 1.9052613973617554, "rewards/rejected": -3.845366954803467, "step": 2450 }, { "epoch": 2.31, "grad_norm": 27.337562561035156, "learning_rate": 1.2696747114375654e-07, "logps/chosen": -58.65935516357422, "logps/rejected": -75.87437438964844, "loss": 0.2933, "losses/dpo": 0.14911752939224243, "losses/sft": 1.7263939380645752, "losses/total": 0.14911752939224243, "ref_logps/chosen": -40.06737518310547, "ref_logps/rejected": -37.62965393066406, "rewards/accuracies": 0.875, "rewards/chosen": -1.8591978549957275, "rewards/margins": 1.9652740955352783, "rewards/rejected": -3.824471950531006, "step": 2451 }, { "epoch": 2.32, "grad_norm": 20.54986000061035, "learning_rate": 1.2679258481986708e-07, "logps/chosen": -63.70112228393555, "logps/rejected": -99.12324523925781, "loss": 0.2124, "losses/dpo": 0.25114142894744873, "losses/sft": 1.7811933755874634, "losses/total": 0.25114142894744873, "ref_logps/chosen": -42.85333251953125, "ref_logps/rejected": -56.14357376098633, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0847787857055664, "rewards/margins": 2.2131879329681396, "rewards/rejected": -4.297966957092285, "step": 2452 }, { "epoch": 2.32, "grad_norm": 25.26395034790039, "learning_rate": 1.2661769849597762e-07, "logps/chosen": -55.94580078125, "logps/rejected": -68.66764831542969, "loss": 0.3117, "losses/dpo": 0.12438103556632996, "losses/sft": 2.262509822845459, "losses/total": 0.12438103556632996, "ref_logps/chosen": -37.385475158691406, "ref_logps/rejected": -33.213523864746094, "rewards/accuracies": 0.875, "rewards/chosen": -1.8560330867767334, "rewards/margins": 1.6893800497055054, "rewards/rejected": -3.545413017272949, "step": 2453 }, { "epoch": 2.32, "grad_norm": 20.834056854248047, "learning_rate": 1.2644281217208813e-07, "logps/chosen": -51.35419464111328, "logps/rejected": -78.75160217285156, "loss": 0.2838, "losses/dpo": 0.26044735312461853, "losses/sft": 2.5525283813476562, "losses/total": 0.26044735312461853, "ref_logps/chosen": -32.40174102783203, "ref_logps/rejected": -41.621246337890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.8952455520629883, "rewards/margins": 1.8177897930145264, "rewards/rejected": -3.7130353450775146, "step": 2454 }, { "epoch": 2.32, "grad_norm": 10.098645210266113, "learning_rate": 1.2626792584819867e-07, "logps/chosen": -54.063453674316406, "logps/rejected": -97.94427490234375, "loss": 0.0728, "losses/dpo": 0.08777850866317749, "losses/sft": 2.1690478324890137, "losses/total": 0.08777850866317749, "ref_logps/chosen": -35.80142593383789, "ref_logps/rejected": -46.71717834472656, "rewards/accuracies": 1.0, "rewards/chosen": -1.8262027502059937, "rewards/margins": 3.296506404876709, "rewards/rejected": -5.122709274291992, "step": 2455 }, { "epoch": 2.32, "grad_norm": 15.536396026611328, "learning_rate": 1.2609303952430918e-07, "logps/chosen": -56.88234329223633, "logps/rejected": -98.4262466430664, "loss": 0.1293, "losses/dpo": 0.10162603110074997, "losses/sft": 2.2636406421661377, "losses/total": 0.10162603110074997, "ref_logps/chosen": -39.09235382080078, "ref_logps/rejected": -51.033958435058594, "rewards/accuracies": 1.0, "rewards/chosen": -1.7789990901947021, "rewards/margins": 2.9602303504943848, "rewards/rejected": -4.739229202270508, "step": 2456 }, { "epoch": 2.32, "grad_norm": 32.14033889770508, "learning_rate": 1.259181532004197e-07, "logps/chosen": -50.93202209472656, "logps/rejected": -88.27909088134766, "loss": 0.283, "losses/dpo": 0.331728458404541, "losses/sft": 2.056981325149536, "losses/total": 0.331728458404541, "ref_logps/chosen": -32.174049377441406, "ref_logps/rejected": -43.49720764160156, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8757970333099365, "rewards/margins": 2.602391242980957, "rewards/rejected": -4.478188514709473, "step": 2457 }, { "epoch": 2.32, "grad_norm": 20.82699203491211, "learning_rate": 1.2574326687653026e-07, "logps/chosen": -44.46734619140625, "logps/rejected": -68.64436340332031, "loss": 0.2503, "losses/dpo": 0.1542724221944809, "losses/sft": 2.5364983081817627, "losses/total": 0.1542724221944809, "ref_logps/chosen": -27.635202407836914, "ref_logps/rejected": -34.36957550048828, "rewards/accuracies": 0.875, "rewards/chosen": -1.6832144260406494, "rewards/margins": 1.7442653179168701, "rewards/rejected": -3.4274797439575195, "step": 2458 }, { "epoch": 2.32, "grad_norm": 17.210628509521484, "learning_rate": 1.2556838055264078e-07, "logps/chosen": -56.43772888183594, "logps/rejected": -88.28271484375, "loss": 0.1769, "losses/dpo": 0.3156144618988037, "losses/sft": 2.474200487136841, "losses/total": 0.3156144618988037, "ref_logps/chosen": -31.542987823486328, "ref_logps/rejected": -42.54148483276367, "rewards/accuracies": 1.0, "rewards/chosen": -2.489474058151245, "rewards/margins": 2.0846495628356934, "rewards/rejected": -4.574123382568359, "step": 2459 }, { "epoch": 2.32, "grad_norm": 28.804651260375977, "learning_rate": 1.2539349422875131e-07, "logps/chosen": -76.59201049804688, "logps/rejected": -86.26632690429688, "loss": 0.3618, "losses/dpo": 0.4398012161254883, "losses/sft": 2.768176317214966, "losses/total": 0.4398012161254883, "ref_logps/chosen": -51.46018981933594, "ref_logps/rejected": -43.8438720703125, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5131821632385254, "rewards/margins": 1.7290639877319336, "rewards/rejected": -4.242246150970459, "step": 2460 }, { "epoch": 2.32, "grad_norm": 20.570898056030273, "learning_rate": 1.2521860790486183e-07, "logps/chosen": -60.46925735473633, "logps/rejected": -95.93611145019531, "loss": 0.2352, "losses/dpo": 0.1121564507484436, "losses/sft": 2.411712169647217, "losses/total": 0.1121564507484436, "ref_logps/chosen": -35.38029479980469, "ref_logps/rejected": -45.97947311401367, "rewards/accuracies": 0.875, "rewards/chosen": -2.5088961124420166, "rewards/margins": 2.4867677688598633, "rewards/rejected": -4.995663642883301, "step": 2461 }, { "epoch": 2.32, "grad_norm": 20.29084587097168, "learning_rate": 1.2504372158097237e-07, "logps/chosen": -60.67136001586914, "logps/rejected": -100.66184997558594, "loss": 0.1832, "losses/dpo": 0.059751611202955246, "losses/sft": 1.9690364599227905, "losses/total": 0.059751611202955246, "ref_logps/chosen": -37.356910705566406, "ref_logps/rejected": -52.58418273925781, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3314452171325684, "rewards/margins": 2.476321220397949, "rewards/rejected": -4.807766914367676, "step": 2462 }, { "epoch": 2.33, "grad_norm": 23.233488082885742, "learning_rate": 1.2486883525708288e-07, "logps/chosen": -58.75841522216797, "logps/rejected": -90.21353912353516, "loss": 0.2688, "losses/dpo": 0.18277209997177124, "losses/sft": 2.602585792541504, "losses/total": 0.18277209997177124, "ref_logps/chosen": -39.41297912597656, "ref_logps/rejected": -47.370506286621094, "rewards/accuracies": 0.875, "rewards/chosen": -1.9345438480377197, "rewards/margins": 2.349759578704834, "rewards/rejected": -4.284303665161133, "step": 2463 }, { "epoch": 2.33, "grad_norm": 46.90460968017578, "learning_rate": 1.2469394893319342e-07, "logps/chosen": -62.881072998046875, "logps/rejected": -76.18502807617188, "loss": 0.6715, "losses/dpo": 0.14692619442939758, "losses/sft": 1.9703174829483032, "losses/total": 0.14692619442939758, "ref_logps/chosen": -38.223548889160156, "ref_logps/rejected": -42.19207763671875, "rewards/accuracies": 0.75, "rewards/chosen": -2.4657530784606934, "rewards/margins": 0.933542013168335, "rewards/rejected": -3.399294853210449, "step": 2464 }, { "epoch": 2.33, "grad_norm": 25.74579620361328, "learning_rate": 1.2451906260930396e-07, "logps/chosen": -57.81740951538086, "logps/rejected": -80.21636199951172, "loss": 0.3415, "losses/dpo": 0.6382368803024292, "losses/sft": 2.155486583709717, "losses/total": 0.6382368803024292, "ref_logps/chosen": -36.00206756591797, "ref_logps/rejected": -42.33823013305664, "rewards/accuracies": 0.875, "rewards/chosen": -2.1815338134765625, "rewards/margins": 1.6062794923782349, "rewards/rejected": -3.787813186645508, "step": 2465 }, { "epoch": 2.33, "grad_norm": 26.63974380493164, "learning_rate": 1.2434417628541447e-07, "logps/chosen": -63.47370910644531, "logps/rejected": -84.77128601074219, "loss": 0.3019, "losses/dpo": 0.5920904874801636, "losses/sft": 2.2707266807556152, "losses/total": 0.5920904874801636, "ref_logps/chosen": -44.79814147949219, "ref_logps/rejected": -45.82119369506836, "rewards/accuracies": 0.875, "rewards/chosen": -1.8675568103790283, "rewards/margins": 2.0274524688720703, "rewards/rejected": -3.8950095176696777, "step": 2466 }, { "epoch": 2.33, "grad_norm": 18.14527702331543, "learning_rate": 1.24169289961525e-07, "logps/chosen": -35.8369026184082, "logps/rejected": -73.15737915039062, "loss": 0.2277, "losses/dpo": 0.23918096721172333, "losses/sft": 1.267582654953003, "losses/total": 0.23918096721172333, "ref_logps/chosen": -25.805511474609375, "ref_logps/rejected": -39.50239944458008, "rewards/accuracies": 0.875, "rewards/chosen": -1.0031393766403198, "rewards/margins": 2.362358570098877, "rewards/rejected": -3.3654980659484863, "step": 2467 }, { "epoch": 2.33, "grad_norm": 10.962759017944336, "learning_rate": 1.2399440363763552e-07, "logps/chosen": -58.845863342285156, "logps/rejected": -109.3837890625, "loss": 0.1015, "losses/dpo": 0.18106605112552643, "losses/sft": 1.651471734046936, "losses/total": 0.18106605112552643, "ref_logps/chosen": -42.6986083984375, "ref_logps/rejected": -62.83644485473633, "rewards/accuracies": 1.0, "rewards/chosen": -1.6147253513336182, "rewards/margins": 3.040008544921875, "rewards/rejected": -4.654733657836914, "step": 2468 }, { "epoch": 2.33, "grad_norm": 37.31141662597656, "learning_rate": 1.2381951731374606e-07, "logps/chosen": -52.12660598754883, "logps/rejected": -74.52386474609375, "loss": 0.4707, "losses/dpo": 0.13884441554546356, "losses/sft": 2.2521181106567383, "losses/total": 0.13884441554546356, "ref_logps/chosen": -32.10844802856445, "ref_logps/rejected": -39.178443908691406, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0018160343170166, "rewards/margins": 1.5327260494232178, "rewards/rejected": -3.5345423221588135, "step": 2469 }, { "epoch": 2.33, "grad_norm": 23.709136962890625, "learning_rate": 1.2364463098985657e-07, "logps/chosen": -57.718055725097656, "logps/rejected": -94.78102111816406, "loss": 0.1607, "losses/dpo": 0.23916247487068176, "losses/sft": 1.9120935201644897, "losses/total": 0.23916247487068176, "ref_logps/chosen": -38.44097137451172, "ref_logps/rejected": -45.318538665771484, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9277087450027466, "rewards/margins": 3.018540382385254, "rewards/rejected": -4.946249008178711, "step": 2470 }, { "epoch": 2.33, "grad_norm": 22.588701248168945, "learning_rate": 1.234697446659671e-07, "logps/chosen": -63.049468994140625, "logps/rejected": -91.10270690917969, "loss": 0.2421, "losses/dpo": 0.20788276195526123, "losses/sft": 2.512965679168701, "losses/total": 0.20788276195526123, "ref_logps/chosen": -41.786827087402344, "ref_logps/rejected": -46.390743255615234, "rewards/accuracies": 0.875, "rewards/chosen": -2.1262640953063965, "rewards/margins": 2.3449323177337646, "rewards/rejected": -4.47119665145874, "step": 2471 }, { "epoch": 2.33, "grad_norm": 29.13214111328125, "learning_rate": 1.2329485834207765e-07, "logps/chosen": -48.514976501464844, "logps/rejected": -75.62982177734375, "loss": 0.4032, "losses/dpo": 0.09179812669754028, "losses/sft": 1.7202165126800537, "losses/total": 0.09179812669754028, "ref_logps/chosen": -28.74802589416504, "ref_logps/rejected": -34.98958206176758, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9766952991485596, "rewards/margins": 2.0873286724090576, "rewards/rejected": -4.064023971557617, "step": 2472 }, { "epoch": 2.34, "grad_norm": 33.538761138916016, "learning_rate": 1.2311997201818816e-07, "logps/chosen": -57.00736999511719, "logps/rejected": -96.07063293457031, "loss": 0.3945, "losses/dpo": 0.2000495195388794, "losses/sft": 2.2454514503479004, "losses/total": 0.2000495195388794, "ref_logps/chosen": -33.50537872314453, "ref_logps/rejected": -52.868507385253906, "rewards/accuracies": 0.875, "rewards/chosen": -2.350198984146118, "rewards/margins": 1.9700140953063965, "rewards/rejected": -4.320213794708252, "step": 2473 }, { "epoch": 2.34, "grad_norm": 17.343101501464844, "learning_rate": 1.229450856942987e-07, "logps/chosen": -57.00798034667969, "logps/rejected": -80.87181091308594, "loss": 0.1811, "losses/dpo": 0.1261172890663147, "losses/sft": 2.116879940032959, "losses/total": 0.1261172890663147, "ref_logps/chosen": -40.87028503417969, "ref_logps/rejected": -41.24085235595703, "rewards/accuracies": 1.0, "rewards/chosen": -1.613769292831421, "rewards/margins": 2.3493261337280273, "rewards/rejected": -3.963095188140869, "step": 2474 }, { "epoch": 2.34, "grad_norm": 25.425739288330078, "learning_rate": 1.2277019937040922e-07, "logps/chosen": -55.171783447265625, "logps/rejected": -92.42965698242188, "loss": 0.1827, "losses/dpo": 0.08154292404651642, "losses/sft": 1.7210462093353271, "losses/total": 0.08154292404651642, "ref_logps/chosen": -36.099735260009766, "ref_logps/rejected": -47.89049530029297, "rewards/accuracies": 0.875, "rewards/chosen": -1.9072051048278809, "rewards/margins": 2.5467114448547363, "rewards/rejected": -4.453916549682617, "step": 2475 }, { "epoch": 2.34, "grad_norm": 24.284379959106445, "learning_rate": 1.2259531304651976e-07, "logps/chosen": -51.57073974609375, "logps/rejected": -72.46035766601562, "loss": 0.246, "losses/dpo": 0.1052037850022316, "losses/sft": 2.075737714767456, "losses/total": 0.1052037850022316, "ref_logps/chosen": -37.08552932739258, "ref_logps/rejected": -37.51117706298828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4485211372375488, "rewards/margins": 2.0463967323303223, "rewards/rejected": -3.494917869567871, "step": 2476 }, { "epoch": 2.34, "grad_norm": 23.2425479888916, "learning_rate": 1.224204267226303e-07, "logps/chosen": -46.36212158203125, "logps/rejected": -75.45295715332031, "loss": 0.2452, "losses/dpo": 0.4087217152118683, "losses/sft": 2.0530078411102295, "losses/total": 0.4087217152118683, "ref_logps/chosen": -27.779163360595703, "ref_logps/rejected": -38.845252990722656, "rewards/accuracies": 0.875, "rewards/chosen": -1.8582955598831177, "rewards/margins": 1.802475094795227, "rewards/rejected": -3.6607704162597656, "step": 2477 }, { "epoch": 2.34, "grad_norm": 26.23789405822754, "learning_rate": 1.222455403987408e-07, "logps/chosen": -60.44539260864258, "logps/rejected": -90.34058380126953, "loss": 0.2432, "losses/dpo": 0.21942941844463348, "losses/sft": 1.828418254852295, "losses/total": 0.21942941844463348, "ref_logps/chosen": -38.12716293334961, "ref_logps/rejected": -46.1883544921875, "rewards/accuracies": 0.875, "rewards/chosen": -2.231823444366455, "rewards/margins": 2.1833996772766113, "rewards/rejected": -4.415222644805908, "step": 2478 }, { "epoch": 2.34, "grad_norm": 31.031585693359375, "learning_rate": 1.2207065407485135e-07, "logps/chosen": -59.69524002075195, "logps/rejected": -94.47356414794922, "loss": 0.2372, "losses/dpo": 0.3599066436290741, "losses/sft": 1.6405057907104492, "losses/total": 0.3599066436290741, "ref_logps/chosen": -36.046424865722656, "ref_logps/rejected": -46.17823791503906, "rewards/accuracies": 0.875, "rewards/chosen": -2.3648810386657715, "rewards/margins": 2.464651107788086, "rewards/rejected": -4.829532623291016, "step": 2479 }, { "epoch": 2.34, "grad_norm": 23.124414443969727, "learning_rate": 1.2189576775096189e-07, "logps/chosen": -52.421485900878906, "logps/rejected": -87.51934051513672, "loss": 0.2083, "losses/dpo": 0.15195035934448242, "losses/sft": 2.460550308227539, "losses/total": 0.15195035934448242, "ref_logps/chosen": -36.099639892578125, "ref_logps/rejected": -41.90242385864258, "rewards/accuracies": 1.0, "rewards/chosen": -1.6321848630905151, "rewards/margins": 2.92950701713562, "rewards/rejected": -4.561691761016846, "step": 2480 }, { "epoch": 2.34, "grad_norm": 24.015825271606445, "learning_rate": 1.217208814270724e-07, "logps/chosen": -51.32763671875, "logps/rejected": -71.25943756103516, "loss": 0.2565, "losses/dpo": 0.23391401767730713, "losses/sft": 2.285999059677124, "losses/total": 0.23391401767730713, "ref_logps/chosen": -33.14604949951172, "ref_logps/rejected": -38.33258056640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.8181586265563965, "rewards/margins": 1.4745268821716309, "rewards/rejected": -3.2926855087280273, "step": 2481 }, { "epoch": 2.34, "grad_norm": 25.586483001708984, "learning_rate": 1.215459951031829e-07, "logps/chosen": -53.34920883178711, "logps/rejected": -75.64665222167969, "loss": 0.2474, "losses/dpo": 0.4022758901119232, "losses/sft": 1.703517198562622, "losses/total": 0.4022758901119232, "ref_logps/chosen": -32.40521240234375, "ref_logps/rejected": -35.93869400024414, "rewards/accuracies": 0.9375, "rewards/chosen": -2.094399929046631, "rewards/margins": 1.8763959407806396, "rewards/rejected": -3.9707958698272705, "step": 2482 }, { "epoch": 2.34, "grad_norm": 17.13437843322754, "learning_rate": 1.2137110877929345e-07, "logps/chosen": -56.20327377319336, "logps/rejected": -77.69754791259766, "loss": 0.1637, "losses/dpo": 0.13188134133815765, "losses/sft": 1.2916014194488525, "losses/total": 0.13188134133815765, "ref_logps/chosen": -35.04595184326172, "ref_logps/rejected": -34.603843688964844, "rewards/accuracies": 1.0, "rewards/chosen": -2.115732431411743, "rewards/margins": 2.1936378479003906, "rewards/rejected": -4.309370040893555, "step": 2483 }, { "epoch": 2.35, "grad_norm": 28.694690704345703, "learning_rate": 1.21196222455404e-07, "logps/chosen": -55.59763717651367, "logps/rejected": -94.02726745605469, "loss": 0.1727, "losses/dpo": 0.08780666440725327, "losses/sft": 2.1725800037384033, "losses/total": 0.08780666440725327, "ref_logps/chosen": -33.64509582519531, "ref_logps/rejected": -46.372535705566406, "rewards/accuracies": 0.9375, "rewards/chosen": -2.195254325866699, "rewards/margins": 2.570219039916992, "rewards/rejected": -4.765473365783691, "step": 2484 }, { "epoch": 2.35, "grad_norm": 32.29655075073242, "learning_rate": 1.210213361315145e-07, "logps/chosen": -53.50304412841797, "logps/rejected": -76.4126205444336, "loss": 0.4029, "losses/dpo": 0.2864738702774048, "losses/sft": 2.3736648559570312, "losses/total": 0.2864738702774048, "ref_logps/chosen": -33.7008056640625, "ref_logps/rejected": -41.582855224609375, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9802236557006836, "rewards/margins": 1.502752423286438, "rewards/rejected": -3.482976198196411, "step": 2485 }, { "epoch": 2.35, "grad_norm": 10.374073028564453, "learning_rate": 1.2084644980762504e-07, "logps/chosen": -51.38214874267578, "logps/rejected": -111.7244873046875, "loss": 0.0784, "losses/dpo": 0.04466250538825989, "losses/sft": 2.186174154281616, "losses/total": 0.04466250538825989, "ref_logps/chosen": -35.338314056396484, "ref_logps/rejected": -59.774009704589844, "rewards/accuracies": 1.0, "rewards/chosen": -1.6043837070465088, "rewards/margins": 3.5906643867492676, "rewards/rejected": -5.195047855377197, "step": 2486 }, { "epoch": 2.35, "grad_norm": 16.83829116821289, "learning_rate": 1.2067156348373558e-07, "logps/chosen": -48.547996520996094, "logps/rejected": -87.23783111572266, "loss": 0.1686, "losses/dpo": 0.15708595514297485, "losses/sft": 1.4696367979049683, "losses/total": 0.15708595514297485, "ref_logps/chosen": -27.32805061340332, "ref_logps/rejected": -43.719696044921875, "rewards/accuracies": 1.0, "rewards/chosen": -2.121994972229004, "rewards/margins": 2.2298190593719482, "rewards/rejected": -4.351813793182373, "step": 2487 }, { "epoch": 2.35, "grad_norm": 24.458301544189453, "learning_rate": 1.204966771598461e-07, "logps/chosen": -63.58038330078125, "logps/rejected": -87.37181091308594, "loss": 0.2324, "losses/dpo": 0.24954506754875183, "losses/sft": 2.0932254791259766, "losses/total": 0.24954506754875183, "ref_logps/chosen": -42.60572052001953, "ref_logps/rejected": -40.25506591796875, "rewards/accuracies": 0.875, "rewards/chosen": -2.097466468811035, "rewards/margins": 2.6142077445983887, "rewards/rejected": -4.711674690246582, "step": 2488 }, { "epoch": 2.35, "grad_norm": 21.959930419921875, "learning_rate": 1.203217908359566e-07, "logps/chosen": -64.03722381591797, "logps/rejected": -78.38963317871094, "loss": 0.2408, "losses/dpo": 0.4552140235900879, "losses/sft": 2.167886734008789, "losses/total": 0.4552140235900879, "ref_logps/chosen": -45.30060577392578, "ref_logps/rejected": -40.939735412597656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8736615180969238, "rewards/margins": 1.871328353881836, "rewards/rejected": -3.7449898719787598, "step": 2489 }, { "epoch": 2.35, "grad_norm": 23.76074981689453, "learning_rate": 1.2014690451206714e-07, "logps/chosen": -54.80952835083008, "logps/rejected": -82.96211242675781, "loss": 0.2265, "losses/dpo": 0.3063853085041046, "losses/sft": 2.179708480834961, "losses/total": 0.3063853085041046, "ref_logps/chosen": -34.65270233154297, "ref_logps/rejected": -41.05809783935547, "rewards/accuracies": 1.0, "rewards/chosen": -2.0156826972961426, "rewards/margins": 2.1747193336486816, "rewards/rejected": -4.190402030944824, "step": 2490 }, { "epoch": 2.35, "grad_norm": 17.532278060913086, "learning_rate": 1.1997201818817768e-07, "logps/chosen": -65.1349105834961, "logps/rejected": -83.5277099609375, "loss": 0.2153, "losses/dpo": 0.08171360939741135, "losses/sft": 1.9716815948486328, "losses/total": 0.08171360939741135, "ref_logps/chosen": -44.480445861816406, "ref_logps/rejected": -41.43211364746094, "rewards/accuracies": 0.875, "rewards/chosen": -2.0654468536376953, "rewards/margins": 2.1441125869750977, "rewards/rejected": -4.209559440612793, "step": 2491 }, { "epoch": 2.35, "grad_norm": 25.128236770629883, "learning_rate": 1.197971318642882e-07, "logps/chosen": -66.88798522949219, "logps/rejected": -73.8001708984375, "loss": 0.2689, "losses/dpo": 0.44392144680023193, "losses/sft": 2.020564317703247, "losses/total": 0.44392144680023193, "ref_logps/chosen": -41.911354064941406, "ref_logps/rejected": -32.91697692871094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4976630210876465, "rewards/margins": 1.5906562805175781, "rewards/rejected": -4.088319301605225, "step": 2492 }, { "epoch": 2.35, "grad_norm": 15.019405364990234, "learning_rate": 1.1962224554039873e-07, "logps/chosen": -52.15613555908203, "logps/rejected": -87.82102966308594, "loss": 0.144, "losses/dpo": 0.09639427065849304, "losses/sft": 2.326068878173828, "losses/total": 0.09639427065849304, "ref_logps/chosen": -34.46449279785156, "ref_logps/rejected": -47.426231384277344, "rewards/accuracies": 1.0, "rewards/chosen": -1.7691645622253418, "rewards/margins": 2.270314931869507, "rewards/rejected": -4.0394792556762695, "step": 2493 }, { "epoch": 2.36, "grad_norm": 27.142698287963867, "learning_rate": 1.1944735921650927e-07, "logps/chosen": -49.20192337036133, "logps/rejected": -75.52554321289062, "loss": 0.2958, "losses/dpo": 0.09309644997119904, "losses/sft": 1.8377737998962402, "losses/total": 0.09309644997119904, "ref_logps/chosen": -29.45509910583496, "ref_logps/rejected": -36.688255310058594, "rewards/accuracies": 0.8125, "rewards/chosen": -1.974682331085205, "rewards/margins": 1.9090458154678345, "rewards/rejected": -3.883728265762329, "step": 2494 }, { "epoch": 2.36, "grad_norm": 14.402860641479492, "learning_rate": 1.1927247289261979e-07, "logps/chosen": -49.971622467041016, "logps/rejected": -88.38668823242188, "loss": 0.1399, "losses/dpo": 0.10670387744903564, "losses/sft": 2.0274529457092285, "losses/total": 0.10670387744903564, "ref_logps/chosen": -33.17326354980469, "ref_logps/rejected": -46.11180114746094, "rewards/accuracies": 1.0, "rewards/chosen": -1.6798359155654907, "rewards/margins": 2.54765248298645, "rewards/rejected": -4.2274885177612305, "step": 2495 }, { "epoch": 2.36, "grad_norm": 17.647789001464844, "learning_rate": 1.1909758656873033e-07, "logps/chosen": -61.177093505859375, "logps/rejected": -96.60371398925781, "loss": 0.167, "losses/dpo": 0.1981060802936554, "losses/sft": 2.3658909797668457, "losses/total": 0.1981060802936554, "ref_logps/chosen": -37.59406661987305, "ref_logps/rejected": -48.2691650390625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3583028316497803, "rewards/margins": 2.475151777267456, "rewards/rejected": -4.833454608917236, "step": 2496 }, { "epoch": 2.36, "grad_norm": 39.10746765136719, "learning_rate": 1.1892270024484084e-07, "logps/chosen": -65.44963073730469, "logps/rejected": -86.73764038085938, "loss": 0.3805, "losses/dpo": 0.4502178430557251, "losses/sft": 2.1416373252868652, "losses/total": 0.4502178430557251, "ref_logps/chosen": -40.37435531616211, "ref_logps/rejected": -44.7657356262207, "rewards/accuracies": 0.6875, "rewards/chosen": -2.5075275897979736, "rewards/margins": 1.6896629333496094, "rewards/rejected": -4.197190284729004, "step": 2497 }, { "epoch": 2.36, "grad_norm": 18.289409637451172, "learning_rate": 1.1874781392095138e-07, "logps/chosen": -48.01121520996094, "logps/rejected": -77.09693908691406, "loss": 0.1928, "losses/dpo": 0.1729419082403183, "losses/sft": 2.5575506687164307, "losses/total": 0.1729419082403183, "ref_logps/chosen": -31.58893585205078, "ref_logps/rejected": -40.06235885620117, "rewards/accuracies": 1.0, "rewards/chosen": -1.642228364944458, "rewards/margins": 2.061229705810547, "rewards/rejected": -3.703458070755005, "step": 2498 }, { "epoch": 2.36, "grad_norm": 34.7858772277832, "learning_rate": 1.185729275970619e-07, "logps/chosen": -54.143707275390625, "logps/rejected": -83.10661315917969, "loss": 0.396, "losses/dpo": 0.031737472862005234, "losses/sft": 2.1121714115142822, "losses/total": 0.031737472862005234, "ref_logps/chosen": -33.59892654418945, "ref_logps/rejected": -42.253868103027344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.054478168487549, "rewards/margins": 2.030796527862549, "rewards/rejected": -4.085274696350098, "step": 2499 }, { "epoch": 2.36, "grad_norm": 32.00473403930664, "learning_rate": 1.1839804127317243e-07, "logps/chosen": -60.17320251464844, "logps/rejected": -80.67930603027344, "loss": 0.3692, "losses/dpo": 0.31357210874557495, "losses/sft": 2.699254274368286, "losses/total": 0.31357210874557495, "ref_logps/chosen": -37.89751434326172, "ref_logps/rejected": -40.754493713378906, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2275688648223877, "rewards/margins": 1.7649123668670654, "rewards/rejected": -3.992481231689453, "step": 2500 }, { "epoch": 2.36, "grad_norm": 18.617389678955078, "learning_rate": 1.1822315494928297e-07, "logps/chosen": -59.37726593017578, "logps/rejected": -73.59318542480469, "loss": 0.2353, "losses/dpo": 0.34349995851516724, "losses/sft": 1.656891107559204, "losses/total": 0.34349995851516724, "ref_logps/chosen": -43.758975982666016, "ref_logps/rejected": -36.68318176269531, "rewards/accuracies": 0.875, "rewards/chosen": -1.5618293285369873, "rewards/margins": 2.1291706562042236, "rewards/rejected": -3.69100022315979, "step": 2501 }, { "epoch": 2.36, "grad_norm": 33.56885528564453, "learning_rate": 1.180482686253935e-07, "logps/chosen": -54.31251525878906, "logps/rejected": -69.59628295898438, "loss": 0.3615, "losses/dpo": 0.1370834857225418, "losses/sft": 1.7437453269958496, "losses/total": 0.1370834857225418, "ref_logps/chosen": -35.061424255371094, "ref_logps/rejected": -36.689937591552734, "rewards/accuracies": 0.8125, "rewards/chosen": -1.925109624862671, "rewards/margins": 1.3655250072479248, "rewards/rejected": -3.2906343936920166, "step": 2502 }, { "epoch": 2.36, "grad_norm": 22.351438522338867, "learning_rate": 1.1787338230150402e-07, "logps/chosen": -64.93675231933594, "logps/rejected": -88.02465057373047, "loss": 0.2347, "losses/dpo": 0.2633363604545593, "losses/sft": 2.7023799419403076, "losses/total": 0.2633363604545593, "ref_logps/chosen": -41.82102966308594, "ref_logps/rejected": -45.46723175048828, "rewards/accuracies": 0.9375, "rewards/chosen": -2.311573028564453, "rewards/margins": 1.944169044494629, "rewards/rejected": -4.25574254989624, "step": 2503 }, { "epoch": 2.36, "grad_norm": 17.059335708618164, "learning_rate": 1.1769849597761455e-07, "logps/chosen": -58.19172668457031, "logps/rejected": -88.29953002929688, "loss": 0.152, "losses/dpo": 0.15331706404685974, "losses/sft": 1.6098928451538086, "losses/total": 0.15331706404685974, "ref_logps/chosen": -39.69989776611328, "ref_logps/rejected": -43.158870697021484, "rewards/accuracies": 0.9375, "rewards/chosen": -1.849183201789856, "rewards/margins": 2.6648826599121094, "rewards/rejected": -4.514065742492676, "step": 2504 }, { "epoch": 2.37, "grad_norm": 10.571551322937012, "learning_rate": 1.1752360965372507e-07, "logps/chosen": -47.77638626098633, "logps/rejected": -90.98786163330078, "loss": 0.1173, "losses/dpo": 0.07016374170780182, "losses/sft": 1.7941679954528809, "losses/total": 0.07016374170780182, "ref_logps/chosen": -31.9508113861084, "ref_logps/rejected": -45.91461181640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.5825573205947876, "rewards/margins": 2.92476749420166, "rewards/rejected": -4.507324695587158, "step": 2505 }, { "epoch": 2.37, "grad_norm": 13.991881370544434, "learning_rate": 1.173487233298356e-07, "logps/chosen": -51.79463195800781, "logps/rejected": -89.44017028808594, "loss": 0.1979, "losses/dpo": 0.3450295627117157, "losses/sft": 1.1797661781311035, "losses/total": 0.3450295627117157, "ref_logps/chosen": -32.96784210205078, "ref_logps/rejected": -46.98945236206055, "rewards/accuracies": 1.0, "rewards/chosen": -1.882678747177124, "rewards/margins": 2.3623924255371094, "rewards/rejected": -4.2450714111328125, "step": 2506 }, { "epoch": 2.37, "grad_norm": 36.70252990722656, "learning_rate": 1.1717383700594612e-07, "logps/chosen": -58.41950225830078, "logps/rejected": -90.22645568847656, "loss": 0.275, "losses/dpo": 0.510208785533905, "losses/sft": 2.7526657581329346, "losses/total": 0.510208785533905, "ref_logps/chosen": -35.70545196533203, "ref_logps/rejected": -42.26300048828125, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2714052200317383, "rewards/margins": 2.524940013885498, "rewards/rejected": -4.7963457107543945, "step": 2507 }, { "epoch": 2.37, "grad_norm": 16.828462600708008, "learning_rate": 1.1699895068205666e-07, "logps/chosen": -65.06836700439453, "logps/rejected": -99.44084167480469, "loss": 0.1796, "losses/dpo": 0.18887639045715332, "losses/sft": 1.9328739643096924, "losses/total": 0.18887639045715332, "ref_logps/chosen": -41.1427001953125, "ref_logps/rejected": -54.88560485839844, "rewards/accuracies": 1.0, "rewards/chosen": -2.392566442489624, "rewards/margins": 2.062957286834717, "rewards/rejected": -4.455523490905762, "step": 2508 }, { "epoch": 2.37, "grad_norm": 29.831985473632812, "learning_rate": 1.1682406435816719e-07, "logps/chosen": -65.11944580078125, "logps/rejected": -83.46836853027344, "loss": 0.3405, "losses/dpo": 0.06395786255598068, "losses/sft": 1.8726327419281006, "losses/total": 0.06395786255598068, "ref_logps/chosen": -42.787269592285156, "ref_logps/rejected": -41.564571380615234, "rewards/accuracies": 0.8125, "rewards/chosen": -2.233217477798462, "rewards/margins": 1.9571622610092163, "rewards/rejected": -4.190380096435547, "step": 2509 }, { "epoch": 2.37, "grad_norm": 19.849464416503906, "learning_rate": 1.1664917803427771e-07, "logps/chosen": -48.45545196533203, "logps/rejected": -65.52117919921875, "loss": 0.2649, "losses/dpo": 0.23613230884075165, "losses/sft": 2.165217399597168, "losses/total": 0.23613230884075165, "ref_logps/chosen": -31.89284896850586, "ref_logps/rejected": -30.07781219482422, "rewards/accuracies": 0.875, "rewards/chosen": -1.6562602519989014, "rewards/margins": 1.8880764245986938, "rewards/rejected": -3.5443365573883057, "step": 2510 }, { "epoch": 2.37, "grad_norm": 31.271276473999023, "learning_rate": 1.1647429171038824e-07, "logps/chosen": -60.59358596801758, "logps/rejected": -74.07872772216797, "loss": 0.3818, "losses/dpo": 0.19057197868824005, "losses/sft": 1.6542092561721802, "losses/total": 0.19057197868824005, "ref_logps/chosen": -39.452354431152344, "ref_logps/rejected": -34.966094970703125, "rewards/accuracies": 0.875, "rewards/chosen": -2.1141233444213867, "rewards/margins": 1.7971405982971191, "rewards/rejected": -3.9112634658813477, "step": 2511 }, { "epoch": 2.37, "grad_norm": 29.318376541137695, "learning_rate": 1.1629940538649877e-07, "logps/chosen": -68.43071746826172, "logps/rejected": -93.36837768554688, "loss": 0.2093, "losses/dpo": 0.2968805730342865, "losses/sft": 2.332024097442627, "losses/total": 0.2968805730342865, "ref_logps/chosen": -46.62525939941406, "ref_logps/rejected": -48.09784698486328, "rewards/accuracies": 0.9375, "rewards/chosen": -2.180546283721924, "rewards/margins": 2.3465075492858887, "rewards/rejected": -4.5270538330078125, "step": 2512 }, { "epoch": 2.37, "grad_norm": 25.011404037475586, "learning_rate": 1.1612451906260929e-07, "logps/chosen": -78.1155014038086, "logps/rejected": -82.64877319335938, "loss": 0.2416, "losses/dpo": 0.14893746376037598, "losses/sft": 2.395493984222412, "losses/total": 0.14893746376037598, "ref_logps/chosen": -56.980194091796875, "ref_logps/rejected": -40.27091979980469, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1135311126708984, "rewards/margins": 2.124253988265991, "rewards/rejected": -4.237785339355469, "step": 2513 }, { "epoch": 2.37, "grad_norm": 29.251052856445312, "learning_rate": 1.1594963273871983e-07, "logps/chosen": -70.18621063232422, "logps/rejected": -106.8014907836914, "loss": 0.292, "losses/dpo": 0.35300523042678833, "losses/sft": 2.156134843826294, "losses/total": 0.35300523042678833, "ref_logps/chosen": -48.75346374511719, "ref_logps/rejected": -60.8981819152832, "rewards/accuracies": 0.875, "rewards/chosen": -2.1432740688323975, "rewards/margins": 2.447057008743286, "rewards/rejected": -4.590331077575684, "step": 2514 }, { "epoch": 2.37, "grad_norm": 22.282623291015625, "learning_rate": 1.1577474641483036e-07, "logps/chosen": -55.873451232910156, "logps/rejected": -77.88690185546875, "loss": 0.292, "losses/dpo": 0.42617663741111755, "losses/sft": 1.9727500677108765, "losses/total": 0.42617663741111755, "ref_logps/chosen": -38.03065490722656, "ref_logps/rejected": -36.25072479248047, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7842793464660645, "rewards/margins": 2.379338026046753, "rewards/rejected": -4.163617134094238, "step": 2515 }, { "epoch": 2.38, "grad_norm": 19.478694915771484, "learning_rate": 1.1559986009094088e-07, "logps/chosen": -46.964141845703125, "logps/rejected": -89.1031494140625, "loss": 0.1336, "losses/dpo": 0.23474769294261932, "losses/sft": 2.006344795227051, "losses/total": 0.23474769294261932, "ref_logps/chosen": -28.179340362548828, "ref_logps/rejected": -41.634239196777344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8784804344177246, "rewards/margins": 2.868411064147949, "rewards/rejected": -4.746891498565674, "step": 2516 }, { "epoch": 2.38, "grad_norm": 16.617446899414062, "learning_rate": 1.1542497376705142e-07, "logps/chosen": -40.45554733276367, "logps/rejected": -87.22796630859375, "loss": 0.1389, "losses/dpo": 0.10199884325265884, "losses/sft": 0.8668092489242554, "losses/total": 0.10199884325265884, "ref_logps/chosen": -27.659257888793945, "ref_logps/rejected": -44.386199951171875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2796292304992676, "rewards/margins": 3.004546642303467, "rewards/rejected": -4.284175872802734, "step": 2517 }, { "epoch": 2.38, "grad_norm": 25.946596145629883, "learning_rate": 1.1525008744316193e-07, "logps/chosen": -58.72694396972656, "logps/rejected": -87.80030822753906, "loss": 0.1976, "losses/dpo": 0.2447998821735382, "losses/sft": 2.1436166763305664, "losses/total": 0.2447998821735382, "ref_logps/chosen": -40.694984436035156, "ref_logps/rejected": -45.388946533203125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8031957149505615, "rewards/margins": 2.4379405975341797, "rewards/rejected": -4.24113655090332, "step": 2518 }, { "epoch": 2.38, "grad_norm": 28.655118942260742, "learning_rate": 1.1507520111927246e-07, "logps/chosen": -71.76898193359375, "logps/rejected": -90.77214050292969, "loss": 0.2743, "losses/dpo": 0.09876863658428192, "losses/sft": 1.8171237707138062, "losses/total": 0.09876863658428192, "ref_logps/chosen": -46.837554931640625, "ref_logps/rejected": -47.927669525146484, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4931435585021973, "rewards/margins": 1.7913038730621338, "rewards/rejected": -4.284447193145752, "step": 2519 }, { "epoch": 2.38, "grad_norm": 18.40703773498535, "learning_rate": 1.14900314795383e-07, "logps/chosen": -69.4997329711914, "logps/rejected": -88.21411895751953, "loss": 0.2095, "losses/dpo": 0.06394433975219727, "losses/sft": 1.9794204235076904, "losses/total": 0.06394433975219727, "ref_logps/chosen": -46.442657470703125, "ref_logps/rejected": -41.188011169433594, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3057076930999756, "rewards/margins": 2.3969035148620605, "rewards/rejected": -4.702611446380615, "step": 2520 }, { "epoch": 2.38, "grad_norm": 44.416439056396484, "learning_rate": 1.1472542847149353e-07, "logps/chosen": -72.70677947998047, "logps/rejected": -84.92475891113281, "loss": 0.4243, "losses/dpo": 0.20675459504127502, "losses/sft": 1.9799474477767944, "losses/total": 0.20675459504127502, "ref_logps/chosen": -47.42877960205078, "ref_logps/rejected": -46.17876052856445, "rewards/accuracies": 0.875, "rewards/chosen": -2.5278000831604004, "rewards/margins": 1.3468000888824463, "rewards/rejected": -3.874600410461426, "step": 2521 }, { "epoch": 2.38, "grad_norm": 35.40708923339844, "learning_rate": 1.1455054214760405e-07, "logps/chosen": -57.54240417480469, "logps/rejected": -79.65693664550781, "loss": 0.2964, "losses/dpo": 0.25444450974464417, "losses/sft": 2.3195719718933105, "losses/total": 0.25444450974464417, "ref_logps/chosen": -39.25010299682617, "ref_logps/rejected": -40.364768981933594, "rewards/accuracies": 0.875, "rewards/chosen": -1.8292301893234253, "rewards/margins": 2.0999863147735596, "rewards/rejected": -3.9292166233062744, "step": 2522 }, { "epoch": 2.38, "grad_norm": 26.783090591430664, "learning_rate": 1.1437565582371459e-07, "logps/chosen": -55.58489990234375, "logps/rejected": -88.6790771484375, "loss": 0.3058, "losses/dpo": 0.5128049850463867, "losses/sft": 3.0865347385406494, "losses/total": 0.5128049850463867, "ref_logps/chosen": -31.529624938964844, "ref_logps/rejected": -46.409820556640625, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4055278301239014, "rewards/margins": 1.8213980197906494, "rewards/rejected": -4.226925849914551, "step": 2523 }, { "epoch": 2.38, "grad_norm": 18.745254516601562, "learning_rate": 1.1420076949982512e-07, "logps/chosen": -55.39482879638672, "logps/rejected": -86.84278106689453, "loss": 0.2169, "losses/dpo": 0.22749954462051392, "losses/sft": 2.523874521255493, "losses/total": 0.22749954462051392, "ref_logps/chosen": -35.19559860229492, "ref_logps/rejected": -48.01429748535156, "rewards/accuracies": 0.9375, "rewards/chosen": -2.019923210144043, "rewards/margins": 1.8629255294799805, "rewards/rejected": -3.8828487396240234, "step": 2524 }, { "epoch": 2.38, "grad_norm": 36.80396270751953, "learning_rate": 1.1402588317593563e-07, "logps/chosen": -68.5572509765625, "logps/rejected": -93.6121826171875, "loss": 0.3319, "losses/dpo": 0.9290879964828491, "losses/sft": 2.5073792934417725, "losses/total": 0.9290879964828491, "ref_logps/chosen": -42.86677551269531, "ref_logps/rejected": -47.364646911621094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5690479278564453, "rewards/margins": 2.055706024169922, "rewards/rejected": -4.624753952026367, "step": 2525 }, { "epoch": 2.39, "grad_norm": 14.319046974182129, "learning_rate": 1.1385099685204616e-07, "logps/chosen": -44.667320251464844, "logps/rejected": -78.54658508300781, "loss": 0.1266, "losses/dpo": 0.06307297945022583, "losses/sft": 1.4635499715805054, "losses/total": 0.06307297945022583, "ref_logps/chosen": -31.404142379760742, "ref_logps/rejected": -39.424076080322266, "rewards/accuracies": 1.0, "rewards/chosen": -1.3263180255889893, "rewards/margins": 2.585933208465576, "rewards/rejected": -3.9122512340545654, "step": 2526 }, { "epoch": 2.39, "grad_norm": 21.520751953125, "learning_rate": 1.136761105281567e-07, "logps/chosen": -47.26658248901367, "logps/rejected": -66.99307250976562, "loss": 0.1981, "losses/dpo": 0.07307786494493484, "losses/sft": 1.6557444334030151, "losses/total": 0.07307786494493484, "ref_logps/chosen": -33.730133056640625, "ref_logps/rejected": -29.53289031982422, "rewards/accuracies": 1.0, "rewards/chosen": -1.3536449670791626, "rewards/margins": 2.3923730850219727, "rewards/rejected": -3.7460179328918457, "step": 2527 }, { "epoch": 2.39, "grad_norm": 15.640666961669922, "learning_rate": 1.1350122420426722e-07, "logps/chosen": -65.79432678222656, "logps/rejected": -97.02033996582031, "loss": 0.1134, "losses/dpo": 0.10471897572278976, "losses/sft": 1.8932609558105469, "losses/total": 0.10471897572278976, "ref_logps/chosen": -43.85139846801758, "ref_logps/rejected": -49.76023864746094, "rewards/accuracies": 1.0, "rewards/chosen": -2.1942920684814453, "rewards/margins": 2.5317177772521973, "rewards/rejected": -4.726009845733643, "step": 2528 }, { "epoch": 2.39, "grad_norm": 13.496094703674316, "learning_rate": 1.1332633788037775e-07, "logps/chosen": -57.23871612548828, "logps/rejected": -91.66221618652344, "loss": 0.1418, "losses/dpo": 0.062215834856033325, "losses/sft": 1.5759575366973877, "losses/total": 0.062215834856033325, "ref_logps/chosen": -39.60386657714844, "ref_logps/rejected": -44.322872161865234, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7634848356246948, "rewards/margins": 2.970449686050415, "rewards/rejected": -4.73393440246582, "step": 2529 }, { "epoch": 2.39, "grad_norm": 28.31764793395996, "learning_rate": 1.1315145155648829e-07, "logps/chosen": -60.44569396972656, "logps/rejected": -83.93952941894531, "loss": 0.2322, "losses/dpo": 0.05488099902868271, "losses/sft": 1.9449849128723145, "losses/total": 0.05488099902868271, "ref_logps/chosen": -39.794559478759766, "ref_logps/rejected": -40.099056243896484, "rewards/accuracies": 0.875, "rewards/chosen": -2.0651135444641113, "rewards/margins": 2.3189339637756348, "rewards/rejected": -4.384047508239746, "step": 2530 }, { "epoch": 2.39, "grad_norm": 29.370267868041992, "learning_rate": 1.1297656523259881e-07, "logps/chosen": -51.62361145019531, "logps/rejected": -90.83674621582031, "loss": 0.2629, "losses/dpo": 0.23380114138126373, "losses/sft": 1.6566338539123535, "losses/total": 0.23380114138126373, "ref_logps/chosen": -33.09646224975586, "ref_logps/rejected": -47.39875411987305, "rewards/accuracies": 0.875, "rewards/chosen": -1.8527146577835083, "rewards/margins": 2.4910855293273926, "rewards/rejected": -4.3438005447387695, "step": 2531 }, { "epoch": 2.39, "grad_norm": 24.555252075195312, "learning_rate": 1.1280167890870932e-07, "logps/chosen": -72.76583099365234, "logps/rejected": -95.81634521484375, "loss": 0.2592, "losses/dpo": 0.2598763108253479, "losses/sft": 2.5177524089813232, "losses/total": 0.2598763108253479, "ref_logps/chosen": -45.730281829833984, "ref_logps/rejected": -50.23405456542969, "rewards/accuracies": 0.8125, "rewards/chosen": -2.703554630279541, "rewards/margins": 1.8546749353408813, "rewards/rejected": -4.558229923248291, "step": 2532 }, { "epoch": 2.39, "grad_norm": 21.70107650756836, "learning_rate": 1.1262679258481986e-07, "logps/chosen": -71.17634582519531, "logps/rejected": -101.7744140625, "loss": 0.1884, "losses/dpo": 0.10532114654779434, "losses/sft": 3.040187120437622, "losses/total": 0.10532114654779434, "ref_logps/chosen": -40.8497200012207, "ref_logps/rejected": -45.84651184082031, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0326626300811768, "rewards/margins": 2.560127019882202, "rewards/rejected": -5.592789649963379, "step": 2533 }, { "epoch": 2.39, "grad_norm": 24.463098526000977, "learning_rate": 1.1245190626093039e-07, "logps/chosen": -52.83331298828125, "logps/rejected": -75.98428344726562, "loss": 0.2238, "losses/dpo": 0.24562160670757294, "losses/sft": 1.7588242292404175, "losses/total": 0.24562160670757294, "ref_logps/chosen": -34.70185852050781, "ref_logps/rejected": -36.12361526489258, "rewards/accuracies": 0.9375, "rewards/chosen": -1.813145637512207, "rewards/margins": 2.1729211807250977, "rewards/rejected": -3.9860668182373047, "step": 2534 }, { "epoch": 2.39, "grad_norm": 40.53074264526367, "learning_rate": 1.1227701993704091e-07, "logps/chosen": -61.78066635131836, "logps/rejected": -79.13165283203125, "loss": 0.3981, "losses/dpo": 0.37873753905296326, "losses/sft": 1.501401424407959, "losses/total": 0.37873753905296326, "ref_logps/chosen": -38.14493179321289, "ref_logps/rejected": -38.703392028808594, "rewards/accuracies": 0.75, "rewards/chosen": -2.3635733127593994, "rewards/margins": 1.6792532205581665, "rewards/rejected": -4.0428266525268555, "step": 2535 }, { "epoch": 2.39, "grad_norm": 34.54536819458008, "learning_rate": 1.1210213361315145e-07, "logps/chosen": -60.59071731567383, "logps/rejected": -83.66757202148438, "loss": 0.3721, "losses/dpo": 0.22701193392276764, "losses/sft": 2.5241289138793945, "losses/total": 0.22701193392276764, "ref_logps/chosen": -34.511104583740234, "ref_logps/rejected": -36.30790710449219, "rewards/accuracies": 0.75, "rewards/chosen": -2.6079609394073486, "rewards/margins": 2.1280055046081543, "rewards/rejected": -4.735966682434082, "step": 2536 }, { "epoch": 2.4, "grad_norm": 27.279024124145508, "learning_rate": 1.1192724728926198e-07, "logps/chosen": -72.88031005859375, "logps/rejected": -92.48606872558594, "loss": 0.2682, "losses/dpo": 0.3463936448097229, "losses/sft": 2.2874464988708496, "losses/total": 0.3463936448097229, "ref_logps/chosen": -48.92679977416992, "ref_logps/rejected": -46.500892639160156, "rewards/accuracies": 0.875, "rewards/chosen": -2.3953514099121094, "rewards/margins": 2.2031667232513428, "rewards/rejected": -4.598518371582031, "step": 2537 }, { "epoch": 2.4, "grad_norm": 26.769601821899414, "learning_rate": 1.117523609653725e-07, "logps/chosen": -46.56377410888672, "logps/rejected": -75.41265869140625, "loss": 0.287, "losses/dpo": 0.284978985786438, "losses/sft": 2.4260573387145996, "losses/total": 0.284978985786438, "ref_logps/chosen": -27.320091247558594, "ref_logps/rejected": -36.27096176147461, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9243682622909546, "rewards/margins": 1.9898014068603516, "rewards/rejected": -3.9141695499420166, "step": 2538 }, { "epoch": 2.4, "grad_norm": 25.439987182617188, "learning_rate": 1.1157747464148302e-07, "logps/chosen": -75.16964721679688, "logps/rejected": -103.0256118774414, "loss": 0.2039, "losses/dpo": 0.23099377751350403, "losses/sft": 2.4316399097442627, "losses/total": 0.23099377751350403, "ref_logps/chosen": -49.023441314697266, "ref_logps/rejected": -54.69520568847656, "rewards/accuracies": 0.9375, "rewards/chosen": -2.614621162414551, "rewards/margins": 2.2184195518493652, "rewards/rejected": -4.833040714263916, "step": 2539 }, { "epoch": 2.4, "grad_norm": 19.881114959716797, "learning_rate": 1.1140258831759356e-07, "logps/chosen": -56.24695587158203, "logps/rejected": -99.95590209960938, "loss": 0.2281, "losses/dpo": 0.3772699236869812, "losses/sft": 1.3900045156478882, "losses/total": 0.3772699236869812, "ref_logps/chosen": -38.65867614746094, "ref_logps/rejected": -57.306697845458984, "rewards/accuracies": 0.875, "rewards/chosen": -1.7588279247283936, "rewards/margins": 2.506092071533203, "rewards/rejected": -4.264919757843018, "step": 2540 }, { "epoch": 2.4, "grad_norm": 11.859535217285156, "learning_rate": 1.1122770199370408e-07, "logps/chosen": -57.50248718261719, "logps/rejected": -87.93421173095703, "loss": 0.1335, "losses/dpo": 0.13757172226905823, "losses/sft": 2.3896234035491943, "losses/total": 0.13757172226905823, "ref_logps/chosen": -37.72437286376953, "ref_logps/rejected": -43.9356575012207, "rewards/accuracies": 1.0, "rewards/chosen": -1.9778120517730713, "rewards/margins": 2.422043800354004, "rewards/rejected": -4.399855613708496, "step": 2541 }, { "epoch": 2.4, "grad_norm": 29.46007537841797, "learning_rate": 1.1105281566981461e-07, "logps/chosen": -51.233314514160156, "logps/rejected": -77.33616638183594, "loss": 0.3359, "losses/dpo": 0.34855157136917114, "losses/sft": 1.9801949262619019, "losses/total": 0.34855157136917114, "ref_logps/chosen": -33.40945053100586, "ref_logps/rejected": -40.61573791503906, "rewards/accuracies": 0.875, "rewards/chosen": -1.7823864221572876, "rewards/margins": 1.8896558284759521, "rewards/rejected": -3.6720423698425293, "step": 2542 }, { "epoch": 2.4, "grad_norm": 21.368791580200195, "learning_rate": 1.1087792934592515e-07, "logps/chosen": -62.33343505859375, "logps/rejected": -101.64625549316406, "loss": 0.1896, "losses/dpo": 0.18004998564720154, "losses/sft": 3.0143661499023438, "losses/total": 0.18004998564720154, "ref_logps/chosen": -38.726539611816406, "ref_logps/rejected": -51.92256164550781, "rewards/accuracies": 0.9375, "rewards/chosen": -2.360690116882324, "rewards/margins": 2.6116790771484375, "rewards/rejected": -4.972369194030762, "step": 2543 }, { "epoch": 2.4, "grad_norm": 35.730865478515625, "learning_rate": 1.1070304302203567e-07, "logps/chosen": -70.31576538085938, "logps/rejected": -100.08964538574219, "loss": 0.4345, "losses/dpo": 1.000927448272705, "losses/sft": 3.285482168197632, "losses/total": 1.000927448272705, "ref_logps/chosen": -41.02024841308594, "ref_logps/rejected": -49.754337310791016, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9295520782470703, "rewards/margins": 2.1039786338806152, "rewards/rejected": -5.0335307121276855, "step": 2544 }, { "epoch": 2.4, "grad_norm": 18.468238830566406, "learning_rate": 1.105281566981462e-07, "logps/chosen": -41.055992126464844, "logps/rejected": -68.91433715820312, "loss": 0.2171, "losses/dpo": 0.2452457845211029, "losses/sft": 1.721042275428772, "losses/total": 0.2452457845211029, "ref_logps/chosen": -22.163066864013672, "ref_logps/rejected": -31.338363647460938, "rewards/accuracies": 1.0, "rewards/chosen": -1.8892927169799805, "rewards/margins": 1.8683040142059326, "rewards/rejected": -3.757596969604492, "step": 2545 }, { "epoch": 2.4, "grad_norm": 22.803457260131836, "learning_rate": 1.1035327037425674e-07, "logps/chosen": -53.94634246826172, "logps/rejected": -80.46607971191406, "loss": 0.2559, "losses/dpo": 0.16401897370815277, "losses/sft": 1.861817717552185, "losses/total": 0.16401897370815277, "ref_logps/chosen": -36.254173278808594, "ref_logps/rejected": -39.521995544433594, "rewards/accuracies": 0.875, "rewards/chosen": -1.7692170143127441, "rewards/margins": 2.3251914978027344, "rewards/rejected": -4.0944085121154785, "step": 2546 }, { "epoch": 2.41, "grad_norm": 38.432125091552734, "learning_rate": 1.1017838405036725e-07, "logps/chosen": -57.14134216308594, "logps/rejected": -79.35484313964844, "loss": 0.3582, "losses/dpo": 0.15578432381153107, "losses/sft": 1.6802306175231934, "losses/total": 0.15578432381153107, "ref_logps/chosen": -41.46296310424805, "ref_logps/rejected": -43.66193771362305, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5678377151489258, "rewards/margins": 2.001453161239624, "rewards/rejected": -3.56929087638855, "step": 2547 }, { "epoch": 2.41, "grad_norm": 25.972978591918945, "learning_rate": 1.1000349772647778e-07, "logps/chosen": -49.720951080322266, "logps/rejected": -93.64736938476562, "loss": 0.1612, "losses/dpo": 0.15485922992229462, "losses/sft": 1.4682108163833618, "losses/total": 0.15485922992229462, "ref_logps/chosen": -30.325763702392578, "ref_logps/rejected": -45.82988739013672, "rewards/accuracies": 0.9375, "rewards/chosen": -1.939518690109253, "rewards/margins": 2.8422303199768066, "rewards/rejected": -4.7817487716674805, "step": 2548 }, { "epoch": 2.41, "grad_norm": 18.726795196533203, "learning_rate": 1.0982861140258832e-07, "logps/chosen": -61.978515625, "logps/rejected": -96.72171020507812, "loss": 0.197, "losses/dpo": 0.20206576585769653, "losses/sft": 2.011671781539917, "losses/total": 0.20206576585769653, "ref_logps/chosen": -39.43273162841797, "ref_logps/rejected": -50.971038818359375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2545785903930664, "rewards/margins": 2.320488929748535, "rewards/rejected": -4.575067520141602, "step": 2549 }, { "epoch": 2.41, "grad_norm": 14.803045272827148, "learning_rate": 1.0965372507869884e-07, "logps/chosen": -51.179534912109375, "logps/rejected": -93.59822082519531, "loss": 0.1145, "losses/dpo": 0.12413838505744934, "losses/sft": 2.185044288635254, "losses/total": 0.12413838505744934, "ref_logps/chosen": -32.79823684692383, "ref_logps/rejected": -47.055362701416016, "rewards/accuracies": 1.0, "rewards/chosen": -1.838130235671997, "rewards/margins": 2.816154956817627, "rewards/rejected": -4.654285430908203, "step": 2550 }, { "epoch": 2.41, "grad_norm": 20.38247299194336, "learning_rate": 1.0947883875480937e-07, "logps/chosen": -59.00342559814453, "logps/rejected": -91.65235900878906, "loss": 0.205, "losses/dpo": 0.1265772134065628, "losses/sft": 2.0871572494506836, "losses/total": 0.1265772134065628, "ref_logps/chosen": -40.99570846557617, "ref_logps/rejected": -49.13435745239258, "rewards/accuracies": 1.0, "rewards/chosen": -1.800771713256836, "rewards/margins": 2.4510293006896973, "rewards/rejected": -4.251801013946533, "step": 2551 }, { "epoch": 2.41, "grad_norm": 24.816736221313477, "learning_rate": 1.0930395243091991e-07, "logps/chosen": -44.99361038208008, "logps/rejected": -75.62315368652344, "loss": 0.2588, "losses/dpo": 0.06230252981185913, "losses/sft": 1.9967135190963745, "losses/total": 0.06230252981185913, "ref_logps/chosen": -25.542572021484375, "ref_logps/rejected": -37.40641784667969, "rewards/accuracies": 0.875, "rewards/chosen": -1.9451038837432861, "rewards/margins": 1.8765697479248047, "rewards/rejected": -3.821673631668091, "step": 2552 }, { "epoch": 2.41, "grad_norm": 29.295747756958008, "learning_rate": 1.0912906610703043e-07, "logps/chosen": -52.34647750854492, "logps/rejected": -74.05122375488281, "loss": 0.3451, "losses/dpo": 0.5134259462356567, "losses/sft": 2.4235899448394775, "losses/total": 0.5134259462356567, "ref_logps/chosen": -34.62861633300781, "ref_logps/rejected": -40.31925964355469, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7717859745025635, "rewards/margins": 1.6014108657836914, "rewards/rejected": -3.373196840286255, "step": 2553 }, { "epoch": 2.41, "grad_norm": 20.89582061767578, "learning_rate": 1.0895417978314095e-07, "logps/chosen": -46.52728271484375, "logps/rejected": -73.043701171875, "loss": 0.2388, "losses/dpo": 0.17445340752601624, "losses/sft": 1.5419580936431885, "losses/total": 0.17445340752601624, "ref_logps/chosen": -30.312349319458008, "ref_logps/rejected": -37.24897384643555, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6214933395385742, "rewards/margins": 1.957979679107666, "rewards/rejected": -3.5794732570648193, "step": 2554 }, { "epoch": 2.41, "grad_norm": 32.36873245239258, "learning_rate": 1.0877929345925147e-07, "logps/chosen": -45.970191955566406, "logps/rejected": -62.24394989013672, "loss": 0.4179, "losses/dpo": 0.23007695376873016, "losses/sft": 1.738694190979004, "losses/total": 0.23007695376873016, "ref_logps/chosen": -27.8089599609375, "ref_logps/rejected": -31.732833862304688, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8161232471466064, "rewards/margins": 1.234989047050476, "rewards/rejected": -3.051112174987793, "step": 2555 }, { "epoch": 2.41, "grad_norm": 18.925806045532227, "learning_rate": 1.0860440713536201e-07, "logps/chosen": -59.07218933105469, "logps/rejected": -83.79288482666016, "loss": 0.1978, "losses/dpo": 0.10686750710010529, "losses/sft": 2.293520212173462, "losses/total": 0.10686750710010529, "ref_logps/chosen": -41.20150375366211, "ref_logps/rejected": -45.5233268737793, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7870683670043945, "rewards/margins": 2.0398879051208496, "rewards/rejected": -3.826956272125244, "step": 2556 }, { "epoch": 2.41, "grad_norm": 26.95958137512207, "learning_rate": 1.0842952081147254e-07, "logps/chosen": -64.685302734375, "logps/rejected": -84.84998321533203, "loss": 0.2623, "losses/dpo": 0.3503202795982361, "losses/sft": 2.1921966075897217, "losses/total": 0.3503202795982361, "ref_logps/chosen": -39.75926971435547, "ref_logps/rejected": -39.21153259277344, "rewards/accuracies": 0.875, "rewards/chosen": -2.492602586746216, "rewards/margins": 2.071241617202759, "rewards/rejected": -4.563844203948975, "step": 2557 }, { "epoch": 2.42, "grad_norm": 27.554433822631836, "learning_rate": 1.0825463448758306e-07, "logps/chosen": -55.231040954589844, "logps/rejected": -94.85967254638672, "loss": 0.2275, "losses/dpo": 0.5521403551101685, "losses/sft": 2.0651941299438477, "losses/total": 0.5521403551101685, "ref_logps/chosen": -35.30305480957031, "ref_logps/rejected": -46.768211364746094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9927988052368164, "rewards/margins": 2.816347599029541, "rewards/rejected": -4.809145927429199, "step": 2558 }, { "epoch": 2.42, "grad_norm": 30.89105987548828, "learning_rate": 1.080797481636936e-07, "logps/chosen": -63.46617889404297, "logps/rejected": -88.59207916259766, "loss": 0.2614, "losses/dpo": 0.18831908702850342, "losses/sft": 1.423599123954773, "losses/total": 0.18831908702850342, "ref_logps/chosen": -41.77220916748047, "ref_logps/rejected": -43.777137756347656, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1693973541259766, "rewards/margins": 2.3120970726013184, "rewards/rejected": -4.481494426727295, "step": 2559 }, { "epoch": 2.42, "grad_norm": 26.34918785095215, "learning_rate": 1.0790486183980413e-07, "logps/chosen": -63.056236267089844, "logps/rejected": -89.27275085449219, "loss": 0.2281, "losses/dpo": 0.29373589158058167, "losses/sft": 2.6011059284210205, "losses/total": 0.29373589158058167, "ref_logps/chosen": -42.57627868652344, "ref_logps/rejected": -49.83995819091797, "rewards/accuracies": 0.875, "rewards/chosen": -2.0479960441589355, "rewards/margins": 1.8952829837799072, "rewards/rejected": -3.9432787895202637, "step": 2560 }, { "epoch": 2.42, "grad_norm": 23.103620529174805, "learning_rate": 1.0772997551591464e-07, "logps/chosen": -64.57014465332031, "logps/rejected": -99.75311279296875, "loss": 0.208, "losses/dpo": 0.22294288873672485, "losses/sft": 2.720385789871216, "losses/total": 0.22294288873672485, "ref_logps/chosen": -41.784584045410156, "ref_logps/rejected": -52.6777229309082, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2785568237304688, "rewards/margins": 2.4289824962615967, "rewards/rejected": -4.7075395584106445, "step": 2561 }, { "epoch": 2.42, "grad_norm": 33.543739318847656, "learning_rate": 1.0755508919202518e-07, "logps/chosen": -65.54817199707031, "logps/rejected": -69.56430053710938, "loss": 0.4232, "losses/dpo": 0.22556215524673462, "losses/sft": 2.6789019107818604, "losses/total": 0.22556215524673462, "ref_logps/chosen": -40.86676025390625, "ref_logps/rejected": -32.30731964111328, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4681406021118164, "rewards/margins": 1.2575569152832031, "rewards/rejected": -3.7256975173950195, "step": 2562 }, { "epoch": 2.42, "grad_norm": 26.381847381591797, "learning_rate": 1.073802028681357e-07, "logps/chosen": -60.17300033569336, "logps/rejected": -82.67431640625, "loss": 0.2909, "losses/dpo": 0.23626123368740082, "losses/sft": 2.309133768081665, "losses/total": 0.23626123368740082, "ref_logps/chosen": -39.730655670166016, "ref_logps/rejected": -44.660614013671875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.044234275817871, "rewards/margins": 1.757136344909668, "rewards/rejected": -3.80137038230896, "step": 2563 }, { "epoch": 2.42, "grad_norm": 18.88750457763672, "learning_rate": 1.0720531654424623e-07, "logps/chosen": -55.77342224121094, "logps/rejected": -91.54100036621094, "loss": 0.2088, "losses/dpo": 0.19664618372917175, "losses/sft": 2.034778594970703, "losses/total": 0.19664618372917175, "ref_logps/chosen": -37.26781463623047, "ref_logps/rejected": -49.17311096191406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8505603075027466, "rewards/margins": 2.386228084564209, "rewards/rejected": -4.236788272857666, "step": 2564 }, { "epoch": 2.42, "grad_norm": 24.47402572631836, "learning_rate": 1.0703043022035677e-07, "logps/chosen": -69.35269927978516, "logps/rejected": -87.98463439941406, "loss": 0.2034, "losses/dpo": 0.23828518390655518, "losses/sft": 2.5770516395568848, "losses/total": 0.23828518390655518, "ref_logps/chosen": -44.25381088256836, "ref_logps/rejected": -43.23814392089844, "rewards/accuracies": 1.0, "rewards/chosen": -2.5098886489868164, "rewards/margins": 1.9647605419158936, "rewards/rejected": -4.474648952484131, "step": 2565 }, { "epoch": 2.42, "grad_norm": 28.16697883605957, "learning_rate": 1.068555438964673e-07, "logps/chosen": -58.12694549560547, "logps/rejected": -96.69146728515625, "loss": 0.2996, "losses/dpo": 0.7032175064086914, "losses/sft": 2.742940664291382, "losses/total": 0.7032175064086914, "ref_logps/chosen": -36.491485595703125, "ref_logps/rejected": -49.2968635559082, "rewards/accuracies": 0.8125, "rewards/chosen": -2.163546085357666, "rewards/margins": 2.5759146213531494, "rewards/rejected": -4.739460468292236, "step": 2566 }, { "epoch": 2.42, "grad_norm": 22.47705078125, "learning_rate": 1.0668065757257782e-07, "logps/chosen": -55.596065521240234, "logps/rejected": -90.7059326171875, "loss": 0.2267, "losses/dpo": 0.09340497851371765, "losses/sft": 1.8198113441467285, "losses/total": 0.09340497851371765, "ref_logps/chosen": -34.655731201171875, "ref_logps/rejected": -43.12435531616211, "rewards/accuracies": 0.875, "rewards/chosen": -2.0940330028533936, "rewards/margins": 2.6641249656677246, "rewards/rejected": -4.758157730102539, "step": 2567 }, { "epoch": 2.42, "grad_norm": 15.867330551147461, "learning_rate": 1.0650577124868835e-07, "logps/chosen": -57.36811828613281, "logps/rejected": -103.16275787353516, "loss": 0.1051, "losses/dpo": 0.054004017263650894, "losses/sft": 1.3831908702850342, "losses/total": 0.054004017263650894, "ref_logps/chosen": -38.60383224487305, "ref_logps/rejected": -53.714534759521484, "rewards/accuracies": 1.0, "rewards/chosen": -1.8764283657073975, "rewards/margins": 3.0683937072753906, "rewards/rejected": -4.944822311401367, "step": 2568 }, { "epoch": 2.43, "grad_norm": 15.959575653076172, "learning_rate": 1.0633088492479887e-07, "logps/chosen": -62.00328826904297, "logps/rejected": -103.62792205810547, "loss": 0.1124, "losses/dpo": 0.1275884211063385, "losses/sft": 2.610710382461548, "losses/total": 0.1275884211063385, "ref_logps/chosen": -39.876678466796875, "ref_logps/rejected": -56.215484619140625, "rewards/accuracies": 1.0, "rewards/chosen": -2.2126615047454834, "rewards/margins": 2.5285825729370117, "rewards/rejected": -4.741243839263916, "step": 2569 }, { "epoch": 2.43, "grad_norm": 22.52553939819336, "learning_rate": 1.061559986009094e-07, "logps/chosen": -53.69630813598633, "logps/rejected": -77.34857177734375, "loss": 0.2359, "losses/dpo": 0.0458427369594574, "losses/sft": 1.746375560760498, "losses/total": 0.0458427369594574, "ref_logps/chosen": -37.15336608886719, "ref_logps/rejected": -40.85997009277344, "rewards/accuracies": 0.875, "rewards/chosen": -1.654294490814209, "rewards/margins": 1.994565725326538, "rewards/rejected": -3.648860216140747, "step": 2570 }, { "epoch": 2.43, "grad_norm": 31.48843765258789, "learning_rate": 1.0598111227701994e-07, "logps/chosen": -55.549407958984375, "logps/rejected": -86.17488098144531, "loss": 0.3064, "losses/dpo": 0.3240250051021576, "losses/sft": 2.280369997024536, "losses/total": 0.3240250051021576, "ref_logps/chosen": -39.13319396972656, "ref_logps/rejected": -46.76850128173828, "rewards/accuracies": 0.75, "rewards/chosen": -1.6416208744049072, "rewards/margins": 2.2990169525146484, "rewards/rejected": -3.9406378269195557, "step": 2571 }, { "epoch": 2.43, "grad_norm": 20.905593872070312, "learning_rate": 1.0580622595313046e-07, "logps/chosen": -68.0166015625, "logps/rejected": -99.62675476074219, "loss": 0.1966, "losses/dpo": 0.25101208686828613, "losses/sft": 2.055720567703247, "losses/total": 0.25101208686828613, "ref_logps/chosen": -44.68172836303711, "ref_logps/rejected": -50.23406219482422, "rewards/accuracies": 1.0, "rewards/chosen": -2.3334875106811523, "rewards/margins": 2.6057815551757812, "rewards/rejected": -4.939269065856934, "step": 2572 }, { "epoch": 2.43, "grad_norm": 15.05843448638916, "learning_rate": 1.0563133962924099e-07, "logps/chosen": -68.5745849609375, "logps/rejected": -104.69305419921875, "loss": 0.1654, "losses/dpo": 0.09332914650440216, "losses/sft": 2.0536437034606934, "losses/total": 0.09332914650440216, "ref_logps/chosen": -45.46392059326172, "ref_logps/rejected": -57.96839141845703, "rewards/accuracies": 1.0, "rewards/chosen": -2.311066150665283, "rewards/margins": 2.3613998889923096, "rewards/rejected": -4.672466278076172, "step": 2573 }, { "epoch": 2.43, "grad_norm": 30.56990623474121, "learning_rate": 1.0545645330535152e-07, "logps/chosen": -60.13212203979492, "logps/rejected": -73.24850463867188, "loss": 0.3406, "losses/dpo": 0.3165338635444641, "losses/sft": 2.2543745040893555, "losses/total": 0.3165338635444641, "ref_logps/chosen": -39.25490951538086, "ref_logps/rejected": -37.137939453125, "rewards/accuracies": 0.875, "rewards/chosen": -2.087721347808838, "rewards/margins": 1.523335337638855, "rewards/rejected": -3.6110568046569824, "step": 2574 }, { "epoch": 2.43, "grad_norm": 37.013343811035156, "learning_rate": 1.0528156698146204e-07, "logps/chosen": -63.27751922607422, "logps/rejected": -78.61744689941406, "loss": 0.3501, "losses/dpo": 0.6447678208351135, "losses/sft": 2.375950574874878, "losses/total": 0.6447678208351135, "ref_logps/chosen": -40.818180084228516, "ref_logps/rejected": -38.092140197753906, "rewards/accuracies": 0.875, "rewards/chosen": -2.245934009552002, "rewards/margins": 1.8065966367721558, "rewards/rejected": -4.052530765533447, "step": 2575 }, { "epoch": 2.43, "grad_norm": 22.064970016479492, "learning_rate": 1.0510668065757257e-07, "logps/chosen": -53.35215377807617, "logps/rejected": -93.520263671875, "loss": 0.2302, "losses/dpo": 0.2972893714904785, "losses/sft": 2.070923089981079, "losses/total": 0.2972893714904785, "ref_logps/chosen": -32.72959899902344, "ref_logps/rejected": -48.75401306152344, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0622551441192627, "rewards/margins": 2.414369821548462, "rewards/rejected": -4.476624965667725, "step": 2576 }, { "epoch": 2.43, "grad_norm": 23.87879753112793, "learning_rate": 1.049317943336831e-07, "logps/chosen": -53.36554718017578, "logps/rejected": -73.52658081054688, "loss": 0.2544, "losses/dpo": 0.09178003668785095, "losses/sft": 1.391703724861145, "losses/total": 0.09178003668785095, "ref_logps/chosen": -34.63602828979492, "ref_logps/rejected": -33.592864990234375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.872951626777649, "rewards/margins": 2.12041974067688, "rewards/rejected": -3.9933714866638184, "step": 2577 }, { "epoch": 2.43, "grad_norm": 24.76368522644043, "learning_rate": 1.0475690800979363e-07, "logps/chosen": -55.37023162841797, "logps/rejected": -80.47941589355469, "loss": 0.22, "losses/dpo": 0.23654115200042725, "losses/sft": 2.030259132385254, "losses/total": 0.23654115200042725, "ref_logps/chosen": -38.04815673828125, "ref_logps/rejected": -41.07991409301758, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7322070598602295, "rewards/margins": 2.2077434062957764, "rewards/rejected": -3.939950466156006, "step": 2578 }, { "epoch": 2.44, "grad_norm": 15.50444221496582, "learning_rate": 1.0458202168590416e-07, "logps/chosen": -51.69697952270508, "logps/rejected": -95.99156951904297, "loss": 0.1231, "losses/dpo": 0.11117083579301834, "losses/sft": 2.241940975189209, "losses/total": 0.11117083579301834, "ref_logps/chosen": -30.247873306274414, "ref_logps/rejected": -48.499053955078125, "rewards/accuracies": 1.0, "rewards/chosen": -2.1449108123779297, "rewards/margins": 2.6043403148651123, "rewards/rejected": -4.749251365661621, "step": 2579 }, { "epoch": 2.44, "grad_norm": 22.83673095703125, "learning_rate": 1.0440713536201469e-07, "logps/chosen": -65.80226135253906, "logps/rejected": -99.94749450683594, "loss": 0.1994, "losses/dpo": 0.07799456268548965, "losses/sft": 2.1029629707336426, "losses/total": 0.07799456268548965, "ref_logps/chosen": -38.49122619628906, "ref_logps/rejected": -48.85552978515625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7311038970947266, "rewards/margins": 2.378091812133789, "rewards/rejected": -5.109195709228516, "step": 2580 }, { "epoch": 2.44, "grad_norm": 19.489704132080078, "learning_rate": 1.0423224903812522e-07, "logps/chosen": -57.17464828491211, "logps/rejected": -94.33549499511719, "loss": 0.2041, "losses/dpo": 0.11986584216356277, "losses/sft": 2.0400967597961426, "losses/total": 0.11986584216356277, "ref_logps/chosen": -37.687808990478516, "ref_logps/rejected": -48.437835693359375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9486839771270752, "rewards/margins": 2.6410815715789795, "rewards/rejected": -4.589765548706055, "step": 2581 }, { "epoch": 2.44, "grad_norm": 30.269742965698242, "learning_rate": 1.0405736271423574e-07, "logps/chosen": -66.55439758300781, "logps/rejected": -91.45822143554688, "loss": 0.3001, "losses/dpo": 0.36642366647720337, "losses/sft": 1.6556113958358765, "losses/total": 0.36642366647720337, "ref_logps/chosen": -37.89095687866211, "ref_logps/rejected": -43.350677490234375, "rewards/accuracies": 0.875, "rewards/chosen": -2.8663437366485596, "rewards/margins": 1.9444106817245483, "rewards/rejected": -4.810754776000977, "step": 2582 }, { "epoch": 2.44, "grad_norm": 31.75640296936035, "learning_rate": 1.0388247639034626e-07, "logps/chosen": -60.32160568237305, "logps/rejected": -93.30657196044922, "loss": 0.3227, "losses/dpo": 0.5739339590072632, "losses/sft": 2.398925542831421, "losses/total": 0.5739339590072632, "ref_logps/chosen": -38.183006286621094, "ref_logps/rejected": -53.49128341674805, "rewards/accuracies": 0.8125, "rewards/chosen": -2.213860034942627, "rewards/margins": 1.7676692008972168, "rewards/rejected": -3.9815292358398438, "step": 2583 }, { "epoch": 2.44, "grad_norm": 28.80004119873047, "learning_rate": 1.037075900664568e-07, "logps/chosen": -61.17211151123047, "logps/rejected": -76.71539306640625, "loss": 0.3286, "losses/dpo": 0.16467228531837463, "losses/sft": 2.1150360107421875, "losses/total": 0.16467228531837463, "ref_logps/chosen": -38.43989944458008, "ref_logps/rejected": -36.76949691772461, "rewards/accuracies": 0.875, "rewards/chosen": -2.273221492767334, "rewards/margins": 1.7213687896728516, "rewards/rejected": -3.9945902824401855, "step": 2584 }, { "epoch": 2.44, "grad_norm": 16.38739776611328, "learning_rate": 1.0353270374256733e-07, "logps/chosen": -52.807159423828125, "logps/rejected": -105.39630889892578, "loss": 0.1077, "losses/dpo": 0.20742735266685486, "losses/sft": 1.9841108322143555, "losses/total": 0.20742735266685486, "ref_logps/chosen": -37.28129196166992, "ref_logps/rejected": -54.11236572265625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5525871515274048, "rewards/margins": 3.5758073329925537, "rewards/rejected": -5.12839412689209, "step": 2585 }, { "epoch": 2.44, "grad_norm": 16.015092849731445, "learning_rate": 1.0335781741867785e-07, "logps/chosen": -61.46519470214844, "logps/rejected": -107.0567398071289, "loss": 0.1243, "losses/dpo": 0.03934570401906967, "losses/sft": 1.920172929763794, "losses/total": 0.03934570401906967, "ref_logps/chosen": -39.61329650878906, "ref_logps/rejected": -55.37855529785156, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1851894855499268, "rewards/margins": 2.982628583908081, "rewards/rejected": -5.167818069458008, "step": 2586 }, { "epoch": 2.44, "grad_norm": 21.995683670043945, "learning_rate": 1.0318293109478839e-07, "logps/chosen": -37.99213409423828, "logps/rejected": -69.28343963623047, "loss": 0.2505, "losses/dpo": 0.2805306613445282, "losses/sft": 1.5321217775344849, "losses/total": 0.2805306613445282, "ref_logps/chosen": -22.738601684570312, "ref_logps/rejected": -33.14030075073242, "rewards/accuracies": 0.9375, "rewards/chosen": -1.525353193283081, "rewards/margins": 2.088960647583008, "rewards/rejected": -3.614313840866089, "step": 2587 }, { "epoch": 2.44, "grad_norm": 27.576040267944336, "learning_rate": 1.0300804477089892e-07, "logps/chosen": -47.47115707397461, "logps/rejected": -82.44940185546875, "loss": 0.2467, "losses/dpo": 0.24677318334579468, "losses/sft": 1.7984297275543213, "losses/total": 0.24677318334579468, "ref_logps/chosen": -24.82701301574707, "ref_logps/rejected": -38.341651916503906, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2644143104553223, "rewards/margins": 2.146359920501709, "rewards/rejected": -4.410774230957031, "step": 2588 }, { "epoch": 2.44, "grad_norm": 15.27244758605957, "learning_rate": 1.0283315844700943e-07, "logps/chosen": -50.437843322753906, "logps/rejected": -84.30396270751953, "loss": 0.1588, "losses/dpo": 0.2607237994670868, "losses/sft": 2.5970253944396973, "losses/total": 0.2607237994670868, "ref_logps/chosen": -31.924461364746094, "ref_logps/rejected": -41.21435546875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8513380289077759, "rewards/margins": 2.45762300491333, "rewards/rejected": -4.308960914611816, "step": 2589 }, { "epoch": 2.45, "grad_norm": 16.532649993896484, "learning_rate": 1.0265827212311996e-07, "logps/chosen": -65.29950714111328, "logps/rejected": -98.13487243652344, "loss": 0.1633, "losses/dpo": 0.10464746505022049, "losses/sft": 2.533601999282837, "losses/total": 0.10464746505022049, "ref_logps/chosen": -37.745697021484375, "ref_logps/rejected": -46.759918212890625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7553815841674805, "rewards/margins": 2.3821136951446533, "rewards/rejected": -5.137495517730713, "step": 2590 }, { "epoch": 2.45, "grad_norm": 18.9774227142334, "learning_rate": 1.024833857992305e-07, "logps/chosen": -75.89419555664062, "logps/rejected": -92.86912536621094, "loss": 0.146, "losses/dpo": 0.1274433583021164, "losses/sft": 2.581130266189575, "losses/total": 0.1274433583021164, "ref_logps/chosen": -49.07698440551758, "ref_logps/rejected": -42.79907989501953, "rewards/accuracies": 0.9375, "rewards/chosen": -2.681720495223999, "rewards/margins": 2.3252837657928467, "rewards/rejected": -5.007004261016846, "step": 2591 }, { "epoch": 2.45, "grad_norm": 28.013385772705078, "learning_rate": 1.0230849947534102e-07, "logps/chosen": -56.53223419189453, "logps/rejected": -87.07548522949219, "loss": 0.2968, "losses/dpo": 0.7539942264556885, "losses/sft": 2.2622296810150146, "losses/total": 0.7539942264556885, "ref_logps/chosen": -36.85771560668945, "ref_logps/rejected": -43.74913787841797, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9674524068832397, "rewards/margins": 2.3651814460754395, "rewards/rejected": -4.332633972167969, "step": 2592 }, { "epoch": 2.45, "grad_norm": 17.80710220336914, "learning_rate": 1.0213361315145155e-07, "logps/chosen": -73.81526184082031, "logps/rejected": -98.49223327636719, "loss": 0.1632, "losses/dpo": 0.07690103352069855, "losses/sft": 2.688950300216675, "losses/total": 0.07690103352069855, "ref_logps/chosen": -49.22899627685547, "ref_logps/rejected": -50.03904724121094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4586265087127686, "rewards/margins": 2.3866920471191406, "rewards/rejected": -4.845318794250488, "step": 2593 }, { "epoch": 2.45, "grad_norm": 33.03223419189453, "learning_rate": 1.0195872682756209e-07, "logps/chosen": -54.94720458984375, "logps/rejected": -76.68730163574219, "loss": 0.3758, "losses/dpo": 0.06294659525156021, "losses/sft": 1.5065737962722778, "losses/total": 0.06294659525156021, "ref_logps/chosen": -34.556373596191406, "ref_logps/rejected": -38.32640838623047, "rewards/accuracies": 0.875, "rewards/chosen": -2.039083242416382, "rewards/margins": 1.7970057725906372, "rewards/rejected": -3.8360891342163086, "step": 2594 }, { "epoch": 2.45, "grad_norm": 23.080896377563477, "learning_rate": 1.0178384050367261e-07, "logps/chosen": -69.1246109008789, "logps/rejected": -86.28018951416016, "loss": 0.204, "losses/dpo": 0.3689577281475067, "losses/sft": 1.9399421215057373, "losses/total": 0.3689577281475067, "ref_logps/chosen": -50.673126220703125, "ref_logps/rejected": -43.97283172607422, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8451478481292725, "rewards/margins": 2.385587453842163, "rewards/rejected": -4.2307353019714355, "step": 2595 }, { "epoch": 2.45, "grad_norm": 31.5340576171875, "learning_rate": 1.0160895417978313e-07, "logps/chosen": -59.139892578125, "logps/rejected": -81.00654602050781, "loss": 0.344, "losses/dpo": 0.5528707504272461, "losses/sft": 3.7191176414489746, "losses/total": 0.5528707504272461, "ref_logps/chosen": -35.80817413330078, "ref_logps/rejected": -39.8519287109375, "rewards/accuracies": 0.875, "rewards/chosen": -2.333172082901001, "rewards/margins": 1.7822892665863037, "rewards/rejected": -4.115461349487305, "step": 2596 }, { "epoch": 2.45, "grad_norm": 31.34276008605957, "learning_rate": 1.0143406785589366e-07, "logps/chosen": -64.4171142578125, "logps/rejected": -86.96149444580078, "loss": 0.2704, "losses/dpo": 0.1981174349784851, "losses/sft": 1.7797414064407349, "losses/total": 0.1981174349784851, "ref_logps/chosen": -43.85118865966797, "ref_logps/rejected": -44.20526885986328, "rewards/accuracies": 0.875, "rewards/chosen": -2.0565929412841797, "rewards/margins": 2.2190299034118652, "rewards/rejected": -4.275622844696045, "step": 2597 }, { "epoch": 2.45, "grad_norm": 24.233430862426758, "learning_rate": 1.0125918153200419e-07, "logps/chosen": -60.92336654663086, "logps/rejected": -73.82623291015625, "loss": 0.2384, "losses/dpo": 0.2107107937335968, "losses/sft": 1.8149641752243042, "losses/total": 0.2107107937335968, "ref_logps/chosen": -40.020103454589844, "ref_logps/rejected": -32.9814567565918, "rewards/accuracies": 0.875, "rewards/chosen": -2.0903263092041016, "rewards/margins": 1.9941506385803223, "rewards/rejected": -4.084476947784424, "step": 2598 }, { "epoch": 2.45, "grad_norm": 25.469482421875, "learning_rate": 1.0108429520811472e-07, "logps/chosen": -75.62583923339844, "logps/rejected": -85.56845092773438, "loss": 0.1917, "losses/dpo": 0.10696591436862946, "losses/sft": 2.345310926437378, "losses/total": 0.10696591436862946, "ref_logps/chosen": -50.843589782714844, "ref_logps/rejected": -40.18869400024414, "rewards/accuracies": 0.9375, "rewards/chosen": -2.478224754333496, "rewards/margins": 2.059751510620117, "rewards/rejected": -4.537976264953613, "step": 2599 }, { "epoch": 2.46, "grad_norm": 17.674009323120117, "learning_rate": 1.0090940888422526e-07, "logps/chosen": -55.87786102294922, "logps/rejected": -95.49864959716797, "loss": 0.1263, "losses/dpo": 0.13038519024848938, "losses/sft": 2.023768901824951, "losses/total": 0.13038519024848938, "ref_logps/chosen": -33.11652374267578, "ref_logps/rejected": -46.888946533203125, "rewards/accuracies": 1.0, "rewards/chosen": -2.2761332988739014, "rewards/margins": 2.584836959838867, "rewards/rejected": -4.860970497131348, "step": 2600 }, { "epoch": 2.46, "grad_norm": 27.673967361450195, "learning_rate": 1.0073452256033578e-07, "logps/chosen": -73.77913665771484, "logps/rejected": -82.05780029296875, "loss": 0.1891, "losses/dpo": 0.26632291078567505, "losses/sft": 2.3219902515411377, "losses/total": 0.26632291078567505, "ref_logps/chosen": -51.91088104248047, "ref_logps/rejected": -38.380714416503906, "rewards/accuracies": 0.9375, "rewards/chosen": -2.186825752258301, "rewards/margins": 2.1808829307556152, "rewards/rejected": -4.367708206176758, "step": 2601 }, { "epoch": 2.46, "grad_norm": 26.718460083007812, "learning_rate": 1.0055963623644631e-07, "logps/chosen": -50.28725051879883, "logps/rejected": -88.12123107910156, "loss": 0.2317, "losses/dpo": 0.5363413095474243, "losses/sft": 2.115255117416382, "losses/total": 0.5363413095474243, "ref_logps/chosen": -31.593265533447266, "ref_logps/rejected": -43.55470275878906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.869398832321167, "rewards/margins": 2.587254524230957, "rewards/rejected": -4.456653118133545, "step": 2602 }, { "epoch": 2.46, "grad_norm": 25.41333770751953, "learning_rate": 1.0038474991255685e-07, "logps/chosen": -47.423858642578125, "logps/rejected": -92.43830108642578, "loss": 0.1706, "losses/dpo": 0.36899334192276, "losses/sft": 1.9994312524795532, "losses/total": 0.36899334192276, "ref_logps/chosen": -30.479087829589844, "ref_logps/rejected": -48.30529022216797, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6944770812988281, "rewards/margins": 2.7188243865966797, "rewards/rejected": -4.413301467895508, "step": 2603 }, { "epoch": 2.46, "grad_norm": 24.383352279663086, "learning_rate": 1.0020986358866736e-07, "logps/chosen": -55.91067123413086, "logps/rejected": -92.13478088378906, "loss": 0.1873, "losses/dpo": 0.21596641838550568, "losses/sft": 2.039780616760254, "losses/total": 0.21596641838550568, "ref_logps/chosen": -32.247535705566406, "ref_logps/rejected": -46.783451080322266, "rewards/accuracies": 0.9375, "rewards/chosen": -2.366313934326172, "rewards/margins": 2.168818473815918, "rewards/rejected": -4.53513240814209, "step": 2604 }, { "epoch": 2.46, "grad_norm": 19.391090393066406, "learning_rate": 1.0003497726477789e-07, "logps/chosen": -53.40163040161133, "logps/rejected": -83.4697265625, "loss": 0.2159, "losses/dpo": 0.04920261353254318, "losses/sft": 2.0974574089050293, "losses/total": 0.04920261353254318, "ref_logps/chosen": -35.043418884277344, "ref_logps/rejected": -41.9373664855957, "rewards/accuracies": 0.9375, "rewards/chosen": -1.835821270942688, "rewards/margins": 2.3174145221710205, "rewards/rejected": -4.15323543548584, "step": 2605 }, { "epoch": 2.46, "grad_norm": 26.59425163269043, "learning_rate": 9.986009094088841e-08, "logps/chosen": -46.27521514892578, "logps/rejected": -64.21771240234375, "loss": 0.3517, "losses/dpo": 0.4971241056919098, "losses/sft": 1.7221955060958862, "losses/total": 0.4971241056919098, "ref_logps/chosen": -27.062803268432617, "ref_logps/rejected": -29.00020980834961, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9212415218353271, "rewards/margins": 1.600508689880371, "rewards/rejected": -3.5217502117156982, "step": 2606 }, { "epoch": 2.46, "grad_norm": 33.43109130859375, "learning_rate": 9.968520461699895e-08, "logps/chosen": -76.14991760253906, "logps/rejected": -97.0946044921875, "loss": 0.2625, "losses/dpo": 0.17947545647621155, "losses/sft": 1.9435949325561523, "losses/total": 0.17947545647621155, "ref_logps/chosen": -45.67512893676758, "ref_logps/rejected": -44.292972564697266, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0474791526794434, "rewards/margins": 2.232684373855591, "rewards/rejected": -5.280163764953613, "step": 2607 }, { "epoch": 2.46, "grad_norm": 29.819128036499023, "learning_rate": 9.951031829310948e-08, "logps/chosen": -60.859375, "logps/rejected": -78.77252197265625, "loss": 0.2921, "losses/dpo": 0.21985659003257751, "losses/sft": 1.9781346321105957, "losses/total": 0.21985659003257751, "ref_logps/chosen": -41.72977828979492, "ref_logps/rejected": -42.71349334716797, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9129598140716553, "rewards/margins": 1.6929433345794678, "rewards/rejected": -3.605903148651123, "step": 2608 }, { "epoch": 2.46, "grad_norm": 31.35969352722168, "learning_rate": 9.933543196922e-08, "logps/chosen": -68.019287109375, "logps/rejected": -86.71482849121094, "loss": 0.404, "losses/dpo": 0.45526280999183655, "losses/sft": 2.063840389251709, "losses/total": 0.45526280999183655, "ref_logps/chosen": -41.12223815917969, "ref_logps/rejected": -43.22393035888672, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6897048950195312, "rewards/margins": 1.6593852043151855, "rewards/rejected": -4.349089622497559, "step": 2609 }, { "epoch": 2.46, "grad_norm": 32.93818283081055, "learning_rate": 9.916054564533054e-08, "logps/chosen": -65.99555969238281, "logps/rejected": -87.19364929199219, "loss": 0.2947, "losses/dpo": 0.05186900496482849, "losses/sft": 1.5554454326629639, "losses/total": 0.05186900496482849, "ref_logps/chosen": -46.03703689575195, "ref_logps/rejected": -46.50900650024414, "rewards/accuracies": 0.875, "rewards/chosen": -1.9958527088165283, "rewards/margins": 2.0726115703582764, "rewards/rejected": -4.068464279174805, "step": 2610 }, { "epoch": 2.47, "grad_norm": 20.314306259155273, "learning_rate": 9.898565932144105e-08, "logps/chosen": -49.21299362182617, "logps/rejected": -86.56928253173828, "loss": 0.1739, "losses/dpo": 0.24736928939819336, "losses/sft": 2.3513190746307373, "losses/total": 0.24736928939819336, "ref_logps/chosen": -29.592952728271484, "ref_logps/rejected": -38.746429443359375, "rewards/accuracies": 0.875, "rewards/chosen": -1.9620039463043213, "rewards/margins": 2.820281505584717, "rewards/rejected": -4.782285213470459, "step": 2611 }, { "epoch": 2.47, "grad_norm": 26.15523910522461, "learning_rate": 9.881077299755158e-08, "logps/chosen": -61.650848388671875, "logps/rejected": -92.20849609375, "loss": 0.3197, "losses/dpo": 0.10007618367671967, "losses/sft": 1.8990036249160767, "losses/total": 0.10007618367671967, "ref_logps/chosen": -39.098411560058594, "ref_logps/rejected": -44.057594299316406, "rewards/accuracies": 0.875, "rewards/chosen": -2.255244016647339, "rewards/margins": 2.5598459243774414, "rewards/rejected": -4.815090179443359, "step": 2612 }, { "epoch": 2.47, "grad_norm": 31.91933822631836, "learning_rate": 9.863588667366212e-08, "logps/chosen": -59.646156311035156, "logps/rejected": -69.28634643554688, "loss": 0.3912, "losses/dpo": 0.6779725551605225, "losses/sft": 2.219099521636963, "losses/total": 0.6779725551605225, "ref_logps/chosen": -39.468257904052734, "ref_logps/rejected": -34.59103775024414, "rewards/accuracies": 0.8125, "rewards/chosen": -2.017789840698242, "rewards/margins": 1.4517407417297363, "rewards/rejected": -3.4695305824279785, "step": 2613 }, { "epoch": 2.47, "grad_norm": 15.879584312438965, "learning_rate": 9.846100034977264e-08, "logps/chosen": -57.7434196472168, "logps/rejected": -81.94081115722656, "loss": 0.1865, "losses/dpo": 0.3243858218193054, "losses/sft": 2.1442439556121826, "losses/total": 0.3243858218193054, "ref_logps/chosen": -35.57036590576172, "ref_logps/rejected": -36.337947845458984, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2173056602478027, "rewards/margins": 2.3429813385009766, "rewards/rejected": -4.560286521911621, "step": 2614 }, { "epoch": 2.47, "grad_norm": 27.229232788085938, "learning_rate": 9.828611402588317e-08, "logps/chosen": -57.22346496582031, "logps/rejected": -92.30690002441406, "loss": 0.2385, "losses/dpo": 0.048968106508255005, "losses/sft": 1.6596750020980835, "losses/total": 0.048968106508255005, "ref_logps/chosen": -37.21009826660156, "ref_logps/rejected": -47.969364166259766, "rewards/accuracies": 0.9375, "rewards/chosen": -2.001336097717285, "rewards/margins": 2.432417869567871, "rewards/rejected": -4.433753967285156, "step": 2615 }, { "epoch": 2.47, "grad_norm": 19.067716598510742, "learning_rate": 9.811122770199371e-08, "logps/chosen": -54.6844482421875, "logps/rejected": -95.34724426269531, "loss": 0.1926, "losses/dpo": 0.05017391964793205, "losses/sft": 1.4284586906433105, "losses/total": 0.05017391964793205, "ref_logps/chosen": -36.69255065917969, "ref_logps/rejected": -48.41150665283203, "rewards/accuracies": 0.875, "rewards/chosen": -1.799189805984497, "rewards/margins": 2.8943839073181152, "rewards/rejected": -4.693573951721191, "step": 2616 }, { "epoch": 2.47, "grad_norm": 33.83940124511719, "learning_rate": 9.793634137810424e-08, "logps/chosen": -51.00293731689453, "logps/rejected": -67.70549011230469, "loss": 0.3807, "losses/dpo": 0.34572577476501465, "losses/sft": 1.9972656965255737, "losses/total": 0.34572577476501465, "ref_logps/chosen": -30.170448303222656, "ref_logps/rejected": -34.406410217285156, "rewards/accuracies": 0.8125, "rewards/chosen": -2.083249092102051, "rewards/margins": 1.2466588020324707, "rewards/rejected": -3.3299078941345215, "step": 2617 }, { "epoch": 2.47, "grad_norm": 17.854434967041016, "learning_rate": 9.776145505421475e-08, "logps/chosen": -57.60009002685547, "logps/rejected": -89.62419128417969, "loss": 0.1826, "losses/dpo": 0.082145094871521, "losses/sft": 1.8801279067993164, "losses/total": 0.082145094871521, "ref_logps/chosen": -41.04216003417969, "ref_logps/rejected": -45.35902404785156, "rewards/accuracies": 0.875, "rewards/chosen": -1.6557927131652832, "rewards/margins": 2.770723819732666, "rewards/rejected": -4.426516532897949, "step": 2618 }, { "epoch": 2.47, "grad_norm": 39.21747970581055, "learning_rate": 9.758656873032529e-08, "logps/chosen": -70.81407165527344, "logps/rejected": -74.14389038085938, "loss": 0.3921, "losses/dpo": 0.4029538333415985, "losses/sft": 1.9141440391540527, "losses/total": 0.4029538333415985, "ref_logps/chosen": -47.187129974365234, "ref_logps/rejected": -36.16504669189453, "rewards/accuracies": 0.8125, "rewards/chosen": -2.362694501876831, "rewards/margins": 1.4351894855499268, "rewards/rejected": -3.797883987426758, "step": 2619 }, { "epoch": 2.47, "grad_norm": 33.028900146484375, "learning_rate": 9.741168240643581e-08, "logps/chosen": -69.96118927001953, "logps/rejected": -98.61248016357422, "loss": 0.3076, "losses/dpo": 0.31740495562553406, "losses/sft": 1.7501455545425415, "losses/total": 0.31740495562553406, "ref_logps/chosen": -44.765323638916016, "ref_logps/rejected": -47.95121383666992, "rewards/accuracies": 0.875, "rewards/chosen": -2.5195865631103516, "rewards/margins": 2.546539783477783, "rewards/rejected": -5.066126346588135, "step": 2620 }, { "epoch": 2.47, "grad_norm": 43.19218444824219, "learning_rate": 9.723679608254634e-08, "logps/chosen": -70.31956481933594, "logps/rejected": -97.96747589111328, "loss": 0.3709, "losses/dpo": 0.26920267939567566, "losses/sft": 2.482292890548706, "losses/total": 0.26920267939567566, "ref_logps/chosen": -45.48460388183594, "ref_logps/rejected": -50.64182662963867, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4834964275360107, "rewards/margins": 2.249068260192871, "rewards/rejected": -4.732564926147461, "step": 2621 }, { "epoch": 2.48, "grad_norm": 17.518869400024414, "learning_rate": 9.706190975865686e-08, "logps/chosen": -54.61959457397461, "logps/rejected": -91.7092056274414, "loss": 0.2001, "losses/dpo": 0.10851458460092545, "losses/sft": 2.083570718765259, "losses/total": 0.10851458460092545, "ref_logps/chosen": -37.579498291015625, "ref_logps/rejected": -50.20752716064453, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7040092945098877, "rewards/margins": 2.4461588859558105, "rewards/rejected": -4.150168418884277, "step": 2622 }, { "epoch": 2.48, "grad_norm": 21.378183364868164, "learning_rate": 9.68870234347674e-08, "logps/chosen": -52.947940826416016, "logps/rejected": -71.7638931274414, "loss": 0.2138, "losses/dpo": 0.1524897813796997, "losses/sft": 2.109269857406616, "losses/total": 0.1524897813796997, "ref_logps/chosen": -32.70970916748047, "ref_logps/rejected": -32.23436737060547, "rewards/accuracies": 1.0, "rewards/chosen": -2.023822784423828, "rewards/margins": 1.929129719734192, "rewards/rejected": -3.9529523849487305, "step": 2623 }, { "epoch": 2.48, "grad_norm": 27.453723907470703, "learning_rate": 9.671213711087793e-08, "logps/chosen": -59.317291259765625, "logps/rejected": -81.62176513671875, "loss": 0.253, "losses/dpo": 0.1010105311870575, "losses/sft": 2.640120506286621, "losses/total": 0.1010105311870575, "ref_logps/chosen": -42.9470329284668, "ref_logps/rejected": -43.72551727294922, "rewards/accuracies": 0.9375, "rewards/chosen": -1.637026071548462, "rewards/margins": 2.1525988578796387, "rewards/rejected": -3.7896246910095215, "step": 2624 }, { "epoch": 2.48, "grad_norm": 32.864959716796875, "learning_rate": 9.653725078698844e-08, "logps/chosen": -48.011375427246094, "logps/rejected": -68.41641998291016, "loss": 0.398, "losses/dpo": 0.38649439811706543, "losses/sft": 1.7968426942825317, "losses/total": 0.38649439811706543, "ref_logps/chosen": -29.747228622436523, "ref_logps/rejected": -32.32157516479492, "rewards/accuracies": 0.75, "rewards/chosen": -1.8264145851135254, "rewards/margins": 1.7830703258514404, "rewards/rejected": -3.6094846725463867, "step": 2625 }, { "epoch": 2.48, "grad_norm": 35.30326843261719, "learning_rate": 9.636236446309898e-08, "logps/chosen": -64.98481750488281, "logps/rejected": -89.75151824951172, "loss": 0.3121, "losses/dpo": 0.2173253893852234, "losses/sft": 2.1693167686462402, "losses/total": 0.2173253893852234, "ref_logps/chosen": -42.099998474121094, "ref_logps/rejected": -49.113101959228516, "rewards/accuracies": 0.875, "rewards/chosen": -2.2884809970855713, "rewards/margins": 1.7753610610961914, "rewards/rejected": -4.063841819763184, "step": 2626 }, { "epoch": 2.48, "grad_norm": 13.172626495361328, "learning_rate": 9.618747813920951e-08, "logps/chosen": -64.95667266845703, "logps/rejected": -95.32521057128906, "loss": 0.1226, "losses/dpo": 0.14937573671340942, "losses/sft": 1.979300618171692, "losses/total": 0.14937573671340942, "ref_logps/chosen": -45.157470703125, "ref_logps/rejected": -49.7628173828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.9799200296401978, "rewards/margins": 2.576319694519043, "rewards/rejected": -4.556240081787109, "step": 2627 }, { "epoch": 2.48, "grad_norm": 46.55250930786133, "learning_rate": 9.601259181532003e-08, "logps/chosen": -68.29072570800781, "logps/rejected": -87.84684753417969, "loss": 0.4144, "losses/dpo": 0.7455044984817505, "losses/sft": 2.2339420318603516, "losses/total": 0.7455044984817505, "ref_logps/chosen": -42.974945068359375, "ref_logps/rejected": -46.17321014404297, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5315780639648438, "rewards/margins": 1.6357853412628174, "rewards/rejected": -4.16736364364624, "step": 2628 }, { "epoch": 2.48, "grad_norm": 35.92108917236328, "learning_rate": 9.583770549143057e-08, "logps/chosen": -62.43308639526367, "logps/rejected": -70.52333068847656, "loss": 0.3736, "losses/dpo": 0.18482337892055511, "losses/sft": 1.562248945236206, "losses/total": 0.18482337892055511, "ref_logps/chosen": -38.28158187866211, "ref_logps/rejected": -32.70833969116211, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4151501655578613, "rewards/margins": 1.3663485050201416, "rewards/rejected": -3.781498670578003, "step": 2629 }, { "epoch": 2.48, "grad_norm": 27.783422470092773, "learning_rate": 9.56628191675411e-08, "logps/chosen": -56.60566711425781, "logps/rejected": -82.60130310058594, "loss": 0.3281, "losses/dpo": 0.22605109214782715, "losses/sft": 1.8187787532806396, "losses/total": 0.22605109214782715, "ref_logps/chosen": -37.79899215698242, "ref_logps/rejected": -44.11222839355469, "rewards/accuracies": 0.875, "rewards/chosen": -1.8806679248809814, "rewards/margins": 1.9682402610778809, "rewards/rejected": -3.8489081859588623, "step": 2630 }, { "epoch": 2.48, "grad_norm": 28.24785614013672, "learning_rate": 9.548793284365162e-08, "logps/chosen": -44.786865234375, "logps/rejected": -81.65611267089844, "loss": 0.2532, "losses/dpo": 0.40884193778038025, "losses/sft": 2.1563503742218018, "losses/total": 0.40884193778038025, "ref_logps/chosen": -27.189533233642578, "ref_logps/rejected": -44.9838752746582, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7597332000732422, "rewards/margins": 1.9074907302856445, "rewards/rejected": -3.6672239303588867, "step": 2631 }, { "epoch": 2.49, "grad_norm": 6.792993068695068, "learning_rate": 9.531304651976215e-08, "logps/chosen": -51.781490325927734, "logps/rejected": -101.36820983886719, "loss": 0.0638, "losses/dpo": 0.09195427596569061, "losses/sft": 1.6872437000274658, "losses/total": 0.09195427596569061, "ref_logps/chosen": -33.46580123901367, "ref_logps/rejected": -50.52802658081055, "rewards/accuracies": 1.0, "rewards/chosen": -1.831568717956543, "rewards/margins": 3.2524495124816895, "rewards/rejected": -5.084018230438232, "step": 2632 }, { "epoch": 2.49, "grad_norm": 26.637428283691406, "learning_rate": 9.513816019587268e-08, "logps/chosen": -58.11437225341797, "logps/rejected": -88.03756713867188, "loss": 0.2448, "losses/dpo": 0.10347309708595276, "losses/sft": 2.7814109325408936, "losses/total": 0.10347309708595276, "ref_logps/chosen": -34.66980743408203, "ref_logps/rejected": -45.40983581542969, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3444571495056152, "rewards/margins": 1.918316125869751, "rewards/rejected": -4.262773036956787, "step": 2633 }, { "epoch": 2.49, "grad_norm": 24.358135223388672, "learning_rate": 9.49632738719832e-08, "logps/chosen": -61.41733169555664, "logps/rejected": -99.54876708984375, "loss": 0.2404, "losses/dpo": 0.16507381200790405, "losses/sft": 2.5139079093933105, "losses/total": 0.16507381200790405, "ref_logps/chosen": -38.32343292236328, "ref_logps/rejected": -55.90496063232422, "rewards/accuracies": 0.875, "rewards/chosen": -2.309389591217041, "rewards/margins": 2.0549912452697754, "rewards/rejected": -4.364380836486816, "step": 2634 }, { "epoch": 2.49, "grad_norm": 21.77463150024414, "learning_rate": 9.478838754809374e-08, "logps/chosen": -45.654022216796875, "logps/rejected": -88.64103698730469, "loss": 0.2014, "losses/dpo": 0.24121077358722687, "losses/sft": 2.3094232082366943, "losses/total": 0.24121077358722687, "ref_logps/chosen": -29.739646911621094, "ref_logps/rejected": -46.01719284057617, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5914379358291626, "rewards/margins": 2.6709470748901367, "rewards/rejected": -4.262385368347168, "step": 2635 }, { "epoch": 2.49, "grad_norm": 30.00931739807129, "learning_rate": 9.461350122420427e-08, "logps/chosen": -52.32457733154297, "logps/rejected": -72.18357849121094, "loss": 0.3245, "losses/dpo": 0.15869125723838806, "losses/sft": 1.7572306394577026, "losses/total": 0.15869125723838806, "ref_logps/chosen": -33.57781982421875, "ref_logps/rejected": -38.852386474609375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8746757507324219, "rewards/margins": 1.458443284034729, "rewards/rejected": -3.3331191539764404, "step": 2636 }, { "epoch": 2.49, "grad_norm": 18.77487564086914, "learning_rate": 9.443861490031479e-08, "logps/chosen": -39.521217346191406, "logps/rejected": -74.77930450439453, "loss": 0.2339, "losses/dpo": 0.12782526016235352, "losses/sft": 1.8056789636611938, "losses/total": 0.12782526016235352, "ref_logps/chosen": -28.591285705566406, "ref_logps/rejected": -38.67945098876953, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0929930210113525, "rewards/margins": 2.5169925689697266, "rewards/rejected": -3.609985828399658, "step": 2637 }, { "epoch": 2.49, "grad_norm": 25.57986831665039, "learning_rate": 9.426372857642532e-08, "logps/chosen": -57.35826110839844, "logps/rejected": -80.22688293457031, "loss": 0.3642, "losses/dpo": 0.3260922133922577, "losses/sft": 1.7510254383087158, "losses/total": 0.3260922133922577, "ref_logps/chosen": -34.019405364990234, "ref_logps/rejected": -41.680458068847656, "rewards/accuracies": 0.8125, "rewards/chosen": -2.333885669708252, "rewards/margins": 1.520756483078003, "rewards/rejected": -3.854642391204834, "step": 2638 }, { "epoch": 2.49, "grad_norm": 21.60886573791504, "learning_rate": 9.408884225253584e-08, "logps/chosen": -64.35485076904297, "logps/rejected": -79.99525451660156, "loss": 0.2163, "losses/dpo": 0.06006474047899246, "losses/sft": 2.2289817333221436, "losses/total": 0.06006474047899246, "ref_logps/chosen": -47.54611587524414, "ref_logps/rejected": -40.913818359375, "rewards/accuracies": 0.875, "rewards/chosen": -1.6808732748031616, "rewards/margins": 2.2272701263427734, "rewards/rejected": -3.9081432819366455, "step": 2639 }, { "epoch": 2.49, "grad_norm": 22.581619262695312, "learning_rate": 9.391395592864637e-08, "logps/chosen": -51.91297912597656, "logps/rejected": -88.46798706054688, "loss": 0.3111, "losses/dpo": 0.7317584156990051, "losses/sft": 2.3876187801361084, "losses/total": 0.7317584156990051, "ref_logps/chosen": -32.81362533569336, "ref_logps/rejected": -47.68986511230469, "rewards/accuracies": 0.9375, "rewards/chosen": -1.909935474395752, "rewards/margins": 2.167876720428467, "rewards/rejected": -4.077812194824219, "step": 2640 }, { "epoch": 2.49, "grad_norm": 17.824546813964844, "learning_rate": 9.37390696047569e-08, "logps/chosen": -66.83539581298828, "logps/rejected": -86.10513305664062, "loss": 0.1963, "losses/dpo": 0.05992724001407623, "losses/sft": 2.7904083728790283, "losses/total": 0.05992724001407623, "ref_logps/chosen": -45.83329772949219, "ref_logps/rejected": -44.290245056152344, "rewards/accuracies": 1.0, "rewards/chosen": -2.1002094745635986, "rewards/margins": 2.0812788009643555, "rewards/rejected": -4.181488037109375, "step": 2641 }, { "epoch": 2.49, "grad_norm": 33.15191650390625, "learning_rate": 9.356418328086744e-08, "logps/chosen": -50.72719192504883, "logps/rejected": -81.42460632324219, "loss": 0.3367, "losses/dpo": 0.31758102774620056, "losses/sft": 1.3459575176239014, "losses/total": 0.31758102774620056, "ref_logps/chosen": -32.37253952026367, "ref_logps/rejected": -47.08313751220703, "rewards/accuracies": 0.8125, "rewards/chosen": -1.835465431213379, "rewards/margins": 1.5986818075180054, "rewards/rejected": -3.4341471195220947, "step": 2642 }, { "epoch": 2.5, "grad_norm": 18.970203399658203, "learning_rate": 9.338929695697796e-08, "logps/chosen": -62.68254852294922, "logps/rejected": -97.68290710449219, "loss": 0.1524, "losses/dpo": 0.2148987352848053, "losses/sft": 2.4488165378570557, "losses/total": 0.2148987352848053, "ref_logps/chosen": -41.273277282714844, "ref_logps/rejected": -47.41449737548828, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1409270763397217, "rewards/margins": 2.8859143257141113, "rewards/rejected": -5.026841163635254, "step": 2643 }, { "epoch": 2.5, "grad_norm": 16.483991622924805, "learning_rate": 9.321441063308849e-08, "logps/chosen": -56.09375762939453, "logps/rejected": -82.86587524414062, "loss": 0.1589, "losses/dpo": 0.14035722613334656, "losses/sft": 1.6219528913497925, "losses/total": 0.14035722613334656, "ref_logps/chosen": -39.73691940307617, "ref_logps/rejected": -41.95567321777344, "rewards/accuracies": 1.0, "rewards/chosen": -1.6356831789016724, "rewards/margins": 2.455336809158325, "rewards/rejected": -4.091020107269287, "step": 2644 }, { "epoch": 2.5, "grad_norm": 23.772016525268555, "learning_rate": 9.303952430919903e-08, "logps/chosen": -45.77806091308594, "logps/rejected": -71.56695556640625, "loss": 0.291, "losses/dpo": 0.3063003718852997, "losses/sft": 1.6803115606307983, "losses/total": 0.3063003718852997, "ref_logps/chosen": -29.976591110229492, "ref_logps/rejected": -39.285186767578125, "rewards/accuracies": 0.875, "rewards/chosen": -1.5801470279693604, "rewards/margins": 1.6480300426483154, "rewards/rejected": -3.228177070617676, "step": 2645 }, { "epoch": 2.5, "grad_norm": 21.726228713989258, "learning_rate": 9.286463798530954e-08, "logps/chosen": -53.46764373779297, "logps/rejected": -83.09614562988281, "loss": 0.2369, "losses/dpo": 0.22365626692771912, "losses/sft": 1.9487669467926025, "losses/total": 0.22365626692771912, "ref_logps/chosen": -33.389488220214844, "ref_logps/rejected": -41.57721710205078, "rewards/accuracies": 1.0, "rewards/chosen": -2.0078158378601074, "rewards/margins": 2.1440773010253906, "rewards/rejected": -4.15189266204834, "step": 2646 }, { "epoch": 2.5, "grad_norm": 19.15696144104004, "learning_rate": 9.268975166142006e-08, "logps/chosen": -55.86904525756836, "logps/rejected": -96.14935302734375, "loss": 0.199, "losses/dpo": 0.0832444280385971, "losses/sft": 2.557432174682617, "losses/total": 0.0832444280385971, "ref_logps/chosen": -35.647789001464844, "ref_logps/rejected": -49.84021759033203, "rewards/accuracies": 0.875, "rewards/chosen": -2.022125720977783, "rewards/margins": 2.60878849029541, "rewards/rejected": -4.630914211273193, "step": 2647 }, { "epoch": 2.5, "grad_norm": 19.90987777709961, "learning_rate": 9.25148653375306e-08, "logps/chosen": -58.71638107299805, "logps/rejected": -79.5850601196289, "loss": 0.1919, "losses/dpo": 0.09517229348421097, "losses/sft": 2.3693816661834717, "losses/total": 0.09517229348421097, "ref_logps/chosen": -41.172203063964844, "ref_logps/rejected": -42.81840515136719, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7544176578521729, "rewards/margins": 1.9222476482391357, "rewards/rejected": -3.6766653060913086, "step": 2648 }, { "epoch": 2.5, "grad_norm": 19.189653396606445, "learning_rate": 9.233997901364113e-08, "logps/chosen": -48.83949279785156, "logps/rejected": -82.58723449707031, "loss": 0.2218, "losses/dpo": 0.28850340843200684, "losses/sft": 1.326021432876587, "losses/total": 0.28850340843200684, "ref_logps/chosen": -32.46562957763672, "ref_logps/rejected": -41.25714874267578, "rewards/accuracies": 0.875, "rewards/chosen": -1.6373860836029053, "rewards/margins": 2.4956226348876953, "rewards/rejected": -4.1330084800720215, "step": 2649 }, { "epoch": 2.5, "grad_norm": 14.481467247009277, "learning_rate": 9.216509268975166e-08, "logps/chosen": -60.59910583496094, "logps/rejected": -90.18119812011719, "loss": 0.153, "losses/dpo": 0.04702283442020416, "losses/sft": 2.3079562187194824, "losses/total": 0.04702283442020416, "ref_logps/chosen": -38.13494873046875, "ref_logps/rejected": -44.97789001464844, "rewards/accuracies": 0.9375, "rewards/chosen": -2.246415615081787, "rewards/margins": 2.2739148139953613, "rewards/rejected": -4.520330429077148, "step": 2650 }, { "epoch": 2.5, "grad_norm": 27.231679916381836, "learning_rate": 9.19902063658622e-08, "logps/chosen": -53.07997131347656, "logps/rejected": -84.53821563720703, "loss": 0.2813, "losses/dpo": 0.15879404544830322, "losses/sft": 1.921830415725708, "losses/total": 0.15879404544830322, "ref_logps/chosen": -32.77253723144531, "ref_logps/rejected": -41.605411529541016, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0307438373565674, "rewards/margins": 2.26253604888916, "rewards/rejected": -4.293279647827148, "step": 2651 }, { "epoch": 2.5, "grad_norm": 32.732086181640625, "learning_rate": 9.181532004197272e-08, "logps/chosen": -58.05944061279297, "logps/rejected": -66.1270751953125, "loss": 0.3329, "losses/dpo": 0.5357910990715027, "losses/sft": 1.9927314519882202, "losses/total": 0.5357910990715027, "ref_logps/chosen": -39.23957061767578, "ref_logps/rejected": -32.11646270751953, "rewards/accuracies": 0.875, "rewards/chosen": -1.8819873332977295, "rewards/margins": 1.5190739631652832, "rewards/rejected": -3.4010612964630127, "step": 2652 }, { "epoch": 2.51, "grad_norm": 18.582731246948242, "learning_rate": 9.164043371808323e-08, "logps/chosen": -62.617984771728516, "logps/rejected": -90.97095489501953, "loss": 0.2075, "losses/dpo": 0.2849293351173401, "losses/sft": 2.2467405796051025, "losses/total": 0.2849293351173401, "ref_logps/chosen": -41.20893096923828, "ref_logps/rejected": -47.07484436035156, "rewards/accuracies": 1.0, "rewards/chosen": -2.1409051418304443, "rewards/margins": 2.248706340789795, "rewards/rejected": -4.38961124420166, "step": 2653 }, { "epoch": 2.51, "grad_norm": 13.544334411621094, "learning_rate": 9.146554739419376e-08, "logps/chosen": -63.693058013916016, "logps/rejected": -98.46592712402344, "loss": 0.1346, "losses/dpo": 0.22762365639209747, "losses/sft": 2.903892755508423, "losses/total": 0.22762365639209747, "ref_logps/chosen": -39.51471710205078, "ref_logps/rejected": -48.387489318847656, "rewards/accuracies": 1.0, "rewards/chosen": -2.4178342819213867, "rewards/margins": 2.590010643005371, "rewards/rejected": -5.007844924926758, "step": 2654 }, { "epoch": 2.51, "grad_norm": 31.12428092956543, "learning_rate": 9.12906610703043e-08, "logps/chosen": -51.019309997558594, "logps/rejected": -82.58992767333984, "loss": 0.4647, "losses/dpo": 0.11014776676893234, "losses/sft": 2.443131923675537, "losses/total": 0.11014776676893234, "ref_logps/chosen": -29.98655128479004, "ref_logps/rejected": -43.18667221069336, "rewards/accuracies": 0.8125, "rewards/chosen": -2.103276014328003, "rewards/margins": 1.8370500802993774, "rewards/rejected": -3.940326452255249, "step": 2655 }, { "epoch": 2.51, "grad_norm": 21.865400314331055, "learning_rate": 9.111577474641482e-08, "logps/chosen": -58.064613342285156, "logps/rejected": -79.7762451171875, "loss": 0.2234, "losses/dpo": 0.20587067306041718, "losses/sft": 1.972915530204773, "losses/total": 0.20587067306041718, "ref_logps/chosen": -37.52762985229492, "ref_logps/rejected": -41.9873046875, "rewards/accuracies": 1.0, "rewards/chosen": -2.0536983013153076, "rewards/margins": 1.7251962423324585, "rewards/rejected": -3.7788944244384766, "step": 2656 }, { "epoch": 2.51, "grad_norm": 20.920183181762695, "learning_rate": 9.094088842252535e-08, "logps/chosen": -70.60042572021484, "logps/rejected": -90.38031005859375, "loss": 0.2386, "losses/dpo": 0.2561028003692627, "losses/sft": 2.2593626976013184, "losses/total": 0.2561028003692627, "ref_logps/chosen": -45.15994644165039, "ref_logps/rejected": -46.44893264770508, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5440478324890137, "rewards/margins": 1.849090337753296, "rewards/rejected": -4.3931379318237305, "step": 2657 }, { "epoch": 2.51, "grad_norm": 15.381429672241211, "learning_rate": 9.076600209863589e-08, "logps/chosen": -48.48783493041992, "logps/rejected": -85.4599380493164, "loss": 0.1252, "losses/dpo": 0.16669030487537384, "losses/sft": 1.9194079637527466, "losses/total": 0.16669030487537384, "ref_logps/chosen": -32.741188049316406, "ref_logps/rejected": -43.610870361328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.5746644735336304, "rewards/margins": 2.6102418899536133, "rewards/rejected": -4.184906482696533, "step": 2658 }, { "epoch": 2.51, "grad_norm": 21.14702033996582, "learning_rate": 9.059111577474642e-08, "logps/chosen": -48.063499450683594, "logps/rejected": -81.52400207519531, "loss": 0.3334, "losses/dpo": 0.15759655833244324, "losses/sft": 1.1650971174240112, "losses/total": 0.15759655833244324, "ref_logps/chosen": -31.4576473236084, "ref_logps/rejected": -42.72325134277344, "rewards/accuracies": 0.875, "rewards/chosen": -1.6605851650238037, "rewards/margins": 2.2194900512695312, "rewards/rejected": -3.880075216293335, "step": 2659 }, { "epoch": 2.51, "grad_norm": 20.476564407348633, "learning_rate": 9.041622945085694e-08, "logps/chosen": -57.79673767089844, "logps/rejected": -87.91827392578125, "loss": 0.183, "losses/dpo": 0.048319194465875626, "losses/sft": 2.017071485519409, "losses/total": 0.048319194465875626, "ref_logps/chosen": -38.672584533691406, "ref_logps/rejected": -45.02015686035156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9124152660369873, "rewards/margins": 2.3773961067199707, "rewards/rejected": -4.289811134338379, "step": 2660 }, { "epoch": 2.51, "grad_norm": 19.473453521728516, "learning_rate": 9.024134312696747e-08, "logps/chosen": -45.143959045410156, "logps/rejected": -69.37344360351562, "loss": 0.2067, "losses/dpo": 0.21326187252998352, "losses/sft": 1.9567234516143799, "losses/total": 0.21326187252998352, "ref_logps/chosen": -30.596092224121094, "ref_logps/rejected": -35.80542755126953, "rewards/accuracies": 1.0, "rewards/chosen": -1.4547865390777588, "rewards/margins": 1.902014970779419, "rewards/rejected": -3.3568012714385986, "step": 2661 }, { "epoch": 2.51, "grad_norm": 38.11095428466797, "learning_rate": 9.006645680307799e-08, "logps/chosen": -59.61377716064453, "logps/rejected": -80.72429656982422, "loss": 0.4039, "losses/dpo": 0.7609592080116272, "losses/sft": 1.9940719604492188, "losses/total": 0.7609592080116272, "ref_logps/chosen": -35.80747985839844, "ref_logps/rejected": -40.880069732666016, "rewards/accuracies": 0.9375, "rewards/chosen": -2.380629301071167, "rewards/margins": 1.603793740272522, "rewards/rejected": -3.9844231605529785, "step": 2662 }, { "epoch": 2.51, "grad_norm": 14.014473915100098, "learning_rate": 8.989157047918852e-08, "logps/chosen": -53.547298431396484, "logps/rejected": -102.56118774414062, "loss": 0.1184, "losses/dpo": 0.1557226926088333, "losses/sft": 2.2434537410736084, "losses/total": 0.1557226926088333, "ref_logps/chosen": -39.66558074951172, "ref_logps/rejected": -57.48854446411133, "rewards/accuracies": 1.0, "rewards/chosen": -1.3881717920303345, "rewards/margins": 3.1190924644470215, "rewards/rejected": -4.507264137268066, "step": 2663 }, { "epoch": 2.52, "grad_norm": 20.364011764526367, "learning_rate": 8.971668415529906e-08, "logps/chosen": -73.42503356933594, "logps/rejected": -92.31561279296875, "loss": 0.1569, "losses/dpo": 0.20702314376831055, "losses/sft": 2.134514093399048, "losses/total": 0.20702314376831055, "ref_logps/chosen": -51.78997802734375, "ref_logps/rejected": -46.695194244384766, "rewards/accuracies": 1.0, "rewards/chosen": -2.1635055541992188, "rewards/margins": 2.398536443710327, "rewards/rejected": -4.562042236328125, "step": 2664 }, { "epoch": 2.52, "grad_norm": 31.631244659423828, "learning_rate": 8.954179783140958e-08, "logps/chosen": -67.80427551269531, "logps/rejected": -77.00192260742188, "loss": 0.3346, "losses/dpo": 0.2949151396751404, "losses/sft": 2.028554677963257, "losses/total": 0.2949151396751404, "ref_logps/chosen": -48.55534362792969, "ref_logps/rejected": -43.106842041015625, "rewards/accuracies": 0.875, "rewards/chosen": -1.924892544746399, "rewards/margins": 1.464616298675537, "rewards/rejected": -3.3895087242126465, "step": 2665 }, { "epoch": 2.52, "grad_norm": 16.61313247680664, "learning_rate": 8.936691150752011e-08, "logps/chosen": -46.049842834472656, "logps/rejected": -78.18761444091797, "loss": 0.1422, "losses/dpo": 0.06558723002672195, "losses/sft": 1.3075509071350098, "losses/total": 0.06558723002672195, "ref_logps/chosen": -29.03466033935547, "ref_logps/rejected": -37.812522888183594, "rewards/accuracies": 1.0, "rewards/chosen": -1.7015185356140137, "rewards/margins": 2.3359904289245605, "rewards/rejected": -4.037508964538574, "step": 2666 }, { "epoch": 2.52, "grad_norm": 18.713289260864258, "learning_rate": 8.919202518363065e-08, "logps/chosen": -57.84708786010742, "logps/rejected": -93.6070556640625, "loss": 0.178, "losses/dpo": 0.10467052459716797, "losses/sft": 1.7599592208862305, "losses/total": 0.10467052459716797, "ref_logps/chosen": -41.1086311340332, "ref_logps/rejected": -49.096893310546875, "rewards/accuracies": 0.875, "rewards/chosen": -1.673845887184143, "rewards/margins": 2.777170181274414, "rewards/rejected": -4.451015949249268, "step": 2667 }, { "epoch": 2.52, "grad_norm": 16.586545944213867, "learning_rate": 8.901713885974116e-08, "logps/chosen": -71.38506317138672, "logps/rejected": -106.7919692993164, "loss": 0.1381, "losses/dpo": 0.042315613478422165, "losses/sft": 1.8805639743804932, "losses/total": 0.042315613478422165, "ref_logps/chosen": -47.56238555908203, "ref_logps/rejected": -55.274505615234375, "rewards/accuracies": 1.0, "rewards/chosen": -2.382267475128174, "rewards/margins": 2.769479513168335, "rewards/rejected": -5.15174674987793, "step": 2668 }, { "epoch": 2.52, "grad_norm": 21.321243286132812, "learning_rate": 8.884225253585169e-08, "logps/chosen": -56.752079010009766, "logps/rejected": -91.5252685546875, "loss": 0.1944, "losses/dpo": 0.13461831212043762, "losses/sft": 2.2940287590026855, "losses/total": 0.13461831212043762, "ref_logps/chosen": -37.261627197265625, "ref_logps/rejected": -46.732460021972656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.949044942855835, "rewards/margins": 2.5302371978759766, "rewards/rejected": -4.479282379150391, "step": 2669 }, { "epoch": 2.52, "grad_norm": 26.792482376098633, "learning_rate": 8.866736621196221e-08, "logps/chosen": -56.12199020385742, "logps/rejected": -73.48761749267578, "loss": 0.3073, "losses/dpo": 0.16398343443870544, "losses/sft": 2.3927910327911377, "losses/total": 0.16398343443870544, "ref_logps/chosen": -34.62025451660156, "ref_logps/rejected": -34.842750549316406, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1501736640930176, "rewards/margins": 1.714313268661499, "rewards/rejected": -3.8644871711730957, "step": 2670 }, { "epoch": 2.52, "grad_norm": 22.86536979675293, "learning_rate": 8.849247988807275e-08, "logps/chosen": -58.844024658203125, "logps/rejected": -111.99443054199219, "loss": 0.2107, "losses/dpo": 0.1297515630722046, "losses/sft": 1.7009419202804565, "losses/total": 0.1297515630722046, "ref_logps/chosen": -41.70057678222656, "ref_logps/rejected": -70.78482055664062, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7143445014953613, "rewards/margins": 2.406615972518921, "rewards/rejected": -4.120960235595703, "step": 2671 }, { "epoch": 2.52, "grad_norm": 17.73824119567871, "learning_rate": 8.831759356418328e-08, "logps/chosen": -49.567779541015625, "logps/rejected": -93.88105773925781, "loss": 0.1715, "losses/dpo": 0.4700128436088562, "losses/sft": 2.277287483215332, "losses/total": 0.4700128436088562, "ref_logps/chosen": -33.02092742919922, "ref_logps/rejected": -49.19862747192383, "rewards/accuracies": 0.875, "rewards/chosen": -1.654685139656067, "rewards/margins": 2.8135576248168945, "rewards/rejected": -4.468242645263672, "step": 2672 }, { "epoch": 2.52, "grad_norm": 31.842103958129883, "learning_rate": 8.81427072402938e-08, "logps/chosen": -56.82010269165039, "logps/rejected": -84.50248718261719, "loss": 0.316, "losses/dpo": 0.45291897654533386, "losses/sft": 1.5067979097366333, "losses/total": 0.45291897654533386, "ref_logps/chosen": -33.516632080078125, "ref_logps/rejected": -40.809226989746094, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3303468227386475, "rewards/margins": 2.0389795303344727, "rewards/rejected": -4.369326591491699, "step": 2673 }, { "epoch": 2.53, "grad_norm": 25.733028411865234, "learning_rate": 8.796782091640434e-08, "logps/chosen": -45.055824279785156, "logps/rejected": -68.95246124267578, "loss": 0.3068, "losses/dpo": 0.18253163993358612, "losses/sft": 1.8108981847763062, "losses/total": 0.18253163993358612, "ref_logps/chosen": -27.325088500976562, "ref_logps/rejected": -33.991943359375, "rewards/accuracies": 0.875, "rewards/chosen": -1.773073434829712, "rewards/margins": 1.7229784727096558, "rewards/rejected": -3.496051788330078, "step": 2674 }, { "epoch": 2.53, "grad_norm": 24.55254554748535, "learning_rate": 8.779293459251486e-08, "logps/chosen": -62.67566680908203, "logps/rejected": -98.48059844970703, "loss": 0.1498, "losses/dpo": 0.051351405680179596, "losses/sft": 1.9819999933242798, "losses/total": 0.051351405680179596, "ref_logps/chosen": -38.56451416015625, "ref_logps/rejected": -48.215606689453125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4111151695251465, "rewards/margins": 2.6153836250305176, "rewards/rejected": -5.026498794555664, "step": 2675 }, { "epoch": 2.53, "grad_norm": 15.96773910522461, "learning_rate": 8.761804826862538e-08, "logps/chosen": -59.49025344848633, "logps/rejected": -101.77914428710938, "loss": 0.1423, "losses/dpo": 0.08034539967775345, "losses/sft": 2.561227560043335, "losses/total": 0.08034539967775345, "ref_logps/chosen": -37.52191162109375, "ref_logps/rejected": -56.185550689697266, "rewards/accuracies": 1.0, "rewards/chosen": -2.196834087371826, "rewards/margins": 2.36252498626709, "rewards/rejected": -4.559359550476074, "step": 2676 }, { "epoch": 2.53, "grad_norm": 21.04685401916504, "learning_rate": 8.744316194473592e-08, "logps/chosen": -49.12049102783203, "logps/rejected": -87.42820739746094, "loss": 0.1864, "losses/dpo": 0.14930391311645508, "losses/sft": 1.7568103075027466, "losses/total": 0.14930391311645508, "ref_logps/chosen": -29.447824478149414, "ref_logps/rejected": -41.89769744873047, "rewards/accuracies": 0.9375, "rewards/chosen": -1.96726655960083, "rewards/margins": 2.585784435272217, "rewards/rejected": -4.553050994873047, "step": 2677 }, { "epoch": 2.53, "grad_norm": 30.19243621826172, "learning_rate": 8.726827562084645e-08, "logps/chosen": -63.23504638671875, "logps/rejected": -87.2403335571289, "loss": 0.2948, "losses/dpo": 0.17851522564888, "losses/sft": 2.166452169418335, "losses/total": 0.17851522564888, "ref_logps/chosen": -41.614871978759766, "ref_logps/rejected": -45.09661865234375, "rewards/accuracies": 0.9375, "rewards/chosen": -2.162017822265625, "rewards/margins": 2.052354335784912, "rewards/rejected": -4.214372158050537, "step": 2678 }, { "epoch": 2.53, "grad_norm": 17.124975204467773, "learning_rate": 8.709338929695697e-08, "logps/chosen": -52.939613342285156, "logps/rejected": -80.61222839355469, "loss": 0.1665, "losses/dpo": 0.2314153015613556, "losses/sft": 1.5011587142944336, "losses/total": 0.2314153015613556, "ref_logps/chosen": -31.999927520751953, "ref_logps/rejected": -33.60721969604492, "rewards/accuracies": 1.0, "rewards/chosen": -2.0939688682556152, "rewards/margins": 2.606532573699951, "rewards/rejected": -4.700501441955566, "step": 2679 }, { "epoch": 2.53, "grad_norm": 26.842926025390625, "learning_rate": 8.691850297306751e-08, "logps/chosen": -53.796592712402344, "logps/rejected": -80.64501190185547, "loss": 0.2724, "losses/dpo": 0.22778701782226562, "losses/sft": 1.9416782855987549, "losses/total": 0.22778701782226562, "ref_logps/chosen": -33.74138641357422, "ref_logps/rejected": -41.23072052001953, "rewards/accuracies": 0.9375, "rewards/chosen": -2.005521059036255, "rewards/margins": 1.9359074831008911, "rewards/rejected": -3.9414284229278564, "step": 2680 }, { "epoch": 2.53, "grad_norm": 14.247771263122559, "learning_rate": 8.674361664917804e-08, "logps/chosen": -60.75342559814453, "logps/rejected": -100.37091064453125, "loss": 0.0971, "losses/dpo": 0.07497790455818176, "losses/sft": 1.9749066829681396, "losses/total": 0.07497790455818176, "ref_logps/chosen": -36.79957962036133, "ref_logps/rejected": -47.23640441894531, "rewards/accuracies": 1.0, "rewards/chosen": -2.3953840732574463, "rewards/margins": 2.9180665016174316, "rewards/rejected": -5.313450813293457, "step": 2681 }, { "epoch": 2.53, "grad_norm": 25.20327377319336, "learning_rate": 8.656873032528855e-08, "logps/chosen": -60.66781997680664, "logps/rejected": -85.57939147949219, "loss": 0.256, "losses/dpo": 0.4076347053050995, "losses/sft": 2.5626718997955322, "losses/total": 0.4076347053050995, "ref_logps/chosen": -38.370948791503906, "ref_logps/rejected": -44.13459396362305, "rewards/accuracies": 0.9375, "rewards/chosen": -2.229686975479126, "rewards/margins": 1.914792776107788, "rewards/rejected": -4.144479751586914, "step": 2682 }, { "epoch": 2.53, "grad_norm": 21.946439743041992, "learning_rate": 8.639384400139909e-08, "logps/chosen": -51.11888122558594, "logps/rejected": -80.06268310546875, "loss": 0.1651, "losses/dpo": 0.15955320000648499, "losses/sft": 1.7931838035583496, "losses/total": 0.15955320000648499, "ref_logps/chosen": -36.465118408203125, "ref_logps/rejected": -40.24945068359375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4653767347335815, "rewards/margins": 2.515946626663208, "rewards/rejected": -3.981323480606079, "step": 2683 }, { "epoch": 2.53, "grad_norm": 22.575918197631836, "learning_rate": 8.621895767750962e-08, "logps/chosen": -53.03132629394531, "logps/rejected": -88.59447479248047, "loss": 0.1848, "losses/dpo": 0.223446786403656, "losses/sft": 2.388427257537842, "losses/total": 0.223446786403656, "ref_logps/chosen": -31.093069076538086, "ref_logps/rejected": -40.585906982421875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1938254833221436, "rewards/margins": 2.6070315837860107, "rewards/rejected": -4.8008575439453125, "step": 2684 }, { "epoch": 2.54, "grad_norm": 25.52772331237793, "learning_rate": 8.604407135362014e-08, "logps/chosen": -53.61585998535156, "logps/rejected": -75.51603698730469, "loss": 0.3007, "losses/dpo": 0.28772467374801636, "losses/sft": 3.019272804260254, "losses/total": 0.28772467374801636, "ref_logps/chosen": -32.3934440612793, "ref_logps/rejected": -36.954978942871094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.122241735458374, "rewards/margins": 1.733864188194275, "rewards/rejected": -3.8561058044433594, "step": 2685 }, { "epoch": 2.54, "grad_norm": 23.419788360595703, "learning_rate": 8.586918502973067e-08, "logps/chosen": -51.07176208496094, "logps/rejected": -74.2645263671875, "loss": 0.2598, "losses/dpo": 0.22374330461025238, "losses/sft": 2.271658420562744, "losses/total": 0.22374330461025238, "ref_logps/chosen": -30.589229583740234, "ref_logps/rejected": -34.38256072998047, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0482535362243652, "rewards/margins": 1.9399421215057373, "rewards/rejected": -3.9881958961486816, "step": 2686 }, { "epoch": 2.54, "grad_norm": 28.787511825561523, "learning_rate": 8.56942987058412e-08, "logps/chosen": -58.16155242919922, "logps/rejected": -75.07455444335938, "loss": 0.3659, "losses/dpo": 0.2110963761806488, "losses/sft": 2.0601718425750732, "losses/total": 0.2110963761806488, "ref_logps/chosen": -34.536338806152344, "ref_logps/rejected": -38.25844955444336, "rewards/accuracies": 0.875, "rewards/chosen": -2.3625218868255615, "rewards/margins": 1.3190889358520508, "rewards/rejected": -3.6816108226776123, "step": 2687 }, { "epoch": 2.54, "grad_norm": 34.83714294433594, "learning_rate": 8.551941238195173e-08, "logps/chosen": -51.61262512207031, "logps/rejected": -82.34982299804688, "loss": 0.319, "losses/dpo": 0.04639798402786255, "losses/sft": 1.6721293926239014, "losses/total": 0.04639798402786255, "ref_logps/chosen": -36.343528747558594, "ref_logps/rejected": -46.852760314941406, "rewards/accuracies": 0.875, "rewards/chosen": -1.5269098281860352, "rewards/margins": 2.022796630859375, "rewards/rejected": -3.549706220626831, "step": 2688 }, { "epoch": 2.54, "grad_norm": 35.27690887451172, "learning_rate": 8.534452605806224e-08, "logps/chosen": -72.50088500976562, "logps/rejected": -93.8785400390625, "loss": 0.3794, "losses/dpo": 0.2738884687423706, "losses/sft": 2.269841432571411, "losses/total": 0.2738884687423706, "ref_logps/chosen": -44.61711120605469, "ref_logps/rejected": -46.61103057861328, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7883777618408203, "rewards/margins": 1.9383740425109863, "rewards/rejected": -4.726752281188965, "step": 2689 }, { "epoch": 2.54, "grad_norm": 22.528913497924805, "learning_rate": 8.516963973417278e-08, "logps/chosen": -57.423797607421875, "logps/rejected": -78.01254272460938, "loss": 0.2056, "losses/dpo": 0.29599159955978394, "losses/sft": 2.3086752891540527, "losses/total": 0.29599159955978394, "ref_logps/chosen": -38.14057159423828, "ref_logps/rejected": -36.20295715332031, "rewards/accuracies": 1.0, "rewards/chosen": -1.9283225536346436, "rewards/margins": 2.2526357173919678, "rewards/rejected": -4.180958271026611, "step": 2690 }, { "epoch": 2.54, "grad_norm": 41.20805740356445, "learning_rate": 8.499475341028331e-08, "logps/chosen": -54.827720642089844, "logps/rejected": -71.26068878173828, "loss": 0.4547, "losses/dpo": 0.1558263748884201, "losses/sft": 1.9702070951461792, "losses/total": 0.1558263748884201, "ref_logps/chosen": -35.821510314941406, "ref_logps/rejected": -35.55598449707031, "rewards/accuracies": 0.875, "rewards/chosen": -1.900620937347412, "rewards/margins": 1.6698490381240845, "rewards/rejected": -3.570470094680786, "step": 2691 }, { "epoch": 2.54, "grad_norm": 36.05692672729492, "learning_rate": 8.481986708639384e-08, "logps/chosen": -64.08729553222656, "logps/rejected": -89.35417938232422, "loss": 0.3963, "losses/dpo": 0.3824594020843506, "losses/sft": 1.5775214433670044, "losses/total": 0.3824594020843506, "ref_logps/chosen": -38.27870178222656, "ref_logps/rejected": -46.996070861816406, "rewards/accuracies": 0.875, "rewards/chosen": -2.580859422683716, "rewards/margins": 1.654951572418213, "rewards/rejected": -4.23581075668335, "step": 2692 }, { "epoch": 2.54, "grad_norm": 28.538545608520508, "learning_rate": 8.464498076250437e-08, "logps/chosen": -58.3211669921875, "logps/rejected": -86.10749053955078, "loss": 0.3228, "losses/dpo": 0.28001561760902405, "losses/sft": 2.0562100410461426, "losses/total": 0.28001561760902405, "ref_logps/chosen": -35.05115509033203, "ref_logps/rejected": -45.59699630737305, "rewards/accuracies": 0.875, "rewards/chosen": -2.3270010948181152, "rewards/margins": 1.7240484952926636, "rewards/rejected": -4.051049709320068, "step": 2693 }, { "epoch": 2.54, "grad_norm": 30.47183609008789, "learning_rate": 8.44700944386149e-08, "logps/chosen": -65.7847900390625, "logps/rejected": -85.92161560058594, "loss": 0.2933, "losses/dpo": 0.21794924139976501, "losses/sft": 2.74277925491333, "losses/total": 0.21794924139976501, "ref_logps/chosen": -43.84074401855469, "ref_logps/rejected": -44.49517059326172, "rewards/accuracies": 0.875, "rewards/chosen": -2.194404125213623, "rewards/margins": 1.9482409954071045, "rewards/rejected": -4.142644882202148, "step": 2694 }, { "epoch": 2.54, "grad_norm": 22.02395248413086, "learning_rate": 8.429520811472543e-08, "logps/chosen": -70.81207275390625, "logps/rejected": -84.12870788574219, "loss": 0.2327, "losses/dpo": 0.28347504138946533, "losses/sft": 2.1498355865478516, "losses/total": 0.28347504138946533, "ref_logps/chosen": -46.21925354003906, "ref_logps/rejected": -40.54079055786133, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4592814445495605, "rewards/margins": 1.8995107412338257, "rewards/rejected": -4.358792304992676, "step": 2695 }, { "epoch": 2.55, "grad_norm": 14.962983131408691, "learning_rate": 8.412032179083595e-08, "logps/chosen": -52.652095794677734, "logps/rejected": -108.85685729980469, "loss": 0.1631, "losses/dpo": 0.18968872725963593, "losses/sft": 2.0677170753479004, "losses/total": 0.18968872725963593, "ref_logps/chosen": -32.3492546081543, "ref_logps/rejected": -63.92298126220703, "rewards/accuracies": 1.0, "rewards/chosen": -2.0302841663360596, "rewards/margins": 2.463104248046875, "rewards/rejected": -4.4933881759643555, "step": 2696 }, { "epoch": 2.55, "grad_norm": 38.845951080322266, "learning_rate": 8.394543546694648e-08, "logps/chosen": -60.15574645996094, "logps/rejected": -88.33204650878906, "loss": 0.3749, "losses/dpo": 0.0761902928352356, "losses/sft": 2.1660962104797363, "losses/total": 0.0761902928352356, "ref_logps/chosen": -39.53703308105469, "ref_logps/rejected": -44.3358154296875, "rewards/accuracies": 0.75, "rewards/chosen": -2.061871290206909, "rewards/margins": 2.337751865386963, "rewards/rejected": -4.399622917175293, "step": 2697 }, { "epoch": 2.55, "grad_norm": 30.22638702392578, "learning_rate": 8.3770549143057e-08, "logps/chosen": -57.35099411010742, "logps/rejected": -77.09319305419922, "loss": 0.3672, "losses/dpo": 0.3272969722747803, "losses/sft": 1.9871859550476074, "losses/total": 0.3272969722747803, "ref_logps/chosen": -38.44696044921875, "ref_logps/rejected": -37.48995590209961, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8904036283493042, "rewards/margins": 2.069920539855957, "rewards/rejected": -3.960324287414551, "step": 2698 }, { "epoch": 2.55, "grad_norm": 24.346715927124023, "learning_rate": 8.359566281916754e-08, "logps/chosen": -57.777427673339844, "logps/rejected": -80.12771606445312, "loss": 0.2426, "losses/dpo": 0.2551974356174469, "losses/sft": 1.7932424545288086, "losses/total": 0.2551974356174469, "ref_logps/chosen": -37.71232223510742, "ref_logps/rejected": -39.012168884277344, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0065104961395264, "rewards/margins": 2.105043888092041, "rewards/rejected": -4.111554145812988, "step": 2699 }, { "epoch": 2.55, "grad_norm": 17.088525772094727, "learning_rate": 8.342077649527807e-08, "logps/chosen": -54.74834442138672, "logps/rejected": -87.44673156738281, "loss": 0.201, "losses/dpo": 0.4395669400691986, "losses/sft": 2.2243034839630127, "losses/total": 0.4395669400691986, "ref_logps/chosen": -31.777137756347656, "ref_logps/rejected": -38.3187255859375, "rewards/accuracies": 0.875, "rewards/chosen": -2.2971205711364746, "rewards/margins": 2.6156809329986572, "rewards/rejected": -4.912801265716553, "step": 2700 }, { "epoch": 2.55, "grad_norm": 16.262378692626953, "learning_rate": 8.32458901713886e-08, "logps/chosen": -56.27473449707031, "logps/rejected": -72.77891540527344, "loss": 0.2048, "losses/dpo": 0.1984620839357376, "losses/sft": 1.8290061950683594, "losses/total": 0.1984620839357376, "ref_logps/chosen": -38.23634719848633, "ref_logps/rejected": -34.218109130859375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8038387298583984, "rewards/margins": 2.052241802215576, "rewards/rejected": -3.8560807704925537, "step": 2701 }, { "epoch": 2.55, "grad_norm": 21.502120971679688, "learning_rate": 8.307100384749912e-08, "logps/chosen": -53.68284606933594, "logps/rejected": -87.90415954589844, "loss": 0.2486, "losses/dpo": 0.0498664416372776, "losses/sft": 1.9409465789794922, "losses/total": 0.0498664416372776, "ref_logps/chosen": -32.84949493408203, "ref_logps/rejected": -40.87139129638672, "rewards/accuracies": 0.875, "rewards/chosen": -2.0833353996276855, "rewards/margins": 2.6199421882629395, "rewards/rejected": -4.703277587890625, "step": 2702 }, { "epoch": 2.55, "grad_norm": 24.44669532775879, "learning_rate": 8.289611752360965e-08, "logps/chosen": -56.875633239746094, "logps/rejected": -86.78715515136719, "loss": 0.2103, "losses/dpo": 0.15162293612957, "losses/sft": 2.292759895324707, "losses/total": 0.15162293612957, "ref_logps/chosen": -36.75770568847656, "ref_logps/rejected": -43.673057556152344, "rewards/accuracies": 1.0, "rewards/chosen": -2.0117931365966797, "rewards/margins": 2.299616813659668, "rewards/rejected": -4.311409950256348, "step": 2703 }, { "epoch": 2.55, "grad_norm": 11.615337371826172, "learning_rate": 8.272123119972017e-08, "logps/chosen": -50.485042572021484, "logps/rejected": -96.719482421875, "loss": 0.0889, "losses/dpo": 0.13951162993907928, "losses/sft": 2.236678123474121, "losses/total": 0.13951162993907928, "ref_logps/chosen": -34.31647872924805, "ref_logps/rejected": -52.05599594116211, "rewards/accuracies": 1.0, "rewards/chosen": -1.616856575012207, "rewards/margins": 2.849492073059082, "rewards/rejected": -4.466348648071289, "step": 2704 }, { "epoch": 2.55, "grad_norm": 20.754470825195312, "learning_rate": 8.25463448758307e-08, "logps/chosen": -53.029170989990234, "logps/rejected": -69.87765502929688, "loss": 0.2586, "losses/dpo": 0.19815683364868164, "losses/sft": 2.0173227787017822, "losses/total": 0.19815683364868164, "ref_logps/chosen": -37.79646301269531, "ref_logps/rejected": -35.85797119140625, "rewards/accuracies": 0.875, "rewards/chosen": -1.5232707262039185, "rewards/margins": 1.8786981105804443, "rewards/rejected": -3.4019687175750732, "step": 2705 }, { "epoch": 2.56, "grad_norm": 21.824260711669922, "learning_rate": 8.237145855194124e-08, "logps/chosen": -55.73167419433594, "logps/rejected": -80.55992126464844, "loss": 0.247, "losses/dpo": 0.10748860239982605, "losses/sft": 2.1623952388763428, "losses/total": 0.10748860239982605, "ref_logps/chosen": -35.19187545776367, "ref_logps/rejected": -39.41991424560547, "rewards/accuracies": 0.875, "rewards/chosen": -2.0539801120758057, "rewards/margins": 2.0600204467773438, "rewards/rejected": -4.11400032043457, "step": 2706 }, { "epoch": 2.56, "grad_norm": 19.404788970947266, "learning_rate": 8.219657222805176e-08, "logps/chosen": -63.99925231933594, "logps/rejected": -88.44627380371094, "loss": 0.171, "losses/dpo": 0.2757548987865448, "losses/sft": 2.140808343887329, "losses/total": 0.2757548987865448, "ref_logps/chosen": -43.223228454589844, "ref_logps/rejected": -45.635955810546875, "rewards/accuracies": 1.0, "rewards/chosen": -2.0776021480560303, "rewards/margins": 2.203429698944092, "rewards/rejected": -4.281031608581543, "step": 2707 }, { "epoch": 2.56, "grad_norm": 19.738277435302734, "learning_rate": 8.202168590416229e-08, "logps/chosen": -59.29216766357422, "logps/rejected": -83.80810546875, "loss": 0.193, "losses/dpo": 0.11343620717525482, "losses/sft": 1.7162336111068726, "losses/total": 0.11343620717525482, "ref_logps/chosen": -40.56107711791992, "ref_logps/rejected": -40.640689849853516, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8731088638305664, "rewards/margins": 2.4436323642730713, "rewards/rejected": -4.316740989685059, "step": 2708 }, { "epoch": 2.56, "grad_norm": 18.394798278808594, "learning_rate": 8.184679958027283e-08, "logps/chosen": -48.421905517578125, "logps/rejected": -86.56251525878906, "loss": 0.1685, "losses/dpo": 0.2300102710723877, "losses/sft": 2.1626460552215576, "losses/total": 0.2300102710723877, "ref_logps/chosen": -29.724613189697266, "ref_logps/rejected": -41.89875793457031, "rewards/accuracies": 1.0, "rewards/chosen": -1.8697292804718018, "rewards/margins": 2.596646308898926, "rewards/rejected": -4.466375350952148, "step": 2709 }, { "epoch": 2.56, "grad_norm": 20.218198776245117, "learning_rate": 8.167191325638335e-08, "logps/chosen": -59.41455841064453, "logps/rejected": -82.58963775634766, "loss": 0.2338, "losses/dpo": 0.23479968309402466, "losses/sft": 2.1468422412872314, "losses/total": 0.23479968309402466, "ref_logps/chosen": -40.77599334716797, "ref_logps/rejected": -38.94643020629883, "rewards/accuracies": 0.875, "rewards/chosen": -1.863856554031372, "rewards/margins": 2.5004639625549316, "rewards/rejected": -4.364320755004883, "step": 2710 }, { "epoch": 2.56, "grad_norm": 28.48883819580078, "learning_rate": 8.149702693249387e-08, "logps/chosen": -65.20120239257812, "logps/rejected": -91.61866760253906, "loss": 0.2368, "losses/dpo": 0.13447558879852295, "losses/sft": 2.661884069442749, "losses/total": 0.13447558879852295, "ref_logps/chosen": -40.47429656982422, "ref_logps/rejected": -43.73830795288086, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4726903438568115, "rewards/margins": 2.3153457641601562, "rewards/rejected": -4.788036346435547, "step": 2711 }, { "epoch": 2.56, "grad_norm": 25.97309684753418, "learning_rate": 8.13221406086044e-08, "logps/chosen": -77.37887573242188, "logps/rejected": -94.94210052490234, "loss": 0.2287, "losses/dpo": 0.4562962055206299, "losses/sft": 2.44805645942688, "losses/total": 0.4562962055206299, "ref_logps/chosen": -48.929874420166016, "ref_logps/rejected": -44.67776870727539, "rewards/accuracies": 0.9375, "rewards/chosen": -2.844900131225586, "rewards/margins": 2.1815333366394043, "rewards/rejected": -5.02643346786499, "step": 2712 }, { "epoch": 2.56, "grad_norm": 24.289073944091797, "learning_rate": 8.114725428471493e-08, "logps/chosen": -74.5272445678711, "logps/rejected": -101.11073303222656, "loss": 0.2716, "losses/dpo": 0.0493089035153389, "losses/sft": 2.828669786453247, "losses/total": 0.0493089035153389, "ref_logps/chosen": -46.1937255859375, "ref_logps/rejected": -50.59699249267578, "rewards/accuracies": 0.875, "rewards/chosen": -2.83335280418396, "rewards/margins": 2.218021869659424, "rewards/rejected": -5.051374435424805, "step": 2713 }, { "epoch": 2.56, "grad_norm": 16.502212524414062, "learning_rate": 8.097236796082546e-08, "logps/chosen": -57.70896911621094, "logps/rejected": -88.69378662109375, "loss": 0.1388, "losses/dpo": 0.13913074135780334, "losses/sft": 2.0559241771698, "losses/total": 0.13913074135780334, "ref_logps/chosen": -37.42401885986328, "ref_logps/rejected": -39.46044921875, "rewards/accuracies": 1.0, "rewards/chosen": -2.0284950733184814, "rewards/margins": 2.89483904838562, "rewards/rejected": -4.923334121704102, "step": 2714 }, { "epoch": 2.56, "grad_norm": 17.884679794311523, "learning_rate": 8.0797481636936e-08, "logps/chosen": -50.73571014404297, "logps/rejected": -82.34507751464844, "loss": 0.1387, "losses/dpo": 0.05677824094891548, "losses/sft": 1.0485986471176147, "losses/total": 0.05677824094891548, "ref_logps/chosen": -33.30804443359375, "ref_logps/rejected": -36.03643035888672, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7427667379379272, "rewards/margins": 2.8880972862243652, "rewards/rejected": -4.630864143371582, "step": 2715 }, { "epoch": 2.56, "grad_norm": 16.64714813232422, "learning_rate": 8.062259531304652e-08, "logps/chosen": -53.00878143310547, "logps/rejected": -89.87242889404297, "loss": 0.1451, "losses/dpo": 0.20994940400123596, "losses/sft": 2.305251359939575, "losses/total": 0.20994940400123596, "ref_logps/chosen": -32.36865997314453, "ref_logps/rejected": -44.373199462890625, "rewards/accuracies": 1.0, "rewards/chosen": -2.064012050628662, "rewards/margins": 2.4859113693237305, "rewards/rejected": -4.549923419952393, "step": 2716 }, { "epoch": 2.57, "grad_norm": 30.641056060791016, "learning_rate": 8.044770898915705e-08, "logps/chosen": -51.4459114074707, "logps/rejected": -80.82252502441406, "loss": 0.3217, "losses/dpo": 0.32456889748573303, "losses/sft": 2.557231903076172, "losses/total": 0.32456889748573303, "ref_logps/chosen": -29.376659393310547, "ref_logps/rejected": -40.18138122558594, "rewards/accuracies": 0.875, "rewards/chosen": -2.206925392150879, "rewards/margins": 1.8571892976760864, "rewards/rejected": -4.064114570617676, "step": 2717 }, { "epoch": 2.57, "grad_norm": 29.355276107788086, "learning_rate": 8.027282266526756e-08, "logps/chosen": -53.154075622558594, "logps/rejected": -82.87979888916016, "loss": 0.2962, "losses/dpo": 0.3268861174583435, "losses/sft": 2.273249387741089, "losses/total": 0.3268861174583435, "ref_logps/chosen": -34.71628189086914, "ref_logps/rejected": -44.28424072265625, "rewards/accuracies": 0.875, "rewards/chosen": -1.8437790870666504, "rewards/margins": 2.0157763957977295, "rewards/rejected": -3.859555721282959, "step": 2718 }, { "epoch": 2.57, "grad_norm": 37.05657958984375, "learning_rate": 8.00979363413781e-08, "logps/chosen": -74.75926208496094, "logps/rejected": -105.12036895751953, "loss": 0.2862, "losses/dpo": 0.18447448313236237, "losses/sft": 2.504068613052368, "losses/total": 0.18447448313236237, "ref_logps/chosen": -44.20771026611328, "ref_logps/rejected": -52.926090240478516, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0551552772521973, "rewards/margins": 2.1642727851867676, "rewards/rejected": -5.219428062438965, "step": 2719 }, { "epoch": 2.57, "grad_norm": 28.68370819091797, "learning_rate": 7.992305001748863e-08, "logps/chosen": -62.89386749267578, "logps/rejected": -86.82422637939453, "loss": 0.2742, "losses/dpo": 0.29928988218307495, "losses/sft": 2.10127592086792, "losses/total": 0.29928988218307495, "ref_logps/chosen": -41.423397064208984, "ref_logps/rejected": -39.5869255065918, "rewards/accuracies": 0.75, "rewards/chosen": -2.1470468044281006, "rewards/margins": 2.576683521270752, "rewards/rejected": -4.723730564117432, "step": 2720 }, { "epoch": 2.57, "grad_norm": 30.622920989990234, "learning_rate": 7.974816369359915e-08, "logps/chosen": -58.17392349243164, "logps/rejected": -87.57852935791016, "loss": 0.2811, "losses/dpo": 0.3850134313106537, "losses/sft": 2.2896618843078613, "losses/total": 0.3850134313106537, "ref_logps/chosen": -35.89087677001953, "ref_logps/rejected": -43.804901123046875, "rewards/accuracies": 0.875, "rewards/chosen": -2.228304624557495, "rewards/margins": 2.1490578651428223, "rewards/rejected": -4.3773627281188965, "step": 2721 }, { "epoch": 2.57, "grad_norm": 26.471567153930664, "learning_rate": 7.957327736970969e-08, "logps/chosen": -56.93463897705078, "logps/rejected": -79.65208435058594, "loss": 0.2862, "losses/dpo": 0.439805805683136, "losses/sft": 2.0052285194396973, "losses/total": 0.439805805683136, "ref_logps/chosen": -36.83883285522461, "ref_logps/rejected": -39.14894104003906, "rewards/accuracies": 0.875, "rewards/chosen": -2.009580373764038, "rewards/margins": 2.04073429107666, "rewards/rejected": -4.050314903259277, "step": 2722 }, { "epoch": 2.57, "grad_norm": 40.461177825927734, "learning_rate": 7.939839104582022e-08, "logps/chosen": -60.62139129638672, "logps/rejected": -85.19065856933594, "loss": 0.3285, "losses/dpo": 0.18352824449539185, "losses/sft": 1.780651330947876, "losses/total": 0.18352824449539185, "ref_logps/chosen": -34.476436614990234, "ref_logps/rejected": -40.504791259765625, "rewards/accuracies": 0.875, "rewards/chosen": -2.614495277404785, "rewards/margins": 1.854091763496399, "rewards/rejected": -4.4685869216918945, "step": 2723 }, { "epoch": 2.57, "grad_norm": 17.81596565246582, "learning_rate": 7.922350472193074e-08, "logps/chosen": -52.41864013671875, "logps/rejected": -87.20779418945312, "loss": 0.1905, "losses/dpo": 0.20663711428642273, "losses/sft": 1.5303996801376343, "losses/total": 0.20663711428642273, "ref_logps/chosen": -34.06294250488281, "ref_logps/rejected": -42.91332244873047, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8355698585510254, "rewards/margins": 2.5938773155212402, "rewards/rejected": -4.429447174072266, "step": 2724 }, { "epoch": 2.57, "grad_norm": 26.453439712524414, "learning_rate": 7.904861839804127e-08, "logps/chosen": -65.81513977050781, "logps/rejected": -90.457275390625, "loss": 0.2071, "losses/dpo": 0.09725739806890488, "losses/sft": 2.0027050971984863, "losses/total": 0.09725739806890488, "ref_logps/chosen": -41.82478332519531, "ref_logps/rejected": -43.45936584472656, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3990349769592285, "rewards/margins": 2.300755739212036, "rewards/rejected": -4.699790954589844, "step": 2725 }, { "epoch": 2.57, "grad_norm": 36.95955276489258, "learning_rate": 7.88737320741518e-08, "logps/chosen": -59.159141540527344, "logps/rejected": -93.71883392333984, "loss": 0.2399, "losses/dpo": 0.1004643589258194, "losses/sft": 2.725231885910034, "losses/total": 0.1004643589258194, "ref_logps/chosen": -35.653419494628906, "ref_logps/rejected": -44.3263053894043, "rewards/accuracies": 0.9375, "rewards/chosen": -2.350572109222412, "rewards/margins": 2.5886809825897217, "rewards/rejected": -4.939253330230713, "step": 2726 }, { "epoch": 2.58, "grad_norm": 19.50714874267578, "learning_rate": 7.869884575026232e-08, "logps/chosen": -61.15166473388672, "logps/rejected": -90.53507995605469, "loss": 0.2022, "losses/dpo": 0.587996780872345, "losses/sft": 2.068999767303467, "losses/total": 0.587996780872345, "ref_logps/chosen": -40.30133819580078, "ref_logps/rejected": -44.567108154296875, "rewards/accuracies": 0.875, "rewards/chosen": -2.0850329399108887, "rewards/margins": 2.5117645263671875, "rewards/rejected": -4.596797943115234, "step": 2727 }, { "epoch": 2.58, "grad_norm": 20.398759841918945, "learning_rate": 7.852395942637286e-08, "logps/chosen": -56.559783935546875, "logps/rejected": -87.5220718383789, "loss": 0.1689, "losses/dpo": 0.07520067691802979, "losses/sft": 2.4023282527923584, "losses/total": 0.07520067691802979, "ref_logps/chosen": -36.88642120361328, "ref_logps/rejected": -42.54554748535156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9673360586166382, "rewards/margins": 2.53031587600708, "rewards/rejected": -4.497652053833008, "step": 2728 }, { "epoch": 2.58, "grad_norm": 23.103666305541992, "learning_rate": 7.834907310248339e-08, "logps/chosen": -56.202880859375, "logps/rejected": -94.07498168945312, "loss": 0.2416, "losses/dpo": 0.537917971611023, "losses/sft": 1.7297097444534302, "losses/total": 0.537917971611023, "ref_logps/chosen": -37.405799865722656, "ref_logps/rejected": -48.333030700683594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8797078132629395, "rewards/margins": 2.6944875717163086, "rewards/rejected": -4.57419490814209, "step": 2729 }, { "epoch": 2.58, "grad_norm": 35.609649658203125, "learning_rate": 7.817418677859391e-08, "logps/chosen": -48.1756706237793, "logps/rejected": -83.10713195800781, "loss": 0.5478, "losses/dpo": 0.2470855414867401, "losses/sft": 1.257510781288147, "losses/total": 0.2470855414867401, "ref_logps/chosen": -29.944290161132812, "ref_logps/rejected": -43.08921813964844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8231382369995117, "rewards/margins": 2.1786537170410156, "rewards/rejected": -4.001791954040527, "step": 2730 }, { "epoch": 2.58, "grad_norm": 29.666091918945312, "learning_rate": 7.799930045470445e-08, "logps/chosen": -59.72986602783203, "logps/rejected": -74.16252899169922, "loss": 0.3179, "losses/dpo": 0.16925600171089172, "losses/sft": 1.5958210229873657, "losses/total": 0.16925600171089172, "ref_logps/chosen": -37.968505859375, "ref_logps/rejected": -34.77012634277344, "rewards/accuracies": 0.875, "rewards/chosen": -2.1761364936828613, "rewards/margins": 1.763104796409607, "rewards/rejected": -3.9392411708831787, "step": 2731 }, { "epoch": 2.58, "grad_norm": 16.219690322875977, "learning_rate": 7.782441413081496e-08, "logps/chosen": -46.803672790527344, "logps/rejected": -86.58584594726562, "loss": 0.1355, "losses/dpo": 0.18694616854190826, "losses/sft": 1.8560783863067627, "losses/total": 0.18694616854190826, "ref_logps/chosen": -32.62672805786133, "ref_logps/rejected": -43.35367202758789, "rewards/accuracies": 1.0, "rewards/chosen": -1.417694330215454, "rewards/margins": 2.9055228233337402, "rewards/rejected": -4.323217391967773, "step": 2732 }, { "epoch": 2.58, "grad_norm": 28.264787673950195, "learning_rate": 7.764952780692549e-08, "logps/chosen": -55.259429931640625, "logps/rejected": -88.17562866210938, "loss": 0.2243, "losses/dpo": 0.37570884823799133, "losses/sft": 2.372860908508301, "losses/total": 0.37570884823799133, "ref_logps/chosen": -35.449188232421875, "ref_logps/rejected": -42.4696159362793, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9810240268707275, "rewards/margins": 2.5895774364471436, "rewards/rejected": -4.570601463317871, "step": 2733 }, { "epoch": 2.58, "grad_norm": 19.84027671813965, "learning_rate": 7.747464148303602e-08, "logps/chosen": -57.22923278808594, "logps/rejected": -75.40006256103516, "loss": 0.1557, "losses/dpo": 0.08267943561077118, "losses/sft": 1.6729727983474731, "losses/total": 0.08267943561077118, "ref_logps/chosen": -41.01884460449219, "ref_logps/rejected": -37.75150680541992, "rewards/accuracies": 1.0, "rewards/chosen": -1.6210386753082275, "rewards/margins": 2.14381742477417, "rewards/rejected": -3.7648563385009766, "step": 2734 }, { "epoch": 2.58, "grad_norm": 35.98508071899414, "learning_rate": 7.729975515914655e-08, "logps/chosen": -66.03498840332031, "logps/rejected": -90.2000732421875, "loss": 0.3154, "losses/dpo": 0.667961061000824, "losses/sft": 2.4656476974487305, "losses/total": 0.667961061000824, "ref_logps/chosen": -47.4753532409668, "ref_logps/rejected": -49.57347869873047, "rewards/accuracies": 0.875, "rewards/chosen": -1.8559632301330566, "rewards/margins": 2.206695556640625, "rewards/rejected": -4.062658786773682, "step": 2735 }, { "epoch": 2.58, "grad_norm": 20.190582275390625, "learning_rate": 7.712486883525708e-08, "logps/chosen": -69.85173034667969, "logps/rejected": -105.54653930664062, "loss": 0.1527, "losses/dpo": 0.2493601143360138, "losses/sft": 1.9513732194900513, "losses/total": 0.2493601143360138, "ref_logps/chosen": -44.38006591796875, "ref_logps/rejected": -50.512794494628906, "rewards/accuracies": 0.9375, "rewards/chosen": -2.547166347503662, "rewards/margins": 2.9562084674835205, "rewards/rejected": -5.503375053405762, "step": 2736 }, { "epoch": 2.58, "grad_norm": 28.619308471679688, "learning_rate": 7.69499825113676e-08, "logps/chosen": -55.38597106933594, "logps/rejected": -93.68559265136719, "loss": 0.2459, "losses/dpo": 0.07029101997613907, "losses/sft": 2.2137067317962646, "losses/total": 0.07029101997613907, "ref_logps/chosen": -35.886619567871094, "ref_logps/rejected": -49.30084228515625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9499354362487793, "rewards/margins": 2.488539934158325, "rewards/rejected": -4.438475131988525, "step": 2737 }, { "epoch": 2.59, "grad_norm": 16.033260345458984, "learning_rate": 7.677509618747815e-08, "logps/chosen": -65.22747039794922, "logps/rejected": -92.58159637451172, "loss": 0.1134, "losses/dpo": 0.16272282600402832, "losses/sft": 2.371382713317871, "losses/total": 0.16272282600402832, "ref_logps/chosen": -41.89468002319336, "ref_logps/rejected": -41.7523078918457, "rewards/accuracies": 1.0, "rewards/chosen": -2.3332786560058594, "rewards/margins": 2.749650716781616, "rewards/rejected": -5.082929611206055, "step": 2738 }, { "epoch": 2.59, "grad_norm": 27.556373596191406, "learning_rate": 7.660020986358866e-08, "logps/chosen": -57.74921798706055, "logps/rejected": -76.01809692382812, "loss": 0.2424, "losses/dpo": 0.07248252630233765, "losses/sft": 3.415224075317383, "losses/total": 0.07248252630233765, "ref_logps/chosen": -33.27150344848633, "ref_logps/rejected": -32.11667251586914, "rewards/accuracies": 0.875, "rewards/chosen": -2.4477713108062744, "rewards/margins": 1.942371129989624, "rewards/rejected": -4.390142440795898, "step": 2739 }, { "epoch": 2.59, "grad_norm": 31.120080947875977, "learning_rate": 7.642532353969918e-08, "logps/chosen": -64.96121215820312, "logps/rejected": -95.09601593017578, "loss": 0.3059, "losses/dpo": 0.16930629312992096, "losses/sft": 2.241840362548828, "losses/total": 0.16930629312992096, "ref_logps/chosen": -41.666969299316406, "ref_logps/rejected": -47.29644012451172, "rewards/accuracies": 0.875, "rewards/chosen": -2.3294241428375244, "rewards/margins": 2.450533390045166, "rewards/rejected": -4.7799577713012695, "step": 2740 }, { "epoch": 2.59, "grad_norm": 31.812015533447266, "learning_rate": 7.625043721580972e-08, "logps/chosen": -58.548728942871094, "logps/rejected": -84.16853332519531, "loss": 0.2639, "losses/dpo": 0.4040836989879608, "losses/sft": 2.3799593448638916, "losses/total": 0.4040836989879608, "ref_logps/chosen": -36.088958740234375, "ref_logps/rejected": -36.762855529785156, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2459769248962402, "rewards/margins": 2.494590997695923, "rewards/rejected": -4.740568161010742, "step": 2741 }, { "epoch": 2.59, "grad_norm": 24.292409896850586, "learning_rate": 7.607555089192025e-08, "logps/chosen": -53.20740509033203, "logps/rejected": -72.34294128417969, "loss": 0.2668, "losses/dpo": 0.457363098859787, "losses/sft": 1.9364184141159058, "losses/total": 0.457363098859787, "ref_logps/chosen": -30.19186019897461, "ref_logps/rejected": -31.325294494628906, "rewards/accuracies": 1.0, "rewards/chosen": -2.3015542030334473, "rewards/margins": 1.8002103567123413, "rewards/rejected": -4.101764678955078, "step": 2742 }, { "epoch": 2.59, "grad_norm": 29.611892700195312, "learning_rate": 7.590066456803077e-08, "logps/chosen": -62.867698669433594, "logps/rejected": -86.58934020996094, "loss": 0.2814, "losses/dpo": 0.3269091844558716, "losses/sft": 1.7992746829986572, "losses/total": 0.3269091844558716, "ref_logps/chosen": -38.58263397216797, "ref_logps/rejected": -42.31630325317383, "rewards/accuracies": 1.0, "rewards/chosen": -2.428506851196289, "rewards/margins": 1.9987969398498535, "rewards/rejected": -4.427303791046143, "step": 2743 }, { "epoch": 2.59, "grad_norm": 34.87808609008789, "learning_rate": 7.572577824414131e-08, "logps/chosen": -58.59577941894531, "logps/rejected": -67.25249481201172, "loss": 0.4421, "losses/dpo": 0.6237035989761353, "losses/sft": 2.1839563846588135, "losses/total": 0.6237035989761353, "ref_logps/chosen": -37.07997131347656, "ref_logps/rejected": -31.63884735107422, "rewards/accuracies": 0.625, "rewards/chosen": -2.151581287384033, "rewards/margins": 1.4097837209701538, "rewards/rejected": -3.5613648891448975, "step": 2744 }, { "epoch": 2.59, "grad_norm": 14.540555000305176, "learning_rate": 7.555089192025184e-08, "logps/chosen": -52.120262145996094, "logps/rejected": -81.17356872558594, "loss": 0.1524, "losses/dpo": 0.12189151346683502, "losses/sft": 2.514629364013672, "losses/total": 0.12189151346683502, "ref_logps/chosen": -33.45191955566406, "ref_logps/rejected": -37.32148361206055, "rewards/accuracies": 1.0, "rewards/chosen": -1.8668346405029297, "rewards/margins": 2.5183732509613037, "rewards/rejected": -4.3852081298828125, "step": 2745 }, { "epoch": 2.59, "grad_norm": 26.12554931640625, "learning_rate": 7.537600559636235e-08, "logps/chosen": -51.76294708251953, "logps/rejected": -77.05157470703125, "loss": 0.3377, "losses/dpo": 0.31474778056144714, "losses/sft": 2.283543109893799, "losses/total": 0.31474778056144714, "ref_logps/chosen": -34.063941955566406, "ref_logps/rejected": -38.45088195800781, "rewards/accuracies": 0.875, "rewards/chosen": -1.7699003219604492, "rewards/margins": 2.090169906616211, "rewards/rejected": -3.86007022857666, "step": 2746 }, { "epoch": 2.59, "grad_norm": 26.18435287475586, "learning_rate": 7.520111927247289e-08, "logps/chosen": -63.85174560546875, "logps/rejected": -88.06969451904297, "loss": 0.3193, "losses/dpo": 0.0951785072684288, "losses/sft": 1.9448986053466797, "losses/total": 0.0951785072684288, "ref_logps/chosen": -39.32052230834961, "ref_logps/rejected": -45.76739501953125, "rewards/accuracies": 0.875, "rewards/chosen": -2.453122138977051, "rewards/margins": 1.7771079540252686, "rewards/rejected": -4.230230331420898, "step": 2747 }, { "epoch": 2.59, "grad_norm": 27.94809913635254, "learning_rate": 7.502623294858342e-08, "logps/chosen": -48.2830924987793, "logps/rejected": -84.45472717285156, "loss": 0.3111, "losses/dpo": 0.6783143877983093, "losses/sft": 2.2958736419677734, "losses/total": 0.6783143877983093, "ref_logps/chosen": -31.046344757080078, "ref_logps/rejected": -42.30171203613281, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7236746549606323, "rewards/margins": 2.4916274547576904, "rewards/rejected": -4.215302467346191, "step": 2748 }, { "epoch": 2.6, "grad_norm": 23.95720100402832, "learning_rate": 7.485134662469394e-08, "logps/chosen": -68.78559112548828, "logps/rejected": -87.39158630371094, "loss": 0.2563, "losses/dpo": 0.3453255295753479, "losses/sft": 2.2026615142822266, "losses/total": 0.3453255295753479, "ref_logps/chosen": -45.66846466064453, "ref_logps/rejected": -46.090431213378906, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3117127418518066, "rewards/margins": 1.8184019327163696, "rewards/rejected": -4.130114555358887, "step": 2749 }, { "epoch": 2.6, "grad_norm": 24.049285888671875, "learning_rate": 7.467646030080447e-08, "logps/chosen": -52.744720458984375, "logps/rejected": -85.0572509765625, "loss": 0.2297, "losses/dpo": 0.04395807906985283, "losses/sft": 2.03694486618042, "losses/total": 0.04395807906985283, "ref_logps/chosen": -33.33172607421875, "ref_logps/rejected": -40.23902893066406, "rewards/accuracies": 0.9375, "rewards/chosen": -1.941299557685852, "rewards/margins": 2.540522813796997, "rewards/rejected": -4.481822490692139, "step": 2750 }, { "epoch": 2.6, "grad_norm": 30.164949417114258, "learning_rate": 7.450157397691501e-08, "logps/chosen": -40.5528564453125, "logps/rejected": -55.17308044433594, "loss": 0.3725, "losses/dpo": 0.36549150943756104, "losses/sft": 1.51241934299469, "losses/total": 0.36549150943756104, "ref_logps/chosen": -27.111711502075195, "ref_logps/rejected": -28.69839096069336, "rewards/accuracies": 0.75, "rewards/chosen": -1.3441150188446045, "rewards/margins": 1.303354024887085, "rewards/rejected": -2.6474692821502686, "step": 2751 }, { "epoch": 2.6, "grad_norm": 23.710203170776367, "learning_rate": 7.432668765302553e-08, "logps/chosen": -56.65016174316406, "logps/rejected": -88.03787994384766, "loss": 0.2623, "losses/dpo": 0.4416307210922241, "losses/sft": 2.197373151779175, "losses/total": 0.4416307210922241, "ref_logps/chosen": -29.66163444519043, "ref_logps/rejected": -42.691070556640625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6988525390625, "rewards/margins": 1.8358286619186401, "rewards/rejected": -4.53468132019043, "step": 2752 }, { "epoch": 2.6, "grad_norm": 23.91667366027832, "learning_rate": 7.415180132913605e-08, "logps/chosen": -52.05896759033203, "logps/rejected": -88.61227416992188, "loss": 0.2444, "losses/dpo": 0.24901221692562103, "losses/sft": 2.4008641242980957, "losses/total": 0.24901221692562103, "ref_logps/chosen": -31.663232803344727, "ref_logps/rejected": -46.722476959228516, "rewards/accuracies": 1.0, "rewards/chosen": -2.0395736694335938, "rewards/margins": 2.1494064331054688, "rewards/rejected": -4.1889801025390625, "step": 2753 }, { "epoch": 2.6, "grad_norm": 31.838598251342773, "learning_rate": 7.397691500524659e-08, "logps/chosen": -59.711021423339844, "logps/rejected": -83.3906478881836, "loss": 0.3189, "losses/dpo": 0.08191178739070892, "losses/sft": 1.777084469795227, "losses/total": 0.08191178739070892, "ref_logps/chosen": -38.904083251953125, "ref_logps/rejected": -41.59431076049805, "rewards/accuracies": 0.875, "rewards/chosen": -2.080693244934082, "rewards/margins": 2.09894061088562, "rewards/rejected": -4.179634094238281, "step": 2754 }, { "epoch": 2.6, "grad_norm": 18.359704971313477, "learning_rate": 7.380202868135711e-08, "logps/chosen": -53.37405014038086, "logps/rejected": -80.7494125366211, "loss": 0.2247, "losses/dpo": 0.14001595973968506, "losses/sft": 1.684976577758789, "losses/total": 0.14001595973968506, "ref_logps/chosen": -35.51100158691406, "ref_logps/rejected": -40.45469665527344, "rewards/accuracies": 1.0, "rewards/chosen": -1.7863047122955322, "rewards/margins": 2.24316668510437, "rewards/rejected": -4.0294718742370605, "step": 2755 }, { "epoch": 2.6, "grad_norm": 14.300028800964355, "learning_rate": 7.362714235746764e-08, "logps/chosen": -67.7360610961914, "logps/rejected": -108.64254760742188, "loss": 0.1267, "losses/dpo": 0.11517482995986938, "losses/sft": 1.967227816581726, "losses/total": 0.11517482995986938, "ref_logps/chosen": -46.20024871826172, "ref_logps/rejected": -55.243709564208984, "rewards/accuracies": 1.0, "rewards/chosen": -2.1535816192626953, "rewards/margins": 3.186302661895752, "rewards/rejected": -5.3398847579956055, "step": 2756 }, { "epoch": 2.6, "grad_norm": 15.87247371673584, "learning_rate": 7.345225603357818e-08, "logps/chosen": -74.22026824951172, "logps/rejected": -102.53128814697266, "loss": 0.1413, "losses/dpo": 0.09355618804693222, "losses/sft": 2.145048141479492, "losses/total": 0.09355618804693222, "ref_logps/chosen": -49.551414489746094, "ref_logps/rejected": -54.68934631347656, "rewards/accuracies": 1.0, "rewards/chosen": -2.4668853282928467, "rewards/margins": 2.3173089027404785, "rewards/rejected": -4.784193992614746, "step": 2757 }, { "epoch": 2.6, "grad_norm": 25.568130493164062, "learning_rate": 7.32773697096887e-08, "logps/chosen": -59.52566909790039, "logps/rejected": -92.83795166015625, "loss": 0.3106, "losses/dpo": 0.6710569262504578, "losses/sft": 1.9152323007583618, "losses/total": 0.6710569262504578, "ref_logps/chosen": -32.93742752075195, "ref_logps/rejected": -43.587005615234375, "rewards/accuracies": 0.75, "rewards/chosen": -2.6588244438171387, "rewards/margins": 2.266270160675049, "rewards/rejected": -4.9250946044921875, "step": 2758 }, { "epoch": 2.61, "grad_norm": 35.34548568725586, "learning_rate": 7.310248338579923e-08, "logps/chosen": -55.91728591918945, "logps/rejected": -80.59571075439453, "loss": 0.3067, "losses/dpo": 0.14326077699661255, "losses/sft": 2.452972173690796, "losses/total": 0.14326077699661255, "ref_logps/chosen": -37.061195373535156, "ref_logps/rejected": -44.04087829589844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8856089115142822, "rewards/margins": 1.769874095916748, "rewards/rejected": -3.6554832458496094, "step": 2759 }, { "epoch": 2.61, "grad_norm": 44.63969802856445, "learning_rate": 7.292759706190975e-08, "logps/chosen": -70.88312530517578, "logps/rejected": -78.41619873046875, "loss": 0.6211, "losses/dpo": 0.39397814869880676, "losses/sft": 1.8053189516067505, "losses/total": 0.39397814869880676, "ref_logps/chosen": -46.417572021484375, "ref_logps/rejected": -42.08167266845703, "rewards/accuracies": 0.75, "rewards/chosen": -2.4465560913085938, "rewards/margins": 1.1868970394134521, "rewards/rejected": -3.633452892303467, "step": 2760 }, { "epoch": 2.61, "grad_norm": 19.750978469848633, "learning_rate": 7.275271073802028e-08, "logps/chosen": -66.57269287109375, "logps/rejected": -101.32249450683594, "loss": 0.1487, "losses/dpo": 0.20585063099861145, "losses/sft": 1.814422845840454, "losses/total": 0.20585063099861145, "ref_logps/chosen": -44.48317337036133, "ref_logps/rejected": -52.13437271118164, "rewards/accuracies": 1.0, "rewards/chosen": -2.2089526653289795, "rewards/margins": 2.709860324859619, "rewards/rejected": -4.9188127517700195, "step": 2761 }, { "epoch": 2.61, "grad_norm": 23.789087295532227, "learning_rate": 7.25778244141308e-08, "logps/chosen": -59.16730499267578, "logps/rejected": -89.41313171386719, "loss": 0.1899, "losses/dpo": 0.07216180115938187, "losses/sft": 1.716038703918457, "losses/total": 0.07216180115938187, "ref_logps/chosen": -39.268524169921875, "ref_logps/rejected": -44.30385971069336, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9898784160614014, "rewards/margins": 2.5210492610931396, "rewards/rejected": -4.510927677154541, "step": 2762 }, { "epoch": 2.61, "grad_norm": 20.332326889038086, "learning_rate": 7.240293809024135e-08, "logps/chosen": -58.525840759277344, "logps/rejected": -90.68019104003906, "loss": 0.205, "losses/dpo": 0.11882972717285156, "losses/sft": 2.0520684719085693, "losses/total": 0.11882972717285156, "ref_logps/chosen": -41.305606842041016, "ref_logps/rejected": -46.05442428588867, "rewards/accuracies": 1.0, "rewards/chosen": -1.7220234870910645, "rewards/margins": 2.740553140640259, "rewards/rejected": -4.462576866149902, "step": 2763 }, { "epoch": 2.61, "grad_norm": 25.217487335205078, "learning_rate": 7.222805176635187e-08, "logps/chosen": -40.85335922241211, "logps/rejected": -67.79035949707031, "loss": 0.2726, "losses/dpo": 0.32088738679885864, "losses/sft": 1.9489741325378418, "losses/total": 0.32088738679885864, "ref_logps/chosen": -26.03641128540039, "ref_logps/rejected": -32.96977615356445, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4816948175430298, "rewards/margins": 2.000363349914551, "rewards/rejected": -3.482058525085449, "step": 2764 }, { "epoch": 2.61, "grad_norm": 11.892221450805664, "learning_rate": 7.20531654424624e-08, "logps/chosen": -58.42539978027344, "logps/rejected": -97.72834777832031, "loss": 0.0853, "losses/dpo": 0.04946961626410484, "losses/sft": 2.365954637527466, "losses/total": 0.04946961626410484, "ref_logps/chosen": -34.785057067871094, "ref_logps/rejected": -45.975440979003906, "rewards/accuracies": 1.0, "rewards/chosen": -2.364034652709961, "rewards/margins": 2.811256170272827, "rewards/rejected": -5.175291061401367, "step": 2765 }, { "epoch": 2.61, "grad_norm": 19.934789657592773, "learning_rate": 7.187827911857292e-08, "logps/chosen": -56.808448791503906, "logps/rejected": -83.65318298339844, "loss": 0.2082, "losses/dpo": 0.024278657510876656, "losses/sft": 2.5071146488189697, "losses/total": 0.024278657510876656, "ref_logps/chosen": -36.50736618041992, "ref_logps/rejected": -40.96238708496094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0301082134246826, "rewards/margins": 2.238971710205078, "rewards/rejected": -4.26908016204834, "step": 2766 }, { "epoch": 2.61, "grad_norm": 16.937761306762695, "learning_rate": 7.170339279468346e-08, "logps/chosen": -68.17940521240234, "logps/rejected": -105.52655029296875, "loss": 0.1581, "losses/dpo": 0.28504469990730286, "losses/sft": 2.2126517295837402, "losses/total": 0.28504469990730286, "ref_logps/chosen": -47.13844680786133, "ref_logps/rejected": -58.97942352294922, "rewards/accuracies": 1.0, "rewards/chosen": -2.104095935821533, "rewards/margins": 2.550617218017578, "rewards/rejected": -4.6547136306762695, "step": 2767 }, { "epoch": 2.61, "grad_norm": 31.370891571044922, "learning_rate": 7.152850647079397e-08, "logps/chosen": -42.02412033081055, "logps/rejected": -80.32880401611328, "loss": 0.2981, "losses/dpo": 0.09788006544113159, "losses/sft": 1.3362228870391846, "losses/total": 0.09788006544113159, "ref_logps/chosen": -26.39270782470703, "ref_logps/rejected": -42.535484313964844, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5631415843963623, "rewards/margins": 2.216191053390503, "rewards/rejected": -3.779332160949707, "step": 2768 }, { "epoch": 2.61, "grad_norm": 22.922773361206055, "learning_rate": 7.13536201469045e-08, "logps/chosen": -47.84135818481445, "logps/rejected": -77.00415802001953, "loss": 0.2842, "losses/dpo": 0.09399730712175369, "losses/sft": 1.5567306280136108, "losses/total": 0.09399730712175369, "ref_logps/chosen": -30.130083084106445, "ref_logps/rejected": -42.87702178955078, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7711275815963745, "rewards/margins": 1.6415858268737793, "rewards/rejected": -3.4127135276794434, "step": 2769 }, { "epoch": 2.62, "grad_norm": 16.778785705566406, "learning_rate": 7.117873382301504e-08, "logps/chosen": -46.029396057128906, "logps/rejected": -102.69303894042969, "loss": 0.1253, "losses/dpo": 0.048510029911994934, "losses/sft": 1.8755203485488892, "losses/total": 0.048510029911994934, "ref_logps/chosen": -25.625782012939453, "ref_logps/rejected": -48.27287292480469, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0403614044189453, "rewards/margins": 3.401655912399292, "rewards/rejected": -5.442017555236816, "step": 2770 }, { "epoch": 2.62, "grad_norm": 21.509645462036133, "learning_rate": 7.100384749912557e-08, "logps/chosen": -51.85521697998047, "logps/rejected": -81.54478454589844, "loss": 0.2322, "losses/dpo": 0.18543429672718048, "losses/sft": 1.5244815349578857, "losses/total": 0.18543429672718048, "ref_logps/chosen": -31.410240173339844, "ref_logps/rejected": -40.685272216796875, "rewards/accuracies": 0.875, "rewards/chosen": -2.044497489929199, "rewards/margins": 2.0414538383483887, "rewards/rejected": -4.085951328277588, "step": 2771 }, { "epoch": 2.62, "grad_norm": 18.010059356689453, "learning_rate": 7.082896117523609e-08, "logps/chosen": -51.89374542236328, "logps/rejected": -82.58511352539062, "loss": 0.2133, "losses/dpo": 0.39312419295310974, "losses/sft": 2.261950731277466, "losses/total": 0.39312419295310974, "ref_logps/chosen": -33.64170837402344, "ref_logps/rejected": -37.86516571044922, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8252036571502686, "rewards/margins": 2.6467905044555664, "rewards/rejected": -4.471994400024414, "step": 2772 }, { "epoch": 2.62, "grad_norm": 17.342485427856445, "learning_rate": 7.065407485134663e-08, "logps/chosen": -71.14315795898438, "logps/rejected": -91.72359466552734, "loss": 0.1973, "losses/dpo": 0.16242167353630066, "losses/sft": 2.4318346977233887, "losses/total": 0.16242167353630066, "ref_logps/chosen": -44.687644958496094, "ref_logps/rejected": -45.36669158935547, "rewards/accuracies": 0.9375, "rewards/chosen": -2.645550489425659, "rewards/margins": 1.9901399612426758, "rewards/rejected": -4.635690689086914, "step": 2773 }, { "epoch": 2.62, "grad_norm": 31.10148048400879, "learning_rate": 7.047918852745716e-08, "logps/chosen": -63.20632553100586, "logps/rejected": -75.41357421875, "loss": 0.3465, "losses/dpo": 0.5767205953598022, "losses/sft": 2.44036602973938, "losses/total": 0.5767205953598022, "ref_logps/chosen": -39.650474548339844, "ref_logps/rejected": -41.25980758666992, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3555850982666016, "rewards/margins": 1.0597914457321167, "rewards/rejected": -3.415376663208008, "step": 2774 }, { "epoch": 2.62, "grad_norm": 21.61287498474121, "learning_rate": 7.030430220356767e-08, "logps/chosen": -54.62089157104492, "logps/rejected": -89.15480041503906, "loss": 0.26, "losses/dpo": 0.5773274898529053, "losses/sft": 1.0455708503723145, "losses/total": 0.5773274898529053, "ref_logps/chosen": -37.89388656616211, "ref_logps/rejected": -48.036537170410156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6727001667022705, "rewards/margins": 2.43912672996521, "rewards/rejected": -4.1118268966674805, "step": 2775 }, { "epoch": 2.62, "grad_norm": 12.734532356262207, "learning_rate": 7.012941587967821e-08, "logps/chosen": -50.53339385986328, "logps/rejected": -81.3243179321289, "loss": 0.1239, "losses/dpo": 0.09055262804031372, "losses/sft": 1.4335606098175049, "losses/total": 0.09055262804031372, "ref_logps/chosen": -33.74474334716797, "ref_logps/rejected": -37.206478118896484, "rewards/accuracies": 1.0, "rewards/chosen": -1.6788654327392578, "rewards/margins": 2.7329187393188477, "rewards/rejected": -4.4117841720581055, "step": 2776 }, { "epoch": 2.62, "grad_norm": 12.526095390319824, "learning_rate": 6.995452955578873e-08, "logps/chosen": -54.03600311279297, "logps/rejected": -92.27059936523438, "loss": 0.1018, "losses/dpo": 0.1050320714712143, "losses/sft": 2.236359119415283, "losses/total": 0.1050320714712143, "ref_logps/chosen": -37.53479766845703, "ref_logps/rejected": -49.787750244140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.650120496749878, "rewards/margins": 2.5981643199920654, "rewards/rejected": -4.248284816741943, "step": 2777 }, { "epoch": 2.62, "grad_norm": 27.205617904663086, "learning_rate": 6.977964323189926e-08, "logps/chosen": -68.04354858398438, "logps/rejected": -85.22676086425781, "loss": 0.2743, "losses/dpo": 0.23525092005729675, "losses/sft": 2.535377264022827, "losses/total": 0.23525092005729675, "ref_logps/chosen": -43.5046501159668, "ref_logps/rejected": -41.47509765625, "rewards/accuracies": 0.875, "rewards/chosen": -2.453890562057495, "rewards/margins": 1.9212758541107178, "rewards/rejected": -4.375166416168213, "step": 2778 }, { "epoch": 2.62, "grad_norm": 26.89873504638672, "learning_rate": 6.96047569080098e-08, "logps/chosen": -58.988155364990234, "logps/rejected": -82.1798095703125, "loss": 0.2435, "losses/dpo": 0.3528088629245758, "losses/sft": 2.489513635635376, "losses/total": 0.3528088629245758, "ref_logps/chosen": -37.62434768676758, "ref_logps/rejected": -40.86037826538086, "rewards/accuracies": 0.9375, "rewards/chosen": -2.136380910873413, "rewards/margins": 1.9955627918243408, "rewards/rejected": -4.131943702697754, "step": 2779 }, { "epoch": 2.63, "grad_norm": 14.128700256347656, "learning_rate": 6.942987058412032e-08, "logps/chosen": -61.918033599853516, "logps/rejected": -90.81793212890625, "loss": 0.1436, "losses/dpo": 0.19881035387516022, "losses/sft": 1.9981014728546143, "losses/total": 0.19881035387516022, "ref_logps/chosen": -43.85242462158203, "ref_logps/rejected": -47.815494537353516, "rewards/accuracies": 1.0, "rewards/chosen": -1.8065611124038696, "rewards/margins": 2.493682861328125, "rewards/rejected": -4.300244331359863, "step": 2780 }, { "epoch": 2.63, "grad_norm": 17.454092025756836, "learning_rate": 6.925498426023085e-08, "logps/chosen": -65.04185485839844, "logps/rejected": -89.77005767822266, "loss": 0.1662, "losses/dpo": 0.16697481274604797, "losses/sft": 1.6025989055633545, "losses/total": 0.16697481274604797, "ref_logps/chosen": -45.2525634765625, "ref_logps/rejected": -48.22856140136719, "rewards/accuracies": 1.0, "rewards/chosen": -1.9789294004440308, "rewards/margins": 2.1752207279205322, "rewards/rejected": -4.154150009155273, "step": 2781 }, { "epoch": 2.63, "grad_norm": 23.606170654296875, "learning_rate": 6.908009793634136e-08, "logps/chosen": -51.40668869018555, "logps/rejected": -77.44808959960938, "loss": 0.2499, "losses/dpo": 0.06719326227903366, "losses/sft": 2.4284608364105225, "losses/total": 0.06719326227903366, "ref_logps/chosen": -30.66219139099121, "ref_logps/rejected": -32.262874603271484, "rewards/accuracies": 0.875, "rewards/chosen": -2.0744497776031494, "rewards/margins": 2.4440712928771973, "rewards/rejected": -4.518521308898926, "step": 2782 }, { "epoch": 2.63, "grad_norm": 35.55862045288086, "learning_rate": 6.89052116124519e-08, "logps/chosen": -57.03900146484375, "logps/rejected": -86.1204833984375, "loss": 0.453, "losses/dpo": 0.8028914928436279, "losses/sft": 2.2781355381011963, "losses/total": 0.8028914928436279, "ref_logps/chosen": -31.307281494140625, "ref_logps/rejected": -47.18540573120117, "rewards/accuracies": 0.75, "rewards/chosen": -2.573172092437744, "rewards/margins": 1.3203353881835938, "rewards/rejected": -3.893507719039917, "step": 2783 }, { "epoch": 2.63, "grad_norm": 25.089380264282227, "learning_rate": 6.873032528856243e-08, "logps/chosen": -49.195579528808594, "logps/rejected": -76.71953582763672, "loss": 0.2077, "losses/dpo": 0.31144824624061584, "losses/sft": 2.3947620391845703, "losses/total": 0.31144824624061584, "ref_logps/chosen": -29.806447982788086, "ref_logps/rejected": -33.67094039916992, "rewards/accuracies": 0.9375, "rewards/chosen": -1.938913106918335, "rewards/margins": 2.3659462928771973, "rewards/rejected": -4.304859161376953, "step": 2784 }, { "epoch": 2.63, "grad_norm": 24.925973892211914, "learning_rate": 6.855543896467295e-08, "logps/chosen": -62.95465850830078, "logps/rejected": -75.41535949707031, "loss": 0.2814, "losses/dpo": 0.2505493760108948, "losses/sft": 1.8324346542358398, "losses/total": 0.2505493760108948, "ref_logps/chosen": -43.36646270751953, "ref_logps/rejected": -35.1422233581543, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9588196277618408, "rewards/margins": 2.0684938430786133, "rewards/rejected": -4.027313232421875, "step": 2785 }, { "epoch": 2.63, "grad_norm": 36.95057678222656, "learning_rate": 6.838055264078349e-08, "logps/chosen": -59.67194747924805, "logps/rejected": -85.59119415283203, "loss": 0.2985, "losses/dpo": 0.39724084734916687, "losses/sft": 2.0060245990753174, "losses/total": 0.39724084734916687, "ref_logps/chosen": -38.386375427246094, "ref_logps/rejected": -44.242923736572266, "rewards/accuracies": 0.875, "rewards/chosen": -2.128556728363037, "rewards/margins": 2.006269931793213, "rewards/rejected": -4.13482666015625, "step": 2786 }, { "epoch": 2.63, "grad_norm": 11.359419822692871, "learning_rate": 6.820566631689402e-08, "logps/chosen": -49.66535186767578, "logps/rejected": -87.66726684570312, "loss": 0.1295, "losses/dpo": 0.1356457620859146, "losses/sft": 1.7585225105285645, "losses/total": 0.1356457620859146, "ref_logps/chosen": -31.357789993286133, "ref_logps/rejected": -42.3443603515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.830756425857544, "rewards/margins": 2.7015347480773926, "rewards/rejected": -4.532291412353516, "step": 2787 }, { "epoch": 2.63, "grad_norm": 17.60601234436035, "learning_rate": 6.803077999300455e-08, "logps/chosen": -55.6259765625, "logps/rejected": -82.13127136230469, "loss": 0.2109, "losses/dpo": 0.14678038656711578, "losses/sft": 1.7931146621704102, "losses/total": 0.14678038656711578, "ref_logps/chosen": -37.12821578979492, "ref_logps/rejected": -44.43461227416992, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8497759103775024, "rewards/margins": 1.9198906421661377, "rewards/rejected": -3.7696664333343506, "step": 2788 }, { "epoch": 2.63, "grad_norm": 23.402408599853516, "learning_rate": 6.785589366911507e-08, "logps/chosen": -53.22080993652344, "logps/rejected": -80.79360961914062, "loss": 0.2426, "losses/dpo": 0.07986205071210861, "losses/sft": 1.734291911125183, "losses/total": 0.07986205071210861, "ref_logps/chosen": -31.365550994873047, "ref_logps/rejected": -39.95213317871094, "rewards/accuracies": 0.875, "rewards/chosen": -2.185525894165039, "rewards/margins": 1.898621916770935, "rewards/rejected": -4.084147930145264, "step": 2789 }, { "epoch": 2.63, "grad_norm": 18.517105102539062, "learning_rate": 6.76810073452256e-08, "logps/chosen": -63.89511489868164, "logps/rejected": -76.96041870117188, "loss": 0.1742, "losses/dpo": 0.12401974201202393, "losses/sft": 2.07351016998291, "losses/total": 0.12401974201202393, "ref_logps/chosen": -43.70576858520508, "ref_logps/rejected": -34.34823226928711, "rewards/accuracies": 1.0, "rewards/chosen": -2.018934726715088, "rewards/margins": 2.242283821105957, "rewards/rejected": -4.261219024658203, "step": 2790 }, { "epoch": 2.64, "grad_norm": 28.592937469482422, "learning_rate": 6.750612102133612e-08, "logps/chosen": -72.5536117553711, "logps/rejected": -108.85317993164062, "loss": 0.3279, "losses/dpo": 0.13034898042678833, "losses/sft": 2.3115530014038086, "losses/total": 0.13034898042678833, "ref_logps/chosen": -46.214637756347656, "ref_logps/rejected": -54.15791320800781, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6338977813720703, "rewards/margins": 2.8356289863586426, "rewards/rejected": -5.469526290893555, "step": 2791 }, { "epoch": 2.64, "grad_norm": 16.924577713012695, "learning_rate": 6.733123469744666e-08, "logps/chosen": -65.43248748779297, "logps/rejected": -96.6303939819336, "loss": 0.1932, "losses/dpo": 0.2527864873409271, "losses/sft": 1.8339967727661133, "losses/total": 0.2527864873409271, "ref_logps/chosen": -42.82279586791992, "ref_logps/rejected": -53.38432312011719, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2609691619873047, "rewards/margins": 2.063638210296631, "rewards/rejected": -4.3246073722839355, "step": 2792 }, { "epoch": 2.64, "grad_norm": 34.77162170410156, "learning_rate": 6.715634837355719e-08, "logps/chosen": -60.72416687011719, "logps/rejected": -77.06990814208984, "loss": 0.3762, "losses/dpo": 0.4854225516319275, "losses/sft": 2.3445606231689453, "losses/total": 0.4854225516319275, "ref_logps/chosen": -37.12005615234375, "ref_logps/rejected": -36.52295684814453, "rewards/accuracies": 0.875, "rewards/chosen": -2.3604109287261963, "rewards/margins": 1.6942837238311768, "rewards/rejected": -4.054694652557373, "step": 2793 }, { "epoch": 2.64, "grad_norm": 30.284433364868164, "learning_rate": 6.698146204966771e-08, "logps/chosen": -72.28326416015625, "logps/rejected": -96.45884704589844, "loss": 0.2627, "losses/dpo": 0.37639346718788147, "losses/sft": 1.7413197755813599, "losses/total": 0.37639346718788147, "ref_logps/chosen": -47.78852844238281, "ref_logps/rejected": -47.57475662231445, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4494733810424805, "rewards/margins": 2.4389357566833496, "rewards/rejected": -4.88840913772583, "step": 2794 }, { "epoch": 2.64, "grad_norm": 21.282243728637695, "learning_rate": 6.680657572577825e-08, "logps/chosen": -54.01810073852539, "logps/rejected": -71.20744323730469, "loss": 0.2563, "losses/dpo": 0.33943137526512146, "losses/sft": 2.083404064178467, "losses/total": 0.33943137526512146, "ref_logps/chosen": -36.095115661621094, "ref_logps/rejected": -33.575401306152344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7922985553741455, "rewards/margins": 1.9709055423736572, "rewards/rejected": -3.7632040977478027, "step": 2795 }, { "epoch": 2.64, "grad_norm": 31.79059600830078, "learning_rate": 6.663168940188877e-08, "logps/chosen": -60.04279708862305, "logps/rejected": -81.55189514160156, "loss": 0.3565, "losses/dpo": 0.6937614679336548, "losses/sft": 1.5668808221817017, "losses/total": 0.6937614679336548, "ref_logps/chosen": -37.2183952331543, "ref_logps/rejected": -44.807838439941406, "rewards/accuracies": 0.8125, "rewards/chosen": -2.282440423965454, "rewards/margins": 1.3919651508331299, "rewards/rejected": -3.674405574798584, "step": 2796 }, { "epoch": 2.64, "grad_norm": 20.556947708129883, "learning_rate": 6.645680307799929e-08, "logps/chosen": -60.282859802246094, "logps/rejected": -89.69305419921875, "loss": 0.2597, "losses/dpo": 0.2210417240858078, "losses/sft": 2.085726499557495, "losses/total": 0.2210417240858078, "ref_logps/chosen": -37.647403717041016, "ref_logps/rejected": -46.58287811279297, "rewards/accuracies": 0.9375, "rewards/chosen": -2.263545513153076, "rewards/margins": 2.047470808029175, "rewards/rejected": -4.31101655960083, "step": 2797 }, { "epoch": 2.64, "grad_norm": 23.896883010864258, "learning_rate": 6.628191675410982e-08, "logps/chosen": -53.391578674316406, "logps/rejected": -82.85118865966797, "loss": 0.1982, "losses/dpo": 0.3769487738609314, "losses/sft": 1.865683913230896, "losses/total": 0.3769487738609314, "ref_logps/chosen": -32.8033561706543, "ref_logps/rejected": -36.25730895996094, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0588221549987793, "rewards/margins": 2.6005661487579346, "rewards/rejected": -4.659388542175293, "step": 2798 }, { "epoch": 2.64, "grad_norm": 20.717090606689453, "learning_rate": 6.610703043022036e-08, "logps/chosen": -44.62938690185547, "logps/rejected": -86.13666534423828, "loss": 0.198, "losses/dpo": 0.32698315382003784, "losses/sft": 1.6401935815811157, "losses/total": 0.32698315382003784, "ref_logps/chosen": -29.89868927001953, "ref_logps/rejected": -41.154136657714844, "rewards/accuracies": 0.875, "rewards/chosen": -1.473069429397583, "rewards/margins": 3.02518367767334, "rewards/rejected": -4.498252868652344, "step": 2799 }, { "epoch": 2.64, "grad_norm": 31.287242889404297, "learning_rate": 6.593214410633088e-08, "logps/chosen": -66.99874877929688, "logps/rejected": -83.04682159423828, "loss": 0.4646, "losses/dpo": 0.29867488145828247, "losses/sft": 2.061471462249756, "losses/total": 0.29867488145828247, "ref_logps/chosen": -41.9615364074707, "ref_logps/rejected": -43.84099578857422, "rewards/accuracies": 0.875, "rewards/chosen": -2.503721237182617, "rewards/margins": 1.4168610572814941, "rewards/rejected": -3.9205825328826904, "step": 2800 }, { "epoch": 2.64, "grad_norm": 20.528745651245117, "learning_rate": 6.575725778244141e-08, "logps/chosen": -61.36975860595703, "logps/rejected": -82.8834457397461, "loss": 0.1926, "losses/dpo": 0.13481679558753967, "losses/sft": 2.364569902420044, "losses/total": 0.13481679558753967, "ref_logps/chosen": -38.4459114074707, "ref_logps/rejected": -39.14704132080078, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2923851013183594, "rewards/margins": 2.081256151199341, "rewards/rejected": -4.373641014099121, "step": 2801 }, { "epoch": 2.65, "grad_norm": 30.4796085357666, "learning_rate": 6.558237145855195e-08, "logps/chosen": -70.28099060058594, "logps/rejected": -87.71537017822266, "loss": 0.3285, "losses/dpo": 0.19485768675804138, "losses/sft": 2.2904770374298096, "losses/total": 0.19485768675804138, "ref_logps/chosen": -47.85303497314453, "ref_logps/rejected": -45.99406051635742, "rewards/accuracies": 0.75, "rewards/chosen": -2.242794990539551, "rewards/margins": 1.929335594177246, "rewards/rejected": -4.172130584716797, "step": 2802 }, { "epoch": 2.65, "grad_norm": 22.948678970336914, "learning_rate": 6.540748513466246e-08, "logps/chosen": -66.8612060546875, "logps/rejected": -85.68760681152344, "loss": 0.274, "losses/dpo": 0.12161294370889664, "losses/sft": 2.5325162410736084, "losses/total": 0.12161294370889664, "ref_logps/chosen": -39.060401916503906, "ref_logps/rejected": -38.514678955078125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.780081033706665, "rewards/margins": 1.9372124671936035, "rewards/rejected": -4.717293739318848, "step": 2803 }, { "epoch": 2.65, "grad_norm": 29.647390365600586, "learning_rate": 6.523259881077299e-08, "logps/chosen": -45.707313537597656, "logps/rejected": -62.48001480102539, "loss": 0.4069, "losses/dpo": 0.22883006930351257, "losses/sft": 1.4940004348754883, "losses/total": 0.22883006930351257, "ref_logps/chosen": -30.91168975830078, "ref_logps/rejected": -33.89064025878906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4795620441436768, "rewards/margins": 1.37937593460083, "rewards/rejected": -2.858937978744507, "step": 2804 }, { "epoch": 2.65, "grad_norm": 13.591337203979492, "learning_rate": 6.505771248688352e-08, "logps/chosen": -56.497833251953125, "logps/rejected": -83.28926849365234, "loss": 0.1288, "losses/dpo": 0.09809619188308716, "losses/sft": 1.549083948135376, "losses/total": 0.09809619188308716, "ref_logps/chosen": -41.87842559814453, "ref_logps/rejected": -43.66329574584961, "rewards/accuracies": 1.0, "rewards/chosen": -1.4619405269622803, "rewards/margins": 2.5006566047668457, "rewards/rejected": -3.962597131729126, "step": 2805 }, { "epoch": 2.65, "grad_norm": 17.058685302734375, "learning_rate": 6.488282616299405e-08, "logps/chosen": -57.05895233154297, "logps/rejected": -105.78681182861328, "loss": 0.1118, "losses/dpo": 0.08921144902706146, "losses/sft": 2.2917540073394775, "losses/total": 0.08921144902706146, "ref_logps/chosen": -36.44682312011719, "ref_logps/rejected": -56.53832244873047, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0612125396728516, "rewards/margins": 2.863636016845703, "rewards/rejected": -4.924848556518555, "step": 2806 }, { "epoch": 2.65, "grad_norm": 24.081478118896484, "learning_rate": 6.470793983910458e-08, "logps/chosen": -67.83106994628906, "logps/rejected": -102.46758270263672, "loss": 0.2318, "losses/dpo": 0.38107314705848694, "losses/sft": 2.170287847518921, "losses/total": 0.38107314705848694, "ref_logps/chosen": -43.02593994140625, "ref_logps/rejected": -57.54987335205078, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4805126190185547, "rewards/margins": 2.0112593173980713, "rewards/rejected": -4.491771697998047, "step": 2807 }, { "epoch": 2.65, "grad_norm": 41.07267761230469, "learning_rate": 6.453305351521512e-08, "logps/chosen": -56.034263610839844, "logps/rejected": -74.24307250976562, "loss": 0.422, "losses/dpo": 0.7111890912055969, "losses/sft": 2.3105666637420654, "losses/total": 0.7111890912055969, "ref_logps/chosen": -36.072364807128906, "ref_logps/rejected": -38.674285888671875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9961903095245361, "rewards/margins": 1.5606892108917236, "rewards/rejected": -3.5568795204162598, "step": 2808 }, { "epoch": 2.65, "grad_norm": 18.853776931762695, "learning_rate": 6.435816719132564e-08, "logps/chosen": -61.940704345703125, "logps/rejected": -97.80964660644531, "loss": 0.1706, "losses/dpo": 0.09961122274398804, "losses/sft": 2.048339605331421, "losses/total": 0.09961122274398804, "ref_logps/chosen": -40.614463806152344, "ref_logps/rejected": -55.26185989379883, "rewards/accuracies": 1.0, "rewards/chosen": -2.1326239109039307, "rewards/margins": 2.122154712677002, "rewards/rejected": -4.254778861999512, "step": 2809 }, { "epoch": 2.65, "grad_norm": 18.57338523864746, "learning_rate": 6.418328086743615e-08, "logps/chosen": -58.30417251586914, "logps/rejected": -106.75396728515625, "loss": 0.1749, "losses/dpo": 0.11486898362636566, "losses/sft": 2.597902536392212, "losses/total": 0.11486898362636566, "ref_logps/chosen": -38.71113586425781, "ref_logps/rejected": -59.33735656738281, "rewards/accuracies": 0.9375, "rewards/chosen": -1.959303617477417, "rewards/margins": 2.782358407974243, "rewards/rejected": -4.74166202545166, "step": 2810 }, { "epoch": 2.65, "grad_norm": 17.233592987060547, "learning_rate": 6.400839454354669e-08, "logps/chosen": -55.027259826660156, "logps/rejected": -78.33998107910156, "loss": 0.1425, "losses/dpo": 0.08399466425180435, "losses/sft": 1.9579206705093384, "losses/total": 0.08399466425180435, "ref_logps/chosen": -37.600929260253906, "ref_logps/rejected": -34.71483612060547, "rewards/accuracies": 1.0, "rewards/chosen": -1.7426331043243408, "rewards/margins": 2.6198816299438477, "rewards/rejected": -4.362514495849609, "step": 2811 }, { "epoch": 2.66, "grad_norm": 26.96534538269043, "learning_rate": 6.383350821965722e-08, "logps/chosen": -54.305416107177734, "logps/rejected": -74.97158813476562, "loss": 0.305, "losses/dpo": 0.39059174060821533, "losses/sft": 1.5081398487091064, "losses/total": 0.39059174060821533, "ref_logps/chosen": -31.655078887939453, "ref_logps/rejected": -34.95521926879883, "rewards/accuracies": 0.875, "rewards/chosen": -2.265033721923828, "rewards/margins": 1.7366034984588623, "rewards/rejected": -4.001636981964111, "step": 2812 }, { "epoch": 2.66, "grad_norm": 16.062406539916992, "learning_rate": 6.365862189576775e-08, "logps/chosen": -57.12964630126953, "logps/rejected": -101.63590240478516, "loss": 0.0998, "losses/dpo": 0.059631701558828354, "losses/sft": 2.075115442276001, "losses/total": 0.059631701558828354, "ref_logps/chosen": -36.83037567138672, "ref_logps/rejected": -46.61079788208008, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0299272537231445, "rewards/margins": 3.4725828170776367, "rewards/rejected": -5.502510070800781, "step": 2813 }, { "epoch": 2.66, "grad_norm": 12.354023933410645, "learning_rate": 6.348373557187827e-08, "logps/chosen": -65.61959838867188, "logps/rejected": -94.36149597167969, "loss": 0.1151, "losses/dpo": 0.20401525497436523, "losses/sft": 2.2358593940734863, "losses/total": 0.20401525497436523, "ref_logps/chosen": -46.7528076171875, "ref_logps/rejected": -49.06699752807617, "rewards/accuracies": 1.0, "rewards/chosen": -1.88667893409729, "rewards/margins": 2.642770767211914, "rewards/rejected": -4.529449462890625, "step": 2814 }, { "epoch": 2.66, "grad_norm": 34.59256362915039, "learning_rate": 6.330884924798881e-08, "logps/chosen": -61.12478256225586, "logps/rejected": -85.32469177246094, "loss": 0.2882, "losses/dpo": 0.08062613755464554, "losses/sft": 2.407926321029663, "losses/total": 0.08062613755464554, "ref_logps/chosen": -40.063072204589844, "ref_logps/rejected": -45.3946533203125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.106171131134033, "rewards/margins": 1.8868328332901, "rewards/rejected": -3.993004083633423, "step": 2815 }, { "epoch": 2.66, "grad_norm": 14.632702827453613, "learning_rate": 6.313396292409934e-08, "logps/chosen": -53.33955001831055, "logps/rejected": -83.21797180175781, "loss": 0.1651, "losses/dpo": 0.2551000118255615, "losses/sft": 2.0775253772735596, "losses/total": 0.2551000118255615, "ref_logps/chosen": -37.003536224365234, "ref_logps/rejected": -46.398101806640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6336013078689575, "rewards/margins": 2.0483856201171875, "rewards/rejected": -3.6819865703582764, "step": 2816 }, { "epoch": 2.66, "grad_norm": 21.115245819091797, "learning_rate": 6.295907660020985e-08, "logps/chosen": -54.940757751464844, "logps/rejected": -87.11189270019531, "loss": 0.2, "losses/dpo": 0.08007544279098511, "losses/sft": 2.2354846000671387, "losses/total": 0.08007544279098511, "ref_logps/chosen": -33.57972717285156, "ref_logps/rejected": -42.48321533203125, "rewards/accuracies": 1.0, "rewards/chosen": -2.1361031532287598, "rewards/margins": 2.326765537261963, "rewards/rejected": -4.462868690490723, "step": 2817 }, { "epoch": 2.66, "grad_norm": 24.38103485107422, "learning_rate": 6.278419027632039e-08, "logps/chosen": -53.5403938293457, "logps/rejected": -70.36354064941406, "loss": 0.3236, "losses/dpo": 0.4314962923526764, "losses/sft": 1.524219274520874, "losses/total": 0.4314962923526764, "ref_logps/chosen": -33.54059600830078, "ref_logps/rejected": -35.4715461730957, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9999796152114868, "rewards/margins": 1.489220142364502, "rewards/rejected": -3.489199638366699, "step": 2818 }, { "epoch": 2.66, "grad_norm": 13.823323249816895, "learning_rate": 6.260930395243091e-08, "logps/chosen": -52.77983856201172, "logps/rejected": -78.38325500488281, "loss": 0.1391, "losses/dpo": 0.06568809598684311, "losses/sft": 1.9338798522949219, "losses/total": 0.06568809598684311, "ref_logps/chosen": -32.31357955932617, "ref_logps/rejected": -33.70201873779297, "rewards/accuracies": 1.0, "rewards/chosen": -2.046626091003418, "rewards/margins": 2.4214982986450195, "rewards/rejected": -4.4681243896484375, "step": 2819 }, { "epoch": 2.66, "grad_norm": 26.558530807495117, "learning_rate": 6.243441762854144e-08, "logps/chosen": -47.277793884277344, "logps/rejected": -95.91194152832031, "loss": 0.3083, "losses/dpo": 0.09661128371953964, "losses/sft": 1.964137077331543, "losses/total": 0.09661128371953964, "ref_logps/chosen": -28.412307739257812, "ref_logps/rejected": -50.79615783691406, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8865485191345215, "rewards/margins": 2.6250295639038086, "rewards/rejected": -4.511578559875488, "step": 2820 }, { "epoch": 2.66, "grad_norm": 24.625911712646484, "learning_rate": 6.225953130465198e-08, "logps/chosen": -53.90091323852539, "logps/rejected": -79.53909301757812, "loss": 0.2804, "losses/dpo": 0.41780102252960205, "losses/sft": 1.9195451736450195, "losses/total": 0.41780102252960205, "ref_logps/chosen": -34.44166564941406, "ref_logps/rejected": -38.059112548828125, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9459251165390015, "rewards/margins": 2.202073574066162, "rewards/rejected": -4.147998809814453, "step": 2821 }, { "epoch": 2.66, "grad_norm": 45.21463394165039, "learning_rate": 6.20846449807625e-08, "logps/chosen": -64.99295043945312, "logps/rejected": -77.49156188964844, "loss": 0.3923, "losses/dpo": 0.5282461047172546, "losses/sft": 1.621016502380371, "losses/total": 0.5282461047172546, "ref_logps/chosen": -45.96197509765625, "ref_logps/rejected": -41.91083526611328, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9030970335006714, "rewards/margins": 1.6549763679504395, "rewards/rejected": -3.5580732822418213, "step": 2822 }, { "epoch": 2.67, "grad_norm": 33.73280334472656, "learning_rate": 6.190975865687303e-08, "logps/chosen": -68.11634063720703, "logps/rejected": -94.88563537597656, "loss": 0.3487, "losses/dpo": 0.04625094681978226, "losses/sft": 1.466246485710144, "losses/total": 0.04625094681978226, "ref_logps/chosen": -40.46844482421875, "ref_logps/rejected": -49.142311096191406, "rewards/accuracies": 0.8125, "rewards/chosen": -2.76478910446167, "rewards/margins": 1.8095433712005615, "rewards/rejected": -4.5743327140808105, "step": 2823 }, { "epoch": 2.67, "grad_norm": 25.91737174987793, "learning_rate": 6.173487233298356e-08, "logps/chosen": -71.0357437133789, "logps/rejected": -91.23492431640625, "loss": 0.3423, "losses/dpo": 0.03390391170978546, "losses/sft": 1.9826462268829346, "losses/total": 0.03390391170978546, "ref_logps/chosen": -48.21184539794922, "ref_logps/rejected": -46.81634521484375, "rewards/accuracies": 0.875, "rewards/chosen": -2.2823901176452637, "rewards/margins": 2.1594676971435547, "rewards/rejected": -4.44185733795166, "step": 2824 }, { "epoch": 2.67, "grad_norm": 32.87423324584961, "learning_rate": 6.155998600909408e-08, "logps/chosen": -66.14747619628906, "logps/rejected": -84.25035095214844, "loss": 0.3121, "losses/dpo": 0.27095383405685425, "losses/sft": 2.4791924953460693, "losses/total": 0.27095383405685425, "ref_logps/chosen": -40.39013671875, "ref_logps/rejected": -40.88392639160156, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5757346153259277, "rewards/margins": 1.7609083652496338, "rewards/rejected": -4.336643218994141, "step": 2825 }, { "epoch": 2.67, "grad_norm": 22.894607543945312, "learning_rate": 6.138509968520461e-08, "logps/chosen": -59.413307189941406, "logps/rejected": -95.7797622680664, "loss": 0.1864, "losses/dpo": 0.08187133818864822, "losses/sft": 2.186403512954712, "losses/total": 0.08187133818864822, "ref_logps/chosen": -37.663307189941406, "ref_logps/rejected": -46.59320068359375, "rewards/accuracies": 0.875, "rewards/chosen": -2.174999952316284, "rewards/margins": 2.7436561584472656, "rewards/rejected": -4.918656349182129, "step": 2826 }, { "epoch": 2.67, "grad_norm": 15.680797576904297, "learning_rate": 6.121021336131515e-08, "logps/chosen": -55.5383186340332, "logps/rejected": -89.84370422363281, "loss": 0.1492, "losses/dpo": 0.2526828944683075, "losses/sft": 2.1954054832458496, "losses/total": 0.2526828944683075, "ref_logps/chosen": -34.80877685546875, "ref_logps/rejected": -45.156471252441406, "rewards/accuracies": 1.0, "rewards/chosen": -2.0729541778564453, "rewards/margins": 2.3957691192626953, "rewards/rejected": -4.468723297119141, "step": 2827 }, { "epoch": 2.67, "grad_norm": 13.249262809753418, "learning_rate": 6.103532703742567e-08, "logps/chosen": -45.910484313964844, "logps/rejected": -83.5672607421875, "loss": 0.1422, "losses/dpo": 0.2480028122663498, "losses/sft": 1.4911564588546753, "losses/total": 0.2480028122663498, "ref_logps/chosen": -31.42209243774414, "ref_logps/rejected": -43.59312057495117, "rewards/accuracies": 1.0, "rewards/chosen": -1.4488391876220703, "rewards/margins": 2.548574924468994, "rewards/rejected": -3.9974141120910645, "step": 2828 }, { "epoch": 2.67, "grad_norm": 28.451623916625977, "learning_rate": 6.08604407135362e-08, "logps/chosen": -54.65733337402344, "logps/rejected": -69.976806640625, "loss": 0.3127, "losses/dpo": 0.296722948551178, "losses/sft": 1.2962543964385986, "losses/total": 0.296722948551178, "ref_logps/chosen": -36.54468536376953, "ref_logps/rejected": -34.26911163330078, "rewards/accuracies": 0.9375, "rewards/chosen": -1.811265230178833, "rewards/margins": 1.7595045566558838, "rewards/rejected": -3.5707693099975586, "step": 2829 }, { "epoch": 2.67, "grad_norm": 11.739123344421387, "learning_rate": 6.068555438964672e-08, "logps/chosen": -51.673057556152344, "logps/rejected": -89.33514404296875, "loss": 0.1661, "losses/dpo": 0.3497820794582367, "losses/sft": 1.7831809520721436, "losses/total": 0.3497820794582367, "ref_logps/chosen": -33.37078857421875, "ref_logps/rejected": -47.4122314453125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8302268981933594, "rewards/margins": 2.3620643615722656, "rewards/rejected": -4.192291259765625, "step": 2830 }, { "epoch": 2.67, "grad_norm": 24.126773834228516, "learning_rate": 6.051066806575725e-08, "logps/chosen": -49.084659576416016, "logps/rejected": -73.62368774414062, "loss": 0.3012, "losses/dpo": 0.44974058866500854, "losses/sft": 1.5654181241989136, "losses/total": 0.44974058866500854, "ref_logps/chosen": -33.50338363647461, "ref_logps/rejected": -40.60755920410156, "rewards/accuracies": 0.875, "rewards/chosen": -1.5581278800964355, "rewards/margins": 1.7434853315353394, "rewards/rejected": -3.3016133308410645, "step": 2831 }, { "epoch": 2.67, "grad_norm": 31.02278709411621, "learning_rate": 6.033578174186779e-08, "logps/chosen": -59.54981231689453, "logps/rejected": -81.86512756347656, "loss": 0.2806, "losses/dpo": 0.250593900680542, "losses/sft": 2.108398675918579, "losses/total": 0.250593900680542, "ref_logps/chosen": -40.206787109375, "ref_logps/rejected": -40.99497604370117, "rewards/accuracies": 0.875, "rewards/chosen": -1.934302806854248, "rewards/margins": 2.15271258354187, "rewards/rejected": -4.087015628814697, "step": 2832 }, { "epoch": 2.68, "grad_norm": 35.50676345825195, "learning_rate": 6.01608954179783e-08, "logps/chosen": -60.410980224609375, "logps/rejected": -84.25567626953125, "loss": 0.4141, "losses/dpo": 0.9166530966758728, "losses/sft": 2.4444801807403564, "losses/total": 0.9166530966758728, "ref_logps/chosen": -38.02571487426758, "ref_logps/rejected": -43.96631622314453, "rewards/accuracies": 0.875, "rewards/chosen": -2.2385263442993164, "rewards/margins": 1.7904090881347656, "rewards/rejected": -4.028935432434082, "step": 2833 }, { "epoch": 2.68, "grad_norm": 18.488130569458008, "learning_rate": 5.998600909408884e-08, "logps/chosen": -53.45903015136719, "logps/rejected": -96.04625701904297, "loss": 0.1785, "losses/dpo": 0.09850554168224335, "losses/sft": 2.2874128818511963, "losses/total": 0.09850554168224335, "ref_logps/chosen": -36.234947204589844, "ref_logps/rejected": -50.32763671875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7224082946777344, "rewards/margins": 2.849454402923584, "rewards/rejected": -4.571862697601318, "step": 2834 }, { "epoch": 2.68, "grad_norm": 24.639921188354492, "learning_rate": 5.981112277019937e-08, "logps/chosen": -57.561161041259766, "logps/rejected": -96.45553588867188, "loss": 0.2501, "losses/dpo": 0.1356874406337738, "losses/sft": 2.844369888305664, "losses/total": 0.1356874406337738, "ref_logps/chosen": -34.99861145019531, "ref_logps/rejected": -50.56623840332031, "rewards/accuracies": 0.875, "rewards/chosen": -2.2562549114227295, "rewards/margins": 2.332674503326416, "rewards/rejected": -4.588929176330566, "step": 2835 }, { "epoch": 2.68, "grad_norm": 16.97640609741211, "learning_rate": 5.963623644630989e-08, "logps/chosen": -53.053009033203125, "logps/rejected": -96.39279174804688, "loss": 0.1831, "losses/dpo": 0.2519689202308655, "losses/sft": 2.359346866607666, "losses/total": 0.2519689202308655, "ref_logps/chosen": -34.11677932739258, "ref_logps/rejected": -51.430335998535156, "rewards/accuracies": 1.0, "rewards/chosen": -1.8936229944229126, "rewards/margins": 2.6026225090026855, "rewards/rejected": -4.496245384216309, "step": 2836 }, { "epoch": 2.68, "grad_norm": 22.97124481201172, "learning_rate": 5.946135012242042e-08, "logps/chosen": -58.60469055175781, "logps/rejected": -86.85208892822266, "loss": 0.1883, "losses/dpo": 0.19095231592655182, "losses/sft": 2.2699618339538574, "losses/total": 0.19095231592655182, "ref_logps/chosen": -36.17578125, "ref_logps/rejected": -43.6269645690918, "rewards/accuracies": 1.0, "rewards/chosen": -2.2428905963897705, "rewards/margins": 2.0796217918395996, "rewards/rejected": -4.322512626647949, "step": 2837 }, { "epoch": 2.68, "grad_norm": 13.44681453704834, "learning_rate": 5.928646379853095e-08, "logps/chosen": -60.35519790649414, "logps/rejected": -97.99239349365234, "loss": 0.1259, "losses/dpo": 0.1397063136100769, "losses/sft": 1.6836830377578735, "losses/total": 0.1397063136100769, "ref_logps/chosen": -42.25647735595703, "ref_logps/rejected": -51.19499969482422, "rewards/accuracies": 1.0, "rewards/chosen": -1.8098721504211426, "rewards/margins": 2.8698675632476807, "rewards/rejected": -4.679739952087402, "step": 2838 }, { "epoch": 2.68, "grad_norm": 25.163185119628906, "learning_rate": 5.9111577474641484e-08, "logps/chosen": -56.017662048339844, "logps/rejected": -73.30323791503906, "loss": 0.2781, "losses/dpo": 0.09470625221729279, "losses/sft": 1.7585679292678833, "losses/total": 0.09470625221729279, "ref_logps/chosen": -35.404571533203125, "ref_logps/rejected": -35.85308074951172, "rewards/accuracies": 0.875, "rewards/chosen": -2.061309576034546, "rewards/margins": 1.6837064027786255, "rewards/rejected": -3.7450156211853027, "step": 2839 }, { "epoch": 2.68, "grad_norm": 18.755857467651367, "learning_rate": 5.893669115075201e-08, "logps/chosen": -52.348114013671875, "logps/rejected": -86.44564819335938, "loss": 0.2045, "losses/dpo": 0.29966428875923157, "losses/sft": 2.5235798358917236, "losses/total": 0.29966428875923157, "ref_logps/chosen": -37.292572021484375, "ref_logps/rejected": -49.929527282714844, "rewards/accuracies": 1.0, "rewards/chosen": -1.505554437637329, "rewards/margins": 2.1460580825805664, "rewards/rejected": -3.6516122817993164, "step": 2840 }, { "epoch": 2.68, "grad_norm": 15.73888874053955, "learning_rate": 5.8761804826862536e-08, "logps/chosen": -63.68634033203125, "logps/rejected": -88.6485595703125, "loss": 0.1394, "losses/dpo": 0.16902875900268555, "losses/sft": 1.6753729581832886, "losses/total": 0.16902875900268555, "ref_logps/chosen": -45.48716735839844, "ref_logps/rejected": -44.093048095703125, "rewards/accuracies": 1.0, "rewards/chosen": -1.8199174404144287, "rewards/margins": 2.6356332302093506, "rewards/rejected": -4.455550670623779, "step": 2841 }, { "epoch": 2.68, "grad_norm": 19.015249252319336, "learning_rate": 5.858691850297306e-08, "logps/chosen": -60.6126708984375, "logps/rejected": -75.68568420410156, "loss": 0.1691, "losses/dpo": 0.32353365421295166, "losses/sft": 1.8243657350540161, "losses/total": 0.32353365421295166, "ref_logps/chosen": -45.986183166503906, "ref_logps/rejected": -38.06608963012695, "rewards/accuracies": 0.9375, "rewards/chosen": -1.462648630142212, "rewards/margins": 2.2993111610412598, "rewards/rejected": -3.7619595527648926, "step": 2842 }, { "epoch": 2.68, "grad_norm": 26.87492561340332, "learning_rate": 5.8412032179083594e-08, "logps/chosen": -47.727516174316406, "logps/rejected": -100.16867065429688, "loss": 0.2355, "losses/dpo": 0.6074020862579346, "losses/sft": 2.9043121337890625, "losses/total": 0.6074020862579346, "ref_logps/chosen": -31.07196807861328, "ref_logps/rejected": -55.296180725097656, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6655550003051758, "rewards/margins": 2.8216946125030518, "rewards/rejected": -4.487249374389648, "step": 2843 }, { "epoch": 2.69, "grad_norm": 30.815824508666992, "learning_rate": 5.823714585519412e-08, "logps/chosen": -54.60810852050781, "logps/rejected": -84.23872375488281, "loss": 0.3046, "losses/dpo": 0.07891845703125, "losses/sft": 2.2529456615448, "losses/total": 0.07891845703125, "ref_logps/chosen": -33.00050354003906, "ref_logps/rejected": -39.17915725708008, "rewards/accuracies": 0.875, "rewards/chosen": -2.1607604026794434, "rewards/margins": 2.3451969623565674, "rewards/rejected": -4.50595760345459, "step": 2844 }, { "epoch": 2.69, "grad_norm": 26.856121063232422, "learning_rate": 5.8062259531304646e-08, "logps/chosen": -61.337127685546875, "logps/rejected": -88.42852783203125, "loss": 0.3975, "losses/dpo": 0.7157143950462341, "losses/sft": 2.255458354949951, "losses/total": 0.7157143950462341, "ref_logps/chosen": -39.10514831542969, "ref_logps/rejected": -49.1973762512207, "rewards/accuracies": 0.8125, "rewards/chosen": -2.223198413848877, "rewards/margins": 1.6999170780181885, "rewards/rejected": -3.9231152534484863, "step": 2845 }, { "epoch": 2.69, "grad_norm": 12.56474494934082, "learning_rate": 5.788737320741518e-08, "logps/chosen": -43.921138763427734, "logps/rejected": -80.57787322998047, "loss": 0.1118, "losses/dpo": 0.1052473857998848, "losses/sft": 1.0585225820541382, "losses/total": 0.1052473857998848, "ref_logps/chosen": -30.996601104736328, "ref_logps/rejected": -40.49279022216797, "rewards/accuracies": 1.0, "rewards/chosen": -1.2924540042877197, "rewards/margins": 2.7160539627075195, "rewards/rejected": -4.00850772857666, "step": 2846 }, { "epoch": 2.69, "grad_norm": 23.446834564208984, "learning_rate": 5.771248688352571e-08, "logps/chosen": -75.3303451538086, "logps/rejected": -99.12146759033203, "loss": 0.231, "losses/dpo": 0.17752321064472198, "losses/sft": 2.1565511226654053, "losses/total": 0.17752321064472198, "ref_logps/chosen": -47.45655059814453, "ref_logps/rejected": -48.7686767578125, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7873799800872803, "rewards/margins": 2.2478995323181152, "rewards/rejected": -5.035279750823975, "step": 2847 }, { "epoch": 2.69, "grad_norm": 18.20689582824707, "learning_rate": 5.753760055963623e-08, "logps/chosen": -52.07342529296875, "logps/rejected": -78.18859100341797, "loss": 0.1835, "losses/dpo": 0.09994463622570038, "losses/sft": 1.597030520439148, "losses/total": 0.09994463622570038, "ref_logps/chosen": -37.117149353027344, "ref_logps/rejected": -39.53816604614258, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4956278800964355, "rewards/margins": 2.369415283203125, "rewards/rejected": -3.8650431632995605, "step": 2848 }, { "epoch": 2.69, "grad_norm": 12.670719146728516, "learning_rate": 5.736271423574676e-08, "logps/chosen": -38.847774505615234, "logps/rejected": -80.98487854003906, "loss": 0.129, "losses/dpo": 0.19369924068450928, "losses/sft": 1.7729889154434204, "losses/total": 0.19369924068450928, "ref_logps/chosen": -28.20480728149414, "ref_logps/rejected": -43.106536865234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.0642967224121094, "rewards/margins": 2.7235376834869385, "rewards/rejected": -3.787834405899048, "step": 2849 }, { "epoch": 2.69, "grad_norm": 19.287364959716797, "learning_rate": 5.7187827911857295e-08, "logps/chosen": -57.862728118896484, "logps/rejected": -91.24444580078125, "loss": 0.1451, "losses/dpo": 0.10392201691865921, "losses/sft": 1.9286572933197021, "losses/total": 0.10392201691865921, "ref_logps/chosen": -40.45824432373047, "ref_logps/rejected": -46.64256286621094, "rewards/accuracies": 1.0, "rewards/chosen": -1.7404483556747437, "rewards/margins": 2.719740152359009, "rewards/rejected": -4.460188865661621, "step": 2850 }, { "epoch": 2.69, "grad_norm": 18.712642669677734, "learning_rate": 5.7012941587967815e-08, "logps/chosen": -58.4813346862793, "logps/rejected": -97.10137176513672, "loss": 0.1882, "losses/dpo": 0.29807591438293457, "losses/sft": 1.7365291118621826, "losses/total": 0.29807591438293457, "ref_logps/chosen": -39.16337585449219, "ref_logps/rejected": -51.623409271240234, "rewards/accuracies": 0.875, "rewards/chosen": -1.931795597076416, "rewards/margins": 2.616000175476074, "rewards/rejected": -4.547796249389648, "step": 2851 }, { "epoch": 2.69, "grad_norm": 32.193443298339844, "learning_rate": 5.683805526407835e-08, "logps/chosen": -59.583824157714844, "logps/rejected": -75.64508819580078, "loss": 0.3345, "losses/dpo": 0.35838064551353455, "losses/sft": 2.1990456581115723, "losses/total": 0.35838064551353455, "ref_logps/chosen": -39.830238342285156, "ref_logps/rejected": -39.453948974609375, "rewards/accuracies": 0.875, "rewards/chosen": -1.9753586053848267, "rewards/margins": 1.6437549591064453, "rewards/rejected": -3.6191134452819824, "step": 2852 }, { "epoch": 2.69, "grad_norm": 21.557411193847656, "learning_rate": 5.666316894018887e-08, "logps/chosen": -52.954158782958984, "logps/rejected": -79.87135314941406, "loss": 0.2439, "losses/dpo": 0.28020960092544556, "losses/sft": 2.0722146034240723, "losses/total": 0.28020960092544556, "ref_logps/chosen": -33.953643798828125, "ref_logps/rejected": -40.866371154785156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9000515937805176, "rewards/margins": 2.0004472732543945, "rewards/rejected": -3.900498867034912, "step": 2853 }, { "epoch": 2.69, "grad_norm": 22.58413314819336, "learning_rate": 5.6488282616299405e-08, "logps/chosen": -57.0494384765625, "logps/rejected": -80.87075805664062, "loss": 0.2772, "losses/dpo": 0.06300322711467743, "losses/sft": 2.2173423767089844, "losses/total": 0.06300322711467743, "ref_logps/chosen": -32.17005920410156, "ref_logps/rejected": -34.647682189941406, "rewards/accuracies": 0.8125, "rewards/chosen": -2.487938404083252, "rewards/margins": 2.134368896484375, "rewards/rejected": -4.622306823730469, "step": 2854 }, { "epoch": 2.7, "grad_norm": 17.20001792907715, "learning_rate": 5.631339629240993e-08, "logps/chosen": -68.9234390258789, "logps/rejected": -94.60698699951172, "loss": 0.1944, "losses/dpo": 0.11728018522262573, "losses/sft": 2.2426631450653076, "losses/total": 0.11728018522262573, "ref_logps/chosen": -46.50959014892578, "ref_logps/rejected": -49.548004150390625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.241384983062744, "rewards/margins": 2.2645134925842285, "rewards/rejected": -4.505898475646973, "step": 2855 }, { "epoch": 2.7, "grad_norm": 23.192480087280273, "learning_rate": 5.613850996852046e-08, "logps/chosen": -54.01043701171875, "logps/rejected": -75.58695983886719, "loss": 0.2488, "losses/dpo": 0.3014264404773712, "losses/sft": 1.4761443138122559, "losses/total": 0.3014264404773712, "ref_logps/chosen": -36.561363220214844, "ref_logps/rejected": -37.017738342285156, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7449076175689697, "rewards/margins": 2.1120147705078125, "rewards/rejected": -3.856922149658203, "step": 2856 }, { "epoch": 2.7, "grad_norm": 19.56039047241211, "learning_rate": 5.596362364463099e-08, "logps/chosen": -49.30674743652344, "logps/rejected": -83.095703125, "loss": 0.1686, "losses/dpo": 0.3388398289680481, "losses/sft": 1.8534343242645264, "losses/total": 0.3388398289680481, "ref_logps/chosen": -35.508235931396484, "ref_logps/rejected": -44.92726135253906, "rewards/accuracies": 1.0, "rewards/chosen": -1.3798513412475586, "rewards/margins": 2.4369935989379883, "rewards/rejected": -3.816844940185547, "step": 2857 }, { "epoch": 2.7, "grad_norm": 21.908437728881836, "learning_rate": 5.578873732074151e-08, "logps/chosen": -60.04517364501953, "logps/rejected": -78.47991943359375, "loss": 0.3128, "losses/dpo": 0.4070974588394165, "losses/sft": 1.322322964668274, "losses/total": 0.4070974588394165, "ref_logps/chosen": -39.38600158691406, "ref_logps/rejected": -40.62982177734375, "rewards/accuracies": 0.8125, "rewards/chosen": -2.065917491912842, "rewards/margins": 1.7190923690795898, "rewards/rejected": -3.7850098609924316, "step": 2858 }, { "epoch": 2.7, "grad_norm": 17.502845764160156, "learning_rate": 5.561385099685204e-08, "logps/chosen": -62.451568603515625, "logps/rejected": -91.19190216064453, "loss": 0.1822, "losses/dpo": 0.09059447795152664, "losses/sft": 2.5058538913726807, "losses/total": 0.09059447795152664, "ref_logps/chosen": -42.02899169921875, "ref_logps/rejected": -44.19563674926758, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0422580242156982, "rewards/margins": 2.6573684215545654, "rewards/rejected": -4.699626445770264, "step": 2859 }, { "epoch": 2.7, "grad_norm": 31.259292602539062, "learning_rate": 5.5438964672962574e-08, "logps/chosen": -55.31084442138672, "logps/rejected": -83.58088684082031, "loss": 0.3248, "losses/dpo": 0.25898510217666626, "losses/sft": 1.929835557937622, "losses/total": 0.25898510217666626, "ref_logps/chosen": -35.6456413269043, "ref_logps/rejected": -46.37433624267578, "rewards/accuracies": 0.875, "rewards/chosen": -1.966520071029663, "rewards/margins": 1.7541351318359375, "rewards/rejected": -3.7206552028656006, "step": 2860 }, { "epoch": 2.7, "grad_norm": 15.265108108520508, "learning_rate": 5.52640783490731e-08, "logps/chosen": -62.39719772338867, "logps/rejected": -97.02392578125, "loss": 0.1688, "losses/dpo": 0.08139872550964355, "losses/sft": 2.539510726928711, "losses/total": 0.08139872550964355, "ref_logps/chosen": -39.57550048828125, "ref_logps/rejected": -46.76481628417969, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2821693420410156, "rewards/margins": 2.743741512298584, "rewards/rejected": -5.0259108543396, "step": 2861 }, { "epoch": 2.7, "grad_norm": 25.72768783569336, "learning_rate": 5.5089192025183626e-08, "logps/chosen": -62.99943923950195, "logps/rejected": -101.4157485961914, "loss": 0.2289, "losses/dpo": 0.444305956363678, "losses/sft": 2.4931867122650146, "losses/total": 0.444305956363678, "ref_logps/chosen": -38.98030090332031, "ref_logps/rejected": -54.532142639160156, "rewards/accuracies": 0.9375, "rewards/chosen": -2.40191388130188, "rewards/margins": 2.286447048187256, "rewards/rejected": -4.688361167907715, "step": 2862 }, { "epoch": 2.7, "grad_norm": 28.17253303527832, "learning_rate": 5.491430570129416e-08, "logps/chosen": -67.42326354980469, "logps/rejected": -98.20608520507812, "loss": 0.2794, "losses/dpo": 0.2669195234775543, "losses/sft": 2.1227869987487793, "losses/total": 0.2669195234775543, "ref_logps/chosen": -42.004920959472656, "ref_logps/rejected": -54.77240753173828, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5418338775634766, "rewards/margins": 1.8015345335006714, "rewards/rejected": -4.3433685302734375, "step": 2863 }, { "epoch": 2.7, "grad_norm": 15.804169654846191, "learning_rate": 5.4739419377404684e-08, "logps/chosen": -54.603328704833984, "logps/rejected": -87.33549499511719, "loss": 0.135, "losses/dpo": 0.17943991720676422, "losses/sft": 2.426722764968872, "losses/total": 0.17943991720676422, "ref_logps/chosen": -32.791717529296875, "ref_logps/rejected": -40.36088562011719, "rewards/accuracies": 1.0, "rewards/chosen": -2.181161403656006, "rewards/margins": 2.516299247741699, "rewards/rejected": -4.697461128234863, "step": 2864 }, { "epoch": 2.71, "grad_norm": 27.17184066772461, "learning_rate": 5.4564533053515217e-08, "logps/chosen": -60.46577453613281, "logps/rejected": -81.92391967773438, "loss": 0.2859, "losses/dpo": 0.20735451579093933, "losses/sft": 2.2000725269317627, "losses/total": 0.20735451579093933, "ref_logps/chosen": -39.78398895263672, "ref_logps/rejected": -40.30161666870117, "rewards/accuracies": 0.875, "rewards/chosen": -2.068178653717041, "rewards/margins": 2.0940513610839844, "rewards/rejected": -4.162229537963867, "step": 2865 }, { "epoch": 2.71, "grad_norm": 20.945539474487305, "learning_rate": 5.4389646729625736e-08, "logps/chosen": -53.99884033203125, "logps/rejected": -77.11982727050781, "loss": 0.2387, "losses/dpo": 0.12420934438705444, "losses/sft": 2.1732325553894043, "losses/total": 0.12420934438705444, "ref_logps/chosen": -36.194007873535156, "ref_logps/rejected": -37.71255111694336, "rewards/accuracies": 0.8125, "rewards/chosen": -1.780483603477478, "rewards/margins": 2.1602444648742676, "rewards/rejected": -3.940728187561035, "step": 2866 }, { "epoch": 2.71, "grad_norm": 15.74340534210205, "learning_rate": 5.421476040573627e-08, "logps/chosen": -72.09315490722656, "logps/rejected": -87.51078796386719, "loss": 0.1616, "losses/dpo": 0.15017655491828918, "losses/sft": 2.0582592487335205, "losses/total": 0.15017655491828918, "ref_logps/chosen": -52.09022903442383, "ref_logps/rejected": -46.710243225097656, "rewards/accuracies": 1.0, "rewards/chosen": -2.0002927780151367, "rewards/margins": 2.0797617435455322, "rewards/rejected": -4.080055236816406, "step": 2867 }, { "epoch": 2.71, "grad_norm": 24.89318084716797, "learning_rate": 5.40398740818468e-08, "logps/chosen": -57.139404296875, "logps/rejected": -96.89073944091797, "loss": 0.1922, "losses/dpo": 0.1747281700372696, "losses/sft": 2.065976858139038, "losses/total": 0.1747281700372696, "ref_logps/chosen": -37.90196990966797, "ref_logps/rejected": -48.73188781738281, "rewards/accuracies": 1.0, "rewards/chosen": -1.923743724822998, "rewards/margins": 2.892141819000244, "rewards/rejected": -4.815885543823242, "step": 2868 }, { "epoch": 2.71, "grad_norm": 30.109760284423828, "learning_rate": 5.386498775795732e-08, "logps/chosen": -68.16104125976562, "logps/rejected": -94.56878662109375, "loss": 0.2708, "losses/dpo": 0.4975349009037018, "losses/sft": 2.309370994567871, "losses/total": 0.4975349009037018, "ref_logps/chosen": -39.060020446777344, "ref_logps/rejected": -44.24861145019531, "rewards/accuracies": 0.875, "rewards/chosen": -2.910101890563965, "rewards/margins": 2.121915102005005, "rewards/rejected": -5.032016754150391, "step": 2869 }, { "epoch": 2.71, "grad_norm": 29.30947494506836, "learning_rate": 5.369010143406785e-08, "logps/chosen": -62.62954330444336, "logps/rejected": -79.90357971191406, "loss": 0.3123, "losses/dpo": 0.2233206331729889, "losses/sft": 2.1770215034484863, "losses/total": 0.2233206331729889, "ref_logps/chosen": -39.58228302001953, "ref_logps/rejected": -37.34901428222656, "rewards/accuracies": 0.875, "rewards/chosen": -2.3047261238098145, "rewards/margins": 1.950730800628662, "rewards/rejected": -4.255456924438477, "step": 2870 }, { "epoch": 2.71, "grad_norm": 19.843786239624023, "learning_rate": 5.3515215110178385e-08, "logps/chosen": -58.096275329589844, "logps/rejected": -85.56560516357422, "loss": 0.1962, "losses/dpo": 0.15028533339500427, "losses/sft": 1.9977455139160156, "losses/total": 0.15028533339500427, "ref_logps/chosen": -36.27116394042969, "ref_logps/rejected": -42.68719482421875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.182511329650879, "rewards/margins": 2.105329751968384, "rewards/rejected": -4.287840843200684, "step": 2871 }, { "epoch": 2.71, "grad_norm": 38.27719497680664, "learning_rate": 5.334032878628891e-08, "logps/chosen": -53.89379119873047, "logps/rejected": -83.87779235839844, "loss": 0.4339, "losses/dpo": 0.5076555609703064, "losses/sft": 2.490772008895874, "losses/total": 0.5076555609703064, "ref_logps/chosen": -33.37085723876953, "ref_logps/rejected": -45.197574615478516, "rewards/accuracies": 0.75, "rewards/chosen": -2.052293300628662, "rewards/margins": 1.815728783607483, "rewards/rejected": -3.8680224418640137, "step": 2872 }, { "epoch": 2.71, "grad_norm": 13.567605972290039, "learning_rate": 5.316544246239944e-08, "logps/chosen": -58.44673538208008, "logps/rejected": -91.78823852539062, "loss": 0.1496, "losses/dpo": 0.04928778111934662, "losses/sft": 1.6096879243850708, "losses/total": 0.04928778111934662, "ref_logps/chosen": -40.113067626953125, "ref_logps/rejected": -44.49457550048828, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8333666324615479, "rewards/margins": 2.896000385284424, "rewards/rejected": -4.729366779327393, "step": 2873 }, { "epoch": 2.71, "grad_norm": 28.203121185302734, "learning_rate": 5.299055613850997e-08, "logps/chosen": -61.545440673828125, "logps/rejected": -88.96922302246094, "loss": 0.26, "losses/dpo": 0.15058162808418274, "losses/sft": 1.7406771183013916, "losses/total": 0.15058162808418274, "ref_logps/chosen": -39.28202819824219, "ref_logps/rejected": -45.285858154296875, "rewards/accuracies": 0.875, "rewards/chosen": -2.2263410091400146, "rewards/margins": 2.141995429992676, "rewards/rejected": -4.368336200714111, "step": 2874 }, { "epoch": 2.71, "grad_norm": 23.05544090270996, "learning_rate": 5.2815669814620495e-08, "logps/chosen": -61.41276550292969, "logps/rejected": -80.32254791259766, "loss": 0.3129, "losses/dpo": 0.16426196694374084, "losses/sft": 2.6184425354003906, "losses/total": 0.16426196694374084, "ref_logps/chosen": -39.670345306396484, "ref_logps/rejected": -43.33232116699219, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1742424964904785, "rewards/margins": 1.524780035018921, "rewards/rejected": -3.6990227699279785, "step": 2875 }, { "epoch": 2.72, "grad_norm": 23.09227180480957, "learning_rate": 5.264078349073102e-08, "logps/chosen": -74.49089050292969, "logps/rejected": -90.31794738769531, "loss": 0.1765, "losses/dpo": 0.2602127194404602, "losses/sft": 1.8905940055847168, "losses/total": 0.2602127194404602, "ref_logps/chosen": -51.54088592529297, "ref_logps/rejected": -46.022525787353516, "rewards/accuracies": 1.0, "rewards/chosen": -2.2950005531311035, "rewards/margins": 2.1345412731170654, "rewards/rejected": -4.429542064666748, "step": 2876 }, { "epoch": 2.72, "grad_norm": 22.58933448791504, "learning_rate": 5.246589716684155e-08, "logps/chosen": -57.41387939453125, "logps/rejected": -69.24373626708984, "loss": 0.2936, "losses/dpo": 0.4625975489616394, "losses/sft": 1.7165277004241943, "losses/total": 0.4625975489616394, "ref_logps/chosen": -43.284934997558594, "ref_logps/rejected": -36.15578842163086, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4128944873809814, "rewards/margins": 1.8959007263183594, "rewards/rejected": -3.308795213699341, "step": 2877 }, { "epoch": 2.72, "grad_norm": 16.131208419799805, "learning_rate": 5.229101084295208e-08, "logps/chosen": -48.987396240234375, "logps/rejected": -83.63958740234375, "loss": 0.1453, "losses/dpo": 0.0889534279704094, "losses/sft": 1.8422859907150269, "losses/total": 0.0889534279704094, "ref_logps/chosen": -32.04210662841797, "ref_logps/rejected": -41.86195373535156, "rewards/accuracies": 1.0, "rewards/chosen": -1.6945290565490723, "rewards/margins": 2.4832351207733154, "rewards/rejected": -4.177763938903809, "step": 2878 }, { "epoch": 2.72, "grad_norm": 48.708351135253906, "learning_rate": 5.211612451906261e-08, "logps/chosen": -56.94121551513672, "logps/rejected": -69.2843017578125, "loss": 0.4896, "losses/dpo": 0.645794689655304, "losses/sft": 2.6366021633148193, "losses/total": 0.645794689655304, "ref_logps/chosen": -33.79083251953125, "ref_logps/rejected": -30.095840454101562, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3150386810302734, "rewards/margins": 1.6038072109222412, "rewards/rejected": -3.9188458919525146, "step": 2879 }, { "epoch": 2.72, "grad_norm": 29.57944679260254, "learning_rate": 5.194123819517313e-08, "logps/chosen": -58.34238815307617, "logps/rejected": -90.1668472290039, "loss": 0.3344, "losses/dpo": 0.44302046298980713, "losses/sft": 1.897892713546753, "losses/total": 0.44302046298980713, "ref_logps/chosen": -38.493927001953125, "ref_logps/rejected": -51.56311798095703, "rewards/accuracies": 0.875, "rewards/chosen": -1.9848463535308838, "rewards/margins": 1.8755269050598145, "rewards/rejected": -3.8603732585906982, "step": 2880 }, { "epoch": 2.72, "grad_norm": 23.48588752746582, "learning_rate": 5.1766351871283664e-08, "logps/chosen": -82.89336395263672, "logps/rejected": -102.16773986816406, "loss": 0.2056, "losses/dpo": 0.36578232049942017, "losses/sft": 2.3649439811706543, "losses/total": 0.36578232049942017, "ref_logps/chosen": -58.64131164550781, "ref_logps/rejected": -53.85184097290039, "rewards/accuracies": 1.0, "rewards/chosen": -2.4252052307128906, "rewards/margins": 2.4063849449157715, "rewards/rejected": -4.831589698791504, "step": 2881 }, { "epoch": 2.72, "grad_norm": 19.96122169494629, "learning_rate": 5.1591465547394196e-08, "logps/chosen": -42.434059143066406, "logps/rejected": -64.46273803710938, "loss": 0.2572, "losses/dpo": 0.35179752111434937, "losses/sft": 2.2044763565063477, "losses/total": 0.35179752111434937, "ref_logps/chosen": -25.632247924804688, "ref_logps/rejected": -28.573287963867188, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6801811456680298, "rewards/margins": 1.9087638854980469, "rewards/rejected": -3.588944911956787, "step": 2882 }, { "epoch": 2.72, "grad_norm": 28.02341651916504, "learning_rate": 5.1416579223504716e-08, "logps/chosen": -65.40029907226562, "logps/rejected": -77.53459167480469, "loss": 0.2917, "losses/dpo": 0.5614790916442871, "losses/sft": 1.9322682619094849, "losses/total": 0.5614790916442871, "ref_logps/chosen": -44.492366790771484, "ref_logps/rejected": -40.451541900634766, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0907931327819824, "rewards/margins": 1.6175122261047363, "rewards/rejected": -3.7083053588867188, "step": 2883 }, { "epoch": 2.72, "grad_norm": 18.873231887817383, "learning_rate": 5.124169289961525e-08, "logps/chosen": -53.775421142578125, "logps/rejected": -82.19868469238281, "loss": 0.1725, "losses/dpo": 0.04656333848834038, "losses/sft": 1.74667227268219, "losses/total": 0.04656333848834038, "ref_logps/chosen": -38.103355407714844, "ref_logps/rejected": -41.982879638671875, "rewards/accuracies": 0.9375, "rewards/chosen": -1.567206859588623, "rewards/margins": 2.454373359680176, "rewards/rejected": -4.021579742431641, "step": 2884 }, { "epoch": 2.72, "grad_norm": 15.36976432800293, "learning_rate": 5.1066806575725774e-08, "logps/chosen": -48.32014465332031, "logps/rejected": -80.34864044189453, "loss": 0.1677, "losses/dpo": 0.20992562174797058, "losses/sft": 1.912480354309082, "losses/total": 0.20992562174797058, "ref_logps/chosen": -31.82781410217285, "ref_logps/rejected": -39.51568603515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6492328643798828, "rewards/margins": 2.4340627193450928, "rewards/rejected": -4.083295822143555, "step": 2885 }, { "epoch": 2.73, "grad_norm": 20.57769012451172, "learning_rate": 5.0891920251836306e-08, "logps/chosen": -79.54473876953125, "logps/rejected": -93.16749572753906, "loss": 0.2527, "losses/dpo": 0.2616657614707947, "losses/sft": 2.74359393119812, "losses/total": 0.2616657614707947, "ref_logps/chosen": -50.75069046020508, "ref_logps/rejected": -48.24676513671875, "rewards/accuracies": 1.0, "rewards/chosen": -2.8794045448303223, "rewards/margins": 1.6126682758331299, "rewards/rejected": -4.492073059082031, "step": 2886 }, { "epoch": 2.73, "grad_norm": 23.890918731689453, "learning_rate": 5.071703392794683e-08, "logps/chosen": -59.0823974609375, "logps/rejected": -79.6904525756836, "loss": 0.2393, "losses/dpo": 0.30438071489334106, "losses/sft": 2.2941267490386963, "losses/total": 0.30438071489334106, "ref_logps/chosen": -38.12369155883789, "ref_logps/rejected": -38.59575653076172, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0958709716796875, "rewards/margins": 2.013599395751953, "rewards/rejected": -4.109470367431641, "step": 2887 }, { "epoch": 2.73, "grad_norm": 14.827569961547852, "learning_rate": 5.054214760405736e-08, "logps/chosen": -39.25969314575195, "logps/rejected": -79.71253967285156, "loss": 0.1596, "losses/dpo": 0.3309593200683594, "losses/sft": 1.6219087839126587, "losses/total": 0.3309593200683594, "ref_logps/chosen": -28.565309524536133, "ref_logps/rejected": -40.868919372558594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0694384574890137, "rewards/margins": 2.814924478530884, "rewards/rejected": -3.8843631744384766, "step": 2888 }, { "epoch": 2.73, "grad_norm": 21.80104637145996, "learning_rate": 5.036726128016789e-08, "logps/chosen": -62.342769622802734, "logps/rejected": -94.05638122558594, "loss": 0.1875, "losses/dpo": 0.24158428609371185, "losses/sft": 3.037687063217163, "losses/total": 0.24158428609371185, "ref_logps/chosen": -36.985023498535156, "ref_logps/rejected": -47.24561309814453, "rewards/accuracies": 1.0, "rewards/chosen": -2.5357747077941895, "rewards/margins": 2.1453018188476562, "rewards/rejected": -4.681077003479004, "step": 2889 }, { "epoch": 2.73, "grad_norm": 24.7808837890625, "learning_rate": 5.019237495627842e-08, "logps/chosen": -49.591758728027344, "logps/rejected": -75.19595336914062, "loss": 0.3314, "losses/dpo": 0.09084201604127884, "losses/sft": 2.1008388996124268, "losses/total": 0.09084201604127884, "ref_logps/chosen": -27.8034610748291, "ref_logps/rejected": -37.023075103759766, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1788296699523926, "rewards/margins": 1.6384576559066772, "rewards/rejected": -3.8172874450683594, "step": 2890 }, { "epoch": 2.73, "grad_norm": 24.186092376708984, "learning_rate": 5.001748863238894e-08, "logps/chosen": -53.1131591796875, "logps/rejected": -79.9860610961914, "loss": 0.2231, "losses/dpo": 0.12335456907749176, "losses/sft": 1.8055753707885742, "losses/total": 0.12335456907749176, "ref_logps/chosen": -33.05149841308594, "ref_logps/rejected": -38.446929931640625, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0061657428741455, "rewards/margins": 2.147747755050659, "rewards/rejected": -4.153913497924805, "step": 2891 }, { "epoch": 2.73, "grad_norm": 25.13157081604004, "learning_rate": 4.9842602308499475e-08, "logps/chosen": -52.623966217041016, "logps/rejected": -80.99105072021484, "loss": 0.3294, "losses/dpo": 0.14942845702171326, "losses/sft": 2.0872185230255127, "losses/total": 0.14942845702171326, "ref_logps/chosen": -35.113525390625, "ref_logps/rejected": -42.599098205566406, "rewards/accuracies": 0.875, "rewards/chosen": -1.7510440349578857, "rewards/margins": 2.088151454925537, "rewards/rejected": -3.839195489883423, "step": 2892 }, { "epoch": 2.73, "grad_norm": 22.575241088867188, "learning_rate": 4.966771598461e-08, "logps/chosen": -64.890625, "logps/rejected": -83.49400329589844, "loss": 0.2308, "losses/dpo": 0.09400902688503265, "losses/sft": 2.699190378189087, "losses/total": 0.09400902688503265, "ref_logps/chosen": -45.70062255859375, "ref_logps/rejected": -41.647560119628906, "rewards/accuracies": 0.9375, "rewards/chosen": -1.919000267982483, "rewards/margins": 2.26564359664917, "rewards/rejected": -4.1846442222595215, "step": 2893 }, { "epoch": 2.73, "grad_norm": 22.032365798950195, "learning_rate": 4.949282966072053e-08, "logps/chosen": -71.71018981933594, "logps/rejected": -105.13894653320312, "loss": 0.1838, "losses/dpo": 0.09269165992736816, "losses/sft": 2.3309507369995117, "losses/total": 0.09269165992736816, "ref_logps/chosen": -47.39867401123047, "ref_logps/rejected": -56.04137420654297, "rewards/accuracies": 0.9375, "rewards/chosen": -2.431152105331421, "rewards/margins": 2.478605270385742, "rewards/rejected": -4.909757137298584, "step": 2894 }, { "epoch": 2.73, "grad_norm": 18.26418685913086, "learning_rate": 4.931794333683106e-08, "logps/chosen": -42.78246307373047, "logps/rejected": -81.12641143798828, "loss": 0.1778, "losses/dpo": 0.14242404699325562, "losses/sft": 2.016181707382202, "losses/total": 0.14242404699325562, "ref_logps/chosen": -30.147472381591797, "ref_logps/rejected": -42.09385681152344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2634994983673096, "rewards/margins": 2.639756679534912, "rewards/rejected": -3.9032559394836426, "step": 2895 }, { "epoch": 2.73, "grad_norm": 23.775630950927734, "learning_rate": 4.9143057012941585e-08, "logps/chosen": -66.25042724609375, "logps/rejected": -90.31790161132812, "loss": 0.257, "losses/dpo": 0.29815739393234253, "losses/sft": 2.7361183166503906, "losses/total": 0.29815739393234253, "ref_logps/chosen": -41.492977142333984, "ref_logps/rejected": -47.90015411376953, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4757447242736816, "rewards/margins": 1.7660298347473145, "rewards/rejected": -4.241774559020996, "step": 2896 }, { "epoch": 2.74, "grad_norm": 30.72845458984375, "learning_rate": 4.896817068905212e-08, "logps/chosen": -56.872501373291016, "logps/rejected": -86.8343505859375, "loss": 0.5212, "losses/dpo": 1.8256490230560303, "losses/sft": 2.678504467010498, "losses/total": 1.8256490230560303, "ref_logps/chosen": -33.75708770751953, "ref_logps/rejected": -42.83272933959961, "rewards/accuracies": 0.875, "rewards/chosen": -2.3115410804748535, "rewards/margins": 2.088620901107788, "rewards/rejected": -4.4001617431640625, "step": 2897 }, { "epoch": 2.74, "grad_norm": 13.860445022583008, "learning_rate": 4.8793284365162644e-08, "logps/chosen": -71.68281555175781, "logps/rejected": -101.97732543945312, "loss": 0.1145, "losses/dpo": 0.17511504888534546, "losses/sft": 2.1915576457977295, "losses/total": 0.17511504888534546, "ref_logps/chosen": -47.659629821777344, "ref_logps/rejected": -49.48004150390625, "rewards/accuracies": 1.0, "rewards/chosen": -2.4023187160491943, "rewards/margins": 2.847410202026367, "rewards/rejected": -5.249728679656982, "step": 2898 }, { "epoch": 2.74, "grad_norm": 21.314437866210938, "learning_rate": 4.861839804127317e-08, "logps/chosen": -61.1273193359375, "logps/rejected": -84.61996459960938, "loss": 0.1693, "losses/dpo": 0.17391903698444366, "losses/sft": 2.1920340061187744, "losses/total": 0.17391903698444366, "ref_logps/chosen": -41.756996154785156, "ref_logps/rejected": -42.61939239501953, "rewards/accuracies": 1.0, "rewards/chosen": -1.9370322227478027, "rewards/margins": 2.2630255222320557, "rewards/rejected": -4.2000579833984375, "step": 2899 }, { "epoch": 2.74, "grad_norm": 31.29777717590332, "learning_rate": 4.84435117173837e-08, "logps/chosen": -52.42212677001953, "logps/rejected": -68.17816162109375, "loss": 0.5142, "losses/dpo": 1.3692716360092163, "losses/sft": 2.1053643226623535, "losses/total": 1.3692716360092163, "ref_logps/chosen": -33.53604507446289, "ref_logps/rejected": -30.62323760986328, "rewards/accuracies": 0.875, "rewards/chosen": -1.8886083364486694, "rewards/margins": 1.8668839931488037, "rewards/rejected": -3.7554922103881836, "step": 2900 }, { "epoch": 2.74, "grad_norm": 25.538320541381836, "learning_rate": 4.826862539349422e-08, "logps/chosen": -53.15333557128906, "logps/rejected": -73.51534271240234, "loss": 0.2187, "losses/dpo": 0.20950065553188324, "losses/sft": 2.373840808868408, "losses/total": 0.20950065553188324, "ref_logps/chosen": -33.88386535644531, "ref_logps/rejected": -33.111846923828125, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9269468784332275, "rewards/margins": 2.113402843475342, "rewards/rejected": -4.04034948348999, "step": 2901 }, { "epoch": 2.74, "grad_norm": 20.94049835205078, "learning_rate": 4.8093739069604754e-08, "logps/chosen": -44.767662048339844, "logps/rejected": -71.72140502929688, "loss": 0.2182, "losses/dpo": 0.2349478304386139, "losses/sft": 1.956683874130249, "losses/total": 0.2349478304386139, "ref_logps/chosen": -29.535459518432617, "ref_logps/rejected": -34.54790496826172, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5232203006744385, "rewards/margins": 2.1941304206848145, "rewards/rejected": -3.717350721359253, "step": 2902 }, { "epoch": 2.74, "grad_norm": 19.078367233276367, "learning_rate": 4.7918852745715286e-08, "logps/chosen": -51.2137451171875, "logps/rejected": -74.24647521972656, "loss": 0.1702, "losses/dpo": 0.2708870768547058, "losses/sft": 1.9555599689483643, "losses/total": 0.2708870768547058, "ref_logps/chosen": -32.18997573852539, "ref_logps/rejected": -32.63280487060547, "rewards/accuracies": 1.0, "rewards/chosen": -1.9023771286010742, "rewards/margins": 2.258990526199341, "rewards/rejected": -4.161367893218994, "step": 2903 }, { "epoch": 2.74, "grad_norm": 24.8669490814209, "learning_rate": 4.774396642182581e-08, "logps/chosen": -43.82025146484375, "logps/rejected": -76.4781494140625, "loss": 0.2336, "losses/dpo": 0.23846878111362457, "losses/sft": 1.27152419090271, "losses/total": 0.23846878111362457, "ref_logps/chosen": -30.502214431762695, "ref_logps/rejected": -38.13578796386719, "rewards/accuracies": 0.9375, "rewards/chosen": -1.331803798675537, "rewards/margins": 2.5024328231811523, "rewards/rejected": -3.8342366218566895, "step": 2904 }, { "epoch": 2.74, "grad_norm": 20.20412826538086, "learning_rate": 4.756908009793634e-08, "logps/chosen": -56.2938232421875, "logps/rejected": -98.10809326171875, "loss": 0.177, "losses/dpo": 0.1635667085647583, "losses/sft": 1.756253719329834, "losses/total": 0.1635667085647583, "ref_logps/chosen": -35.66443634033203, "ref_logps/rejected": -50.62413787841797, "rewards/accuracies": 0.9375, "rewards/chosen": -2.062938690185547, "rewards/margins": 2.685457229614258, "rewards/rejected": -4.748395919799805, "step": 2905 }, { "epoch": 2.74, "grad_norm": 13.31275463104248, "learning_rate": 4.739419377404687e-08, "logps/chosen": -74.900146484375, "logps/rejected": -103.60855102539062, "loss": 0.123, "losses/dpo": 0.1407439410686493, "losses/sft": 2.33221435546875, "losses/total": 0.1407439410686493, "ref_logps/chosen": -51.773983001708984, "ref_logps/rejected": -53.90714645385742, "rewards/accuracies": 1.0, "rewards/chosen": -2.3126163482666016, "rewards/margins": 2.6575241088867188, "rewards/rejected": -4.97014045715332, "step": 2906 }, { "epoch": 2.75, "grad_norm": 46.8740348815918, "learning_rate": 4.7219307450157396e-08, "logps/chosen": -64.64573669433594, "logps/rejected": -62.52058410644531, "loss": 0.4977, "losses/dpo": 1.0391278266906738, "losses/sft": 2.072814702987671, "losses/total": 1.0391278266906738, "ref_logps/chosen": -44.504512786865234, "ref_logps/rejected": -30.27450942993164, "rewards/accuracies": 0.8125, "rewards/chosen": -2.014122247695923, "rewards/margins": 1.2104852199554443, "rewards/rejected": -3.224607467651367, "step": 2907 }, { "epoch": 2.75, "grad_norm": 16.31307601928711, "learning_rate": 4.704442112626792e-08, "logps/chosen": -62.43891906738281, "logps/rejected": -89.83383178710938, "loss": 0.1648, "losses/dpo": 0.12543222308158875, "losses/sft": 2.3578948974609375, "losses/total": 0.12543222308158875, "ref_logps/chosen": -42.93967056274414, "ref_logps/rejected": -46.32426452636719, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9499249458312988, "rewards/margins": 2.4010322093963623, "rewards/rejected": -4.350956916809082, "step": 2908 }, { "epoch": 2.75, "grad_norm": 18.701644897460938, "learning_rate": 4.686953480237845e-08, "logps/chosen": -65.79672241210938, "logps/rejected": -68.97135925292969, "loss": 0.1877, "losses/dpo": 0.269913911819458, "losses/sft": 1.9493757486343384, "losses/total": 0.269913911819458, "ref_logps/chosen": -49.953006744384766, "ref_logps/rejected": -30.98887062072754, "rewards/accuracies": 1.0, "rewards/chosen": -1.5843713283538818, "rewards/margins": 2.213876724243164, "rewards/rejected": -3.798248052597046, "step": 2909 }, { "epoch": 2.75, "grad_norm": 15.259963035583496, "learning_rate": 4.669464847848898e-08, "logps/chosen": -56.39866638183594, "logps/rejected": -98.07179260253906, "loss": 0.1472, "losses/dpo": 0.21368534862995148, "losses/sft": 2.2640364170074463, "losses/total": 0.21368534862995148, "ref_logps/chosen": -35.53032684326172, "ref_logps/rejected": -52.81883239746094, "rewards/accuracies": 1.0, "rewards/chosen": -2.0868337154388428, "rewards/margins": 2.438462257385254, "rewards/rejected": -4.525296211242676, "step": 2910 }, { "epoch": 2.75, "grad_norm": 23.00604820251465, "learning_rate": 4.651976215459951e-08, "logps/chosen": -55.955814361572266, "logps/rejected": -82.95382690429688, "loss": 0.1878, "losses/dpo": 0.12088857591152191, "losses/sft": 1.6624360084533691, "losses/total": 0.12088857591152191, "ref_logps/chosen": -38.27434539794922, "ref_logps/rejected": -40.77610778808594, "rewards/accuracies": 1.0, "rewards/chosen": -1.7681469917297363, "rewards/margins": 2.449625015258789, "rewards/rejected": -4.217772006988525, "step": 2911 }, { "epoch": 2.75, "grad_norm": 28.070449829101562, "learning_rate": 4.634487583071003e-08, "logps/chosen": -64.6600570678711, "logps/rejected": -86.61955261230469, "loss": 0.2678, "losses/dpo": 0.11538442224264145, "losses/sft": 1.9681587219238281, "losses/total": 0.11538442224264145, "ref_logps/chosen": -39.92943572998047, "ref_logps/rejected": -42.636722564697266, "rewards/accuracies": 0.875, "rewards/chosen": -2.473062038421631, "rewards/margins": 1.9252204895019531, "rewards/rejected": -4.398282527923584, "step": 2912 }, { "epoch": 2.75, "grad_norm": 15.563338279724121, "learning_rate": 4.6169989506820565e-08, "logps/chosen": -52.811279296875, "logps/rejected": -81.2079086303711, "loss": 0.1518, "losses/dpo": 0.15597154200077057, "losses/sft": 1.8160310983657837, "losses/total": 0.15597154200077057, "ref_logps/chosen": -34.96300506591797, "ref_logps/rejected": -38.81499099731445, "rewards/accuracies": 1.0, "rewards/chosen": -1.7848271131515503, "rewards/margins": 2.454464912414551, "rewards/rejected": -4.239291667938232, "step": 2913 }, { "epoch": 2.75, "grad_norm": 24.842069625854492, "learning_rate": 4.59951031829311e-08, "logps/chosen": -58.31574630737305, "logps/rejected": -89.58419799804688, "loss": 0.2135, "losses/dpo": 0.35399264097213745, "losses/sft": 2.2470216751098633, "losses/total": 0.35399264097213745, "ref_logps/chosen": -38.719276428222656, "ref_logps/rejected": -47.51642990112305, "rewards/accuracies": 0.875, "rewards/chosen": -1.9596470594406128, "rewards/margins": 2.247130870819092, "rewards/rejected": -4.206777572631836, "step": 2914 }, { "epoch": 2.75, "grad_norm": 25.518869400024414, "learning_rate": 4.5820216859041617e-08, "logps/chosen": -60.399696350097656, "logps/rejected": -80.20470428466797, "loss": 0.2861, "losses/dpo": 0.3874371647834778, "losses/sft": 2.599649667739868, "losses/total": 0.3874371647834778, "ref_logps/chosen": -36.75235366821289, "ref_logps/rejected": -40.736690521240234, "rewards/accuracies": 0.9375, "rewards/chosen": -2.364734411239624, "rewards/margins": 1.5820667743682861, "rewards/rejected": -3.94680118560791, "step": 2915 }, { "epoch": 2.75, "grad_norm": 24.60268211364746, "learning_rate": 4.564533053515215e-08, "logps/chosen": -50.21382141113281, "logps/rejected": -69.26856994628906, "loss": 0.3713, "losses/dpo": 0.8575896620750427, "losses/sft": 1.8644320964813232, "losses/total": 0.8575896620750427, "ref_logps/chosen": -31.54848289489746, "ref_logps/rejected": -31.33245849609375, "rewards/accuracies": 0.875, "rewards/chosen": -1.8665341138839722, "rewards/margins": 1.927076816558838, "rewards/rejected": -3.7936110496520996, "step": 2916 }, { "epoch": 2.75, "grad_norm": 24.392568588256836, "learning_rate": 4.5470444211262675e-08, "logps/chosen": -63.81717300415039, "logps/rejected": -78.88933563232422, "loss": 0.3482, "losses/dpo": 0.45321568846702576, "losses/sft": 1.9605830907821655, "losses/total": 0.45321568846702576, "ref_logps/chosen": -43.75624084472656, "ref_logps/rejected": -37.84739685058594, "rewards/accuracies": 0.875, "rewards/chosen": -2.0060930252075195, "rewards/margins": 2.0981009006500244, "rewards/rejected": -4.104194164276123, "step": 2917 }, { "epoch": 2.76, "grad_norm": 19.76136016845703, "learning_rate": 4.529555788737321e-08, "logps/chosen": -57.73360824584961, "logps/rejected": -89.73143005371094, "loss": 0.1611, "losses/dpo": 0.1569322943687439, "losses/sft": 2.173398494720459, "losses/total": 0.1569322943687439, "ref_logps/chosen": -34.68782043457031, "ref_logps/rejected": -44.75007629394531, "rewards/accuracies": 1.0, "rewards/chosen": -2.3045785427093506, "rewards/margins": 2.1935575008392334, "rewards/rejected": -4.498136043548584, "step": 2918 }, { "epoch": 2.76, "grad_norm": 41.902217864990234, "learning_rate": 4.5120671563483733e-08, "logps/chosen": -79.74898529052734, "logps/rejected": -94.09258270263672, "loss": 0.4207, "losses/dpo": 0.5761823058128357, "losses/sft": 2.102543354034424, "losses/total": 0.5761823058128357, "ref_logps/chosen": -52.83766174316406, "ref_logps/rejected": -46.913352966308594, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6911330223083496, "rewards/margins": 2.026789903640747, "rewards/rejected": -4.717922687530518, "step": 2919 }, { "epoch": 2.76, "grad_norm": 16.969646453857422, "learning_rate": 4.494578523959426e-08, "logps/chosen": -73.46839904785156, "logps/rejected": -108.63909912109375, "loss": 0.1134, "losses/dpo": 0.25589433312416077, "losses/sft": 2.5820610523223877, "losses/total": 0.25589433312416077, "ref_logps/chosen": -47.53834533691406, "ref_logps/rejected": -46.04529571533203, "rewards/accuracies": 0.9375, "rewards/chosen": -2.593005895614624, "rewards/margins": 3.6663739681243896, "rewards/rejected": -6.259379863739014, "step": 2920 }, { "epoch": 2.76, "grad_norm": 24.645444869995117, "learning_rate": 4.477089891570479e-08, "logps/chosen": -63.86751937866211, "logps/rejected": -88.18708801269531, "loss": 0.316, "losses/dpo": 0.4615601897239685, "losses/sft": 2.747189521789551, "losses/total": 0.4615601897239685, "ref_logps/chosen": -40.57049560546875, "ref_logps/rejected": -46.73444366455078, "rewards/accuracies": 0.875, "rewards/chosen": -2.329702854156494, "rewards/margins": 1.8155622482299805, "rewards/rejected": -4.145264625549316, "step": 2921 }, { "epoch": 2.76, "grad_norm": 22.891401290893555, "learning_rate": 4.4596012591815324e-08, "logps/chosen": -56.50829315185547, "logps/rejected": -83.11656951904297, "loss": 0.2162, "losses/dpo": 0.28885069489479065, "losses/sft": 1.9294531345367432, "losses/total": 0.28885069489479065, "ref_logps/chosen": -32.53710174560547, "ref_logps/rejected": -37.08204650878906, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3971192836761475, "rewards/margins": 2.2063326835632324, "rewards/rejected": -4.603451728820801, "step": 2922 }, { "epoch": 2.76, "grad_norm": 41.41575622558594, "learning_rate": 4.4421126267925844e-08, "logps/chosen": -61.92214584350586, "logps/rejected": -82.36434936523438, "loss": 0.4097, "losses/dpo": 0.1857387274503708, "losses/sft": 1.9243720769882202, "losses/total": 0.1857387274503708, "ref_logps/chosen": -40.911468505859375, "ref_logps/rejected": -44.80412673950195, "rewards/accuracies": 0.75, "rewards/chosen": -2.101067543029785, "rewards/margins": 1.6549553871154785, "rewards/rejected": -3.7560229301452637, "step": 2923 }, { "epoch": 2.76, "grad_norm": 20.074983596801758, "learning_rate": 4.4246239944036376e-08, "logps/chosen": -47.114009857177734, "logps/rejected": -78.469970703125, "loss": 0.2168, "losses/dpo": 0.13913625478744507, "losses/sft": 1.5697959661483765, "losses/total": 0.13913625478744507, "ref_logps/chosen": -32.79225158691406, "ref_logps/rejected": -37.911094665527344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4321759939193726, "rewards/margins": 2.623711585998535, "rewards/rejected": -4.055887699127197, "step": 2924 }, { "epoch": 2.76, "grad_norm": 38.66976547241211, "learning_rate": 4.40713536201469e-08, "logps/chosen": -59.42203903198242, "logps/rejected": -84.69801330566406, "loss": 0.3001, "losses/dpo": 0.09689392149448395, "losses/sft": 2.3049609661102295, "losses/total": 0.09689392149448395, "ref_logps/chosen": -36.72865295410156, "ref_logps/rejected": -37.53166198730469, "rewards/accuracies": 0.9375, "rewards/chosen": -2.269338369369507, "rewards/margins": 2.4472970962524414, "rewards/rejected": -4.716635704040527, "step": 2925 }, { "epoch": 2.76, "grad_norm": 14.12325668334961, "learning_rate": 4.389646729625743e-08, "logps/chosen": -62.84257507324219, "logps/rejected": -102.05506896972656, "loss": 0.1012, "losses/dpo": 0.036279600113630295, "losses/sft": 2.1291112899780273, "losses/total": 0.036279600113630295, "ref_logps/chosen": -41.30048370361328, "ref_logps/rejected": -52.78526306152344, "rewards/accuracies": 1.0, "rewards/chosen": -2.1542088985443115, "rewards/margins": 2.7727713584899902, "rewards/rejected": -4.926980018615723, "step": 2926 }, { "epoch": 2.76, "grad_norm": 28.979934692382812, "learning_rate": 4.372158097236796e-08, "logps/chosen": -63.19284439086914, "logps/rejected": -84.02626037597656, "loss": 0.3178, "losses/dpo": 0.2483692318201065, "losses/sft": 1.6956597566604614, "losses/total": 0.2483692318201065, "ref_logps/chosen": -37.93545913696289, "ref_logps/rejected": -39.247894287109375, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5257389545440674, "rewards/margins": 1.95209801197052, "rewards/rejected": -4.477836608886719, "step": 2927 }, { "epoch": 2.76, "grad_norm": 30.049333572387695, "learning_rate": 4.3546694648478486e-08, "logps/chosen": -50.92780303955078, "logps/rejected": -73.13703918457031, "loss": 0.2777, "losses/dpo": 0.24642544984817505, "losses/sft": 1.7158806324005127, "losses/total": 0.24642544984817505, "ref_logps/chosen": -34.705142974853516, "ref_logps/rejected": -38.05579376220703, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6222660541534424, "rewards/margins": 1.8858588933944702, "rewards/rejected": -3.508125066757202, "step": 2928 }, { "epoch": 2.77, "grad_norm": 25.350297927856445, "learning_rate": 4.337180832458902e-08, "logps/chosen": -53.08549499511719, "logps/rejected": -70.6810302734375, "loss": 0.3429, "losses/dpo": 0.27654868364334106, "losses/sft": 2.1185340881347656, "losses/total": 0.27654868364334106, "ref_logps/chosen": -33.332984924316406, "ref_logps/rejected": -33.37082290649414, "rewards/accuracies": 0.875, "rewards/chosen": -1.9752511978149414, "rewards/margins": 1.7557696104049683, "rewards/rejected": -3.731020927429199, "step": 2929 }, { "epoch": 2.77, "grad_norm": 31.370952606201172, "learning_rate": 4.3196922000699545e-08, "logps/chosen": -70.31005859375, "logps/rejected": -95.06546020507812, "loss": 0.3223, "losses/dpo": 0.8907086253166199, "losses/sft": 2.5566813945770264, "losses/total": 0.8907086253166199, "ref_logps/chosen": -45.06493377685547, "ref_logps/rejected": -45.993873596191406, "rewards/accuracies": 0.875, "rewards/chosen": -2.524512529373169, "rewards/margins": 2.382645845413208, "rewards/rejected": -4.907158374786377, "step": 2930 }, { "epoch": 2.77, "grad_norm": 24.798593521118164, "learning_rate": 4.302203567681007e-08, "logps/chosen": -41.98005676269531, "logps/rejected": -79.919677734375, "loss": 0.2423, "losses/dpo": 0.16114863753318787, "losses/sft": 1.46969735622406, "losses/total": 0.16114863753318787, "ref_logps/chosen": -26.345916748046875, "ref_logps/rejected": -37.114524841308594, "rewards/accuracies": 0.9375, "rewards/chosen": -1.563414216041565, "rewards/margins": 2.717101573944092, "rewards/rejected": -4.280515670776367, "step": 2931 }, { "epoch": 2.77, "grad_norm": 22.61949920654297, "learning_rate": 4.28471493529206e-08, "logps/chosen": -65.4260025024414, "logps/rejected": -100.01286315917969, "loss": 0.2688, "losses/dpo": 0.15575724840164185, "losses/sft": 1.9339677095413208, "losses/total": 0.15575724840164185, "ref_logps/chosen": -46.596397399902344, "ref_logps/rejected": -56.88796615600586, "rewards/accuracies": 0.9375, "rewards/chosen": -1.882960557937622, "rewards/margins": 2.429529905319214, "rewards/rejected": -4.312490463256836, "step": 2932 }, { "epoch": 2.77, "grad_norm": 25.711580276489258, "learning_rate": 4.267226302903112e-08, "logps/chosen": -50.37397384643555, "logps/rejected": -67.489013671875, "loss": 0.2586, "losses/dpo": 0.2976170778274536, "losses/sft": 2.023776054382324, "losses/total": 0.2976170778274536, "ref_logps/chosen": -36.26109313964844, "ref_logps/rejected": -31.36077308654785, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4112882614135742, "rewards/margins": 2.20153546333313, "rewards/rejected": -3.612823724746704, "step": 2933 }, { "epoch": 2.77, "grad_norm": 28.1121768951416, "learning_rate": 4.2497376705141655e-08, "logps/chosen": -62.54951477050781, "logps/rejected": -89.92970275878906, "loss": 0.1856, "losses/dpo": 0.26242735981941223, "losses/sft": 2.670637845993042, "losses/total": 0.26242735981941223, "ref_logps/chosen": -41.26399612426758, "ref_logps/rejected": -44.05115509033203, "rewards/accuracies": 0.9375, "rewards/chosen": -2.128551959991455, "rewards/margins": 2.4593024253845215, "rewards/rejected": -4.587854385375977, "step": 2934 }, { "epoch": 2.77, "grad_norm": 19.07452392578125, "learning_rate": 4.232249038125219e-08, "logps/chosen": -51.450740814208984, "logps/rejected": -78.8673095703125, "loss": 0.2404, "losses/dpo": 0.10036514699459076, "losses/sft": 1.9438000917434692, "losses/total": 0.10036514699459076, "ref_logps/chosen": -32.28302001953125, "ref_logps/rejected": -37.61000061035156, "rewards/accuracies": 0.875, "rewards/chosen": -1.9167721271514893, "rewards/margins": 2.2089591026306152, "rewards/rejected": -4.125731468200684, "step": 2935 }, { "epoch": 2.77, "grad_norm": 24.527149200439453, "learning_rate": 4.214760405736271e-08, "logps/chosen": -55.682861328125, "logps/rejected": -83.66441345214844, "loss": 0.3108, "losses/dpo": 0.12902814149856567, "losses/sft": 2.277679681777954, "losses/total": 0.12902814149856567, "ref_logps/chosen": -34.76591110229492, "ref_logps/rejected": -38.99586868286133, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0916948318481445, "rewards/margins": 2.37515926361084, "rewards/rejected": -4.466854095458984, "step": 2936 }, { "epoch": 2.77, "grad_norm": 14.175972938537598, "learning_rate": 4.197271773347324e-08, "logps/chosen": -55.338077545166016, "logps/rejected": -91.476806640625, "loss": 0.1644, "losses/dpo": 0.18473079800605774, "losses/sft": 1.5740883350372314, "losses/total": 0.18473079800605774, "ref_logps/chosen": -33.325950622558594, "ref_logps/rejected": -48.24712371826172, "rewards/accuracies": 1.0, "rewards/chosen": -2.2012124061584473, "rewards/margins": 2.121755838394165, "rewards/rejected": -4.322968482971191, "step": 2937 }, { "epoch": 2.77, "grad_norm": 30.652555465698242, "learning_rate": 4.179783140958377e-08, "logps/chosen": -59.166534423828125, "logps/rejected": -92.97552490234375, "loss": 0.294, "losses/dpo": 0.3652971684932709, "losses/sft": 2.5270166397094727, "losses/total": 0.3652971684932709, "ref_logps/chosen": -36.86262512207031, "ref_logps/rejected": -48.15949249267578, "rewards/accuracies": 0.9375, "rewards/chosen": -2.230390787124634, "rewards/margins": 2.2512118816375732, "rewards/rejected": -4.481602668762207, "step": 2938 }, { "epoch": 2.78, "grad_norm": 13.035469055175781, "learning_rate": 4.16229450856943e-08, "logps/chosen": -59.53193664550781, "logps/rejected": -108.38406372070312, "loss": 0.0777, "losses/dpo": 0.056367017328739166, "losses/sft": 2.0953269004821777, "losses/total": 0.056367017328739166, "ref_logps/chosen": -36.9190559387207, "ref_logps/rejected": -55.25392532348633, "rewards/accuracies": 1.0, "rewards/chosen": -2.2612881660461426, "rewards/margins": 3.051725387573242, "rewards/rejected": -5.313013553619385, "step": 2939 }, { "epoch": 2.78, "grad_norm": 33.68614196777344, "learning_rate": 4.144805876180482e-08, "logps/chosen": -66.39884948730469, "logps/rejected": -81.25732421875, "loss": 0.303, "losses/dpo": 0.11013530939817429, "losses/sft": 2.38643479347229, "losses/total": 0.11013530939817429, "ref_logps/chosen": -43.77596664428711, "ref_logps/rejected": -39.08415222167969, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2622883319854736, "rewards/margins": 1.9550285339355469, "rewards/rejected": -4.217316627502441, "step": 2940 }, { "epoch": 2.78, "grad_norm": 19.431079864501953, "learning_rate": 4.127317243791535e-08, "logps/chosen": -50.00224304199219, "logps/rejected": -82.65943908691406, "loss": 0.193, "losses/dpo": 0.08156921714544296, "losses/sft": 1.507349967956543, "losses/total": 0.08156921714544296, "ref_logps/chosen": -34.5750617980957, "ref_logps/rejected": -45.306766510009766, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5427186489105225, "rewards/margins": 2.1925485134124756, "rewards/rejected": -3.735267162322998, "step": 2941 }, { "epoch": 2.78, "grad_norm": 21.818389892578125, "learning_rate": 4.109828611402588e-08, "logps/chosen": -69.96769714355469, "logps/rejected": -92.95931243896484, "loss": 0.2367, "losses/dpo": 0.412746787071228, "losses/sft": 1.8832069635391235, "losses/total": 0.412746787071228, "ref_logps/chosen": -42.7349853515625, "ref_logps/rejected": -44.36425018310547, "rewards/accuracies": 0.9375, "rewards/chosen": -2.723271369934082, "rewards/margins": 2.136235237121582, "rewards/rejected": -4.859506607055664, "step": 2942 }, { "epoch": 2.78, "grad_norm": 22.550331115722656, "learning_rate": 4.0923399790136414e-08, "logps/chosen": -65.5389404296875, "logps/rejected": -95.45225524902344, "loss": 0.2065, "losses/dpo": 0.1375802606344223, "losses/sft": 2.6348206996917725, "losses/total": 0.1375802606344223, "ref_logps/chosen": -40.30699157714844, "ref_logps/rejected": -50.36699676513672, "rewards/accuracies": 0.9375, "rewards/chosen": -2.523195266723633, "rewards/margins": 1.9853311777114868, "rewards/rejected": -4.50852632522583, "step": 2943 }, { "epoch": 2.78, "grad_norm": 27.819787979125977, "learning_rate": 4.0748513466246933e-08, "logps/chosen": -59.16255569458008, "logps/rejected": -91.4408187866211, "loss": 0.2817, "losses/dpo": 0.21156972646713257, "losses/sft": 2.3053195476531982, "losses/total": 0.21156972646713257, "ref_logps/chosen": -33.958587646484375, "ref_logps/rejected": -44.284446716308594, "rewards/accuracies": 0.875, "rewards/chosen": -2.520397186279297, "rewards/margins": 2.1952402591705322, "rewards/rejected": -4.715637683868408, "step": 2944 }, { "epoch": 2.78, "grad_norm": 32.6779670715332, "learning_rate": 4.0573627142357466e-08, "logps/chosen": -64.88963317871094, "logps/rejected": -87.98614501953125, "loss": 0.2971, "losses/dpo": 0.05765041336417198, "losses/sft": 1.792264699935913, "losses/total": 0.05765041336417198, "ref_logps/chosen": -42.00370788574219, "ref_logps/rejected": -46.18706512451172, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2885916233062744, "rewards/margins": 1.8913166522979736, "rewards/rejected": -4.179908275604248, "step": 2945 }, { "epoch": 2.78, "grad_norm": 28.0327205657959, "learning_rate": 4.0398740818468e-08, "logps/chosen": -58.09992218017578, "logps/rejected": -97.98863220214844, "loss": 0.2519, "losses/dpo": 0.2719775438308716, "losses/sft": 2.390690565109253, "losses/total": 0.2719775438308716, "ref_logps/chosen": -32.832759857177734, "ref_logps/rejected": -50.393028259277344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5267162322998047, "rewards/margins": 2.232844352722168, "rewards/rejected": -4.759560585021973, "step": 2946 }, { "epoch": 2.78, "grad_norm": 23.211795806884766, "learning_rate": 4.0223854494578524e-08, "logps/chosen": -59.732635498046875, "logps/rejected": -89.6614990234375, "loss": 0.2501, "losses/dpo": 0.14845877885818481, "losses/sft": 1.9149870872497559, "losses/total": 0.14845877885818481, "ref_logps/chosen": -39.32978820800781, "ref_logps/rejected": -50.03449249267578, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0402846336364746, "rewards/margins": 1.9224166870117188, "rewards/rejected": -3.9627015590667725, "step": 2947 }, { "epoch": 2.78, "grad_norm": 16.810693740844727, "learning_rate": 4.004896817068905e-08, "logps/chosen": -55.253021240234375, "logps/rejected": -95.67185974121094, "loss": 0.1394, "losses/dpo": 0.26655498147010803, "losses/sft": 1.4401518106460571, "losses/total": 0.26655498147010803, "ref_logps/chosen": -35.22863006591797, "ref_logps/rejected": -48.77995300292969, "rewards/accuracies": 1.0, "rewards/chosen": -2.002439022064209, "rewards/margins": 2.686751127243042, "rewards/rejected": -4.689189910888672, "step": 2948 }, { "epoch": 2.78, "grad_norm": 25.708545684814453, "learning_rate": 3.9874081846799576e-08, "logps/chosen": -55.23566436767578, "logps/rejected": -83.1280746459961, "loss": 0.2876, "losses/dpo": 0.08939340710639954, "losses/sft": 1.760914921760559, "losses/total": 0.08939340710639954, "ref_logps/chosen": -37.5933952331543, "ref_logps/rejected": -44.07594680786133, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7642266750335693, "rewards/margins": 2.1409859657287598, "rewards/rejected": -3.905212879180908, "step": 2949 }, { "epoch": 2.79, "grad_norm": 18.22446632385254, "learning_rate": 3.969919552291011e-08, "logps/chosen": -35.238712310791016, "logps/rejected": -66.91839599609375, "loss": 0.2692, "losses/dpo": 0.10803490877151489, "losses/sft": 1.8808940649032593, "losses/total": 0.10803490877151489, "ref_logps/chosen": -24.548784255981445, "ref_logps/rejected": -32.27580642700195, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0689928531646729, "rewards/margins": 2.395265817642212, "rewards/rejected": -3.4642586708068848, "step": 2950 }, { "epoch": 2.79, "grad_norm": 24.135835647583008, "learning_rate": 3.9524309199020634e-08, "logps/chosen": -48.49653244018555, "logps/rejected": -72.26859283447266, "loss": 0.2484, "losses/dpo": 0.1773497313261032, "losses/sft": 1.6088541746139526, "losses/total": 0.1773497313261032, "ref_logps/chosen": -32.05012893676758, "ref_logps/rejected": -35.045982360839844, "rewards/accuracies": 0.875, "rewards/chosen": -1.6446402072906494, "rewards/margins": 2.0776209831237793, "rewards/rejected": -3.7222609519958496, "step": 2951 }, { "epoch": 2.79, "grad_norm": 16.43662452697754, "learning_rate": 3.934942287513116e-08, "logps/chosen": -61.54220199584961, "logps/rejected": -94.9558334350586, "loss": 0.1711, "losses/dpo": 0.08148401975631714, "losses/sft": 2.9786620140075684, "losses/total": 0.08148401975631714, "ref_logps/chosen": -39.89582824707031, "ref_logps/rejected": -46.92484664916992, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1646370887756348, "rewards/margins": 2.6384620666503906, "rewards/rejected": -4.803099155426025, "step": 2952 }, { "epoch": 2.79, "grad_norm": 16.742591857910156, "learning_rate": 3.917453655124169e-08, "logps/chosen": -50.09156036376953, "logps/rejected": -75.986328125, "loss": 0.2142, "losses/dpo": 0.18750856816768646, "losses/sft": 2.043002128601074, "losses/total": 0.18750856816768646, "ref_logps/chosen": -33.27346420288086, "ref_logps/rejected": -37.96230697631836, "rewards/accuracies": 1.0, "rewards/chosen": -1.681809902191162, "rewards/margins": 2.120591640472412, "rewards/rejected": -3.802401542663574, "step": 2953 }, { "epoch": 2.79, "grad_norm": 14.109723091125488, "learning_rate": 3.8999650227352225e-08, "logps/chosen": -70.32637023925781, "logps/rejected": -127.77806091308594, "loss": 0.1114, "losses/dpo": 0.16246438026428223, "losses/sft": 3.349597215652466, "losses/total": 0.16246438026428223, "ref_logps/chosen": -46.12102127075195, "ref_logps/rejected": -70.36228942871094, "rewards/accuracies": 1.0, "rewards/chosen": -2.420535087585449, "rewards/margins": 3.3210411071777344, "rewards/rejected": -5.741575717926025, "step": 2954 }, { "epoch": 2.79, "grad_norm": 16.419734954833984, "learning_rate": 3.8824763903462745e-08, "logps/chosen": -58.6666374206543, "logps/rejected": -82.16273498535156, "loss": 0.1733, "losses/dpo": 0.19920210540294647, "losses/sft": 1.8356988430023193, "losses/total": 0.19920210540294647, "ref_logps/chosen": -35.208919525146484, "ref_logps/rejected": -37.94212341308594, "rewards/accuracies": 1.0, "rewards/chosen": -2.3457717895507812, "rewards/margins": 2.076289176940918, "rewards/rejected": -4.422060966491699, "step": 2955 }, { "epoch": 2.79, "grad_norm": 38.09503936767578, "learning_rate": 3.864987757957328e-08, "logps/chosen": -64.63117980957031, "logps/rejected": -83.3462905883789, "loss": 0.4737, "losses/dpo": 0.19685395061969757, "losses/sft": 3.2698616981506348, "losses/total": 0.19685395061969757, "ref_logps/chosen": -37.46269226074219, "ref_logps/rejected": -34.36955261230469, "rewards/accuracies": 0.8125, "rewards/chosen": -2.716848611831665, "rewards/margins": 2.1808247566223145, "rewards/rejected": -4.8976731300354, "step": 2956 }, { "epoch": 2.79, "grad_norm": 24.42434310913086, "learning_rate": 3.84749912556838e-08, "logps/chosen": -53.592613220214844, "logps/rejected": -71.39588928222656, "loss": 0.3187, "losses/dpo": 0.16775572299957275, "losses/sft": 2.2685396671295166, "losses/total": 0.16775572299957275, "ref_logps/chosen": -32.739906311035156, "ref_logps/rejected": -33.00611877441406, "rewards/accuracies": 0.8125, "rewards/chosen": -2.085270643234253, "rewards/margins": 1.7537059783935547, "rewards/rejected": -3.8389763832092285, "step": 2957 }, { "epoch": 2.79, "grad_norm": 27.867319107055664, "learning_rate": 3.830010493179433e-08, "logps/chosen": -46.6761360168457, "logps/rejected": -76.2200927734375, "loss": 0.3352, "losses/dpo": 0.37638312578201294, "losses/sft": 1.684161901473999, "losses/total": 0.37638312578201294, "ref_logps/chosen": -30.49056625366211, "ref_logps/rejected": -40.442466735839844, "rewards/accuracies": 0.75, "rewards/chosen": -1.6185567378997803, "rewards/margins": 1.9592053890228271, "rewards/rejected": -3.5777623653411865, "step": 2958 }, { "epoch": 2.79, "grad_norm": 33.544219970703125, "learning_rate": 3.812521860790486e-08, "logps/chosen": -65.1119613647461, "logps/rejected": -85.15020751953125, "loss": 0.3739, "losses/dpo": 0.2625933885574341, "losses/sft": 2.014401435852051, "losses/total": 0.2625933885574341, "ref_logps/chosen": -45.40331268310547, "ref_logps/rejected": -47.31351089477539, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9708645343780518, "rewards/margins": 1.81280517578125, "rewards/rejected": -3.783669948577881, "step": 2959 }, { "epoch": 2.8, "grad_norm": 42.03447723388672, "learning_rate": 3.795033228401539e-08, "logps/chosen": -59.89862060546875, "logps/rejected": -72.97151184082031, "loss": 0.4636, "losses/dpo": 0.20353052020072937, "losses/sft": 1.6133798360824585, "losses/total": 0.20353052020072937, "ref_logps/chosen": -40.813438415527344, "ref_logps/rejected": -38.648433685302734, "rewards/accuracies": 0.75, "rewards/chosen": -1.9085184335708618, "rewards/margins": 1.5237895250320435, "rewards/rejected": -3.432307720184326, "step": 2960 }, { "epoch": 2.8, "grad_norm": 8.90991497039795, "learning_rate": 3.777544596012592e-08, "logps/chosen": -54.34622573852539, "logps/rejected": -94.96410369873047, "loss": 0.0769, "losses/dpo": 0.07265409082174301, "losses/sft": 2.0861217975616455, "losses/total": 0.07265409082174301, "ref_logps/chosen": -37.764564514160156, "ref_logps/rejected": -48.592872619628906, "rewards/accuracies": 1.0, "rewards/chosen": -1.6581659317016602, "rewards/margins": 2.978957176208496, "rewards/rejected": -4.637123107910156, "step": 2961 }, { "epoch": 2.8, "grad_norm": 29.597333908081055, "learning_rate": 3.7600559636236446e-08, "logps/chosen": -45.32130432128906, "logps/rejected": -75.54582977294922, "loss": 0.3432, "losses/dpo": 0.17707788944244385, "losses/sft": 1.8706631660461426, "losses/total": 0.17707788944244385, "ref_logps/chosen": -28.261106491088867, "ref_logps/rejected": -38.920448303222656, "rewards/accuracies": 0.875, "rewards/chosen": -1.7060202360153198, "rewards/margins": 1.9565176963806152, "rewards/rejected": -3.6625380516052246, "step": 2962 }, { "epoch": 2.8, "grad_norm": 19.86264419555664, "learning_rate": 3.742567331234697e-08, "logps/chosen": -54.12257766723633, "logps/rejected": -88.94450378417969, "loss": 0.163, "losses/dpo": 0.0474557988345623, "losses/sft": 2.0292882919311523, "losses/total": 0.0474557988345623, "ref_logps/chosen": -35.720436096191406, "ref_logps/rejected": -42.519832611083984, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8402135372161865, "rewards/margins": 2.8022539615631104, "rewards/rejected": -4.642467975616455, "step": 2963 }, { "epoch": 2.8, "grad_norm": 33.89479064941406, "learning_rate": 3.7250786988457504e-08, "logps/chosen": -69.56108093261719, "logps/rejected": -84.26187133789062, "loss": 0.4123, "losses/dpo": 0.42647796869277954, "losses/sft": 2.724936008453369, "losses/total": 0.42647796869277954, "ref_logps/chosen": -45.12382125854492, "ref_logps/rejected": -39.894134521484375, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4437255859375, "rewards/margins": 1.9930481910705566, "rewards/rejected": -4.436773777008057, "step": 2964 }, { "epoch": 2.8, "grad_norm": 24.20273780822754, "learning_rate": 3.707590066456802e-08, "logps/chosen": -57.218040466308594, "logps/rejected": -90.99298095703125, "loss": 0.25, "losses/dpo": 0.1578081250190735, "losses/sft": 1.372912883758545, "losses/total": 0.1578081250190735, "ref_logps/chosen": -36.720550537109375, "ref_logps/rejected": -47.78184509277344, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0497493743896484, "rewards/margins": 2.271364212036133, "rewards/rejected": -4.321113586425781, "step": 2965 }, { "epoch": 2.8, "grad_norm": 18.949331283569336, "learning_rate": 3.6901014340678556e-08, "logps/chosen": -74.29554748535156, "logps/rejected": -115.29342651367188, "loss": 0.1696, "losses/dpo": 0.07121007144451141, "losses/sft": 2.2247562408447266, "losses/total": 0.07121007144451141, "ref_logps/chosen": -47.42454528808594, "ref_logps/rejected": -64.8714599609375, "rewards/accuracies": 1.0, "rewards/chosen": -2.6870999336242676, "rewards/margins": 2.3550961017608643, "rewards/rejected": -5.042195796966553, "step": 2966 }, { "epoch": 2.8, "grad_norm": 22.12060546875, "learning_rate": 3.672612801678909e-08, "logps/chosen": -52.25728225708008, "logps/rejected": -88.52669525146484, "loss": 0.1735, "losses/dpo": 0.3398432731628418, "losses/sft": 2.1218903064727783, "losses/total": 0.3398432731628418, "ref_logps/chosen": -32.640262603759766, "ref_logps/rejected": -42.928184509277344, "rewards/accuracies": 0.9375, "rewards/chosen": -1.96170175075531, "rewards/margins": 2.5981497764587402, "rewards/rejected": -4.55985164642334, "step": 2967 }, { "epoch": 2.8, "grad_norm": 29.145397186279297, "learning_rate": 3.6551241692899614e-08, "logps/chosen": -60.1705436706543, "logps/rejected": -102.53408813476562, "loss": 0.2463, "losses/dpo": 0.6488381028175354, "losses/sft": 1.8082618713378906, "losses/total": 0.6488381028175354, "ref_logps/chosen": -39.983062744140625, "ref_logps/rejected": -56.66352081298828, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0187482833862305, "rewards/margins": 2.568307876586914, "rewards/rejected": -4.5870561599731445, "step": 2968 }, { "epoch": 2.8, "grad_norm": 45.12929916381836, "learning_rate": 3.637635536901014e-08, "logps/chosen": -77.0869140625, "logps/rejected": -83.4021987915039, "loss": 0.4624, "losses/dpo": 1.125467300415039, "losses/sft": 2.4663610458374023, "losses/total": 1.125467300415039, "ref_logps/chosen": -47.52934265136719, "ref_logps/rejected": -40.71036911010742, "rewards/accuracies": 0.8125, "rewards/chosen": -2.955756664276123, "rewards/margins": 1.3134262561798096, "rewards/rejected": -4.269183158874512, "step": 2969 }, { "epoch": 2.8, "grad_norm": 33.336448669433594, "learning_rate": 3.620146904512067e-08, "logps/chosen": -67.48674011230469, "logps/rejected": -104.83739471435547, "loss": 0.2856, "losses/dpo": 0.5604097843170166, "losses/sft": 2.52388858795166, "losses/total": 0.5604097843170166, "ref_logps/chosen": -45.390689849853516, "ref_logps/rejected": -58.22635269165039, "rewards/accuracies": 0.875, "rewards/chosen": -2.2096049785614014, "rewards/margins": 2.4514989852905273, "rewards/rejected": -4.661104202270508, "step": 2970 }, { "epoch": 2.81, "grad_norm": 37.19539260864258, "learning_rate": 3.60265827212312e-08, "logps/chosen": -64.23799896240234, "logps/rejected": -72.99726867675781, "loss": 0.5128, "losses/dpo": 0.291890025138855, "losses/sft": 2.6363255977630615, "losses/total": 0.291890025138855, "ref_logps/chosen": -40.86454772949219, "ref_logps/rejected": -40.237030029296875, "rewards/accuracies": 0.75, "rewards/chosen": -2.3373451232910156, "rewards/margins": 0.938678503036499, "rewards/rejected": -3.2760238647460938, "step": 2971 }, { "epoch": 2.81, "grad_norm": 36.78740692138672, "learning_rate": 3.585169639734173e-08, "logps/chosen": -51.64378356933594, "logps/rejected": -76.26048278808594, "loss": 0.4277, "losses/dpo": 0.7999288439750671, "losses/sft": 1.7480086088180542, "losses/total": 0.7999288439750671, "ref_logps/chosen": -31.028268814086914, "ref_logps/rejected": -38.83718490600586, "rewards/accuracies": 0.8125, "rewards/chosen": -2.061551570892334, "rewards/margins": 1.6807782649993896, "rewards/rejected": -3.7423300743103027, "step": 2972 }, { "epoch": 2.81, "grad_norm": 26.65422821044922, "learning_rate": 3.567681007345225e-08, "logps/chosen": -61.479705810546875, "logps/rejected": -96.65974426269531, "loss": 0.2269, "losses/dpo": 0.13492417335510254, "losses/sft": 2.4486684799194336, "losses/total": 0.13492417335510254, "ref_logps/chosen": -42.64320373535156, "ref_logps/rejected": -52.526371002197266, "rewards/accuracies": 0.8125, "rewards/chosen": -1.883650779724121, "rewards/margins": 2.529686689376831, "rewards/rejected": -4.413337230682373, "step": 2973 }, { "epoch": 2.81, "grad_norm": 25.414209365844727, "learning_rate": 3.550192374956278e-08, "logps/chosen": -66.7447509765625, "logps/rejected": -90.71731567382812, "loss": 0.3875, "losses/dpo": 0.10386046022176743, "losses/sft": 1.5561758279800415, "losses/total": 0.10386046022176743, "ref_logps/chosen": -45.63017654418945, "ref_logps/rejected": -47.700557708740234, "rewards/accuracies": 0.75, "rewards/chosen": -2.1114578247070312, "rewards/margins": 2.190218687057495, "rewards/rejected": -4.3016767501831055, "step": 2974 }, { "epoch": 2.81, "grad_norm": 31.883968353271484, "learning_rate": 3.5327037425673315e-08, "logps/chosen": -51.42143249511719, "logps/rejected": -58.91716003417969, "loss": 0.4093, "losses/dpo": 0.3280029296875, "losses/sft": 1.6561799049377441, "losses/total": 0.3280029296875, "ref_logps/chosen": -31.118148803710938, "ref_logps/rejected": -26.301490783691406, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0303287506103516, "rewards/margins": 1.231238603591919, "rewards/rejected": -3.2615673542022705, "step": 2975 }, { "epoch": 2.81, "grad_norm": 18.696861267089844, "learning_rate": 3.5152151101783834e-08, "logps/chosen": -62.46991729736328, "logps/rejected": -110.99563598632812, "loss": 0.1321, "losses/dpo": 0.14700421690940857, "losses/sft": 1.9267423152923584, "losses/total": 0.14700421690940857, "ref_logps/chosen": -41.423683166503906, "ref_logps/rejected": -58.88855743408203, "rewards/accuracies": 1.0, "rewards/chosen": -2.1046228408813477, "rewards/margins": 3.1060850620269775, "rewards/rejected": -5.210708141326904, "step": 2976 }, { "epoch": 2.81, "grad_norm": 32.2402229309082, "learning_rate": 3.497726477789437e-08, "logps/chosen": -57.33751678466797, "logps/rejected": -98.6733169555664, "loss": 0.2619, "losses/dpo": 0.09131017327308655, "losses/sft": 2.0977137088775635, "losses/total": 0.09131017327308655, "ref_logps/chosen": -35.32010269165039, "ref_logps/rejected": -51.638694763183594, "rewards/accuracies": 0.875, "rewards/chosen": -2.2017414569854736, "rewards/margins": 2.501720905303955, "rewards/rejected": -4.703462600708008, "step": 2977 }, { "epoch": 2.81, "grad_norm": 20.025590896606445, "learning_rate": 3.48023784540049e-08, "logps/chosen": -54.157447814941406, "logps/rejected": -86.65713500976562, "loss": 0.1588, "losses/dpo": 0.2667253017425537, "losses/sft": 1.8768854141235352, "losses/total": 0.2667253017425537, "ref_logps/chosen": -39.21525573730469, "ref_logps/rejected": -44.985740661621094, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4942193031311035, "rewards/margins": 2.6729202270507812, "rewards/rejected": -4.167139530181885, "step": 2978 }, { "epoch": 2.81, "grad_norm": 35.18009567260742, "learning_rate": 3.4627492130115425e-08, "logps/chosen": -60.42721176147461, "logps/rejected": -101.02044677734375, "loss": 0.2923, "losses/dpo": 0.49971938133239746, "losses/sft": 2.3838436603546143, "losses/total": 0.49971938133239746, "ref_logps/chosen": -36.92533874511719, "ref_logps/rejected": -52.1884765625, "rewards/accuracies": 0.75, "rewards/chosen": -2.350187301635742, "rewards/margins": 2.5330095291137695, "rewards/rejected": -4.883196830749512, "step": 2979 }, { "epoch": 2.81, "grad_norm": 18.57384490966797, "learning_rate": 3.445260580622595e-08, "logps/chosen": -62.590736389160156, "logps/rejected": -87.78916931152344, "loss": 0.1762, "losses/dpo": 0.04849696531891823, "losses/sft": 1.909379482269287, "losses/total": 0.04849696531891823, "ref_logps/chosen": -42.78089904785156, "ref_logps/rejected": -41.12408447265625, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9809837341308594, "rewards/margins": 2.6855244636535645, "rewards/rejected": -4.666508197784424, "step": 2980 }, { "epoch": 2.81, "grad_norm": 24.224576950073242, "learning_rate": 3.427771948233648e-08, "logps/chosen": -57.722599029541016, "logps/rejected": -90.49748992919922, "loss": 0.2004, "losses/dpo": 0.23663319647312164, "losses/sft": 1.8057953119277954, "losses/total": 0.23663319647312164, "ref_logps/chosen": -36.739173889160156, "ref_logps/rejected": -46.24281311035156, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0983424186706543, "rewards/margins": 2.327125310897827, "rewards/rejected": -4.425467491149902, "step": 2981 }, { "epoch": 2.82, "grad_norm": 16.721393585205078, "learning_rate": 3.410283315844701e-08, "logps/chosen": -50.42713165283203, "logps/rejected": -97.43521118164062, "loss": 0.1399, "losses/dpo": 0.1284208744764328, "losses/sft": 2.3800570964813232, "losses/total": 0.1284208744764328, "ref_logps/chosen": -37.059600830078125, "ref_logps/rejected": -50.89272689819336, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3367533683776855, "rewards/margins": 3.3174943923950195, "rewards/rejected": -4.654247760772705, "step": 2982 }, { "epoch": 2.82, "grad_norm": 18.984500885009766, "learning_rate": 3.3927946834557535e-08, "logps/chosen": -55.87278747558594, "logps/rejected": -93.3619384765625, "loss": 0.2242, "losses/dpo": 0.06402628868818283, "losses/sft": 2.0018904209136963, "losses/total": 0.06402628868818283, "ref_logps/chosen": -37.17364501953125, "ref_logps/rejected": -48.63155746459961, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8699142932891846, "rewards/margins": 2.6031241416931152, "rewards/rejected": -4.473038196563721, "step": 2983 }, { "epoch": 2.82, "grad_norm": 18.34654426574707, "learning_rate": 3.375306051066806e-08, "logps/chosen": -54.484527587890625, "logps/rejected": -98.38478088378906, "loss": 0.1873, "losses/dpo": 0.060594167560338974, "losses/sft": 2.0047285556793213, "losses/total": 0.060594167560338974, "ref_logps/chosen": -32.541080474853516, "ref_logps/rejected": -48.39060592651367, "rewards/accuracies": 0.9375, "rewards/chosen": -2.194344997406006, "rewards/margins": 2.805072546005249, "rewards/rejected": -4.999417304992676, "step": 2984 }, { "epoch": 2.82, "grad_norm": 19.110803604125977, "learning_rate": 3.3578174186778594e-08, "logps/chosen": -59.22347640991211, "logps/rejected": -93.82707977294922, "loss": 0.1775, "losses/dpo": 0.06492815911769867, "losses/sft": 2.290909767150879, "losses/total": 0.06492815911769867, "ref_logps/chosen": -38.76340103149414, "ref_logps/rejected": -47.39862060546875, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0460081100463867, "rewards/margins": 2.5968379974365234, "rewards/rejected": -4.64284610748291, "step": 2985 }, { "epoch": 2.82, "grad_norm": 14.808489799499512, "learning_rate": 3.3403287862889126e-08, "logps/chosen": -54.92408752441406, "logps/rejected": -97.23770141601562, "loss": 0.1103, "losses/dpo": 0.056466784328222275, "losses/sft": 2.340762138366699, "losses/total": 0.056466784328222275, "ref_logps/chosen": -36.700828552246094, "ref_logps/rejected": -51.280738830566406, "rewards/accuracies": 1.0, "rewards/chosen": -1.8223259449005127, "rewards/margins": 2.7733707427978516, "rewards/rejected": -4.595696926116943, "step": 2986 }, { "epoch": 2.82, "grad_norm": 28.081100463867188, "learning_rate": 3.3228401538999646e-08, "logps/chosen": -67.30946350097656, "logps/rejected": -85.007568359375, "loss": 0.2949, "losses/dpo": 0.2029864341020584, "losses/sft": 2.2702720165252686, "losses/total": 0.2029864341020584, "ref_logps/chosen": -40.41299819946289, "ref_logps/rejected": -39.83832550048828, "rewards/accuracies": 0.875, "rewards/chosen": -2.689646005630493, "rewards/margins": 1.8272786140441895, "rewards/rejected": -4.516924858093262, "step": 2987 }, { "epoch": 2.82, "grad_norm": 26.84359359741211, "learning_rate": 3.305351521511018e-08, "logps/chosen": -71.78427124023438, "logps/rejected": -83.98118591308594, "loss": 0.257, "losses/dpo": 0.09366091340780258, "losses/sft": 2.0670480728149414, "losses/total": 0.09366091340780258, "ref_logps/chosen": -47.53776550292969, "ref_logps/rejected": -41.261619567871094, "rewards/accuracies": 0.875, "rewards/chosen": -2.4246511459350586, "rewards/margins": 1.8473058938980103, "rewards/rejected": -4.271956920623779, "step": 2988 }, { "epoch": 2.82, "grad_norm": 22.925477981567383, "learning_rate": 3.2878628891220704e-08, "logps/chosen": -57.029563903808594, "logps/rejected": -101.94001007080078, "loss": 0.2671, "losses/dpo": 0.19132955372333527, "losses/sft": 2.351469039916992, "losses/total": 0.19132955372333527, "ref_logps/chosen": -32.821189880371094, "ref_logps/rejected": -54.15678405761719, "rewards/accuracies": 0.8125, "rewards/chosen": -2.42083740234375, "rewards/margins": 2.357485294342041, "rewards/rejected": -4.778322219848633, "step": 2989 }, { "epoch": 2.82, "grad_norm": 15.527331352233887, "learning_rate": 3.270374256733123e-08, "logps/chosen": -59.65545654296875, "logps/rejected": -92.93575286865234, "loss": 0.1897, "losses/dpo": 0.20035426318645477, "losses/sft": 1.5696300268173218, "losses/total": 0.20035426318645477, "ref_logps/chosen": -37.63036346435547, "ref_logps/rejected": -48.36811065673828, "rewards/accuracies": 1.0, "rewards/chosen": -2.202509880065918, "rewards/margins": 2.25425386428833, "rewards/rejected": -4.456763744354248, "step": 2990 }, { "epoch": 2.82, "grad_norm": 34.997982025146484, "learning_rate": 3.252885624344176e-08, "logps/chosen": -53.74292755126953, "logps/rejected": -73.917236328125, "loss": 0.4323, "losses/dpo": 0.14197325706481934, "losses/sft": 1.5360971689224243, "losses/total": 0.14197325706481934, "ref_logps/chosen": -34.2266731262207, "ref_logps/rejected": -40.91029739379883, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9516255855560303, "rewards/margins": 1.3490679264068604, "rewards/rejected": -3.3006935119628906, "step": 2991 }, { "epoch": 2.83, "grad_norm": 24.652267456054688, "learning_rate": 3.235396991955229e-08, "logps/chosen": -56.34262466430664, "logps/rejected": -77.86944580078125, "loss": 0.2396, "losses/dpo": 0.33956053853034973, "losses/sft": 2.0223419666290283, "losses/total": 0.33956053853034973, "ref_logps/chosen": -35.858177185058594, "ref_logps/rejected": -34.715274810791016, "rewards/accuracies": 0.875, "rewards/chosen": -2.048444986343384, "rewards/margins": 2.2669718265533447, "rewards/rejected": -4.3154168128967285, "step": 2992 }, { "epoch": 2.83, "grad_norm": 20.87251853942871, "learning_rate": 3.217908359566282e-08, "logps/chosen": -75.1152114868164, "logps/rejected": -113.919677734375, "loss": 0.1501, "losses/dpo": 0.040977198630571365, "losses/sft": 2.842691659927368, "losses/total": 0.040977198630571365, "ref_logps/chosen": -48.413238525390625, "ref_logps/rejected": -61.866790771484375, "rewards/accuracies": 1.0, "rewards/chosen": -2.6701972484588623, "rewards/margins": 2.5350914001464844, "rewards/rejected": -5.205288410186768, "step": 2993 }, { "epoch": 2.83, "grad_norm": 21.815921783447266, "learning_rate": 3.2004197271773347e-08, "logps/chosen": -59.997947692871094, "logps/rejected": -87.62559509277344, "loss": 0.2309, "losses/dpo": 0.41971027851104736, "losses/sft": 1.6817584037780762, "losses/total": 0.41971027851104736, "ref_logps/chosen": -40.48411560058594, "ref_logps/rejected": -46.842369079589844, "rewards/accuracies": 0.9375, "rewards/chosen": -1.951383113861084, "rewards/margins": 2.1269400119781494, "rewards/rejected": -4.0783233642578125, "step": 2994 }, { "epoch": 2.83, "grad_norm": 26.691635131835938, "learning_rate": 3.182931094788387e-08, "logps/chosen": -61.08686828613281, "logps/rejected": -94.94357299804688, "loss": 0.303, "losses/dpo": 0.11571475863456726, "losses/sft": 2.303887128829956, "losses/total": 0.11571475863456726, "ref_logps/chosen": -39.1475830078125, "ref_logps/rejected": -49.49618911743164, "rewards/accuracies": 0.875, "rewards/chosen": -2.1939287185668945, "rewards/margins": 2.350809335708618, "rewards/rejected": -4.544737815856934, "step": 2995 }, { "epoch": 2.83, "grad_norm": 26.821645736694336, "learning_rate": 3.1654424623994405e-08, "logps/chosen": -55.198997497558594, "logps/rejected": -75.54061889648438, "loss": 0.281, "losses/dpo": 0.4803215265274048, "losses/sft": 2.131777048110962, "losses/total": 0.4803215265274048, "ref_logps/chosen": -33.7226676940918, "ref_logps/rejected": -35.69221878051758, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1476328372955322, "rewards/margins": 1.8372067213058472, "rewards/rejected": -3.984839677810669, "step": 2996 }, { "epoch": 2.83, "grad_norm": 19.113996505737305, "learning_rate": 3.1479538300104924e-08, "logps/chosen": -61.06296157836914, "logps/rejected": -98.74284362792969, "loss": 0.1654, "losses/dpo": 0.18672429025173187, "losses/sft": 2.309131145477295, "losses/total": 0.18672429025173187, "ref_logps/chosen": -40.02710723876953, "ref_logps/rejected": -52.59275817871094, "rewards/accuracies": 1.0, "rewards/chosen": -2.103585720062256, "rewards/margins": 2.511423110961914, "rewards/rejected": -4.61500883102417, "step": 2997 }, { "epoch": 2.83, "grad_norm": 22.21666717529297, "learning_rate": 3.130465197621546e-08, "logps/chosen": -46.25444793701172, "logps/rejected": -73.90019226074219, "loss": 0.2209, "losses/dpo": 0.35864341259002686, "losses/sft": 2.213911294937134, "losses/total": 0.35864341259002686, "ref_logps/chosen": -29.645763397216797, "ref_logps/rejected": -36.79693603515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6608679294586182, "rewards/margins": 2.0494577884674072, "rewards/rejected": -3.7103259563446045, "step": 2998 }, { "epoch": 2.83, "grad_norm": 22.629308700561523, "learning_rate": 3.112976565232599e-08, "logps/chosen": -57.04795837402344, "logps/rejected": -71.7969970703125, "loss": 0.2753, "losses/dpo": 0.2649767994880676, "losses/sft": 1.9152793884277344, "losses/total": 0.2649767994880676, "ref_logps/chosen": -40.077362060546875, "ref_logps/rejected": -38.61092758178711, "rewards/accuracies": 0.875, "rewards/chosen": -1.6970595121383667, "rewards/margins": 1.6215471029281616, "rewards/rejected": -3.3186066150665283, "step": 2999 }, { "epoch": 2.83, "grad_norm": 29.257476806640625, "learning_rate": 3.0954879328436515e-08, "logps/chosen": -64.85026550292969, "logps/rejected": -99.04454803466797, "loss": 0.2222, "losses/dpo": 0.2852153182029724, "losses/sft": 2.0283477306365967, "losses/total": 0.2852153182029724, "ref_logps/chosen": -37.707908630371094, "ref_logps/rejected": -47.53254318237305, "rewards/accuracies": 0.875, "rewards/chosen": -2.714235782623291, "rewards/margins": 2.436964273452759, "rewards/rejected": -5.151200294494629, "step": 3000 } ], "logging_steps": 1.0, "max_steps": 3177, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }