diff --git "a/checkpoint-500/trainer_state.json" "b/checkpoint-500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-500/trainer_state.json" @@ -0,0 +1,7533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1827318410232983, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003654636820465966, + "grad_norm": 70.50516510009766, + "kl": 0.0, + "learning_rate": 1e-05, + "logits/chosen": -66672234.666666664, + "logits/rejected": -85497435.42857143, + "logps/chosen": -414.2180447048611, + "logps/rejected": -344.10721261160717, + "loss": 0.275, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0007309273640931932, + "grad_norm": 69.13626861572266, + "kl": 0.0, + "learning_rate": 2e-05, + "logits/chosen": -66327543.46666667, + "logits/rejected": -48240937.4117647, + "logps/chosen": -422.21435546875, + "logps/rejected": -276.88039981617646, + "loss": 0.3125, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0010963910461397899, + "grad_norm": 94.15017700195312, + "kl": 0.10230207443237305, + "learning_rate": 3e-05, + "logits/chosen": -77653326.76923077, + "logits/rejected": -69068126.31578948, + "logps/chosen": -427.74459134615387, + "logps/rejected": -336.28831722861844, + "loss": 0.3289, + "rewards/chosen": -0.02957458679492657, + "rewards/margins": 0.010732028407123888, + "rewards/rejected": -0.04030661520205046, + "step": 3 + }, + { + "epoch": 0.0014618547281863865, + "grad_norm": 99.499267578125, + "kl": 0.0018157958984375, + "learning_rate": 4e-05, + "logits/chosen": -63872645.333333336, + "logits/rejected": -70730592.0, + "logps/chosen": -430.9491780598958, + "logps/rejected": -360.9772705078125, + "loss": 0.3189, + "rewards/chosen": -0.022309874494870503, + "rewards/margins": 0.18317682842413582, + "rewards/rejected": -0.20548670291900634, + "step": 4 + }, + { + "epoch": 0.001827318410232983, + "grad_norm": 103.61483764648438, + "kl": 0.012647151947021484, + "learning_rate": 5e-05, + "logits/chosen": -60096945.23076923, + "logits/rejected": -65103366.7368421, + "logps/chosen": -292.55611478365387, + "logps/rejected": -460.35911800986844, + "loss": 0.2501, + "rewards/chosen": -0.13560827878805307, + "rewards/margins": 0.5146523955379904, + "rewards/rejected": -0.6502606743260434, + "step": 5 + }, + { + "epoch": 0.0021927820922795797, + "grad_norm": 64.95700073242188, + "kl": 0.0, + "learning_rate": 6e-05, + "logits/chosen": -89392679.38461539, + "logits/rejected": -54245170.526315786, + "logps/chosen": -412.2142803485577, + "logps/rejected": -375.92197779605266, + "loss": 0.1876, + "rewards/chosen": -0.5762283618633564, + "rewards/margins": 0.7616908733661358, + "rewards/rejected": -1.3379192352294922, + "step": 6 + }, + { + "epoch": 0.0025582457743261763, + "grad_norm": 21.49436378479004, + "kl": 0.0, + "learning_rate": 7e-05, + "logits/chosen": -72870521.26315789, + "logits/rejected": -79904659.6923077, + "logps/chosen": -391.34200246710526, + "logps/rejected": -518.6966271033654, + "loss": 0.1233, + "rewards/chosen": -1.5921392942729748, + "rewards/margins": 2.0520765096069833, + "rewards/rejected": -3.644215803879958, + "step": 7 + }, + { + "epoch": 0.002923709456372773, + "grad_norm": 15.191149711608887, + "kl": 0.0, + "learning_rate": 8e-05, + "logits/chosen": -80619752.0, + "logits/rejected": -63836768.0, + "logps/chosen": -438.9120788574219, + "logps/rejected": -462.61907958984375, + "loss": 0.1209, + "rewards/chosen": -2.851921319961548, + "rewards/margins": 2.961158037185669, + "rewards/rejected": -5.813079357147217, + "step": 8 + }, + { + "epoch": 0.0032891731384193696, + "grad_norm": 10.538928031921387, + "kl": 0.0, + "learning_rate": 9e-05, + "logits/chosen": -69753120.0, + "logits/rejected": -67056665.6, + "logps/chosen": -415.3586832682292, + "logps/rejected": -357.4203125, + "loss": 0.0958, + "rewards/chosen": -4.56794802347819, + "rewards/margins": 3.072142155965169, + "rewards/rejected": -7.6400901794433596, + "step": 9 + }, + { + "epoch": 0.003654636820465966, + "grad_norm": 3.222597122192383, + "kl": 0.0, + "learning_rate": 0.0001, + "logits/chosen": -84851421.86666666, + "logits/rejected": -61368357.64705882, + "logps/chosen": -384.41090494791666, + "logps/rejected": -465.8205135569853, + "loss": 0.0918, + "rewards/chosen": -6.086501057942709, + "rewards/margins": 7.457770074582568, + "rewards/rejected": -13.544271132525276, + "step": 10 + }, + { + "epoch": 0.004020100502512563, + "grad_norm": 0.7196786403656006, + "kl": 0.0, + "learning_rate": 9.99989723479183e-05, + "logits/chosen": -81370248.0, + "logits/rejected": -91569408.0, + "logps/chosen": -442.4510803222656, + "logps/rejected": -516.6836547851562, + "loss": 0.1001, + "rewards/chosen": -10.411160469055176, + "rewards/margins": 11.409586906433105, + "rewards/rejected": -21.82074737548828, + "step": 11 + }, + { + "epoch": 0.004385564184559159, + "grad_norm": 2.4945011138916016, + "kl": 0.0, + "learning_rate": 9.999588943391597e-05, + "logits/chosen": -118624777.14285715, + "logits/rejected": -81275242.66666667, + "logps/chosen": -437.8573521205357, + "logps/rejected": -597.7122938368055, + "loss": 0.0845, + "rewards/chosen": -10.227762494768415, + "rewards/margins": 20.10963645813957, + "rewards/rejected": -30.337398952907986, + "step": 12 + }, + { + "epoch": 0.0047510278666057565, + "grad_norm": 0.59647136926651, + "kl": 0.0, + "learning_rate": 9.999075138471951e-05, + "logits/chosen": -115453986.13333334, + "logits/rejected": -76846275.76470588, + "logps/chosen": -453.15276692708335, + "logps/rejected": -557.3473690257352, + "loss": 0.0934, + "rewards/chosen": -13.898647054036458, + "rewards/margins": 14.38333895814185, + "rewards/rejected": -28.281986012178308, + "step": 13 + }, + { + "epoch": 0.005116491548652353, + "grad_norm": 4.384544849395752, + "kl": 0.0, + "learning_rate": 9.9983558411534e-05, + "logits/chosen": -112966163.6923077, + "logits/rejected": -103012412.63157895, + "logps/chosen": -486.83071664663464, + "logps/rejected": -487.32252261513156, + "loss": 0.0863, + "rewards/chosen": -14.020332923302284, + "rewards/margins": 13.25188708594936, + "rewards/rejected": -27.272220009251644, + "step": 14 + }, + { + "epoch": 0.00548195523069895, + "grad_norm": 0.7163640856742859, + "kl": 0.0, + "learning_rate": 9.99743108100344e-05, + "logits/chosen": -101596401.77777778, + "logits/rejected": -95033782.85714285, + "logps/chosen": -531.5575629340278, + "logps/rejected": -488.21561104910717, + "loss": 0.1123, + "rewards/chosen": -14.693433973524305, + "rewards/margins": 14.402351984902035, + "rewards/rejected": -29.09578595842634, + "step": 15 + }, + { + "epoch": 0.005847418912745546, + "grad_norm": 2.3758387565612793, + "kl": 0.0, + "learning_rate": 9.996300896035339e-05, + "logits/chosen": -66563285.333333336, + "logits/rejected": -116772242.28571428, + "logps/chosen": -412.4872775607639, + "logps/rejected": -560.7509765625, + "loss": 0.1082, + "rewards/chosen": -11.090181986490885, + "rewards/margins": 20.835895719982332, + "rewards/rejected": -31.926077706473215, + "step": 16 + }, + { + "epoch": 0.006212882594792143, + "grad_norm": 4.935825824737549, + "kl": 0.0, + "learning_rate": 9.994965332706573e-05, + "logits/chosen": -95241365.33333333, + "logits/rejected": -72146628.57142857, + "logps/chosen": -357.1007486979167, + "logps/rejected": -425.796630859375, + "loss": 0.1061, + "rewards/chosen": -10.44845920138889, + "rewards/margins": 16.569448077489458, + "rewards/rejected": -27.017907278878347, + "step": 17 + }, + { + "epoch": 0.006578346276838739, + "grad_norm": 2.1070468425750732, + "kl": 0.0, + "learning_rate": 9.993424445916923e-05, + "logits/chosen": -95185226.66666667, + "logits/rejected": -59045593.6, + "logps/chosen": -488.2670491536458, + "logps/rejected": -467.557177734375, + "loss": 0.0739, + "rewards/chosen": -12.333049774169922, + "rewards/margins": 15.72508010864258, + "rewards/rejected": -28.0581298828125, + "step": 18 + }, + { + "epoch": 0.006943809958885336, + "grad_norm": 2.696626901626587, + "kl": 0.0, + "learning_rate": 9.991678299006205e-05, + "logits/chosen": -100491392.0, + "logits/rejected": -78168736.0, + "logps/chosen": -447.7462972005208, + "logps/rejected": -548.93388671875, + "loss": 0.0735, + "rewards/chosen": -9.27017593383789, + "rewards/margins": 20.79569320678711, + "rewards/rejected": -30.065869140625, + "step": 19 + }, + { + "epoch": 0.007309273640931932, + "grad_norm": 7.118461608886719, + "kl": 0.0, + "learning_rate": 9.989726963751682e-05, + "logits/chosen": -79133110.85714285, + "logits/rejected": -78483392.0, + "logps/chosen": -368.60341099330356, + "logps/rejected": -551.7664388020834, + "loss": 0.0808, + "rewards/chosen": -5.415279933384487, + "rewards/margins": 22.936009603833394, + "rewards/rejected": -28.35128953721788, + "step": 20 + }, + { + "epoch": 0.007674737322978529, + "grad_norm": 14.0170259475708, + "kl": 0.0, + "learning_rate": 9.987570520365104e-05, + "logits/chosen": -94006579.2, + "logits/rejected": -65066563.76470588, + "logps/chosen": -470.67978515625, + "logps/rejected": -464.52550551470586, + "loss": 0.0782, + "rewards/chosen": -3.98671137491862, + "rewards/margins": 18.300774981928807, + "rewards/rejected": -22.287486356847428, + "step": 21 + }, + { + "epoch": 0.008040201005025126, + "grad_norm": 11.112717628479004, + "kl": 0.0, + "learning_rate": 9.98520905748941e-05, + "logits/chosen": -58957184.0, + "logits/rejected": -82073927.1111111, + "logps/chosen": -377.43223353794644, + "logps/rejected": -580.0238715277778, + "loss": 0.0608, + "rewards/chosen": -1.3486518859863281, + "rewards/margins": 23.146312713623047, + "rewards/rejected": -24.494964599609375, + "step": 22 + }, + { + "epoch": 0.008405664687071723, + "grad_norm": 11.628118515014648, + "kl": 0.5444526672363281, + "learning_rate": 9.982642672195092e-05, + "logits/chosen": -73654023.52941176, + "logits/rejected": -45703005.86666667, + "logps/chosen": -409.02473000919116, + "logps/rejected": -438.523046875, + "loss": 0.0573, + "rewards/chosen": -0.02788083693560432, + "rewards/margins": 15.746053082335228, + "rewards/rejected": -15.773933919270833, + "step": 23 + }, + { + "epoch": 0.008771128369118319, + "grad_norm": 19.863908767700195, + "kl": 1.022608757019043, + "learning_rate": 9.979871469976196e-05, + "logits/chosen": -53002395.82608695, + "logits/rejected": -62163328.0, + "logps/chosen": -303.9137015964674, + "logps/rejected": -471.96875, + "loss": 0.0552, + "rewards/chosen": 2.6770420903744907, + "rewards/margins": 15.497729278417024, + "rewards/rejected": -12.820687188042534, + "step": 24 + }, + { + "epoch": 0.009136592051164915, + "grad_norm": 5.07356071472168, + "kl": 1.4929475784301758, + "learning_rate": 9.976895564745991e-05, + "logits/chosen": -52722089.14285714, + "logits/rejected": -63057265.777777776, + "logps/chosen": -374.541015625, + "logps/rejected": -504.29741753472223, + "loss": 0.0122, + "rewards/chosen": 5.3456540788922995, + "rewards/margins": 18.508099994962176, + "rewards/rejected": -13.162445916069878, + "step": 25 + }, + { + "epoch": 0.009502055733211513, + "grad_norm": 9.457200050354004, + "kl": 4.141006946563721, + "learning_rate": 9.973715078832288e-05, + "logits/chosen": -57916832.0, + "logits/rejected": -60996544.0, + "logps/chosen": -423.02412109375, + "logps/rejected": -527.6825764973959, + "loss": 0.0382, + "rewards/chosen": 4.26049575805664, + "rewards/margins": 15.99791997273763, + "rewards/rejected": -11.73742421468099, + "step": 26 + }, + { + "epoch": 0.009867519415258109, + "grad_norm": 16.997724533081055, + "kl": 4.360161781311035, + "learning_rate": 9.970330142972401e-05, + "logits/chosen": -59864072.53333333, + "logits/rejected": -39135585.88235294, + "logps/chosen": -592.0302083333333, + "logps/rejected": -396.6316923253676, + "loss": 0.0187, + "rewards/chosen": 6.339927673339844, + "rewards/margins": 13.3125659718233, + "rewards/rejected": -6.972638298483456, + "step": 27 + }, + { + "epoch": 0.010232983097304705, + "grad_norm": 9.559985160827637, + "kl": 2.8432774543762207, + "learning_rate": 9.966740896307791e-05, + "logits/chosen": -32762090.0, + "logits/rejected": -64243928.0, + "logps/chosen": -359.212158203125, + "logps/rejected": -476.642822265625, + "loss": 0.0543, + "rewards/chosen": 5.413540363311768, + "rewards/margins": 12.094419002532959, + "rewards/rejected": -6.680878639221191, + "step": 28 + }, + { + "epoch": 0.010598446779351301, + "grad_norm": 9.041191101074219, + "kl": 5.258052825927734, + "learning_rate": 9.962947486378326e-05, + "logits/chosen": -63143973.64705882, + "logits/rejected": -46839790.93333333, + "logps/chosen": -453.3519646139706, + "logps/rejected": -312.64703776041665, + "loss": 0.0284, + "rewards/chosen": 6.458764917710248, + "rewards/margins": 14.038505823471967, + "rewards/rejected": -7.579740905761719, + "step": 29 + }, + { + "epoch": 0.0109639104613979, + "grad_norm": 19.824125289916992, + "kl": 3.294300079345703, + "learning_rate": 9.95895006911623e-05, + "logits/chosen": -46934882.461538464, + "logits/rejected": -52136461.473684214, + "logps/chosen": -373.4299128605769, + "logps/rejected": -472.3502261513158, + "loss": 0.0807, + "rewards/chosen": 5.015818082369291, + "rewards/margins": 12.79021046132694, + "rewards/rejected": -7.774392378957648, + "step": 30 + }, + { + "epoch": 0.011329374143444496, + "grad_norm": 6.117930889129639, + "kl": 3.7903122901916504, + "learning_rate": 9.954748808839674e-05, + "logits/chosen": -55276661.89473684, + "logits/rejected": -61740268.307692304, + "logps/chosen": -380.1571751644737, + "logps/rejected": -421.41811899038464, + "loss": 0.0218, + "rewards/chosen": 5.947867142526727, + "rewards/margins": 13.426362813725646, + "rewards/rejected": -7.478495671198918, + "step": 31 + }, + { + "epoch": 0.011694837825491092, + "grad_norm": 9.04257869720459, + "kl": 2.820640802383423, + "learning_rate": 9.95034387824601e-05, + "logits/chosen": -60048967.11111111, + "logits/rejected": -46811172.571428575, + "logps/chosen": -370.3642849392361, + "logps/rejected": -431.4790736607143, + "loss": 0.03, + "rewards/chosen": 5.1152538723415795, + "rewards/margins": 15.065753270709324, + "rewards/rejected": -9.950499398367745, + "step": 32 + }, + { + "epoch": 0.012060301507537688, + "grad_norm": 5.166853427886963, + "kl": 4.204875946044922, + "learning_rate": 9.945735458404681e-05, + "logits/chosen": -44660590.93333333, + "logits/rejected": -65401336.47058824, + "logps/chosen": -398.87623697916666, + "logps/rejected": -530.9944852941177, + "loss": 0.0151, + "rewards/chosen": 6.006900024414063, + "rewards/margins": 17.966142901252297, + "rewards/rejected": -11.959242876838236, + "step": 33 + }, + { + "epoch": 0.012425765189584286, + "grad_norm": 23.333698272705078, + "kl": 0.9312124252319336, + "learning_rate": 9.940923738749778e-05, + "logits/chosen": -50497694.11764706, + "logits/rejected": -26872443.733333334, + "logps/chosen": -283.69157858455884, + "logps/rejected": -292.580859375, + "loss": 0.0596, + "rewards/chosen": 4.435907251694623, + "rewards/margins": 10.932175460516238, + "rewards/rejected": -6.496268208821615, + "step": 34 + }, + { + "epoch": 0.012791228871630882, + "grad_norm": 5.268362522125244, + "kl": 3.4059906005859375, + "learning_rate": 9.935908917072252e-05, + "logits/chosen": -53245845.333333336, + "logits/rejected": -41666959.058823526, + "logps/chosen": -421.66468098958336, + "logps/rejected": -396.19045840992646, + "loss": 0.0179, + "rewards/chosen": 4.784971110026041, + "rewards/margins": 13.946780694699754, + "rewards/rejected": -9.161809584673714, + "step": 35 + }, + { + "epoch": 0.013156692553677478, + "grad_norm": 2.412320613861084, + "kl": 1.53816556930542, + "learning_rate": 9.930691199511775e-05, + "logits/chosen": -41895466.666666664, + "logits/rejected": -59891267.76470588, + "logps/chosen": -382.1077473958333, + "logps/rejected": -449.4469784007353, + "loss": 0.005, + "rewards/chosen": 6.4715830485026045, + "rewards/margins": 19.50740978764553, + "rewards/rejected": -13.035826739142923, + "step": 36 + }, + { + "epoch": 0.013522156235724074, + "grad_norm": 6.030672073364258, + "kl": 5.197851181030273, + "learning_rate": 9.925270800548285e-05, + "logits/chosen": -63401881.6, + "logits/rejected": -48393333.333333336, + "logps/chosen": -407.56083984375, + "logps/rejected": -361.7208658854167, + "loss": 0.0263, + "rewards/chosen": 5.010283279418945, + "rewards/margins": 13.853185780843098, + "rewards/rejected": -8.842902501424154, + "step": 37 + }, + { + "epoch": 0.013887619917770672, + "grad_norm": 6.735757827758789, + "kl": 2.6782684326171875, + "learning_rate": 9.919647942993148e-05, + "logits/chosen": -55142304.0, + "logits/rejected": -44863476.0, + "logps/chosen": -365.00543212890625, + "logps/rejected": -543.1677856445312, + "loss": 0.0196, + "rewards/chosen": 3.9138436317443848, + "rewards/margins": 18.014302730560303, + "rewards/rejected": -14.100459098815918, + "step": 38 + }, + { + "epoch": 0.014253083599817268, + "grad_norm": 5.515551567077637, + "kl": 2.4172496795654297, + "learning_rate": 9.91382285798002e-05, + "logits/chosen": -61824143.058823526, + "logits/rejected": -58962423.46666667, + "logps/chosen": -365.96599264705884, + "logps/rejected": -485.2033203125, + "loss": 0.0134, + "rewards/chosen": 4.332511004279642, + "rewards/margins": 17.40482647465725, + "rewards/rejected": -13.072315470377605, + "step": 39 + }, + { + "epoch": 0.014618547281863865, + "grad_norm": 8.376943588256836, + "kl": 0.7859287261962891, + "learning_rate": 9.907795784955327e-05, + "logits/chosen": -50139801.6, + "logits/rejected": -52966418.823529415, + "logps/chosen": -343.3307291666667, + "logps/rejected": -476.7391716452206, + "loss": 0.019, + "rewards/chosen": 6.3787684122721355, + "rewards/margins": 18.696476147221585, + "rewards/rejected": -12.31770773494945, + "step": 40 + }, + { + "epoch": 0.01498401096391046, + "grad_norm": 4.217868328094482, + "kl": 7.657425880432129, + "learning_rate": 9.901566971668437e-05, + "logits/chosen": -48509728.0, + "logits/rejected": -23746154.666666668, + "logps/chosen": -324.2577880859375, + "logps/rejected": -378.572998046875, + "loss": 0.0192, + "rewards/chosen": 6.645659637451172, + "rewards/margins": 16.82607256571452, + "rewards/rejected": -10.180412928263346, + "step": 41 + }, + { + "epoch": 0.015349474645957059, + "grad_norm": 3.5086028575897217, + "kl": 1.2947101593017578, + "learning_rate": 9.895136674161465e-05, + "logits/chosen": -66508174.76923077, + "logits/rejected": -60780672.0, + "logps/chosen": -327.66590294471155, + "logps/rejected": -400.8971011513158, + "loss": 0.008, + "rewards/chosen": 6.866602971003606, + "rewards/margins": 16.84938141981117, + "rewards/rejected": -9.982778448807565, + "step": 42 + }, + { + "epoch": 0.015714938328003653, + "grad_norm": 11.383273124694824, + "kl": 4.8486528396606445, + "learning_rate": 9.888505156758759e-05, + "logits/chosen": -63846784.0, + "logits/rejected": -55246641.777777776, + "logps/chosen": -361.37552315848217, + "logps/rejected": -442.8365885416667, + "loss": 0.0595, + "rewards/chosen": 6.539271218436105, + "rewards/margins": 15.308385394868395, + "rewards/rejected": -8.769114176432291, + "step": 43 + }, + { + "epoch": 0.016080402010050253, + "grad_norm": 4.625300407409668, + "kl": 5.606626987457275, + "learning_rate": 9.881672692056021e-05, + "logits/chosen": -45201365.333333336, + "logits/rejected": -40087081.14285714, + "logps/chosen": -364.97667100694446, + "logps/rejected": -253.65269252232142, + "loss": 0.0131, + "rewards/chosen": 6.814608679877387, + "rewards/margins": 14.540797339545357, + "rewards/rejected": -7.726188659667969, + "step": 44 + }, + { + "epoch": 0.01644586569209685, + "grad_norm": 1.9459388256072998, + "kl": 1.063331127166748, + "learning_rate": 9.874639560909117e-05, + "logits/chosen": -49554261.333333336, + "logits/rejected": -39169133.71428572, + "logps/chosen": -323.4977213541667, + "logps/rejected": -365.045654296875, + "loss": 0.0082, + "rewards/chosen": 5.9193136427137585, + "rewards/margins": 15.729785313681951, + "rewards/rejected": -9.810471670968193, + "step": 45 + }, + { + "epoch": 0.016811329374143445, + "grad_norm": 10.669745445251465, + "kl": 11.293217658996582, + "learning_rate": 9.867406052422524e-05, + "logits/chosen": -59732997.81818182, + "logits/rejected": -45335836.8, + "logps/chosen": -402.67214133522725, + "logps/rejected": -392.5699951171875, + "loss": 0.0387, + "rewards/chosen": 6.792078885165128, + "rewards/margins": 18.25916186246005, + "rewards/rejected": -11.467082977294922, + "step": 46 + }, + { + "epoch": 0.01717679305619004, + "grad_norm": 4.895358562469482, + "kl": 5.924516201019287, + "learning_rate": 9.859972463937441e-05, + "logits/chosen": -47930160.0, + "logits/rejected": -44561610.666666664, + "logps/chosen": -343.3493896484375, + "logps/rejected": -452.20849609375, + "loss": 0.0168, + "rewards/chosen": 6.354761123657227, + "rewards/margins": 16.014861424764, + "rewards/rejected": -9.660100301106771, + "step": 47 + }, + { + "epoch": 0.017542256738236638, + "grad_norm": 7.401646614074707, + "kl": 3.7730391025543213, + "learning_rate": 9.852339101019574e-05, + "logits/chosen": -30919124.57142857, + "logits/rejected": -39209255.11111111, + "logps/chosen": -269.38614327566967, + "logps/rejected": -556.4567599826389, + "loss": 0.0333, + "rewards/chosen": 4.891801016671317, + "rewards/margins": 20.362538050091455, + "rewards/rejected": -15.47073703342014, + "step": 48 + }, + { + "epoch": 0.017907720420283234, + "grad_norm": 23.92951774597168, + "kl": 1.085057258605957, + "learning_rate": 9.844506277446577e-05, + "logits/chosen": -30682542.0, + "logits/rejected": -46926880.0, + "logps/chosen": -253.35186767578125, + "logps/rejected": -473.3765462239583, + "loss": 0.0386, + "rewards/chosen": 6.101473808288574, + "rewards/margins": 18.379438082377114, + "rewards/rejected": -12.277964274088541, + "step": 49 + }, + { + "epoch": 0.01827318410232983, + "grad_norm": 8.554891586303711, + "kl": 1.4251766204833984, + "learning_rate": 9.836474315195147e-05, + "logits/chosen": -52996062.11764706, + "logits/rejected": -48182971.733333334, + "logps/chosen": -335.1333582261029, + "logps/rejected": -413.86982421875, + "loss": 0.0302, + "rewards/chosen": 4.55959140553194, + "rewards/margins": 12.97196221445121, + "rewards/rejected": -8.41237080891927, + "step": 50 + }, + { + "epoch": 0.018638647784376426, + "grad_norm": 4.158783435821533, + "kl": 4.646789073944092, + "learning_rate": 9.828243544427796e-05, + "logits/chosen": -48258592.0, + "logits/rejected": -58305577.14285714, + "logps/chosen": -288.0983072916667, + "logps/rejected": -463.22607421875, + "loss": 0.0122, + "rewards/chosen": 6.595457712809245, + "rewards/margins": 19.841273534865607, + "rewards/rejected": -13.245815822056361, + "step": 51 + }, + { + "epoch": 0.019004111466423026, + "grad_norm": 2.682189464569092, + "kl": 2.4769458770751953, + "learning_rate": 9.819814303479267e-05, + "logits/chosen": -69832394.66666667, + "logits/rejected": -49748073.6, + "logps/chosen": -438.50634765625, + "logps/rejected": -484.369775390625, + "loss": 0.0328, + "rewards/chosen": 9.220114390055338, + "rewards/margins": 21.066194407145183, + "rewards/rejected": -11.846080017089843, + "step": 52 + }, + { + "epoch": 0.019369575148469622, + "grad_norm": 7.231258392333984, + "kl": 5.255629062652588, + "learning_rate": 9.811186938842645e-05, + "logits/chosen": -71031184.0, + "logits/rejected": -56307680.0, + "logps/chosen": -393.62603759765625, + "logps/rejected": -514.8372802734375, + "loss": 0.0119, + "rewards/chosen": 9.535325050354004, + "rewards/margins": 22.810078620910645, + "rewards/rejected": -13.27475357055664, + "step": 53 + }, + { + "epoch": 0.019735038830516218, + "grad_norm": 7.897918224334717, + "kl": 4.446504592895508, + "learning_rate": 9.802361805155097e-05, + "logits/chosen": -51625570.461538464, + "logits/rejected": -45185003.78947368, + "logps/chosen": -379.41346153846155, + "logps/rejected": -640.4477796052631, + "loss": 0.0175, + "rewards/chosen": 5.7126593956580525, + "rewards/margins": 26.850794803758383, + "rewards/rejected": -21.13813540810033, + "step": 54 + }, + { + "epoch": 0.020100502512562814, + "grad_norm": 4.542209148406982, + "kl": 2.1538782119750977, + "learning_rate": 9.793339265183303e-05, + "logits/chosen": -40115584.0, + "logits/rejected": -59064515.55555555, + "logps/chosen": -353.450927734375, + "logps/rejected": -619.6857096354166, + "loss": 0.0162, + "rewards/chosen": 5.434125082833426, + "rewards/margins": 19.83607888600183, + "rewards/rejected": -14.401953803168404, + "step": 55 + }, + { + "epoch": 0.02046596619460941, + "grad_norm": 7.430333614349365, + "kl": 0.5837211608886719, + "learning_rate": 9.784119689808544e-05, + "logits/chosen": -28610328.0, + "logits/rejected": -37285267.2, + "logps/chosen": -281.4744059244792, + "logps/rejected": -427.88486328125, + "loss": 0.0317, + "rewards/chosen": 6.235033671061198, + "rewards/margins": 19.226700846354166, + "rewards/rejected": -12.991667175292969, + "step": 56 + }, + { + "epoch": 0.020831429876656007, + "grad_norm": 14.218281745910645, + "kl": 2.6655921936035156, + "learning_rate": 9.774703458011453e-05, + "logits/chosen": -35870300.44444445, + "logits/rejected": -51763912.347826086, + "logps/chosen": -474.1842990451389, + "logps/rejected": -464.2010020380435, + "loss": 0.0335, + "rewards/chosen": 5.657533009847005, + "rewards/margins": 16.456544240315754, + "rewards/rejected": -10.79901123046875, + "step": 57 + }, + { + "epoch": 0.021196893558702603, + "grad_norm": 6.1396379470825195, + "kl": 2.680887222290039, + "learning_rate": 9.765090956856436e-05, + "logits/chosen": -39752972.8, + "logits/rejected": -50118452.705882356, + "logps/chosen": -337.64710286458336, + "logps/rejected": -483.1478056066176, + "loss": 0.0194, + "rewards/chosen": 5.75444590250651, + "rewards/margins": 21.916373967189415, + "rewards/rejected": -16.161928064682904, + "step": 58 + }, + { + "epoch": 0.0215623572407492, + "grad_norm": 11.815670013427734, + "kl": 5.052088737487793, + "learning_rate": 9.755282581475769e-05, + "logits/chosen": -55888608.0, + "logits/rejected": -54900144.0, + "logps/chosen": -439.7640686035156, + "logps/rejected": -312.67437744140625, + "loss": 0.0247, + "rewards/chosen": 8.315971374511719, + "rewards/margins": 16.8193359375, + "rewards/rejected": -8.503364562988281, + "step": 59 + }, + { + "epoch": 0.0219278209227958, + "grad_norm": 3.399034023284912, + "kl": 0.0, + "learning_rate": 9.745278735053343e-05, + "logits/chosen": -30328192.0, + "logits/rejected": -38406880.0, + "logps/chosen": -300.8030192057292, + "logps/rejected": -372.9355224609375, + "loss": 0.0038, + "rewards/chosen": 6.941567103068034, + "rewards/margins": 20.73451296488444, + "rewards/rejected": -13.792945861816406, + "step": 60 + }, + { + "epoch": 0.022293284604842395, + "grad_norm": 14.877612113952637, + "kl": 3.2787771224975586, + "learning_rate": 9.735079828808107e-05, + "logits/chosen": -54578709.333333336, + "logits/rejected": -57578782.11764706, + "logps/chosen": -316.0634765625, + "logps/rejected": -518.0548023897059, + "loss": 0.0239, + "rewards/chosen": 6.856675720214843, + "rewards/margins": 25.521863780302162, + "rewards/rejected": -18.665188060087317, + "step": 61 + }, + { + "epoch": 0.02265874828688899, + "grad_norm": 3.7056872844696045, + "kl": 2.769613742828369, + "learning_rate": 9.724686281977146e-05, + "logits/chosen": -54293920.0, + "logits/rejected": -57358412.0, + "logps/chosen": -362.36322021484375, + "logps/rejected": -437.716796875, + "loss": 0.0118, + "rewards/chosen": 7.557523727416992, + "rewards/margins": 22.848549842834473, + "rewards/rejected": -15.29102611541748, + "step": 62 + }, + { + "epoch": 0.023024211968935587, + "grad_norm": 7.155191898345947, + "kl": 0.9947853088378906, + "learning_rate": 9.714098521798465e-05, + "logits/chosen": -56705880.615384616, + "logits/rejected": -46115129.2631579, + "logps/chosen": -440.0930363581731, + "logps/rejected": -505.4751233552632, + "loss": 0.0107, + "rewards/chosen": 6.279456505408654, + "rewards/margins": 22.39212579765783, + "rewards/rejected": -16.112669292249176, + "step": 63 + }, + { + "epoch": 0.023389675650982183, + "grad_norm": 5.921576976776123, + "kl": 2.5714855194091797, + "learning_rate": 9.703316983493414e-05, + "logits/chosen": -41153717.89473684, + "logits/rejected": -66649604.92307692, + "logps/chosen": -408.80581825657896, + "logps/rejected": -532.0378981370193, + "loss": 0.0181, + "rewards/chosen": 5.973050970780222, + "rewards/margins": 27.444178994367963, + "rewards/rejected": -21.47112802358774, + "step": 64 + }, + { + "epoch": 0.02375513933302878, + "grad_norm": 6.441226959228516, + "kl": 4.569951057434082, + "learning_rate": 9.692342110248802e-05, + "logits/chosen": -48828369.777777776, + "logits/rejected": -37597776.0, + "logps/chosen": -341.35687934027777, + "logps/rejected": -341.09946986607144, + "loss": 0.0329, + "rewards/chosen": 6.38562986585829, + "rewards/margins": 21.198414333282955, + "rewards/rejected": -14.812784467424665, + "step": 65 + }, + { + "epoch": 0.024120603015075376, + "grad_norm": 6.141626834869385, + "kl": 0.8777332305908203, + "learning_rate": 9.681174353198687e-05, + "logits/chosen": -27000544.0, + "logits/rejected": -40192785.06666667, + "logps/chosen": -369.6226447610294, + "logps/rejected": -503.09759114583335, + "loss": 0.0145, + "rewards/chosen": 6.301104826085708, + "rewards/margins": 21.401970508051853, + "rewards/rejected": -15.100865681966146, + "step": 66 + }, + { + "epoch": 0.024486066697121972, + "grad_norm": 6.145155906677246, + "kl": 3.8379316329956055, + "learning_rate": 9.669814171405816e-05, + "logits/chosen": -32335815.111111112, + "logits/rejected": -38183881.14285714, + "logps/chosen": -297.35590277777777, + "logps/rejected": -354.58872767857144, + "loss": 0.0263, + "rewards/chosen": 6.240455203586155, + "rewards/margins": 17.790007031153117, + "rewards/rejected": -11.549551827566964, + "step": 67 + }, + { + "epoch": 0.02485153037916857, + "grad_norm": 5.1387481689453125, + "kl": 7.461835861206055, + "learning_rate": 9.65826203184277e-05, + "logits/chosen": -58576753.777777776, + "logits/rejected": -46430317.71428572, + "logps/chosen": -414.0234375, + "logps/rejected": -470.3946010044643, + "loss": 0.0163, + "rewards/chosen": 8.588349236382378, + "rewards/margins": 23.756410629030256, + "rewards/rejected": -15.16806139264788, + "step": 68 + }, + { + "epoch": 0.025216994061215168, + "grad_norm": 4.586066722869873, + "kl": 7.936642646789551, + "learning_rate": 9.64651840937276e-05, + "logits/chosen": -48964741.81818182, + "logits/rejected": -54479142.4, + "logps/chosen": -356.181884765625, + "logps/rejected": -499.202001953125, + "loss": 0.0281, + "rewards/chosen": 6.949990706010298, + "rewards/margins": 22.57191758589311, + "rewards/rejected": -15.621926879882812, + "step": 69 + }, + { + "epoch": 0.025582457743261764, + "grad_norm": 5.896798610687256, + "kl": 4.361974716186523, + "learning_rate": 9.63458378673011e-05, + "logits/chosen": -45619443.2, + "logits/rejected": -23227202.666666668, + "logps/chosen": -276.57431640625, + "logps/rejected": -394.6361897786458, + "loss": 0.0337, + "rewards/chosen": 6.1711772918701175, + "rewards/margins": 16.92397346496582, + "rewards/rejected": -10.752796173095703, + "step": 70 + }, + { + "epoch": 0.02594792142530836, + "grad_norm": 8.044129371643066, + "kl": 4.886088848114014, + "learning_rate": 9.622458654500409e-05, + "logits/chosen": -32569085.53846154, + "logits/rejected": -41087205.05263158, + "logps/chosen": -358.5640399639423, + "logps/rejected": -417.02626439144734, + "loss": 0.0159, + "rewards/chosen": 8.606779245229868, + "rewards/margins": 18.563793630252484, + "rewards/rejected": -9.957014385022616, + "step": 71 + }, + { + "epoch": 0.026313385107354956, + "grad_norm": 3.8766796588897705, + "kl": 4.4892730712890625, + "learning_rate": 9.610143511100354e-05, + "logits/chosen": -34946744.0, + "logits/rejected": -40909920.0, + "logps/chosen": -417.0111389160156, + "logps/rejected": -540.6241455078125, + "loss": 0.0048, + "rewards/chosen": 9.499505043029785, + "rewards/margins": 22.59523105621338, + "rewards/rejected": -13.095726013183594, + "step": 72 + }, + { + "epoch": 0.026678848789401553, + "grad_norm": 3.4914629459381104, + "kl": 7.008626937866211, + "learning_rate": 9.597638862757255e-05, + "logits/chosen": -37020928.0, + "logits/rejected": -37666613.333333336, + "logps/chosen": -377.010009765625, + "logps/rejected": -358.0433756510417, + "loss": 0.0147, + "rewards/chosen": 7.385871887207031, + "rewards/margins": 15.435877482096354, + "rewards/rejected": -8.050005594889322, + "step": 73 + }, + { + "epoch": 0.02704431247144815, + "grad_norm": 21.232009887695312, + "kl": 7.479496955871582, + "learning_rate": 9.584945223488227e-05, + "logits/chosen": -23484949.333333332, + "logits/rejected": -35032530.28571428, + "logps/chosen": -320.8006184895833, + "logps/rejected": -401.1339634486607, + "loss": 0.0546, + "rewards/chosen": 7.029023912217882, + "rewards/margins": 15.557043953547403, + "rewards/rejected": -8.52802004132952, + "step": 74 + }, + { + "epoch": 0.027409776153494745, + "grad_norm": 1.4355311393737793, + "kl": 11.160795211791992, + "learning_rate": 9.572063115079063e-05, + "logits/chosen": -26418411.42857143, + "logits/rejected": -44502410.666666664, + "logps/chosen": -326.80458286830356, + "logps/rejected": -410.8125813802083, + "loss": 0.0139, + "rewards/chosen": 9.285047258649554, + "rewards/margins": 18.055134606739834, + "rewards/rejected": -8.770087348090279, + "step": 75 + }, + { + "epoch": 0.027775239835541345, + "grad_norm": 6.367202281951904, + "kl": 4.941760063171387, + "learning_rate": 9.558993067062785e-05, + "logits/chosen": -30686507.42857143, + "logits/rejected": -33273493.333333332, + "logps/chosen": -331.98423549107144, + "logps/rejected": -399.31407335069446, + "loss": 0.0184, + "rewards/chosen": 5.848648616245815, + "rewards/margins": 14.488197023906405, + "rewards/rejected": -8.639548407660591, + "step": 76 + }, + { + "epoch": 0.02814070351758794, + "grad_norm": 7.1556782722473145, + "kl": 9.753096580505371, + "learning_rate": 9.545735616697875e-05, + "logits/chosen": -26850902.0, + "logits/rejected": -39894508.0, + "logps/chosen": -363.8820495605469, + "logps/rejected": -415.097412109375, + "loss": 0.0148, + "rewards/chosen": 8.675457954406738, + "rewards/margins": 19.9981746673584, + "rewards/rejected": -11.32271671295166, + "step": 77 + }, + { + "epoch": 0.028506167199634537, + "grad_norm": 6.342607498168945, + "kl": 9.045159339904785, + "learning_rate": 9.53229130894619e-05, + "logits/chosen": -34480309.333333336, + "logits/rejected": -41044848.0, + "logps/chosen": -389.5137125651042, + "logps/rejected": -607.550146484375, + "loss": 0.0221, + "rewards/chosen": 9.95743497212728, + "rewards/margins": 24.688401158650716, + "rewards/rejected": -14.730966186523437, + "step": 78 + }, + { + "epoch": 0.028871630881681133, + "grad_norm": 1.0725568532943726, + "kl": 5.477190971374512, + "learning_rate": 9.518660696450568e-05, + "logits/chosen": -44996597.333333336, + "logits/rejected": -23080464.0, + "logps/chosen": -446.3680013020833, + "logps/rejected": -412.258740234375, + "loss": 0.0077, + "rewards/chosen": 9.444354375203451, + "rewards/margins": 22.551628239949544, + "rewards/rejected": -13.107273864746094, + "step": 79 + }, + { + "epoch": 0.02923709456372773, + "grad_norm": 3.533200740814209, + "kl": 4.987822532653809, + "learning_rate": 9.504844339512095e-05, + "logits/chosen": -27770840.0, + "logits/rejected": -28882556.8, + "logps/chosen": -341.8219807942708, + "logps/rejected": -445.261083984375, + "loss": 0.0093, + "rewards/chosen": 6.604569753011067, + "rewards/margins": 17.269882710774738, + "rewards/rejected": -10.665312957763671, + "step": 80 + }, + { + "epoch": 0.029602558245774326, + "grad_norm": 1.3078417778015137, + "kl": 6.236767768859863, + "learning_rate": 9.490842806067095e-05, + "logits/chosen": -31380043.29411765, + "logits/rejected": -46493320.53333333, + "logps/chosen": -327.4697840073529, + "logps/rejected": -503.6666666666667, + "loss": 0.0113, + "rewards/chosen": 8.652278226964613, + "rewards/margins": 24.477358589920343, + "rewards/rejected": -15.82508036295573, + "step": 81 + }, + { + "epoch": 0.02996802192782092, + "grad_norm": 0.9292804002761841, + "kl": 7.139398574829102, + "learning_rate": 9.476656671663765e-05, + "logits/chosen": -40411835.733333334, + "logits/rejected": -37733078.5882353, + "logps/chosen": -431.92571614583335, + "logps/rejected": -490.7525850183824, + "loss": 0.0129, + "rewards/chosen": 8.565556844075521, + "rewards/margins": 23.004124061734068, + "rewards/rejected": -14.438567217658548, + "step": 82 + }, + { + "epoch": 0.030333485609867518, + "grad_norm": 7.188210487365723, + "kl": 7.373342037200928, + "learning_rate": 9.46228651943853e-05, + "logits/chosen": -54043712.0, + "logits/rejected": -37557165.333333336, + "logps/chosen": -322.804541015625, + "logps/rejected": -460.3668212890625, + "loss": 0.0175, + "rewards/chosen": 7.64395751953125, + "rewards/margins": 21.538275146484374, + "rewards/rejected": -13.894317626953125, + "step": 83 + }, + { + "epoch": 0.030698949291914118, + "grad_norm": 5.98512601852417, + "kl": 2.127194404602051, + "learning_rate": 9.44773294009206e-05, + "logits/chosen": -39313645.71428572, + "logits/rejected": -43236412.44444445, + "logps/chosen": -376.50913783482144, + "logps/rejected": -564.1276584201389, + "loss": 0.0081, + "rewards/chosen": 8.587449210030693, + "rewards/margins": 26.700571090456037, + "rewards/rejected": -18.113121880425346, + "step": 84 + }, + { + "epoch": 0.031064412973960714, + "grad_norm": 8.364197731018066, + "kl": 3.6299943923950195, + "learning_rate": 9.432996531865002e-05, + "logits/chosen": -49738910.11764706, + "logits/rejected": -62371524.266666666, + "logps/chosen": -301.84007352941177, + "logps/rejected": -474.6634765625, + "loss": 0.0109, + "rewards/chosen": 6.6499158073874085, + "rewards/margins": 21.304260493259804, + "rewards/rejected": -14.654344685872395, + "step": 85 + }, + { + "epoch": 0.031429876656007306, + "grad_norm": 11.131648063659668, + "kl": 0.9635066986083984, + "learning_rate": 9.418077900513377e-05, + "logits/chosen": -29042400.0, + "logits/rejected": -51443334.4, + "logps/chosen": -283.01108805338544, + "logps/rejected": -413.430224609375, + "loss": 0.0095, + "rewards/chosen": 7.071775436401367, + "rewards/margins": 23.17550926208496, + "rewards/rejected": -16.103733825683594, + "step": 86 + }, + { + "epoch": 0.0317953403380539, + "grad_norm": 2.274010419845581, + "kl": 1.5090999603271484, + "learning_rate": 9.40297765928369e-05, + "logits/chosen": -29211932.0, + "logits/rejected": -32424992.0, + "logps/chosen": -267.6764221191406, + "logps/rejected": -351.9060363769531, + "loss": 0.0177, + "rewards/chosen": 7.210115909576416, + "rewards/margins": 23.191922664642334, + "rewards/rejected": -15.981806755065918, + "step": 87 + }, + { + "epoch": 0.032160804020100506, + "grad_norm": 3.8534750938415527, + "kl": 1.0469717979431152, + "learning_rate": 9.387696428887716e-05, + "logits/chosen": -40250990.93333333, + "logits/rejected": -26059026.82352941, + "logps/chosen": -348.96025390625, + "logps/rejected": -401.61853745404414, + "loss": 0.0076, + "rewards/chosen": 5.922635396321614, + "rewards/margins": 23.717219782810584, + "rewards/rejected": -17.79458438648897, + "step": 88 + }, + { + "epoch": 0.0325262677021471, + "grad_norm": 4.639629364013672, + "kl": 0.889378547668457, + "learning_rate": 9.372234837476978e-05, + "logits/chosen": -47629276.0, + "logits/rejected": -46884048.0, + "logps/chosen": -348.05352783203125, + "logps/rejected": -561.5498046875, + "loss": 0.0252, + "rewards/chosen": 5.22454309463501, + "rewards/margins": 23.24092721939087, + "rewards/rejected": -18.01638412475586, + "step": 89 + }, + { + "epoch": 0.0328917313841937, + "grad_norm": 3.8778488636016846, + "kl": 4.593163013458252, + "learning_rate": 9.356593520616948e-05, + "logits/chosen": -28529976.470588237, + "logits/rejected": -23993877.333333332, + "logps/chosen": -320.5784696691176, + "logps/rejected": -275.25576171875, + "loss": 0.0122, + "rewards/chosen": 6.165865729836857, + "rewards/margins": 18.66365218817019, + "rewards/rejected": -12.497786458333334, + "step": 90 + }, + { + "epoch": 0.033257195066240294, + "grad_norm": 3.869450569152832, + "kl": 4.628847599029541, + "learning_rate": 9.340773121260893e-05, + "logits/chosen": -36773368.47058824, + "logits/rejected": -39695684.266666666, + "logps/chosen": -321.6018497242647, + "logps/rejected": -385.8181966145833, + "loss": 0.013, + "rewards/chosen": 7.261572893928079, + "rewards/margins": 20.196593849331727, + "rewards/rejected": -12.935020955403646, + "step": 91 + }, + { + "epoch": 0.03362265874828689, + "grad_norm": 6.570498466491699, + "kl": 7.923100471496582, + "learning_rate": 9.324774289723468e-05, + "logits/chosen": -41189649.777777776, + "logits/rejected": -40854500.571428575, + "logps/chosen": -414.1661783854167, + "logps/rejected": -521.8468889508929, + "loss": 0.0179, + "rewards/chosen": 6.903472052680121, + "rewards/margins": 24.003510853600883, + "rewards/rejected": -17.10003880092076, + "step": 92 + }, + { + "epoch": 0.03398812243033349, + "grad_norm": 3.6953377723693848, + "kl": 4.583061218261719, + "learning_rate": 9.308597683653975e-05, + "logits/chosen": -31183828.210526317, + "logits/rejected": -33290806.153846152, + "logps/chosen": -387.0103053042763, + "logps/rejected": -422.2001953125, + "loss": 0.0048, + "rewards/chosen": 8.692630165501646, + "rewards/margins": 24.99088902029431, + "rewards/rejected": -16.298258854792667, + "step": 93 + }, + { + "epoch": 0.03435358611238008, + "grad_norm": 4.497894763946533, + "kl": 4.432340621948242, + "learning_rate": 9.292243968009331e-05, + "logits/chosen": -26371536.0, + "logits/rejected": -21584090.666666668, + "logps/chosen": -315.2775634765625, + "logps/rejected": -503.4237874348958, + "loss": 0.0087, + "rewards/chosen": 6.665242767333984, + "rewards/margins": 28.2120974222819, + "rewards/rejected": -21.546854654947918, + "step": 94 + }, + { + "epoch": 0.03471904979442668, + "grad_norm": 3.493032932281494, + "kl": 3.130887985229492, + "learning_rate": 9.275713815026731e-05, + "logits/chosen": -26396160.0, + "logits/rejected": -43744042.666666664, + "logps/chosen": -410.61624581473217, + "logps/rejected": -389.35004340277777, + "loss": 0.0095, + "rewards/chosen": 8.23011234828404, + "rewards/margins": 21.48780023484003, + "rewards/rejected": -13.25768788655599, + "step": 95 + }, + { + "epoch": 0.035084513476473275, + "grad_norm": 4.609635829925537, + "kl": 1.9781148433685303, + "learning_rate": 9.259007904196023e-05, + "logits/chosen": -30621874.0, + "logits/rejected": -24159632.0, + "logps/chosen": -312.6817321777344, + "logps/rejected": -411.6536560058594, + "loss": 0.0057, + "rewards/chosen": 6.837421417236328, + "rewards/margins": 19.352378845214844, + "rewards/rejected": -12.514957427978516, + "step": 96 + }, + { + "epoch": 0.03544997715851987, + "grad_norm": 3.911794424057007, + "kl": 1.7758150100708008, + "learning_rate": 9.242126922231763e-05, + "logits/chosen": -25005936.94117647, + "logits/rejected": -26132309.333333332, + "logps/chosen": -304.5228917738971, + "logps/rejected": -515.6973307291667, + "loss": 0.008, + "rewards/chosen": 7.817889942842371, + "rewards/margins": 20.89239352357154, + "rewards/rejected": -13.074503580729166, + "step": 97 + }, + { + "epoch": 0.03581544084056647, + "grad_norm": 10.93310260772705, + "kl": 0.0, + "learning_rate": 9.225071563045007e-05, + "logits/chosen": -32446668.0, + "logits/rejected": -31420088.0, + "logps/chosen": -385.95794677734375, + "logps/rejected": -462.2115885416667, + "loss": 0.0179, + "rewards/chosen": 8.103047370910645, + "rewards/margins": 21.233390490214028, + "rewards/rejected": -13.130343119303385, + "step": 98 + }, + { + "epoch": 0.036180904522613064, + "grad_norm": 2.214240074157715, + "kl": 1.1216678619384766, + "learning_rate": 9.207842527714767e-05, + "logits/chosen": -42358813.538461536, + "logits/rejected": -37142268.631578945, + "logps/chosen": -405.7210036057692, + "logps/rejected": -493.3158408717105, + "loss": 0.0065, + "rewards/chosen": 8.069786071777344, + "rewards/margins": 22.240832278603, + "rewards/rejected": -14.171046206825658, + "step": 99 + }, + { + "epoch": 0.03654636820465966, + "grad_norm": 4.253543376922607, + "kl": 6.843319892883301, + "learning_rate": 9.190440524459203e-05, + "logits/chosen": -26094864.94117647, + "logits/rejected": -32877809.066666666, + "logps/chosen": -304.0940372242647, + "logps/rejected": -444.9977213541667, + "loss": 0.0191, + "rewards/chosen": 8.618506936465993, + "rewards/margins": 20.40099900189568, + "rewards/rejected": -11.782492065429688, + "step": 100 + }, + { + "epoch": 0.036911831886706256, + "grad_norm": 4.099592685699463, + "kl": 2.627181053161621, + "learning_rate": 9.172866268606513e-05, + "logits/chosen": -38153634.90909091, + "logits/rejected": -24039457.523809522, + "logps/chosen": -337.33096590909093, + "logps/rejected": -446.8031063988095, + "loss": 0.0187, + "rewards/chosen": 5.4638051119717685, + "rewards/margins": 18.64679432328129, + "rewards/rejected": -13.182989211309524, + "step": 101 + }, + { + "epoch": 0.03727729556875285, + "grad_norm": 3.477898597717285, + "kl": 5.307547569274902, + "learning_rate": 9.155120482565521e-05, + "logits/chosen": -37803064.88888889, + "logits/rejected": -30550571.42857143, + "logps/chosen": -354.0104709201389, + "logps/rejected": -488.79739815848217, + "loss": 0.0087, + "rewards/chosen": 8.774648878309462, + "rewards/margins": 22.556626940530442, + "rewards/rejected": -13.781978062220983, + "step": 102 + }, + { + "epoch": 0.03764275925079945, + "grad_norm": 8.65983772277832, + "kl": 8.958831787109375, + "learning_rate": 9.137203895795983e-05, + "logits/chosen": -35876150.85714286, + "logits/rejected": -30704293.818181816, + "logps/chosen": -336.96210007440476, + "logps/rejected": -496.71315696022725, + "loss": 0.0223, + "rewards/chosen": 7.153175717308407, + "rewards/margins": 19.01286976471608, + "rewards/rejected": -11.85969404740767, + "step": 103 + }, + { + "epoch": 0.03800822293284605, + "grad_norm": 11.845693588256836, + "kl": 2.2604990005493164, + "learning_rate": 9.119117244778607e-05, + "logits/chosen": -33321863.384615384, + "logits/rejected": -40389894.7368421, + "logps/chosen": -338.03354116586536, + "logps/rejected": -462.0747327302632, + "loss": 0.0115, + "rewards/chosen": 9.559725247896635, + "rewards/margins": 23.082558821087424, + "rewards/rejected": -13.52283357319079, + "step": 104 + }, + { + "epoch": 0.03837368661489265, + "grad_norm": 3.929539680480957, + "kl": 9.226816177368164, + "learning_rate": 9.10086127298478e-05, + "logits/chosen": -28004862.11764706, + "logits/rejected": -34591616.0, + "logps/chosen": -368.80325137867646, + "logps/rejected": -428.3048828125, + "loss": 0.0157, + "rewards/chosen": 8.625091552734375, + "rewards/margins": 22.166370646158853, + "rewards/rejected": -13.54127909342448, + "step": 105 + }, + { + "epoch": 0.038739150296939244, + "grad_norm": 7.086399555206299, + "kl": 5.677105903625488, + "learning_rate": 9.082436730845993e-05, + "logits/chosen": -37228549.333333336, + "logits/rejected": -31119241.6, + "logps/chosen": -417.1453857421875, + "logps/rejected": -465.59287109375, + "loss": 0.0289, + "rewards/chosen": 6.946853001912435, + "rewards/margins": 20.81446622212728, + "rewards/rejected": -13.867613220214844, + "step": 106 + }, + { + "epoch": 0.03910461397898584, + "grad_norm": 2.494783878326416, + "kl": 12.536182403564453, + "learning_rate": 9.063844375723014e-05, + "logits/chosen": -29469715.555555556, + "logits/rejected": -35742962.28571428, + "logps/chosen": -398.61485460069446, + "logps/rejected": -526.6864885602679, + "loss": 0.0136, + "rewards/chosen": 10.75872802734375, + "rewards/margins": 25.600702558244976, + "rewards/rejected": -14.841974530901227, + "step": 107 + }, + { + "epoch": 0.039470077661032436, + "grad_norm": 2.447875499725342, + "kl": 1.5802021026611328, + "learning_rate": 9.045084971874738e-05, + "logits/chosen": -40079336.72727273, + "logits/rejected": -35073746.28571428, + "logps/chosen": -425.91996626420456, + "logps/rejected": -501.5933314732143, + "loss": 0.0051, + "rewards/chosen": 11.33784970370206, + "rewards/margins": 27.476164087072597, + "rewards/rejected": -16.138314383370535, + "step": 108 + }, + { + "epoch": 0.03983554134307903, + "grad_norm": 4.472402095794678, + "kl": 7.777850151062012, + "learning_rate": 9.02615929042678e-05, + "logits/chosen": -39484612.92307692, + "logits/rejected": -46556890.94736842, + "logps/chosen": -351.8323317307692, + "logps/rejected": -418.0727025082237, + "loss": 0.0158, + "rewards/chosen": 9.19132056603065, + "rewards/margins": 22.550823258002275, + "rewards/rejected": -13.359502691971628, + "step": 109 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 4.362159729003906, + "kl": 9.24830436706543, + "learning_rate": 9.007068109339784e-05, + "logits/chosen": -34914912.0, + "logits/rejected": -53206777.6, + "logps/chosen": -442.7127574573864, + "logps/rejected": -573.83603515625, + "loss": 0.018, + "rewards/chosen": 9.137822931463068, + "rewards/margins": 21.902453058416192, + "rewards/rejected": -12.764630126953126, + "step": 110 + }, + { + "epoch": 0.040566468707172225, + "grad_norm": 2.3844363689422607, + "kl": 3.848033905029297, + "learning_rate": 8.987812213377424e-05, + "logits/chosen": -41293036.307692304, + "logits/rejected": -32670403.36842105, + "logps/chosen": -384.64734825721155, + "logps/rejected": -461.97085731907896, + "loss": 0.0095, + "rewards/chosen": 8.858161926269531, + "rewards/margins": 23.253070630525286, + "rewards/rejected": -14.394908704255757, + "step": 111 + }, + { + "epoch": 0.04093193238921882, + "grad_norm": 4.107156753540039, + "kl": 4.59416389465332, + "learning_rate": 8.968392394074164e-05, + "logits/chosen": -24259356.23529412, + "logits/rejected": -40533777.06666667, + "logps/chosen": -305.0940946691176, + "logps/rejected": -534.0719401041666, + "loss": 0.0231, + "rewards/chosen": 6.991543938131893, + "rewards/margins": 18.158666333965225, + "rewards/rejected": -11.167122395833333, + "step": 112 + }, + { + "epoch": 0.04129739607126542, + "grad_norm": 3.683842658996582, + "kl": 6.614081382751465, + "learning_rate": 8.948809449702711e-05, + "logits/chosen": -31701672.0, + "logits/rejected": -47852736.0, + "logps/chosen": -347.9965515136719, + "logps/rejected": -484.1363830566406, + "loss": 0.0063, + "rewards/chosen": 9.465049743652344, + "rewards/margins": 24.32558536529541, + "rewards/rejected": -14.860535621643066, + "step": 113 + }, + { + "epoch": 0.04166285975331201, + "grad_norm": 4.799642086029053, + "kl": 4.523487567901611, + "learning_rate": 8.929064185241213e-05, + "logits/chosen": -37363976.53333333, + "logits/rejected": -27903503.05882353, + "logps/chosen": -223.542919921875, + "logps/rejected": -515.1665900735294, + "loss": 0.0242, + "rewards/chosen": 6.066276041666667, + "rewards/margins": 18.198430618585327, + "rewards/rejected": -12.132154576918659, + "step": 114 + }, + { + "epoch": 0.04202832343535861, + "grad_norm": 1.5732982158660889, + "kl": 5.359951972961426, + "learning_rate": 8.90915741234015e-05, + "logits/chosen": -37210144.0, + "logits/rejected": -49808716.0, + "logps/chosen": -354.8924560546875, + "logps/rejected": -565.97607421875, + "loss": 0.0098, + "rewards/chosen": 9.922457695007324, + "rewards/margins": 25.488935470581055, + "rewards/rejected": -15.56647777557373, + "step": 115 + }, + { + "epoch": 0.042393787117405206, + "grad_norm": 2.5431289672851562, + "kl": 8.323074340820312, + "learning_rate": 8.889089949288986e-05, + "logits/chosen": -32031748.0, + "logits/rejected": -37969088.0, + "logps/chosen": -331.563232421875, + "logps/rejected": -426.9198913574219, + "loss": 0.0026, + "rewards/chosen": 9.050741195678711, + "rewards/margins": 20.511554718017578, + "rewards/rejected": -11.460813522338867, + "step": 116 + }, + { + "epoch": 0.0427592507994518, + "grad_norm": 2.4684252738952637, + "kl": 6.228133678436279, + "learning_rate": 8.868862620982534e-05, + "logits/chosen": -26141841.777777776, + "logits/rejected": -40421668.571428575, + "logps/chosen": -383.61431206597223, + "logps/rejected": -528.8011997767857, + "loss": 0.0142, + "rewards/chosen": 7.288478427463108, + "rewards/margins": 20.678058200412327, + "rewards/rejected": -13.389579772949219, + "step": 117 + }, + { + "epoch": 0.0431247144814984, + "grad_norm": 3.3828604221343994, + "kl": 9.635725021362305, + "learning_rate": 8.848476258887031e-05, + "logits/chosen": -34122983.61904762, + "logits/rejected": -34187534.54545455, + "logps/chosen": -324.4385695684524, + "logps/rejected": -425.9974254261364, + "loss": 0.0221, + "rewards/chosen": 7.775054205031622, + "rewards/margins": 22.53334250181784, + "rewards/rejected": -14.75828829678622, + "step": 118 + }, + { + "epoch": 0.043490178163544994, + "grad_norm": 18.695188522338867, + "kl": 3.486945629119873, + "learning_rate": 8.827931701005974e-05, + "logits/chosen": -32471527.384615384, + "logits/rejected": -31171927.57894737, + "logps/chosen": -384.1367938701923, + "logps/rejected": -510.8258634868421, + "loss": 0.0168, + "rewards/chosen": 9.219330420860878, + "rewards/margins": 21.40822137994805, + "rewards/rejected": -12.188890959087171, + "step": 119 + }, + { + "epoch": 0.0438556418455916, + "grad_norm": 1.4896206855773926, + "kl": 7.4243879318237305, + "learning_rate": 8.807229791845673e-05, + "logits/chosen": -22926546.82352941, + "logits/rejected": -39986184.53333333, + "logps/chosen": -313.80905330882354, + "logps/rejected": -493.6060546875, + "loss": 0.0138, + "rewards/chosen": 8.630747178021599, + "rewards/margins": 22.445148423138786, + "rewards/rejected": -13.814401245117187, + "step": 120 + }, + { + "epoch": 0.044221105527638194, + "grad_norm": 15.042485237121582, + "kl": 7.030808925628662, + "learning_rate": 8.786371382380528e-05, + "logits/chosen": -22164546.90909091, + "logits/rejected": -33053568.0, + "logps/chosen": -372.3087269176136, + "logps/rejected": -480.8042689732143, + "loss": 0.0353, + "rewards/chosen": 8.92121748490767, + "rewards/margins": 20.267590543408414, + "rewards/rejected": -11.346373058500744, + "step": 121 + }, + { + "epoch": 0.04458656920968479, + "grad_norm": 4.385624885559082, + "kl": 13.904373168945312, + "learning_rate": 8.765357330018056e-05, + "logits/chosen": -13394754.0, + "logits/rejected": -32771898.0, + "logps/chosen": -385.470947265625, + "logps/rejected": -507.3829345703125, + "loss": 0.0187, + "rewards/chosen": 8.200074195861816, + "rewards/margins": 22.01417636871338, + "rewards/rejected": -13.814102172851562, + "step": 122 + }, + { + "epoch": 0.044952032891731386, + "grad_norm": 2.888279914855957, + "kl": 3.44219970703125, + "learning_rate": 8.744188498563641e-05, + "logits/chosen": -29559440.94117647, + "logits/rejected": -32810423.466666665, + "logps/chosen": -322.3883846507353, + "logps/rejected": -476.08134765625, + "loss": 0.0096, + "rewards/chosen": 8.27079413918888, + "rewards/margins": 21.863239004097736, + "rewards/rejected": -13.592444864908854, + "step": 123 + }, + { + "epoch": 0.04531749657377798, + "grad_norm": 7.156754016876221, + "kl": 11.657984733581543, + "learning_rate": 8.722865758185035e-05, + "logits/chosen": -29726720.0, + "logits/rejected": -26009784.470588237, + "logps/chosen": -405.5867513020833, + "logps/rejected": -462.2352079503676, + "loss": 0.0228, + "rewards/chosen": 9.056148274739583, + "rewards/margins": 19.708162614411, + "rewards/rejected": -10.652014339671416, + "step": 124 + }, + { + "epoch": 0.04568296025582458, + "grad_norm": 1.9241881370544434, + "kl": 10.992962837219238, + "learning_rate": 8.701389985376578e-05, + "logits/chosen": -33494584.888888888, + "logits/rejected": -28275019.42857143, + "logps/chosen": -390.51502821180554, + "logps/rejected": -432.02085658482144, + "loss": 0.0201, + "rewards/chosen": 9.984032524956596, + "rewards/margins": 25.02918219187903, + "rewards/rejected": -15.045149666922432, + "step": 125 + }, + { + "epoch": 0.046048423937871175, + "grad_norm": 4.758550643920898, + "kl": 9.395109176635742, + "learning_rate": 8.679762062923175e-05, + "logits/chosen": -11328824.888888888, + "logits/rejected": -49017302.85714286, + "logps/chosen": -386.6030544704861, + "logps/rejected": -573.0137765066964, + "loss": 0.0175, + "rewards/chosen": 8.314485337999132, + "rewards/margins": 20.924610198490203, + "rewards/rejected": -12.610124860491071, + "step": 126 + }, + { + "epoch": 0.04641388761991777, + "grad_norm": 3.189591884613037, + "kl": 5.045682907104492, + "learning_rate": 8.657982879864007e-05, + "logits/chosen": -52660928.0, + "logits/rejected": -22908872.0, + "logps/chosen": -363.4823404947917, + "logps/rejected": -432.112939453125, + "loss": 0.0127, + "rewards/chosen": 8.550245920817057, + "rewards/margins": 21.691986338297525, + "rewards/rejected": -13.141740417480468, + "step": 127 + }, + { + "epoch": 0.04677935130196437, + "grad_norm": 7.768436908721924, + "kl": 4.893695831298828, + "learning_rate": 8.636053331455987e-05, + "logits/chosen": -19024720.0, + "logits/rejected": -11870157.333333334, + "logps/chosen": -340.36279296875, + "logps/rejected": -477.72802734375, + "loss": 0.0169, + "rewards/chosen": 6.748569488525391, + "rewards/margins": 21.24854405721029, + "rewards/rejected": -14.499974568684896, + "step": 128 + }, + { + "epoch": 0.04714481498401096, + "grad_norm": 6.462328910827637, + "kl": 7.855134010314941, + "learning_rate": 8.613974319136958e-05, + "logits/chosen": -32957366.85714286, + "logits/rejected": -34900357.81818182, + "logps/chosen": -345.69256882440476, + "logps/rejected": -376.31613991477275, + "loss": 0.0174, + "rewards/chosen": 7.068138485863095, + "rewards/margins": 19.82476773612943, + "rewards/rejected": -12.756629250266336, + "step": 129 + }, + { + "epoch": 0.04751027866605756, + "grad_norm": 6.749807834625244, + "kl": 6.279943466186523, + "learning_rate": 8.591746750488639e-05, + "logits/chosen": -31112480.0, + "logits/rejected": -37449440.0, + "logps/chosen": -328.711083984375, + "logps/rejected": -521.7969563802084, + "loss": 0.0145, + "rewards/chosen": 8.407637786865234, + "rewards/margins": 21.947111002604167, + "rewards/rejected": -13.539473215738932, + "step": 130 + }, + { + "epoch": 0.047875742348104156, + "grad_norm": 10.204842567443848, + "kl": 1.7673616409301758, + "learning_rate": 8.569371539199316e-05, + "logits/chosen": -37636050.666666664, + "logits/rejected": -25786320.0, + "logps/chosen": -376.6614990234375, + "logps/rejected": -473.077490234375, + "loss": 0.0098, + "rewards/chosen": 9.883068084716797, + "rewards/margins": 22.04599838256836, + "rewards/rejected": -12.162930297851563, + "step": 131 + }, + { + "epoch": 0.04824120603015075, + "grad_norm": 1.5196152925491333, + "kl": 6.041738986968994, + "learning_rate": 8.54684960502629e-05, + "logits/chosen": -34432869.64705882, + "logits/rejected": -33060744.533333335, + "logps/chosen": -297.23609834558823, + "logps/rejected": -432.93837890625, + "loss": 0.0086, + "rewards/chosen": 8.036726110121784, + "rewards/margins": 20.355531041762408, + "rewards/rejected": -12.318804931640624, + "step": 132 + }, + { + "epoch": 0.04860666971219735, + "grad_norm": 4.4139933586120605, + "kl": 5.9731903076171875, + "learning_rate": 8.524181873758059e-05, + "logits/chosen": -29148693.333333332, + "logits/rejected": -34617346.28571428, + "logps/chosen": -313.3556315104167, + "logps/rejected": -440.9218052455357, + "loss": 0.0093, + "rewards/chosen": 8.365788777669271, + "rewards/margins": 22.307918730236235, + "rewards/rejected": -13.942129952566964, + "step": 133 + }, + { + "epoch": 0.048972133394243944, + "grad_norm": 4.669734954833984, + "kl": 4.484433174133301, + "learning_rate": 8.501369277176276e-05, + "logits/chosen": -37621051.07692308, + "logits/rejected": -51850731.78947368, + "logps/chosen": -378.05262169471155, + "logps/rejected": -357.8472964638158, + "loss": 0.0139, + "rewards/chosen": 8.4474851168119, + "rewards/margins": 22.600776517922096, + "rewards/rejected": -14.153291401110197, + "step": 134 + }, + { + "epoch": 0.04933759707629054, + "grad_norm": 3.8269076347351074, + "kl": 3.0405588150024414, + "learning_rate": 8.478412753017433e-05, + "logits/chosen": -50635735.27272727, + "logits/rejected": -40710640.76190476, + "logps/chosen": -336.33686967329544, + "logps/rejected": -490.99595424107144, + "loss": 0.0131, + "rewards/chosen": 7.602625760165128, + "rewards/margins": 25.30353460270605, + "rewards/rejected": -17.700908842540922, + "step": 135 + }, + { + "epoch": 0.04970306075833714, + "grad_norm": 4.64284086227417, + "kl": 4.288601875305176, + "learning_rate": 8.455313244934324e-05, + "logits/chosen": -37124844.307692304, + "logits/rejected": -31651226.94736842, + "logps/chosen": -368.30724158653845, + "logps/rejected": -519.6214021381579, + "loss": 0.0083, + "rewards/chosen": 8.17648432804988, + "rewards/margins": 26.35454797165596, + "rewards/rejected": -18.178063643606084, + "step": 136 + }, + { + "epoch": 0.05006852444038374, + "grad_norm": 9.166915893554688, + "kl": 3.1957833766937256, + "learning_rate": 8.432071702457252e-05, + "logits/chosen": -19971820.0, + "logits/rejected": -39954416.0, + "logps/chosen": -392.2398986816406, + "logps/rejected": -356.62542724609375, + "loss": 0.0161, + "rewards/chosen": 8.67671012878418, + "rewards/margins": 20.90134620666504, + "rewards/rejected": -12.22463607788086, + "step": 137 + }, + { + "epoch": 0.050433988122430336, + "grad_norm": 5.984962463378906, + "kl": 0.09490013122558594, + "learning_rate": 8.408689080954998e-05, + "logits/chosen": -56916085.333333336, + "logits/rejected": -34009542.4, + "logps/chosen": -371.3211669921875, + "logps/rejected": -439.77919921875, + "loss": 0.0094, + "rewards/chosen": 5.444234212239583, + "rewards/margins": 20.338495381673177, + "rewards/rejected": -14.894261169433594, + "step": 138 + }, + { + "epoch": 0.05079945180447693, + "grad_norm": 0.4479733109474182, + "kl": 0.0, + "learning_rate": 8.385166341595548e-05, + "logits/chosen": -23358480.0, + "logits/rejected": -36381641.14285714, + "logps/chosen": -302.4280894886364, + "logps/rejected": -490.02715773809524, + "loss": 0.0008, + "rewards/chosen": 8.983969254927201, + "rewards/margins": 25.035687830541043, + "rewards/rejected": -16.05171857561384, + "step": 139 + }, + { + "epoch": 0.05116491548652353, + "grad_norm": 6.6932549476623535, + "kl": 5.087161540985107, + "learning_rate": 8.361504451306585e-05, + "logits/chosen": -19172888.0, + "logits/rejected": -29464570.666666668, + "logps/chosen": -281.700341796875, + "logps/rejected": -372.93115234375, + "loss": 0.0127, + "rewards/chosen": 7.251024627685547, + "rewards/margins": 19.892823282877604, + "rewards/rejected": -12.641798655192057, + "step": 140 + }, + { + "epoch": 0.051530379168570124, + "grad_norm": 5.902066707611084, + "kl": 6.660229682922363, + "learning_rate": 8.33770438273574e-05, + "logits/chosen": -39327835.428571425, + "logits/rejected": -27682455.272727273, + "logps/chosen": -321.85907273065476, + "logps/rejected": -416.8302556818182, + "loss": 0.0169, + "rewards/chosen": 7.136968703497024, + "rewards/margins": 21.812885895435944, + "rewards/rejected": -14.67591719193892, + "step": 141 + }, + { + "epoch": 0.05189584285061672, + "grad_norm": 2.5956599712371826, + "kl": 3.9250850677490234, + "learning_rate": 8.313767114210615e-05, + "logits/chosen": -40608861.538461536, + "logits/rejected": -36192471.578947365, + "logps/chosen": -405.92709585336536, + "logps/rejected": -483.7523643092105, + "loss": 0.0045, + "rewards/chosen": 8.45544198843149, + "rewards/margins": 26.898499400026886, + "rewards/rejected": -18.443057411595394, + "step": 142 + }, + { + "epoch": 0.05226130653266332, + "grad_norm": 1.836310863494873, + "kl": 2.898988723754883, + "learning_rate": 8.289693629698564e-05, + "logits/chosen": -33169533.333333332, + "logits/rejected": -47413168.0, + "logps/chosen": -370.8831380208333, + "logps/rejected": -609.94736328125, + "loss": 0.0018, + "rewards/chosen": 10.799910227457682, + "rewards/margins": 28.98655573527018, + "rewards/rejected": -18.1866455078125, + "step": 143 + }, + { + "epoch": 0.05262677021470991, + "grad_norm": 5.187747955322266, + "kl": 8.429890632629395, + "learning_rate": 8.265484918766243e-05, + "logits/chosen": -20554602.0, + "logits/rejected": -31753056.0, + "logps/chosen": -316.80816650390625, + "logps/rejected": -437.7784729003906, + "loss": 0.0163, + "rewards/chosen": 9.590449333190918, + "rewards/margins": 23.719423294067383, + "rewards/rejected": -14.128973960876465, + "step": 144 + }, + { + "epoch": 0.05299223389675651, + "grad_norm": 5.367190837860107, + "kl": 11.206666946411133, + "learning_rate": 8.241141976538943e-05, + "logits/chosen": -40488704.0, + "logits/rejected": -37375149.333333336, + "logps/chosen": -336.3726806640625, + "logps/rejected": -526.9472249348959, + "loss": 0.0318, + "rewards/chosen": 8.726722717285156, + "rewards/margins": 23.846473693847656, + "rewards/rejected": -15.1197509765625, + "step": 145 + }, + { + "epoch": 0.053357697578803105, + "grad_norm": 6.1537933349609375, + "kl": 7.488029956817627, + "learning_rate": 8.216665803659671e-05, + "logits/chosen": -17490802.285714287, + "logits/rejected": -24749454.545454547, + "logps/chosen": -302.71609933035717, + "logps/rejected": -321.91195401278407, + "loss": 0.0174, + "rewards/chosen": 7.857398623511905, + "rewards/margins": 18.530006045386905, + "rewards/rejected": -10.672607421875, + "step": 146 + }, + { + "epoch": 0.0537231612608497, + "grad_norm": 23.973215103149414, + "kl": 8.004353523254395, + "learning_rate": 8.192057406248028e-05, + "logits/chosen": -11659069.714285715, + "logits/rejected": -31665980.444444444, + "logps/chosen": -329.85323660714283, + "logps/rejected": -498.25352647569446, + "loss": 0.0289, + "rewards/chosen": 9.294958932059151, + "rewards/margins": 24.97127423967634, + "rewards/rejected": -15.676315307617188, + "step": 147 + }, + { + "epoch": 0.0540886249428963, + "grad_norm": 3.193537712097168, + "kl": 0.8649396896362305, + "learning_rate": 8.167317795858851e-05, + "logits/chosen": -14098165.714285715, + "logits/rejected": -19797928.888888888, + "logps/chosen": -238.59490094866072, + "logps/rejected": -433.498779296875, + "loss": 0.0145, + "rewards/chosen": 5.340590340750558, + "rewards/margins": 18.65028780982608, + "rewards/rejected": -13.309697469075521, + "step": 148 + }, + { + "epoch": 0.054454088624942894, + "grad_norm": 4.464662551879883, + "kl": 8.388834953308105, + "learning_rate": 8.142447989440618e-05, + "logits/chosen": -18256884.0, + "logits/rejected": -26674926.0, + "logps/chosen": -288.3346862792969, + "logps/rejected": -408.85137939453125, + "loss": 0.018, + "rewards/chosen": 8.224873542785645, + "rewards/margins": 17.814136505126953, + "rewards/rejected": -9.589262962341309, + "step": 149 + }, + { + "epoch": 0.05481955230698949, + "grad_norm": 15.688850402832031, + "kl": 7.512658596038818, + "learning_rate": 8.117449009293668e-05, + "logits/chosen": -24875326.11764706, + "logits/rejected": -23949489.066666666, + "logps/chosen": -285.2714269301471, + "logps/rejected": -423.3251953125, + "loss": 0.0363, + "rewards/chosen": 8.441160314223346, + "rewards/margins": 17.377709183038448, + "rewards/rejected": -8.936548868815104, + "step": 150 + }, + { + "epoch": 0.055185015989036086, + "grad_norm": 18.866296768188477, + "kl": 6.525827407836914, + "learning_rate": 8.092321883028158e-05, + "logits/chosen": -22583564.0, + "logits/rejected": -32450880.0, + "logps/chosen": -287.40631103515625, + "logps/rejected": -471.9981384277344, + "loss": 0.0304, + "rewards/chosen": 8.566713333129883, + "rewards/margins": 19.275400161743164, + "rewards/rejected": -10.708686828613281, + "step": 151 + }, + { + "epoch": 0.05555047967108269, + "grad_norm": 3.900456428527832, + "kl": 11.181682586669922, + "learning_rate": 8.067067643521834e-05, + "logits/chosen": -15145715.555555556, + "logits/rejected": -12945529.142857144, + "logps/chosen": -271.58203125, + "logps/rejected": -495.61366489955356, + "loss": 0.0086, + "rewards/chosen": 9.396018134223091, + "rewards/margins": 21.07727510966952, + "rewards/rejected": -11.681256975446429, + "step": 152 + }, + { + "epoch": 0.055915943353129285, + "grad_norm": 1.2899030447006226, + "kl": 6.685637950897217, + "learning_rate": 8.041687328877567e-05, + "logits/chosen": -11986612.266666668, + "logits/rejected": -19600841.411764707, + "logps/chosen": -345.98743489583336, + "logps/rejected": -457.33820657169116, + "loss": 0.0076, + "rewards/chosen": 10.541527303059896, + "rewards/margins": 22.469056133195465, + "rewards/rejected": -11.92752883013557, + "step": 153 + }, + { + "epoch": 0.05628140703517588, + "grad_norm": 3.640664577484131, + "kl": 6.153097152709961, + "learning_rate": 8.016181982380682e-05, + "logits/chosen": -27638136.0, + "logits/rejected": -14189388.8, + "logps/chosen": -393.5046793619792, + "logps/rejected": -428.01044921875, + "loss": 0.0074, + "rewards/chosen": 9.385032018025717, + "rewards/margins": 21.475312169392904, + "rewards/rejected": -12.090280151367187, + "step": 154 + }, + { + "epoch": 0.05664687071722248, + "grad_norm": 2.055399179458618, + "kl": 7.720605850219727, + "learning_rate": 7.990552652456081e-05, + "logits/chosen": -20083619.555555556, + "logits/rejected": -25948745.14285714, + "logps/chosen": -345.41004774305554, + "logps/rejected": -467.7896205357143, + "loss": 0.0151, + "rewards/chosen": 9.123803880479601, + "rewards/margins": 20.44527822827536, + "rewards/rejected": -11.321474347795759, + "step": 155 + }, + { + "epoch": 0.057012334399269074, + "grad_norm": 2.521709442138672, + "kl": 4.136674880981445, + "learning_rate": 7.964800392625129e-05, + "logits/chosen": -24411598.769230768, + "logits/rejected": -23131924.210526317, + "logps/chosen": -372.10745943509613, + "logps/rejected": -459.2008634868421, + "loss": 0.0066, + "rewards/chosen": 8.230770404522236, + "rewards/margins": 21.248242521092962, + "rewards/rejected": -13.017472116570724, + "step": 156 + }, + { + "epoch": 0.05737779808131567, + "grad_norm": 2.4811270236968994, + "kl": 4.052708625793457, + "learning_rate": 7.938926261462366e-05, + "logits/chosen": -26443162.181818184, + "logits/rejected": -26732790.85714286, + "logps/chosen": -279.29030539772725, + "logps/rejected": -473.4995349702381, + "loss": 0.0031, + "rewards/chosen": 8.906252774325283, + "rewards/margins": 22.31977467722707, + "rewards/rejected": -13.413521902901786, + "step": 157 + }, + { + "epoch": 0.057743261763362266, + "grad_norm": 23.89385986328125, + "kl": 7.045736312866211, + "learning_rate": 7.91293132255198e-05, + "logits/chosen": -28469895.529411763, + "logits/rejected": -35620768.0, + "logps/chosen": -331.1364315257353, + "logps/rejected": -513.88671875, + "loss": 0.0232, + "rewards/chosen": 10.059974221622243, + "rewards/margins": 26.979096536075367, + "rewards/rejected": -16.919122314453126, + "step": 158 + }, + { + "epoch": 0.05810872544540886, + "grad_norm": 0.40293624997138977, + "kl": 7.891809463500977, + "learning_rate": 7.886816644444098e-05, + "logits/chosen": -20793583.157894738, + "logits/rejected": -41698372.92307692, + "logps/chosen": -325.18911903782896, + "logps/rejected": -543.1114783653846, + "loss": 0.0005, + "rewards/chosen": 11.437842118112664, + "rewards/margins": 30.14801902616555, + "rewards/rejected": -18.710176908052883, + "step": 159 + }, + { + "epoch": 0.05847418912745546, + "grad_norm": 1.8891913890838623, + "kl": 2.793865203857422, + "learning_rate": 7.860583300610849e-05, + "logits/chosen": -29260708.923076924, + "logits/rejected": -18127292.63157895, + "logps/chosen": -321.8430363581731, + "logps/rejected": -431.8196957236842, + "loss": 0.0023, + "rewards/chosen": 8.246222275954027, + "rewards/margins": 22.301471941866858, + "rewards/rejected": -14.055249665912829, + "step": 160 + }, + { + "epoch": 0.058839652809502055, + "grad_norm": 1.9422544240951538, + "kl": 12.665194511413574, + "learning_rate": 7.83423236940225e-05, + "logits/chosen": -28159568.0, + "logits/rejected": -24138496.0, + "logps/chosen": -394.5556884765625, + "logps/rejected": -578.223876953125, + "loss": 0.008, + "rewards/chosen": 11.073470306396484, + "rewards/margins": 31.373287709554035, + "rewards/rejected": -20.29981740315755, + "step": 161 + }, + { + "epoch": 0.05920511649154865, + "grad_norm": 3.1766672134399414, + "kl": 6.990899085998535, + "learning_rate": 7.807764934001874e-05, + "logits/chosen": -26544121.6, + "logits/rejected": -19686550.666666668, + "logps/chosen": -333.9852294921875, + "logps/rejected": -511.3344319661458, + "loss": 0.0076, + "rewards/chosen": 9.996422576904298, + "rewards/margins": 28.168974049886067, + "rewards/rejected": -18.17255147298177, + "step": 162 + }, + { + "epoch": 0.05957058017359525, + "grad_norm": 0.21863658726215363, + "kl": 0.14936065673828125, + "learning_rate": 7.781182082382325e-05, + "logits/chosen": -29460635.42857143, + "logits/rejected": -42360552.96, + "logps/chosen": -282.3582066127232, + "logps/rejected": -567.3075, + "loss": 0.0003, + "rewards/chosen": 8.516668592180524, + "rewards/margins": 33.123619275774274, + "rewards/rejected": -24.60695068359375, + "step": 163 + }, + { + "epoch": 0.05993604385564184, + "grad_norm": 1.2451578378677368, + "kl": 0.5987348556518555, + "learning_rate": 7.754484907260513e-05, + "logits/chosen": -20192930.133333333, + "logits/rejected": -42657400.47058824, + "logps/chosen": -346.25188802083335, + "logps/rejected": -437.8046013327206, + "loss": 0.0022, + "rewards/chosen": 9.76473388671875, + "rewards/margins": 25.446478630514704, + "rewards/rejected": -15.681744743795957, + "step": 164 + }, + { + "epoch": 0.06030150753768844, + "grad_norm": 3.866580009460449, + "kl": 2.7086315155029297, + "learning_rate": 7.727674506052743e-05, + "logits/chosen": -26440046.0, + "logits/rejected": -29380092.0, + "logps/chosen": -381.23895263671875, + "logps/rejected": -377.08551025390625, + "loss": 0.0044, + "rewards/chosen": 8.652615547180176, + "rewards/margins": 23.57578754425049, + "rewards/rejected": -14.923171997070312, + "step": 165 + }, + { + "epoch": 0.060666971219735036, + "grad_norm": 2.844273567199707, + "kl": 4.637930870056152, + "learning_rate": 7.700751980829602e-05, + "logits/chosen": -31675544.888888888, + "logits/rejected": -27553568.0, + "logps/chosen": -372.06182183159723, + "logps/rejected": -462.28529575892856, + "loss": 0.0158, + "rewards/chosen": 9.151282416449654, + "rewards/margins": 27.116133795844185, + "rewards/rejected": -17.96485137939453, + "step": 166 + }, + { + "epoch": 0.06103243490178163, + "grad_norm": 3.541506767272949, + "kl": 6.787299156188965, + "learning_rate": 7.673718438270648e-05, + "logits/chosen": -22027502.0, + "logits/rejected": -34251656.0, + "logps/chosen": -347.6698303222656, + "logps/rejected": -606.9769897460938, + "loss": 0.0039, + "rewards/chosen": 11.097044944763184, + "rewards/margins": 33.89655590057373, + "rewards/rejected": -22.799510955810547, + "step": 167 + }, + { + "epoch": 0.061397898583828235, + "grad_norm": 0.833300769329071, + "kl": 2.59067440032959, + "learning_rate": 7.646574989618938e-05, + "logits/chosen": -27996830.11764706, + "logits/rejected": -28619340.8, + "logps/chosen": -333.8934972426471, + "logps/rejected": -350.493359375, + "loss": 0.0007, + "rewards/chosen": 11.668790031881894, + "rewards/margins": 26.86366613051471, + "rewards/rejected": -15.194876098632813, + "step": 168 + }, + { + "epoch": 0.06176336226587483, + "grad_norm": 3.527308702468872, + "kl": 7.594015121459961, + "learning_rate": 7.619322750635327e-05, + "logits/chosen": -26531843.76470588, + "logits/rejected": -35968247.46666667, + "logps/chosen": -345.0752814797794, + "logps/rejected": -437.68125, + "loss": 0.0288, + "rewards/chosen": 9.16690871294807, + "rewards/margins": 26.81819858925015, + "rewards/rejected": -17.651289876302084, + "step": 169 + }, + { + "epoch": 0.06212882594792143, + "grad_norm": 1.2971023321151733, + "kl": 2.31536865234375, + "learning_rate": 7.591962841552627e-05, + "logits/chosen": -26244359.384615384, + "logits/rejected": -45446366.315789476, + "logps/chosen": -313.24350210336536, + "logps/rejected": -513.1398540296053, + "loss": 0.0074, + "rewards/chosen": 6.6549805861253, + "rewards/margins": 24.913075775270038, + "rewards/rejected": -18.258095189144736, + "step": 170 + }, + { + "epoch": 0.062494289629968024, + "grad_norm": 3.557342052459717, + "kl": 5.60274076461792, + "learning_rate": 7.564496387029532e-05, + "logits/chosen": -39386353.45454545, + "logits/rejected": -20774008.38095238, + "logps/chosen": -520.8385564630681, + "logps/rejected": -550.8647693452381, + "loss": 0.0082, + "rewards/chosen": 11.914052789861506, + "rewards/margins": 32.00259881618219, + "rewards/rejected": -20.088546026320685, + "step": 171 + }, + { + "epoch": 0.06285975331201461, + "grad_norm": 3.5589373111724854, + "kl": 5.933588981628418, + "learning_rate": 7.536924516104411e-05, + "logits/chosen": -27152112.0, + "logits/rejected": -50107624.0, + "logps/chosen": -366.93133544921875, + "logps/rejected": -696.5687255859375, + "loss": 0.0038, + "rewards/chosen": 9.678105354309082, + "rewards/margins": 32.10176181793213, + "rewards/rejected": -22.423656463623047, + "step": 172 + }, + { + "epoch": 0.06322521699406121, + "grad_norm": 3.3520402908325195, + "kl": 2.0572586059570312, + "learning_rate": 7.509248362148889e-05, + "logits/chosen": -43752658.28571428, + "logits/rejected": -29103024.0, + "logps/chosen": -359.9296177455357, + "logps/rejected": -394.8307291666667, + "loss": 0.0059, + "rewards/chosen": 10.492998395647321, + "rewards/margins": 25.35122559562562, + "rewards/rejected": -14.858227199978298, + "step": 173 + }, + { + "epoch": 0.0635906806761078, + "grad_norm": 5.46427583694458, + "kl": 10.225854873657227, + "learning_rate": 7.481469062821252e-05, + "logits/chosen": -16171220.705882354, + "logits/rejected": -40917333.333333336, + "logps/chosen": -379.10437729779414, + "logps/rejected": -585.1021484375, + "loss": 0.0109, + "rewards/chosen": 9.008084465475644, + "rewards/margins": 25.82785578709023, + "rewards/rejected": -16.819771321614585, + "step": 174 + }, + { + "epoch": 0.06395614435815442, + "grad_norm": 5.1015625, + "kl": 2.563922882080078, + "learning_rate": 7.45358776001969e-05, + "logits/chosen": -16222793.846153846, + "logits/rejected": -27810910.315789472, + "logps/chosen": -295.1984299879808, + "logps/rejected": -432.94351356907896, + "loss": 0.0109, + "rewards/chosen": 7.500771155724158, + "rewards/margins": 23.959707345074488, + "rewards/rejected": -16.45893618935033, + "step": 175 + }, + { + "epoch": 0.06432160804020101, + "grad_norm": 3.8610777854919434, + "kl": 7.177302360534668, + "learning_rate": 7.425605599835361e-05, + "logits/chosen": -25172003.76470588, + "logits/rejected": -22787142.4, + "logps/chosen": -292.3703182444853, + "logps/rejected": -470.09189453125, + "loss": 0.0132, + "rewards/chosen": 8.342832677504596, + "rewards/margins": 23.846096023858763, + "rewards/rejected": -15.503263346354167, + "step": 176 + }, + { + "epoch": 0.06468707172224761, + "grad_norm": 2.0071771144866943, + "kl": 4.37033748626709, + "learning_rate": 7.39752373250527e-05, + "logits/chosen": -22452845.47368421, + "logits/rejected": -18204893.53846154, + "logps/chosen": -353.97923519736844, + "logps/rejected": -355.22701322115387, + "loss": 0.014, + "rewards/chosen": 8.854493793688322, + "rewards/margins": 23.776462693928707, + "rewards/rejected": -14.921968900240385, + "step": 177 + }, + { + "epoch": 0.0650525354042942, + "grad_norm": 3.627746820449829, + "kl": 1.5145297050476074, + "learning_rate": 7.369343312364993e-05, + "logits/chosen": -15702925.090909092, + "logits/rejected": -31865740.19047619, + "logps/chosen": -406.5943714488636, + "logps/rejected": -511.3160807291667, + "loss": 0.0046, + "rewards/chosen": 7.576521439985796, + "rewards/margins": 28.051678678174042, + "rewards/rejected": -20.475157238188245, + "step": 178 + }, + { + "epoch": 0.0654179990863408, + "grad_norm": 6.4200897216796875, + "kl": 0.8539514541625977, + "learning_rate": 7.34106549780123e-05, + "logits/chosen": -24184929.777777776, + "logits/rejected": -28082998.85714286, + "logps/chosen": -268.18614366319446, + "logps/rejected": -423.1952427455357, + "loss": 0.0238, + "rewards/chosen": 8.05920155843099, + "rewards/margins": 26.53423381987072, + "rewards/rejected": -18.475032261439733, + "step": 179 + }, + { + "epoch": 0.0657834627683874, + "grad_norm": 7.6310038566589355, + "kl": 5.401153564453125, + "learning_rate": 7.312691451204178e-05, + "logits/chosen": -18243305.411764707, + "logits/rejected": -31956266.666666668, + "logps/chosen": -348.9458869485294, + "logps/rejected": -531.7514322916667, + "loss": 0.0095, + "rewards/chosen": 8.305734073414522, + "rewards/margins": 27.884455183440565, + "rewards/rejected": -19.57872111002604, + "step": 180 + }, + { + "epoch": 0.06614892645043399, + "grad_norm": 2.335725784301758, + "kl": 7.708461761474609, + "learning_rate": 7.284222338919758e-05, + "logits/chosen": -26012860.63157895, + "logits/rejected": -42592851.692307696, + "logps/chosen": -356.98671361019734, + "logps/rejected": -538.6518930288462, + "loss": 0.0081, + "rewards/chosen": 10.63698377107319, + "rewards/margins": 29.442090300895906, + "rewards/rejected": -18.805106529822716, + "step": 181 + }, + { + "epoch": 0.06651439013248059, + "grad_norm": 0.36761146783828735, + "kl": 0.5928888320922852, + "learning_rate": 7.255659331201673e-05, + "logits/chosen": -12418821.0, + "logits/rejected": -28695830.0, + "logps/chosen": -296.0104064941406, + "logps/rejected": -399.7171630859375, + "loss": 0.0005, + "rewards/chosen": 8.365544319152832, + "rewards/margins": 23.712489128112793, + "rewards/rejected": -15.346944808959961, + "step": 182 + }, + { + "epoch": 0.06687985381452718, + "grad_norm": 3.1898019313812256, + "kl": 4.412371635437012, + "learning_rate": 7.227003602163295e-05, + "logits/chosen": -32701537.88235294, + "logits/rejected": -27279308.8, + "logps/chosen": -333.3812902113971, + "logps/rejected": -495.849609375, + "loss": 0.0035, + "rewards/chosen": 8.908136704388786, + "rewards/margins": 27.76432973824295, + "rewards/rejected": -18.856193033854165, + "step": 183 + }, + { + "epoch": 0.06724531749657378, + "grad_norm": 5.877505302429199, + "kl": 9.530031204223633, + "learning_rate": 7.198256329729412e-05, + "logits/chosen": -26423022.222222224, + "logits/rejected": -31291769.14285714, + "logps/chosen": -414.0874294704861, + "logps/rejected": -372.7887486049107, + "loss": 0.0258, + "rewards/chosen": 8.350198533799913, + "rewards/margins": 22.83146401057168, + "rewards/rejected": -14.481265476771764, + "step": 184 + }, + { + "epoch": 0.06761078117862038, + "grad_norm": 2.177922248840332, + "kl": 5.663725852966309, + "learning_rate": 7.169418695587791e-05, + "logits/chosen": -17323242.666666668, + "logits/rejected": -35728736.0, + "logps/chosen": -327.0119900173611, + "logps/rejected": -412.28857421875, + "loss": 0.0097, + "rewards/chosen": 8.765028211805555, + "rewards/margins": 21.80064440530444, + "rewards/rejected": -13.035616193498884, + "step": 185 + }, + { + "epoch": 0.06797624486066697, + "grad_norm": 3.550206422805786, + "kl": 3.0919437408447266, + "learning_rate": 7.14049188514063e-05, + "logits/chosen": -32884610.133333333, + "logits/rejected": -26519314.82352941, + "logps/chosen": -367.80719401041665, + "logps/rejected": -523.7941176470588, + "loss": 0.0133, + "rewards/chosen": 9.515175374348958, + "rewards/margins": 26.152344707414215, + "rewards/rejected": -16.63716933306526, + "step": 186 + }, + { + "epoch": 0.06834170854271357, + "grad_norm": 10.884556770324707, + "kl": 1.7897157669067383, + "learning_rate": 7.1114770874558e-05, + "logits/chosen": -22171801.333333332, + "logits/rejected": -25333996.8, + "logps/chosen": -309.26092529296875, + "logps/rejected": -387.01396484375, + "loss": 0.0328, + "rewards/chosen": 7.895773569742839, + "rewards/margins": 19.213196818033854, + "rewards/rejected": -11.317423248291016, + "step": 187 + }, + { + "epoch": 0.06870717222476017, + "grad_norm": 5.466808795928955, + "kl": 2.5527238845825195, + "learning_rate": 7.082375495217995e-05, + "logits/chosen": -18060890.181818184, + "logits/rejected": -30156083.80952381, + "logps/chosen": -302.6913396661932, + "logps/rejected": -556.6991722470239, + "loss": 0.0082, + "rewards/chosen": 7.865567294034091, + "rewards/margins": 24.122369807519952, + "rewards/rejected": -16.256802513485862, + "step": 188 + }, + { + "epoch": 0.06907263590680676, + "grad_norm": 3.0002336502075195, + "kl": 8.360982894897461, + "learning_rate": 7.05318830467969e-05, + "logits/chosen": -16755614.11764706, + "logits/rejected": -21894227.2, + "logps/chosen": -408.18017578125, + "logps/rejected": -459.7638346354167, + "loss": 0.0056, + "rewards/chosen": 9.41691140567555, + "rewards/margins": 26.219946887446383, + "rewards/rejected": -16.803035481770834, + "step": 189 + }, + { + "epoch": 0.06943809958885336, + "grad_norm": 4.142535209655762, + "kl": 7.133042335510254, + "learning_rate": 7.023916715611969e-05, + "logits/chosen": -23126666.666666668, + "logits/rejected": -20533851.42857143, + "logps/chosen": -387.868408203125, + "logps/rejected": -469.752685546875, + "loss": 0.0177, + "rewards/chosen": 10.944979349772135, + "rewards/margins": 31.230931236630397, + "rewards/rejected": -20.28595188685826, + "step": 190 + }, + { + "epoch": 0.06980356327089995, + "grad_norm": 2.85412335395813, + "kl": 11.062262535095215, + "learning_rate": 6.99456193125521e-05, + "logits/chosen": -27307265.777777776, + "logits/rejected": -28493636.57142857, + "logps/chosen": -354.16937934027777, + "logps/rejected": -471.00142996651783, + "loss": 0.0128, + "rewards/chosen": 10.214418199327257, + "rewards/margins": 24.905228266640314, + "rewards/rejected": -14.690810067313057, + "step": 191 + }, + { + "epoch": 0.07016902695294655, + "grad_norm": 1.3512822389602661, + "kl": 1.3055610656738281, + "learning_rate": 6.965125158269619e-05, + "logits/chosen": -16317834.666666666, + "logits/rejected": -16164295.529411765, + "logps/chosen": -380.50751953125, + "logps/rejected": -408.55471622242646, + "loss": 0.0015, + "rewards/chosen": 9.312572224934895, + "rewards/margins": 24.847101967007504, + "rewards/rejected": -15.53452974207261, + "step": 192 + }, + { + "epoch": 0.07053449063499315, + "grad_norm": 4.69804573059082, + "kl": 4.003072738647461, + "learning_rate": 6.935607606685642e-05, + "logits/chosen": -11690886.0, + "logits/rejected": -32503590.0, + "logps/chosen": -323.9889831542969, + "logps/rejected": -587.8881225585938, + "loss": 0.0088, + "rewards/chosen": 8.519304275512695, + "rewards/margins": 24.23267364501953, + "rewards/rejected": -15.713369369506836, + "step": 193 + }, + { + "epoch": 0.07089995431703974, + "grad_norm": 6.894629001617432, + "kl": 4.381341934204102, + "learning_rate": 6.906010489854209e-05, + "logits/chosen": -15500551.384615384, + "logits/rejected": -23350785.684210528, + "logps/chosen": -336.66488882211536, + "logps/rejected": -493.66015625, + "loss": 0.01, + "rewards/chosen": 9.593859159029448, + "rewards/margins": 29.907944466903622, + "rewards/rejected": -20.314085307874176, + "step": 194 + }, + { + "epoch": 0.07126541799908634, + "grad_norm": 4.009649276733398, + "kl": 2.7879061698913574, + "learning_rate": 6.876335024396872e-05, + "logits/chosen": -16030878.11764706, + "logits/rejected": -25318824.533333335, + "logps/chosen": -379.2220243566176, + "logps/rejected": -391.873046875, + "loss": 0.0154, + "rewards/chosen": 9.184873693129596, + "rewards/margins": 24.023857864679073, + "rewards/rejected": -14.838984171549479, + "step": 195 + }, + { + "epoch": 0.07163088168113294, + "grad_norm": 3.0264062881469727, + "kl": 3.0212202072143555, + "learning_rate": 6.846582430155783e-05, + "logits/chosen": -28066490.666666668, + "logits/rejected": -20612891.2, + "logps/chosen": -404.5044352213542, + "logps/rejected": -499.241943359375, + "loss": 0.0056, + "rewards/chosen": 11.594781239827475, + "rewards/margins": 30.13075383504232, + "rewards/rejected": -18.535972595214844, + "step": 196 + }, + { + "epoch": 0.07199634536317953, + "grad_norm": 7.040284633636475, + "kl": 6.741287708282471, + "learning_rate": 6.816753930143558e-05, + "logits/chosen": -30354290.52631579, + "logits/rejected": -21988878.769230768, + "logps/chosen": -410.85572574013156, + "logps/rejected": -503.72554837740387, + "loss": 0.0168, + "rewards/chosen": 9.378283048930921, + "rewards/margins": 26.061604488233804, + "rewards/rejected": -16.683321439302883, + "step": 197 + }, + { + "epoch": 0.07236180904522613, + "grad_norm": 2.600749969482422, + "kl": 5.778753280639648, + "learning_rate": 6.786850750493006e-05, + "logits/chosen": -26646170.0, + "logits/rejected": -32755876.0, + "logps/chosen": -330.4010009765625, + "logps/rejected": -559.676025390625, + "loss": 0.0029, + "rewards/chosen": 11.444527626037598, + "rewards/margins": 32.67663860321045, + "rewards/rejected": -21.23211097717285, + "step": 198 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 2.6039352416992188, + "kl": 6.420266151428223, + "learning_rate": 6.756874120406714e-05, + "logits/chosen": -17844263.529411763, + "logits/rejected": -29301909.333333332, + "logps/chosen": -283.4960075827206, + "logps/rejected": -408.2862955729167, + "loss": 0.0064, + "rewards/chosen": 11.534720028147978, + "rewards/margins": 28.537897925283396, + "rewards/rejected": -17.003177897135416, + "step": 199 + }, + { + "epoch": 0.07309273640931932, + "grad_norm": 21.8624210357666, + "kl": 9.30317497253418, + "learning_rate": 6.726825272106538e-05, + "logits/chosen": -17542368.0, + "logits/rejected": -33012054.4, + "logps/chosen": -447.385009765625, + "logps/rejected": -426.5275390625, + "loss": 0.0312, + "rewards/chosen": 11.985382080078125, + "rewards/margins": 26.30437774658203, + "rewards/rejected": -14.318995666503906, + "step": 200 + }, + { + "epoch": 0.07345820009136592, + "grad_norm": 2.5858287811279297, + "kl": 3.965498447418213, + "learning_rate": 6.696705440782938e-05, + "logits/chosen": -9650119.529411765, + "logits/rejected": -33370368.0, + "logps/chosen": -325.3095128676471, + "logps/rejected": -472.2998046875, + "loss": 0.0065, + "rewards/chosen": 8.496898875517005, + "rewards/margins": 27.648915070178465, + "rewards/rejected": -19.15201619466146, + "step": 201 + }, + { + "epoch": 0.07382366377341251, + "grad_norm": 3.9915287494659424, + "kl": 4.856705188751221, + "learning_rate": 6.666515864544209e-05, + "logits/chosen": -10674379.733333332, + "logits/rejected": -26997652.70588235, + "logps/chosen": -341.36578776041665, + "logps/rejected": -388.1389590992647, + "loss": 0.0125, + "rewards/chosen": 9.1323486328125, + "rewards/margins": 23.453135950425093, + "rewards/rejected": -14.320787317612591, + "step": 202 + }, + { + "epoch": 0.07418912745545911, + "grad_norm": 1.1339577436447144, + "kl": 8.12137222290039, + "learning_rate": 6.636257784365584e-05, + "logits/chosen": -28512064.0, + "logits/rejected": -31698720.0, + "logps/chosen": -406.0943603515625, + "logps/rejected": -469.087109375, + "loss": 0.0009, + "rewards/chosen": 11.925963083902994, + "rewards/margins": 28.539972178141277, + "rewards/rejected": -16.61400909423828, + "step": 203 + }, + { + "epoch": 0.0745545911375057, + "grad_norm": 3.5026252269744873, + "kl": 10.456775665283203, + "learning_rate": 6.605932444038229e-05, + "logits/chosen": -23463275.789473683, + "logits/rejected": -25056219.076923076, + "logps/chosen": -392.6201171875, + "logps/rejected": -383.59487680288464, + "loss": 0.0106, + "rewards/chosen": 10.768471968801398, + "rewards/margins": 25.321538515901757, + "rewards/rejected": -14.55306654710036, + "step": 204 + }, + { + "epoch": 0.0749200548195523, + "grad_norm": 3.906362533569336, + "kl": 3.322394371032715, + "learning_rate": 6.575541090118105e-05, + "logits/chosen": -15754039.0, + "logits/rejected": -17539962.0, + "logps/chosen": -350.6761169433594, + "logps/rejected": -438.69488525390625, + "loss": 0.0049, + "rewards/chosen": 10.065705299377441, + "rewards/margins": 27.962946891784668, + "rewards/rejected": -17.897241592407227, + "step": 205 + }, + { + "epoch": 0.0752855185015989, + "grad_norm": 1.6618068218231201, + "kl": 3.0612802505493164, + "learning_rate": 6.545084971874738e-05, + "logits/chosen": -17744706.0, + "logits/rejected": -30604094.0, + "logps/chosen": -380.76800537109375, + "logps/rejected": -345.7037048339844, + "loss": 0.0021, + "rewards/chosen": 10.43433952331543, + "rewards/margins": 23.519795417785645, + "rewards/rejected": -13.085455894470215, + "step": 206 + }, + { + "epoch": 0.07565098218364551, + "grad_norm": 4.202422142028809, + "kl": 0.6333751678466797, + "learning_rate": 6.514565341239861e-05, + "logits/chosen": -12794493.090909092, + "logits/rejected": -26353200.76190476, + "logps/chosen": -282.27676669034093, + "logps/rejected": -562.2889694940476, + "loss": 0.0028, + "rewards/chosen": 10.231034712357955, + "rewards/margins": 30.530086632930875, + "rewards/rejected": -20.299051920572918, + "step": 207 + }, + { + "epoch": 0.0760164458656921, + "grad_norm": 10.078681945800781, + "kl": 8.571239471435547, + "learning_rate": 6.483983452755953e-05, + "logits/chosen": -23175141.333333332, + "logits/rejected": -19565435.42857143, + "logps/chosen": -344.9137912326389, + "logps/rejected": -472.61160714285717, + "loss": 0.0271, + "rewards/chosen": 9.744666205512154, + "rewards/margins": 25.424937899150546, + "rewards/rejected": -15.680271693638392, + "step": 208 + }, + { + "epoch": 0.0763819095477387, + "grad_norm": 3.214233160018921, + "kl": 4.615579605102539, + "learning_rate": 6.453340563524669e-05, + "logits/chosen": -27926275.36842105, + "logits/rejected": -43859387.07692308, + "logps/chosen": -343.65013363486844, + "logps/rejected": -648.1381460336538, + "loss": 0.0144, + "rewards/chosen": 9.325342278731497, + "rewards/margins": 26.778057638932818, + "rewards/rejected": -17.45271536020132, + "step": 209 + }, + { + "epoch": 0.0767473732297853, + "grad_norm": 4.102214813232422, + "kl": 12.734009742736816, + "learning_rate": 6.422637933155162e-05, + "logits/chosen": -21794382.222222224, + "logits/rejected": -37066509.71428572, + "logps/chosen": -374.322265625, + "logps/rejected": -411.320556640625, + "loss": 0.0151, + "rewards/chosen": 11.905857510036892, + "rewards/margins": 31.9157234070793, + "rewards/rejected": -20.00986589704241, + "step": 210 + }, + { + "epoch": 0.07711283691183189, + "grad_norm": 1.798668622970581, + "kl": 11.159156799316406, + "learning_rate": 6.391876823712317e-05, + "logits/chosen": -14627946.666666666, + "logits/rejected": -22159533.714285713, + "logps/chosen": -348.218017578125, + "logps/rejected": -406.6967075892857, + "loss": 0.0084, + "rewards/chosen": 10.321999443901909, + "rewards/margins": 25.591409713502912, + "rewards/rejected": -15.269410269601005, + "step": 211 + }, + { + "epoch": 0.07747830059387849, + "grad_norm": 1.6394319534301758, + "kl": 1.3272171020507812, + "learning_rate": 6.361058499664856e-05, + "logits/chosen": -15764256.0, + "logits/rejected": -22281297.066666666, + "logps/chosen": -284.83108340992646, + "logps/rejected": -419.1984049479167, + "loss": 0.004, + "rewards/chosen": 8.969033633961397, + "rewards/margins": 27.95632407992494, + "rewards/rejected": -18.98729044596354, + "step": 212 + }, + { + "epoch": 0.07784376427592508, + "grad_norm": 7.77843713760376, + "kl": 9.889145851135254, + "learning_rate": 6.330184227833376e-05, + "logits/chosen": -25250788.266666666, + "logits/rejected": -13045229.176470589, + "logps/chosen": -413.92347005208336, + "logps/rejected": -360.48127297794116, + "loss": 0.0251, + "rewards/chosen": 10.608176676432292, + "rewards/margins": 25.95429448146446, + "rewards/rejected": -15.346117805032168, + "step": 213 + }, + { + "epoch": 0.07820922795797168, + "grad_norm": 20.824413299560547, + "kl": 14.523167610168457, + "learning_rate": 6.299255277338265e-05, + "logits/chosen": -19816242.285714287, + "logits/rejected": -40381358.54545455, + "logps/chosen": -422.2455357142857, + "logps/rejected": -556.1534978693181, + "loss": 0.0265, + "rewards/chosen": 11.749418712797619, + "rewards/margins": 30.31490111247802, + "rewards/rejected": -18.5654823996804, + "step": 214 + }, + { + "epoch": 0.07857469164001828, + "grad_norm": 3.581092119216919, + "kl": 3.292311668395996, + "learning_rate": 6.268272919547537e-05, + "logits/chosen": -24386698.0, + "logits/rejected": -31041716.0, + "logps/chosen": -345.5860595703125, + "logps/rejected": -566.6504516601562, + "loss": 0.0164, + "rewards/chosen": 7.3111958503723145, + "rewards/margins": 29.830772876739502, + "rewards/rejected": -22.519577026367188, + "step": 215 + }, + { + "epoch": 0.07894015532206487, + "grad_norm": 0.4833795726299286, + "kl": 4.088086128234863, + "learning_rate": 6.237238428024572e-05, + "logits/chosen": -33289578.666666668, + "logits/rejected": -28853592.470588237, + "logps/chosen": -423.1152669270833, + "logps/rejected": -492.9146943933824, + "loss": 0.0005, + "rewards/chosen": 11.488334147135417, + "rewards/margins": 29.890440937117035, + "rewards/rejected": -18.402106789981616, + "step": 216 + }, + { + "epoch": 0.07930561900411147, + "grad_norm": 2.3684840202331543, + "kl": 1.4258842468261719, + "learning_rate": 6.206153078475763e-05, + "logits/chosen": -25143190.4, + "logits/rejected": -23845405.09090909, + "logps/chosen": -348.024267578125, + "logps/rejected": -462.46826171875, + "loss": 0.0071, + "rewards/chosen": 8.960430145263672, + "rewards/margins": 26.658276991410688, + "rewards/rejected": -17.697846846147016, + "step": 217 + }, + { + "epoch": 0.07967108268615807, + "grad_norm": 3.4027488231658936, + "kl": 3.7008113861083984, + "learning_rate": 6.175018148698077e-05, + "logits/chosen": -29465239.57894737, + "logits/rejected": -32449619.692307692, + "logps/chosen": -399.4223889802632, + "logps/rejected": -417.22164212740387, + "loss": 0.0096, + "rewards/chosen": 8.753948010896382, + "rewards/margins": 21.67959427930083, + "rewards/rejected": -12.925646268404448, + "step": 218 + }, + { + "epoch": 0.08003654636820466, + "grad_norm": 5.0857672691345215, + "kl": 5.551398277282715, + "learning_rate": 6.143834918526527e-05, + "logits/chosen": -16367331.764705881, + "logits/rejected": -24942521.6, + "logps/chosen": -356.2471564797794, + "logps/rejected": -417.2796875, + "loss": 0.0205, + "rewards/chosen": 8.824462890625, + "rewards/margins": 27.034195963541666, + "rewards/rejected": -18.209733072916666, + "step": 219 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 5.110848426818848, + "kl": 5.129820823669434, + "learning_rate": 6.112604669781572e-05, + "logits/chosen": -3879200.5, + "logits/rejected": -24118480.0, + "logps/chosen": -329.4516906738281, + "logps/rejected": -450.46142578125, + "loss": 0.0129, + "rewards/chosen": 8.563543319702148, + "rewards/margins": 29.11154556274414, + "rewards/rejected": -20.548002243041992, + "step": 220 + }, + { + "epoch": 0.08076747373229785, + "grad_norm": 1.9817887544631958, + "kl": 2.890030860900879, + "learning_rate": 6.081328686216418e-05, + "logits/chosen": -37181917.86666667, + "logits/rejected": -40966580.705882356, + "logps/chosen": -413.9736653645833, + "logps/rejected": -630.806640625, + "loss": 0.0033, + "rewards/chosen": 10.92909647623698, + "rewards/margins": 36.46276634066713, + "rewards/rejected": -25.53366986443015, + "step": 221 + }, + { + "epoch": 0.08113293741434445, + "grad_norm": 3.6742677688598633, + "kl": 6.32096529006958, + "learning_rate": 6.0500082534642464e-05, + "logits/chosen": -32207366.4, + "logits/rejected": -22394597.647058822, + "logps/chosen": -402.03782552083334, + "logps/rejected": -504.08616727941177, + "loss": 0.0053, + "rewards/chosen": 10.916023763020833, + "rewards/margins": 30.193345971200984, + "rewards/rejected": -19.27732220818015, + "step": 222 + }, + { + "epoch": 0.08149840109639105, + "grad_norm": 21.385496139526367, + "kl": 2.4638442993164062, + "learning_rate": 6.0186446589853784e-05, + "logits/chosen": -12316193.454545455, + "logits/rejected": -11932992.0, + "logps/chosen": -411.81010298295456, + "logps/rejected": -446.3408668154762, + "loss": 0.0232, + "rewards/chosen": 10.413556879216975, + "rewards/margins": 26.619274420139597, + "rewards/rejected": -16.20571754092262, + "step": 223 + }, + { + "epoch": 0.08186386477843764, + "grad_norm": 7.697639465332031, + "kl": 10.840798377990723, + "learning_rate": 5.987239192014336e-05, + "logits/chosen": -15168502.857142856, + "logits/rejected": -12821316.363636363, + "logps/chosen": -311.9120628720238, + "logps/rejected": -273.797119140625, + "loss": 0.0284, + "rewards/chosen": 6.746843610491071, + "rewards/margins": 16.119167773754565, + "rewards/rejected": -9.372324163263494, + "step": 224 + }, + { + "epoch": 0.08222932846048424, + "grad_norm": 27.4875545501709, + "kl": 7.211214542388916, + "learning_rate": 5.955793143506863e-05, + "logits/chosen": -12500211.368421054, + "logits/rejected": -20363127.384615384, + "logps/chosen": -317.6380037006579, + "logps/rejected": -462.81689453125, + "loss": 0.0543, + "rewards/chosen": 8.480968274568257, + "rewards/margins": 22.93068352208929, + "rewards/rejected": -14.449715247521034, + "step": 225 + }, + { + "epoch": 0.08259479214253083, + "grad_norm": 5.119180679321289, + "kl": 3.8821263313293457, + "learning_rate": 5.924307806086844e-05, + "logits/chosen": -20646944.0, + "logits/rejected": -4519772.4, + "logps/chosen": -360.2757568359375, + "logps/rejected": -552.817529296875, + "loss": 0.0129, + "rewards/chosen": 8.117130915323893, + "rewards/margins": 26.899151484171547, + "rewards/rejected": -18.782020568847656, + "step": 226 + }, + { + "epoch": 0.08296025582457743, + "grad_norm": 6.482161998748779, + "kl": 0.0, + "learning_rate": 5.8927844739931834e-05, + "logits/chosen": -12243009.454545455, + "logits/rejected": -21820697.904761903, + "logps/chosen": -294.188720703125, + "logps/rejected": -468.8477492559524, + "loss": 0.0073, + "rewards/chosen": 8.640039617365057, + "rewards/margins": 28.237434288123985, + "rewards/rejected": -19.597394670758927, + "step": 227 + }, + { + "epoch": 0.08332571950662403, + "grad_norm": 5.045980930328369, + "kl": 8.78582763671875, + "learning_rate": 5.861224443026595e-05, + "logits/chosen": -5696805.333333333, + "logits/rejected": -7406330.285714285, + "logps/chosen": -302.4457736545139, + "logps/rejected": -459.8894740513393, + "loss": 0.0145, + "rewards/chosen": 8.533818562825521, + "rewards/margins": 23.901186988467263, + "rewards/rejected": -15.367368425641741, + "step": 228 + }, + { + "epoch": 0.08369118318867062, + "grad_norm": 7.285205364227295, + "kl": 9.07259750366211, + "learning_rate": 5.82962901049634e-05, + "logits/chosen": -3893539.294117647, + "logits/rejected": -25990393.6, + "logps/chosen": -337.59142348345586, + "logps/rejected": -577.366796875, + "loss": 0.0182, + "rewards/chosen": 8.984709795783548, + "rewards/margins": 30.35168570724188, + "rewards/rejected": -21.366975911458333, + "step": 229 + }, + { + "epoch": 0.08405664687071722, + "grad_norm": 4.332226276397705, + "kl": 12.022812843322754, + "learning_rate": 5.7979994751668964e-05, + "logits/chosen": -17081276.444444444, + "logits/rejected": -16491866.285714285, + "logps/chosen": -378.99175347222223, + "logps/rejected": -461.9462193080357, + "loss": 0.0175, + "rewards/chosen": 10.123255411783854, + "rewards/margins": 26.100438072567893, + "rewards/rejected": -15.97718266078404, + "step": 230 + }, + { + "epoch": 0.08442211055276382, + "grad_norm": 1.7576537132263184, + "kl": 4.291003227233887, + "learning_rate": 5.766337137204579e-05, + "logits/chosen": -18471228.23529412, + "logits/rejected": -14284845.866666667, + "logps/chosen": -330.1019071691176, + "logps/rejected": -343.8628255208333, + "loss": 0.0055, + "rewards/chosen": 10.958397360409007, + "rewards/margins": 23.948498475317862, + "rewards/rejected": -12.990101114908855, + "step": 231 + }, + { + "epoch": 0.08478757423481041, + "grad_norm": 12.976097106933594, + "kl": 0.06497049331665039, + "learning_rate": 5.7346432981240904e-05, + "logits/chosen": -12242513.23076923, + "logits/rejected": 8392253.47368421, + "logps/chosen": -276.62389197716345, + "logps/rejected": -548.2805304276316, + "loss": 0.0095, + "rewards/chosen": 8.222402132474459, + "rewards/margins": 32.36831791680834, + "rewards/rejected": -24.14591578433388, + "step": 232 + }, + { + "epoch": 0.08515303791685701, + "grad_norm": 4.58160400390625, + "kl": 8.429978370666504, + "learning_rate": 5.7029192607350146e-05, + "logits/chosen": -22118390.0, + "logits/rejected": -24906762.0, + "logps/chosen": -379.9720153808594, + "logps/rejected": -556.9617919921875, + "loss": 0.0139, + "rewards/chosen": 9.560355186462402, + "rewards/margins": 32.60457134246826, + "rewards/rejected": -23.04421615600586, + "step": 233 + }, + { + "epoch": 0.0855185015989036, + "grad_norm": 1.507628083229065, + "kl": 1.1839404106140137, + "learning_rate": 5.6711663290882776e-05, + "logits/chosen": -14845961.142857144, + "logits/rejected": -15407089.777777778, + "logps/chosen": -329.89181082589283, + "logps/rejected": -493.96733940972223, + "loss": 0.0055, + "rewards/chosen": 9.155017307826451, + "rewards/margins": 28.94087376670232, + "rewards/rejected": -19.78585645887587, + "step": 234 + }, + { + "epoch": 0.0858839652809502, + "grad_norm": 3.848444938659668, + "kl": 9.833124160766602, + "learning_rate": 5.6393858084225305e-05, + "logits/chosen": -6498912.0, + "logits/rejected": -18224133.333333332, + "logps/chosen": -333.9006453804348, + "logps/rejected": -521.8710394965278, + "loss": 0.0223, + "rewards/chosen": 7.9921742314877715, + "rewards/margins": 30.287640336631, + "rewards/rejected": -22.29546610514323, + "step": 235 + }, + { + "epoch": 0.0862494289629968, + "grad_norm": 2.7810912132263184, + "kl": 5.483455181121826, + "learning_rate": 5.6075790051105023e-05, + "logits/chosen": -7817208.0, + "logits/rejected": -16416721.0, + "logps/chosen": -362.0771484375, + "logps/rejected": -472.74322509765625, + "loss": 0.005, + "rewards/chosen": 9.401847839355469, + "rewards/margins": 31.938417434692383, + "rewards/rejected": -22.536569595336914, + "step": 236 + }, + { + "epoch": 0.08661489264504339, + "grad_norm": 3.110553741455078, + "kl": 3.809964656829834, + "learning_rate": 5.575747226605298e-05, + "logits/chosen": -8490709.05263158, + "logits/rejected": -11372866.461538462, + "logps/chosen": -400.8346011513158, + "logps/rejected": -545.5910081129807, + "loss": 0.0086, + "rewards/chosen": 8.414623059724507, + "rewards/margins": 33.92073361786753, + "rewards/rejected": -25.50611055814303, + "step": 237 + }, + { + "epoch": 0.08698035632708999, + "grad_norm": 1.600816011428833, + "kl": 2.572188377380371, + "learning_rate": 5.5438917813866554e-05, + "logits/chosen": -11434878.76923077, + "logits/rejected": -13644350.315789474, + "logps/chosen": -315.9757737379808, + "logps/rejected": -508.71952097039474, + "loss": 0.0016, + "rewards/chosen": 10.039117666391226, + "rewards/margins": 29.60621297118152, + "rewards/rejected": -19.567095304790296, + "step": 238 + }, + { + "epoch": 0.0873458200091366, + "grad_norm": 2.0251121520996094, + "kl": 3.4234132766723633, + "learning_rate": 5.512013978907157e-05, + "logits/chosen": -2543081.8666666667, + "logits/rejected": -14225318.588235294, + "logps/chosen": -282.37080078125, + "logps/rejected": -556.6636029411765, + "loss": 0.0043, + "rewards/chosen": 8.273530069986979, + "rewards/margins": 32.72615242752374, + "rewards/rejected": -24.452622357536764, + "step": 239 + }, + { + "epoch": 0.0877112836911832, + "grad_norm": 2.7250382900238037, + "kl": 5.877777099609375, + "learning_rate": 5.480115129538409e-05, + "logits/chosen": -8596929.142857144, + "logits/rejected": -11893044.444444444, + "logps/chosen": -389.1763392857143, + "logps/rejected": -416.30718315972223, + "loss": 0.0099, + "rewards/chosen": 9.133932931082589, + "rewards/margins": 28.867777264307414, + "rewards/rejected": -19.733844333224827, + "step": 240 + }, + { + "epoch": 0.08807674737322979, + "grad_norm": 2.191073417663574, + "kl": 3.252370595932007, + "learning_rate": 5.448196544517168e-05, + "logits/chosen": -16885422.0, + "logits/rejected": -12586504.0, + "logps/chosen": -303.47698974609375, + "logps/rejected": -482.5170593261719, + "loss": 0.0061, + "rewards/chosen": 10.387802124023438, + "rewards/margins": 31.04973602294922, + "rewards/rejected": -20.66193389892578, + "step": 241 + }, + { + "epoch": 0.08844221105527639, + "grad_norm": 1.3790781497955322, + "kl": 1.623072624206543, + "learning_rate": 5.416259535891447e-05, + "logits/chosen": -5154499.0, + "logits/rejected": -19852082.0, + "logps/chosen": -335.4534606933594, + "logps/rejected": -567.6194458007812, + "loss": 0.0022, + "rewards/chosen": 10.556591987609863, + "rewards/margins": 35.19700908660889, + "rewards/rejected": -24.640417098999023, + "step": 242 + }, + { + "epoch": 0.08880767473732298, + "grad_norm": 7.348330497741699, + "kl": 2.0587451457977295, + "learning_rate": 5.384305416466584e-05, + "logits/chosen": -3913020.5, + "logits/rejected": -14499589.0, + "logps/chosen": -292.5793762207031, + "logps/rejected": -463.9847717285156, + "loss": 0.007, + "rewards/chosen": 7.284144878387451, + "rewards/margins": 25.658551692962646, + "rewards/rejected": -18.374406814575195, + "step": 243 + }, + { + "epoch": 0.08917313841936958, + "grad_norm": 1.5713036060333252, + "kl": 3.138895034790039, + "learning_rate": 5.35233549975127e-05, + "logits/chosen": -14131090.133333333, + "logits/rejected": -7379913.882352941, + "logps/chosen": -321.50445963541665, + "logps/rejected": -477.98747702205884, + "loss": 0.0082, + "rewards/chosen": 8.999294026692708, + "rewards/margins": 30.62633690927543, + "rewards/rejected": -21.62704288258272, + "step": 244 + }, + { + "epoch": 0.08953860210141618, + "grad_norm": 10.220050811767578, + "kl": 4.970325469970703, + "learning_rate": 5.320351099903565e-05, + "logits/chosen": -7353914.947368421, + "logits/rejected": -9241026.461538462, + "logps/chosen": -322.42269736842104, + "logps/rejected": -513.96875, + "loss": 0.0261, + "rewards/chosen": 9.780102378443667, + "rewards/margins": 30.248021361316262, + "rewards/rejected": -20.467918982872597, + "step": 245 + }, + { + "epoch": 0.08990406578346277, + "grad_norm": 9.520925521850586, + "kl": 3.0357470512390137, + "learning_rate": 5.288353531676873e-05, + "logits/chosen": -10981987.0, + "logits/rejected": -16637833.0, + "logps/chosen": -346.5321350097656, + "logps/rejected": -475.2821044921875, + "loss": 0.024, + "rewards/chosen": 7.337143421173096, + "rewards/margins": 27.051758289337158, + "rewards/rejected": -19.714614868164062, + "step": 246 + }, + { + "epoch": 0.09026952946550937, + "grad_norm": 0.9409910440444946, + "kl": 1.646733283996582, + "learning_rate": 5.256344110365896e-05, + "logits/chosen": -9656946.133333333, + "logits/rejected": -21951666.82352941, + "logps/chosen": -390.64856770833336, + "logps/rejected": -533.7297219669117, + "loss": 0.0011, + "rewards/chosen": 10.652042643229167, + "rewards/margins": 32.71965343998928, + "rewards/rejected": -22.06761079676011, + "step": 247 + }, + { + "epoch": 0.09063499314755596, + "grad_norm": 3.4276442527770996, + "kl": 5.049632549285889, + "learning_rate": 5.2243241517525754e-05, + "logits/chosen": 8104922.666666667, + "logits/rejected": -13931694.11764706, + "logps/chosen": -311.51455078125, + "logps/rejected": -416.35816865808823, + "loss": 0.0276, + "rewards/chosen": 8.877303059895834, + "rewards/margins": 25.51912363089767, + "rewards/rejected": -16.641820571001837, + "step": 248 + }, + { + "epoch": 0.09100045682960256, + "grad_norm": 2.8090860843658447, + "kl": 3.980405807495117, + "learning_rate": 5.192294972051992e-05, + "logits/chosen": -16458994.461538462, + "logits/rejected": -16097566.315789474, + "logps/chosen": -326.67919921875, + "logps/rejected": -503.0575657894737, + "loss": 0.0045, + "rewards/chosen": 10.4029294527494, + "rewards/margins": 27.084651993353837, + "rewards/rejected": -16.68172254060444, + "step": 249 + }, + { + "epoch": 0.09136592051164916, + "grad_norm": 4.2613701820373535, + "kl": 4.3160295486450195, + "learning_rate": 5.1602578878582776e-05, + "logits/chosen": -13023320.0, + "logits/rejected": -13718616.0, + "logps/chosen": -329.45770263671875, + "logps/rejected": -412.9193115234375, + "loss": 0.0105, + "rewards/chosen": 9.6162748336792, + "rewards/margins": 26.780089378356934, + "rewards/rejected": -17.163814544677734, + "step": 250 + }, + { + "epoch": 0.09173138419369575, + "grad_norm": 3.3793652057647705, + "kl": 3.3492918014526367, + "learning_rate": 5.128214216090478e-05, + "logits/chosen": -13236014.76923077, + "logits/rejected": -6835835.7894736845, + "logps/chosen": -367.9299504206731, + "logps/rejected": -544.3783922697369, + "loss": 0.0055, + "rewards/chosen": 10.374457726111778, + "rewards/margins": 31.173101139454705, + "rewards/rejected": -20.798643413342926, + "step": 251 + }, + { + "epoch": 0.09209684787574235, + "grad_norm": 2.0119693279266357, + "kl": 3.09554123878479, + "learning_rate": 5.0961652739384356e-05, + "logits/chosen": -9913277.538461538, + "logits/rejected": -12488528.842105264, + "logps/chosen": -372.5987079326923, + "logps/rejected": -466.2408511513158, + "loss": 0.005, + "rewards/chosen": 9.688338059645433, + "rewards/margins": 29.738807709110894, + "rewards/rejected": -20.05046964946546, + "step": 252 + }, + { + "epoch": 0.09246231155778895, + "grad_norm": 0.4406461715698242, + "kl": 3.5766706466674805, + "learning_rate": 5.064112378808637e-05, + "logits/chosen": -11020506.285714285, + "logits/rejected": -15421498.666666666, + "logps/chosen": -335.23887416294644, + "logps/rejected": -548.6393229166666, + "loss": 0.0005, + "rewards/chosen": 10.790042332240514, + "rewards/margins": 34.74576302180215, + "rewards/rejected": -23.95572068956163, + "step": 253 + }, + { + "epoch": 0.09282777523983554, + "grad_norm": 8.106298446655273, + "kl": 7.274707794189453, + "learning_rate": 5.0320568482700556e-05, + "logits/chosen": -13233587.0, + "logits/rejected": -9594603.0, + "logps/chosen": -316.76092529296875, + "logps/rejected": -471.2465515136719, + "loss": 0.0101, + "rewards/chosen": 11.981158256530762, + "rewards/margins": 31.17054843902588, + "rewards/rejected": -19.189390182495117, + "step": 254 + }, + { + "epoch": 0.09319323892188214, + "grad_norm": 3.8371400833129883, + "kl": 4.641369819641113, + "learning_rate": 5e-05, + "logits/chosen": -22682042.0, + "logits/rejected": -20012056.0, + "logps/chosen": -323.8800048828125, + "logps/rejected": -447.966064453125, + "loss": 0.0123, + "rewards/chosen": 8.944287300109863, + "rewards/margins": 24.33084487915039, + "rewards/rejected": -15.386557579040527, + "step": 255 + }, + { + "epoch": 0.09355870260392873, + "grad_norm": 2.607996702194214, + "kl": 9.141901016235352, + "learning_rate": 4.967943151729945e-05, + "logits/chosen": -7518862.315789473, + "logits/rejected": -6660448.0, + "logps/chosen": -310.19413034539474, + "logps/rejected": -338.0193434495192, + "loss": 0.0155, + "rewards/chosen": 10.799164621453537, + "rewards/margins": 24.82408457149861, + "rewards/rejected": -14.024919950045073, + "step": 256 + }, + { + "epoch": 0.09392416628597533, + "grad_norm": 5.289769649505615, + "kl": 8.06167984008789, + "learning_rate": 4.935887621191364e-05, + "logits/chosen": -10651652.923076924, + "logits/rejected": -9403033.263157895, + "logps/chosen": -298.17056039663464, + "logps/rejected": -489.0219469572368, + "loss": 0.0135, + "rewards/chosen": 9.663170447716347, + "rewards/margins": 28.272147113012398, + "rewards/rejected": -18.60897666529605, + "step": 257 + }, + { + "epoch": 0.09428962996802193, + "grad_norm": 2.1013553142547607, + "kl": 6.316282272338867, + "learning_rate": 4.903834726061565e-05, + "logits/chosen": -5348608.470588235, + "logits/rejected": -17115524.266666666, + "logps/chosen": -372.2811925551471, + "logps/rejected": -529.6776041666667, + "loss": 0.0023, + "rewards/chosen": 12.543180577895221, + "rewards/margins": 31.27536034677543, + "rewards/rejected": -18.732179768880208, + "step": 258 + }, + { + "epoch": 0.09465509365006852, + "grad_norm": 0.09430605918169022, + "kl": 3.1657724380493164, + "learning_rate": 4.871785783909523e-05, + "logits/chosen": -9909307.333333334, + "logits/rejected": -14570040.0, + "logps/chosen": -376.5813395182292, + "logps/rejected": -526.92587890625, + "loss": 0.0001, + "rewards/chosen": 12.343156178792318, + "rewards/margins": 35.1986203511556, + "rewards/rejected": -22.85546417236328, + "step": 259 + }, + { + "epoch": 0.09502055733211512, + "grad_norm": 4.208137512207031, + "kl": 11.728958129882812, + "learning_rate": 4.839742112141724e-05, + "logits/chosen": -20940154.94736842, + "logits/rejected": -14148615.384615384, + "logps/chosen": -353.65185546875, + "logps/rejected": -458.05799278846155, + "loss": 0.0222, + "rewards/chosen": 9.921674226459704, + "rewards/margins": 29.544611942430258, + "rewards/rejected": -19.622937715970554, + "step": 260 + }, + { + "epoch": 0.09538602101416171, + "grad_norm": 7.073469638824463, + "kl": 8.252949714660645, + "learning_rate": 4.807705027948008e-05, + "logits/chosen": -5059035.6, + "logits/rejected": -12589068.0, + "logps/chosen": -392.0286376953125, + "logps/rejected": -612.832275390625, + "loss": 0.0198, + "rewards/chosen": 8.931254577636718, + "rewards/margins": 32.06540985107422, + "rewards/rejected": -23.1341552734375, + "step": 261 + }, + { + "epoch": 0.09575148469620831, + "grad_norm": 3.114185333251953, + "kl": 12.613537788391113, + "learning_rate": 4.775675848247427e-05, + "logits/chosen": -9859082.666666666, + "logits/rejected": -4106603.4285714286, + "logps/chosen": -340.0317654079861, + "logps/rejected": -626.2009626116071, + "loss": 0.0058, + "rewards/chosen": 10.996661716037327, + "rewards/margins": 30.18215627518911, + "rewards/rejected": -19.185494559151785, + "step": 262 + }, + { + "epoch": 0.09611694837825491, + "grad_norm": 1.4485032558441162, + "kl": 6.847216606140137, + "learning_rate": 4.743655889634105e-05, + "logits/chosen": -7044154.4, + "logits/rejected": -21760062.666666668, + "logps/chosen": -431.203076171875, + "logps/rejected": -547.7437337239584, + "loss": 0.0131, + "rewards/chosen": 11.205721282958985, + "rewards/margins": 30.8981320699056, + "rewards/rejected": -19.692410786946613, + "step": 263 + }, + { + "epoch": 0.0964824120603015, + "grad_norm": 2.2172164916992188, + "kl": 1.184948444366455, + "learning_rate": 4.711646468323129e-05, + "logits/chosen": 14399319.466666667, + "logits/rejected": -19165206.588235293, + "logps/chosen": -329.4247721354167, + "logps/rejected": -331.35920266544116, + "loss": 0.0033, + "rewards/chosen": 8.902529907226562, + "rewards/margins": 24.48414109173943, + "rewards/rejected": -15.581611184512868, + "step": 264 + }, + { + "epoch": 0.0968478757423481, + "grad_norm": 4.11033821105957, + "kl": 4.504157543182373, + "learning_rate": 4.679648900096436e-05, + "logits/chosen": -1394993.4736842106, + "logits/rejected": -24301171.692307692, + "logps/chosen": -318.26187294407896, + "logps/rejected": -512.2923677884615, + "loss": 0.0114, + "rewards/chosen": 8.505064311780428, + "rewards/margins": 30.516656296455906, + "rewards/rejected": -22.01159198467548, + "step": 265 + }, + { + "epoch": 0.0972133394243947, + "grad_norm": 2.838437557220459, + "kl": 8.365005493164062, + "learning_rate": 4.64766450024873e-05, + "logits/chosen": -6191776.0, + "logits/rejected": -17334344.533333335, + "logps/chosen": -392.1393037683824, + "logps/rejected": -423.8822916666667, + "loss": 0.0107, + "rewards/chosen": 10.756351246553308, + "rewards/margins": 26.411098764456952, + "rewards/rejected": -15.654747517903646, + "step": 266 + }, + { + "epoch": 0.09757880310644129, + "grad_norm": 7.6429877281188965, + "kl": 5.511181831359863, + "learning_rate": 4.6156945835334184e-05, + "logits/chosen": -23557641.846153848, + "logits/rejected": -12098004.210526315, + "logps/chosen": -385.46713491586536, + "logps/rejected": -375.2523900082237, + "loss": 0.013, + "rewards/chosen": 9.891978337214542, + "rewards/margins": 27.385773631725232, + "rewards/rejected": -17.49379529451069, + "step": 267 + }, + { + "epoch": 0.09794426678848789, + "grad_norm": 3.647859573364258, + "kl": 3.956812858581543, + "learning_rate": 4.583740464108554e-05, + "logits/chosen": 3308771.4285714286, + "logits/rejected": -12172128.0, + "logps/chosen": -422.1361607142857, + "logps/rejected": -441.7204318576389, + "loss": 0.0034, + "rewards/chosen": 10.608920506068639, + "rewards/margins": 29.980744982522634, + "rewards/rejected": -19.371824476453995, + "step": 268 + }, + { + "epoch": 0.09830973047053448, + "grad_norm": 6.895238876342773, + "kl": 6.380517482757568, + "learning_rate": 4.551803455482833e-05, + "logits/chosen": -1443125.7647058824, + "logits/rejected": -9765491.2, + "logps/chosen": -313.1545840992647, + "logps/rejected": -447.54417317708334, + "loss": 0.0304, + "rewards/chosen": 9.051400577320772, + "rewards/margins": 26.13627403109681, + "rewards/rejected": -17.08487345377604, + "step": 269 + }, + { + "epoch": 0.09867519415258108, + "grad_norm": 3.322082996368408, + "kl": 6.782515525817871, + "learning_rate": 4.5198848704615914e-05, + "logits/chosen": -4913181.0, + "logits/rejected": -11552806.0, + "logps/chosen": -379.10968017578125, + "logps/rejected": -502.29498291015625, + "loss": 0.0107, + "rewards/chosen": 10.786565780639648, + "rewards/margins": 29.220218658447266, + "rewards/rejected": -18.433652877807617, + "step": 270 + }, + { + "epoch": 0.09904065783462769, + "grad_norm": 2.155059814453125, + "kl": 7.093183994293213, + "learning_rate": 4.487986021092844e-05, + "logits/chosen": -2060767.619047619, + "logits/rejected": 49342050.90909091, + "logps/chosen": -298.44496372767856, + "logps/rejected": -489.07204367897725, + "loss": 0.0108, + "rewards/chosen": 10.626307896205358, + "rewards/margins": 30.07670850877638, + "rewards/rejected": -19.450400612571023, + "step": 271 + }, + { + "epoch": 0.09940612151667429, + "grad_norm": 2.6440062522888184, + "kl": 5.998599052429199, + "learning_rate": 4.4561082186133464e-05, + "logits/chosen": -18307681.777777776, + "logits/rejected": -12183321.142857144, + "logps/chosen": -367.89252387152777, + "logps/rejected": -474.12186104910717, + "loss": 0.0058, + "rewards/chosen": 12.178076002332899, + "rewards/margins": 28.29139975895957, + "rewards/rejected": -16.113323756626674, + "step": 272 + }, + { + "epoch": 0.09977158519872088, + "grad_norm": 3.304879903793335, + "kl": 3.679983139038086, + "learning_rate": 4.424252773394704e-05, + "logits/chosen": 661631.1428571428, + "logits/rejected": -10354320.0, + "logps/chosen": -314.482666015625, + "logps/rejected": -316.97254774305554, + "loss": 0.0081, + "rewards/chosen": 7.287243979317801, + "rewards/margins": 21.963611844986204, + "rewards/rejected": -14.676367865668404, + "step": 273 + }, + { + "epoch": 0.10013704888076748, + "grad_norm": 4.773331642150879, + "kl": 2.6945409774780273, + "learning_rate": 4.392420994889498e-05, + "logits/chosen": -13510961.066666666, + "logits/rejected": -12601705.411764706, + "logps/chosen": -298.2524739583333, + "logps/rejected": -451.3319450827206, + "loss": 0.0145, + "rewards/chosen": 8.849335734049479, + "rewards/margins": 30.453648406384037, + "rewards/rejected": -21.604312672334558, + "step": 274 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 3.402865409851074, + "kl": 7.450896739959717, + "learning_rate": 4.3606141915774693e-05, + "logits/chosen": -13262986.666666666, + "logits/rejected": -5343128.0, + "logps/chosen": -340.9634060329861, + "logps/rejected": -398.7054966517857, + "loss": 0.0085, + "rewards/chosen": 8.976877848307291, + "rewards/margins": 24.429266066778276, + "rewards/rejected": -15.452388218470983, + "step": 275 + }, + { + "epoch": 0.10086797624486067, + "grad_norm": 2.768672466278076, + "kl": 6.870980262756348, + "learning_rate": 4.328833670911724e-05, + "logits/chosen": 3538110.0, + "logits/rejected": -3276270.25, + "logps/chosen": -329.57305908203125, + "logps/rejected": -445.475830078125, + "loss": 0.006, + "rewards/chosen": 9.308938026428223, + "rewards/margins": 28.32182216644287, + "rewards/rejected": -19.01288414001465, + "step": 276 + }, + { + "epoch": 0.10123343992690727, + "grad_norm": 9.865166664123535, + "kl": 2.9107885360717773, + "learning_rate": 4.297080739264987e-05, + "logits/chosen": -6476342.857142857, + "logits/rejected": 3466262.222222222, + "logps/chosen": -377.0101841517857, + "logps/rejected": -472.68451605902777, + "loss": 0.0313, + "rewards/chosen": 10.301934378487724, + "rewards/margins": 26.057271563817586, + "rewards/rejected": -15.75533718532986, + "step": 277 + }, + { + "epoch": 0.10159890360895386, + "grad_norm": 2.6870737075805664, + "kl": 10.381534576416016, + "learning_rate": 4.265356701875911e-05, + "logits/chosen": 6271109.647058823, + "logits/rejected": -20574022.4, + "logps/chosen": -410.6941348805147, + "logps/rejected": -466.94651692708334, + "loss": 0.0096, + "rewards/chosen": 11.437478458180147, + "rewards/margins": 28.62426841586244, + "rewards/rejected": -17.18678995768229, + "step": 278 + }, + { + "epoch": 0.10196436729100046, + "grad_norm": 1.2762504816055298, + "kl": 0.0, + "learning_rate": 4.23366286279542e-05, + "logits/chosen": -10783479.272727273, + "logits/rejected": -5444778.666666667, + "logps/chosen": -327.41348544034093, + "logps/rejected": -387.4697265625, + "loss": 0.002, + "rewards/chosen": 8.192073475230824, + "rewards/margins": 24.024729608973384, + "rewards/rejected": -15.83265613374256, + "step": 279 + }, + { + "epoch": 0.10232983097304706, + "grad_norm": 2.7289135456085205, + "kl": 4.150970458984375, + "learning_rate": 4.2020005248331054e-05, + "logits/chosen": -8587636.0, + "logits/rejected": -18381478.0, + "logps/chosen": -346.87786865234375, + "logps/rejected": -479.1797790527344, + "loss": 0.0112, + "rewards/chosen": 8.215404510498047, + "rewards/margins": 26.087305068969727, + "rewards/rejected": -17.87190055847168, + "step": 280 + }, + { + "epoch": 0.10269529465509365, + "grad_norm": 3.547636032104492, + "kl": 8.719335556030273, + "learning_rate": 4.1703709895036625e-05, + "logits/chosen": -7574547.764705882, + "logits/rejected": -7473453.866666666, + "logps/chosen": -276.28294462316177, + "logps/rejected": -446.47649739583335, + "loss": 0.0301, + "rewards/chosen": 6.636681949391084, + "rewards/margins": 26.05585512647442, + "rewards/rejected": -19.419173177083334, + "step": 281 + }, + { + "epoch": 0.10306075833714025, + "grad_norm": 6.112145900726318, + "kl": 11.378875732421875, + "learning_rate": 4.138775556973406e-05, + "logits/chosen": -9608356.210526315, + "logits/rejected": 5188267.692307692, + "logps/chosen": -386.0028525904605, + "logps/rejected": -514.4464017427885, + "loss": 0.0244, + "rewards/chosen": 10.853593525133634, + "rewards/margins": 29.438655405392048, + "rewards/rejected": -18.585061880258415, + "step": 282 + }, + { + "epoch": 0.10342622201918684, + "grad_norm": 5.4980549812316895, + "kl": 10.184642791748047, + "learning_rate": 4.107215526006817e-05, + "logits/chosen": -8773523.2, + "logits/rejected": -16852018.82352941, + "logps/chosen": -440.4756184895833, + "logps/rejected": -380.9627470128676, + "loss": 0.0107, + "rewards/chosen": 12.489792887369791, + "rewards/margins": 27.21709498985141, + "rewards/rejected": -14.727302102481618, + "step": 283 + }, + { + "epoch": 0.10379168570123344, + "grad_norm": 3.757065773010254, + "kl": 7.056987762451172, + "learning_rate": 4.0756921939131565e-05, + "logits/chosen": 1226952.0, + "logits/rejected": -25138549.89473684, + "logps/chosen": -337.98230919471155, + "logps/rejected": -423.9380139802632, + "loss": 0.007, + "rewards/chosen": 8.946684030386118, + "rewards/margins": 24.827148283058815, + "rewards/rejected": -15.880464252672697, + "step": 284 + }, + { + "epoch": 0.10415714938328004, + "grad_norm": 2.959707260131836, + "kl": 5.374462127685547, + "learning_rate": 4.04420685649314e-05, + "logits/chosen": -12481856.0, + "logits/rejected": -6368019.2, + "logps/chosen": -292.14547909007354, + "logps/rejected": -415.2563151041667, + "loss": 0.0085, + "rewards/chosen": 9.923549876493567, + "rewards/margins": 27.57630459654565, + "rewards/rejected": -17.652754720052084, + "step": 285 + }, + { + "epoch": 0.10452261306532663, + "grad_norm": 1.8123482465744019, + "kl": 5.5137481689453125, + "learning_rate": 4.012760807985665e-05, + "logits/chosen": -6268517.176470588, + "logits/rejected": -11247858.133333333, + "logps/chosen": -355.95984604779414, + "logps/rejected": -429.3843098958333, + "loss": 0.0063, + "rewards/chosen": 11.71917096306296, + "rewards/margins": 31.577308983896295, + "rewards/rejected": -19.858138020833334, + "step": 286 + }, + { + "epoch": 0.10488807674737323, + "grad_norm": 14.286782264709473, + "kl": 2.210813522338867, + "learning_rate": 3.981355341014623e-05, + "logits/chosen": -19714211.76470588, + "logits/rejected": -14636373.333333334, + "logps/chosen": -403.02039292279414, + "logps/rejected": -384.91917317708334, + "loss": 0.0164, + "rewards/chosen": 9.70405668370864, + "rewards/margins": 25.581863283643536, + "rewards/rejected": -15.877806599934896, + "step": 287 + }, + { + "epoch": 0.10525354042941983, + "grad_norm": 2.224968910217285, + "kl": 8.97657585144043, + "learning_rate": 3.9499917465357534e-05, + "logits/chosen": -2586210.6666666665, + "logits/rejected": -12543665.6, + "logps/chosen": -418.0876871744792, + "logps/rejected": -388.8666259765625, + "loss": 0.0018, + "rewards/chosen": 13.430992126464844, + "rewards/margins": 30.3883056640625, + "rewards/rejected": -16.957313537597656, + "step": 288 + }, + { + "epoch": 0.10561900411146642, + "grad_norm": 5.878128528594971, + "kl": 4.0376458168029785, + "learning_rate": 3.9186713137835826e-05, + "logits/chosen": -1684398.4, + "logits/rejected": -6656306.352941177, + "logps/chosen": -410.1539713541667, + "logps/rejected": -459.13332950367646, + "loss": 0.0044, + "rewards/chosen": 8.71028544108073, + "rewards/margins": 27.36923175886566, + "rewards/rejected": -18.658946317784928, + "step": 289 + }, + { + "epoch": 0.10598446779351302, + "grad_norm": 3.007636785507202, + "kl": 2.5312657356262207, + "learning_rate": 3.887395330218429e-05, + "logits/chosen": -9244153.142857144, + "logits/rejected": -14424083.555555556, + "logps/chosen": -297.93558175223217, + "logps/rejected": -384.0437825520833, + "loss": 0.0066, + "rewards/chosen": 8.03838838849749, + "rewards/margins": 28.267722901843847, + "rewards/rejected": -20.229334513346355, + "step": 290 + }, + { + "epoch": 0.10634993147555961, + "grad_norm": 1.2328060865402222, + "kl": 8.91618537902832, + "learning_rate": 3.856165081473474e-05, + "logits/chosen": -3267848.2352941176, + "logits/rejected": -17831878.4, + "logps/chosen": -373.4680606617647, + "logps/rejected": -410.566796875, + "loss": 0.0186, + "rewards/chosen": 11.178346521714154, + "rewards/margins": 30.86922176585478, + "rewards/rejected": -19.690875244140624, + "step": 291 + }, + { + "epoch": 0.10671539515760621, + "grad_norm": 5.381865978240967, + "kl": 10.326642036437988, + "learning_rate": 3.8249818513019244e-05, + "logits/chosen": 4028676.5714285714, + "logits/rejected": -18713525.818181816, + "logps/chosen": -346.72523716517856, + "logps/rejected": -408.2999378551136, + "loss": 0.0183, + "rewards/chosen": 9.036735171363468, + "rewards/margins": 28.552046672606366, + "rewards/rejected": -19.5153115012429, + "step": 292 + }, + { + "epoch": 0.1070808588396528, + "grad_norm": 9.317866325378418, + "kl": 11.868725776672363, + "learning_rate": 3.793846921524237e-05, + "logits/chosen": -11081548.8, + "logits/rejected": -18464361.411764707, + "logps/chosen": -386.3818359375, + "logps/rejected": -457.32981962316177, + "loss": 0.0244, + "rewards/chosen": 10.424440511067708, + "rewards/margins": 26.13324716605392, + "rewards/rejected": -15.708806654986214, + "step": 293 + }, + { + "epoch": 0.1074463225216994, + "grad_norm": 1.1940836906433105, + "kl": 5.851612091064453, + "learning_rate": 3.762761571975429e-05, + "logits/chosen": -5387867.733333333, + "logits/rejected": -13382294.588235294, + "logps/chosen": -421.08258463541665, + "logps/rejected": -510.77809053308823, + "loss": 0.0078, + "rewards/chosen": 9.88799336751302, + "rewards/margins": 31.63544963761872, + "rewards/rejected": -21.747456270105697, + "step": 294 + }, + { + "epoch": 0.107811786203746, + "grad_norm": 3.175863742828369, + "kl": 7.240278244018555, + "learning_rate": 3.731727080452464e-05, + "logits/chosen": -5658752.421052632, + "logits/rejected": -1066111.2307692308, + "logps/chosen": -319.1949527138158, + "logps/rejected": -577.8239182692307, + "loss": 0.0138, + "rewards/chosen": 10.492460552014803, + "rewards/margins": 33.247498299911435, + "rewards/rejected": -22.755037747896633, + "step": 295 + }, + { + "epoch": 0.1081772498857926, + "grad_norm": 11.04151439666748, + "kl": 4.2807087898254395, + "learning_rate": 3.7007447226617366e-05, + "logits/chosen": -9482499.2, + "logits/rejected": -10914127.05882353, + "logps/chosen": -323.6539713541667, + "logps/rejected": -401.4125114889706, + "loss": 0.0204, + "rewards/chosen": 11.639662679036459, + "rewards/margins": 28.58819927140778, + "rewards/rejected": -16.948536592371322, + "step": 296 + }, + { + "epoch": 0.10854271356783919, + "grad_norm": 3.2049076557159424, + "kl": 1.5320796966552734, + "learning_rate": 3.6698157721666246e-05, + "logits/chosen": -8546204.8, + "logits/rejected": -15680977.88235294, + "logps/chosen": -335.16022135416665, + "logps/rejected": -450.6619083180147, + "loss": 0.0079, + "rewards/chosen": 7.482936604817708, + "rewards/margins": 23.285274251302084, + "rewards/rejected": -15.802337646484375, + "step": 297 + }, + { + "epoch": 0.10890817724988579, + "grad_norm": 1.7954075336456299, + "kl": 4.933806896209717, + "learning_rate": 3.638941500335145e-05, + "logits/chosen": -275943.3333333333, + "logits/rejected": -5345660.8, + "logps/chosen": -320.2151692708333, + "logps/rejected": -331.3191162109375, + "loss": 0.0059, + "rewards/chosen": 10.597073872884115, + "rewards/margins": 24.448579915364583, + "rewards/rejected": -13.85150604248047, + "step": 298 + }, + { + "epoch": 0.10927364093193238, + "grad_norm": 3.069288969039917, + "kl": 8.785832405090332, + "learning_rate": 3.608123176287685e-05, + "logits/chosen": 2352244.761904762, + "logits/rejected": -6656293.818181818, + "logps/chosen": -368.31815011160717, + "logps/rejected": -500.60746626420456, + "loss": 0.0127, + "rewards/chosen": 11.231942313058036, + "rewards/margins": 29.754265921456472, + "rewards/rejected": -18.522323608398438, + "step": 299 + }, + { + "epoch": 0.10963910461397898, + "grad_norm": 1.6475156545639038, + "kl": 5.473912239074707, + "learning_rate": 3.5773620668448384e-05, + "logits/chosen": -3627146.1052631577, + "logits/rejected": -12400518.153846154, + "logps/chosen": -342.6438630756579, + "logps/rejected": -411.27501502403845, + "loss": 0.0016, + "rewards/chosen": 10.417770385742188, + "rewards/margins": 28.146146334134617, + "rewards/rejected": -17.72837594839243, + "step": 300 + }, + { + "epoch": 0.11000456829602558, + "grad_norm": 4.515073776245117, + "kl": 6.356047630310059, + "learning_rate": 3.5466594364753326e-05, + "logits/chosen": -232744.5, + "logits/rejected": -14523295.0, + "logps/chosen": -306.03070068359375, + "logps/rejected": -497.333740234375, + "loss": 0.0083, + "rewards/chosen": 9.506460189819336, + "rewards/margins": 29.53728675842285, + "rewards/rejected": -20.030826568603516, + "step": 301 + }, + { + "epoch": 0.11037003197807217, + "grad_norm": 5.197349548339844, + "kl": 5.687413215637207, + "learning_rate": 3.5160165472440473e-05, + "logits/chosen": -6388844.888888889, + "logits/rejected": -15046987.42857143, + "logps/chosen": -271.69072808159723, + "logps/rejected": -429.1532505580357, + "loss": 0.0108, + "rewards/chosen": 9.108523898654514, + "rewards/margins": 26.03005121624659, + "rewards/rejected": -16.921527317592076, + "step": 302 + }, + { + "epoch": 0.11073549566011878, + "grad_norm": 5.994495868682861, + "kl": 0.2269287109375, + "learning_rate": 3.48543465876014e-05, + "logits/chosen": -7723626.666666667, + "logits/rejected": -22214044.8, + "logps/chosen": -278.26670328776044, + "logps/rejected": -457.678466796875, + "loss": 0.007, + "rewards/chosen": 10.46633529663086, + "rewards/margins": 28.194657135009766, + "rewards/rejected": -17.728321838378907, + "step": 303 + }, + { + "epoch": 0.11110095934216538, + "grad_norm": 3.716923713684082, + "kl": 6.1449079513549805, + "learning_rate": 3.4549150281252636e-05, + "logits/chosen": 2134213.75, + "logits/rejected": 34927228.0, + "logps/chosen": -312.145751953125, + "logps/rejected": -406.1129455566406, + "loss": 0.0135, + "rewards/chosen": 9.315784454345703, + "rewards/margins": 25.422822952270508, + "rewards/rejected": -16.107038497924805, + "step": 304 + }, + { + "epoch": 0.11146642302421197, + "grad_norm": 2.951690435409546, + "kl": 7.921679496765137, + "learning_rate": 3.424458909881897e-05, + "logits/chosen": 1339181.5555555555, + "logits/rejected": -8027849.714285715, + "logps/chosen": -332.3498806423611, + "logps/rejected": -432.915283203125, + "loss": 0.0101, + "rewards/chosen": 10.13966793484158, + "rewards/margins": 26.78086949908544, + "rewards/rejected": -16.64120156424386, + "step": 305 + }, + { + "epoch": 0.11183188670625857, + "grad_norm": 3.5274484157562256, + "kl": 6.175032615661621, + "learning_rate": 3.3940675559617724e-05, + "logits/chosen": -3370713.5384615385, + "logits/rejected": 95334.73684210527, + "logps/chosen": -282.7773625300481, + "logps/rejected": -402.06039268092104, + "loss": 0.0113, + "rewards/chosen": 8.491601210374098, + "rewards/margins": 23.726573712430017, + "rewards/rejected": -15.234972502055921, + "step": 306 + }, + { + "epoch": 0.11219735038830517, + "grad_norm": 4.101931095123291, + "kl": 9.005170822143555, + "learning_rate": 3.363742215634415e-05, + "logits/chosen": -11174512.0, + "logits/rejected": -25582148.266666666, + "logps/chosen": -409.6511661305147, + "logps/rejected": -438.85963541666666, + "loss": 0.006, + "rewards/chosen": 14.327471564797793, + "rewards/margins": 27.182800951191023, + "rewards/rejected": -12.85532938639323, + "step": 307 + }, + { + "epoch": 0.11256281407035176, + "grad_norm": 0.957822322845459, + "kl": 7.117717742919922, + "learning_rate": 3.333484135455792e-05, + "logits/chosen": -7990705.5, + "logits/rejected": 7891368.0, + "logps/chosen": -321.1587829589844, + "logps/rejected": -526.2684326171875, + "loss": 0.0011, + "rewards/chosen": 13.111435890197754, + "rewards/margins": 33.808695793151855, + "rewards/rejected": -20.6972599029541, + "step": 308 + }, + { + "epoch": 0.11292827775239836, + "grad_norm": 2.5059094429016113, + "kl": 1.6445379257202148, + "learning_rate": 3.303294559217063e-05, + "logits/chosen": -6566923.692307692, + "logits/rejected": -13658460.631578946, + "logps/chosen": -391.01870492788464, + "logps/rejected": -492.71607730263156, + "loss": 0.0028, + "rewards/chosen": 12.13046382023738, + "rewards/margins": 32.59376241707126, + "rewards/rejected": -20.46329859683388, + "step": 309 + }, + { + "epoch": 0.11329374143444496, + "grad_norm": 2.4314873218536377, + "kl": 1.0591979026794434, + "learning_rate": 3.273174727893463e-05, + "logits/chosen": 6695399.05882353, + "logits/rejected": -18948736.0, + "logps/chosen": -390.76858340992646, + "logps/rejected": -621.3104166666667, + "loss": 0.0031, + "rewards/chosen": 11.802028880399817, + "rewards/margins": 32.34186652688419, + "rewards/rejected": -20.539837646484376, + "step": 310 + }, + { + "epoch": 0.11365920511649155, + "grad_norm": 1.6919821500778198, + "kl": 3.981459617614746, + "learning_rate": 3.243125879593286e-05, + "logits/chosen": -10729503.2, + "logits/rejected": -15973034.181818182, + "logps/chosen": -335.160595703125, + "logps/rejected": -557.6052468039773, + "loss": 0.0041, + "rewards/chosen": 13.230134582519531, + "rewards/margins": 32.11197052001953, + "rewards/rejected": -18.8818359375, + "step": 311 + }, + { + "epoch": 0.11402466879853815, + "grad_norm": 2.942671537399292, + "kl": 13.388530731201172, + "learning_rate": 3.213149249506997e-05, + "logits/chosen": 3652304.75, + "logits/rejected": -1331766.625, + "logps/chosen": -350.85797119140625, + "logps/rejected": -460.2822570800781, + "loss": 0.0127, + "rewards/chosen": 11.398399353027344, + "rewards/margins": 26.07939338684082, + "rewards/rejected": -14.680994033813477, + "step": 312 + }, + { + "epoch": 0.11439013248058474, + "grad_norm": 0.4625697433948517, + "kl": 3.5627517700195312, + "learning_rate": 3.183246069856443e-05, + "logits/chosen": -6434336.533333333, + "logits/rejected": -2443675.7647058824, + "logps/chosen": -369.82939453125, + "logps/rejected": -459.69933363970586, + "loss": 0.0066, + "rewards/chosen": 11.4068359375, + "rewards/margins": 30.656993910845586, + "rewards/rejected": -19.250157973345587, + "step": 313 + }, + { + "epoch": 0.11475559616263134, + "grad_norm": 4.153180122375488, + "kl": 10.03883171081543, + "learning_rate": 3.153417569844219e-05, + "logits/chosen": -5034583.2, + "logits/rejected": 43748093.333333336, + "logps/chosen": -317.88896484375, + "logps/rejected": -573.0692138671875, + "loss": 0.0047, + "rewards/chosen": 10.730406188964844, + "rewards/margins": 29.240240478515624, + "rewards/rejected": -18.50983428955078, + "step": 314 + }, + { + "epoch": 0.11512105984467794, + "grad_norm": 11.761632919311523, + "kl": 5.991314888000488, + "learning_rate": 3.12366497560313e-05, + "logits/chosen": -2664633.8666666667, + "logits/rejected": -22424687.05882353, + "logps/chosen": -321.85817057291666, + "logps/rejected": -481.2059972426471, + "loss": 0.0327, + "rewards/chosen": 9.054744466145833, + "rewards/margins": 24.82827351888021, + "rewards/rejected": -15.773529052734375, + "step": 315 + }, + { + "epoch": 0.11548652352672453, + "grad_norm": 3.8463563919067383, + "kl": 10.83899974822998, + "learning_rate": 3.0939895101457916e-05, + "logits/chosen": -8834896.0, + "logits/rejected": -16614903.466666667, + "logps/chosen": -371.59880514705884, + "logps/rejected": -452.0177734375, + "loss": 0.019, + "rewards/chosen": 10.081998039694394, + "rewards/margins": 27.675273999980853, + "rewards/rejected": -17.59327596028646, + "step": 316 + }, + { + "epoch": 0.11585198720877113, + "grad_norm": 1.988590955734253, + "kl": 5.700039863586426, + "learning_rate": 3.06439239331436e-05, + "logits/chosen": -16386828.444444444, + "logits/rejected": -13155629.714285715, + "logps/chosen": -306.9330240885417, + "logps/rejected": -721.0735212053571, + "loss": 0.0107, + "rewards/chosen": 9.597460428873697, + "rewards/margins": 41.08022889636812, + "rewards/rejected": -31.48276846749442, + "step": 317 + }, + { + "epoch": 0.11621745089081773, + "grad_norm": 3.5786592960357666, + "kl": 15.998603820800781, + "learning_rate": 3.0348748417303823e-05, + "logits/chosen": -10034870.4, + "logits/rejected": -1309769.6666666667, + "logps/chosen": -355.5806884765625, + "logps/rejected": -495.1316731770833, + "loss": 0.0158, + "rewards/chosen": 12.606735229492188, + "rewards/margins": 31.470162709554035, + "rewards/rejected": -18.863427480061848, + "step": 318 + }, + { + "epoch": 0.11658291457286432, + "grad_norm": 2.6176459789276123, + "kl": 1.3107280731201172, + "learning_rate": 3.005438068744792e-05, + "logits/chosen": -12702872.0, + "logits/rejected": -14013162.666666666, + "logps/chosen": -351.5894252232143, + "logps/rejected": -368.68565538194446, + "loss": 0.0028, + "rewards/chosen": 9.445798601422991, + "rewards/margins": 27.873454987056675, + "rewards/rejected": -18.427656385633682, + "step": 319 + }, + { + "epoch": 0.11694837825491092, + "grad_norm": 3.6900506019592285, + "kl": 6.452792167663574, + "learning_rate": 2.976083284388031e-05, + "logits/chosen": -4234878.222222222, + "logits/rejected": -15321913.142857144, + "logps/chosen": -291.6098904079861, + "logps/rejected": -463.79659598214283, + "loss": 0.0191, + "rewards/chosen": 9.34364234076606, + "rewards/margins": 26.5793939015222, + "rewards/rejected": -17.23575156075614, + "step": 320 + }, + { + "epoch": 0.11731384193695751, + "grad_norm": 2.9935967922210693, + "kl": 7.0046281814575195, + "learning_rate": 2.9468116953203107e-05, + "logits/chosen": -1908481.125, + "logits/rejected": -24445900.0, + "logps/chosen": -318.03106689453125, + "logps/rejected": -491.6391296386719, + "loss": 0.0177, + "rewards/chosen": 9.31802749633789, + "rewards/margins": 30.474943161010742, + "rewards/rejected": -21.15691566467285, + "step": 321 + }, + { + "epoch": 0.11767930561900411, + "grad_norm": 3.706940174102783, + "kl": 7.503454208374023, + "learning_rate": 2.917624504782006e-05, + "logits/chosen": -18027024.94117647, + "logits/rejected": -24520162.133333333, + "logps/chosen": -376.8503848805147, + "logps/rejected": -488.60667317708334, + "loss": 0.0082, + "rewards/chosen": 12.155369478113512, + "rewards/margins": 31.510211600509344, + "rewards/rejected": -19.354842122395834, + "step": 322 + }, + { + "epoch": 0.1180447693010507, + "grad_norm": 1.6077102422714233, + "kl": 4.746549606323242, + "learning_rate": 2.888522912544202e-05, + "logits/chosen": -14891744.0, + "logits/rejected": -12216769.142857144, + "logps/chosen": -417.79478624131946, + "logps/rejected": -355.4105747767857, + "loss": 0.0111, + "rewards/chosen": 11.851554022894966, + "rewards/margins": 26.57977319142175, + "rewards/rejected": -14.728219168526786, + "step": 323 + }, + { + "epoch": 0.1184102329830973, + "grad_norm": 8.447465896606445, + "kl": 6.291051387786865, + "learning_rate": 2.8595081148593738e-05, + "logits/chosen": 5006377.411764706, + "logits/rejected": -10374186.666666666, + "logps/chosen": -305.07861328125, + "logps/rejected": -404.061328125, + "loss": 0.0236, + "rewards/chosen": 7.846922032973346, + "rewards/margins": 24.52906111174939, + "rewards/rejected": -16.682139078776043, + "step": 324 + }, + { + "epoch": 0.1187756966651439, + "grad_norm": 5.181960105895996, + "kl": 11.929756164550781, + "learning_rate": 2.8305813044122097e-05, + "logits/chosen": -11656651.0, + "logits/rejected": -6692423.5, + "logps/chosen": -269.94024658203125, + "logps/rejected": -386.2901611328125, + "loss": 0.0189, + "rewards/chosen": 9.077056884765625, + "rewards/margins": 23.699864387512207, + "rewards/rejected": -14.622807502746582, + "step": 325 + }, + { + "epoch": 0.1191411603471905, + "grad_norm": 2.3653838634490967, + "kl": 7.784326553344727, + "learning_rate": 2.80174367027059e-05, + "logits/chosen": -6502425.6, + "logits/rejected": -29446048.0, + "logps/chosen": -317.7934895833333, + "logps/rejected": -406.99195772058823, + "loss": 0.0079, + "rewards/chosen": 10.595701090494792, + "rewards/margins": 26.60975533279718, + "rewards/rejected": -16.01405424230239, + "step": 326 + }, + { + "epoch": 0.11950662402923709, + "grad_norm": 3.2690300941467285, + "kl": 8.301543235778809, + "learning_rate": 2.772996397836704e-05, + "logits/chosen": -12328018.0, + "logits/rejected": -14365276.0, + "logps/chosen": -311.1188049316406, + "logps/rejected": -445.22198486328125, + "loss": 0.0085, + "rewards/chosen": 10.632262229919434, + "rewards/margins": 29.866251945495605, + "rewards/rejected": -19.233989715576172, + "step": 327 + }, + { + "epoch": 0.11987208771128369, + "grad_norm": 0.9864028096199036, + "kl": 9.327417373657227, + "learning_rate": 2.7443406687983265e-05, + "logits/chosen": -17495437.714285713, + "logits/rejected": -21121440.0, + "logps/chosen": -318.60693359375, + "logps/rejected": -542.9109157986111, + "loss": 0.0187, + "rewards/chosen": 11.613272530691964, + "rewards/margins": 37.989162626720606, + "rewards/rejected": -26.375890096028645, + "step": 328 + }, + { + "epoch": 0.12023755139333028, + "grad_norm": 18.571590423583984, + "kl": 6.200735569000244, + "learning_rate": 2.7157776610802415e-05, + "logits/chosen": -1432442.9411764706, + "logits/rejected": -17544794.666666668, + "logps/chosen": -245.16762408088235, + "logps/rejected": -339.45654296875, + "loss": 0.0265, + "rewards/chosen": 7.856280158547794, + "rewards/margins": 20.563368374693628, + "rewards/rejected": -12.707088216145833, + "step": 329 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 1.4537888765335083, + "kl": 7.342352867126465, + "learning_rate": 2.687308548795825e-05, + "logits/chosen": -4700020.266666667, + "logits/rejected": -17269767.529411763, + "logps/chosen": -380.98186848958335, + "logps/rejected": -452.93201401654414, + "loss": 0.0066, + "rewards/chosen": 11.801508585611979, + "rewards/margins": 27.732429354798562, + "rewards/rejected": -15.930920769186582, + "step": 330 + }, + { + "epoch": 0.12096847875742348, + "grad_norm": 2.206916570663452, + "kl": 10.231358528137207, + "learning_rate": 2.658934502198772e-05, + "logits/chosen": -6524460.705882353, + "logits/rejected": -20435643.733333334, + "logps/chosen": -406.5317957261029, + "logps/rejected": -594.1096354166667, + "loss": 0.008, + "rewards/chosen": 11.267691219554228, + "rewards/margins": 38.60417600145527, + "rewards/rejected": -27.336484781901042, + "step": 331 + }, + { + "epoch": 0.12133394243947007, + "grad_norm": 3.599332332611084, + "kl": 4.368533134460449, + "learning_rate": 2.630656687635007e-05, + "logits/chosen": 3040821.5384615385, + "logits/rejected": -12727715.368421054, + "logps/chosen": -290.15902944711536, + "logps/rejected": -488.7636204769737, + "loss": 0.0058, + "rewards/chosen": 10.113963200495792, + "rewards/margins": 27.037680498501548, + "rewards/rejected": -16.923717298005755, + "step": 332 + }, + { + "epoch": 0.12169940612151667, + "grad_norm": 9.237248420715332, + "kl": 11.325383186340332, + "learning_rate": 2.6024762674947313e-05, + "logits/chosen": -10529606.222222222, + "logits/rejected": 32251108.57142857, + "logps/chosen": -392.4696451822917, + "logps/rejected": -499.440673828125, + "loss": 0.0284, + "rewards/chosen": 12.001873440212673, + "rewards/margins": 32.5229000515408, + "rewards/rejected": -20.521026611328125, + "step": 333 + }, + { + "epoch": 0.12206486980356326, + "grad_norm": 2.5345916748046875, + "kl": 8.640357971191406, + "learning_rate": 2.574394400164639e-05, + "logits/chosen": -11942077.176470589, + "logits/rejected": -18623456.0, + "logps/chosen": -326.38172104779414, + "logps/rejected": -560.0117838541667, + "loss": 0.0161, + "rewards/chosen": 9.513438505284926, + "rewards/margins": 33.07289081648284, + "rewards/rejected": -23.559452311197916, + "step": 334 + }, + { + "epoch": 0.12243033348560987, + "grad_norm": 1.950182318687439, + "kl": 6.183089256286621, + "learning_rate": 2.5464122399803125e-05, + "logits/chosen": -4152528.8571428573, + "logits/rejected": -17509498.666666668, + "logps/chosen": -336.8224400111607, + "logps/rejected": -485.3448893229167, + "loss": 0.0108, + "rewards/chosen": 13.824420383998326, + "rewards/margins": 29.117979140508744, + "rewards/rejected": -15.293558756510416, + "step": 335 + }, + { + "epoch": 0.12279579716765647, + "grad_norm": 3.8505806922912598, + "kl": 10.530712127685547, + "learning_rate": 2.5185309371787513e-05, + "logits/chosen": -8527324.444444444, + "logits/rejected": 21654619.42857143, + "logps/chosen": -403.47157118055554, + "logps/rejected": -550.1789202008929, + "loss": 0.0097, + "rewards/chosen": 11.37109629313151, + "rewards/margins": 38.78374844505673, + "rewards/rejected": -27.412652151925222, + "step": 336 + }, + { + "epoch": 0.12316126084970307, + "grad_norm": 4.47204065322876, + "kl": 6.041743278503418, + "learning_rate": 2.4907516378511135e-05, + "logits/chosen": -15307534.0, + "logits/rejected": -14369986.0, + "logps/chosen": -407.14483642578125, + "logps/rejected": -436.57684326171875, + "loss": 0.0108, + "rewards/chosen": 10.369928359985352, + "rewards/margins": 27.528108596801758, + "rewards/rejected": -17.158180236816406, + "step": 337 + }, + { + "epoch": 0.12352672453174966, + "grad_norm": 1.9707300662994385, + "kl": 5.188363075256348, + "learning_rate": 2.46307548389559e-05, + "logits/chosen": -5569525.647058823, + "logits/rejected": -20178924.8, + "logps/chosen": -297.4635225183824, + "logps/rejected": -559.9674479166666, + "loss": 0.0103, + "rewards/chosen": 9.120539945714613, + "rewards/margins": 31.027634265376072, + "rewards/rejected": -21.907094319661457, + "step": 338 + }, + { + "epoch": 0.12389218821379626, + "grad_norm": 2.470325231552124, + "kl": 4.298587322235107, + "learning_rate": 2.43550361297047e-05, + "logits/chosen": -3208856.0, + "logits/rejected": -13768366.0, + "logps/chosen": -288.8868408203125, + "logps/rejected": -411.2807312011719, + "loss": 0.0077, + "rewards/chosen": 10.152725219726562, + "rewards/margins": 26.672639846801758, + "rewards/rejected": -16.519914627075195, + "step": 339 + }, + { + "epoch": 0.12425765189584285, + "grad_norm": 4.409656524658203, + "kl": 2.399423360824585, + "learning_rate": 2.4080371584473748e-05, + "logits/chosen": -21513971.2, + "logits/rejected": -13645769.411764706, + "logps/chosen": -331.55787760416666, + "logps/rejected": -548.1875, + "loss": 0.0137, + "rewards/chosen": 10.703284708658854, + "rewards/margins": 31.893497182808673, + "rewards/rejected": -21.190212474149817, + "step": 340 + }, + { + "epoch": 0.12462311557788945, + "grad_norm": 3.6307265758514404, + "kl": 3.272641658782959, + "learning_rate": 2.3806772493646723e-05, + "logits/chosen": -9000796.8, + "logits/rejected": -9128429.176470589, + "logps/chosen": -327.75074869791666, + "logps/rejected": -531.9253791360294, + "loss": 0.0074, + "rewards/chosen": 7.822472635904948, + "rewards/margins": 27.42436903411267, + "rewards/rejected": -19.60189639820772, + "step": 341 + }, + { + "epoch": 0.12498857925993605, + "grad_norm": 4.346183776855469, + "kl": 12.65239143371582, + "learning_rate": 2.353425010381063e-05, + "logits/chosen": -18589593.904761903, + "logits/rejected": -10817597.090909092, + "logps/chosen": -338.4491722470238, + "logps/rejected": -369.1695001775568, + "loss": 0.0201, + "rewards/chosen": 10.285920642671131, + "rewards/margins": 25.68952532351275, + "rewards/rejected": -15.40360468084162, + "step": 342 + }, + { + "epoch": 0.12535404294198263, + "grad_norm": 2.9132769107818604, + "kl": 11.641450881958008, + "learning_rate": 2.3262815617293517e-05, + "logits/chosen": -7066994.222222222, + "logits/rejected": -5370290.857142857, + "logps/chosen": -386.5752224392361, + "logps/rejected": -544.4989188058036, + "loss": 0.0046, + "rewards/chosen": 11.964291042751736, + "rewards/margins": 32.27009025452629, + "rewards/rejected": -20.305799211774552, + "step": 343 + }, + { + "epoch": 0.12571950662402923, + "grad_norm": 3.080936908721924, + "kl": 15.507274627685547, + "learning_rate": 2.2992480191704002e-05, + "logits/chosen": -15526323.80952381, + "logits/rejected": -17953467.636363637, + "logps/chosen": -362.77413504464283, + "logps/rejected": -539.9638227982955, + "loss": 0.0348, + "rewards/chosen": 8.70968264625186, + "rewards/margins": 28.21762415237757, + "rewards/rejected": -19.50794150612571, + "step": 344 + }, + { + "epoch": 0.12608497030607582, + "grad_norm": 3.350477933883667, + "kl": 7.749001502990723, + "learning_rate": 2.272325493947257e-05, + "logits/chosen": 1378536.4, + "logits/rejected": -19413999.05882353, + "logps/chosen": -306.5630859375, + "logps/rejected": -494.1865234375, + "loss": 0.0167, + "rewards/chosen": 8.740608723958333, + "rewards/margins": 28.191382075291052, + "rewards/rejected": -19.45077335133272, + "step": 345 + }, + { + "epoch": 0.12645043398812242, + "grad_norm": 2.0843422412872314, + "kl": 7.250523567199707, + "learning_rate": 2.245515092739488e-05, + "logits/chosen": -6196012.235294118, + "logits/rejected": 1285648.0, + "logps/chosen": -319.6070772058824, + "logps/rejected": -487.025390625, + "loss": 0.0058, + "rewards/chosen": 11.684327069450827, + "rewards/margins": 28.801392499138327, + "rewards/rejected": -17.1170654296875, + "step": 346 + }, + { + "epoch": 0.12681589767016901, + "grad_norm": 3.1911070346832275, + "kl": 12.691611289978027, + "learning_rate": 2.2188179176176766e-05, + "logits/chosen": -17550987.789473683, + "logits/rejected": -22113875.692307692, + "logps/chosen": -318.3480160361842, + "logps/rejected": -535.5372220552885, + "loss": 0.0382, + "rewards/chosen": 11.653176558645148, + "rewards/margins": 32.772016702875916, + "rewards/rejected": -21.11884014423077, + "step": 347 + }, + { + "epoch": 0.1271813613522156, + "grad_norm": 5.1311750411987305, + "kl": 13.643856048583984, + "learning_rate": 2.192235065998126e-05, + "logits/chosen": -18615652.8, + "logits/rejected": -16904006.666666668, + "logps/chosen": -332.4, + "logps/rejected": -553.8886311848959, + "loss": 0.0454, + "rewards/chosen": 11.48718032836914, + "rewards/margins": 28.22889989217122, + "rewards/rejected": -16.741719563802082, + "step": 348 + }, + { + "epoch": 0.1275468250342622, + "grad_norm": 2.0195465087890625, + "kl": 9.704069137573242, + "learning_rate": 2.165767630597752e-05, + "logits/chosen": -369596.625, + "logits/rejected": -11441439.0, + "logps/chosen": -333.2952880859375, + "logps/rejected": -402.18646240234375, + "loss": 0.0152, + "rewards/chosen": 8.748852729797363, + "rewards/margins": 24.57652187347412, + "rewards/rejected": -15.827669143676758, + "step": 349 + }, + { + "epoch": 0.12791228871630883, + "grad_norm": 3.6267688274383545, + "kl": 10.93829345703125, + "learning_rate": 2.139416699389153e-05, + "logits/chosen": -20485460.0, + "logits/rejected": -18507292.0, + "logps/chosen": -367.7033996582031, + "logps/rejected": -348.7797546386719, + "loss": 0.012, + "rewards/chosen": 10.893939971923828, + "rewards/margins": 25.261469841003418, + "rewards/rejected": -14.36752986907959, + "step": 350 + }, + { + "epoch": 0.12827775239835543, + "grad_norm": 1.280780553817749, + "kl": 2.018073081970215, + "learning_rate": 2.1131833555559037e-05, + "logits/chosen": -16487076.57142857, + "logits/rejected": -22714336.0, + "logps/chosen": -365.26070731026783, + "logps/rejected": -467.31011284722223, + "loss": 0.0006, + "rewards/chosen": 11.814733232770648, + "rewards/margins": 27.97460634746249, + "rewards/rejected": -16.15987311469184, + "step": 351 + }, + { + "epoch": 0.12864321608040202, + "grad_norm": 3.7821719646453857, + "kl": 6.026181221008301, + "learning_rate": 2.0870686774480196e-05, + "logits/chosen": -16758192.0, + "logits/rejected": -19138073.6, + "logps/chosen": -307.3867594401042, + "logps/rejected": -583.582666015625, + "loss": 0.0096, + "rewards/chosen": 11.192501068115234, + "rewards/margins": 33.51781234741211, + "rewards/rejected": -22.325311279296876, + "step": 352 + }, + { + "epoch": 0.12900867976244862, + "grad_norm": 4.052680492401123, + "kl": 9.294964790344238, + "learning_rate": 2.061073738537635e-05, + "logits/chosen": -14739304.421052631, + "logits/rejected": -16077278.76923077, + "logps/chosen": -311.5301963404605, + "logps/rejected": -460.66556490384613, + "loss": 0.0104, + "rewards/chosen": 10.95355867084704, + "rewards/margins": 30.067303553283935, + "rewards/rejected": -19.113744882436897, + "step": 353 + }, + { + "epoch": 0.12937414344449522, + "grad_norm": 2.478837490081787, + "kl": 4.608976364135742, + "learning_rate": 2.0351996073748713e-05, + "logits/chosen": -17296711.529411763, + "logits/rejected": 24772644.266666666, + "logps/chosen": -356.7915900735294, + "logps/rejected": -523.5135416666667, + "loss": 0.0028, + "rewards/chosen": 12.470437442555147, + "rewards/margins": 34.43082155713848, + "rewards/rejected": -21.960384114583334, + "step": 354 + }, + { + "epoch": 0.1297396071265418, + "grad_norm": 1.847748875617981, + "kl": 8.054999351501465, + "learning_rate": 2.0094473475439202e-05, + "logits/chosen": -14106689.88235294, + "logits/rejected": -22187805.866666667, + "logps/chosen": -297.2208467371324, + "logps/rejected": -480.2058919270833, + "loss": 0.0026, + "rewards/chosen": 10.071542178883272, + "rewards/margins": 28.085637230966604, + "rewards/rejected": -18.014095052083334, + "step": 355 + }, + { + "epoch": 0.1301050708085884, + "grad_norm": 2.1878387928009033, + "kl": 3.745896100997925, + "learning_rate": 1.9838180176193178e-05, + "logits/chosen": -8668522.285714285, + "logits/rejected": -10683781.333333334, + "logps/chosen": -227.75493512834822, + "logps/rejected": -491.1897243923611, + "loss": 0.008, + "rewards/chosen": 8.659985133579799, + "rewards/margins": 29.01223936535063, + "rewards/rejected": -20.352254231770832, + "step": 356 + }, + { + "epoch": 0.130470534490635, + "grad_norm": 3.8430967330932617, + "kl": 7.603660583496094, + "learning_rate": 1.9583126711224343e-05, + "logits/chosen": -10734748.0, + "logits/rejected": -9619634.0, + "logps/chosen": -365.4243469238281, + "logps/rejected": -511.5800476074219, + "loss": 0.0139, + "rewards/chosen": 10.972052574157715, + "rewards/margins": 27.267218589782715, + "rewards/rejected": -16.295166015625, + "step": 357 + }, + { + "epoch": 0.1308359981726816, + "grad_norm": 6.80948543548584, + "kl": 5.7926740646362305, + "learning_rate": 1.9329323564781682e-05, + "logits/chosen": -7846976.0, + "logits/rejected": -24473128.533333335, + "logps/chosen": -285.24256089154414, + "logps/rejected": -564.2430338541667, + "loss": 0.017, + "rewards/chosen": 8.331493602079505, + "rewards/margins": 30.13961534687117, + "rewards/rejected": -21.808121744791666, + "step": 358 + }, + { + "epoch": 0.1312014618547282, + "grad_norm": 3.4457366466522217, + "kl": 4.417056560516357, + "learning_rate": 1.9076781169718428e-05, + "logits/chosen": 542870.8571428572, + "logits/rejected": -27325434.666666668, + "logps/chosen": -335.85372488839283, + "logps/rejected": -544.2056749131945, + "loss": 0.0091, + "rewards/chosen": 8.92410169328962, + "rewards/margins": 28.946749127100382, + "rewards/rejected": -20.022647433810764, + "step": 359 + }, + { + "epoch": 0.1315669255367748, + "grad_norm": 8.587203025817871, + "kl": 2.282320022583008, + "learning_rate": 1.8825509907063327e-05, + "logits/chosen": -21156348.8, + "logits/rejected": -16194277.818181818, + "logps/chosen": -348.8396484375, + "logps/rejected": -474.7384588068182, + "loss": 0.004, + "rewards/chosen": 9.361114501953125, + "rewards/margins": 26.070623224431817, + "rewards/rejected": -16.70950872247869, + "step": 360 + }, + { + "epoch": 0.1319323892188214, + "grad_norm": 5.191043376922607, + "kl": 5.198314666748047, + "learning_rate": 1.8575520105593817e-05, + "logits/chosen": -15013608.615384616, + "logits/rejected": -15378125.47368421, + "logps/chosen": -316.3674128605769, + "logps/rejected": -525.6800472861842, + "loss": 0.0128, + "rewards/chosen": 11.202185997596153, + "rewards/margins": 29.26889581718908, + "rewards/rejected": -18.066709819592926, + "step": 361 + }, + { + "epoch": 0.13229785290086798, + "grad_norm": 4.363422870635986, + "kl": 10.583883285522461, + "learning_rate": 1.8326822041411524e-05, + "logits/chosen": 5052862.933333334, + "logits/rejected": -16726782.11764706, + "logps/chosen": -342.75709635416666, + "logps/rejected": -456.0245576746324, + "loss": 0.0499, + "rewards/chosen": 9.132912190755208, + "rewards/margins": 25.596998386757043, + "rewards/rejected": -16.464086196001837, + "step": 362 + }, + { + "epoch": 0.13266331658291458, + "grad_norm": 1.1232410669326782, + "kl": 7.092581272125244, + "learning_rate": 1.807942593751973e-05, + "logits/chosen": -13845044.57142857, + "logits/rejected": -25986147.555555556, + "logps/chosen": -306.957763671875, + "logps/rejected": -509.3697916666667, + "loss": 0.0014, + "rewards/chosen": 11.72991943359375, + "rewards/margins": 28.951114230685764, + "rewards/rejected": -17.221194797092014, + "step": 363 + }, + { + "epoch": 0.13302878026496118, + "grad_norm": 2.600659132003784, + "kl": 5.733438014984131, + "learning_rate": 1.783334196340331e-05, + "logits/chosen": 2205126.4615384615, + "logits/rejected": -24825561.263157893, + "logps/chosen": -292.94227013221155, + "logps/rejected": -333.5703638980263, + "loss": 0.0059, + "rewards/chosen": 10.814683180588942, + "rewards/margins": 22.883818344548644, + "rewards/rejected": -12.069135163959704, + "step": 364 + }, + { + "epoch": 0.13339424394700777, + "grad_norm": 4.13524055480957, + "kl": 7.637614727020264, + "learning_rate": 1.758858023461059e-05, + "logits/chosen": -14920982.153846154, + "logits/rejected": -12979299.368421054, + "logps/chosen": -307.42758413461536, + "logps/rejected": -469.31846217105266, + "loss": 0.0196, + "rewards/chosen": 8.490856464092548, + "rewards/margins": 24.323358327271002, + "rewards/rejected": -15.832501863178454, + "step": 365 + }, + { + "epoch": 0.13375970762905437, + "grad_norm": 2.0567433834075928, + "kl": 12.823047637939453, + "learning_rate": 1.7345150812337564e-05, + "logits/chosen": -10395045.647058824, + "logits/rejected": -11002950.4, + "logps/chosen": -332.1681698069853, + "logps/rejected": -494.33857421875, + "loss": 0.014, + "rewards/chosen": 11.399554084329043, + "rewards/margins": 26.83494226792279, + "rewards/rejected": -15.43538818359375, + "step": 366 + }, + { + "epoch": 0.13412517131110097, + "grad_norm": 1.0228042602539062, + "kl": 5.559465408325195, + "learning_rate": 1.7103063703014372e-05, + "logits/chosen": -15145895.0, + "logits/rejected": -1945096.0, + "logps/chosen": -383.25677490234375, + "logps/rejected": -535.983154296875, + "loss": 0.006, + "rewards/chosen": 12.447044372558594, + "rewards/margins": 31.5280704498291, + "rewards/rejected": -19.081026077270508, + "step": 367 + }, + { + "epoch": 0.13449063499314756, + "grad_norm": 3.6094939708709717, + "kl": 5.796103477478027, + "learning_rate": 1.6862328857893854e-05, + "logits/chosen": -12962304.0, + "logits/rejected": -35320112.0, + "logps/chosen": -340.75103759765625, + "logps/rejected": -468.5360412597656, + "loss": 0.0095, + "rewards/chosen": 11.01352310180664, + "rewards/margins": 31.57244110107422, + "rewards/rejected": -20.558917999267578, + "step": 368 + }, + { + "epoch": 0.13485609867519416, + "grad_norm": 5.49335241317749, + "kl": 2.6393895149230957, + "learning_rate": 1.66229561726426e-05, + "logits/chosen": -27452233.846153848, + "logits/rejected": -19718218.10526316, + "logps/chosen": -320.00424429086536, + "logps/rejected": -509.1752158717105, + "loss": 0.0069, + "rewards/chosen": 8.941549447866587, + "rewards/margins": 25.749351347023662, + "rewards/rejected": -16.807801899157074, + "step": 369 + }, + { + "epoch": 0.13522156235724075, + "grad_norm": 2.3576343059539795, + "kl": 14.387454986572266, + "learning_rate": 1.6384955486934156e-05, + "logits/chosen": -10861536.94117647, + "logits/rejected": -29759168.0, + "logps/chosen": -299.68948184742646, + "logps/rejected": -566.5360026041667, + "loss": 0.0026, + "rewards/chosen": 13.967041015625, + "rewards/margins": 35.288795979817706, + "rewards/rejected": -21.32175496419271, + "step": 370 + }, + { + "epoch": 0.13558702603928735, + "grad_norm": 0.6327605247497559, + "kl": 3.9594826698303223, + "learning_rate": 1.614833658404454e-05, + "logits/chosen": -14301642.666666666, + "logits/rejected": -35243363.2, + "logps/chosen": -341.4702962239583, + "logps/rejected": -473.841162109375, + "loss": 0.0069, + "rewards/chosen": 11.527236938476562, + "rewards/margins": 31.85128173828125, + "rewards/rejected": -20.32404479980469, + "step": 371 + }, + { + "epoch": 0.13595248972133395, + "grad_norm": 5.050543308258057, + "kl": 9.189316749572754, + "learning_rate": 1.5913109190450032e-05, + "logits/chosen": -18253389.09090909, + "logits/rejected": -17729145.6, + "logps/chosen": -365.65407492897725, + "logps/rejected": -348.2648193359375, + "loss": 0.017, + "rewards/chosen": 9.233623157848012, + "rewards/margins": 26.5856220592152, + "rewards/rejected": -17.35199890136719, + "step": 372 + }, + { + "epoch": 0.13631795340338054, + "grad_norm": 2.5110576152801514, + "kl": 1.841963768005371, + "learning_rate": 1.567928297542749e-05, + "logits/chosen": -9551908.363636363, + "logits/rejected": -20412714.666666668, + "logps/chosen": -281.53786399147725, + "logps/rejected": -555.8650483630952, + "loss": 0.0077, + "rewards/chosen": 8.87999101118608, + "rewards/margins": 28.592726141859444, + "rewards/rejected": -19.712735130673362, + "step": 373 + }, + { + "epoch": 0.13668341708542714, + "grad_norm": 1.7502745389938354, + "kl": 10.012456893920898, + "learning_rate": 1.544686755065677e-05, + "logits/chosen": -18033862.736842107, + "logits/rejected": -11638153.846153846, + "logps/chosen": -299.41085012335526, + "logps/rejected": -426.59998497596155, + "loss": 0.0094, + "rewards/chosen": 11.120919478567023, + "rewards/margins": 27.851680493065217, + "rewards/rejected": -16.730761014498196, + "step": 374 + }, + { + "epoch": 0.13704888076747374, + "grad_norm": 7.847499370574951, + "kl": 4.975961685180664, + "learning_rate": 1.5215872469825682e-05, + "logits/chosen": -13523158.4, + "logits/rejected": -18919284.70588235, + "logps/chosen": -336.186328125, + "logps/rejected": -575.5615234375, + "loss": 0.0178, + "rewards/chosen": 10.927610270182292, + "rewards/margins": 31.373709046606926, + "rewards/rejected": -20.446098776424634, + "step": 375 + }, + { + "epoch": 0.13741434444952033, + "grad_norm": 0.9299519658088684, + "kl": 3.306962013244629, + "learning_rate": 1.4986307228237268e-05, + "logits/chosen": -18495393.6, + "logits/rejected": -25316706.90909091, + "logps/chosen": -327.200390625, + "logps/rejected": -445.8699840198864, + "loss": 0.0012, + "rewards/chosen": 10.558985900878906, + "rewards/margins": 30.361949157714843, + "rewards/rejected": -19.802963256835938, + "step": 376 + }, + { + "epoch": 0.13777980813156693, + "grad_norm": 2.3858423233032227, + "kl": 3.740281581878662, + "learning_rate": 1.4758181262419423e-05, + "logits/chosen": -19177298.0, + "logits/rejected": -23554898.0, + "logps/chosen": -310.3988342285156, + "logps/rejected": -508.4689636230469, + "loss": 0.0035, + "rewards/chosen": 10.302159309387207, + "rewards/margins": 28.714776039123535, + "rewards/rejected": -18.412616729736328, + "step": 377 + }, + { + "epoch": 0.13814527181361352, + "grad_norm": 0.7645795345306396, + "kl": 6.145695686340332, + "learning_rate": 1.4531503949737108e-05, + "logits/chosen": -12916399.111111112, + "logits/rejected": -7299935.428571428, + "logps/chosen": -373.2731119791667, + "logps/rejected": -432.65342494419644, + "loss": 0.0008, + "rewards/chosen": 11.443390740288628, + "rewards/margins": 29.490209064786395, + "rewards/rejected": -18.046818324497767, + "step": 378 + }, + { + "epoch": 0.13851073549566012, + "grad_norm": 3.233696699142456, + "kl": 14.197111129760742, + "learning_rate": 1.4306284608006836e-05, + "logits/chosen": -14462216.0, + "logits/rejected": -15989888.0, + "logps/chosen": -346.838623046875, + "logps/rejected": -552.4746500651041, + "loss": 0.0132, + "rewards/chosen": 10.641590881347657, + "rewards/margins": 29.17661361694336, + "rewards/rejected": -18.535022735595703, + "step": 379 + }, + { + "epoch": 0.13887619917770672, + "grad_norm": 3.947866201400757, + "kl": 5.6465654373168945, + "learning_rate": 1.4082532495113626e-05, + "logits/chosen": -12425200.0, + "logits/rejected": -18051664.0, + "logps/chosen": -303.81671142578125, + "logps/rejected": -375.0098876953125, + "loss": 0.0078, + "rewards/chosen": 8.62260627746582, + "rewards/margins": 23.38584327697754, + "rewards/rejected": -14.763236999511719, + "step": 380 + }, + { + "epoch": 0.1392416628597533, + "grad_norm": 4.610440731048584, + "kl": 11.182896614074707, + "learning_rate": 1.3860256808630428e-05, + "logits/chosen": -15481946.666666666, + "logits/rejected": -26137682.285714287, + "logps/chosen": -336.4997829861111, + "logps/rejected": -364.40223911830356, + "loss": 0.019, + "rewards/chosen": 11.971483866373697, + "rewards/margins": 27.233008611769904, + "rewards/rejected": -15.261524745396205, + "step": 381 + }, + { + "epoch": 0.1396071265417999, + "grad_norm": 3.6953859329223633, + "kl": 5.39324951171875, + "learning_rate": 1.3639466685440132e-05, + "logits/chosen": -12539386.666666666, + "logits/rejected": -23957658.352941178, + "logps/chosen": -293.07203776041666, + "logps/rejected": -509.17043887867646, + "loss": 0.0135, + "rewards/chosen": 9.760001627604167, + "rewards/margins": 29.735820934819237, + "rewards/rejected": -19.975819307215072, + "step": 382 + }, + { + "epoch": 0.1399725902238465, + "grad_norm": 0.5804247856140137, + "kl": 0.8706064224243164, + "learning_rate": 1.3420171201359933e-05, + "logits/chosen": -30833888.0, + "logits/rejected": -16618376.727272727, + "logps/chosen": -363.5557861328125, + "logps/rejected": -535.0470081676136, + "loss": 0.0007, + "rewards/chosen": 12.466926574707031, + "rewards/margins": 35.599439447576344, + "rewards/rejected": -23.132512872869317, + "step": 383 + }, + { + "epoch": 0.1403380539058931, + "grad_norm": 2.7365713119506836, + "kl": 5.294741630554199, + "learning_rate": 1.3202379370768252e-05, + "logits/chosen": -16583109.647058824, + "logits/rejected": -26208691.2, + "logps/chosen": -302.0433134191176, + "logps/rejected": -483.01647135416664, + "loss": 0.0155, + "rewards/chosen": 9.693610696231618, + "rewards/margins": 32.18121038698683, + "rewards/rejected": -22.487599690755207, + "step": 384 + }, + { + "epoch": 0.1407035175879397, + "grad_norm": 2.6418542861938477, + "kl": 14.296971321105957, + "learning_rate": 1.2986100146234232e-05, + "logits/chosen": -14297816.0, + "logits/rejected": -25109090.0, + "logps/chosen": -350.2439880371094, + "logps/rejected": -440.7972106933594, + "loss": 0.0082, + "rewards/chosen": 11.960081100463867, + "rewards/margins": 30.00857162475586, + "rewards/rejected": -18.048490524291992, + "step": 385 + }, + { + "epoch": 0.1410689812699863, + "grad_norm": 3.064525604248047, + "kl": 6.19841194152832, + "learning_rate": 1.2771342418149657e-05, + "logits/chosen": -23168660.363636363, + "logits/rejected": -23863637.333333332, + "logps/chosen": -444.31503018465907, + "logps/rejected": -511.3971354166667, + "loss": 0.0116, + "rewards/chosen": 10.073090986772018, + "rewards/margins": 29.97499877137023, + "rewards/rejected": -19.901907784598215, + "step": 386 + }, + { + "epoch": 0.1414344449520329, + "grad_norm": 2.8777620792388916, + "kl": 5.760807037353516, + "learning_rate": 1.2558115014363592e-05, + "logits/chosen": -18559053.17647059, + "logits/rejected": -17267602.133333333, + "logps/chosen": -359.0542853860294, + "logps/rejected": -599.6529296875, + "loss": 0.0101, + "rewards/chosen": 10.873653636259192, + "rewards/margins": 40.83948615579044, + "rewards/rejected": -29.96583251953125, + "step": 387 + }, + { + "epoch": 0.14179990863407949, + "grad_norm": 2.3328781127929688, + "kl": 6.80040979385376, + "learning_rate": 1.2346426699819458e-05, + "logits/chosen": -24171420.23529412, + "logits/rejected": -23203560.533333335, + "logps/chosen": -341.0097081801471, + "logps/rejected": -461.3701171875, + "loss": 0.0158, + "rewards/chosen": 12.552534215590534, + "rewards/margins": 35.3168571322572, + "rewards/rejected": -22.764322916666668, + "step": 388 + }, + { + "epoch": 0.14216537231612608, + "grad_norm": 12.305940628051758, + "kl": 5.226529121398926, + "learning_rate": 1.2136286176194745e-05, + "logits/chosen": -14569162.352941176, + "logits/rejected": -16322717.866666667, + "logps/chosen": -326.70461856617646, + "logps/rejected": -483.19768880208335, + "loss": 0.0137, + "rewards/chosen": 9.00458391974954, + "rewards/margins": 26.841862158681835, + "rewards/rejected": -17.837278238932292, + "step": 389 + }, + { + "epoch": 0.14253083599817268, + "grad_norm": 1.6833915710449219, + "kl": 6.847652435302734, + "learning_rate": 1.1927702081543279e-05, + "logits/chosen": -15248685.0, + "logits/rejected": -20202206.0, + "logps/chosen": -398.53173828125, + "logps/rejected": -541.6541748046875, + "loss": 0.0055, + "rewards/chosen": 12.000779151916504, + "rewards/margins": 35.35362148284912, + "rewards/rejected": -23.352842330932617, + "step": 390 + }, + { + "epoch": 0.14289629968021927, + "grad_norm": 2.8360769748687744, + "kl": 3.2881107330322266, + "learning_rate": 1.1720682989940262e-05, + "logits/chosen": -22246647.466666665, + "logits/rejected": -15968598.588235294, + "logps/chosen": -367.06940104166665, + "logps/rejected": -684.5506663602941, + "loss": 0.0046, + "rewards/chosen": 11.115691121419271, + "rewards/margins": 41.476821779737286, + "rewards/rejected": -30.361130658318014, + "step": 391 + }, + { + "epoch": 0.14326176336226587, + "grad_norm": 3.6260082721710205, + "kl": 10.013285636901855, + "learning_rate": 1.1515237411129698e-05, + "logits/chosen": -21980330.666666668, + "logits/rejected": -19440228.57142857, + "logps/chosen": -313.8605143229167, + "logps/rejected": -349.22938755580356, + "loss": 0.0062, + "rewards/chosen": 12.481781005859375, + "rewards/margins": 26.07877131870815, + "rewards/rejected": -13.596990312848773, + "step": 392 + }, + { + "epoch": 0.14362722704431247, + "grad_norm": 0.8183795213699341, + "kl": 3.0769596099853516, + "learning_rate": 1.1311373790174657e-05, + "logits/chosen": -8407544.0, + "logits/rejected": -11936653.714285715, + "logps/chosen": -362.89723899147725, + "logps/rejected": -505.6334170386905, + "loss": 0.001, + "rewards/chosen": 11.545245777476918, + "rewards/margins": 32.4259307663162, + "rewards/rejected": -20.880684988839285, + "step": 393 + }, + { + "epoch": 0.14399269072635906, + "grad_norm": 0.7599905133247375, + "kl": 3.1279444694519043, + "learning_rate": 1.1109100507110132e-05, + "logits/chosen": -21427994.0, + "logits/rejected": -17444296.0, + "logps/chosen": -307.7591247558594, + "logps/rejected": -507.5738830566406, + "loss": 0.0012, + "rewards/chosen": 11.219807624816895, + "rewards/margins": 30.0604829788208, + "rewards/rejected": -18.840675354003906, + "step": 394 + }, + { + "epoch": 0.14435815440840566, + "grad_norm": 1.6188733577728271, + "kl": 8.268957138061523, + "learning_rate": 1.090842587659851e-05, + "logits/chosen": -31088768.0, + "logits/rejected": -30401308.23529412, + "logps/chosen": -325.2583333333333, + "logps/rejected": -524.7356387867648, + "loss": 0.0095, + "rewards/chosen": 11.921632893880208, + "rewards/margins": 31.56609233781403, + "rewards/rejected": -19.644459443933822, + "step": 395 + }, + { + "epoch": 0.14472361809045226, + "grad_norm": 2.607548952102661, + "kl": 8.739367485046387, + "learning_rate": 1.0709358147587884e-05, + "logits/chosen": -13986383.05882353, + "logits/rejected": -33625621.333333336, + "logps/chosen": -301.66208065257354, + "logps/rejected": -570.3166015625, + "loss": 0.0147, + "rewards/chosen": 9.470724666819853, + "rewards/margins": 30.321798885569855, + "rewards/rejected": -20.85107421875, + "step": 396 + }, + { + "epoch": 0.14508908177249885, + "grad_norm": 3.7024481296539307, + "kl": 8.208661079406738, + "learning_rate": 1.0511905502972886e-05, + "logits/chosen": -24708076.0, + "logits/rejected": -26648060.0, + "logps/chosen": -323.1011657714844, + "logps/rejected": -431.546630859375, + "loss": 0.0229, + "rewards/chosen": 11.612174034118652, + "rewards/margins": 29.794764518737793, + "rewards/rejected": -18.18259048461914, + "step": 397 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 1.6638153791427612, + "kl": 6.803812026977539, + "learning_rate": 1.031607605925839e-05, + "logits/chosen": -36983144.0, + "logits/rejected": -19406228.0, + "logps/chosen": -266.76824951171875, + "logps/rejected": -394.08416748046875, + "loss": 0.0155, + "rewards/chosen": 10.895238876342773, + "rewards/margins": 29.74530792236328, + "rewards/rejected": -18.850069046020508, + "step": 398 + }, + { + "epoch": 0.14582000913659204, + "grad_norm": 6.819797515869141, + "kl": 15.803168296813965, + "learning_rate": 1.0121877866225781e-05, + "logits/chosen": -13767297.523809524, + "logits/rejected": -22651038.545454547, + "logps/chosen": -350.61397879464283, + "logps/rejected": -647.6376509232955, + "loss": 0.0251, + "rewards/chosen": 11.529028756277901, + "rewards/margins": 36.29702303007051, + "rewards/rejected": -24.767994273792613, + "step": 399 + }, + { + "epoch": 0.14618547281863864, + "grad_norm": 6.853137016296387, + "kl": 10.176651954650879, + "learning_rate": 9.929318906602175e-06, + "logits/chosen": -16842419.555555556, + "logits/rejected": -14678722.285714285, + "logps/chosen": -341.6521267361111, + "logps/rejected": -421.07303292410717, + "loss": 0.0132, + "rewards/chosen": 9.377894083658854, + "rewards/margins": 25.671339125860307, + "rewards/rejected": -16.29344504220145, + "step": 400 + }, + { + "epoch": 0.14655093650068524, + "grad_norm": 3.7213401794433594, + "kl": 8.98808479309082, + "learning_rate": 9.738407095732195e-06, + "logits/chosen": -11559998.0, + "logits/rejected": -19430600.0, + "logps/chosen": -333.1717224121094, + "logps/rejected": -498.2584228515625, + "loss": 0.013, + "rewards/chosen": 10.488086700439453, + "rewards/margins": 28.856863021850586, + "rewards/rejected": -18.368776321411133, + "step": 401 + }, + { + "epoch": 0.14691640018273183, + "grad_norm": 4.244117736816406, + "kl": 10.936875343322754, + "learning_rate": 9.549150281252633e-06, + "logits/chosen": -17990185.6, + "logits/rejected": -23708320.0, + "logps/chosen": -313.86640625, + "logps/rejected": -603.3377278645834, + "loss": 0.0209, + "rewards/chosen": 10.498314666748048, + "rewards/margins": 32.77055689493815, + "rewards/rejected": -22.272242228190105, + "step": 402 + }, + { + "epoch": 0.14728186386477843, + "grad_norm": 3.3611221313476562, + "kl": 7.79799747467041, + "learning_rate": 9.36155624276987e-06, + "logits/chosen": -13780365.538461538, + "logits/rejected": -11646197.894736841, + "logps/chosen": -354.96420522836536, + "logps/rejected": -514.0911287006579, + "loss": 0.0092, + "rewards/chosen": 10.992173414963942, + "rewards/margins": 33.64494601724601, + "rewards/rejected": -22.652772602282074, + "step": 403 + }, + { + "epoch": 0.14764732754682502, + "grad_norm": 2.651437759399414, + "kl": 4.366757392883301, + "learning_rate": 9.175632691540065e-06, + "logits/chosen": -14217870.222222222, + "logits/rejected": -25839862.85714286, + "logps/chosen": -272.78358289930554, + "logps/rejected": -399.7432338169643, + "loss": 0.0052, + "rewards/chosen": 9.935327317979601, + "rewards/margins": 27.865867130340092, + "rewards/rejected": -17.93053981236049, + "step": 404 + }, + { + "epoch": 0.14801279122887162, + "grad_norm": 0.06816679984331131, + "kl": 2.2351841926574707, + "learning_rate": 8.991387270152201e-06, + "logits/chosen": -23278110.222222224, + "logits/rejected": -16935799.652173914, + "logps/chosen": -350.6968044704861, + "logps/rejected": -522.5578294836956, + "loss": 0.0001, + "rewards/chosen": 10.803120930989584, + "rewards/margins": 31.210005165874094, + "rewards/rejected": -20.40688423488451, + "step": 405 + }, + { + "epoch": 0.14837825491091822, + "grad_norm": 3.3330228328704834, + "kl": 9.458934783935547, + "learning_rate": 8.808827552213916e-06, + "logits/chosen": -22736454.0, + "logits/rejected": -11275474.0, + "logps/chosen": -361.2474670410156, + "logps/rejected": -399.935791015625, + "loss": 0.0149, + "rewards/chosen": 10.495070457458496, + "rewards/margins": 28.26623249053955, + "rewards/rejected": -17.771162033081055, + "step": 406 + }, + { + "epoch": 0.1487437185929648, + "grad_norm": 2.6508915424346924, + "kl": 5.435604572296143, + "learning_rate": 8.627961042040184e-06, + "logits/chosen": -16735508.210526315, + "logits/rejected": -18577777.230769232, + "logps/chosen": -327.1977796052632, + "logps/rejected": -479.3508488581731, + "loss": 0.0073, + "rewards/chosen": 11.9719382838199, + "rewards/margins": 34.59804395915043, + "rewards/rejected": -22.62610567533053, + "step": 407 + }, + { + "epoch": 0.1491091822750114, + "grad_norm": 3.9252002239227295, + "kl": 6.755114555358887, + "learning_rate": 8.448795174344804e-06, + "logits/chosen": -21921961.14285714, + "logits/rejected": -23587729.777777776, + "logps/chosen": -376.69192940848217, + "logps/rejected": -516.4800347222222, + "loss": 0.0198, + "rewards/chosen": 9.311163766043526, + "rewards/margins": 27.996756417410715, + "rewards/rejected": -18.685592651367188, + "step": 408 + }, + { + "epoch": 0.149474645957058, + "grad_norm": 2.8203179836273193, + "kl": 9.154296875, + "learning_rate": 8.271337313934869e-06, + "logits/chosen": -16806274.0, + "logits/rejected": -19890772.0, + "logps/chosen": -299.132080078125, + "logps/rejected": -469.033203125, + "loss": 0.011, + "rewards/chosen": 10.388571739196777, + "rewards/margins": 26.146337509155273, + "rewards/rejected": -15.757765769958496, + "step": 409 + }, + { + "epoch": 0.1498401096391046, + "grad_norm": 2.4894776344299316, + "kl": 3.0369834899902344, + "learning_rate": 8.09559475540797e-06, + "logits/chosen": -20668140.0, + "logits/rejected": -8688395.0, + "logps/chosen": -331.1326599121094, + "logps/rejected": -446.0169982910156, + "loss": 0.0037, + "rewards/chosen": 9.85788631439209, + "rewards/margins": 28.907238960266113, + "rewards/rejected": -19.049352645874023, + "step": 410 + }, + { + "epoch": 0.1502055733211512, + "grad_norm": 3.686699867248535, + "kl": 6.8073248863220215, + "learning_rate": 7.921574722852343e-06, + "logits/chosen": -18417532.0, + "logits/rejected": -15535066.0, + "logps/chosen": -403.1018981933594, + "logps/rejected": -405.13018798828125, + "loss": 0.012, + "rewards/chosen": 11.727499008178711, + "rewards/margins": 27.075042724609375, + "rewards/rejected": -15.347543716430664, + "step": 411 + }, + { + "epoch": 0.1505710370031978, + "grad_norm": 4.364274978637695, + "kl": 7.379733085632324, + "learning_rate": 7.749284369549953e-06, + "logits/chosen": -20572004.266666666, + "logits/rejected": -16465479.529411765, + "logps/chosen": -383.176171875, + "logps/rejected": -496.5433134191176, + "loss": 0.0081, + "rewards/chosen": 10.621148681640625, + "rewards/margins": 35.045168887867646, + "rewards/rejected": -24.424020206227024, + "step": 412 + }, + { + "epoch": 0.1509365006852444, + "grad_norm": 8.022272109985352, + "kl": 8.102239608764648, + "learning_rate": 7.578730777682386e-06, + "logits/chosen": -15526733.714285715, + "logits/rejected": -27349265.777777776, + "logps/chosen": -262.49257114955356, + "logps/rejected": -401.47447374131946, + "loss": 0.0254, + "rewards/chosen": 9.998312813895089, + "rewards/margins": 24.25057111467634, + "rewards/rejected": -14.25225830078125, + "step": 413 + }, + { + "epoch": 0.15130196436729101, + "grad_norm": 5.674525737762451, + "kl": 11.191556930541992, + "learning_rate": 7.409920958039795e-06, + "logits/chosen": -3903738.25, + "logits/rejected": -10004945.0, + "logps/chosen": -329.92071533203125, + "logps/rejected": -460.50567626953125, + "loss": 0.0156, + "rewards/chosen": 10.704710006713867, + "rewards/margins": 29.470081329345703, + "rewards/rejected": -18.765371322631836, + "step": 414 + }, + { + "epoch": 0.1516674280493376, + "grad_norm": 1.652557373046875, + "kl": 6.069989204406738, + "learning_rate": 7.242861849732696e-06, + "logits/chosen": -15238320.0, + "logits/rejected": -23044738.133333333, + "logps/chosen": -308.97262752757354, + "logps/rejected": -473.00934244791665, + "loss": 0.0081, + "rewards/chosen": 13.310818840475644, + "rewards/margins": 31.41892024021523, + "rewards/rejected": -18.108101399739585, + "step": 415 + }, + { + "epoch": 0.1520328917313842, + "grad_norm": 2.467336654663086, + "kl": 8.838824272155762, + "learning_rate": 7.077560319906695e-06, + "logits/chosen": -15968238.933333334, + "logits/rejected": -11101067.294117646, + "logps/chosen": -347.22734375, + "logps/rejected": -448.1480928308824, + "loss": 0.0051, + "rewards/chosen": 12.324575805664063, + "rewards/margins": 28.871825992359838, + "rewards/rejected": -16.547250186695774, + "step": 416 + }, + { + "epoch": 0.1523983554134308, + "grad_norm": 4.9573588371276855, + "kl": 12.335953712463379, + "learning_rate": 6.9140231634602485e-06, + "logits/chosen": -20029196.0, + "logits/rejected": -21321944.0, + "logps/chosen": -348.1080627441406, + "logps/rejected": -455.6292724609375, + "loss": 0.0474, + "rewards/chosen": 11.541509628295898, + "rewards/margins": 27.00638771057129, + "rewards/rejected": -15.46487808227539, + "step": 417 + }, + { + "epoch": 0.1527638190954774, + "grad_norm": 6.510196685791016, + "kl": 7.926250457763672, + "learning_rate": 6.752257102765325e-06, + "logits/chosen": -17274903.466666665, + "logits/rejected": -18529421.17647059, + "logps/chosen": -348.30162760416664, + "logps/rejected": -420.3418830422794, + "loss": 0.0327, + "rewards/chosen": 11.31205546061198, + "rewards/margins": 28.550096160290288, + "rewards/rejected": -17.238040699678308, + "step": 418 + }, + { + "epoch": 0.153129282777524, + "grad_norm": 8.186201095581055, + "kl": 5.455965995788574, + "learning_rate": 6.592268787391076e-06, + "logits/chosen": -12488167.0, + "logits/rejected": -25540416.0, + "logps/chosen": -308.2163391113281, + "logps/rejected": -400.8643798828125, + "loss": 0.0133, + "rewards/chosen": 11.308248519897461, + "rewards/margins": 25.812393188476562, + "rewards/rejected": -14.504144668579102, + "step": 419 + }, + { + "epoch": 0.1534947464595706, + "grad_norm": 4.119979381561279, + "kl": 8.486355781555176, + "learning_rate": 6.43406479383053e-06, + "logits/chosen": -21335499.42857143, + "logits/rejected": -40071217.777777776, + "logps/chosen": -377.98660714285717, + "logps/rejected": -421.1868489583333, + "loss": 0.0158, + "rewards/chosen": 10.400750296456474, + "rewards/margins": 26.636881752619665, + "rewards/rejected": -16.236131456163193, + "step": 420 + }, + { + "epoch": 0.1538602101416172, + "grad_norm": 0.5004268288612366, + "kl": 12.081637382507324, + "learning_rate": 6.277651625230219e-06, + "logits/chosen": -17302134.85714286, + "logits/rejected": -20977779.555555556, + "logps/chosen": -388.91099330357144, + "logps/rejected": -525.6024848090278, + "loss": 0.0007, + "rewards/chosen": 13.48549325125558, + "rewards/margins": 32.43476116846478, + "rewards/rejected": -18.949267917209202, + "step": 421 + }, + { + "epoch": 0.15422567382366378, + "grad_norm": 1.5807373523712158, + "kl": 3.601625442504883, + "learning_rate": 6.12303571112286e-06, + "logits/chosen": -21944930.0, + "logits/rejected": -3096052.0, + "logps/chosen": -394.995849609375, + "logps/rejected": -614.4163208007812, + "loss": 0.0029, + "rewards/chosen": 10.758222579956055, + "rewards/margins": 33.25175476074219, + "rewards/rejected": -22.493532180786133, + "step": 422 + }, + { + "epoch": 0.15459113750571038, + "grad_norm": 3.571805238723755, + "kl": 0.9084372520446777, + "learning_rate": 5.9702234071631e-06, + "logits/chosen": -19101278.666666668, + "logits/rejected": -16278646.4, + "logps/chosen": -366.7606201171875, + "logps/rejected": -393.5871337890625, + "loss": 0.009, + "rewards/chosen": 10.781204223632812, + "rewards/margins": 28.953993225097655, + "rewards/rejected": -18.172789001464842, + "step": 423 + }, + { + "epoch": 0.15495660118775698, + "grad_norm": 1.2493962049484253, + "kl": 12.248997688293457, + "learning_rate": 5.819220994866237e-06, + "logits/chosen": -14666216.421052631, + "logits/rejected": -10508819.692307692, + "logps/chosen": -337.61636513157896, + "logps/rejected": -325.08653846153845, + "loss": 0.0081, + "rewards/chosen": 11.996806897615132, + "rewards/margins": 25.43762861861874, + "rewards/rejected": -13.440821721003605, + "step": 424 + }, + { + "epoch": 0.15532206486980357, + "grad_norm": 8.128250122070312, + "kl": 7.635442733764648, + "learning_rate": 5.670034681349995e-06, + "logits/chosen": -5133392.0, + "logits/rejected": -15896204.444444444, + "logps/chosen": -312.2733677455357, + "logps/rejected": -554.7775065104166, + "loss": 0.0422, + "rewards/chosen": 11.78571537562779, + "rewards/margins": 32.79290396069723, + "rewards/rejected": -21.007188585069443, + "step": 425 + }, + { + "epoch": 0.15568752855185017, + "grad_norm": 2.2715001106262207, + "kl": 9.255424499511719, + "learning_rate": 5.5226705990794155e-06, + "logits/chosen": 3280712.888888889, + "logits/rejected": -22880834.285714287, + "logps/chosen": -280.72943793402777, + "logps/rejected": -448.06644112723217, + "loss": 0.0158, + "rewards/chosen": 9.40623050265842, + "rewards/margins": 27.18710012284536, + "rewards/rejected": -17.78086962018694, + "step": 426 + }, + { + "epoch": 0.15605299223389676, + "grad_norm": 2.222080945968628, + "kl": 9.617365837097168, + "learning_rate": 5.377134805614714e-06, + "logits/chosen": 5196017.0, + "logits/rejected": -17370620.0, + "logps/chosen": -344.3332824707031, + "logps/rejected": -398.67108154296875, + "loss": 0.0093, + "rewards/chosen": 10.131664276123047, + "rewards/margins": 26.408960342407227, + "rewards/rejected": -16.27729606628418, + "step": 427 + }, + { + "epoch": 0.15641845591594336, + "grad_norm": 4.735278129577637, + "kl": 20.595947265625, + "learning_rate": 5.233433283362349e-06, + "logits/chosen": -9070970.94736842, + "logits/rejected": -23176440.615384616, + "logps/chosen": -331.1449424342105, + "logps/rejected": -415.65966796875, + "loss": 0.0253, + "rewards/chosen": 11.629506161338405, + "rewards/margins": 28.43736866321641, + "rewards/rejected": -16.807862501878006, + "step": 428 + }, + { + "epoch": 0.15678391959798996, + "grad_norm": 5.029958248138428, + "kl": 10.042930603027344, + "learning_rate": 5.091571939329048e-06, + "logits/chosen": -21280144.94117647, + "logits/rejected": -8040764.266666667, + "logps/chosen": -380.55945542279414, + "logps/rejected": -419.2379557291667, + "loss": 0.0065, + "rewards/chosen": 12.805199118221507, + "rewards/margins": 27.627571554744947, + "rewards/rejected": -14.822372436523438, + "step": 429 + }, + { + "epoch": 0.15714938328003655, + "grad_norm": 2.6428425312042236, + "kl": 4.413733005523682, + "learning_rate": 4.951556604879048e-06, + "logits/chosen": -15455672.0, + "logits/rejected": -22217891.2, + "logps/chosen": -376.5189208984375, + "logps/rejected": -719.201318359375, + "loss": 0.0054, + "rewards/chosen": 10.589691162109375, + "rewards/margins": 41.24964904785156, + "rewards/rejected": -30.659957885742188, + "step": 430 + }, + { + "epoch": 0.15751484696208315, + "grad_norm": 6.668398857116699, + "kl": 12.125503540039062, + "learning_rate": 4.813393035494329e-06, + "logits/chosen": -23639686.736842107, + "logits/rejected": -8327404.307692308, + "logps/chosen": -355.2806846217105, + "logps/rejected": -268.4061936598558, + "loss": 0.0506, + "rewards/chosen": 10.754629837839227, + "rewards/margins": 22.43894779344319, + "rewards/rejected": -11.684317955603966, + "step": 431 + }, + { + "epoch": 0.15788031064412975, + "grad_norm": 9.94383716583252, + "kl": 7.063098430633545, + "learning_rate": 4.677086910538092e-06, + "logits/chosen": -15368172.8, + "logits/rejected": -24191817.411764707, + "logps/chosen": -368.21412760416666, + "logps/rejected": -448.17532169117646, + "loss": 0.038, + "rewards/chosen": 10.19063720703125, + "rewards/margins": 27.027727194393385, + "rewards/rejected": -16.837089987362134, + "step": 432 + }, + { + "epoch": 0.15824577432617634, + "grad_norm": 6.477745532989502, + "kl": 9.224189758300781, + "learning_rate": 4.542643833021254e-06, + "logits/chosen": -11979980.8, + "logits/rejected": -28487032.470588237, + "logps/chosen": -235.78707682291667, + "logps/rejected": -466.2767118566176, + "loss": 0.0306, + "rewards/chosen": 8.808645629882813, + "rewards/margins": 26.163985128963695, + "rewards/rejected": -17.355339499080884, + "step": 433 + }, + { + "epoch": 0.15861123800822294, + "grad_norm": 2.8732669353485107, + "kl": 9.912464141845703, + "learning_rate": 4.410069329372152e-06, + "logits/chosen": -9873192.533333333, + "logits/rejected": -21160124.23529412, + "logps/chosen": -300.35882161458335, + "logps/rejected": -586.7696461397059, + "loss": 0.0131, + "rewards/chosen": 10.231475830078125, + "rewards/margins": 27.539403578814337, + "rewards/rejected": -17.307927748736212, + "step": 434 + }, + { + "epoch": 0.15897670169026953, + "grad_norm": 2.760800361633301, + "kl": 11.60853099822998, + "learning_rate": 4.279368849209381e-06, + "logits/chosen": -10689651.368421054, + "logits/rejected": -17628550.153846152, + "logps/chosen": -360.66085012335526, + "logps/rejected": -481.80337289663464, + "loss": 0.0036, + "rewards/chosen": 12.94503462942023, + "rewards/margins": 31.39527689204042, + "rewards/rejected": -18.450242262620193, + "step": 435 + }, + { + "epoch": 0.15934216537231613, + "grad_norm": 2.4744064807891846, + "kl": 5.831138610839844, + "learning_rate": 4.150547765117746e-06, + "logits/chosen": -16001760.0, + "logits/rejected": -8217619.555555556, + "logps/chosen": -303.439208984375, + "logps/rejected": -609.8605143229166, + "loss": 0.011, + "rewards/chosen": 12.055870056152344, + "rewards/margins": 32.38429684109158, + "rewards/rejected": -20.328426784939236, + "step": 436 + }, + { + "epoch": 0.15970762905436273, + "grad_norm": 4.599251747131348, + "kl": 8.393531799316406, + "learning_rate": 4.023611372427471e-06, + "logits/chosen": -16657670.0, + "logits/rejected": -20244786.0, + "logps/chosen": -379.3275451660156, + "logps/rejected": -414.9568176269531, + "loss": 0.0195, + "rewards/chosen": 10.768531799316406, + "rewards/margins": 26.098987579345703, + "rewards/rejected": -15.330455780029297, + "step": 437 + }, + { + "epoch": 0.16007309273640932, + "grad_norm": 4.286300182342529, + "kl": 4.385191917419434, + "learning_rate": 3.898564888996476e-06, + "logits/chosen": -23601801.846153848, + "logits/rejected": -31119413.89473684, + "logps/chosen": -291.6387469951923, + "logps/rejected": -438.96895559210526, + "loss": 0.0123, + "rewards/chosen": 9.170396658090445, + "rewards/margins": 27.52339348426232, + "rewards/rejected": -18.352996826171875, + "step": 438 + }, + { + "epoch": 0.16043855641845592, + "grad_norm": 2.5487780570983887, + "kl": 5.990370750427246, + "learning_rate": 3.7754134549959297e-06, + "logits/chosen": -27669673.6, + "logits/rejected": -32257399.272727273, + "logps/chosen": -391.1444580078125, + "logps/rejected": -474.4271129261364, + "loss": 0.034, + "rewards/chosen": 13.463796997070313, + "rewards/margins": 25.966402088512076, + "rewards/rejected": -12.502605091441762, + "step": 439 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 16.61724281311035, + "kl": 11.28474235534668, + "learning_rate": 3.654162132698918e-06, + "logits/chosen": -21012483.76470588, + "logits/rejected": -16002807.466666667, + "logps/chosen": -400.98796530330884, + "logps/rejected": -377.8818684895833, + "loss": 0.0184, + "rewards/chosen": 13.141266766716452, + "rewards/margins": 29.15266813390395, + "rewards/rejected": -16.0114013671875, + "step": 440 + }, + { + "epoch": 0.1611694837825491, + "grad_norm": 3.457298994064331, + "kl": 4.654218673706055, + "learning_rate": 3.534815906272404e-06, + "logits/chosen": -20219384.0, + "logits/rejected": -22506824.727272727, + "logps/chosen": -481.352392578125, + "logps/rejected": -488.08056640625, + "loss": 0.0069, + "rewards/chosen": 14.993611145019532, + "rewards/margins": 37.497987643155184, + "rewards/rejected": -22.504376498135652, + "step": 441 + }, + { + "epoch": 0.1615349474645957, + "grad_norm": 2.888286828994751, + "kl": 12.707405090332031, + "learning_rate": 3.417379681572297e-06, + "logits/chosen": -11562834.52631579, + "logits/rejected": -21493016.615384616, + "logps/chosen": -319.17251747532896, + "logps/rejected": -503.45763221153845, + "loss": 0.0156, + "rewards/chosen": 10.551447818153783, + "rewards/margins": 30.981600124343686, + "rewards/rejected": -20.430152306189903, + "step": 442 + }, + { + "epoch": 0.1619004111466423, + "grad_norm": 0.8166823983192444, + "kl": 5.3129730224609375, + "learning_rate": 3.3018582859418446e-06, + "logits/chosen": -15714855.272727273, + "logits/rejected": -24346642.285714287, + "logps/chosen": -374.6168767755682, + "logps/rejected": -476.68178013392856, + "loss": 0.0072, + "rewards/chosen": 9.757564891468395, + "rewards/margins": 25.32948755701899, + "rewards/rejected": -15.571922665550595, + "step": 443 + }, + { + "epoch": 0.1622658748286889, + "grad_norm": 8.551015853881836, + "kl": 8.53953742980957, + "learning_rate": 3.18825646801314e-06, + "logits/chosen": -17968157.866666667, + "logits/rejected": 14188054.588235294, + "logps/chosen": -404.3224283854167, + "logps/rejected": -520.2399471507352, + "loss": 0.0107, + "rewards/chosen": 14.040669759114584, + "rewards/margins": 35.7948273303462, + "rewards/rejected": -21.754157571231616, + "step": 444 + }, + { + "epoch": 0.1626313385107355, + "grad_norm": 4.408789157867432, + "kl": 10.378676414489746, + "learning_rate": 3.076578897511978e-06, + "logits/chosen": -15063312.94117647, + "logits/rejected": -7432438.4, + "logps/chosen": -327.6275850183824, + "logps/rejected": -391.27955729166666, + "loss": 0.0101, + "rewards/chosen": 10.016830444335938, + "rewards/margins": 25.488014729817706, + "rewards/rejected": -15.47118428548177, + "step": 445 + }, + { + "epoch": 0.1629968021927821, + "grad_norm": 4.022834777832031, + "kl": 9.119674682617188, + "learning_rate": 2.966830165065876e-06, + "logits/chosen": -15041556.705882354, + "logits/rejected": -28819428.266666666, + "logps/chosen": -324.4514590992647, + "logps/rejected": -502.3069661458333, + "loss": 0.01, + "rewards/chosen": 12.984105727251839, + "rewards/margins": 35.77514169730392, + "rewards/rejected": -22.791035970052082, + "step": 446 + }, + { + "epoch": 0.1633622658748287, + "grad_norm": 2.4674453735351562, + "kl": 5.848677635192871, + "learning_rate": 2.8590147820153513e-06, + "logits/chosen": -10032599.111111112, + "logits/rejected": -5404501.714285715, + "logps/chosen": -254.16986762152777, + "logps/rejected": -504.61000279017856, + "loss": 0.0068, + "rewards/chosen": 8.41563245985243, + "rewards/margins": 25.688304840572293, + "rewards/rejected": -17.272672380719865, + "step": 447 + }, + { + "epoch": 0.16372772955687528, + "grad_norm": 5.428394317626953, + "kl": 14.211067199707031, + "learning_rate": 2.753137180228543e-06, + "logits/chosen": -20134986.666666668, + "logits/rejected": -23868205.714285713, + "logps/chosen": -390.21001519097223, + "logps/rejected": -567.7820870535714, + "loss": 0.0174, + "rewards/chosen": 13.056165907118055, + "rewards/margins": 35.81026616172185, + "rewards/rejected": -22.754100254603795, + "step": 448 + }, + { + "epoch": 0.16409319323892188, + "grad_norm": 3.0148675441741943, + "kl": 9.013250350952148, + "learning_rate": 2.6492017119189417e-06, + "logits/chosen": -19723339.29411765, + "logits/rejected": -23073634.133333333, + "logps/chosen": -320.5142176011029, + "logps/rejected": -382.3089192708333, + "loss": 0.004, + "rewards/chosen": 10.78306489832261, + "rewards/margins": 26.367939369351255, + "rewards/rejected": -15.584874471028646, + "step": 449 + }, + { + "epoch": 0.16445865692096848, + "grad_norm": 0.6360757350921631, + "kl": 7.605924129486084, + "learning_rate": 2.547212649466568e-06, + "logits/chosen": -21655616.0, + "logits/rejected": -17167992.888888888, + "logps/chosen": -401.30653599330356, + "logps/rejected": -440.38506401909723, + "loss": 0.0123, + "rewards/chosen": 11.876908438546318, + "rewards/margins": 27.86839064340743, + "rewards/rejected": -15.99148220486111, + "step": 450 + }, + { + "epoch": 0.16482412060301507, + "grad_norm": 3.2881557941436768, + "kl": 14.220067977905273, + "learning_rate": 2.4471741852423237e-06, + "logits/chosen": -12561598.0, + "logits/rejected": -20079520.0, + "logps/chosen": -371.8477478027344, + "logps/rejected": -483.8169860839844, + "loss": 0.0203, + "rewards/chosen": 12.1270112991333, + "rewards/margins": 30.894883155822754, + "rewards/rejected": -18.767871856689453, + "step": 451 + }, + { + "epoch": 0.16518958428506167, + "grad_norm": 3.586458921432495, + "kl": 7.0165228843688965, + "learning_rate": 2.349090431435641e-06, + "logits/chosen": -27070567.111111112, + "logits/rejected": -25088331.42857143, + "logps/chosen": -323.81732855902777, + "logps/rejected": -513.6007952008929, + "loss": 0.0086, + "rewards/chosen": 10.81132337782118, + "rewards/margins": 28.4467048039512, + "rewards/rejected": -17.63538142613002, + "step": 452 + }, + { + "epoch": 0.16555504796710827, + "grad_norm": 2.260195016860962, + "kl": 4.048793792724609, + "learning_rate": 2.2529654198854835e-06, + "logits/chosen": -10841934.857142856, + "logits/rejected": -13254800.0, + "logps/chosen": -346.89780970982144, + "logps/rejected": -309.6354709201389, + "loss": 0.0063, + "rewards/chosen": 11.380504063197545, + "rewards/margins": 25.66125221857949, + "rewards/rejected": -14.280748155381945, + "step": 453 + }, + { + "epoch": 0.16592051164915486, + "grad_norm": 5.410141468048096, + "kl": 4.34533166885376, + "learning_rate": 2.1588031019145636e-06, + "logits/chosen": -22553651.2, + "logits/rejected": -17623986.90909091, + "logps/chosen": -288.311865234375, + "logps/rejected": -469.3187144886364, + "loss": 0.0037, + "rewards/chosen": 12.686544036865234, + "rewards/margins": 29.263437236439096, + "rewards/rejected": -16.576893199573863, + "step": 454 + }, + { + "epoch": 0.16628597533120146, + "grad_norm": 4.688035488128662, + "kl": 14.214731216430664, + "learning_rate": 2.066607348166971e-06, + "logits/chosen": -20386053.05263158, + "logits/rejected": -30553351.384615384, + "logps/chosen": -293.4904142680921, + "logps/rejected": -538.5098407451923, + "loss": 0.0141, + "rewards/chosen": 12.120495444849917, + "rewards/margins": 33.90094025028862, + "rewards/rejected": -21.7804448054387, + "step": 455 + }, + { + "epoch": 0.16665143901324805, + "grad_norm": 1.9798458814620972, + "kl": 2.4126996994018555, + "learning_rate": 1.9763819484490355e-06, + "logits/chosen": -8454466.4, + "logits/rejected": -25837099.636363637, + "logps/chosen": -296.7321533203125, + "logps/rejected": -524.9772283380681, + "loss": 0.0027, + "rewards/chosen": 9.412641143798828, + "rewards/margins": 28.30869688554244, + "rewards/rejected": -18.89605574174361, + "step": 456 + }, + { + "epoch": 0.16701690269529465, + "grad_norm": 2.8953404426574707, + "kl": 4.921684265136719, + "learning_rate": 1.888130611573563e-06, + "logits/chosen": -20163811.692307692, + "logits/rejected": -23792535.57894737, + "logps/chosen": -284.1425030048077, + "logps/rejected": -463.0054481907895, + "loss": 0.0065, + "rewards/chosen": 9.146165114182692, + "rewards/margins": 29.556708455568383, + "rewards/rejected": -20.41054334138569, + "step": 457 + }, + { + "epoch": 0.16738236637734125, + "grad_norm": 3.254206657409668, + "kl": 18.50867462158203, + "learning_rate": 1.8018569652073381e-06, + "logits/chosen": -22341660.8, + "logits/rejected": -3755183.3333333335, + "logps/chosen": -433.17724609375, + "logps/rejected": -258.1299235026042, + "loss": 0.0263, + "rewards/chosen": 12.941940307617188, + "rewards/margins": 23.54852294921875, + "rewards/rejected": -10.606582641601562, + "step": 458 + }, + { + "epoch": 0.16774783005938784, + "grad_norm": 8.866241455078125, + "kl": 8.923077583312988, + "learning_rate": 1.7175645557220566e-06, + "logits/chosen": -9742920.0, + "logits/rejected": -17124748.0, + "logps/chosen": -351.39398193359375, + "logps/rejected": -471.82354736328125, + "loss": 0.0144, + "rewards/chosen": 10.004897117614746, + "rewards/margins": 27.321707725524902, + "rewards/rejected": -17.316810607910156, + "step": 459 + }, + { + "epoch": 0.16811329374143444, + "grad_norm": 5.046156883239746, + "kl": 4.031088829040527, + "learning_rate": 1.6352568480485276e-06, + "logits/chosen": -23608648.0, + "logits/rejected": -24768715.2, + "logps/chosen": -443.3495686848958, + "logps/rejected": -464.429052734375, + "loss": 0.0069, + "rewards/chosen": 13.324520111083984, + "rewards/margins": 30.990674591064455, + "rewards/rejected": -17.66615447998047, + "step": 460 + }, + { + "epoch": 0.16847875742348103, + "grad_norm": 2.8274333477020264, + "kl": 17.224836349487305, + "learning_rate": 1.5549372255342366e-06, + "logits/chosen": -5637647.2, + "logits/rejected": -18935974.666666668, + "logps/chosen": -390.06513671875, + "logps/rejected": -405.3094482421875, + "loss": 0.0157, + "rewards/chosen": 11.449068450927735, + "rewards/margins": 29.44400634765625, + "rewards/rejected": -17.994937896728516, + "step": 461 + }, + { + "epoch": 0.16884422110552763, + "grad_norm": 4.146451473236084, + "kl": 7.169589996337891, + "learning_rate": 1.4766089898042678e-06, + "logits/chosen": -15471779.0, + "logits/rejected": -24703278.0, + "logps/chosen": -312.13665771484375, + "logps/rejected": -547.6018676757812, + "loss": 0.013, + "rewards/chosen": 9.415575981140137, + "rewards/margins": 31.896620750427246, + "rewards/rejected": -22.48104476928711, + "step": 462 + }, + { + "epoch": 0.16920968478757423, + "grad_norm": 3.1862587928771973, + "kl": 8.787742614746094, + "learning_rate": 1.400275360625608e-06, + "logits/chosen": -11017497.777777778, + "logits/rejected": -20924608.0, + "logps/chosen": -313.27745225694446, + "logps/rejected": -408.04007393973217, + "loss": 0.0074, + "rewards/chosen": 9.87420654296875, + "rewards/margins": 28.075029645647323, + "rewards/rejected": -18.200823102678573, + "step": 463 + }, + { + "epoch": 0.16957514846962082, + "grad_norm": 0.5183057188987732, + "kl": 5.149826526641846, + "learning_rate": 1.325939475774768e-06, + "logits/chosen": -20062652.0, + "logits/rejected": -17662944.0, + "logps/chosen": -365.0354309082031, + "logps/rejected": -521.9868774414062, + "loss": 0.0065, + "rewards/chosen": 12.215868949890137, + "rewards/margins": 32.11898899078369, + "rewards/rejected": -19.903120040893555, + "step": 464 + }, + { + "epoch": 0.16994061215166742, + "grad_norm": 3.8341994285583496, + "kl": 4.903093338012695, + "learning_rate": 1.2536043909088191e-06, + "logits/chosen": -22292958.0, + "logits/rejected": -16977694.666666668, + "logps/chosen": -361.6695861816406, + "logps/rejected": -416.9369303385417, + "loss": 0.0057, + "rewards/chosen": 11.536267280578613, + "rewards/margins": 27.647740364074707, + "rewards/rejected": -16.111473083496094, + "step": 465 + }, + { + "epoch": 0.17030607583371402, + "grad_norm": 3.6549277305603027, + "kl": 3.0719661712646484, + "learning_rate": 1.183273079439795e-06, + "logits/chosen": 3277206.4, + "logits/rejected": -22675217.454545453, + "logps/chosen": -239.0556396484375, + "logps/rejected": -443.1888316761364, + "loss": 0.0093, + "rewards/chosen": 9.901571655273438, + "rewards/margins": 25.439639559659092, + "rewards/rejected": -15.538067904385654, + "step": 466 + }, + { + "epoch": 0.1706715395157606, + "grad_norm": 2.5459866523742676, + "kl": 5.1879801750183105, + "learning_rate": 1.1149484324124327e-06, + "logits/chosen": 3489919.466666667, + "logits/rejected": -15425745.88235294, + "logps/chosen": -320.90667317708335, + "logps/rejected": -521.5926011029412, + "loss": 0.005, + "rewards/chosen": 8.778363037109376, + "rewards/margins": 29.747816736557905, + "rewards/rejected": -20.96945369944853, + "step": 467 + }, + { + "epoch": 0.1710370031978072, + "grad_norm": 2.968933343887329, + "kl": 12.588403701782227, + "learning_rate": 1.0486332583853563e-06, + "logits/chosen": -1008199.8888888889, + "logits/rejected": -18537979.42857143, + "logps/chosen": -377.12152777777777, + "logps/rejected": -465.94834681919644, + "loss": 0.0203, + "rewards/chosen": 12.618971082899305, + "rewards/margins": 30.498735094827317, + "rewards/rejected": -17.879764011928014, + "step": 468 + }, + { + "epoch": 0.1714024668798538, + "grad_norm": 1.5093332529067993, + "kl": 7.728363037109375, + "learning_rate": 9.843302833156376e-07, + "logits/chosen": -17004349.714285713, + "logits/rejected": -24741642.666666668, + "logps/chosen": -469.29697963169644, + "logps/rejected": -506.8410915798611, + "loss": 0.0067, + "rewards/chosen": 11.747360229492188, + "rewards/margins": 32.10038757324219, + "rewards/rejected": -20.35302734375, + "step": 469 + }, + { + "epoch": 0.1717679305619004, + "grad_norm": 1.5929416418075562, + "kl": 6.162945747375488, + "learning_rate": 9.220421504467281e-07, + "logits/chosen": -8297201.333333333, + "logits/rejected": -6470800.8, + "logps/chosen": -411.8868815104167, + "logps/rejected": -551.1888671875, + "loss": 0.0072, + "rewards/chosen": 11.85494613647461, + "rewards/margins": 33.65380630493164, + "rewards/rejected": -21.798860168457033, + "step": 470 + }, + { + "epoch": 0.172133394243947, + "grad_norm": 2.8989174365997314, + "kl": 10.956710815429688, + "learning_rate": 8.617714201998084e-07, + "logits/chosen": -20825719.111111112, + "logits/rejected": -25976987.42857143, + "logps/chosen": -310.8628200954861, + "logps/rejected": -488.12744140625, + "loss": 0.013, + "rewards/chosen": 11.204368591308594, + "rewards/margins": 32.89811815534319, + "rewards/rejected": -21.693749564034597, + "step": 471 + }, + { + "epoch": 0.1724988579259936, + "grad_norm": 1.9638949632644653, + "kl": 2.666713237762451, + "learning_rate": 8.035205700685167e-07, + "logits/chosen": -21657440.0, + "logits/rejected": -24487132.0, + "logps/chosen": -315.4588623046875, + "logps/rejected": -509.10479736328125, + "loss": 0.0013, + "rewards/chosen": 11.063799858093262, + "rewards/margins": 33.06114673614502, + "rewards/rejected": -21.997346878051758, + "step": 472 + }, + { + "epoch": 0.1728643216080402, + "grad_norm": 0.3028445839881897, + "kl": 0.6589040756225586, + "learning_rate": 7.472919945171631e-07, + "logits/chosen": -8883835.636363637, + "logits/rejected": 2916624.380952381, + "logps/chosen": -344.75779030539775, + "logps/rejected": -528.5618954613095, + "loss": 0.0006, + "rewards/chosen": 9.552167025479404, + "rewards/margins": 30.197308627041906, + "rewards/rejected": -20.6451416015625, + "step": 473 + }, + { + "epoch": 0.17322978529008679, + "grad_norm": 2.493788242340088, + "kl": 4.94297981262207, + "learning_rate": 6.93088004882253e-07, + "logits/chosen": -3673888.0, + "logits/rejected": -24019832.888888888, + "logps/chosen": -330.12803431919644, + "logps/rejected": -429.9820963541667, + "loss": 0.0066, + "rewards/chosen": 12.02521732875279, + "rewards/margins": 29.39323558504619, + "rewards/rejected": -17.368018256293404, + "step": 474 + }, + { + "epoch": 0.17359524897213338, + "grad_norm": 2.6338140964508057, + "kl": 9.93124771118164, + "learning_rate": 6.409108292774913e-07, + "logits/chosen": -18640262.736842107, + "logits/rejected": 53104221.538461536, + "logps/chosen": -346.21849300986844, + "logps/rejected": -500.64855018028845, + "loss": 0.0183, + "rewards/chosen": 9.224417435495477, + "rewards/margins": 30.428021326721435, + "rewards/rejected": -21.20360389122596, + "step": 475 + }, + { + "epoch": 0.17396071265417998, + "grad_norm": 2.909181594848633, + "kl": 9.125072479248047, + "learning_rate": 5.907626125022159e-07, + "logits/chosen": -14157770.666666666, + "logits/rejected": -9903623.529411765, + "logps/chosen": -363.1652018229167, + "logps/rejected": -486.26740579044116, + "loss": 0.0073, + "rewards/chosen": 10.916317749023438, + "rewards/margins": 29.896545948701746, + "rewards/rejected": -18.980228199678308, + "step": 476 + }, + { + "epoch": 0.17432617633622657, + "grad_norm": 4.639556407928467, + "kl": 10.329207420349121, + "learning_rate": 5.426454159531913e-07, + "logits/chosen": -18176832.0, + "logits/rejected": -6609936.0, + "logps/chosen": -368.0164005055147, + "logps/rejected": -496.54137369791664, + "loss": 0.0153, + "rewards/chosen": 10.700091193704043, + "rewards/margins": 28.425209195006126, + "rewards/rejected": -17.725118001302082, + "step": 477 + }, + { + "epoch": 0.1746916400182732, + "grad_norm": 4.749722003936768, + "kl": 5.273879051208496, + "learning_rate": 4.965612175399092e-07, + "logits/chosen": -8928793.23076923, + "logits/rejected": -21867589.05263158, + "logps/chosen": -277.80774864783655, + "logps/rejected": -617.8948396381579, + "loss": 0.0105, + "rewards/chosen": 9.456420311560997, + "rewards/margins": 36.08678337436939, + "rewards/rejected": -26.63036306280839, + "step": 478 + }, + { + "epoch": 0.1750571037003198, + "grad_norm": 2.6643779277801514, + "kl": 7.412860870361328, + "learning_rate": 4.52511911603265e-07, + "logits/chosen": -10700281.6, + "logits/rejected": -32832368.94117647, + "logps/chosen": -344.36940104166666, + "logps/rejected": -596.8102022058823, + "loss": 0.0034, + "rewards/chosen": 11.234705607096354, + "rewards/margins": 31.734041400984218, + "rewards/rejected": -20.499335793887866, + "step": 479 + }, + { + "epoch": 0.1754225673823664, + "grad_norm": 1.5384433269500732, + "kl": 9.162883758544922, + "learning_rate": 4.104993088376974e-07, + "logits/chosen": -6981346.4, + "logits/rejected": -15258368.0, + "logps/chosen": -369.62978515625, + "logps/rejected": -370.474853515625, + "loss": 0.0266, + "rewards/chosen": 11.282172393798827, + "rewards/margins": 25.670399220784503, + "rewards/rejected": -14.388226826985678, + "step": 480 + }, + { + "epoch": 0.175788031064413, + "grad_norm": 1.8148412704467773, + "kl": 7.009784698486328, + "learning_rate": 3.7052513621674833e-07, + "logits/chosen": -14011508.363636363, + "logits/rejected": 361841.8, + "logps/chosen": -289.41659268465907, + "logps/rejected": -573.371728515625, + "loss": 0.0025, + "rewards/chosen": 11.16969021883878, + "rewards/margins": 38.56281155672941, + "rewards/rejected": -27.393121337890626, + "step": 481 + }, + { + "epoch": 0.17615349474645958, + "grad_norm": 1.9310189485549927, + "kl": 8.411112785339355, + "learning_rate": 3.3259103692209747e-07, + "logits/chosen": -16072949.333333334, + "logits/rejected": -18589086.11764706, + "logps/chosen": -373.74244791666666, + "logps/rejected": -525.0498621323529, + "loss": 0.0109, + "rewards/chosen": 13.651190185546875, + "rewards/margins": 36.5129545323989, + "rewards/rejected": -22.861764346852024, + "step": 482 + }, + { + "epoch": 0.17651895842850618, + "grad_norm": 2.8287031650543213, + "kl": 7.927334308624268, + "learning_rate": 2.966985702759828e-07, + "logits/chosen": -14653672.0, + "logits/rejected": -43709133.333333336, + "logps/chosen": -264.4124755859375, + "logps/rejected": -564.6012369791666, + "loss": 0.0112, + "rewards/chosen": 9.58668212890625, + "rewards/margins": 31.40618082682292, + "rewards/rejected": -21.819498697916668, + "step": 483 + }, + { + "epoch": 0.17688442211055277, + "grad_norm": 3.5193405151367188, + "kl": 8.235217094421387, + "learning_rate": 2.6284921167712973e-07, + "logits/chosen": 1018994.75, + "logits/rejected": 8535198.0, + "logps/chosen": -328.6596374511719, + "logps/rejected": -531.7152099609375, + "loss": 0.009, + "rewards/chosen": 10.774845123291016, + "rewards/margins": 36.856367111206055, + "rewards/rejected": -26.08152198791504, + "step": 484 + }, + { + "epoch": 0.17724988579259937, + "grad_norm": 4.018964767456055, + "kl": 6.8196306228637695, + "learning_rate": 2.310443525400885e-07, + "logits/chosen": -5921261.5, + "logits/rejected": -19844862.0, + "logps/chosen": -295.30987548828125, + "logps/rejected": -503.4870300292969, + "loss": 0.01, + "rewards/chosen": 8.991737365722656, + "rewards/margins": 29.11577033996582, + "rewards/rejected": -20.124032974243164, + "step": 485 + }, + { + "epoch": 0.17761534947464597, + "grad_norm": 1.37460196018219, + "kl": 7.840858459472656, + "learning_rate": 2.012853002380466e-07, + "logits/chosen": -24336974.0, + "logits/rejected": -20589992.0, + "logps/chosen": -397.7123718261719, + "logps/rejected": -509.329833984375, + "loss": 0.0062, + "rewards/chosen": 12.916855812072754, + "rewards/margins": 32.98902606964111, + "rewards/rejected": -20.07217025756836, + "step": 486 + }, + { + "epoch": 0.17798081315669256, + "grad_norm": 4.123758316040039, + "kl": 11.539236068725586, + "learning_rate": 1.735732780490884e-07, + "logits/chosen": 4270496.888888889, + "logits/rejected": -20655766.85714286, + "logps/chosen": -331.24262152777777, + "logps/rejected": -422.5819614955357, + "loss": 0.0079, + "rewards/chosen": 11.95119137234158, + "rewards/margins": 30.818084595695375, + "rewards/rejected": -18.866893223353795, + "step": 487 + }, + { + "epoch": 0.17834627683873916, + "grad_norm": 4.753218173980713, + "kl": 7.45941162109375, + "learning_rate": 1.4790942510590766e-07, + "logits/chosen": -2601407.0588235296, + "logits/rejected": 30009175.466666665, + "logps/chosen": -371.7783777573529, + "logps/rejected": -604.9147786458333, + "loss": 0.009, + "rewards/chosen": 9.890346751493567, + "rewards/margins": 33.77864224303002, + "rewards/rejected": -23.888295491536457, + "step": 488 + }, + { + "epoch": 0.17871174052078576, + "grad_norm": 7.9544291496276855, + "kl": 18.027603149414062, + "learning_rate": 1.2429479634897267e-07, + "logits/chosen": -13646257.777777778, + "logits/rejected": -25930912.0, + "logps/chosen": -402.1135525173611, + "logps/rejected": -524.4549037388393, + "loss": 0.0571, + "rewards/chosen": 11.21631113688151, + "rewards/margins": 30.627842857724147, + "rewards/rejected": -19.411531720842635, + "step": 489 + }, + { + "epoch": 0.17907720420283235, + "grad_norm": 1.6414546966552734, + "kl": 7.33444881439209, + "learning_rate": 1.0273036248318324e-07, + "logits/chosen": -11945475.2, + "logits/rejected": -21892050.82352941, + "logps/chosen": -307.16142578125, + "logps/rejected": -377.21363740808823, + "loss": 0.0105, + "rewards/chosen": 12.484385172526041, + "rewards/margins": 28.366560393688722, + "rewards/rejected": -15.882175221162683, + "step": 490 + }, + { + "epoch": 0.17944266788487895, + "grad_norm": 3.110675096511841, + "kl": 12.340682983398438, + "learning_rate": 8.321700993795811e-08, + "logits/chosen": -12828076.19047619, + "logits/rejected": -16727296.0, + "logps/chosen": -311.0126953125, + "logps/rejected": -421.34912109375, + "loss": 0.0252, + "rewards/chosen": 10.290027436755953, + "rewards/margins": 26.688527359074843, + "rewards/rejected": -16.39849992231889, + "step": 491 + }, + { + "epoch": 0.17980813156692554, + "grad_norm": 2.8156538009643555, + "kl": 15.150474548339844, + "learning_rate": 6.575554083078084e-08, + "logits/chosen": -15203136.0, + "logits/rejected": -39046663.384615384, + "logps/chosen": -378.41568153782896, + "logps/rejected": -522.4693509615385, + "loss": 0.026, + "rewards/chosen": 11.60993074115954, + "rewards/margins": 31.670972939927566, + "rewards/rejected": -20.06104219876803, + "step": 492 + }, + { + "epoch": 0.18017359524897214, + "grad_norm": 2.8355517387390137, + "kl": 11.84912395477295, + "learning_rate": 5.0346672934270534e-08, + "logits/chosen": -15409815.578947369, + "logits/rejected": -18598270.769230768, + "logps/chosen": -367.1744962993421, + "logps/rejected": -427.1051682692308, + "loss": 0.0102, + "rewards/chosen": 10.714483160721628, + "rewards/margins": 31.533452887284128, + "rewards/rejected": -20.8189697265625, + "step": 493 + }, + { + "epoch": 0.18053905893101874, + "grad_norm": 5.410240173339844, + "kl": 5.61060905456543, + "learning_rate": 3.699103964661665e-08, + "logits/chosen": -7293272.615384615, + "logits/rejected": 12952489.263157895, + "logps/chosen": -384.59555288461536, + "logps/rejected": -511.2163342927632, + "loss": 0.004, + "rewards/chosen": 11.954400869516226, + "rewards/margins": 31.494382109236618, + "rewards/rejected": -19.539981239720394, + "step": 494 + }, + { + "epoch": 0.18090452261306533, + "grad_norm": 3.9821200370788574, + "kl": 14.919679641723633, + "learning_rate": 2.568918996560532e-08, + "logits/chosen": -18706298.352941178, + "logits/rejected": -9232838.4, + "logps/chosen": -366.9410615808824, + "logps/rejected": -399.20081380208336, + "loss": 0.0174, + "rewards/chosen": 10.451574886546416, + "rewards/margins": 28.082658057119332, + "rewards/rejected": -17.631083170572918, + "step": 495 + }, + { + "epoch": 0.18126998629511193, + "grad_norm": 2.1854822635650635, + "kl": 9.119820594787598, + "learning_rate": 1.644158846600963e-08, + "logits/chosen": -17346803.80952381, + "logits/rejected": -25567930.181818184, + "logps/chosen": -360.5658482142857, + "logps/rejected": -446.74973366477275, + "loss": 0.0095, + "rewards/chosen": 12.458696637834821, + "rewards/margins": 33.447455071783686, + "rewards/rejected": -20.988758433948863, + "step": 496 + }, + { + "epoch": 0.18163544997715853, + "grad_norm": 2.4213058948516846, + "kl": 15.704545974731445, + "learning_rate": 9.248615280499361e-09, + "logits/chosen": -19568956.8, + "logits/rejected": -7878462.666666667, + "logps/chosen": -329.4513671875, + "logps/rejected": -413.5679117838542, + "loss": 0.0121, + "rewards/chosen": 12.537107849121094, + "rewards/margins": 30.53261922200521, + "rewards/rejected": -17.995511372884113, + "step": 497 + }, + { + "epoch": 0.18200091365920512, + "grad_norm": 2.7919280529022217, + "kl": 13.333806037902832, + "learning_rate": 4.110566084036816e-09, + "logits/chosen": -18479448.38095238, + "logits/rejected": -22103949.09090909, + "logps/chosen": -343.45814732142856, + "logps/rejected": -345.1758922230114, + "loss": 0.0052, + "rewards/chosen": 12.064200265066964, + "rewards/margins": 26.447875580230317, + "rewards/rejected": -14.383675315163352, + "step": 498 + }, + { + "epoch": 0.18236637734125172, + "grad_norm": 0.4460287392139435, + "kl": 1.2321033477783203, + "learning_rate": 1.0276520816976387e-09, + "logits/chosen": -18236149.333333332, + "logits/rejected": -15490749.217391305, + "logps/chosen": -503.7068684895833, + "logps/rejected": -622.257472826087, + "loss": 0.0002, + "rewards/chosen": 12.759617275661892, + "rewards/margins": 37.443540084765154, + "rewards/rejected": -24.68392280910326, + "step": 499 + }, + { + "epoch": 0.1827318410232983, + "grad_norm": 0.04296117648482323, + "kl": 5.907767295837402, + "learning_rate": 0.0, + "logits/chosen": -18141325.333333332, + "logits/rejected": 11516864.0, + "logps/chosen": -421.7276204427083, + "logps/rejected": -527.7302734375, + "loss": 0.0063, + "rewards/chosen": 13.598575592041016, + "rewards/margins": 32.69757766723633, + "rewards/rejected": -19.099002075195312, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}