{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 4.125313107067299, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.060312606394290924, "logits/rejected": 0.15203741192817688, "logps/chosen": -1.7157971858978271, "logps/rejected": -1.8896640539169312, "loss": 0.2582, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.7157971858978271, "rewards/margins": 0.17386700212955475, "rewards/rejected": -1.8896640539169312, "sft_loss": 1.4683139324188232, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 2.7467738156031922, "learning_rate": 1.7825311942959e-08, "logits/chosen": 0.011781789362430573, "logits/rejected": 0.13588806986808777, "logps/chosen": -1.8027633428573608, "logps/rejected": -1.8470537662506104, "loss": 0.2651, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8027633428573608, "rewards/margins": 0.044290412217378616, "rewards/rejected": -1.8470537662506104, "sft_loss": 1.5084987878799438, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 2.953214775219304, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.03865582123398781, "logits/rejected": 0.061098456382751465, "logps/chosen": -1.6350253820419312, "logps/rejected": -1.7651439905166626, "loss": 0.302, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6350253820419312, "rewards/margins": 0.13011865317821503, "rewards/rejected": -1.7651439905166626, "sft_loss": 1.5002126693725586, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 3.875440232060317, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.04312217980623245, "logits/rejected": 0.044587552547454834, "logps/chosen": -1.7249486446380615, "logps/rejected": -1.8060178756713867, "loss": 0.2933, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.7249486446380615, "rewards/margins": 0.08106913417577744, "rewards/rejected": -1.8060178756713867, "sft_loss": 1.500407338142395, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 3.683341007905961, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.07231198251247406, "logits/rejected": 0.015074786730110645, "logps/chosen": -1.8695526123046875, "logps/rejected": -1.7800153493881226, "loss": 0.322, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -1.8695526123046875, "rewards/margins": -0.08953739702701569, "rewards/rejected": -1.7800153493881226, "sft_loss": 1.5455690622329712, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 2.8411343337921076, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.0865975096821785, "logits/rejected": 0.009093428030610085, "logps/chosen": -1.9094641208648682, "logps/rejected": -1.8325374126434326, "loss": 0.264, "rewards/accuracies": 0.4375, "rewards/chosen": -1.9094641208648682, "rewards/margins": -0.07692664116621017, "rewards/rejected": -1.8325374126434326, "sft_loss": 1.6473287343978882, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 3.7871810202252782, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.04631795734167099, "logits/rejected": 0.11639624834060669, "logps/chosen": -1.8485195636749268, "logps/rejected": -1.9989744424819946, "loss": 0.2781, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8485195636749268, "rewards/margins": 0.15045490860939026, "rewards/rejected": -1.9989744424819946, "sft_loss": 1.5623446702957153, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 3.209277493286425, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.03658987209200859, "logits/rejected": 0.2132900059223175, "logps/chosen": -1.8844735622406006, "logps/rejected": -1.7460263967514038, "loss": 0.2934, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.8844735622406006, "rewards/margins": -0.1384473741054535, "rewards/rejected": -1.7460263967514038, "sft_loss": 1.5194506645202637, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 3.7608064055241694, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.019204024225473404, "logits/rejected": 0.21857735514640808, "logps/chosen": -1.8422836065292358, "logps/rejected": -1.8763787746429443, "loss": 0.285, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8422836065292358, "rewards/margins": 0.034095339477062225, "rewards/rejected": -1.8763787746429443, "sft_loss": 1.538379430770874, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 3.4310065887447876, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.05229135602712631, "logits/rejected": 0.09997323900461197, "logps/chosen": -1.908278226852417, "logps/rejected": -1.7853820323944092, "loss": 0.2806, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.908278226852417, "rewards/margins": -0.12289615720510483, "rewards/rejected": -1.7853820323944092, "sft_loss": 1.5872937440872192, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 3.1486928978797013, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.11598268896341324, "logits/rejected": 0.10807422548532486, "logps/chosen": -1.846299409866333, "logps/rejected": -1.8794721364974976, "loss": 0.2676, "rewards/accuracies": 0.5625, "rewards/chosen": -1.846299409866333, "rewards/margins": 0.03317270055413246, "rewards/rejected": -1.8794721364974976, "sft_loss": 1.5888155698776245, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 3.225575628852993, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.08910714834928513, "logits/rejected": 0.10314790904521942, "logps/chosen": -1.8055875301361084, "logps/rejected": -1.9120228290557861, "loss": 0.2595, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.8055875301361084, "rewards/margins": 0.1064353734254837, "rewards/rejected": -1.9120228290557861, "sft_loss": 1.5487940311431885, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 3.0501140875467243, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.02393939718604088, "logits/rejected": 0.128209188580513, "logps/chosen": -1.6527059078216553, "logps/rejected": -1.7865177392959595, "loss": 0.2825, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.6527059078216553, "rewards/margins": 0.13381178677082062, "rewards/rejected": -1.7865177392959595, "sft_loss": 1.482107400894165, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 4.556150358888877, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.06528159230947495, "logits/rejected": 0.09094108641147614, "logps/chosen": -1.7894785404205322, "logps/rejected": -1.8382999897003174, "loss": 0.2873, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.7894785404205322, "rewards/margins": 0.04882138594985008, "rewards/rejected": -1.8382999897003174, "sft_loss": 1.6435940265655518, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 2.8190971503848, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.0369485542178154, "logits/rejected": 0.15109845995903015, "logps/chosen": -1.8229849338531494, "logps/rejected": -2.091214179992676, "loss": 0.2536, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.8229849338531494, "rewards/margins": 0.268229216337204, "rewards/rejected": -2.091214179992676, "sft_loss": 1.584989309310913, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 2.640922353276211, "learning_rate": 1.42602495543672e-07, "logits/chosen": -0.008784117177128792, "logits/rejected": 0.0963423103094101, "logps/chosen": -1.772080421447754, "logps/rejected": -1.804764986038208, "loss": 0.2817, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.772080421447754, "rewards/margins": 0.0326843187212944, "rewards/rejected": -1.804764986038208, "sft_loss": 1.5509886741638184, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 2.71350748289859, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.1522648185491562, "logits/rejected": 0.09689263254404068, "logps/chosen": -1.8555856943130493, "logps/rejected": -2.046074867248535, "loss": 0.2782, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.8555856943130493, "rewards/margins": 0.1904892474412918, "rewards/rejected": -2.046074867248535, "sft_loss": 1.5202927589416504, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 2.422361599261421, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.09719739854335785, "logits/rejected": 0.05986147001385689, "logps/chosen": -1.8296709060668945, "logps/rejected": -1.8398548364639282, "loss": 0.2889, "rewards/accuracies": 0.46875, "rewards/chosen": -1.8296709060668945, "rewards/margins": 0.010183680802583694, "rewards/rejected": -1.8398548364639282, "sft_loss": 1.4819883108139038, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 2.418905862195348, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.07108329236507416, "logits/rejected": 0.0822470635175705, "logps/chosen": -1.9167404174804688, "logps/rejected": -1.9951963424682617, "loss": 0.2626, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.9167404174804688, "rewards/margins": 0.07845588028430939, "rewards/rejected": -1.9951963424682617, "sft_loss": 1.5648051500320435, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 2.3559369369237277, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.03189660981297493, "logits/rejected": 0.03379274904727936, "logps/chosen": -1.7909519672393799, "logps/rejected": -1.903272032737732, "loss": 0.263, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.7909519672393799, "rewards/margins": 0.1123199313879013, "rewards/rejected": -1.903272032737732, "sft_loss": 1.5315051078796387, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 2.272132842693275, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.038053132593631744, "logits/rejected": 0.06628037244081497, "logps/chosen": -1.7722151279449463, "logps/rejected": -1.9358274936676025, "loss": 0.2647, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.7722151279449463, "rewards/margins": 0.16361233592033386, "rewards/rejected": -1.9358274936676025, "sft_loss": 1.4914172887802124, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 2.3939211561963982, "learning_rate": 1.96078431372549e-07, "logits/chosen": 0.019141273573040962, "logits/rejected": 0.1181107759475708, "logps/chosen": -1.8616676330566406, "logps/rejected": -1.9208825826644897, "loss": 0.2764, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.8616676330566406, "rewards/margins": 0.05921504646539688, "rewards/rejected": -1.9208825826644897, "sft_loss": 1.5512902736663818, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 2.3627131217942043, "learning_rate": 2.049910873440285e-07, "logits/chosen": 0.07100260257720947, "logits/rejected": 0.29280149936676025, "logps/chosen": -1.8520358800888062, "logps/rejected": -2.1854333877563477, "loss": 0.2304, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.8520358800888062, "rewards/margins": 0.33339765667915344, "rewards/rejected": -2.1854333877563477, "sft_loss": 1.6704508066177368, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 1.735165157614565, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.07888107746839523, "logits/rejected": 0.10008995234966278, "logps/chosen": -1.9981311559677124, "logps/rejected": -2.1442179679870605, "loss": 0.2481, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.9981311559677124, "rewards/margins": 0.14608684182167053, "rewards/rejected": -2.1442179679870605, "sft_loss": 1.6943069696426392, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 2.786593260731286, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.05085619166493416, "logits/rejected": 0.08655449748039246, "logps/chosen": -1.8938229084014893, "logps/rejected": -1.8082813024520874, "loss": 0.281, "rewards/accuracies": 0.46875, "rewards/chosen": -1.8938229084014893, "rewards/margins": -0.08554168045520782, "rewards/rejected": -1.8082813024520874, "sft_loss": 1.606369972229004, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 2.5772159985228655, "learning_rate": 2.31729055258467e-07, "logits/chosen": 0.04295843839645386, "logits/rejected": 0.18473029136657715, "logps/chosen": -1.9629977941513062, "logps/rejected": -2.086339235305786, "loss": 0.2481, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.9629977941513062, "rewards/margins": 0.12334122508764267, "rewards/rejected": -2.086339235305786, "sft_loss": 1.6907141208648682, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 1.9943010581072478, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.0350356251001358, "logits/rejected": 0.08792857825756073, "logps/chosen": -2.0462894439697266, "logps/rejected": -2.0182530879974365, "loss": 0.2578, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.0462894439697266, "rewards/margins": -0.02803630754351616, "rewards/rejected": -2.0182530879974365, "sft_loss": 1.6577552556991577, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 2.8486191540947807, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.02075616642832756, "logits/rejected": 0.1543327271938324, "logps/chosen": -2.015401601791382, "logps/rejected": -2.2778303623199463, "loss": 0.2269, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.015401601791382, "rewards/margins": 0.2624287009239197, "rewards/rejected": -2.2778303623199463, "sft_loss": 1.6974109411239624, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 2.1225048898899246, "learning_rate": 2.5846702317290554e-07, "logits/chosen": 0.0070198155008256435, "logits/rejected": 0.17295916378498077, "logps/chosen": -1.9902257919311523, "logps/rejected": -2.140160083770752, "loss": 0.2439, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.9902257919311523, "rewards/margins": 0.1499340534210205, "rewards/rejected": -2.140160083770752, "sft_loss": 1.631656289100647, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 2.338788349764335, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.03971818834543228, "logits/rejected": 0.13490387797355652, "logps/chosen": -1.9857639074325562, "logps/rejected": -1.9886020421981812, "loss": 0.2651, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.9857639074325562, "rewards/margins": 0.002838182495906949, "rewards/rejected": -1.9886020421981812, "sft_loss": 1.5006240606307983, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 2.1835505399539104, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.027233857661485672, "logits/rejected": 0.025476187467575073, "logps/chosen": -2.1108012199401855, "logps/rejected": -2.1566002368927, "loss": 0.2523, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.1108012199401855, "rewards/margins": 0.04579881578683853, "rewards/rejected": -2.1566002368927, "sft_loss": 1.6765267848968506, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 2.040498416917074, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.11586171388626099, "logits/rejected": 0.03955882042646408, "logps/chosen": -2.320681095123291, "logps/rejected": -2.297348976135254, "loss": 0.2415, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.320681095123291, "rewards/margins": -0.023332182317972183, "rewards/rejected": -2.297348976135254, "sft_loss": 1.771780014038086, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 1.9928740456508027, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.04006613790988922, "logits/rejected": 0.14414557814598083, "logps/chosen": -2.0469436645507812, "logps/rejected": -2.3669915199279785, "loss": 0.2436, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0469436645507812, "rewards/margins": 0.3200477659702301, "rewards/rejected": -2.3669915199279785, "sft_loss": 1.6215006113052368, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 1.9534296167766696, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.06098126247525215, "logits/rejected": 0.001214376068674028, "logps/chosen": -2.3339781761169434, "logps/rejected": -2.305957555770874, "loss": 0.2285, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -2.3339781761169434, "rewards/margins": -0.028020773082971573, "rewards/rejected": -2.305957555770874, "sft_loss": 1.726961374282837, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 1.9588967180799672, "learning_rate": 3.1194295900178254e-07, "logits/chosen": 0.09518565982580185, "logits/rejected": 0.09727490693330765, "logps/chosen": -2.214961290359497, "logps/rejected": -2.278738498687744, "loss": 0.2654, "rewards/accuracies": 0.46875, "rewards/chosen": -2.214961290359497, "rewards/margins": 0.06377717852592468, "rewards/rejected": -2.278738498687744, "sft_loss": 1.7609399557113647, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 1.6589936848533324, "learning_rate": 3.2085561497326203e-07, "logits/chosen": 0.03179093450307846, "logits/rejected": 0.037368230521678925, "logps/chosen": -2.3154728412628174, "logps/rejected": -2.2699761390686035, "loss": 0.236, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.3154728412628174, "rewards/margins": -0.04549693316221237, "rewards/rejected": -2.2699761390686035, "sft_loss": 1.7420244216918945, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 1.9967555737580138, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.12511086463928223, "logits/rejected": -0.02812013030052185, "logps/chosen": -2.2482380867004395, "logps/rejected": -2.3399927616119385, "loss": 0.2525, "rewards/accuracies": 0.5, "rewards/chosen": -2.2482380867004395, "rewards/margins": 0.09175457060337067, "rewards/rejected": -2.3399927616119385, "sft_loss": 1.711207389831543, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 2.678787481783515, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.03826170042157173, "logits/rejected": 0.10330984741449356, "logps/chosen": -2.804680347442627, "logps/rejected": -2.641824245452881, "loss": 0.2148, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -2.804680347442627, "rewards/margins": -0.162856325507164, "rewards/rejected": -2.641824245452881, "sft_loss": 1.9906642436981201, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 2.0306729180588396, "learning_rate": 3.475935828877005e-07, "logits/chosen": 0.04664776846766472, "logits/rejected": 0.22064730525016785, "logps/chosen": -2.1520609855651855, "logps/rejected": -2.210716724395752, "loss": 0.2296, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.1520609855651855, "rewards/margins": 0.058656178414821625, "rewards/rejected": -2.210716724395752, "sft_loss": 1.615627646446228, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 3.1931495872144007, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.03578418493270874, "logits/rejected": 0.12399481236934662, "logps/chosen": -2.6481387615203857, "logps/rejected": -2.3530163764953613, "loss": 0.2331, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.6481387615203857, "rewards/margins": -0.2951226830482483, "rewards/rejected": -2.3530163764953613, "sft_loss": 1.9002326726913452, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 2.640144568717103, "learning_rate": 3.654188948306595e-07, "logits/chosen": -0.021364711225032806, "logits/rejected": 0.15079109370708466, "logps/chosen": -3.106703281402588, "logps/rejected": -2.7240376472473145, "loss": 0.2138, "rewards/accuracies": 0.5, "rewards/chosen": -3.106703281402588, "rewards/margins": -0.3826655149459839, "rewards/rejected": -2.7240376472473145, "sft_loss": 1.900535225868225, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 2.186333179804364, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.11891081184148788, "logits/rejected": 0.10270519554615021, "logps/chosen": -2.9581754207611084, "logps/rejected": -3.388826847076416, "loss": 0.1741, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.9581754207611084, "rewards/margins": 0.4306512773036957, "rewards/rejected": -3.388826847076416, "sft_loss": 2.0287399291992188, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 2.0176626763649, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.13966991007328033, "logits/rejected": 0.140414297580719, "logps/chosen": -2.749617099761963, "logps/rejected": -2.8892064094543457, "loss": 0.1857, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.749617099761963, "rewards/margins": 0.13958922028541565, "rewards/rejected": -2.8892064094543457, "sft_loss": 2.041731357574463, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 1.998903196870583, "learning_rate": 3.92156862745098e-07, "logits/chosen": 0.0680420845746994, "logits/rejected": 0.18458662927150726, "logps/chosen": -3.2915711402893066, "logps/rejected": -3.790569305419922, "loss": 0.1668, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -3.2915711402893066, "rewards/margins": 0.4989985525608063, "rewards/rejected": -3.790569305419922, "sft_loss": 2.265310764312744, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 1.9590568738460437, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.07913367450237274, "logits/rejected": 0.1182810515165329, "logps/chosen": -3.0919346809387207, "logps/rejected": -3.1701674461364746, "loss": 0.1653, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -3.0919346809387207, "rewards/margins": 0.07823298126459122, "rewards/rejected": -3.1701674461364746, "sft_loss": 1.9699609279632568, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 1.8737010294999485, "learning_rate": 4.09982174688057e-07, "logits/chosen": 0.037992849946022034, "logits/rejected": 0.13747188448905945, "logps/chosen": -3.8204619884490967, "logps/rejected": -3.797633647918701, "loss": 0.1635, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -3.8204619884490967, "rewards/margins": -0.022828320041298866, "rewards/rejected": -3.797633647918701, "sft_loss": 2.1811490058898926, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 1.8191581628112006, "learning_rate": 4.188948306595365e-07, "logits/chosen": 0.046239614486694336, "logits/rejected": 0.22374077141284943, "logps/chosen": -3.5340847969055176, "logps/rejected": -3.8707435131073, "loss": 0.1426, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.5340847969055176, "rewards/margins": 0.3366585373878479, "rewards/rejected": -3.8707435131073, "sft_loss": 2.135493516921997, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 2.15850506003128, "learning_rate": 4.27807486631016e-07, "logits/chosen": -0.020359747111797333, "logits/rejected": 0.12103061378002167, "logps/chosen": -3.8267650604248047, "logps/rejected": -3.854151487350464, "loss": 0.1603, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -3.8267650604248047, "rewards/margins": 0.02738667093217373, "rewards/rejected": -3.854151487350464, "sft_loss": 2.468606472015381, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 1.9869901093405427, "learning_rate": 4.3672014260249554e-07, "logits/chosen": 0.09430526196956635, "logits/rejected": 0.2459847629070282, "logps/chosen": -3.5051181316375732, "logps/rejected": -4.297321319580078, "loss": 0.1457, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.5051181316375732, "rewards/margins": 0.792202889919281, "rewards/rejected": -4.297321319580078, "sft_loss": 2.459380626678467, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 1.9104173001829199, "learning_rate": 4.4563279857397503e-07, "logits/chosen": 0.02481432631611824, "logits/rejected": 0.22242239117622375, "logps/chosen": -5.418574333190918, "logps/rejected": -5.279926300048828, "loss": 0.1462, "rewards/accuracies": 0.53125, "rewards/chosen": -5.418574333190918, "rewards/margins": -0.13864776492118835, "rewards/rejected": -5.279926300048828, "sft_loss": 2.6364169120788574, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 1.2454643359237576, "learning_rate": 4.545454545454545e-07, "logits/chosen": 0.06115083023905754, "logits/rejected": 0.2540772557258606, "logps/chosen": -3.8281192779541016, "logps/rejected": -4.610691547393799, "loss": 0.1384, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -3.8281192779541016, "rewards/margins": 0.7825719714164734, "rewards/rejected": -4.610691547393799, "sft_loss": 2.2860593795776367, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 0.9849368225532316, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.10867585986852646, "logits/rejected": 0.02872631885111332, "logps/chosen": -5.133664131164551, "logps/rejected": -4.367121696472168, "loss": 0.1178, "rewards/accuracies": 0.53125, "rewards/chosen": -5.133664131164551, "rewards/margins": -0.7665426135063171, "rewards/rejected": -4.367121696472168, "sft_loss": 2.7160372734069824, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 0.8961988680228047, "learning_rate": 4.723707664884135e-07, "logits/chosen": 0.07030217349529266, "logits/rejected": 0.17124192416667938, "logps/chosen": -6.0838727951049805, "logps/rejected": -4.811110019683838, "loss": 0.1367, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -6.0838727951049805, "rewards/margins": -1.2727632522583008, "rewards/rejected": -4.811110019683838, "sft_loss": 3.739053726196289, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 0.8941719839014858, "learning_rate": 4.81283422459893e-07, "logits/chosen": 0.017831971868872643, "logits/rejected": 0.20090460777282715, "logps/chosen": -4.764120578765869, "logps/rejected": -5.134100437164307, "loss": 0.1264, "rewards/accuracies": 0.53125, "rewards/chosen": -4.764120578765869, "rewards/margins": 0.3699795603752136, "rewards/rejected": -5.134100437164307, "sft_loss": 2.7194724082946777, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 1.222685081948667, "learning_rate": 4.901960784313725e-07, "logits/chosen": 0.16087068617343903, "logits/rejected": 0.2754586338996887, "logps/chosen": -5.333785057067871, "logps/rejected": -6.004696846008301, "loss": 0.1436, "rewards/accuracies": 0.59375, "rewards/chosen": -5.333785057067871, "rewards/margins": 0.6709117293357849, "rewards/rejected": -6.004696846008301, "sft_loss": 3.1917669773101807, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 0.7370626970557951, "learning_rate": 4.99108734402852e-07, "logits/chosen": 0.007818743586540222, "logits/rejected": 0.22343873977661133, "logps/chosen": -6.119328498840332, "logps/rejected": -5.818517208099365, "loss": 0.1211, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -6.119328498840332, "rewards/margins": -0.3008107542991638, "rewards/rejected": -5.818517208099365, "sft_loss": 3.9426021575927734, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 1.233957256562051, "learning_rate": 5.080213903743315e-07, "logits/chosen": 0.041990119963884354, "logits/rejected": 0.22771665453910828, "logps/chosen": -5.575260162353516, "logps/rejected": -5.49068546295166, "loss": 0.1275, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -5.575260162353516, "rewards/margins": -0.08457436412572861, "rewards/rejected": -5.49068546295166, "sft_loss": 2.8716347217559814, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 0.6072672871705348, "learning_rate": 5.169340463458111e-07, "logits/chosen": -0.026344675570726395, "logits/rejected": 0.3459742069244385, "logps/chosen": -4.510623931884766, "logps/rejected": -5.7433624267578125, "loss": 0.0906, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.510623931884766, "rewards/margins": 1.2327378988265991, "rewards/rejected": -5.7433624267578125, "sft_loss": 2.9556336402893066, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 0.7208828481537657, "learning_rate": 5.258467023172905e-07, "logits/chosen": 0.09221816062927246, "logits/rejected": 0.17328932881355286, "logps/chosen": -6.130520820617676, "logps/rejected": -5.456192970275879, "loss": 0.1207, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -6.130520820617676, "rewards/margins": -0.6743276715278625, "rewards/rejected": -5.456192970275879, "sft_loss": 3.7058982849121094, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 1.340847333736133, "learning_rate": 5.347593582887701e-07, "logits/chosen": 0.009460541419684887, "logits/rejected": 0.23661403357982635, "logps/chosen": -6.522822380065918, "logps/rejected": -6.587447166442871, "loss": 0.1156, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6.522822380065918, "rewards/margins": 0.06462571769952774, "rewards/rejected": -6.587447166442871, "sft_loss": 3.30534029006958, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 1.9457407077227884, "learning_rate": 5.436720142602496e-07, "logits/chosen": 0.06960954517126083, "logits/rejected": 0.16367551684379578, "logps/chosen": -5.572967052459717, "logps/rejected": -5.5532331466674805, "loss": 0.1133, "rewards/accuracies": 0.46875, "rewards/chosen": -5.572967052459717, "rewards/margins": -0.01973416842520237, "rewards/rejected": -5.5532331466674805, "sft_loss": 3.6321425437927246, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 6.893463954081525, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.13142789900302887, "logits/rejected": 0.0010722450679168105, "logps/chosen": -6.735803127288818, "logps/rejected": -6.812127590179443, "loss": 0.0959, "rewards/accuracies": 0.5, "rewards/chosen": -6.735803127288818, "rewards/margins": 0.07632424682378769, "rewards/rejected": -6.812127590179443, "sft_loss": 4.3729987144470215, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 1.2119659575882744, "learning_rate": 5.614973262032086e-07, "logits/chosen": 0.05507947877049446, "logits/rejected": 0.24547457695007324, "logps/chosen": -7.327678680419922, "logps/rejected": -7.448760986328125, "loss": 0.0735, "rewards/accuracies": 0.5625, "rewards/chosen": -7.327678680419922, "rewards/margins": 0.12108228355646133, "rewards/rejected": -7.448760986328125, "sft_loss": 5.221084117889404, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 1.421852572662818, "learning_rate": 5.70409982174688e-07, "logits/chosen": -0.0505533441901207, "logits/rejected": 0.09875977784395218, "logps/chosen": -7.0575737953186035, "logps/rejected": -6.7295331954956055, "loss": 0.0729, "rewards/accuracies": 0.5, "rewards/chosen": -7.0575737953186035, "rewards/margins": -0.32803958654403687, "rewards/rejected": -6.7295331954956055, "sft_loss": 5.3815484046936035, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 2.1768410941997747, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.06135901063680649, "logits/rejected": 0.12024674564599991, "logps/chosen": -8.038436889648438, "logps/rejected": -8.41388988494873, "loss": 0.0679, "rewards/accuracies": 0.53125, "rewards/chosen": -8.038436889648438, "rewards/margins": 0.3754529356956482, "rewards/rejected": -8.41388988494873, "sft_loss": 5.565326690673828, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 0.9252033224489392, "learning_rate": 5.88235294117647e-07, "logits/chosen": 0.034027762711048126, "logits/rejected": 0.23385238647460938, "logps/chosen": -6.080555438995361, "logps/rejected": -7.770529270172119, "loss": 0.0713, "rewards/accuracies": 0.59375, "rewards/chosen": -6.080555438995361, "rewards/margins": 1.6899728775024414, "rewards/rejected": -7.770529270172119, "sft_loss": 5.016576290130615, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 2.0989591544928925, "learning_rate": 5.971479500891266e-07, "logits/chosen": 0.01920177973806858, "logits/rejected": 0.206724613904953, "logps/chosen": -7.403994560241699, "logps/rejected": -7.278214931488037, "loss": 0.0695, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -7.403994560241699, "rewards/margins": -0.12578034400939941, "rewards/rejected": -7.278214931488037, "sft_loss": 4.916281223297119, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 1.893897803965405, "learning_rate": 6.060606060606061e-07, "logits/chosen": 0.055364273488521576, "logits/rejected": 0.2726927697658539, "logps/chosen": -6.973310947418213, "logps/rejected": -7.325900077819824, "loss": 0.0663, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -6.973310947418213, "rewards/margins": 0.35258910059928894, "rewards/rejected": -7.325900077819824, "sft_loss": 5.119781494140625, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 2.836372712749154, "learning_rate": 6.149732620320855e-07, "logits/chosen": 0.06766609847545624, "logits/rejected": 0.1415514200925827, "logps/chosen": -6.7723259925842285, "logps/rejected": -6.848855495452881, "loss": 0.0667, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -6.7723259925842285, "rewards/margins": 0.07653047144412994, "rewards/rejected": -6.848855495452881, "sft_loss": 5.026312828063965, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 2.265144184050898, "learning_rate": 6.238859180035651e-07, "logits/chosen": -0.15822748839855194, "logits/rejected": 0.0024463594891130924, "logps/chosen": -6.168185234069824, "logps/rejected": -5.750182151794434, "loss": 0.0671, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -6.168185234069824, "rewards/margins": -0.4180033802986145, "rewards/rejected": -5.750182151794434, "sft_loss": 4.6768975257873535, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 2.614388773693835, "learning_rate": 6.327985739750445e-07, "logits/chosen": -0.14725571870803833, "logits/rejected": 0.13961216807365417, "logps/chosen": -6.235496520996094, "logps/rejected": -6.224798202514648, "loss": 0.0614, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -6.235496520996094, "rewards/margins": -0.01069814246147871, "rewards/rejected": -6.224798202514648, "sft_loss": 5.157872200012207, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 2.7692613642785884, "learning_rate": 6.417112299465241e-07, "logits/chosen": -0.1557673215866089, "logits/rejected": -0.048311877995729446, "logps/chosen": -5.508719444274902, "logps/rejected": -5.668200492858887, "loss": 0.061, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -5.508719444274902, "rewards/margins": 0.15948060154914856, "rewards/rejected": -5.668200492858887, "sft_loss": 4.47888708114624, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 1.759183414717174, "learning_rate": 6.506238859180035e-07, "logits/chosen": -0.1632741391658783, "logits/rejected": -0.04335709661245346, "logps/chosen": -5.9066925048828125, "logps/rejected": -5.570250988006592, "loss": 0.0622, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -5.9066925048828125, "rewards/margins": -0.33644169569015503, "rewards/rejected": -5.570250988006592, "sft_loss": 5.030333518981934, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 2.2296092123630236, "learning_rate": 6.59536541889483e-07, "logits/chosen": -0.3708917498588562, "logits/rejected": -0.20164895057678223, "logps/chosen": -5.268651485443115, "logps/rejected": -5.234259128570557, "loss": 0.062, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -5.268651485443115, "rewards/margins": -0.03439173102378845, "rewards/rejected": -5.234259128570557, "sft_loss": 4.815543174743652, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 2.0146802870823928, "learning_rate": 6.684491978609626e-07, "logits/chosen": -0.47356781363487244, "logits/rejected": -0.21013717353343964, "logps/chosen": -5.339799880981445, "logps/rejected": -5.731418609619141, "loss": 0.0567, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -5.339799880981445, "rewards/margins": 0.39161843061447144, "rewards/rejected": -5.731418609619141, "sft_loss": 4.969027996063232, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 1.3381881149332266, "learning_rate": 6.77361853832442e-07, "logits/chosen": -0.5766822695732117, "logits/rejected": -0.4294039309024811, "logps/chosen": -4.788380146026611, "logps/rejected": -4.977846622467041, "loss": 0.0566, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.788380146026611, "rewards/margins": 0.18946652114391327, "rewards/rejected": -4.977846622467041, "sft_loss": 4.430079460144043, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 0.9198991724416198, "learning_rate": 6.862745098039216e-07, "logits/chosen": -0.48545369505882263, "logits/rejected": -0.3366524577140808, "logps/chosen": -5.007147789001465, "logps/rejected": -5.114348888397217, "loss": 0.0569, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -5.007147789001465, "rewards/margins": 0.10720244795084, "rewards/rejected": -5.114348888397217, "sft_loss": 4.7221221923828125, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 2.7786654323372857, "learning_rate": 6.95187165775401e-07, "logits/chosen": -0.1661207228899002, "logits/rejected": 0.11756277084350586, "logps/chosen": -5.295405387878418, "logps/rejected": -5.372384548187256, "loss": 0.0601, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -5.295405387878418, "rewards/margins": 0.07697971165180206, "rewards/rejected": -5.372384548187256, "sft_loss": 4.778872489929199, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 1.0788168546978547, "learning_rate": 7.040998217468806e-07, "logits/chosen": -0.4957035183906555, "logits/rejected": -0.25873202085494995, "logps/chosen": -4.942808151245117, "logps/rejected": -5.0668792724609375, "loss": 0.0572, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.942808151245117, "rewards/margins": 0.1240713819861412, "rewards/rejected": -5.0668792724609375, "sft_loss": 4.497957706451416, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 1.2369650710490778, "learning_rate": 7.1301247771836e-07, "logits/chosen": -0.5010001063346863, "logits/rejected": -0.32428526878356934, "logps/chosen": -5.017227649688721, "logps/rejected": -5.143941879272461, "loss": 0.0548, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -5.017227649688721, "rewards/margins": 0.12671446800231934, "rewards/rejected": -5.143941879272461, "sft_loss": 4.438155174255371, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": -0.02770337089896202, "eval_logits/rejected": 0.10948384553194046, "eval_logps/chosen": -5.346701622009277, "eval_logps/rejected": -5.472263813018799, "eval_loss": 0.05566617101430893, "eval_rewards/accuracies": 0.5326409339904785, "eval_rewards/chosen": -5.346701622009277, "eval_rewards/margins": 0.12556174397468567, "eval_rewards/rejected": -5.472263813018799, "eval_runtime": 43.8831, "eval_samples_per_second": 30.65, "eval_sft_loss": 4.829505443572998, "eval_steps_per_second": 7.679, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 0.8680376616287312, "learning_rate": 7.219251336898395e-07, "logits/chosen": -0.5568983554840088, "logits/rejected": -0.42599543929100037, "logps/chosen": -5.262287616729736, "logps/rejected": -5.478768825531006, "loss": 0.0577, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -5.262287616729736, "rewards/margins": 0.21648113429546356, "rewards/rejected": -5.478768825531006, "sft_loss": 4.965443134307861, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 1.3304348425737549, "learning_rate": 7.30837789661319e-07, "logits/chosen": -0.4703024923801422, "logits/rejected": -0.2515341639518738, "logps/chosen": -4.820174694061279, "logps/rejected": -5.074994087219238, "loss": 0.055, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.820174694061279, "rewards/margins": 0.25481972098350525, "rewards/rejected": -5.074994087219238, "sft_loss": 4.33463191986084, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 1.5135280531396909, "learning_rate": 7.397504456327985e-07, "logits/chosen": -0.4607006907463074, "logits/rejected": -0.3690333366394043, "logps/chosen": -5.324876308441162, "logps/rejected": -5.247057914733887, "loss": 0.0574, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -5.324876308441162, "rewards/margins": -0.07781883329153061, "rewards/rejected": -5.247057914733887, "sft_loss": 4.834041595458984, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 2.6272102535641886, "learning_rate": 7.486631016042781e-07, "logits/chosen": -0.5340300798416138, "logits/rejected": -0.14234408736228943, "logps/chosen": -4.852800369262695, "logps/rejected": -5.007115840911865, "loss": 0.0559, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -4.852800369262695, "rewards/margins": 0.15431593358516693, "rewards/rejected": -5.007115840911865, "sft_loss": 4.486388206481934, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 1.9315381406431007, "learning_rate": 7.575757575757575e-07, "logits/chosen": -0.43069228529930115, "logits/rejected": -0.1704765111207962, "logps/chosen": -4.909597396850586, "logps/rejected": -5.0253095626831055, "loss": 0.0551, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.909597396850586, "rewards/margins": 0.11571161448955536, "rewards/rejected": -5.0253095626831055, "sft_loss": 4.460376739501953, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 1.137232786466204, "learning_rate": 7.664884135472371e-07, "logits/chosen": -0.5641778707504272, "logits/rejected": -0.2592639625072479, "logps/chosen": -4.983081817626953, "logps/rejected": -5.339555740356445, "loss": 0.0554, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.983081817626953, "rewards/margins": 0.35647445917129517, "rewards/rejected": -5.339555740356445, "sft_loss": 4.552998065948486, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 1.1748183885688712, "learning_rate": 7.754010695187165e-07, "logits/chosen": -0.403970867395401, "logits/rejected": -0.2805110514163971, "logps/chosen": -5.1232590675354, "logps/rejected": -5.035851001739502, "loss": 0.0569, "rewards/accuracies": 0.46875, "rewards/chosen": -5.1232590675354, "rewards/margins": -0.08740736544132233, "rewards/rejected": -5.035851001739502, "sft_loss": 4.523171901702881, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 1.9134674045638693, "learning_rate": 7.84313725490196e-07, "logits/chosen": -0.39476364850997925, "logits/rejected": -0.21744295954704285, "logps/chosen": -4.964127540588379, "logps/rejected": -5.1540703773498535, "loss": 0.0566, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.964127540588379, "rewards/margins": 0.18994323909282684, "rewards/rejected": -5.1540703773498535, "sft_loss": 4.714345455169678, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 4.445589266572552, "learning_rate": 7.932263814616755e-07, "logits/chosen": -0.3707229495048523, "logits/rejected": -0.2078903168439865, "logps/chosen": -5.059138298034668, "logps/rejected": -5.273739814758301, "loss": 0.0563, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -5.059138298034668, "rewards/margins": 0.21460111439228058, "rewards/rejected": -5.273739814758301, "sft_loss": 4.584336280822754, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 1.7787613623310712, "learning_rate": 8.02139037433155e-07, "logits/chosen": -0.3272281289100647, "logits/rejected": -0.1507757008075714, "logps/chosen": -4.958096027374268, "logps/rejected": -5.078334808349609, "loss": 0.0555, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -4.958096027374268, "rewards/margins": 0.12023873627185822, "rewards/rejected": -5.078334808349609, "sft_loss": 4.501555442810059, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 0.9480957363325464, "learning_rate": 8.110516934046346e-07, "logits/chosen": -0.3552139103412628, "logits/rejected": -0.19632229208946228, "logps/chosen": -4.692941188812256, "logps/rejected": -5.041573524475098, "loss": 0.0543, "rewards/accuracies": 0.625, "rewards/chosen": -4.692941188812256, "rewards/margins": 0.3486325144767761, "rewards/rejected": -5.041573524475098, "sft_loss": 4.325136184692383, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 1.9429007679640615, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.5261731743812561, "logits/rejected": -0.28895407915115356, "logps/chosen": -5.142203330993652, "logps/rejected": -5.269199371337891, "loss": 0.0574, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -5.142203330993652, "rewards/margins": 0.12699629366397858, "rewards/rejected": -5.269199371337891, "sft_loss": 4.904362201690674, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 1.719077122753352, "learning_rate": 8.288770053475936e-07, "logits/chosen": -0.347450852394104, "logits/rejected": -0.24443945288658142, "logps/chosen": -4.623368263244629, "logps/rejected": -4.905174255371094, "loss": 0.0569, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.623368263244629, "rewards/margins": 0.28180620074272156, "rewards/rejected": -4.905174255371094, "sft_loss": 4.362107753753662, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 3.797351690873165, "learning_rate": 8.37789661319073e-07, "logits/chosen": -0.23248498141765594, "logits/rejected": -0.33860307931900024, "logps/chosen": -5.063205718994141, "logps/rejected": -5.013503074645996, "loss": 0.0581, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -5.063205718994141, "rewards/margins": -0.049702536314725876, "rewards/rejected": -5.013503074645996, "sft_loss": 4.805975437164307, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 1.0037746775043357, "learning_rate": 8.467023172905525e-07, "logits/chosen": -0.4212326109409332, "logits/rejected": -0.16105397045612335, "logps/chosen": -4.638871669769287, "logps/rejected": -5.172748565673828, "loss": 0.0534, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.638871669769287, "rewards/margins": 0.5338773131370544, "rewards/rejected": -5.172748565673828, "sft_loss": 4.322568893432617, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 1.5393574024949208, "learning_rate": 8.55614973262032e-07, "logits/chosen": -0.36025190353393555, "logits/rejected": -0.09443429112434387, "logps/chosen": -4.921746730804443, "logps/rejected": -5.215901851654053, "loss": 0.0555, "rewards/accuracies": 0.59375, "rewards/chosen": -4.921746730804443, "rewards/margins": 0.2941551208496094, "rewards/rejected": -5.215901851654053, "sft_loss": 4.582496166229248, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 2.359407535802298, "learning_rate": 8.645276292335115e-07, "logits/chosen": -0.3606078624725342, "logits/rejected": -0.2609712481498718, "logps/chosen": -5.037476062774658, "logps/rejected": -5.087932586669922, "loss": 0.0559, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -5.037476062774658, "rewards/margins": 0.05045701935887337, "rewards/rejected": -5.087932586669922, "sft_loss": 4.643454551696777, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 0.9031378126507257, "learning_rate": 8.734402852049911e-07, "logits/chosen": -0.20180901885032654, "logits/rejected": -0.12481292337179184, "logps/chosen": -4.759294033050537, "logps/rejected": -4.890738010406494, "loss": 0.0552, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -4.759294033050537, "rewards/margins": 0.13144339621067047, "rewards/rejected": -4.890738010406494, "sft_loss": 4.474959850311279, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 0.864766969516503, "learning_rate": 8.823529411764705e-07, "logits/chosen": -0.29709574580192566, "logits/rejected": -0.2729634642601013, "logps/chosen": -4.82390022277832, "logps/rejected": -4.918272972106934, "loss": 0.0563, "rewards/accuracies": 0.46875, "rewards/chosen": -4.82390022277832, "rewards/margins": 0.0943729504942894, "rewards/rejected": -4.918272972106934, "sft_loss": 4.600865840911865, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 0.6588994171949467, "learning_rate": 8.912655971479501e-07, "logits/chosen": -0.37265440821647644, "logits/rejected": -0.23569945991039276, "logps/chosen": -4.7590532302856445, "logps/rejected": -4.979560375213623, "loss": 0.0546, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -4.7590532302856445, "rewards/margins": 0.22050723433494568, "rewards/rejected": -4.979560375213623, "sft_loss": 4.484901428222656, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 0.9600651230526771, "learning_rate": 9.001782531194295e-07, "logits/chosen": -0.3922001123428345, "logits/rejected": -0.2255508154630661, "logps/chosen": -4.7787580490112305, "logps/rejected": -4.795541286468506, "loss": 0.0567, "rewards/accuracies": 0.53125, "rewards/chosen": -4.7787580490112305, "rewards/margins": 0.016783803701400757, "rewards/rejected": -4.795541286468506, "sft_loss": 4.484111785888672, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 0.8779860995345065, "learning_rate": 9.09090909090909e-07, "logits/chosen": -0.25639277696609497, "logits/rejected": -0.17562855780124664, "logps/chosen": -4.838879585266113, "logps/rejected": -5.002278804779053, "loss": 0.0552, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -4.838879585266113, "rewards/margins": 0.16339975595474243, "rewards/rejected": -5.002278804779053, "sft_loss": 4.533398628234863, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 1.6340742571733686, "learning_rate": 9.180035650623885e-07, "logits/chosen": -0.29652127623558044, "logits/rejected": -0.1382482796907425, "logps/chosen": -4.777513027191162, "logps/rejected": -4.839566230773926, "loss": 0.0554, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -4.777513027191162, "rewards/margins": 0.06205293536186218, "rewards/rejected": -4.839566230773926, "sft_loss": 4.462241172790527, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 1.467892207273564, "learning_rate": 9.26916221033868e-07, "logits/chosen": -0.39073318243026733, "logits/rejected": -0.17723903059959412, "logps/chosen": -4.805098533630371, "logps/rejected": -4.994345188140869, "loss": 0.055, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.805098533630371, "rewards/margins": 0.18924656510353088, "rewards/rejected": -4.994345188140869, "sft_loss": 4.511242866516113, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 0.769155240820834, "learning_rate": 9.358288770053476e-07, "logits/chosen": -0.2005387246608734, "logits/rejected": -0.061540864408016205, "logps/chosen": -4.82773494720459, "logps/rejected": -5.0809125900268555, "loss": 0.0548, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.82773494720459, "rewards/margins": 0.25317686796188354, "rewards/rejected": -5.0809125900268555, "sft_loss": 4.457303047180176, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 1.1120839605961124, "learning_rate": 9.44741532976827e-07, "logits/chosen": -0.3281249403953552, "logits/rejected": -0.25439485907554626, "logps/chosen": -4.870813369750977, "logps/rejected": -5.039546012878418, "loss": 0.0562, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.870813369750977, "rewards/margins": 0.16873237490653992, "rewards/rejected": -5.039546012878418, "sft_loss": 4.659360408782959, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 2.0952234650377193, "learning_rate": 9.536541889483066e-07, "logits/chosen": -0.5139321684837341, "logits/rejected": -0.1076575517654419, "logps/chosen": -4.730218410491943, "logps/rejected": -4.953000068664551, "loss": 0.0544, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.730218410491943, "rewards/margins": 0.22278185188770294, "rewards/rejected": -4.953000068664551, "sft_loss": 4.485461235046387, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 0.9124179505228884, "learning_rate": 9.62566844919786e-07, "logits/chosen": -0.3589113652706146, "logits/rejected": -0.20875516533851624, "logps/chosen": -4.740080833435059, "logps/rejected": -4.862841606140137, "loss": 0.0563, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.740080833435059, "rewards/margins": 0.12276099622249603, "rewards/rejected": -4.862841606140137, "sft_loss": 4.531832218170166, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 1.040263178598027, "learning_rate": 9.714795008912655e-07, "logits/chosen": -0.39433753490448, "logits/rejected": -0.11539351940155029, "logps/chosen": -4.739431858062744, "logps/rejected": -4.895320415496826, "loss": 0.0544, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.739431858062744, "rewards/margins": 0.15588879585266113, "rewards/rejected": -4.895320415496826, "sft_loss": 4.4855875968933105, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 1.5371896765442068, "learning_rate": 9.80392156862745e-07, "logits/chosen": -0.26903194189071655, "logits/rejected": -0.18110091984272003, "logps/chosen": -4.590166091918945, "logps/rejected": -4.735904216766357, "loss": 0.056, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -4.590166091918945, "rewards/margins": 0.14573858678340912, "rewards/rejected": -4.735904216766357, "sft_loss": 4.3490447998046875, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 1.4915643897189883, "learning_rate": 9.893048128342244e-07, "logits/chosen": -0.36765116453170776, "logits/rejected": -0.19273436069488525, "logps/chosen": -5.064404487609863, "logps/rejected": -5.279716491699219, "loss": 0.0561, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -5.064404487609863, "rewards/margins": 0.2153124362230301, "rewards/rejected": -5.279716491699219, "sft_loss": 4.768413543701172, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 1.2834538492458512, "learning_rate": 9.98217468805704e-07, "logits/chosen": -0.3625454008579254, "logits/rejected": -0.3016476631164551, "logps/chosen": -4.831945896148682, "logps/rejected": -4.9502854347229, "loss": 0.0557, "rewards/accuracies": 0.53125, "rewards/chosen": -4.831945896148682, "rewards/margins": 0.11833939701318741, "rewards/rejected": -4.9502854347229, "sft_loss": 4.503427028656006, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 1.6223027628936535, "learning_rate": 9.999984476788462e-07, "logits/chosen": -0.3643776476383209, "logits/rejected": -0.2353856861591339, "logps/chosen": -4.69890832901001, "logps/rejected": -4.904583930969238, "loss": 0.0556, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.69890832901001, "rewards/margins": 0.20567627251148224, "rewards/rejected": -4.904583930969238, "sft_loss": 4.513741493225098, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 0.9717056305058467, "learning_rate": 9.999921413906797e-07, "logits/chosen": -0.4494122564792633, "logits/rejected": -0.12291695922613144, "logps/chosen": -4.675474643707275, "logps/rejected": -4.861818790435791, "loss": 0.0549, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.675474643707275, "rewards/margins": 0.18634450435638428, "rewards/rejected": -4.861818790435791, "sft_loss": 4.422656059265137, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 0.8102835169481186, "learning_rate": 9.999809841765644e-07, "logits/chosen": -0.5083755850791931, "logits/rejected": -0.468639612197876, "logps/chosen": -4.939351558685303, "logps/rejected": -5.073581218719482, "loss": 0.0562, "rewards/accuracies": 0.5625, "rewards/chosen": -4.939351558685303, "rewards/margins": 0.13423016667366028, "rewards/rejected": -5.073581218719482, "sft_loss": 4.74433708190918, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 0.484057830475299, "learning_rate": 9.999649761447477e-07, "logits/chosen": -0.47592979669570923, "logits/rejected": -0.19933168590068817, "logps/chosen": -4.55141544342041, "logps/rejected": -4.823070049285889, "loss": 0.0549, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.55141544342041, "rewards/margins": 0.27165549993515015, "rewards/rejected": -4.823070049285889, "sft_loss": 4.323666572570801, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 0.4954728704364703, "learning_rate": 9.999441174505398e-07, "logits/chosen": -0.5497112274169922, "logits/rejected": -0.40136200189590454, "logps/chosen": -4.930845737457275, "logps/rejected": -5.082024574279785, "loss": 0.0548, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.930845737457275, "rewards/margins": 0.15117934346199036, "rewards/rejected": -5.082024574279785, "sft_loss": 4.561702251434326, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 0.8929142904448083, "learning_rate": 9.999184082963116e-07, "logits/chosen": -0.5046008825302124, "logits/rejected": -0.3427330255508423, "logps/chosen": -4.885519981384277, "logps/rejected": -5.006654739379883, "loss": 0.0551, "rewards/accuracies": 0.625, "rewards/chosen": -4.885519981384277, "rewards/margins": 0.12113462388515472, "rewards/rejected": -5.006654739379883, "sft_loss": 4.5403523445129395, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 0.9240145223243589, "learning_rate": 9.998878489314937e-07, "logits/chosen": -0.46767106652259827, "logits/rejected": -0.23591558635234833, "logps/chosen": -4.70677375793457, "logps/rejected": -4.994019508361816, "loss": 0.055, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.70677375793457, "rewards/margins": 0.28724604845046997, "rewards/rejected": -4.994019508361816, "sft_loss": 4.432908058166504, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 1.0018417636595902, "learning_rate": 9.99852439652573e-07, "logits/chosen": -0.5504830479621887, "logits/rejected": -0.3432347774505615, "logps/chosen": -4.664198875427246, "logps/rejected": -4.872433662414551, "loss": 0.0546, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.664198875427246, "rewards/margins": 0.20823463797569275, "rewards/rejected": -4.872433662414551, "sft_loss": 4.430227756500244, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 0.6275109758749665, "learning_rate": 9.998121808030904e-07, "logits/chosen": -0.6691871881484985, "logits/rejected": -0.5714275240898132, "logps/chosen": -4.999661445617676, "logps/rejected": -5.115338325500488, "loss": 0.0558, "rewards/accuracies": 0.59375, "rewards/chosen": -4.999661445617676, "rewards/margins": 0.11567720025777817, "rewards/rejected": -5.115338325500488, "sft_loss": 4.749647617340088, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 1.9533627210971916, "learning_rate": 9.997670727736379e-07, "logits/chosen": -0.5287994146347046, "logits/rejected": -0.2409917414188385, "logps/chosen": -4.611300468444824, "logps/rejected": -4.752462387084961, "loss": 0.0548, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.611300468444824, "rewards/margins": 0.14116191864013672, "rewards/rejected": -4.752462387084961, "sft_loss": 4.276651382446289, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 1.0181190866876273, "learning_rate": 9.99717116001853e-07, "logits/chosen": -0.5561403036117554, "logits/rejected": -0.4006883502006531, "logps/chosen": -4.733808517456055, "logps/rejected": -4.987280368804932, "loss": 0.0541, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.733808517456055, "rewards/margins": 0.25347140431404114, "rewards/rejected": -4.987280368804932, "sft_loss": 4.515638828277588, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 1.8286375874490535, "learning_rate": 9.996623109724173e-07, "logits/chosen": -0.36191409826278687, "logits/rejected": -0.23526597023010254, "logps/chosen": -4.690080165863037, "logps/rejected": -4.8197832107543945, "loss": 0.0553, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.690080165863037, "rewards/margins": 0.12970253825187683, "rewards/rejected": -4.8197832107543945, "sft_loss": 4.448857307434082, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 0.8430739768632581, "learning_rate": 9.996026582170488e-07, "logits/chosen": -0.40729236602783203, "logits/rejected": -0.15933868288993835, "logps/chosen": -4.661129474639893, "logps/rejected": -5.0198845863342285, "loss": 0.0534, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.661129474639893, "rewards/margins": 0.3587549328804016, "rewards/rejected": -5.0198845863342285, "sft_loss": 4.314611434936523, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 0.41172622641288736, "learning_rate": 9.995381583144996e-07, "logits/chosen": -0.41392308473587036, "logits/rejected": -0.23915371298789978, "logps/chosen": -4.795115947723389, "logps/rejected": -5.012903690338135, "loss": 0.0544, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.795115947723389, "rewards/margins": 0.2177872359752655, "rewards/rejected": -5.012903690338135, "sft_loss": 4.513781547546387, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 1.3659635196177828, "learning_rate": 9.994688118905471e-07, "logits/chosen": -0.40934377908706665, "logits/rejected": -0.06164498254656792, "logps/chosen": -4.684535980224609, "logps/rejected": -5.066915512084961, "loss": 0.0539, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.684535980224609, "rewards/margins": 0.38237953186035156, "rewards/rejected": -5.066915512084961, "sft_loss": 4.450982093811035, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 0.7655809286311621, "learning_rate": 9.993946196179912e-07, "logits/chosen": -0.4428192973136902, "logits/rejected": -0.13932375609874725, "logps/chosen": -4.84511661529541, "logps/rejected": -5.081221103668213, "loss": 0.055, "rewards/accuracies": 0.59375, "rewards/chosen": -4.84511661529541, "rewards/margins": 0.2361036241054535, "rewards/rejected": -5.081221103668213, "sft_loss": 4.533552646636963, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 0.753608439880354, "learning_rate": 9.993155822166455e-07, "logits/chosen": -0.4735592305660248, "logits/rejected": -0.3594627380371094, "logps/chosen": -4.647860527038574, "logps/rejected": -4.909556865692139, "loss": 0.0545, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.647860527038574, "rewards/margins": 0.2616958022117615, "rewards/rejected": -4.909556865692139, "sft_loss": 4.432980537414551, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 0.7532799343685316, "learning_rate": 9.992317004533313e-07, "logits/chosen": -0.3656768798828125, "logits/rejected": -0.24436573684215546, "logps/chosen": -4.652246952056885, "logps/rejected": -4.921938896179199, "loss": 0.0551, "rewards/accuracies": 0.59375, "rewards/chosen": -4.652246952056885, "rewards/margins": 0.26969173550605774, "rewards/rejected": -4.921938896179199, "sft_loss": 4.465109825134277, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 0.8544184685749583, "learning_rate": 9.991429751418696e-07, "logits/chosen": -0.38019418716430664, "logits/rejected": -0.35641294717788696, "logps/chosen": -4.640519618988037, "logps/rejected": -4.896888732910156, "loss": 0.0548, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.640519618988037, "rewards/margins": 0.256369024515152, "rewards/rejected": -4.896888732910156, "sft_loss": 4.424017429351807, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 0.7903309832011153, "learning_rate": 9.99049407143074e-07, "logits/chosen": -0.462789922952652, "logits/rejected": -0.25584983825683594, "logps/chosen": -5.043513298034668, "logps/rejected": -5.086087703704834, "loss": 0.0565, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -5.043513298034668, "rewards/margins": 0.04257440194487572, "rewards/rejected": -5.086087703704834, "sft_loss": 4.764005661010742, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 0.609573225241613, "learning_rate": 9.989509973647416e-07, "logits/chosen": -0.44240602850914, "logits/rejected": -0.2531934976577759, "logps/chosen": -4.6058125495910645, "logps/rejected": -4.872105121612549, "loss": 0.054, "rewards/accuracies": 0.6875, "rewards/chosen": -4.6058125495910645, "rewards/margins": 0.26629284024238586, "rewards/rejected": -4.872105121612549, "sft_loss": 4.4222412109375, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 0.4364736762052332, "learning_rate": 9.988477467616445e-07, "logits/chosen": -0.5186976194381714, "logits/rejected": -0.2430642545223236, "logps/chosen": -4.617299556732178, "logps/rejected": -4.82781982421875, "loss": 0.0548, "rewards/accuracies": 0.65625, "rewards/chosen": -4.617299556732178, "rewards/margins": 0.21052002906799316, "rewards/rejected": -4.82781982421875, "sft_loss": 4.397097110748291, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 1.137938867391629, "learning_rate": 9.987396563355205e-07, "logits/chosen": -0.517059326171875, "logits/rejected": -0.4032202661037445, "logps/chosen": -4.8206658363342285, "logps/rejected": -5.0536298751831055, "loss": 0.0554, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.8206658363342285, "rewards/margins": 0.2329637110233307, "rewards/rejected": -5.0536298751831055, "sft_loss": 4.577258110046387, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 1.2897163566434429, "learning_rate": 9.986267271350631e-07, "logits/chosen": -0.440659761428833, "logits/rejected": -0.23207931220531464, "logps/chosen": -4.67221736907959, "logps/rejected": -4.837889194488525, "loss": 0.0558, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.67221736907959, "rewards/margins": 0.1656724214553833, "rewards/rejected": -4.837889194488525, "sft_loss": 4.413691997528076, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 0.539702639060486, "learning_rate": 9.985089602559123e-07, "logits/chosen": -0.4820149838924408, "logits/rejected": -0.21975748240947723, "logps/chosen": -4.768852233886719, "logps/rejected": -5.050930976867676, "loss": 0.054, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.768852233886719, "rewards/margins": 0.28207892179489136, "rewards/rejected": -5.050930976867676, "sft_loss": 4.496817588806152, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 1.5387946865898154, "learning_rate": 9.983863568406428e-07, "logits/chosen": -0.35290640592575073, "logits/rejected": -0.29900383949279785, "logps/chosen": -4.636789798736572, "logps/rejected": -4.796220302581787, "loss": 0.0552, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.636789798736572, "rewards/margins": 0.1594308465719223, "rewards/rejected": -4.796220302581787, "sft_loss": 4.371980667114258, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 0.6503780378168439, "learning_rate": 9.982589180787532e-07, "logits/chosen": -0.4263625741004944, "logits/rejected": -0.2828146815299988, "logps/chosen": -4.904356002807617, "logps/rejected": -5.096182823181152, "loss": 0.0549, "rewards/accuracies": 0.59375, "rewards/chosen": -4.904356002807617, "rewards/margins": 0.19182677567005157, "rewards/rejected": -5.096182823181152, "sft_loss": 4.617617130279541, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 0.5447784030210343, "learning_rate": 9.981266452066553e-07, "logits/chosen": -0.5558158755302429, "logits/rejected": -0.32897305488586426, "logps/chosen": -4.607967853546143, "logps/rejected": -4.868782043457031, "loss": 0.0542, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.607967853546143, "rewards/margins": 0.2608141601085663, "rewards/rejected": -4.868782043457031, "sft_loss": 4.378183841705322, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 0.6795933899283331, "learning_rate": 9.979895395076608e-07, "logits/chosen": -0.502457857131958, "logits/rejected": -0.19821786880493164, "logps/chosen": -4.631689071655273, "logps/rejected": -4.896653175354004, "loss": 0.0539, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.631689071655273, "rewards/margins": 0.26496395468711853, "rewards/rejected": -4.896653175354004, "sft_loss": 4.34788703918457, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 0.6873470967068471, "learning_rate": 9.9784760231197e-07, "logits/chosen": -0.5005318522453308, "logits/rejected": -0.3380716145038605, "logps/chosen": -4.7272210121154785, "logps/rejected": -4.948512554168701, "loss": 0.0545, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.7272210121154785, "rewards/margins": 0.22129102051258087, "rewards/rejected": -4.948512554168701, "sft_loss": 4.455728054046631, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 0.38741905047989544, "learning_rate": 9.97700834996658e-07, "logits/chosen": -0.5377181768417358, "logits/rejected": -0.27840954065322876, "logps/chosen": -4.827449798583984, "logps/rejected": -5.016432285308838, "loss": 0.0545, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.827449798583984, "rewards/margins": 0.18898220360279083, "rewards/rejected": -5.016432285308838, "sft_loss": 4.430556297302246, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 0.6920780216870869, "learning_rate": 9.97549238985662e-07, "logits/chosen": -0.49541395902633667, "logits/rejected": -0.19519221782684326, "logps/chosen": -4.899749755859375, "logps/rejected": -5.18363094329834, "loss": 0.0544, "rewards/accuracies": 0.59375, "rewards/chosen": -4.899749755859375, "rewards/margins": 0.28388163447380066, "rewards/rejected": -5.18363094329834, "sft_loss": 4.593430519104004, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 0.6183567658360065, "learning_rate": 9.973928157497674e-07, "logits/chosen": -0.7071774005889893, "logits/rejected": -0.45961588621139526, "logps/chosen": -4.625585079193115, "logps/rejected": -4.892023086547852, "loss": 0.0537, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.625585079193115, "rewards/margins": 0.2664377987384796, "rewards/rejected": -4.892023086547852, "sft_loss": 4.314015865325928, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 0.8093467088520951, "learning_rate": 9.972315668065927e-07, "logits/chosen": -0.6060682535171509, "logits/rejected": -0.4073333740234375, "logps/chosen": -4.745782375335693, "logps/rejected": -4.918501377105713, "loss": 0.0541, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -4.745782375335693, "rewards/margins": 0.17271855473518372, "rewards/rejected": -4.918501377105713, "sft_loss": 4.408670902252197, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 0.5190267369651727, "learning_rate": 9.97065493720576e-07, "logits/chosen": -0.6034917235374451, "logits/rejected": -0.43217557668685913, "logps/chosen": -4.639141082763672, "logps/rejected": -4.880422115325928, "loss": 0.0547, "rewards/accuracies": 0.625, "rewards/chosen": -4.639141082763672, "rewards/margins": 0.24128136038780212, "rewards/rejected": -4.880422115325928, "sft_loss": 4.418794631958008, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 0.7299713178783463, "learning_rate": 9.968945981029594e-07, "logits/chosen": -0.5461141467094421, "logits/rejected": -0.2863614559173584, "logps/chosen": -4.790996551513672, "logps/rejected": -5.0908708572387695, "loss": 0.0531, "rewards/accuracies": 0.625, "rewards/chosen": -4.790996551513672, "rewards/margins": 0.299874484539032, "rewards/rejected": -5.0908708572387695, "sft_loss": 4.418035984039307, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 1.357858967463249, "learning_rate": 9.967188816117726e-07, "logits/chosen": -0.4381503462791443, "logits/rejected": -0.24060969054698944, "logps/chosen": -4.734060287475586, "logps/rejected": -5.104639530181885, "loss": 0.0556, "rewards/accuracies": 0.625, "rewards/chosen": -4.734060287475586, "rewards/margins": 0.3705799877643585, "rewards/rejected": -5.104639530181885, "sft_loss": 4.420103549957275, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 0.5571020275754393, "learning_rate": 9.965383459518179e-07, "logits/chosen": -0.6509796977043152, "logits/rejected": -0.3502160310745239, "logps/chosen": -5.010281085968018, "logps/rejected": -5.239097595214844, "loss": 0.0542, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -5.010281085968018, "rewards/margins": 0.22881627082824707, "rewards/rejected": -5.239097595214844, "sft_loss": 4.609499931335449, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 0.5180996933056597, "learning_rate": 9.963529928746533e-07, "logits/chosen": -0.5796464681625366, "logits/rejected": -0.3192821145057678, "logps/chosen": -4.6411871910095215, "logps/rejected": -4.946473598480225, "loss": 0.0545, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.6411871910095215, "rewards/margins": 0.305286169052124, "rewards/rejected": -4.946473598480225, "sft_loss": 4.233515739440918, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 0.383125627873452, "learning_rate": 9.961628241785746e-07, "logits/chosen": -0.7499212622642517, "logits/rejected": -0.5933259129524231, "logps/chosen": -4.820398807525635, "logps/rejected": -5.086698055267334, "loss": 0.0541, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.820398807525635, "rewards/margins": 0.266299307346344, "rewards/rejected": -5.086698055267334, "sft_loss": 4.4908623695373535, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 0.7665403862710062, "learning_rate": 9.959678417085998e-07, "logits/chosen": -0.684373140335083, "logits/rejected": -0.5438544154167175, "logps/chosen": -4.870017051696777, "logps/rejected": -5.046212196350098, "loss": 0.0552, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.870017051696777, "rewards/margins": 0.17619472742080688, "rewards/rejected": -5.046212196350098, "sft_loss": 4.622311115264893, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 0.41415913294882845, "learning_rate": 9.957680473564493e-07, "logits/chosen": -0.5799676775932312, "logits/rejected": -0.32691285014152527, "logps/chosen": -4.706719398498535, "logps/rejected": -4.918424129486084, "loss": 0.0541, "rewards/accuracies": 0.5625, "rewards/chosen": -4.706719398498535, "rewards/margins": 0.21170508861541748, "rewards/rejected": -4.918424129486084, "sft_loss": 4.271069526672363, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 0.443777235736431, "learning_rate": 9.95563443060529e-07, "logits/chosen": -0.7746785879135132, "logits/rejected": -0.47731414437294006, "logps/chosen": -4.810762405395508, "logps/rejected": -5.0177717208862305, "loss": 0.0541, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.810762405395508, "rewards/margins": 0.2070087492465973, "rewards/rejected": -5.0177717208862305, "sft_loss": 4.463230609893799, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 0.5265592870739666, "learning_rate": 9.95354030805911e-07, "logits/chosen": -0.7950371503829956, "logits/rejected": -0.5439642071723938, "logps/chosen": -4.769791603088379, "logps/rejected": -4.9892578125, "loss": 0.054, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.769791603088379, "rewards/margins": 0.21946604549884796, "rewards/rejected": -4.9892578125, "sft_loss": 4.466793537139893, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 0.8208738399246265, "learning_rate": 9.951398126243133e-07, "logits/chosen": -0.6324111223220825, "logits/rejected": -0.486175000667572, "logps/chosen": -4.708271026611328, "logps/rejected": -4.979205131530762, "loss": 0.0544, "rewards/accuracies": 0.625, "rewards/chosen": -4.708271026611328, "rewards/margins": 0.2709343433380127, "rewards/rejected": -4.979205131530762, "sft_loss": 4.414024829864502, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 0.7140836228407493, "learning_rate": 9.94920790594082e-07, "logits/chosen": -0.6593544483184814, "logits/rejected": -0.48353734612464905, "logps/chosen": -4.598227500915527, "logps/rejected": -4.899634838104248, "loss": 0.0535, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.598227500915527, "rewards/margins": 0.30140742659568787, "rewards/rejected": -4.899634838104248, "sft_loss": 4.312748908996582, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 0.4014903013901931, "learning_rate": 9.946969668401696e-07, "logits/chosen": -0.6135523915290833, "logits/rejected": -0.32322195172309875, "logps/chosen": -4.550875663757324, "logps/rejected": -4.979404926300049, "loss": 0.0535, "rewards/accuracies": 0.65625, "rewards/chosen": -4.550875663757324, "rewards/margins": 0.4285293519496918, "rewards/rejected": -4.979404926300049, "sft_loss": 4.304534912109375, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 1.1777799079408109, "learning_rate": 9.944683435341155e-07, "logits/chosen": -0.433992862701416, "logits/rejected": -0.33607035875320435, "logps/chosen": -4.869369029998779, "logps/rejected": -5.201943397521973, "loss": 0.0537, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.869369029998779, "rewards/margins": 0.3325735628604889, "rewards/rejected": -5.201943397521973, "sft_loss": 4.4167160987854, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.0762772411108017, "eval_logits/rejected": 0.21884751319885254, "eval_logps/chosen": -4.661361217498779, "eval_logps/rejected": -4.99029541015625, "eval_loss": 0.05288328602910042, "eval_rewards/accuracies": 0.6023738980293274, "eval_rewards/chosen": -4.661361217498779, "eval_rewards/margins": 0.32893452048301697, "eval_rewards/rejected": -4.99029541015625, "eval_runtime": 43.6891, "eval_samples_per_second": 30.786, "eval_sft_loss": 4.132972717285156, "eval_steps_per_second": 7.714, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 1.019461476829721, "learning_rate": 9.942349228940236e-07, "logits/chosen": -0.5534158945083618, "logits/rejected": -0.2658959627151489, "logps/chosen": -4.665492057800293, "logps/rejected": -5.222234725952148, "loss": 0.053, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.665492057800293, "rewards/margins": 0.5567426681518555, "rewards/rejected": -5.222234725952148, "sft_loss": 4.35896635055542, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 0.6905383851627591, "learning_rate": 9.939967071845424e-07, "logits/chosen": -0.5316272974014282, "logits/rejected": -0.44879570603370667, "logps/chosen": -4.868673324584961, "logps/rejected": -5.094768524169922, "loss": 0.0559, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.868673324584961, "rewards/margins": 0.22609524428844452, "rewards/rejected": -5.094768524169922, "sft_loss": 4.565918922424316, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 0.9640958718568238, "learning_rate": 9.937536987168413e-07, "logits/chosen": -0.5573722124099731, "logits/rejected": -0.34027716517448425, "logps/chosen": -4.796445846557617, "logps/rejected": -5.075738430023193, "loss": 0.0535, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.796445846557617, "rewards/margins": 0.27929285168647766, "rewards/rejected": -5.075738430023193, "sft_loss": 4.393874645233154, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 0.45255104653719846, "learning_rate": 9.935058998485896e-07, "logits/chosen": -0.5388621687889099, "logits/rejected": -0.5094455480575562, "logps/chosen": -4.702640533447266, "logps/rejected": -4.948770523071289, "loss": 0.0544, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.702640533447266, "rewards/margins": 0.24612972140312195, "rewards/rejected": -4.948770523071289, "sft_loss": 4.344797134399414, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 1.1306242091401009, "learning_rate": 9.932533129839333e-07, "logits/chosen": -0.7148083448410034, "logits/rejected": -0.5382518172264099, "logps/chosen": -4.7975382804870605, "logps/rejected": -4.953667640686035, "loss": 0.0552, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.7975382804870605, "rewards/margins": 0.15612894296646118, "rewards/rejected": -4.953667640686035, "sft_loss": 4.56071138381958, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 0.6894798373046558, "learning_rate": 9.929959405734711e-07, "logits/chosen": -0.533441424369812, "logits/rejected": -0.3360450863838196, "logps/chosen": -4.676575660705566, "logps/rejected": -4.950590133666992, "loss": 0.0543, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.676575660705566, "rewards/margins": 0.2740144729614258, "rewards/rejected": -4.950590133666992, "sft_loss": 4.489292621612549, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 0.5615731774657157, "learning_rate": 9.927337851142314e-07, "logits/chosen": -0.535950779914856, "logits/rejected": -0.3678244650363922, "logps/chosen": -4.804347991943359, "logps/rejected": -4.954916954040527, "loss": 0.0554, "rewards/accuracies": 0.5625, "rewards/chosen": -4.804347991943359, "rewards/margins": 0.1505691409111023, "rewards/rejected": -4.954916954040527, "sft_loss": 4.574199199676514, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 0.5138053605954034, "learning_rate": 9.924668491496474e-07, "logits/chosen": -0.598753809928894, "logits/rejected": -0.30034974217414856, "logps/chosen": -4.644226551055908, "logps/rejected": -4.914678573608398, "loss": 0.0542, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.644226551055908, "rewards/margins": 0.270452082157135, "rewards/rejected": -4.914678573608398, "sft_loss": 4.4098100662231445, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 0.3953000760486427, "learning_rate": 9.92195135269533e-07, "logits/chosen": -0.571025013923645, "logits/rejected": -0.4940189719200134, "logps/chosen": -4.623247146606445, "logps/rejected": -4.7912702560424805, "loss": 0.0547, "rewards/accuracies": 0.625, "rewards/chosen": -4.623247146606445, "rewards/margins": 0.16802339255809784, "rewards/rejected": -4.7912702560424805, "sft_loss": 4.395881175994873, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 0.9784287629098288, "learning_rate": 9.919186461100574e-07, "logits/chosen": -0.7064191699028015, "logits/rejected": -0.5537311434745789, "logps/chosen": -4.647214889526367, "logps/rejected": -4.877057075500488, "loss": 0.0542, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.647214889526367, "rewards/margins": 0.22984282672405243, "rewards/rejected": -4.877057075500488, "sft_loss": 4.406468391418457, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 0.6202522928914502, "learning_rate": 9.9163738435372e-07, "logits/chosen": -0.6961187124252319, "logits/rejected": -0.4674125611782074, "logps/chosen": -4.66948938369751, "logps/rejected": -5.008526802062988, "loss": 0.0547, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.66948938369751, "rewards/margins": 0.33903709053993225, "rewards/rejected": -5.008526802062988, "sft_loss": 4.422011852264404, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 1.1274872046878186, "learning_rate": 9.913513527293234e-07, "logits/chosen": -0.8537279367446899, "logits/rejected": -0.6118448376655579, "logps/chosen": -4.752366065979004, "logps/rejected": -5.068416595458984, "loss": 0.0541, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.752366065979004, "rewards/margins": 0.3160504400730133, "rewards/rejected": -5.068416595458984, "sft_loss": 4.435427665710449, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 0.9165527333766942, "learning_rate": 9.910605540119474e-07, "logits/chosen": -0.7939995527267456, "logits/rejected": -0.6034280061721802, "logps/chosen": -4.908832550048828, "logps/rejected": -5.1738457679748535, "loss": 0.0545, "rewards/accuracies": 0.625, "rewards/chosen": -4.908832550048828, "rewards/margins": 0.2650133967399597, "rewards/rejected": -5.1738457679748535, "sft_loss": 4.486456394195557, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 0.42446011187865473, "learning_rate": 9.907649910229227e-07, "logits/chosen": -0.9538064002990723, "logits/rejected": -0.5534366965293884, "logps/chosen": -4.690494537353516, "logps/rejected": -4.987832546234131, "loss": 0.0538, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.690494537353516, "rewards/margins": 0.2973388135433197, "rewards/rejected": -4.987832546234131, "sft_loss": 4.383624076843262, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 0.5446443287919899, "learning_rate": 9.90464666629803e-07, "logits/chosen": -0.6814507246017456, "logits/rejected": -0.545750081539154, "logps/chosen": -4.67910099029541, "logps/rejected": -4.943177223205566, "loss": 0.0551, "rewards/accuracies": 0.59375, "rewards/chosen": -4.67910099029541, "rewards/margins": 0.26407718658447266, "rewards/rejected": -4.943177223205566, "sft_loss": 4.388939380645752, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 0.7810222448754378, "learning_rate": 9.901595837463363e-07, "logits/chosen": -0.6461896896362305, "logits/rejected": -0.390037477016449, "logps/chosen": -4.738075256347656, "logps/rejected": -5.075900554656982, "loss": 0.0543, "rewards/accuracies": 0.625, "rewards/chosen": -4.738075256347656, "rewards/margins": 0.33782586455345154, "rewards/rejected": -5.075900554656982, "sft_loss": 4.471536159515381, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 0.46368875012727806, "learning_rate": 9.898497453324384e-07, "logits/chosen": -0.6110280752182007, "logits/rejected": -0.501623809337616, "logps/chosen": -4.8805389404296875, "logps/rejected": -5.093916893005371, "loss": 0.054, "rewards/accuracies": 0.65625, "rewards/chosen": -4.8805389404296875, "rewards/margins": 0.21337802708148956, "rewards/rejected": -5.093916893005371, "sft_loss": 4.527132034301758, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 0.5772927826805968, "learning_rate": 9.895351543941628e-07, "logits/chosen": -0.6308537721633911, "logits/rejected": -0.45382413268089294, "logps/chosen": -4.536052703857422, "logps/rejected": -4.7554030418396, "loss": 0.0549, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -4.536052703857422, "rewards/margins": 0.21935030817985535, "rewards/rejected": -4.7554030418396, "sft_loss": 4.236863136291504, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 0.5909700614108395, "learning_rate": 9.892158139836724e-07, "logits/chosen": -0.667130172252655, "logits/rejected": -0.5236655473709106, "logps/chosen": -5.009900093078613, "logps/rejected": -5.105405330657959, "loss": 0.0549, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -5.009900093078613, "rewards/margins": 0.09550575166940689, "rewards/rejected": -5.105405330657959, "sft_loss": 4.719082355499268, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 1.2509012937984993, "learning_rate": 9.88891727199209e-07, "logits/chosen": -0.7147120833396912, "logits/rejected": -0.5922271609306335, "logps/chosen": -4.690701961517334, "logps/rejected": -4.896805763244629, "loss": 0.0538, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.690701961517334, "rewards/margins": 0.20610396564006805, "rewards/rejected": -4.896805763244629, "sft_loss": 4.374290466308594, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 0.5020956780576539, "learning_rate": 9.885628971850641e-07, "logits/chosen": -0.6695514917373657, "logits/rejected": -0.40942034125328064, "logps/chosen": -4.461915016174316, "logps/rejected": -4.818450927734375, "loss": 0.0538, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.461915016174316, "rewards/margins": 0.35653623938560486, "rewards/rejected": -4.818450927734375, "sft_loss": 4.175963878631592, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 0.2706776295092486, "learning_rate": 9.882293271315481e-07, "logits/chosen": -0.7963489294052124, "logits/rejected": -0.6448882818222046, "logps/chosen": -4.85056209564209, "logps/rejected": -5.1240363121032715, "loss": 0.0541, "rewards/accuracies": 0.625, "rewards/chosen": -4.85056209564209, "rewards/margins": 0.2734738886356354, "rewards/rejected": -5.1240363121032715, "sft_loss": 4.56730842590332, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 0.7830577955619996, "learning_rate": 9.878910202749589e-07, "logits/chosen": -0.7318392992019653, "logits/rejected": -0.4440035820007324, "logps/chosen": -4.606733322143555, "logps/rejected": -4.9441142082214355, "loss": 0.0535, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.606733322143555, "rewards/margins": 0.33738085627555847, "rewards/rejected": -4.9441142082214355, "sft_loss": 4.38753080368042, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 0.5249861703336051, "learning_rate": 9.875479798975512e-07, "logits/chosen": -0.5288979411125183, "logits/rejected": -0.25817009806632996, "logps/chosen": -4.555031776428223, "logps/rejected": -4.973117828369141, "loss": 0.0536, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.555031776428223, "rewards/margins": 0.41808605194091797, "rewards/rejected": -4.973117828369141, "sft_loss": 4.3153204917907715, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 0.6958792681399926, "learning_rate": 9.87200209327504e-07, "logits/chosen": -0.6519374847412109, "logits/rejected": -0.3630174994468689, "logps/chosen": -4.784842014312744, "logps/rejected": -4.913394927978516, "loss": 0.0546, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -4.784842014312744, "rewards/margins": 0.12855303287506104, "rewards/rejected": -4.913394927978516, "sft_loss": 4.431909561157227, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 0.6857385655509702, "learning_rate": 9.868477119388894e-07, "logits/chosen": -0.6547890901565552, "logits/rejected": -0.5493338704109192, "logps/chosen": -4.803074836730957, "logps/rejected": -5.25410795211792, "loss": 0.0538, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.803074836730957, "rewards/margins": 0.45103350281715393, "rewards/rejected": -5.25410795211792, "sft_loss": 4.5226616859436035, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 0.7494384327463705, "learning_rate": 9.864904911516383e-07, "logits/chosen": -0.5585234761238098, "logits/rejected": -0.4689217209815979, "logps/chosen": -4.659033298492432, "logps/rejected": -4.939300537109375, "loss": 0.0542, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.659033298492432, "rewards/margins": 0.28026682138442993, "rewards/rejected": -4.939300537109375, "sft_loss": 4.326298713684082, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 0.5667236667391294, "learning_rate": 9.861285504315084e-07, "logits/chosen": -0.549680233001709, "logits/rejected": -0.4027118682861328, "logps/chosen": -4.72601318359375, "logps/rejected": -5.003291130065918, "loss": 0.0547, "rewards/accuracies": 0.625, "rewards/chosen": -4.72601318359375, "rewards/margins": 0.2772785723209381, "rewards/rejected": -5.003291130065918, "sft_loss": 4.4270710945129395, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 0.3704541163727195, "learning_rate": 9.857618932900502e-07, "logits/chosen": -0.6717230081558228, "logits/rejected": -0.4291006624698639, "logps/chosen": -4.774092197418213, "logps/rejected": -5.1116228103637695, "loss": 0.0542, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.774092197418213, "rewards/margins": 0.3375304341316223, "rewards/rejected": -5.1116228103637695, "sft_loss": 4.614851474761963, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 1.080406869888111, "learning_rate": 9.853905232845727e-07, "logits/chosen": -0.5478076934814453, "logits/rejected": -0.3193608224391937, "logps/chosen": -4.520052909851074, "logps/rejected": -4.752264976501465, "loss": 0.0547, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.520052909851074, "rewards/margins": 0.23221249878406525, "rewards/rejected": -4.752264976501465, "sft_loss": 4.331027507781982, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 0.47093914637526024, "learning_rate": 9.850144440181095e-07, "logits/chosen": -0.48740583658218384, "logits/rejected": -0.2183370292186737, "logps/chosen": -4.712460994720459, "logps/rejected": -4.964376926422119, "loss": 0.0543, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.712460994720459, "rewards/margins": 0.25191575288772583, "rewards/rejected": -4.964376926422119, "sft_loss": 4.447497367858887, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 0.9253745426059107, "learning_rate": 9.846336591393832e-07, "logits/chosen": -0.45807933807373047, "logits/rejected": -0.2740600109100342, "logps/chosen": -4.678889751434326, "logps/rejected": -4.9867753982543945, "loss": 0.055, "rewards/accuracies": 0.6875, "rewards/chosen": -4.678889751434326, "rewards/margins": 0.30788561701774597, "rewards/rejected": -4.9867753982543945, "sft_loss": 4.41696834564209, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 0.5562158528705252, "learning_rate": 9.842481723427704e-07, "logits/chosen": -0.43384265899658203, "logits/rejected": -0.3705459535121918, "logps/chosen": -4.858570098876953, "logps/rejected": -5.0885772705078125, "loss": 0.0552, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.858570098876953, "rewards/margins": 0.23000743985176086, "rewards/rejected": -5.0885772705078125, "sft_loss": 4.645135402679443, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 0.46758288708109713, "learning_rate": 9.838579873682658e-07, "logits/chosen": -0.4290435314178467, "logits/rejected": -0.42416420578956604, "logps/chosen": -4.561893463134766, "logps/rejected": -4.798999786376953, "loss": 0.0547, "rewards/accuracies": 0.5625, "rewards/chosen": -4.561893463134766, "rewards/margins": 0.2371063530445099, "rewards/rejected": -4.798999786376953, "sft_loss": 4.215584754943848, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 0.9168000242182311, "learning_rate": 9.834631080014457e-07, "logits/chosen": -0.6926376223564148, "logits/rejected": -0.4112038016319275, "logps/chosen": -4.897233486175537, "logps/rejected": -5.223374366760254, "loss": 0.0536, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.897233486175537, "rewards/margins": 0.3261413872241974, "rewards/rejected": -5.223374366760254, "sft_loss": 4.6407694816589355, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 0.8592004725850547, "learning_rate": 9.830635380734312e-07, "logits/chosen": -0.6455878615379333, "logits/rejected": -0.4143894612789154, "logps/chosen": -4.61000919342041, "logps/rejected": -4.840395450592041, "loss": 0.0545, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.61000919342041, "rewards/margins": 0.23038557171821594, "rewards/rejected": -4.840395450592041, "sft_loss": 4.316131591796875, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 0.4155059102576832, "learning_rate": 9.826592814608517e-07, "logits/chosen": -0.5832679867744446, "logits/rejected": -0.30334407091140747, "logps/chosen": -4.680327415466309, "logps/rejected": -4.880679130554199, "loss": 0.0543, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.680327415466309, "rewards/margins": 0.20035116374492645, "rewards/rejected": -4.880679130554199, "sft_loss": 4.373396873474121, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 0.6151022174046136, "learning_rate": 9.822503420858067e-07, "logits/chosen": -0.5774039030075073, "logits/rejected": -0.5873244404792786, "logps/chosen": -4.89186429977417, "logps/rejected": -5.018913269042969, "loss": 0.0546, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -4.89186429977417, "rewards/margins": 0.12704893946647644, "rewards/rejected": -5.018913269042969, "sft_loss": 4.576397895812988, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 0.9617406539039807, "learning_rate": 9.818367239158277e-07, "logits/chosen": -0.4993751645088196, "logits/rejected": -0.41326743364334106, "logps/chosen": -4.793762683868408, "logps/rejected": -4.96630334854126, "loss": 0.0542, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.793762683868408, "rewards/margins": 0.1725403368473053, "rewards/rejected": -4.96630334854126, "sft_loss": 4.41725492477417, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 0.9060064144194804, "learning_rate": 9.8141843096384e-07, "logits/chosen": -0.5664402842521667, "logits/rejected": -0.34098461270332336, "logps/chosen": -4.702371120452881, "logps/rejected": -5.04431676864624, "loss": 0.0542, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.702371120452881, "rewards/margins": 0.3419460654258728, "rewards/rejected": -5.04431676864624, "sft_loss": 4.421466827392578, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 0.5086244800614527, "learning_rate": 9.809954672881237e-07, "logits/chosen": -0.46155256032943726, "logits/rejected": -0.2267765998840332, "logps/chosen": -4.836634159088135, "logps/rejected": -5.148859024047852, "loss": 0.0543, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.836634159088135, "rewards/margins": 0.3122252821922302, "rewards/rejected": -5.148859024047852, "sft_loss": 4.532876968383789, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 0.5176592683524385, "learning_rate": 9.80567836992274e-07, "logits/chosen": -0.47870248556137085, "logits/rejected": -0.22733981907367706, "logps/chosen": -4.524051666259766, "logps/rejected": -4.881123065948486, "loss": 0.0549, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.524051666259766, "rewards/margins": 0.35707104206085205, "rewards/rejected": -4.881123065948486, "sft_loss": 4.257218837738037, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 0.6158973094099502, "learning_rate": 9.801355442251625e-07, "logits/chosen": -0.7883400321006775, "logits/rejected": -0.5459173917770386, "logps/chosen": -4.803179740905762, "logps/rejected": -5.041607856750488, "loss": 0.0545, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.803179740905762, "rewards/margins": 0.23842862248420715, "rewards/rejected": -5.041607856750488, "sft_loss": 4.608723163604736, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 0.38857303823842887, "learning_rate": 9.796985931808949e-07, "logits/chosen": -0.8481922149658203, "logits/rejected": -0.5922120809555054, "logps/chosen": -4.708874702453613, "logps/rejected": -4.960658073425293, "loss": 0.0537, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.708874702453613, "rewards/margins": 0.25178369879722595, "rewards/rejected": -4.960658073425293, "sft_loss": 4.406599521636963, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 0.37995724426168664, "learning_rate": 9.792569880987724e-07, "logits/chosen": -0.7771550416946411, "logits/rejected": -0.6147995591163635, "logps/chosen": -4.492480754852295, "logps/rejected": -4.806388854980469, "loss": 0.0536, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.492480754852295, "rewards/margins": 0.3139081597328186, "rewards/rejected": -4.806388854980469, "sft_loss": 4.236155986785889, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 0.5433034173994814, "learning_rate": 9.788107332632493e-07, "logits/chosen": -0.7106307744979858, "logits/rejected": -0.619476318359375, "logps/chosen": -4.824548244476318, "logps/rejected": -4.9400177001953125, "loss": 0.0569, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.824548244476318, "rewards/margins": 0.11546945571899414, "rewards/rejected": -4.9400177001953125, "sft_loss": 4.580724239349365, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 0.39975295156815127, "learning_rate": 9.783598330038924e-07, "logits/chosen": -0.7352392077445984, "logits/rejected": -0.5586301684379578, "logps/chosen": -4.770339488983154, "logps/rejected": -4.9732537269592285, "loss": 0.0541, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.770339488983154, "rewards/margins": 0.20291383564472198, "rewards/rejected": -4.9732537269592285, "sft_loss": 4.535826206207275, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 0.5679160526366398, "learning_rate": 9.779042916953376e-07, "logits/chosen": -0.6250237226486206, "logits/rejected": -0.35187873244285583, "logps/chosen": -4.4509196281433105, "logps/rejected": -4.852322578430176, "loss": 0.0538, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.4509196281433105, "rewards/margins": 0.4014025330543518, "rewards/rejected": -4.852322578430176, "sft_loss": 4.283907413482666, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 0.45573423362566345, "learning_rate": 9.774441137572487e-07, "logits/chosen": -0.6780288219451904, "logits/rejected": -0.4629104733467102, "logps/chosen": -4.64642858505249, "logps/rejected": -4.976282596588135, "loss": 0.0532, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.64642858505249, "rewards/margins": 0.32985401153564453, "rewards/rejected": -4.976282596588135, "sft_loss": 4.352892875671387, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 0.422512298556522, "learning_rate": 9.76979303654274e-07, "logits/chosen": -0.6532023549079895, "logits/rejected": -0.502606213092804, "logps/chosen": -4.712841033935547, "logps/rejected": -5.147298336029053, "loss": 0.0539, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.712841033935547, "rewards/margins": 0.434457391500473, "rewards/rejected": -5.147298336029053, "sft_loss": 4.4879865646362305, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 0.8131359180672422, "learning_rate": 9.765098658960035e-07, "logits/chosen": -0.5284560918807983, "logits/rejected": -0.4684422016143799, "logps/chosen": -4.553152561187744, "logps/rejected": -4.902011871337891, "loss": 0.0535, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.553152561187744, "rewards/margins": 0.3488597273826599, "rewards/rejected": -4.902011871337891, "sft_loss": 4.281155586242676, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 0.9015355821027234, "learning_rate": 9.76035805036924e-07, "logits/chosen": -0.518097996711731, "logits/rejected": -0.28143590688705444, "logps/chosen": -4.709748268127441, "logps/rejected": -5.019460201263428, "loss": 0.0537, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.709748268127441, "rewards/margins": 0.30971240997314453, "rewards/rejected": -5.019460201263428, "sft_loss": 4.399357795715332, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 0.749882232299179, "learning_rate": 9.755571256763764e-07, "logits/chosen": -0.4936675429344177, "logits/rejected": -0.32630711793899536, "logps/chosen": -4.695061683654785, "logps/rejected": -5.094540596008301, "loss": 0.0531, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.695061683654785, "rewards/margins": 0.3994784355163574, "rewards/rejected": -5.094540596008301, "sft_loss": 4.374762535095215, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 0.4335609502427654, "learning_rate": 9.750738324585097e-07, "logits/chosen": -0.5365744233131409, "logits/rejected": -0.21823947131633759, "logps/chosen": -4.555490970611572, "logps/rejected": -4.938295364379883, "loss": 0.0536, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.555490970611572, "rewards/margins": 0.3828045725822449, "rewards/rejected": -4.938295364379883, "sft_loss": 4.294363975524902, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 0.5841385822714422, "learning_rate": 9.74585930072237e-07, "logits/chosen": -0.5023024082183838, "logits/rejected": -0.32627633213996887, "logps/chosen": -4.687676906585693, "logps/rejected": -5.107883453369141, "loss": 0.0534, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.687676906585693, "rewards/margins": 0.42020702362060547, "rewards/rejected": -5.107883453369141, "sft_loss": 4.394619464874268, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 0.3573018957478717, "learning_rate": 9.740934232511892e-07, "logits/chosen": -0.6413004398345947, "logits/rejected": -0.5170631408691406, "logps/chosen": -4.733739376068115, "logps/rejected": -4.969666957855225, "loss": 0.0544, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.733739376068115, "rewards/margins": 0.23592762649059296, "rewards/rejected": -4.969666957855225, "sft_loss": 4.418457984924316, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 0.9387854678624943, "learning_rate": 9.735963167736698e-07, "logits/chosen": -0.5804794430732727, "logits/rejected": -0.3907211422920227, "logps/chosen": -4.788337707519531, "logps/rejected": -5.068944454193115, "loss": 0.0541, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.788337707519531, "rewards/margins": 0.28060680627822876, "rewards/rejected": -5.068944454193115, "sft_loss": 4.502654075622559, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 0.574041624870192, "learning_rate": 9.730946154626078e-07, "logits/chosen": -0.5329641699790955, "logits/rejected": -0.40476202964782715, "logps/chosen": -4.6259355545043945, "logps/rejected": -4.907242774963379, "loss": 0.0537, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.6259355545043945, "rewards/margins": 0.2813071608543396, "rewards/rejected": -4.907242774963379, "sft_loss": 4.300456523895264, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 0.7365756470251096, "learning_rate": 9.725883241855117e-07, "logits/chosen": -0.7071422338485718, "logits/rejected": -0.5051488876342773, "logps/chosen": -4.796841621398926, "logps/rejected": -5.196400165557861, "loss": 0.0542, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.796841621398926, "rewards/margins": 0.39955854415893555, "rewards/rejected": -5.196400165557861, "sft_loss": 4.557826995849609, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 0.5917366709488491, "learning_rate": 9.720774478544218e-07, "logits/chosen": -0.4832594394683838, "logits/rejected": -0.29662543535232544, "logps/chosen": -4.595888614654541, "logps/rejected": -4.967526435852051, "loss": 0.0533, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.595888614654541, "rewards/margins": 0.3716380000114441, "rewards/rejected": -4.967526435852051, "sft_loss": 4.249319553375244, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 0.3847051965590612, "learning_rate": 9.715619914258624e-07, "logits/chosen": -0.6779504418373108, "logits/rejected": -0.5655364394187927, "logps/chosen": -4.718003273010254, "logps/rejected": -4.955427646636963, "loss": 0.0536, "rewards/accuracies": 0.625, "rewards/chosen": -4.718003273010254, "rewards/margins": 0.2374243438243866, "rewards/rejected": -4.955427646636963, "sft_loss": 4.384034156799316, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 0.543269055034267, "learning_rate": 9.710419599007937e-07, "logits/chosen": -0.592439591884613, "logits/rejected": -0.3935183882713318, "logps/chosen": -4.704367637634277, "logps/rejected": -4.9490461349487305, "loss": 0.0542, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.704367637634277, "rewards/margins": 0.24467873573303223, "rewards/rejected": -4.9490461349487305, "sft_loss": 4.478358745574951, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 0.40955078166194536, "learning_rate": 9.705173583245643e-07, "logits/chosen": -0.6035395860671997, "logits/rejected": -0.35157328844070435, "logps/chosen": -4.598028659820557, "logps/rejected": -4.925137042999268, "loss": 0.0535, "rewards/accuracies": 0.625, "rewards/chosen": -4.598028659820557, "rewards/margins": 0.32710808515548706, "rewards/rejected": -4.925137042999268, "sft_loss": 4.3422746658325195, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 0.2948789244920242, "learning_rate": 9.699881917868609e-07, "logits/chosen": -0.6719862222671509, "logits/rejected": -0.5192317962646484, "logps/chosen": -4.481719970703125, "logps/rejected": -4.777043342590332, "loss": 0.0535, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.481719970703125, "rewards/margins": 0.29532328248023987, "rewards/rejected": -4.777043342590332, "sft_loss": 4.248114585876465, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 0.8245398173478702, "learning_rate": 9.694544654216594e-07, "logits/chosen": -0.6878781318664551, "logits/rejected": -0.4318612515926361, "logps/chosen": -4.79897403717041, "logps/rejected": -5.210911750793457, "loss": 0.0535, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.79897403717041, "rewards/margins": 0.4119381308555603, "rewards/rejected": -5.210911750793457, "sft_loss": 4.552307605743408, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 1.14084627054799, "learning_rate": 9.689161844071755e-07, "logits/chosen": -0.3799501657485962, "logits/rejected": -0.2871240973472595, "logps/chosen": -4.454643726348877, "logps/rejected": -4.7599263191223145, "loss": 0.0536, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.454643726348877, "rewards/margins": 0.3052830696105957, "rewards/rejected": -4.7599263191223145, "sft_loss": 4.177380084991455, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 0.8660037391087473, "learning_rate": 9.683733539658138e-07, "logits/chosen": -0.5278415083885193, "logits/rejected": -0.268408864736557, "logps/chosen": -4.686205863952637, "logps/rejected": -5.040558815002441, "loss": 0.0532, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.686205863952637, "rewards/margins": 0.3543532192707062, "rewards/rejected": -5.040558815002441, "sft_loss": 4.330956935882568, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 0.4970819248895511, "learning_rate": 9.678259793641178e-07, "logits/chosen": -0.48409318923950195, "logits/rejected": -0.44155415892601013, "logps/chosen": -4.872367858886719, "logps/rejected": -5.101282119750977, "loss": 0.0547, "rewards/accuracies": 0.59375, "rewards/chosen": -4.872367858886719, "rewards/margins": 0.22891390323638916, "rewards/rejected": -5.101282119750977, "sft_loss": 4.596518039703369, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 0.5812923017369568, "learning_rate": 9.672740659127183e-07, "logits/chosen": -0.5782763361930847, "logits/rejected": -0.40878796577453613, "logps/chosen": -4.521733283996582, "logps/rejected": -4.903876304626465, "loss": 0.0529, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.521733283996582, "rewards/margins": 0.382142573595047, "rewards/rejected": -4.903876304626465, "sft_loss": 4.200924396514893, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 0.44409718097577794, "learning_rate": 9.667176189662818e-07, "logits/chosen": -0.5531161427497864, "logits/rejected": -0.3877137005329132, "logps/chosen": -4.67879581451416, "logps/rejected": -4.9793853759765625, "loss": 0.0538, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.67879581451416, "rewards/margins": 0.3005899488925934, "rewards/rejected": -4.9793853759765625, "sft_loss": 4.385974407196045, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 0.5761368695976793, "learning_rate": 9.661566439234592e-07, "logits/chosen": -0.4771701395511627, "logits/rejected": -0.3725683093070984, "logps/chosen": -4.723606586456299, "logps/rejected": -4.984592914581299, "loss": 0.0541, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.723606586456299, "rewards/margins": 0.26098623871803284, "rewards/rejected": -4.984592914581299, "sft_loss": 4.454628944396973, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 0.5556713770938857, "learning_rate": 9.655911462268327e-07, "logits/chosen": -0.4481441378593445, "logits/rejected": -0.30789104104042053, "logps/chosen": -4.546295166015625, "logps/rejected": -4.988908767700195, "loss": 0.0526, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.546295166015625, "rewards/margins": 0.44261303544044495, "rewards/rejected": -4.988908767700195, "sft_loss": 4.256467819213867, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 0.4875769722274501, "learning_rate": 9.650211313628636e-07, "logits/chosen": -0.5960179567337036, "logits/rejected": -0.4614837169647217, "logps/chosen": -4.707175254821777, "logps/rejected": -4.891497611999512, "loss": 0.0553, "rewards/accuracies": 0.59375, "rewards/chosen": -4.707175254821777, "rewards/margins": 0.18432240188121796, "rewards/rejected": -4.891497611999512, "sft_loss": 4.4733428955078125, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 0.4014279228736453, "learning_rate": 9.644466048618386e-07, "logits/chosen": -0.6922046542167664, "logits/rejected": -0.4939608573913574, "logps/chosen": -5.006098747253418, "logps/rejected": -5.234212398529053, "loss": 0.0542, "rewards/accuracies": 0.59375, "rewards/chosen": -5.006098747253418, "rewards/margins": 0.22811400890350342, "rewards/rejected": -5.234212398529053, "sft_loss": 4.620914459228516, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 0.8535156242979435, "learning_rate": 9.63867572297816e-07, "logits/chosen": -0.705421507358551, "logits/rejected": -0.4823324680328369, "logps/chosen": -4.5951361656188965, "logps/rejected": -4.936031818389893, "loss": 0.0543, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.5951361656188965, "rewards/margins": 0.34089553356170654, "rewards/rejected": -4.936031818389893, "sft_loss": 4.360302925109863, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 0.30103834357953974, "learning_rate": 9.632840392885727e-07, "logits/chosen": -0.7017086744308472, "logits/rejected": -0.4853813648223877, "logps/chosen": -4.485077857971191, "logps/rejected": -4.886071681976318, "loss": 0.0536, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.485077857971191, "rewards/margins": 0.4009944498538971, "rewards/rejected": -4.886071681976318, "sft_loss": 4.283857822418213, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 0.4847167733676787, "learning_rate": 9.626960114955483e-07, "logits/chosen": -0.6630807518959045, "logits/rejected": -0.46446362137794495, "logps/chosen": -4.6696085929870605, "logps/rejected": -5.147528648376465, "loss": 0.0533, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.6696085929870605, "rewards/margins": 0.47792062163352966, "rewards/rejected": -5.147528648376465, "sft_loss": 4.391916751861572, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 0.6465226432916064, "learning_rate": 9.621034946237909e-07, "logits/chosen": -0.7046906352043152, "logits/rejected": -0.514801025390625, "logps/chosen": -4.797091007232666, "logps/rejected": -5.1934494972229, "loss": 0.0538, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.797091007232666, "rewards/margins": 0.3963584899902344, "rewards/rejected": -5.1934494972229, "sft_loss": 4.548532485961914, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 0.6053610691894898, "learning_rate": 9.615064944219021e-07, "logits/chosen": -0.5024127960205078, "logits/rejected": -0.33732375502586365, "logps/chosen": -4.484908103942871, "logps/rejected": -4.776598930358887, "loss": 0.0528, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.484908103942871, "rewards/margins": 0.2916913628578186, "rewards/rejected": -4.776598930358887, "sft_loss": 4.1641082763671875, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 1.0983117178191424, "learning_rate": 9.609050166819803e-07, "logits/chosen": -0.49139395356178284, "logits/rejected": -0.4269639551639557, "logps/chosen": -4.553410530090332, "logps/rejected": -4.853245735168457, "loss": 0.0545, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.553410530090332, "rewards/margins": 0.29983487725257874, "rewards/rejected": -4.853245735168457, "sft_loss": 4.232519149780273, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": -0.025698307901620865, "eval_logits/rejected": 0.09142318367958069, "eval_logps/chosen": -4.657962799072266, "eval_logps/rejected": -5.0485663414001465, "eval_loss": 0.05232247710227966, "eval_rewards/accuracies": 0.6350148320198059, "eval_rewards/chosen": -4.657962799072266, "eval_rewards/margins": 0.390603631734848, "eval_rewards/rejected": -5.0485663414001465, "eval_runtime": 43.3819, "eval_samples_per_second": 31.004, "eval_sft_loss": 4.285591125488281, "eval_steps_per_second": 7.768, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 0.6598117160164223, "learning_rate": 9.602990672395653e-07, "logits/chosen": -0.6095829010009766, "logits/rejected": -0.38074302673339844, "logps/chosen": -4.750426292419434, "logps/rejected": -5.1771368980407715, "loss": 0.0529, "rewards/accuracies": 0.6875, "rewards/chosen": -4.750426292419434, "rewards/margins": 0.42671045660972595, "rewards/rejected": -5.1771368980407715, "sft_loss": 4.473883628845215, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 0.5694728776522204, "learning_rate": 9.59688651973581e-07, "logits/chosen": -0.5579411387443542, "logits/rejected": -0.27562081813812256, "logps/chosen": -4.6332831382751465, "logps/rejected": -5.050353050231934, "loss": 0.0541, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.6332831382751465, "rewards/margins": 0.4170694351196289, "rewards/rejected": -5.050353050231934, "sft_loss": 4.405447959899902, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 1.128531677554023, "learning_rate": 9.590737768062792e-07, "logits/chosen": -0.5781430602073669, "logits/rejected": -0.4014016091823578, "logps/chosen": -4.605533599853516, "logps/rejected": -4.950179100036621, "loss": 0.0537, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.605533599853516, "rewards/margins": 0.34464550018310547, "rewards/rejected": -4.950179100036621, "sft_loss": 4.295781135559082, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 0.44594614543377137, "learning_rate": 9.584544477031816e-07, "logits/chosen": -0.3928416967391968, "logits/rejected": -0.23103955388069153, "logps/chosen": -4.714247703552246, "logps/rejected": -5.023430347442627, "loss": 0.0537, "rewards/accuracies": 0.625, "rewards/chosen": -4.714247703552246, "rewards/margins": 0.30918246507644653, "rewards/rejected": -5.023430347442627, "sft_loss": 4.452761650085449, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 0.5839455517038359, "learning_rate": 9.578306706730215e-07, "logits/chosen": -0.6410695314407349, "logits/rejected": -0.3730124831199646, "logps/chosen": -4.631039619445801, "logps/rejected": -4.9336748123168945, "loss": 0.0531, "rewards/accuracies": 0.65625, "rewards/chosen": -4.631039619445801, "rewards/margins": 0.3026350438594818, "rewards/rejected": -4.9336748123168945, "sft_loss": 4.3163886070251465, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 0.643742164156604, "learning_rate": 9.572024517676865e-07, "logits/chosen": -0.5921844840049744, "logits/rejected": -0.46048134565353394, "logps/chosen": -4.5794901847839355, "logps/rejected": -4.9018235206604, "loss": 0.055, "rewards/accuracies": 0.65625, "rewards/chosen": -4.5794901847839355, "rewards/margins": 0.3223329186439514, "rewards/rejected": -4.9018235206604, "sft_loss": 4.405655860900879, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 0.45905634809888224, "learning_rate": 9.565697970821593e-07, "logits/chosen": -0.6735567450523376, "logits/rejected": -0.4286714196205139, "logps/chosen": -4.814480781555176, "logps/rejected": -5.108050346374512, "loss": 0.0536, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.814480781555176, "rewards/margins": 0.2935686707496643, "rewards/rejected": -5.108050346374512, "sft_loss": 4.5338850021362305, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 0.3718124611962775, "learning_rate": 9.559327127544585e-07, "logits/chosen": -0.7341528534889221, "logits/rejected": -0.5741795897483826, "logps/chosen": -4.655917167663574, "logps/rejected": -4.946690559387207, "loss": 0.0536, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.655917167663574, "rewards/margins": 0.2907727062702179, "rewards/rejected": -4.946690559387207, "sft_loss": 4.393622398376465, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 0.47313857382521357, "learning_rate": 9.552912049655789e-07, "logits/chosen": -0.5752710103988647, "logits/rejected": -0.3442533612251282, "logps/chosen": -4.453244209289551, "logps/rejected": -4.790841102600098, "loss": 0.0541, "rewards/accuracies": 0.65625, "rewards/chosen": -4.453244209289551, "rewards/margins": 0.3375973701477051, "rewards/rejected": -4.790841102600098, "sft_loss": 4.221789836883545, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 0.3080289767371704, "learning_rate": 9.546452799394315e-07, "logits/chosen": -0.6375377178192139, "logits/rejected": -0.35768041014671326, "logps/chosen": -4.7251386642456055, "logps/rejected": -5.06402063369751, "loss": 0.0537, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.7251386642456055, "rewards/margins": 0.3388821482658386, "rewards/rejected": -5.06402063369751, "sft_loss": 4.505545616149902, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 0.46157795630326126, "learning_rate": 9.539949439427846e-07, "logits/chosen": -0.5481323003768921, "logits/rejected": -0.40522176027297974, "logps/chosen": -4.713761329650879, "logps/rejected": -5.103206157684326, "loss": 0.0543, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.713761329650879, "rewards/margins": 0.38944488763809204, "rewards/rejected": -5.103206157684326, "sft_loss": 4.457167148590088, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 0.7554972971844888, "learning_rate": 9.533402032852002e-07, "logits/chosen": -0.5324119329452515, "logits/rejected": -0.3285521864891052, "logps/chosen": -4.660325050354004, "logps/rejected": -5.11639404296875, "loss": 0.053, "rewards/accuracies": 0.65625, "rewards/chosen": -4.660325050354004, "rewards/margins": 0.45606860518455505, "rewards/rejected": -5.11639404296875, "sft_loss": 4.358373641967773, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 0.5334990626640641, "learning_rate": 9.526810643189754e-07, "logits/chosen": -0.4682994484901428, "logits/rejected": -0.20002928376197815, "logps/chosen": -4.4835896492004395, "logps/rejected": -4.984886646270752, "loss": 0.053, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.4835896492004395, "rewards/margins": 0.501296877861023, "rewards/rejected": -4.984886646270752, "sft_loss": 4.2979960441589355, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 0.43311466275245447, "learning_rate": 9.52017533439079e-07, "logits/chosen": -0.6111350655555725, "logits/rejected": -0.5189183950424194, "logps/chosen": -4.642319679260254, "logps/rejected": -4.935871601104736, "loss": 0.055, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.642319679260254, "rewards/margins": 0.29355183243751526, "rewards/rejected": -4.935871601104736, "sft_loss": 4.443874359130859, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 0.4523237714480055, "learning_rate": 9.513496170830909e-07, "logits/chosen": -0.7188066244125366, "logits/rejected": -0.6016985177993774, "logps/chosen": -4.738122463226318, "logps/rejected": -4.9969658851623535, "loss": 0.0537, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.738122463226318, "rewards/margins": 0.25884348154067993, "rewards/rejected": -4.9969658851623535, "sft_loss": 4.435435771942139, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 0.39885392574273837, "learning_rate": 9.506773217311382e-07, "logits/chosen": -0.6342407464981079, "logits/rejected": -0.4105973243713379, "logps/chosen": -4.633099555969238, "logps/rejected": -4.917786121368408, "loss": 0.0534, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.633099555969238, "rewards/margins": 0.2846868932247162, "rewards/rejected": -4.917786121368408, "sft_loss": 4.4025044441223145, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 0.46859664979499294, "learning_rate": 9.500006539058334e-07, "logits/chosen": -0.5705159902572632, "logits/rejected": -0.32873377203941345, "logps/chosen": -4.4862871170043945, "logps/rejected": -4.728259086608887, "loss": 0.0534, "rewards/accuracies": 0.625, "rewards/chosen": -4.4862871170043945, "rewards/margins": 0.24197213351726532, "rewards/rejected": -4.728259086608887, "sft_loss": 4.215309143066406, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 0.5566937884089165, "learning_rate": 9.493196201722109e-07, "logits/chosen": -0.6300413012504578, "logits/rejected": -0.4329058527946472, "logps/chosen": -4.681478500366211, "logps/rejected": -4.944789886474609, "loss": 0.0537, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.681478500366211, "rewards/margins": 0.26331159472465515, "rewards/rejected": -4.944789886474609, "sft_loss": 4.417520999908447, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 0.5005083892393065, "learning_rate": 9.486342271376628e-07, "logits/chosen": -0.5651583671569824, "logits/rejected": -0.6070525050163269, "logps/chosen": -4.731893539428711, "logps/rejected": -5.015305519104004, "loss": 0.0538, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.731893539428711, "rewards/margins": 0.28341221809387207, "rewards/rejected": -5.015305519104004, "sft_loss": 4.412493705749512, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 0.4047781839205759, "learning_rate": 9.479444814518755e-07, "logits/chosen": -0.5466286540031433, "logits/rejected": -0.2215103805065155, "logps/chosen": -4.484042644500732, "logps/rejected": -5.004049301147461, "loss": 0.0522, "rewards/accuracies": 0.6875, "rewards/chosen": -4.484042644500732, "rewards/margins": 0.5200066566467285, "rewards/rejected": -5.004049301147461, "sft_loss": 4.189947605133057, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 0.4919221601673572, "learning_rate": 9.472503898067645e-07, "logits/chosen": -0.3866714835166931, "logits/rejected": -0.3418964743614197, "logps/chosen": -4.764806747436523, "logps/rejected": -4.945952892303467, "loss": 0.0548, "rewards/accuracies": 0.59375, "rewards/chosen": -4.764806747436523, "rewards/margins": 0.18114568293094635, "rewards/rejected": -4.945952892303467, "sft_loss": 4.389309883117676, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 0.6165938817684519, "learning_rate": 9.465519589364099e-07, "logits/chosen": -0.43105238676071167, "logits/rejected": -0.34996071457862854, "logps/chosen": -4.986048698425293, "logps/rejected": -5.196448802947998, "loss": 0.0539, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.986048698425293, "rewards/margins": 0.2104000300168991, "rewards/rejected": -5.196448802947998, "sft_loss": 4.601518154144287, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 0.7147540339684912, "learning_rate": 9.458491956169914e-07, "logits/chosen": -0.5200138688087463, "logits/rejected": -0.30936622619628906, "logps/chosen": -4.600648403167725, "logps/rejected": -4.895071983337402, "loss": 0.0534, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.600648403167725, "rewards/margins": 0.29442328214645386, "rewards/rejected": -4.895071983337402, "sft_loss": 4.186682224273682, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 0.5222909906022749, "learning_rate": 9.451421066667215e-07, "logits/chosen": -0.6883410215377808, "logits/rejected": -0.45219460129737854, "logps/chosen": -4.589325904846191, "logps/rejected": -5.036540508270264, "loss": 0.0527, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.589325904846191, "rewards/margins": 0.4472144544124603, "rewards/rejected": -5.036540508270264, "sft_loss": 4.3194098472595215, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 0.4776830565945863, "learning_rate": 9.444306989457805e-07, "logits/chosen": -0.46004819869995117, "logits/rejected": -0.3241944909095764, "logps/chosen": -4.868544101715088, "logps/rejected": -5.153488636016846, "loss": 0.0542, "rewards/accuracies": 0.59375, "rewards/chosen": -4.868544101715088, "rewards/margins": 0.2849445641040802, "rewards/rejected": -5.153488636016846, "sft_loss": 4.496047019958496, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 0.4294909802110886, "learning_rate": 9.437149793562489e-07, "logits/chosen": -0.5180003046989441, "logits/rejected": -0.3660891652107239, "logps/chosen": -4.651247501373291, "logps/rejected": -4.924403190612793, "loss": 0.054, "rewards/accuracies": 0.625, "rewards/chosen": -4.651247501373291, "rewards/margins": 0.2731553614139557, "rewards/rejected": -4.924403190612793, "sft_loss": 4.393563270568848, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 0.39095469809878836, "learning_rate": 9.429949548420417e-07, "logits/chosen": -0.5731022357940674, "logits/rejected": -0.4370029866695404, "logps/chosen": -4.695939064025879, "logps/rejected": -4.961706638336182, "loss": 0.0536, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.695939064025879, "rewards/margins": 0.2657679617404938, "rewards/rejected": -4.961706638336182, "sft_loss": 4.3769636154174805, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 0.430223860508395, "learning_rate": 9.422706323888396e-07, "logits/chosen": -0.5357168912887573, "logits/rejected": -0.49134665727615356, "logps/chosen": -4.6197052001953125, "logps/rejected": -4.928733825683594, "loss": 0.0535, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.6197052001953125, "rewards/margins": 0.3090288043022156, "rewards/rejected": -4.928733825683594, "sft_loss": 4.343020439147949, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 0.5805561587916378, "learning_rate": 9.415420190240225e-07, "logits/chosen": -0.5039713382720947, "logits/rejected": -0.23104509711265564, "logps/chosen": -4.593451976776123, "logps/rejected": -5.125218868255615, "loss": 0.0519, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.593451976776123, "rewards/margins": 0.5317668914794922, "rewards/rejected": -5.125218868255615, "sft_loss": 4.35361385345459, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 0.794948230843, "learning_rate": 9.408091218166002e-07, "logits/chosen": -0.4368966519832611, "logits/rejected": -0.349046915769577, "logps/chosen": -4.646870136260986, "logps/rejected": -4.846858024597168, "loss": 0.0545, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.646870136260986, "rewards/margins": 0.19998829066753387, "rewards/rejected": -4.846858024597168, "sft_loss": 4.331336498260498, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 0.4322649148913923, "learning_rate": 9.400719478771449e-07, "logits/chosen": -0.4928915500640869, "logits/rejected": -0.10693871974945068, "logps/chosen": -4.772175312042236, "logps/rejected": -5.275957107543945, "loss": 0.0523, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.772175312042236, "rewards/margins": 0.5037820935249329, "rewards/rejected": -5.275957107543945, "sft_loss": 4.457993507385254, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 0.4188247735094768, "learning_rate": 9.393305043577209e-07, "logits/chosen": -0.4668586850166321, "logits/rejected": -0.3280408978462219, "logps/chosen": -4.57827615737915, "logps/rejected": -4.987173080444336, "loss": 0.0538, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.57827615737915, "rewards/margins": 0.40889644622802734, "rewards/rejected": -4.987173080444336, "sft_loss": 4.273009300231934, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 0.43986121858771965, "learning_rate": 9.38584798451817e-07, "logits/chosen": -0.5263036489486694, "logits/rejected": -0.32945355772972107, "logps/chosen": -4.534844398498535, "logps/rejected": -4.883469581604004, "loss": 0.0528, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.534844398498535, "rewards/margins": 0.3486255705356598, "rewards/rejected": -4.883469581604004, "sft_loss": 4.275891304016113, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 0.5027182729571974, "learning_rate": 9.37834837394275e-07, "logits/chosen": -0.41481298208236694, "logits/rejected": -0.24069428443908691, "logps/chosen": -4.659480094909668, "logps/rejected": -5.198307514190674, "loss": 0.0531, "rewards/accuracies": 0.6875, "rewards/chosen": -4.659480094909668, "rewards/margins": 0.5388270616531372, "rewards/rejected": -5.198307514190674, "sft_loss": 4.420654296875, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 0.5596253221586619, "learning_rate": 9.370806284612203e-07, "logits/chosen": -0.4726240038871765, "logits/rejected": -0.2989794909954071, "logps/chosen": -4.519133567810059, "logps/rejected": -5.032240867614746, "loss": 0.053, "rewards/accuracies": 0.71875, "rewards/chosen": -4.519133567810059, "rewards/margins": 0.5131076574325562, "rewards/rejected": -5.032240867614746, "sft_loss": 4.242257118225098, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 0.639067801858824, "learning_rate": 9.363221789699912e-07, "logits/chosen": -0.5229755640029907, "logits/rejected": -0.3458749055862427, "logps/chosen": -4.645009517669678, "logps/rejected": -5.047579765319824, "loss": 0.0535, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.645009517669678, "rewards/margins": 0.4025697112083435, "rewards/rejected": -5.047579765319824, "sft_loss": 4.33743953704834, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 0.45340914139326294, "learning_rate": 9.355594962790682e-07, "logits/chosen": -0.5707454681396484, "logits/rejected": -0.39737915992736816, "logps/chosen": -4.724990367889404, "logps/rejected": -5.140081405639648, "loss": 0.0531, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.724990367889404, "rewards/margins": 0.4150908589363098, "rewards/rejected": -5.140081405639648, "sft_loss": 4.456721305847168, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 0.7726334686248777, "learning_rate": 9.34792587788002e-07, "logits/chosen": -0.41615360975265503, "logits/rejected": -0.25168564915657043, "logps/chosen": -4.589061260223389, "logps/rejected": -4.944598197937012, "loss": 0.0538, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.589061260223389, "rewards/margins": 0.3555375933647156, "rewards/rejected": -4.944598197937012, "sft_loss": 4.252989292144775, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 0.9713420234440161, "learning_rate": 9.34021460937342e-07, "logits/chosen": -0.54308021068573, "logits/rejected": -0.46706587076187134, "logps/chosen": -4.68400764465332, "logps/rejected": -4.97832727432251, "loss": 0.0542, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.68400764465332, "rewards/margins": 0.2943192422389984, "rewards/rejected": -4.97832727432251, "sft_loss": 4.438172817230225, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 0.7646700933687028, "learning_rate": 9.332461232085646e-07, "logits/chosen": -0.7934912443161011, "logits/rejected": -0.5852801203727722, "logps/chosen": -4.717600345611572, "logps/rejected": -4.994270324707031, "loss": 0.0542, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.717600345611572, "rewards/margins": 0.27667081356048584, "rewards/rejected": -4.994270324707031, "sft_loss": 4.516864776611328, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 1.1584729658858994, "learning_rate": 9.324665821239998e-07, "logits/chosen": -0.7201007604598999, "logits/rejected": -0.4515800476074219, "logps/chosen": -4.434556484222412, "logps/rejected": -4.892245292663574, "loss": 0.0539, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.434556484222412, "rewards/margins": 0.45768898725509644, "rewards/rejected": -4.892245292663574, "sft_loss": 4.247842788696289, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 0.6140739611213539, "learning_rate": 9.316828452467583e-07, "logits/chosen": -0.6372144222259521, "logits/rejected": -0.42637553811073303, "logps/chosen": -4.58826208114624, "logps/rejected": -5.025907516479492, "loss": 0.0531, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.58826208114624, "rewards/margins": 0.4376456141471863, "rewards/rejected": -5.025907516479492, "sft_loss": 4.415618896484375, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 0.6626048352879617, "learning_rate": 9.30894920180659e-07, "logits/chosen": -0.47773098945617676, "logits/rejected": -0.34003114700317383, "logps/chosen": -4.645096778869629, "logps/rejected": -4.930912971496582, "loss": 0.0528, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.645096778869629, "rewards/margins": 0.2858158051967621, "rewards/rejected": -4.930912971496582, "sft_loss": 4.292731285095215, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 0.46790800138866595, "learning_rate": 9.301028145701543e-07, "logits/chosen": -0.4381941854953766, "logits/rejected": -0.23344704508781433, "logps/chosen": -4.556414604187012, "logps/rejected": -5.132447242736816, "loss": 0.053, "rewards/accuracies": 0.65625, "rewards/chosen": -4.556414604187012, "rewards/margins": 0.5760326385498047, "rewards/rejected": -5.132447242736816, "sft_loss": 4.284694671630859, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 0.4538761637030669, "learning_rate": 9.293065361002563e-07, "logits/chosen": -0.46119099855422974, "logits/rejected": -0.28453510999679565, "logps/chosen": -4.576390266418457, "logps/rejected": -5.065820217132568, "loss": 0.055, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.576390266418457, "rewards/margins": 0.4894295632839203, "rewards/rejected": -5.065820217132568, "sft_loss": 4.269384860992432, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 0.5262247782905854, "learning_rate": 9.285060924964622e-07, "logits/chosen": -0.5923141241073608, "logits/rejected": -0.43897438049316406, "logps/chosen": -4.758378028869629, "logps/rejected": -5.130233287811279, "loss": 0.0531, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.758378028869629, "rewards/margins": 0.3718549311161041, "rewards/rejected": -5.130233287811279, "sft_loss": 4.387657165527344, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 0.8738310108131122, "learning_rate": 9.277014915246792e-07, "logits/chosen": -0.4979740083217621, "logits/rejected": -0.4129610061645508, "logps/chosen": -4.627547740936279, "logps/rejected": -5.047521114349365, "loss": 0.0538, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.627547740936279, "rewards/margins": 0.41997361183166504, "rewards/rejected": -5.047521114349365, "sft_loss": 4.41188907623291, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 0.3665660050498204, "learning_rate": 9.268927409911498e-07, "logits/chosen": -0.6547388434410095, "logits/rejected": -0.5283665060997009, "logps/chosen": -4.4866533279418945, "logps/rejected": -4.788092136383057, "loss": 0.0533, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.4866533279418945, "rewards/margins": 0.3014386296272278, "rewards/rejected": -4.788092136383057, "sft_loss": 4.192168235778809, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 0.5777401513346766, "learning_rate": 9.260798487423749e-07, "logits/chosen": -0.8416692614555359, "logits/rejected": -0.5481675863265991, "logps/chosen": -4.734822750091553, "logps/rejected": -5.159518241882324, "loss": 0.0529, "rewards/accuracies": 0.6875, "rewards/chosen": -4.734822750091553, "rewards/margins": 0.42469555139541626, "rewards/rejected": -5.159518241882324, "sft_loss": 4.497926235198975, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 0.31040149478224743, "learning_rate": 9.252628226650389e-07, "logits/chosen": -0.6390944719314575, "logits/rejected": -0.5294175744056702, "logps/chosen": -4.691042423248291, "logps/rejected": -4.936129093170166, "loss": 0.0535, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.691042423248291, "rewards/margins": 0.24508705735206604, "rewards/rejected": -4.936129093170166, "sft_loss": 4.376203536987305, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 0.8324745102838857, "learning_rate": 9.244416706859321e-07, "logits/chosen": -0.6490924954414368, "logits/rejected": -0.43776410818099976, "logps/chosen": -4.428481578826904, "logps/rejected": -4.945733070373535, "loss": 0.0533, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.428481578826904, "rewards/margins": 0.5172508955001831, "rewards/rejected": -4.945733070373535, "sft_loss": 4.210099220275879, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 0.4524607634156159, "learning_rate": 9.23616400771875e-07, "logits/chosen": -0.6316944360733032, "logits/rejected": -0.37926986813545227, "logps/chosen": -4.5279340744018555, "logps/rejected": -4.91841459274292, "loss": 0.0521, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.5279340744018555, "rewards/margins": 0.39048075675964355, "rewards/rejected": -4.91841459274292, "sft_loss": 4.214154243469238, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 0.45011613171946563, "learning_rate": 9.227870209296395e-07, "logits/chosen": -0.5129834413528442, "logits/rejected": -0.3363080322742462, "logps/chosen": -4.668064117431641, "logps/rejected": -5.000769138336182, "loss": 0.0538, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.668064117431641, "rewards/margins": 0.3327048420906067, "rewards/rejected": -5.000769138336182, "sft_loss": 4.383709907531738, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 0.7776345772417559, "learning_rate": 9.219535392058728e-07, "logits/chosen": -0.5440296530723572, "logits/rejected": -0.5481287837028503, "logps/chosen": -4.869925022125244, "logps/rejected": -5.206021308898926, "loss": 0.054, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.869925022125244, "rewards/margins": 0.33609622716903687, "rewards/rejected": -5.206021308898926, "sft_loss": 4.53743314743042, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 0.72464037163324, "learning_rate": 9.211159636870181e-07, "logits/chosen": -0.5129449367523193, "logits/rejected": -0.3094898760318756, "logps/chosen": -4.535016059875488, "logps/rejected": -4.995938777923584, "loss": 0.0526, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.535016059875488, "rewards/margins": 0.46092361211776733, "rewards/rejected": -4.995938777923584, "sft_loss": 4.218871116638184, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 0.43282860953379676, "learning_rate": 9.202743024992367e-07, "logits/chosen": -0.29366233944892883, "logits/rejected": -0.19437307119369507, "logps/chosen": -4.601809978485107, "logps/rejected": -4.955982208251953, "loss": 0.054, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.601809978485107, "rewards/margins": 0.35417240858078003, "rewards/rejected": -4.955982208251953, "sft_loss": 4.2811689376831055, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 0.6048286155475231, "learning_rate": 9.194285638083293e-07, "logits/chosen": -0.4645848870277405, "logits/rejected": -0.2716544270515442, "logps/chosen": -4.692347049713135, "logps/rejected": -5.111114501953125, "loss": 0.0521, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.692347049713135, "rewards/margins": 0.4187680780887604, "rewards/rejected": -5.111114501953125, "sft_loss": 4.267924785614014, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 0.7623333749730745, "learning_rate": 9.185787558196562e-07, "logits/chosen": -0.5028216242790222, "logits/rejected": -0.36704540252685547, "logps/chosen": -4.721834182739258, "logps/rejected": -5.188471794128418, "loss": 0.0525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.721834182739258, "rewards/margins": 0.466637521982193, "rewards/rejected": -5.188471794128418, "sft_loss": 4.410996437072754, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 0.5986005442913178, "learning_rate": 9.177248867780583e-07, "logits/chosen": -0.38116052746772766, "logits/rejected": -0.2935437262058258, "logps/chosen": -4.649445533752441, "logps/rejected": -4.961583137512207, "loss": 0.0549, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.649445533752441, "rewards/margins": 0.3121368885040283, "rewards/rejected": -4.961583137512207, "sft_loss": 4.2997026443481445, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 0.5435666495008531, "learning_rate": 9.168669649677769e-07, "logits/chosen": -0.6052119135856628, "logits/rejected": -0.4620954394340515, "logps/chosen": -4.813775539398193, "logps/rejected": -5.131553649902344, "loss": 0.0548, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.813775539398193, "rewards/margins": 0.31777825951576233, "rewards/rejected": -5.131553649902344, "sft_loss": 4.5045881271362305, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 0.47022443877330955, "learning_rate": 9.16004998712373e-07, "logits/chosen": -0.5376960039138794, "logits/rejected": -0.4372090697288513, "logps/chosen": -4.671207904815674, "logps/rejected": -4.986814498901367, "loss": 0.0534, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.671207904815674, "rewards/margins": 0.3156066834926605, "rewards/rejected": -4.986814498901367, "sft_loss": 4.38083553314209, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 0.6199400338313545, "learning_rate": 9.151389963746472e-07, "logits/chosen": -0.5899304747581482, "logits/rejected": -0.18980534374713898, "logps/chosen": -4.4528584480285645, "logps/rejected": -5.0324931144714355, "loss": 0.0524, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.4528584480285645, "rewards/margins": 0.579634964466095, "rewards/rejected": -5.0324931144714355, "sft_loss": 4.2412800788879395, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 0.5555948687982465, "learning_rate": 9.142689663565577e-07, "logits/chosen": -0.5152315497398376, "logits/rejected": -0.436471164226532, "logps/chosen": -4.5293426513671875, "logps/rejected": -4.902256965637207, "loss": 0.0535, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.5293426513671875, "rewards/margins": 0.3729146420955658, "rewards/rejected": -4.902256965637207, "sft_loss": 4.266984462738037, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 0.45142612803831084, "learning_rate": 9.133949170991397e-07, "logits/chosen": -0.53922039270401, "logits/rejected": -0.4263296127319336, "logps/chosen": -4.764470100402832, "logps/rejected": -5.035752296447754, "loss": 0.0537, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.764470100402832, "rewards/margins": 0.27128297090530396, "rewards/rejected": -5.035752296447754, "sft_loss": 4.539823055267334, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 0.41254383948397666, "learning_rate": 9.125168570824231e-07, "logits/chosen": -0.6185752153396606, "logits/rejected": -0.3851194977760315, "logps/chosen": -4.496891021728516, "logps/rejected": -4.882938861846924, "loss": 0.0531, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.496891021728516, "rewards/margins": 0.3860477805137634, "rewards/rejected": -4.882938861846924, "sft_loss": 4.258194923400879, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 0.7151117766432287, "learning_rate": 9.116347948253496e-07, "logits/chosen": -0.5224160552024841, "logits/rejected": -0.3366536498069763, "logps/chosen": -4.449323654174805, "logps/rejected": -4.829797267913818, "loss": 0.0529, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.449323654174805, "rewards/margins": 0.3804740905761719, "rewards/rejected": -4.829797267913818, "sft_loss": 4.185049533843994, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 0.583199273167788, "learning_rate": 9.107487388856916e-07, "logits/chosen": -0.5745779275894165, "logits/rejected": -0.3310237228870392, "logps/chosen": -4.638999938964844, "logps/rejected": -5.003554344177246, "loss": 0.0529, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.638999938964844, "rewards/margins": 0.3645547330379486, "rewards/rejected": -5.003554344177246, "sft_loss": 4.2992844581604, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 1.0175227991306681, "learning_rate": 9.098586978599673e-07, "logits/chosen": -0.4655645489692688, "logits/rejected": -0.27722448110580444, "logps/chosen": -4.643935680389404, "logps/rejected": -5.168910503387451, "loss": 0.0526, "rewards/accuracies": 0.625, "rewards/chosen": -4.643935680389404, "rewards/margins": 0.524974524974823, "rewards/rejected": -5.168910503387451, "sft_loss": 4.284567832946777, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 1.0535335408279394, "learning_rate": 9.089646803833588e-07, "logits/chosen": -0.48223942518234253, "logits/rejected": -0.29267364740371704, "logps/chosen": -4.699334621429443, "logps/rejected": -5.175503730773926, "loss": 0.0529, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.699334621429443, "rewards/margins": 0.47616925835609436, "rewards/rejected": -5.175503730773926, "sft_loss": 4.3690996170043945, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 1.225203498147348, "learning_rate": 9.080666951296276e-07, "logits/chosen": -0.5685257315635681, "logits/rejected": -0.23230977356433868, "logps/chosen": -4.532471656799316, "logps/rejected": -5.177938938140869, "loss": 0.0525, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.532471656799316, "rewards/margins": 0.6454674601554871, "rewards/rejected": -5.177938938140869, "sft_loss": 4.264753818511963, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 0.3571080799880975, "learning_rate": 9.071647508110305e-07, "logits/chosen": -0.5444567203521729, "logits/rejected": -0.20953519642353058, "logps/chosen": -4.405203819274902, "logps/rejected": -5.111395359039307, "loss": 0.0518, "rewards/accuracies": 0.71875, "rewards/chosen": -4.405203819274902, "rewards/margins": 0.706192135810852, "rewards/rejected": -5.111395359039307, "sft_loss": 4.1048126220703125, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 0.4647677566083696, "learning_rate": 9.062588561782354e-07, "logits/chosen": -0.3920247256755829, "logits/rejected": -0.31407880783081055, "logps/chosen": -4.9274492263793945, "logps/rejected": -5.254195690155029, "loss": 0.0543, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.9274492263793945, "rewards/margins": 0.32674673199653625, "rewards/rejected": -5.254195690155029, "sft_loss": 4.650223731994629, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 0.33147523191362094, "learning_rate": 9.053490200202358e-07, "logits/chosen": -0.4448007047176361, "logits/rejected": -0.37099045515060425, "logps/chosen": -4.541591167449951, "logps/rejected": -4.890256404876709, "loss": 0.0546, "rewards/accuracies": 0.65625, "rewards/chosen": -4.541591167449951, "rewards/margins": 0.3486659526824951, "rewards/rejected": -4.890256404876709, "sft_loss": 4.340108871459961, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 0.666464713418103, "learning_rate": 9.044352511642661e-07, "logits/chosen": -0.4679701328277588, "logits/rejected": -0.3994170129299164, "logps/chosen": -4.796227931976318, "logps/rejected": -5.057922840118408, "loss": 0.0545, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.796227931976318, "rewards/margins": 0.2616943120956421, "rewards/rejected": -5.057922840118408, "sft_loss": 4.517434597015381, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 0.6293748246790352, "learning_rate": 9.03517558475716e-07, "logits/chosen": -0.5948684811592102, "logits/rejected": -0.4503151774406433, "logps/chosen": -4.529299736022949, "logps/rejected": -4.76159143447876, "loss": 0.056, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.529299736022949, "rewards/margins": 0.23229138553142548, "rewards/rejected": -4.76159143447876, "sft_loss": 4.296181678771973, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 0.4250320609206886, "learning_rate": 9.025959508580436e-07, "logits/chosen": -0.6207831501960754, "logits/rejected": -0.3034132719039917, "logps/chosen": -4.6496171951293945, "logps/rejected": -5.029486656188965, "loss": 0.053, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.6496171951293945, "rewards/margins": 0.3798690736293793, "rewards/rejected": -5.029486656188965, "sft_loss": 4.412766456604004, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 0.3411551452075428, "learning_rate": 9.016704372526905e-07, "logits/chosen": -0.6650777459144592, "logits/rejected": -0.4226778447628021, "logps/chosen": -4.5687479972839355, "logps/rejected": -5.018956661224365, "loss": 0.0531, "rewards/accuracies": 0.65625, "rewards/chosen": -4.5687479972839355, "rewards/margins": 0.4502086639404297, "rewards/rejected": -5.018956661224365, "sft_loss": 4.3290324211120605, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 0.3485149666875214, "learning_rate": 9.007410266389934e-07, "logits/chosen": -0.6740955114364624, "logits/rejected": -0.6200405359268188, "logps/chosen": -4.6393561363220215, "logps/rejected": -4.842759132385254, "loss": 0.0538, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.6393561363220215, "rewards/margins": 0.20340339839458466, "rewards/rejected": -4.842759132385254, "sft_loss": 4.307949066162109, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 0.6993713590336478, "learning_rate": 8.998077280340981e-07, "logits/chosen": -0.5477308034896851, "logits/rejected": -0.4848629832267761, "logps/chosen": -4.8374176025390625, "logps/rejected": -5.147733688354492, "loss": 0.0537, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.8374176025390625, "rewards/margins": 0.31031590700149536, "rewards/rejected": -5.147733688354492, "sft_loss": 4.451597690582275, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 0.5857090427241004, "learning_rate": 8.988705504928722e-07, "logits/chosen": -0.593254029750824, "logits/rejected": -0.3634767532348633, "logps/chosen": -4.616876125335693, "logps/rejected": -5.206143856048584, "loss": 0.0518, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.616876125335693, "rewards/margins": 0.5892679691314697, "rewards/rejected": -5.206143856048584, "sft_loss": 4.2967424392700195, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": -0.02904585748910904, "eval_logits/rejected": 0.07823801785707474, "eval_logps/chosen": -4.5006561279296875, "eval_logps/rejected": -4.917550086975098, "eval_loss": 0.051895011216402054, "eval_rewards/accuracies": 0.6313056349754333, "eval_rewards/chosen": -4.5006561279296875, "eval_rewards/margins": 0.41689401865005493, "eval_rewards/rejected": -4.917550086975098, "eval_runtime": 43.4471, "eval_samples_per_second": 30.957, "eval_sft_loss": 4.063570499420166, "eval_steps_per_second": 7.757, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 0.4116631557826821, "learning_rate": 8.979295031078157e-07, "logits/chosen": -0.5330547094345093, "logits/rejected": -0.25229379534721375, "logps/chosen": -4.346199035644531, "logps/rejected": -4.8608598709106445, "loss": 0.0518, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.346199035644531, "rewards/margins": 0.5146608948707581, "rewards/rejected": -4.8608598709106445, "sft_loss": 4.030482292175293, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 0.38214100986360633, "learning_rate": 8.969845950089751e-07, "logits/chosen": -0.5902666449546814, "logits/rejected": -0.3709181249141693, "logps/chosen": -4.7403693199157715, "logps/rejected": -5.202174186706543, "loss": 0.0538, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.7403693199157715, "rewards/margins": 0.4618045389652252, "rewards/rejected": -5.202174186706543, "sft_loss": 4.481407165527344, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 0.41539489108804767, "learning_rate": 8.960358353638526e-07, "logits/chosen": -0.4401358664035797, "logits/rejected": -0.2931682765483856, "logps/chosen": -4.8379106521606445, "logps/rejected": -5.323012828826904, "loss": 0.0535, "rewards/accuracies": 0.625, "rewards/chosen": -4.8379106521606445, "rewards/margins": 0.48510226607322693, "rewards/rejected": -5.323012828826904, "sft_loss": 4.449693202972412, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 0.9036399070841549, "learning_rate": 8.950832333773184e-07, "logits/chosen": -0.45871132612228394, "logits/rejected": -0.25480368733406067, "logps/chosen": -4.593859672546387, "logps/rejected": -4.8868608474731445, "loss": 0.0541, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.593859672546387, "rewards/margins": 0.2930009961128235, "rewards/rejected": -4.8868608474731445, "sft_loss": 4.267394065856934, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 0.4427581928743918, "learning_rate": 8.941267982915213e-07, "logits/chosen": -0.45720523595809937, "logits/rejected": -0.39475345611572266, "logps/chosen": -4.652650356292725, "logps/rejected": -4.871710777282715, "loss": 0.0551, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.652650356292725, "rewards/margins": 0.2190600335597992, "rewards/rejected": -4.871710777282715, "sft_loss": 4.429585933685303, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 0.6495835714729182, "learning_rate": 8.931665393857983e-07, "logits/chosen": -0.5988067388534546, "logits/rejected": -0.41781100630760193, "logps/chosen": -4.665750026702881, "logps/rejected": -5.207381725311279, "loss": 0.0533, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.665750026702881, "rewards/margins": 0.5416315793991089, "rewards/rejected": -5.207381725311279, "sft_loss": 4.4793195724487305, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 0.41623413128943515, "learning_rate": 8.922024659765861e-07, "logits/chosen": -0.7399147748947144, "logits/rejected": -0.5550050139427185, "logps/chosen": -4.498051643371582, "logps/rejected": -4.938418865203857, "loss": 0.0534, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.498051643371582, "rewards/margins": 0.44036778807640076, "rewards/rejected": -4.938418865203857, "sft_loss": 4.29925012588501, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 0.5313489161329956, "learning_rate": 8.912345874173288e-07, "logits/chosen": -0.7936776876449585, "logits/rejected": -0.5909181237220764, "logps/chosen": -4.6866841316223145, "logps/rejected": -5.111433506011963, "loss": 0.0532, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.6866841316223145, "rewards/margins": 0.4247500002384186, "rewards/rejected": -5.111433506011963, "sft_loss": 4.379754543304443, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 0.6359924799554241, "learning_rate": 8.902629130983885e-07, "logits/chosen": -0.7347515821456909, "logits/rejected": -0.6840382814407349, "logps/chosen": -4.671685695648193, "logps/rejected": -4.900129795074463, "loss": 0.0551, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.671685695648193, "rewards/margins": 0.22844386100769043, "rewards/rejected": -4.900129795074463, "sft_loss": 4.440484046936035, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 0.5805478038553051, "learning_rate": 8.892874524469537e-07, "logits/chosen": -0.5966772437095642, "logits/rejected": -0.5708019137382507, "logps/chosen": -4.531647682189941, "logps/rejected": -4.784287452697754, "loss": 0.0534, "rewards/accuracies": 0.625, "rewards/chosen": -4.531647682189941, "rewards/margins": 0.25264012813568115, "rewards/rejected": -4.784287452697754, "sft_loss": 4.222924709320068, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 0.4154036618062291, "learning_rate": 8.883082149269478e-07, "logits/chosen": -0.7492375373840332, "logits/rejected": -0.625057578086853, "logps/chosen": -4.6985764503479, "logps/rejected": -5.03641414642334, "loss": 0.0533, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.6985764503479, "rewards/margins": 0.33783769607543945, "rewards/rejected": -5.03641414642334, "sft_loss": 4.449755668640137, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 0.6942420678365312, "learning_rate": 8.873252100389377e-07, "logits/chosen": -0.6865184307098389, "logits/rejected": -0.6350366473197937, "logps/chosen": -4.639318466186523, "logps/rejected": -4.9327192306518555, "loss": 0.0541, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.639318466186523, "rewards/margins": 0.2934008240699768, "rewards/rejected": -4.9327192306518555, "sft_loss": 4.407034873962402, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 0.8556379469701134, "learning_rate": 8.863384473200411e-07, "logits/chosen": -0.5269213914871216, "logits/rejected": -0.47171592712402344, "logps/chosen": -4.581943511962891, "logps/rejected": -4.845519065856934, "loss": 0.054, "rewards/accuracies": 0.625, "rewards/chosen": -4.581943511962891, "rewards/margins": 0.2635752558708191, "rewards/rejected": -4.845519065856934, "sft_loss": 4.290720462799072, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 0.38175959218553795, "learning_rate": 8.853479363438342e-07, "logits/chosen": -0.5232313871383667, "logits/rejected": -0.3149745464324951, "logps/chosen": -4.757014751434326, "logps/rejected": -5.126891136169434, "loss": 0.0536, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.757014751434326, "rewards/margins": 0.3698762059211731, "rewards/rejected": -5.126891136169434, "sft_loss": 4.466257572174072, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 0.7736549454767151, "learning_rate": 8.843536867202588e-07, "logits/chosen": -0.5337976217269897, "logits/rejected": -0.30675259232521057, "logps/chosen": -4.592353343963623, "logps/rejected": -5.095428943634033, "loss": 0.0539, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.592353343963623, "rewards/margins": 0.5030753016471863, "rewards/rejected": -5.095428943634033, "sft_loss": 4.437192440032959, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 0.5225751200668408, "learning_rate": 8.833557080955292e-07, "logits/chosen": -0.668506383895874, "logits/rejected": -0.5236512422561646, "logps/chosen": -4.579380035400391, "logps/rejected": -4.8987932205200195, "loss": 0.0548, "rewards/accuracies": 0.625, "rewards/chosen": -4.579380035400391, "rewards/margins": 0.3194130063056946, "rewards/rejected": -4.8987932205200195, "sft_loss": 4.3848490715026855, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 0.3776357485013768, "learning_rate": 8.823540101520381e-07, "logits/chosen": -0.6779472231864929, "logits/rejected": -0.41766971349716187, "logps/chosen": -4.687167644500732, "logps/rejected": -5.149572372436523, "loss": 0.0527, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.687167644500732, "rewards/margins": 0.4624043107032776, "rewards/rejected": -5.149572372436523, "sft_loss": 4.447117805480957, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 0.40393689961090434, "learning_rate": 8.813486026082637e-07, "logits/chosen": -0.6170944571495056, "logits/rejected": -0.3785988688468933, "logps/chosen": -4.538905143737793, "logps/rejected": -5.0261921882629395, "loss": 0.052, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.538905143737793, "rewards/margins": 0.48728686571121216, "rewards/rejected": -5.0261921882629395, "sft_loss": 4.21726131439209, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 0.6594051353912328, "learning_rate": 8.803394952186742e-07, "logits/chosen": -0.6432759165763855, "logits/rejected": -0.46522051095962524, "logps/chosen": -4.532651424407959, "logps/rejected": -4.9046783447265625, "loss": 0.0529, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.532651424407959, "rewards/margins": 0.37202686071395874, "rewards/rejected": -4.9046783447265625, "sft_loss": 4.1833367347717285, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 0.7064046912842677, "learning_rate": 8.793266977736342e-07, "logits/chosen": -0.5009998083114624, "logits/rejected": -0.6162468791007996, "logps/chosen": -4.827506065368652, "logps/rejected": -4.991827964782715, "loss": 0.0557, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.827506065368652, "rewards/margins": 0.16432145237922668, "rewards/rejected": -4.991827964782715, "sft_loss": 4.603185653686523, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 0.5206622644102082, "learning_rate": 8.783102200993085e-07, "logits/chosen": -0.5576699376106262, "logits/rejected": -0.43255311250686646, "logps/chosen": -4.724583625793457, "logps/rejected": -5.145751953125, "loss": 0.053, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.724583625793457, "rewards/margins": 0.42116886377334595, "rewards/rejected": -5.145751953125, "sft_loss": 4.429846286773682, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 0.44476556944134527, "learning_rate": 8.772900720575683e-07, "logits/chosen": -0.5629488229751587, "logits/rejected": -0.43112772703170776, "logps/chosen": -4.643448829650879, "logps/rejected": -4.956890106201172, "loss": 0.0532, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.643448829650879, "rewards/margins": 0.3134412169456482, "rewards/rejected": -4.956890106201172, "sft_loss": 4.35292911529541, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 0.3480510172874056, "learning_rate": 8.762662635458944e-07, "logits/chosen": -0.5707327127456665, "logits/rejected": -0.3803301453590393, "logps/chosen": -4.5035834312438965, "logps/rejected": -4.909355640411377, "loss": 0.053, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.5035834312438965, "rewards/margins": 0.4057716429233551, "rewards/rejected": -4.909355640411377, "sft_loss": 4.233164310455322, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 0.5164062533786418, "learning_rate": 8.752388044972811e-07, "logits/chosen": -0.484900563955307, "logits/rejected": -0.4002462923526764, "logps/chosen": -4.537683963775635, "logps/rejected": -4.98577356338501, "loss": 0.0527, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.537683963775635, "rewards/margins": 0.448089599609375, "rewards/rejected": -4.98577356338501, "sft_loss": 4.265603542327881, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 1.3750125559284796, "learning_rate": 8.74207704880141e-07, "logits/chosen": -0.38507509231567383, "logits/rejected": -0.2651823163032532, "logps/chosen": -4.553491115570068, "logps/rejected": -5.053616523742676, "loss": 0.0524, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.553491115570068, "rewards/margins": 0.5001254081726074, "rewards/rejected": -5.053616523742676, "sft_loss": 4.2198309898376465, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 0.6479414735451374, "learning_rate": 8.731729746982068e-07, "logits/chosen": -0.4087442457675934, "logits/rejected": -0.3513071537017822, "logps/chosen": -4.680237293243408, "logps/rejected": -5.0000081062316895, "loss": 0.0537, "rewards/accuracies": 0.65625, "rewards/chosen": -4.680237293243408, "rewards/margins": 0.319771409034729, "rewards/rejected": -5.0000081062316895, "sft_loss": 4.4230875968933105, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 0.5320410614211661, "learning_rate": 8.721346239904355e-07, "logits/chosen": -0.6621562242507935, "logits/rejected": -0.38414087891578674, "logps/chosen": -4.588827610015869, "logps/rejected": -5.170763969421387, "loss": 0.0532, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.588827610015869, "rewards/margins": 0.5819366574287415, "rewards/rejected": -5.170763969421387, "sft_loss": 4.343879699707031, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 0.7707100372795159, "learning_rate": 8.710926628309101e-07, "logits/chosen": -0.6921442151069641, "logits/rejected": -0.4576943814754486, "logps/chosen": -4.3977508544921875, "logps/rejected": -4.836879730224609, "loss": 0.0523, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.3977508544921875, "rewards/margins": 0.4391290545463562, "rewards/rejected": -4.836879730224609, "sft_loss": 4.144339561462402, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 0.40155269984358594, "learning_rate": 8.700471013287424e-07, "logits/chosen": -0.5236523747444153, "logits/rejected": -0.5326138734817505, "logps/chosen": -4.680853366851807, "logps/rejected": -4.947844982147217, "loss": 0.0542, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.680853366851807, "rewards/margins": 0.266991525888443, "rewards/rejected": -4.947844982147217, "sft_loss": 4.386083126068115, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 0.5819037718837893, "learning_rate": 8.689979496279746e-07, "logits/chosen": -0.7221859693527222, "logits/rejected": -0.6695482730865479, "logps/chosen": -4.943053722381592, "logps/rejected": -5.197215557098389, "loss": 0.0556, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.943053722381592, "rewards/margins": 0.2541615962982178, "rewards/rejected": -5.197215557098389, "sft_loss": 4.715271949768066, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 0.4677346992925345, "learning_rate": 8.679452179074811e-07, "logits/chosen": -0.7230926752090454, "logits/rejected": -0.5767534375190735, "logps/chosen": -4.456429481506348, "logps/rejected": -4.846767425537109, "loss": 0.0529, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.456429481506348, "rewards/margins": 0.39033815264701843, "rewards/rejected": -4.846767425537109, "sft_loss": 4.245125770568848, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 0.9766948042968379, "learning_rate": 8.668889163808698e-07, "logits/chosen": -0.6952082514762878, "logits/rejected": -0.5020134449005127, "logps/chosen": -4.320244312286377, "logps/rejected": -4.711088180541992, "loss": 0.0531, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.320244312286377, "rewards/margins": 0.3908434510231018, "rewards/rejected": -4.711088180541992, "sft_loss": 4.1057281494140625, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 0.8113564262746916, "learning_rate": 8.658290552963827e-07, "logits/chosen": -0.6360453963279724, "logits/rejected": -0.5571495294570923, "logps/chosen": -4.789154529571533, "logps/rejected": -5.147796154022217, "loss": 0.0547, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.789154529571533, "rewards/margins": 0.35864168405532837, "rewards/rejected": -5.147796154022217, "sft_loss": 4.56979513168335, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 0.40121347821380127, "learning_rate": 8.647656449367966e-07, "logits/chosen": -0.6600741147994995, "logits/rejected": -0.48249635100364685, "logps/chosen": -4.793465614318848, "logps/rejected": -5.12413215637207, "loss": 0.0543, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.793465614318848, "rewards/margins": 0.33066678047180176, "rewards/rejected": -5.12413215637207, "sft_loss": 4.610315799713135, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 0.4609934021865174, "learning_rate": 8.636986956193235e-07, "logits/chosen": -0.7527320981025696, "logits/rejected": -0.6049268841743469, "logps/chosen": -4.484477519989014, "logps/rejected": -4.966238975524902, "loss": 0.0527, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.484477519989014, "rewards/margins": 0.4817616045475006, "rewards/rejected": -4.966238975524902, "sft_loss": 4.206968307495117, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 0.5990607543889642, "learning_rate": 8.626282176955104e-07, "logits/chosen": -0.7125250697135925, "logits/rejected": -0.5920495390892029, "logps/chosen": -4.532000541687012, "logps/rejected": -4.994868278503418, "loss": 0.053, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.532000541687012, "rewards/margins": 0.46286827325820923, "rewards/rejected": -4.994868278503418, "sft_loss": 4.308735370635986, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 0.6096852450418404, "learning_rate": 8.615542215511389e-07, "logits/chosen": -0.6793350577354431, "logits/rejected": -0.6178755760192871, "logps/chosen": -4.636038303375244, "logps/rejected": -4.855711460113525, "loss": 0.0548, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.636038303375244, "rewards/margins": 0.21967339515686035, "rewards/rejected": -4.855711460113525, "sft_loss": 4.38653564453125, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 0.37934684130563323, "learning_rate": 8.604767176061241e-07, "logits/chosen": -0.6598840951919556, "logits/rejected": -0.5501774549484253, "logps/chosen": -4.648791313171387, "logps/rejected": -5.084364414215088, "loss": 0.0527, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.648791313171387, "rewards/margins": 0.4355725347995758, "rewards/rejected": -5.084364414215088, "sft_loss": 4.3693437576293945, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 0.3543703405808491, "learning_rate": 8.593957163144141e-07, "logits/chosen": -0.8177486658096313, "logits/rejected": -0.6528640985488892, "logps/chosen": -4.524102210998535, "logps/rejected": -5.0155134201049805, "loss": 0.0526, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.524102210998535, "rewards/margins": 0.4914116859436035, "rewards/rejected": -5.0155134201049805, "sft_loss": 4.297591686248779, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 0.587837761567623, "learning_rate": 8.58311228163888e-07, "logits/chosen": -0.7575830817222595, "logits/rejected": -0.6973570585250854, "logps/chosen": -4.548133850097656, "logps/rejected": -4.882990837097168, "loss": 0.0534, "rewards/accuracies": 0.65625, "rewards/chosen": -4.548133850097656, "rewards/margins": 0.3348572254180908, "rewards/rejected": -4.882990837097168, "sft_loss": 4.285576343536377, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 0.7831749014712782, "learning_rate": 8.57223263676255e-07, "logits/chosen": -0.827855110168457, "logits/rejected": -0.6802206039428711, "logps/chosen": -4.526045799255371, "logps/rejected": -5.129607200622559, "loss": 0.0524, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.526045799255371, "rewards/margins": 0.603561282157898, "rewards/rejected": -5.129607200622559, "sft_loss": 4.310399055480957, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 0.4646618216970684, "learning_rate": 8.561318334069511e-07, "logits/chosen": -0.695126473903656, "logits/rejected": -0.5704531073570251, "logps/chosen": -4.636171340942383, "logps/rejected": -5.058096408843994, "loss": 0.0528, "rewards/accuracies": 0.71875, "rewards/chosen": -4.636171340942383, "rewards/margins": 0.42192497849464417, "rewards/rejected": -5.058096408843994, "sft_loss": 4.393563747406006, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 0.4712524042717296, "learning_rate": 8.550369479450375e-07, "logits/chosen": -0.5622086524963379, "logits/rejected": -0.40890851616859436, "logps/chosen": -4.376356601715088, "logps/rejected": -4.874255657196045, "loss": 0.0527, "rewards/accuracies": 0.6875, "rewards/chosen": -4.376356601715088, "rewards/margins": 0.4978991448879242, "rewards/rejected": -4.874255657196045, "sft_loss": 4.152787685394287, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 0.34063083871666683, "learning_rate": 8.539386179130977e-07, "logits/chosen": -0.49509549140930176, "logits/rejected": -0.4550551474094391, "logps/chosen": -4.642989158630371, "logps/rejected": -5.026967525482178, "loss": 0.053, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.642989158630371, "rewards/margins": 0.3839784264564514, "rewards/rejected": -5.026967525482178, "sft_loss": 4.345905303955078, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 0.3872496478526205, "learning_rate": 8.528368539671347e-07, "logits/chosen": -0.5788191556930542, "logits/rejected": -0.3689228594303131, "logps/chosen": -4.652213096618652, "logps/rejected": -5.309985160827637, "loss": 0.0531, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.652213096618652, "rewards/margins": 0.6577725410461426, "rewards/rejected": -5.309985160827637, "sft_loss": 4.4433393478393555, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 0.9287089525439055, "learning_rate": 8.51731666796467e-07, "logits/chosen": -0.3933059573173523, "logits/rejected": -0.328712522983551, "logps/chosen": -4.452856540679932, "logps/rejected": -4.835246562957764, "loss": 0.0532, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.452856540679932, "rewards/margins": 0.38238975405693054, "rewards/rejected": -4.835246562957764, "sft_loss": 4.1753621101379395, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 0.5941262114362317, "learning_rate": 8.506230671236254e-07, "logits/chosen": -0.5021569132804871, "logits/rejected": -0.39401504397392273, "logps/chosen": -4.731034278869629, "logps/rejected": -5.092568397521973, "loss": 0.0532, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.731034278869629, "rewards/margins": 0.3615338206291199, "rewards/rejected": -5.092568397521973, "sft_loss": 4.504293441772461, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 0.34993440604791326, "learning_rate": 8.495110657042488e-07, "logits/chosen": -0.47997450828552246, "logits/rejected": -0.2725691795349121, "logps/chosen": -4.54381799697876, "logps/rejected": -5.032081127166748, "loss": 0.0538, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.54381799697876, "rewards/margins": 0.4882632791996002, "rewards/rejected": -5.032081127166748, "sft_loss": 4.36675500869751, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 0.4308703258389709, "learning_rate": 8.483956733269799e-07, "logits/chosen": -0.4384092688560486, "logits/rejected": -0.3426581919193268, "logps/chosen": -4.591732978820801, "logps/rejected": -4.909899711608887, "loss": 0.0531, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.591732978820801, "rewards/margins": 0.31816643476486206, "rewards/rejected": -4.909899711608887, "sft_loss": 4.256570816040039, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 0.3473658903289487, "learning_rate": 8.472769008133602e-07, "logits/chosen": -0.6959007978439331, "logits/rejected": -0.5554211735725403, "logps/chosen": -4.5757036209106445, "logps/rejected": -4.93710994720459, "loss": 0.0537, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.5757036209106445, "rewards/margins": 0.3614066243171692, "rewards/rejected": -4.93710994720459, "sft_loss": 4.2838358879089355, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 0.4555695972184036, "learning_rate": 8.461547590177259e-07, "logits/chosen": -0.557471513748169, "logits/rejected": -0.37859243154525757, "logps/chosen": -4.733495712280273, "logps/rejected": -5.202332973480225, "loss": 0.0515, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.733495712280273, "rewards/margins": 0.4688374996185303, "rewards/rejected": -5.202332973480225, "sft_loss": 4.25360631942749, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 0.43279035633127416, "learning_rate": 8.450292588271014e-07, "logits/chosen": -0.604584276676178, "logits/rejected": -0.4833999276161194, "logps/chosen": -4.606520652770996, "logps/rejected": -5.053445339202881, "loss": 0.0523, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.606520652770996, "rewards/margins": 0.44692516326904297, "rewards/rejected": -5.053445339202881, "sft_loss": 4.216244220733643, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 0.5329839703813494, "learning_rate": 8.439004111610945e-07, "logits/chosen": -0.5827174782752991, "logits/rejected": -0.5101505517959595, "logps/chosen": -4.813737869262695, "logps/rejected": -5.12241268157959, "loss": 0.0533, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.813737869262695, "rewards/margins": 0.30867481231689453, "rewards/rejected": -5.12241268157959, "sft_loss": 4.392908573150635, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 0.3367694694066443, "learning_rate": 8.427682269717901e-07, "logits/chosen": -0.6478679776191711, "logits/rejected": -0.4914635717868805, "logps/chosen": -4.561735153198242, "logps/rejected": -5.053101539611816, "loss": 0.0525, "rewards/accuracies": 0.6875, "rewards/chosen": -4.561735153198242, "rewards/margins": 0.49136656522750854, "rewards/rejected": -5.053101539611816, "sft_loss": 4.242537498474121, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 0.5834097301481967, "learning_rate": 8.416327172436446e-07, "logits/chosen": -0.8005746603012085, "logits/rejected": -0.6261088848114014, "logps/chosen": -4.756998538970947, "logps/rejected": -5.0415496826171875, "loss": 0.0542, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.756998538970947, "rewards/margins": 0.28455111384391785, "rewards/rejected": -5.0415496826171875, "sft_loss": 4.461302757263184, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 0.3973798220767986, "learning_rate": 8.404938929933778e-07, "logits/chosen": -0.6104685664176941, "logits/rejected": -0.4688517153263092, "logps/chosen": -4.621617794036865, "logps/rejected": -5.099360466003418, "loss": 0.052, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.621617794036865, "rewards/margins": 0.47774267196655273, "rewards/rejected": -5.099360466003418, "sft_loss": 4.235894203186035, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 0.3540909470534527, "learning_rate": 8.39351765269868e-07, "logits/chosen": -0.6760979890823364, "logits/rejected": -0.609889805316925, "logps/chosen": -4.65677547454834, "logps/rejected": -5.0093488693237305, "loss": 0.0539, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.65677547454834, "rewards/margins": 0.3525732159614563, "rewards/rejected": -5.0093488693237305, "sft_loss": 4.3292412757873535, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 0.7267303197089038, "learning_rate": 8.382063451540431e-07, "logits/chosen": -0.7155488729476929, "logits/rejected": -0.45979467034339905, "logps/chosen": -4.453604698181152, "logps/rejected": -4.879714488983154, "loss": 0.0533, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.453604698181152, "rewards/margins": 0.4261098802089691, "rewards/rejected": -4.879714488983154, "sft_loss": 4.265443801879883, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 0.4939611442573637, "learning_rate": 8.370576437587742e-07, "logits/chosen": -0.6435521841049194, "logits/rejected": -0.6209043264389038, "logps/chosen": -4.716995716094971, "logps/rejected": -5.006557464599609, "loss": 0.0531, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.716995716094971, "rewards/margins": 0.28956204652786255, "rewards/rejected": -5.006557464599609, "sft_loss": 4.35626220703125, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 0.41701206009092195, "learning_rate": 8.359056722287674e-07, "logits/chosen": -0.7903778553009033, "logits/rejected": -0.4564805030822754, "logps/chosen": -4.639558792114258, "logps/rejected": -5.135783672332764, "loss": 0.0524, "rewards/accuracies": 0.71875, "rewards/chosen": -4.639558792114258, "rewards/margins": 0.49622488021850586, "rewards/rejected": -5.135783672332764, "sft_loss": 4.399038791656494, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 0.7172297640459512, "learning_rate": 8.347504417404553e-07, "logits/chosen": -0.5581150054931641, "logits/rejected": -0.4032842218875885, "logps/chosen": -4.544151306152344, "logps/rejected": -4.86648416519165, "loss": 0.0542, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.544151306152344, "rewards/margins": 0.32233288884162903, "rewards/rejected": -4.86648416519165, "sft_loss": 4.188483238220215, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 0.5879181967847867, "learning_rate": 8.335919635018893e-07, "logits/chosen": -0.7893685102462769, "logits/rejected": -0.6554333567619324, "logps/chosen": -4.7575788497924805, "logps/rejected": -5.040469169616699, "loss": 0.0538, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.7575788497924805, "rewards/margins": 0.2828896641731262, "rewards/rejected": -5.040469169616699, "sft_loss": 4.464777946472168, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 0.43231433946425346, "learning_rate": 8.324302487526303e-07, "logits/chosen": -0.7919416427612305, "logits/rejected": -0.6639026403427124, "logps/chosen": -4.676050186157227, "logps/rejected": -5.0417985916137695, "loss": 0.0529, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.676050186157227, "rewards/margins": 0.36574864387512207, "rewards/rejected": -5.0417985916137695, "sft_loss": 4.378249168395996, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 0.37404157282655504, "learning_rate": 8.312653087636398e-07, "logits/chosen": -0.7558452486991882, "logits/rejected": -0.6526741981506348, "logps/chosen": -4.579874515533447, "logps/rejected": -4.968623161315918, "loss": 0.0534, "rewards/accuracies": 0.65625, "rewards/chosen": -4.579874515533447, "rewards/margins": 0.38874801993370056, "rewards/rejected": -4.968623161315918, "sft_loss": 4.276757717132568, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 0.36303918679572256, "learning_rate": 8.300971548371711e-07, "logits/chosen": -0.883111834526062, "logits/rejected": -0.602270781993866, "logps/chosen": -4.452870845794678, "logps/rejected": -4.911247730255127, "loss": 0.052, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.452870845794678, "rewards/margins": 0.45837679505348206, "rewards/rejected": -4.911247730255127, "sft_loss": 4.131048202514648, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 0.490560671457728, "learning_rate": 8.289257983066582e-07, "logits/chosen": -0.7516878843307495, "logits/rejected": -0.5508066415786743, "logps/chosen": -4.64019775390625, "logps/rejected": -5.095065593719482, "loss": 0.0528, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.64019775390625, "rewards/margins": 0.4548683166503906, "rewards/rejected": -5.095065593719482, "sft_loss": 4.342806339263916, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 0.2893276344274073, "learning_rate": 8.277512505366077e-07, "logits/chosen": -0.8057788014411926, "logits/rejected": -0.5403832197189331, "logps/chosen": -4.649588108062744, "logps/rejected": -5.086615085601807, "loss": 0.0531, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.649588108062744, "rewards/margins": 0.4370269179344177, "rewards/rejected": -5.086615085601807, "sft_loss": 4.376861095428467, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 0.7350489462981791, "learning_rate": 8.265735229224868e-07, "logits/chosen": -0.6775953769683838, "logits/rejected": -0.5745862722396851, "logps/chosen": -4.5659379959106445, "logps/rejected": -5.098782062530518, "loss": 0.052, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.5659379959106445, "rewards/margins": 0.5328438878059387, "rewards/rejected": -5.098782062530518, "sft_loss": 4.174136161804199, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 0.4516735988201357, "learning_rate": 8.253926268906144e-07, "logits/chosen": -0.7164157629013062, "logits/rejected": -0.5405601859092712, "logps/chosen": -4.749704837799072, "logps/rejected": -5.371539115905762, "loss": 0.0527, "rewards/accuracies": 0.6875, "rewards/chosen": -4.749704837799072, "rewards/margins": 0.6218348145484924, "rewards/rejected": -5.371539115905762, "sft_loss": 4.391979694366455, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 0.3510630604415655, "learning_rate": 8.242085738980487e-07, "logits/chosen": -0.5420058965682983, "logits/rejected": -0.2874351739883423, "logps/chosen": -4.528346061706543, "logps/rejected": -5.030544281005859, "loss": 0.0531, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.528346061706543, "rewards/margins": 0.5021986961364746, "rewards/rejected": -5.030544281005859, "sft_loss": 4.249731540679932, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 0.5538265414870637, "learning_rate": 8.230213754324772e-07, "logits/chosen": -0.6377596259117126, "logits/rejected": -0.5630909204483032, "logps/chosen": -4.5644450187683105, "logps/rejected": -4.965079307556152, "loss": 0.0539, "rewards/accuracies": 0.625, "rewards/chosen": -4.5644450187683105, "rewards/margins": 0.40063467621803284, "rewards/rejected": -4.965079307556152, "sft_loss": 4.360657215118408, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 0.2949614545223873, "learning_rate": 8.218310430121045e-07, "logits/chosen": -0.5675156712532043, "logits/rejected": -0.5481168031692505, "logps/chosen": -4.707152366638184, "logps/rejected": -4.992379188537598, "loss": 0.0539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.707152366638184, "rewards/margins": 0.2852264642715454, "rewards/rejected": -4.992379188537598, "sft_loss": 4.478783130645752, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 0.5543113807704609, "learning_rate": 8.20637588185541e-07, "logits/chosen": -0.5608028173446655, "logits/rejected": -0.46988582611083984, "logps/chosen": -4.442027568817139, "logps/rejected": -4.943470478057861, "loss": 0.0531, "rewards/accuracies": 0.65625, "rewards/chosen": -4.442027568817139, "rewards/margins": 0.5014427900314331, "rewards/rejected": -4.943470478057861, "sft_loss": 4.245416164398193, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 0.2866117529311254, "learning_rate": 8.194410225316906e-07, "logits/chosen": -0.7090498208999634, "logits/rejected": -0.5142877101898193, "logps/chosen": -4.616601467132568, "logps/rejected": -5.014645576477051, "loss": 0.0529, "rewards/accuracies": 0.65625, "rewards/chosen": -4.616601467132568, "rewards/margins": 0.39804330468177795, "rewards/rejected": -5.014645576477051, "sft_loss": 4.298870086669922, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 0.35295190473633326, "learning_rate": 8.182413576596385e-07, "logits/chosen": -0.5405132174491882, "logits/rejected": -0.48180103302001953, "logps/chosen": -4.607327938079834, "logps/rejected": -5.084053039550781, "loss": 0.0532, "rewards/accuracies": 0.65625, "rewards/chosen": -4.607327938079834, "rewards/margins": 0.4767250418663025, "rewards/rejected": -5.084053039550781, "sft_loss": 4.384486675262451, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 0.3776636814624771, "learning_rate": 8.170386052085389e-07, "logits/chosen": -0.5723873376846313, "logits/rejected": -0.44267210364341736, "logps/chosen": -4.660275459289551, "logps/rejected": -5.051526069641113, "loss": 0.0534, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.660275459289551, "rewards/margins": 0.39125025272369385, "rewards/rejected": -5.051526069641113, "sft_loss": 4.37464714050293, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 0.5159615230619127, "learning_rate": 8.158327768475008e-07, "logits/chosen": -0.6213208436965942, "logits/rejected": -0.4670190215110779, "logps/chosen": -4.665583610534668, "logps/rejected": -4.967374801635742, "loss": 0.0529, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.665583610534668, "rewards/margins": 0.30179107189178467, "rewards/rejected": -4.967374801635742, "sft_loss": 4.264622211456299, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 0.44400302792011925, "learning_rate": 8.146238842754767e-07, "logits/chosen": -0.6752325296401978, "logits/rejected": -0.5508004426956177, "logps/chosen": -4.6823554039001465, "logps/rejected": -5.140440940856934, "loss": 0.0533, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.6823554039001465, "rewards/margins": 0.45808520913124084, "rewards/rejected": -5.140440940856934, "sft_loss": 4.321152210235596, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 0.7057418884353844, "learning_rate": 8.134119392211476e-07, "logits/chosen": -0.5761233568191528, "logits/rejected": -0.38899824023246765, "logps/chosen": -4.757379055023193, "logps/rejected": -5.293605804443359, "loss": 0.0528, "rewards/accuracies": 0.6875, "rewards/chosen": -4.757379055023193, "rewards/margins": 0.5362268686294556, "rewards/rejected": -5.293605804443359, "sft_loss": 4.403469085693359, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 0.49240902562783034, "learning_rate": 8.121969534428094e-07, "logits/chosen": -0.7272932529449463, "logits/rejected": -0.5523107051849365, "logps/chosen": -4.743283271789551, "logps/rejected": -5.234428882598877, "loss": 0.0537, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.743283271789551, "rewards/margins": 0.4911455512046814, "rewards/rejected": -5.234428882598877, "sft_loss": 4.387631416320801, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": -0.23999834060668945, "eval_logits/rejected": -0.15501756966114044, "eval_logps/chosen": -4.426951885223389, "eval_logps/rejected": -4.892354488372803, "eval_loss": 0.051702603697776794, "eval_rewards/accuracies": 0.6468842625617981, "eval_rewards/chosen": -4.426951885223389, "eval_rewards/margins": 0.4654025435447693, "eval_rewards/rejected": -4.892354488372803, "eval_runtime": 43.2906, "eval_samples_per_second": 31.069, "eval_sft_loss": 3.9661672115325928, "eval_steps_per_second": 7.785, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 0.4600706793047822, "learning_rate": 8.109789387282599e-07, "logits/chosen": -0.6717931032180786, "logits/rejected": -0.6304608583450317, "logps/chosen": -4.575514793395996, "logps/rejected": -4.920340538024902, "loss": 0.0535, "rewards/accuracies": 0.625, "rewards/chosen": -4.575514793395996, "rewards/margins": 0.3448256552219391, "rewards/rejected": -4.920340538024902, "sft_loss": 4.244610786437988, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 0.3970379766667623, "learning_rate": 8.097579068946827e-07, "logits/chosen": -0.7990378141403198, "logits/rejected": -0.6704200506210327, "logps/chosen": -4.71152400970459, "logps/rejected": -5.189643859863281, "loss": 0.0522, "rewards/accuracies": 0.625, "rewards/chosen": -4.71152400970459, "rewards/margins": 0.4781200885772705, "rewards/rejected": -5.189643859863281, "sft_loss": 4.398639678955078, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 0.5944728609383647, "learning_rate": 8.085338697885344e-07, "logits/chosen": -0.7603497505187988, "logits/rejected": -0.6357791423797607, "logps/chosen": -4.704463481903076, "logps/rejected": -5.05355978012085, "loss": 0.0532, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.704463481903076, "rewards/margins": 0.3490960896015167, "rewards/rejected": -5.05355978012085, "sft_loss": 4.395869255065918, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 0.37605343362394694, "learning_rate": 8.073068392854282e-07, "logits/chosen": -0.826943039894104, "logits/rejected": -0.5462093949317932, "logps/chosen": -4.333527565002441, "logps/rejected": -4.886292457580566, "loss": 0.0514, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.333527565002441, "rewards/margins": 0.5527652502059937, "rewards/rejected": -4.886292457580566, "sft_loss": 4.111708641052246, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 0.7179662433757041, "learning_rate": 8.060768272900193e-07, "logits/chosen": -0.6332185864448547, "logits/rejected": -0.4485379755496979, "logps/chosen": -4.430613040924072, "logps/rejected": -4.984936237335205, "loss": 0.0525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.430613040924072, "rewards/margins": 0.5543233156204224, "rewards/rejected": -4.984936237335205, "sft_loss": 4.166750907897949, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 0.35942591017215597, "learning_rate": 8.0484384573589e-07, "logits/chosen": -0.6678962111473083, "logits/rejected": -0.6808444857597351, "logps/chosen": -4.590447425842285, "logps/rejected": -4.973687648773193, "loss": 0.0534, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.590447425842285, "rewards/margins": 0.3832399249076843, "rewards/rejected": -4.973687648773193, "sft_loss": 4.327738285064697, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 0.3819578692381291, "learning_rate": 8.03607906585432e-07, "logits/chosen": -0.7403281331062317, "logits/rejected": -0.5643646121025085, "logps/chosen": -4.80985164642334, "logps/rejected": -5.229228496551514, "loss": 0.0534, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.80985164642334, "rewards/margins": 0.4193764328956604, "rewards/rejected": -5.229228496551514, "sft_loss": 4.544407844543457, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 0.4776251797363461, "learning_rate": 8.023690218297329e-07, "logits/chosen": -0.6995172500610352, "logits/rejected": -0.6782066226005554, "logps/chosen": -4.589086532592773, "logps/rejected": -4.949246406555176, "loss": 0.0522, "rewards/accuracies": 0.625, "rewards/chosen": -4.589086532592773, "rewards/margins": 0.36015990376472473, "rewards/rejected": -4.949246406555176, "sft_loss": 4.224146842956543, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 0.7479474801927108, "learning_rate": 8.01127203488458e-07, "logits/chosen": -0.5383197665214539, "logits/rejected": -0.48996859788894653, "logps/chosen": -4.540228366851807, "logps/rejected": -5.0145792961120605, "loss": 0.0537, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.540228366851807, "rewards/margins": 0.47435110807418823, "rewards/rejected": -5.0145792961120605, "sft_loss": 4.176971435546875, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 0.4776403747608541, "learning_rate": 7.998824636097339e-07, "logits/chosen": -0.6831840872764587, "logits/rejected": -0.5318336486816406, "logps/chosen": -4.693176746368408, "logps/rejected": -5.142681121826172, "loss": 0.0541, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.693176746368408, "rewards/margins": 0.44950443506240845, "rewards/rejected": -5.142681121826172, "sft_loss": 4.48852014541626, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 0.5140746201417399, "learning_rate": 7.986348142700328e-07, "logits/chosen": -0.7236698269844055, "logits/rejected": -0.5705364346504211, "logps/chosen": -4.719154357910156, "logps/rejected": -5.148931980133057, "loss": 0.0535, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.719154357910156, "rewards/margins": 0.42977744340896606, "rewards/rejected": -5.148931980133057, "sft_loss": 4.52241325378418, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 0.4179464858196893, "learning_rate": 7.973842675740539e-07, "logits/chosen": -0.609150767326355, "logits/rejected": -0.5410366058349609, "logps/chosen": -4.482192039489746, "logps/rejected": -4.9732208251953125, "loss": 0.052, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.482192039489746, "rewards/margins": 0.49102896451950073, "rewards/rejected": -4.9732208251953125, "sft_loss": 4.188296318054199, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 0.5180053361561886, "learning_rate": 7.961308356546066e-07, "logits/chosen": -0.7119373083114624, "logits/rejected": -0.5679913759231567, "logps/chosen": -4.229228973388672, "logps/rejected": -4.735593318939209, "loss": 0.0523, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.229228973388672, "rewards/margins": 0.5063642263412476, "rewards/rejected": -4.735593318939209, "sft_loss": 3.9967262744903564, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 0.46718871726056116, "learning_rate": 7.948745306724931e-07, "logits/chosen": -0.7739778757095337, "logits/rejected": -0.6170490980148315, "logps/chosen": -4.635968208312988, "logps/rejected": -5.1852192878723145, "loss": 0.0518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.635968208312988, "rewards/margins": 0.549250602722168, "rewards/rejected": -5.1852192878723145, "sft_loss": 4.369393825531006, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 0.3984587230629792, "learning_rate": 7.936153648163897e-07, "logits/chosen": -0.8065184354782104, "logits/rejected": -0.7064257860183716, "logps/chosen": -4.752967834472656, "logps/rejected": -5.092961311340332, "loss": 0.0534, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.752967834472656, "rewards/margins": 0.33999359607696533, "rewards/rejected": -5.092961311340332, "sft_loss": 4.481800079345703, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 0.5695032759468597, "learning_rate": 7.92353350302729e-07, "logits/chosen": -0.7206646800041199, "logits/rejected": -0.548992931842804, "logps/chosen": -4.335890293121338, "logps/rejected": -4.820664882659912, "loss": 0.0531, "rewards/accuracies": 0.65625, "rewards/chosen": -4.335890293121338, "rewards/margins": 0.48477450013160706, "rewards/rejected": -4.820664882659912, "sft_loss": 4.050368785858154, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 0.5829877306762072, "learning_rate": 7.910884993755816e-07, "logits/chosen": -0.7600020170211792, "logits/rejected": -0.6812421679496765, "logps/chosen": -4.553534984588623, "logps/rejected": -5.294806480407715, "loss": 0.0522, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.553534984588623, "rewards/margins": 0.741270899772644, "rewards/rejected": -5.294806480407715, "sft_loss": 4.326718807220459, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 0.8262505274696786, "learning_rate": 7.898208243065367e-07, "logits/chosen": -0.7206606864929199, "logits/rejected": -0.7447828054428101, "logps/chosen": -4.787585258483887, "logps/rejected": -5.069046497344971, "loss": 0.0538, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.787585258483887, "rewards/margins": 0.2814616560935974, "rewards/rejected": -5.069046497344971, "sft_loss": 4.394356727600098, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 0.5172306289125376, "learning_rate": 7.88550337394583e-07, "logits/chosen": -0.8299128413200378, "logits/rejected": -0.6698902249336243, "logps/chosen": -4.428835391998291, "logps/rejected": -4.9404144287109375, "loss": 0.0525, "rewards/accuracies": 0.6875, "rewards/chosen": -4.428835391998291, "rewards/margins": 0.5115790367126465, "rewards/rejected": -4.9404144287109375, "sft_loss": 4.22437047958374, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 0.3443635655516794, "learning_rate": 7.872770509659905e-07, "logits/chosen": -0.6373104453086853, "logits/rejected": -0.5988988876342773, "logps/chosen": -4.619525909423828, "logps/rejected": -4.976244926452637, "loss": 0.0525, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.619525909423828, "rewards/margins": 0.3567189574241638, "rewards/rejected": -4.976244926452637, "sft_loss": 4.2509589195251465, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 0.3976039731943874, "learning_rate": 7.860009773741896e-07, "logits/chosen": -0.5825362801551819, "logits/rejected": -0.4224637448787689, "logps/chosen": -4.629540920257568, "logps/rejected": -5.204381465911865, "loss": 0.0522, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.629540920257568, "rewards/margins": 0.5748408436775208, "rewards/rejected": -5.204381465911865, "sft_loss": 4.34817361831665, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 0.39212041229796935, "learning_rate": 7.84722128999652e-07, "logits/chosen": -0.6080501675605774, "logits/rejected": -0.45994147658348083, "logps/chosen": -4.713656425476074, "logps/rejected": -5.265693187713623, "loss": 0.0538, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.713656425476074, "rewards/margins": 0.5520361661911011, "rewards/rejected": -5.265693187713623, "sft_loss": 4.4130964279174805, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 1.2673650048653755, "learning_rate": 7.834405182497699e-07, "logits/chosen": -0.49251309037208557, "logits/rejected": -0.4597319960594177, "logps/chosen": -4.524658203125, "logps/rejected": -4.95531702041626, "loss": 0.0544, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.524658203125, "rewards/margins": 0.43065857887268066, "rewards/rejected": -4.95531702041626, "sft_loss": 4.2953691482543945, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 0.5472533304467926, "learning_rate": 7.821561575587368e-07, "logits/chosen": -0.632826566696167, "logits/rejected": -0.6065362095832825, "logps/chosen": -4.617916584014893, "logps/rejected": -4.967799663543701, "loss": 0.0539, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.617916584014893, "rewards/margins": 0.34988293051719666, "rewards/rejected": -4.967799663543701, "sft_loss": 4.453722953796387, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 0.277069241699628, "learning_rate": 7.808690593874254e-07, "logits/chosen": -0.7417432069778442, "logits/rejected": -0.6385709047317505, "logps/chosen": -4.6788010597229, "logps/rejected": -5.170589923858643, "loss": 0.0548, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.6788010597229, "rewards/margins": 0.491788774728775, "rewards/rejected": -5.170589923858643, "sft_loss": 4.516565322875977, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 0.47122826813421975, "learning_rate": 7.79579236223268e-07, "logits/chosen": -0.6222350001335144, "logits/rejected": -0.3234289288520813, "logps/chosen": -4.440484523773193, "logps/rejected": -4.997524261474609, "loss": 0.0512, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.440484523773193, "rewards/margins": 0.5570399165153503, "rewards/rejected": -4.997524261474609, "sft_loss": 4.1651482582092285, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 0.44261863106415245, "learning_rate": 7.782867005801346e-07, "logits/chosen": -0.6503061056137085, "logits/rejected": -0.42080339789390564, "logps/chosen": -4.290419578552246, "logps/rejected": -4.850780963897705, "loss": 0.0529, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.290419578552246, "rewards/margins": 0.560361385345459, "rewards/rejected": -4.850780963897705, "sft_loss": 4.068234443664551, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 0.6257258204652094, "learning_rate": 7.769914649982117e-07, "logits/chosen": -0.6751757264137268, "logits/rejected": -0.5070086717605591, "logps/chosen": -4.67188024520874, "logps/rejected": -5.1316142082214355, "loss": 0.0526, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.67188024520874, "rewards/margins": 0.4597338140010834, "rewards/rejected": -5.1316142082214355, "sft_loss": 4.425201416015625, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 0.5302938713487861, "learning_rate": 7.756935420438803e-07, "logits/chosen": -0.6334782838821411, "logits/rejected": -0.5502496957778931, "logps/chosen": -4.623457908630371, "logps/rejected": -5.174169540405273, "loss": 0.0517, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.623457908630371, "rewards/margins": 0.5507121086120605, "rewards/rejected": -5.174169540405273, "sft_loss": 4.292878150939941, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 0.6070596328759367, "learning_rate": 7.743929443095951e-07, "logits/chosen": -0.6257916688919067, "logits/rejected": -0.5801526308059692, "logps/chosen": -4.365163326263428, "logps/rejected": -4.9333815574646, "loss": 0.0531, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.365163326263428, "rewards/margins": 0.568219006061554, "rewards/rejected": -4.9333815574646, "sft_loss": 4.0889482498168945, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 0.6839084267850211, "learning_rate": 7.730896844137609e-07, "logits/chosen": -0.6672028303146362, "logits/rejected": -0.5766544342041016, "logps/chosen": -4.858401775360107, "logps/rejected": -5.171161651611328, "loss": 0.0538, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.858401775360107, "rewards/margins": 0.3127599358558655, "rewards/rejected": -5.171161651611328, "sft_loss": 4.545053005218506, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 0.3568908807824978, "learning_rate": 7.717837750006106e-07, "logits/chosen": -0.7871010303497314, "logits/rejected": -0.7272329330444336, "logps/chosen": -4.782454490661621, "logps/rejected": -5.240635871887207, "loss": 0.0531, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.782454490661621, "rewards/margins": 0.45818084478378296, "rewards/rejected": -5.240635871887207, "sft_loss": 4.542524337768555, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 0.5990662498663287, "learning_rate": 7.704752287400832e-07, "logits/chosen": -0.73150235414505, "logits/rejected": -0.5180791020393372, "logps/chosen": -4.3465166091918945, "logps/rejected": -4.793034076690674, "loss": 0.0532, "rewards/accuracies": 0.6875, "rewards/chosen": -4.3465166091918945, "rewards/margins": 0.44651785492897034, "rewards/rejected": -4.793034076690674, "sft_loss": 4.138575553894043, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 0.27564186929080403, "learning_rate": 7.691640583277004e-07, "logits/chosen": -0.6673997640609741, "logits/rejected": -0.48911604285240173, "logps/chosen": -4.416393756866455, "logps/rejected": -4.9265360832214355, "loss": 0.0521, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.416393756866455, "rewards/margins": 0.5101426839828491, "rewards/rejected": -4.9265360832214355, "sft_loss": 4.156350135803223, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 0.4813794132035719, "learning_rate": 7.678502764844433e-07, "logits/chosen": -0.7979347705841064, "logits/rejected": -0.5554312467575073, "logps/chosen": -4.576869964599609, "logps/rejected": -4.963016986846924, "loss": 0.0526, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.576869964599609, "rewards/margins": 0.3861469328403473, "rewards/rejected": -4.963016986846924, "sft_loss": 4.301800727844238, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 0.3189556223539962, "learning_rate": 7.665338959566288e-07, "logits/chosen": -0.68720543384552, "logits/rejected": -0.6039665937423706, "logps/chosen": -4.594615459442139, "logps/rejected": -5.138156414031982, "loss": 0.0517, "rewards/accuracies": 0.71875, "rewards/chosen": -4.594615459442139, "rewards/margins": 0.5435405969619751, "rewards/rejected": -5.138156414031982, "sft_loss": 4.311945915222168, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 0.7522816530936208, "learning_rate": 7.652149295157868e-07, "logits/chosen": -0.5866094827651978, "logits/rejected": -0.40514469146728516, "logps/chosen": -4.510667324066162, "logps/rejected": -4.878499984741211, "loss": 0.053, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.510667324066162, "rewards/margins": 0.3678319752216339, "rewards/rejected": -4.878499984741211, "sft_loss": 4.197674751281738, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 0.4463112427262321, "learning_rate": 7.638933899585354e-07, "logits/chosen": -0.44559282064437866, "logits/rejected": -0.45199769735336304, "logps/chosen": -4.532196998596191, "logps/rejected": -5.0780205726623535, "loss": 0.0524, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.532196998596191, "rewards/margins": 0.5458240509033203, "rewards/rejected": -5.0780205726623535, "sft_loss": 4.232436180114746, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 0.457643428777109, "learning_rate": 7.625692901064573e-07, "logits/chosen": -0.6123597025871277, "logits/rejected": -0.5176479816436768, "logps/chosen": -4.659605026245117, "logps/rejected": -5.072868347167969, "loss": 0.0551, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -4.659605026245117, "rewards/margins": 0.41326403617858887, "rewards/rejected": -5.072868347167969, "sft_loss": 4.42981481552124, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 0.39494702241247864, "learning_rate": 7.61242642805975e-07, "logits/chosen": -0.7008353471755981, "logits/rejected": -0.7373430132865906, "logps/chosen": -4.8912858963012695, "logps/rejected": -5.152310848236084, "loss": 0.0543, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.8912858963012695, "rewards/margins": 0.2610251009464264, "rewards/rejected": -5.152310848236084, "sft_loss": 4.551990509033203, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 0.3579528644218798, "learning_rate": 7.599134609282266e-07, "logits/chosen": -0.8311487436294556, "logits/rejected": -0.5965268015861511, "logps/chosen": -4.442378044128418, "logps/rejected": -4.958862781524658, "loss": 0.053, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.442378044128418, "rewards/margins": 0.516484797000885, "rewards/rejected": -4.958862781524658, "sft_loss": 4.260416030883789, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 0.44236643382684815, "learning_rate": 7.585817573689402e-07, "logits/chosen": -0.7978760600090027, "logits/rejected": -0.6854046583175659, "logps/chosen": -4.419394016265869, "logps/rejected": -5.014368057250977, "loss": 0.0518, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.419394016265869, "rewards/margins": 0.5949746370315552, "rewards/rejected": -5.014368057250977, "sft_loss": 4.203858852386475, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 0.4060038941710105, "learning_rate": 7.572475450483098e-07, "logits/chosen": -0.704419732093811, "logits/rejected": -0.5912919044494629, "logps/chosen": -4.481686115264893, "logps/rejected": -5.0178141593933105, "loss": 0.0522, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.481686115264893, "rewards/margins": 0.5361284017562866, "rewards/rejected": -5.0178141593933105, "sft_loss": 4.2278852462768555, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 0.5070512938760626, "learning_rate": 7.559108369108689e-07, "logits/chosen": -0.7109116315841675, "logits/rejected": -0.5894996523857117, "logps/chosen": -4.390748500823975, "logps/rejected": -4.943523406982422, "loss": 0.0529, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.390748500823975, "rewards/margins": 0.5527750253677368, "rewards/rejected": -4.943523406982422, "sft_loss": 4.148125648498535, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 0.5159637234340495, "learning_rate": 7.54571645925366e-07, "logits/chosen": -0.8002594709396362, "logits/rejected": -0.5314058661460876, "logps/chosen": -4.7372260093688965, "logps/rejected": -5.4383015632629395, "loss": 0.0517, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.7372260093688965, "rewards/margins": 0.7010757327079773, "rewards/rejected": -5.4383015632629395, "sft_loss": 4.354250907897949, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 0.45824704280534556, "learning_rate": 7.532299850846378e-07, "logits/chosen": -0.7148901224136353, "logits/rejected": -0.5222693681716919, "logps/chosen": -4.647156715393066, "logps/rejected": -5.444609642028809, "loss": 0.0521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.647156715393066, "rewards/margins": 0.7974528074264526, "rewards/rejected": -5.444609642028809, "sft_loss": 4.288125038146973, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 0.6411370012537118, "learning_rate": 7.518858674054838e-07, "logits/chosen": -0.658902645111084, "logits/rejected": -0.44083452224731445, "logps/chosen": -4.602587699890137, "logps/rejected": -5.086310386657715, "loss": 0.0529, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.602587699890137, "rewards/margins": 0.48372262716293335, "rewards/rejected": -5.086310386657715, "sft_loss": 4.2280964851379395, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 0.4630768053585857, "learning_rate": 7.505393059285394e-07, "logits/chosen": -0.7535982728004456, "logits/rejected": -0.5658230781555176, "logps/chosen": -4.64790153503418, "logps/rejected": -5.012650966644287, "loss": 0.0535, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.64790153503418, "rewards/margins": 0.36474916338920593, "rewards/rejected": -5.012650966644287, "sft_loss": 4.332772254943848, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 0.47993843494898725, "learning_rate": 7.491903137181501e-07, "logits/chosen": -0.7029468417167664, "logits/rejected": -0.6749275326728821, "logps/chosen": -4.604931831359863, "logps/rejected": -4.9890336990356445, "loss": 0.0536, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.604931831359863, "rewards/margins": 0.3841017186641693, "rewards/rejected": -4.9890336990356445, "sft_loss": 4.398990631103516, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 0.41159973312127757, "learning_rate": 7.478389038622441e-07, "logits/chosen": -0.6323720216751099, "logits/rejected": -0.6110953092575073, "logps/chosen": -4.512185096740723, "logps/rejected": -4.970728874206543, "loss": 0.0522, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.512185096740723, "rewards/margins": 0.45854368805885315, "rewards/rejected": -4.970728874206543, "sft_loss": 4.247210502624512, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 0.4682841990616749, "learning_rate": 7.46485089472206e-07, "logits/chosen": -0.7587865591049194, "logits/rejected": -0.7158206701278687, "logps/chosen": -4.485775947570801, "logps/rejected": -4.8737568855285645, "loss": 0.0537, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.485775947570801, "rewards/margins": 0.3879804015159607, "rewards/rejected": -4.8737568855285645, "sft_loss": 4.248816013336182, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 0.44708077693442627, "learning_rate": 7.451288836827487e-07, "logits/chosen": -0.7529920339584351, "logits/rejected": -0.7776705622673035, "logps/chosen": -4.722033500671387, "logps/rejected": -5.016490459442139, "loss": 0.0538, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.722033500671387, "rewards/margins": 0.2944570481777191, "rewards/rejected": -5.016490459442139, "sft_loss": 4.490199089050293, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 0.5230816480612343, "learning_rate": 7.437702996517869e-07, "logits/chosen": -0.80247563123703, "logits/rejected": -0.7050091028213501, "logps/chosen": -4.602765083312988, "logps/rejected": -4.979498863220215, "loss": 0.0529, "rewards/accuracies": 0.6875, "rewards/chosen": -4.602765083312988, "rewards/margins": 0.37673383951187134, "rewards/rejected": -4.979498863220215, "sft_loss": 4.365364074707031, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 0.4773651019537365, "learning_rate": 7.424093505603087e-07, "logits/chosen": -0.8925487399101257, "logits/rejected": -0.6971911191940308, "logps/chosen": -4.3991804122924805, "logps/rejected": -4.9190897941589355, "loss": 0.0517, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.3991804122924805, "rewards/margins": 0.5199095010757446, "rewards/rejected": -4.9190897941589355, "sft_loss": 4.129302024841309, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 0.5743721034439623, "learning_rate": 7.410460496122482e-07, "logits/chosen": -0.7015866637229919, "logits/rejected": -0.5496960878372192, "logps/chosen": -4.501893043518066, "logps/rejected": -5.030624866485596, "loss": 0.0515, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.501893043518066, "rewards/margins": 0.5287320017814636, "rewards/rejected": -5.030624866485596, "sft_loss": 4.109658718109131, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 0.5284106323606012, "learning_rate": 7.396804100343572e-07, "logits/chosen": -0.8011897206306458, "logits/rejected": -0.5881227254867554, "logps/chosen": -4.559887409210205, "logps/rejected": -5.120944023132324, "loss": 0.0513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.559887409210205, "rewards/margins": 0.5610562562942505, "rewards/rejected": -5.120944023132324, "sft_loss": 4.201657295227051, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 0.4225976306052054, "learning_rate": 7.383124450760768e-07, "logits/chosen": -0.7284170985221863, "logits/rejected": -0.5309914946556091, "logps/chosen": -4.601205348968506, "logps/rejected": -5.269949913024902, "loss": 0.0518, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.601205348968506, "rewards/margins": 0.668745219707489, "rewards/rejected": -5.269949913024902, "sft_loss": 4.3044843673706055, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 0.7224182412559759, "learning_rate": 7.369421680094091e-07, "logits/chosen": -0.7828829884529114, "logits/rejected": -0.6176687479019165, "logps/chosen": -4.525793552398682, "logps/rejected": -5.050825119018555, "loss": 0.0525, "rewards/accuracies": 0.65625, "rewards/chosen": -4.525793552398682, "rewards/margins": 0.5250317454338074, "rewards/rejected": -5.050825119018555, "sft_loss": 4.251994609832764, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 0.629047772424287, "learning_rate": 7.355695921287881e-07, "logits/chosen": -0.7749170064926147, "logits/rejected": -0.6841250061988831, "logps/chosen": -4.591757297515869, "logps/rejected": -5.037539482116699, "loss": 0.0536, "rewards/accuracies": 0.65625, "rewards/chosen": -4.591757297515869, "rewards/margins": 0.4457823634147644, "rewards/rejected": -5.037539482116699, "sft_loss": 4.263556480407715, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 0.33053420351210816, "learning_rate": 7.341947307509513e-07, "logits/chosen": -0.7545934915542603, "logits/rejected": -0.6486998796463013, "logps/chosen": -4.742500305175781, "logps/rejected": -5.1753973960876465, "loss": 0.053, "rewards/accuracies": 0.65625, "rewards/chosen": -4.742500305175781, "rewards/margins": 0.4328971803188324, "rewards/rejected": -5.1753973960876465, "sft_loss": 4.4836931228637695, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 0.4071489993833283, "learning_rate": 7.328175972148094e-07, "logits/chosen": -0.7530182600021362, "logits/rejected": -0.6168416142463684, "logps/chosen": -4.746242523193359, "logps/rejected": -5.2826247215271, "loss": 0.0534, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.746242523193359, "rewards/margins": 0.5363827347755432, "rewards/rejected": -5.2826247215271, "sft_loss": 4.459273815155029, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 0.5441597534623708, "learning_rate": 7.314382048813185e-07, "logits/chosen": -0.7168510556221008, "logits/rejected": -0.43174856901168823, "logps/chosen": -4.272377967834473, "logps/rejected": -4.978228569030762, "loss": 0.0509, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.272377967834473, "rewards/margins": 0.7058510780334473, "rewards/rejected": -4.978228569030762, "sft_loss": 4.000433444976807, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 0.6252127343519807, "learning_rate": 7.300565671333486e-07, "logits/chosen": -0.8265350461006165, "logits/rejected": -0.6041692495346069, "logps/chosen": -4.5154876708984375, "logps/rejected": -5.120383262634277, "loss": 0.0516, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.5154876708984375, "rewards/margins": 0.6048959493637085, "rewards/rejected": -5.120383262634277, "sft_loss": 4.2072625160217285, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 0.4222806528582792, "learning_rate": 7.286726973755554e-07, "logits/chosen": -0.7032849192619324, "logits/rejected": -0.6845365762710571, "logps/chosen": -4.55859375, "logps/rejected": -5.039181709289551, "loss": 0.0526, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.55859375, "rewards/margins": 0.4805881083011627, "rewards/rejected": -5.039181709289551, "sft_loss": 4.184603691101074, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 0.5106826580742952, "learning_rate": 7.272866090342493e-07, "logits/chosen": -0.5765025615692139, "logits/rejected": -0.5348777770996094, "logps/chosen": -4.575560569763184, "logps/rejected": -5.200422286987305, "loss": 0.0513, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.575560569763184, "rewards/margins": 0.6248610615730286, "rewards/rejected": -5.200422286987305, "sft_loss": 4.18874979019165, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 0.36219022238136406, "learning_rate": 7.258983155572656e-07, "logits/chosen": -0.7583931684494019, "logits/rejected": -0.6806284189224243, "logps/chosen": -4.756449222564697, "logps/rejected": -5.231799125671387, "loss": 0.0529, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.756449222564697, "rewards/margins": 0.4753497540950775, "rewards/rejected": -5.231799125671387, "sft_loss": 4.347174644470215, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 0.7028939653486833, "learning_rate": 7.245078304138335e-07, "logits/chosen": -0.676438570022583, "logits/rejected": -0.617641270160675, "logps/chosen": -4.50102424621582, "logps/rejected": -5.152174472808838, "loss": 0.052, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.50102424621582, "rewards/margins": 0.6511501669883728, "rewards/rejected": -5.152174472808838, "sft_loss": 4.230152606964111, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 0.34441628678949693, "learning_rate": 7.231151670944462e-07, "logits/chosen": -0.9201499223709106, "logits/rejected": -0.6691521406173706, "logps/chosen": -4.548752307891846, "logps/rejected": -5.115777969360352, "loss": 0.0525, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.548752307891846, "rewards/margins": 0.5670259594917297, "rewards/rejected": -5.115777969360352, "sft_loss": 4.256083965301514, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 0.36028138123982145, "learning_rate": 7.217203391107291e-07, "logits/chosen": -0.8337070345878601, "logits/rejected": -0.6574937105178833, "logps/chosen": -4.54742431640625, "logps/rejected": -5.139768600463867, "loss": 0.0526, "rewards/accuracies": 0.625, "rewards/chosen": -4.54742431640625, "rewards/margins": 0.5923444032669067, "rewards/rejected": -5.139768600463867, "sft_loss": 4.214510440826416, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 0.5578957686131796, "learning_rate": 7.203233599953096e-07, "logits/chosen": -0.8540999293327332, "logits/rejected": -0.6997717618942261, "logps/chosen": -4.4829535484313965, "logps/rejected": -5.033291816711426, "loss": 0.0524, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.4829535484313965, "rewards/margins": 0.5503381490707397, "rewards/rejected": -5.033291816711426, "sft_loss": 4.220416069030762, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 0.4751570065353926, "learning_rate": 7.189242433016852e-07, "logits/chosen": -0.778272271156311, "logits/rejected": -0.6525672674179077, "logps/chosen": -4.632129669189453, "logps/rejected": -5.244833946228027, "loss": 0.0526, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.632129669189453, "rewards/margins": 0.6127038598060608, "rewards/rejected": -5.244833946228027, "sft_loss": 4.3556694984436035, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 0.49060819947586853, "learning_rate": 7.17523002604092e-07, "logits/chosen": -0.8550931811332703, "logits/rejected": -0.6443161368370056, "logps/chosen": -4.523386001586914, "logps/rejected": -5.068552017211914, "loss": 0.0526, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.523386001586914, "rewards/margins": 0.5451655983924866, "rewards/rejected": -5.068552017211914, "sft_loss": 4.249871253967285, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 0.8862204290910091, "learning_rate": 7.161196514973734e-07, "logits/chosen": -0.7608314752578735, "logits/rejected": -0.5959222912788391, "logps/chosen": -4.416409969329834, "logps/rejected": -5.110037326812744, "loss": 0.0538, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -4.416409969329834, "rewards/margins": 0.6936279535293579, "rewards/rejected": -5.110037326812744, "sft_loss": 4.177985191345215, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 0.36966512161661286, "learning_rate": 7.147142035968483e-07, "logits/chosen": -0.7292311787605286, "logits/rejected": -0.5311886668205261, "logps/chosen": -4.622040748596191, "logps/rejected": -5.120987415313721, "loss": 0.053, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.622040748596191, "rewards/margins": 0.49894601106643677, "rewards/rejected": -5.120987415313721, "sft_loss": 4.340848445892334, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 0.6284592603398197, "learning_rate": 7.133066725381781e-07, "logits/chosen": -0.9135788679122925, "logits/rejected": -0.701241135597229, "logps/chosen": -4.508200645446777, "logps/rejected": -5.073288917541504, "loss": 0.0527, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.508200645446777, "rewards/margins": 0.565088152885437, "rewards/rejected": -5.073288917541504, "sft_loss": 4.304009437561035, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 0.4830072872353554, "learning_rate": 7.118970719772354e-07, "logits/chosen": -0.8243740200996399, "logits/rejected": -0.6196537017822266, "logps/chosen": -4.614374160766602, "logps/rejected": -5.266146659851074, "loss": 0.0523, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.614374160766602, "rewards/margins": 0.651772141456604, "rewards/rejected": -5.266146659851074, "sft_loss": 4.351268768310547, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 0.4899922503825756, "learning_rate": 7.104854155899711e-07, "logits/chosen": -0.676455020904541, "logits/rejected": -0.5914028286933899, "logps/chosen": -4.441977024078369, "logps/rejected": -4.935439109802246, "loss": 0.0524, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.441977024078369, "rewards/margins": 0.4934620261192322, "rewards/rejected": -4.935439109802246, "sft_loss": 4.1658124923706055, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 0.4339700877673433, "learning_rate": 7.090717170722817e-07, "logits/chosen": -0.6420737504959106, "logits/rejected": -0.6251760721206665, "logps/chosen": -4.5333662033081055, "logps/rejected": -5.1688055992126465, "loss": 0.0515, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.5333662033081055, "rewards/margins": 0.6354392766952515, "rewards/rejected": -5.1688055992126465, "sft_loss": 4.251795768737793, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 0.5061167164386356, "learning_rate": 7.076559901398762e-07, "logits/chosen": -0.7426393628120422, "logits/rejected": -0.6046496629714966, "logps/chosen": -4.450405120849609, "logps/rejected": -4.918197154998779, "loss": 0.0527, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.450405120849609, "rewards/margins": 0.46779197454452515, "rewards/rejected": -4.918197154998779, "sft_loss": 4.215243339538574, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 0.5037524653154726, "learning_rate": 7.062382485281436e-07, "logits/chosen": -0.64927077293396, "logits/rejected": -0.513473391532898, "logps/chosen": -4.409026145935059, "logps/rejected": -4.936314105987549, "loss": 0.0533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.409026145935059, "rewards/margins": 0.5272881388664246, "rewards/rejected": -4.936314105987549, "sft_loss": 4.165338039398193, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": -0.2459660917520523, "eval_logits/rejected": -0.15563316643238068, "eval_logps/chosen": -4.822916507720947, "eval_logps/rejected": -5.425734519958496, "eval_loss": 0.051439397037029266, "eval_rewards/accuracies": 0.6632047295570374, "eval_rewards/chosen": -4.822916507720947, "eval_rewards/margins": 0.6028181910514832, "eval_rewards/rejected": -5.425734519958496, "eval_runtime": 43.403, "eval_samples_per_second": 30.989, "eval_sft_loss": 4.406880855560303, "eval_steps_per_second": 7.764, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 0.7512180519376704, "learning_rate": 7.048185059920193e-07, "logits/chosen": -0.6440736651420593, "logits/rejected": -0.5108321905136108, "logps/chosen": -4.895911693572998, "logps/rejected": -5.490865230560303, "loss": 0.0528, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.895911693572998, "rewards/margins": 0.5949534773826599, "rewards/rejected": -5.490865230560303, "sft_loss": 4.582104206085205, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 1.0070888677024856, "learning_rate": 7.033967763058516e-07, "logits/chosen": -0.722671389579773, "logits/rejected": -0.5478376150131226, "logps/chosen": -4.615888595581055, "logps/rejected": -5.002371788024902, "loss": 0.0521, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.615888595581055, "rewards/margins": 0.386482298374176, "rewards/rejected": -5.002371788024902, "sft_loss": 4.239572048187256, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 0.4258529079691424, "learning_rate": 7.019730732632681e-07, "logits/chosen": -0.5446482300758362, "logits/rejected": -0.43018198013305664, "logps/chosen": -4.422608375549316, "logps/rejected": -5.125778675079346, "loss": 0.0518, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.422608375549316, "rewards/margins": 0.7031702995300293, "rewards/rejected": -5.125778675079346, "sft_loss": 4.148711681365967, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 0.35467819540066825, "learning_rate": 7.005474106770418e-07, "logits/chosen": -0.6596757173538208, "logits/rejected": -0.5504968166351318, "logps/chosen": -4.54642391204834, "logps/rejected": -5.045216083526611, "loss": 0.0521, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.54642391204834, "rewards/margins": 0.4987919330596924, "rewards/rejected": -5.045216083526611, "sft_loss": 4.238245964050293, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 1.3599351436307947, "learning_rate": 6.991198023789577e-07, "logits/chosen": -0.5905810594558716, "logits/rejected": -0.5072682499885559, "logps/chosen": -4.447530269622803, "logps/rejected": -4.882772445678711, "loss": 0.0538, "rewards/accuracies": 0.625, "rewards/chosen": -4.447530269622803, "rewards/margins": 0.43524178862571716, "rewards/rejected": -4.882772445678711, "sft_loss": 4.207924842834473, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 0.5025417009492776, "learning_rate": 6.976902622196776e-07, "logits/chosen": -0.6522349119186401, "logits/rejected": -0.5629934072494507, "logps/chosen": -4.689788818359375, "logps/rejected": -5.103816509246826, "loss": 0.0537, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.689788818359375, "rewards/margins": 0.41402775049209595, "rewards/rejected": -5.103816509246826, "sft_loss": 4.374016761779785, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 0.33461264496064647, "learning_rate": 6.962588040686064e-07, "logits/chosen": -0.7623521089553833, "logits/rejected": -0.5751533508300781, "logps/chosen": -4.745035171508789, "logps/rejected": -5.1734185218811035, "loss": 0.0537, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.745035171508789, "rewards/margins": 0.42838358879089355, "rewards/rejected": -5.1734185218811035, "sft_loss": 4.583779811859131, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 0.3503261781496817, "learning_rate": 6.948254418137573e-07, "logits/chosen": -0.712530255317688, "logits/rejected": -0.5704913139343262, "logps/chosen": -4.597674369812012, "logps/rejected": -5.1268134117126465, "loss": 0.0526, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.597674369812012, "rewards/margins": 0.5291392803192139, "rewards/rejected": -5.1268134117126465, "sft_loss": 4.329079627990723, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 0.39441341051128054, "learning_rate": 6.933901893616174e-07, "logits/chosen": -0.7677714824676514, "logits/rejected": -0.6126347184181213, "logps/chosen": -4.438683032989502, "logps/rejected": -5.042284965515137, "loss": 0.0519, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.438683032989502, "rewards/margins": 0.6036025881767273, "rewards/rejected": -5.042284965515137, "sft_loss": 4.119725227355957, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 0.3927956162770441, "learning_rate": 6.919530606370121e-07, "logits/chosen": -0.7254393696784973, "logits/rejected": -0.5506623983383179, "logps/chosen": -4.357143402099609, "logps/rejected": -4.954403877258301, "loss": 0.0517, "rewards/accuracies": 0.6875, "rewards/chosen": -4.357143402099609, "rewards/margins": 0.5972608327865601, "rewards/rejected": -4.954403877258301, "sft_loss": 4.115630149841309, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 0.3738986864645462, "learning_rate": 6.905140695829706e-07, "logits/chosen": -0.7687052488327026, "logits/rejected": -0.46032652258872986, "logps/chosen": -4.44050407409668, "logps/rejected": -5.113463878631592, "loss": 0.0514, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.44050407409668, "rewards/margins": 0.6729599237442017, "rewards/rejected": -5.113463878631592, "sft_loss": 4.17454719543457, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 0.5571532095288572, "learning_rate": 6.890732301605904e-07, "logits/chosen": -0.6341904401779175, "logits/rejected": -0.5448669195175171, "logps/chosen": -4.6078996658325195, "logps/rejected": -5.000533103942871, "loss": 0.0531, "rewards/accuracies": 0.625, "rewards/chosen": -4.6078996658325195, "rewards/margins": 0.3926334083080292, "rewards/rejected": -5.000533103942871, "sft_loss": 4.291760444641113, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 0.3892123944114807, "learning_rate": 6.876305563489021e-07, "logits/chosen": -0.6441881060600281, "logits/rejected": -0.5762056112289429, "logps/chosen": -4.400938510894775, "logps/rejected": -5.1529011726379395, "loss": 0.0504, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.400938510894775, "rewards/margins": 0.7519630193710327, "rewards/rejected": -5.1529011726379395, "sft_loss": 4.062161445617676, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 0.5043097198631741, "learning_rate": 6.861860621447331e-07, "logits/chosen": -0.7821289300918579, "logits/rejected": -0.6575089693069458, "logps/chosen": -4.7076544761657715, "logps/rejected": -5.158883094787598, "loss": 0.0539, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.7076544761657715, "rewards/margins": 0.451228529214859, "rewards/rejected": -5.158883094787598, "sft_loss": 4.450681686401367, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 0.5241208894891419, "learning_rate": 6.847397615625725e-07, "logits/chosen": -0.6322156190872192, "logits/rejected": -0.5924761295318604, "logps/chosen": -4.701780796051025, "logps/rejected": -5.140883922576904, "loss": 0.0522, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.701780796051025, "rewards/margins": 0.43910354375839233, "rewards/rejected": -5.140883922576904, "sft_loss": 4.368111610412598, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 0.5530610503614979, "learning_rate": 6.83291668634435e-07, "logits/chosen": -0.7902230024337769, "logits/rejected": -0.5947554111480713, "logps/chosen": -4.581168174743652, "logps/rejected": -5.159932613372803, "loss": 0.0522, "rewards/accuracies": 0.71875, "rewards/chosen": -4.581168174743652, "rewards/margins": 0.5787646174430847, "rewards/rejected": -5.159932613372803, "sft_loss": 4.308629512786865, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 0.5053924424200537, "learning_rate": 6.818417974097246e-07, "logits/chosen": -0.5258646607398987, "logits/rejected": -0.38841742277145386, "logps/chosen": -4.452823638916016, "logps/rejected": -5.15665864944458, "loss": 0.0516, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.452823638916016, "rewards/margins": 0.7038346529006958, "rewards/rejected": -5.15665864944458, "sft_loss": 4.177115440368652, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 0.46316275553685127, "learning_rate": 6.803901619550981e-07, "logits/chosen": -0.6718374490737915, "logits/rejected": -0.6161580085754395, "logps/chosen": -4.422329902648926, "logps/rejected": -4.936065673828125, "loss": 0.0529, "rewards/accuracies": 0.6875, "rewards/chosen": -4.422329902648926, "rewards/margins": 0.5137358903884888, "rewards/rejected": -4.936065673828125, "sft_loss": 4.147505283355713, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 0.5381914488734175, "learning_rate": 6.789367763543292e-07, "logits/chosen": -0.6015291213989258, "logits/rejected": -0.5835751295089722, "logps/chosen": -4.570321559906006, "logps/rejected": -5.038084983825684, "loss": 0.0544, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.570321559906006, "rewards/margins": 0.467763751745224, "rewards/rejected": -5.038084983825684, "sft_loss": 4.3525590896606445, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 0.7920108460285353, "learning_rate": 6.774816547081714e-07, "logits/chosen": -0.6614540815353394, "logits/rejected": -0.4659046232700348, "logps/chosen": -4.600603103637695, "logps/rejected": -5.15115213394165, "loss": 0.0535, "rewards/accuracies": 0.59375, "rewards/chosen": -4.600603103637695, "rewards/margins": 0.5505493879318237, "rewards/rejected": -5.15115213394165, "sft_loss": 4.411740303039551, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 0.32826827986852447, "learning_rate": 6.760248111342211e-07, "logits/chosen": -0.7377767562866211, "logits/rejected": -0.5531325340270996, "logps/chosen": -4.688912868499756, "logps/rejected": -5.238001823425293, "loss": 0.0523, "rewards/accuracies": 0.6875, "rewards/chosen": -4.688912868499756, "rewards/margins": 0.5490891337394714, "rewards/rejected": -5.238001823425293, "sft_loss": 4.432499885559082, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 0.6428193005965454, "learning_rate": 6.745662597667813e-07, "logits/chosen": -0.750898540019989, "logits/rejected": -0.6026118993759155, "logps/chosen": -4.439126014709473, "logps/rejected": -4.934445381164551, "loss": 0.0519, "rewards/accuracies": 0.6875, "rewards/chosen": -4.439126014709473, "rewards/margins": 0.49531856179237366, "rewards/rejected": -4.934445381164551, "sft_loss": 4.148636817932129, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 0.38982972965941254, "learning_rate": 6.731060147567236e-07, "logits/chosen": -0.5894494652748108, "logits/rejected": -0.5215369462966919, "logps/chosen": -4.36592960357666, "logps/rejected": -4.934881210327148, "loss": 0.0519, "rewards/accuracies": 0.6875, "rewards/chosen": -4.36592960357666, "rewards/margins": 0.5689516067504883, "rewards/rejected": -4.934881210327148, "sft_loss": 4.115760326385498, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 0.611393333794855, "learning_rate": 6.716440902713515e-07, "logits/chosen": -0.6964327096939087, "logits/rejected": -0.6343324184417725, "logps/chosen": -4.4690141677856445, "logps/rejected": -4.8751397132873535, "loss": 0.0527, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.4690141677856445, "rewards/margins": 0.406125545501709, "rewards/rejected": -4.8751397132873535, "sft_loss": 4.147188663482666, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 0.39953792525208676, "learning_rate": 6.701805004942627e-07, "logits/chosen": -0.6524925231933594, "logits/rejected": -0.595403254032135, "logps/chosen": -4.759545803070068, "logps/rejected": -5.341307640075684, "loss": 0.0532, "rewards/accuracies": 0.65625, "rewards/chosen": -4.759545803070068, "rewards/margins": 0.5817619562149048, "rewards/rejected": -5.341307640075684, "sft_loss": 4.543875694274902, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 0.5767017487926428, "learning_rate": 6.687152596252119e-07, "logits/chosen": -0.7337282299995422, "logits/rejected": -0.6855202913284302, "logps/chosen": -4.838815212249756, "logps/rejected": -5.192466735839844, "loss": 0.0538, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.838815212249756, "rewards/margins": 0.35365158319473267, "rewards/rejected": -5.192466735839844, "sft_loss": 4.54294490814209, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 0.4398488876945579, "learning_rate": 6.672483818799722e-07, "logits/chosen": -0.7373114824295044, "logits/rejected": -0.5846437215805054, "logps/chosen": -4.462458610534668, "logps/rejected": -4.956521511077881, "loss": 0.0528, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.462458610534668, "rewards/margins": 0.4940629005432129, "rewards/rejected": -4.956521511077881, "sft_loss": 4.213451862335205, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 0.3887196561405752, "learning_rate": 6.657798814901978e-07, "logits/chosen": -0.7088289260864258, "logits/rejected": -0.4987329840660095, "logps/chosen": -4.533008098602295, "logps/rejected": -4.962521553039551, "loss": 0.0516, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.533008098602295, "rewards/margins": 0.4295133650302887, "rewards/rejected": -4.962521553039551, "sft_loss": 4.149649143218994, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 0.51660624133589, "learning_rate": 6.643097727032863e-07, "logits/chosen": -0.7439020872116089, "logits/rejected": -0.5362112522125244, "logps/chosen": -4.448864936828613, "logps/rejected": -5.117484092712402, "loss": 0.0516, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.448864936828613, "rewards/margins": 0.6686197519302368, "rewards/rejected": -5.117484092712402, "sft_loss": 4.22825288772583, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 0.5837046444529704, "learning_rate": 6.628380697822392e-07, "logits/chosen": -0.7480605244636536, "logits/rejected": -0.5812298059463501, "logps/chosen": -4.5380167961120605, "logps/rejected": -5.00238561630249, "loss": 0.0526, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.5380167961120605, "rewards/margins": 0.46436864137649536, "rewards/rejected": -5.00238561630249, "sft_loss": 4.235762119293213, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 0.5264631149675114, "learning_rate": 6.61364787005525e-07, "logits/chosen": -0.7261337041854858, "logits/rejected": -0.5831155180931091, "logps/chosen": -4.57515287399292, "logps/rejected": -5.160813331604004, "loss": 0.0529, "rewards/accuracies": 0.6875, "rewards/chosen": -4.57515287399292, "rewards/margins": 0.585660457611084, "rewards/rejected": -5.160813331604004, "sft_loss": 4.378745079040527, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 0.4918096353013463, "learning_rate": 6.598899386669395e-07, "logits/chosen": -0.7453008890151978, "logits/rejected": -0.6159285306930542, "logps/chosen": -4.560245513916016, "logps/rejected": -5.077498435974121, "loss": 0.0534, "rewards/accuracies": 0.65625, "rewards/chosen": -4.560245513916016, "rewards/margins": 0.5172520279884338, "rewards/rejected": -5.077498435974121, "sft_loss": 4.279725074768066, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 0.44468809524085257, "learning_rate": 6.584135390754679e-07, "logits/chosen": -0.7995618581771851, "logits/rejected": -0.6510564088821411, "logps/chosen": -4.504146099090576, "logps/rejected": -5.2412333488464355, "loss": 0.0519, "rewards/accuracies": 0.65625, "rewards/chosen": -4.504146099090576, "rewards/margins": 0.7370876669883728, "rewards/rejected": -5.2412333488464355, "sft_loss": 4.247662544250488, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 0.38358992062706415, "learning_rate": 6.569356025551454e-07, "logits/chosen": -0.7912853956222534, "logits/rejected": -0.7308619022369385, "logps/chosen": -4.615150451660156, "logps/rejected": -5.038762092590332, "loss": 0.0521, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.615150451660156, "rewards/margins": 0.42361217737197876, "rewards/rejected": -5.038762092590332, "sft_loss": 4.207141399383545, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 0.36018657240876495, "learning_rate": 6.554561434449186e-07, "logits/chosen": -0.9121583104133606, "logits/rejected": -0.7354962825775146, "logps/chosen": -4.590145587921143, "logps/rejected": -5.142067909240723, "loss": 0.0527, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.590145587921143, "rewards/margins": 0.5519219636917114, "rewards/rejected": -5.142067909240723, "sft_loss": 4.361257076263428, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 0.41305806068904577, "learning_rate": 6.539751760985063e-07, "logits/chosen": -0.780017614364624, "logits/rejected": -0.7046865224838257, "logps/chosen": -4.54480504989624, "logps/rejected": -4.859285831451416, "loss": 0.0538, "rewards/accuracies": 0.625, "rewards/chosen": -4.54480504989624, "rewards/margins": 0.31448012590408325, "rewards/rejected": -4.859285831451416, "sft_loss": 4.202549934387207, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 0.3782339057679898, "learning_rate": 6.524927148842602e-07, "logits/chosen": -0.7151792049407959, "logits/rejected": -0.5270150303840637, "logps/chosen": -4.705697059631348, "logps/rejected": -5.182741641998291, "loss": 0.0519, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.705697059631348, "rewards/margins": 0.47704464197158813, "rewards/rejected": -5.182741641998291, "sft_loss": 4.334061622619629, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 0.5868680617117082, "learning_rate": 6.510087741850254e-07, "logits/chosen": -0.7695601582527161, "logits/rejected": -0.6490595936775208, "logps/chosen": -4.606133460998535, "logps/rejected": -5.070815086364746, "loss": 0.0528, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.606133460998535, "rewards/margins": 0.464682012796402, "rewards/rejected": -5.070815086364746, "sft_loss": 4.332033634185791, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 0.31923571248340193, "learning_rate": 6.495233683980012e-07, "logits/chosen": -0.7917790412902832, "logits/rejected": -0.757839560508728, "logps/chosen": -4.3590850830078125, "logps/rejected": -4.865941047668457, "loss": 0.0524, "rewards/accuracies": 0.65625, "rewards/chosen": -4.3590850830078125, "rewards/margins": 0.5068557858467102, "rewards/rejected": -4.865941047668457, "sft_loss": 4.141876220703125, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 0.6110778260580617, "learning_rate": 6.480365119346011e-07, "logits/chosen": -0.7061268091201782, "logits/rejected": -0.5795341730117798, "logps/chosen": -4.5417585372924805, "logps/rejected": -4.974133491516113, "loss": 0.0526, "rewards/accuracies": 0.6875, "rewards/chosen": -4.5417585372924805, "rewards/margins": 0.4323754906654358, "rewards/rejected": -4.974133491516113, "sft_loss": 4.267592906951904, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 0.3968746308711125, "learning_rate": 6.465482192203129e-07, "logits/chosen": -0.6922782063484192, "logits/rejected": -0.6162427663803101, "logps/chosen": -4.611149787902832, "logps/rejected": -5.119748115539551, "loss": 0.0529, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.611149787902832, "rewards/margins": 0.508597731590271, "rewards/rejected": -5.119748115539551, "sft_loss": 4.388543605804443, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 0.43719125204764003, "learning_rate": 6.45058504694559e-07, "logits/chosen": -0.6123635768890381, "logits/rejected": -0.5769099593162537, "logps/chosen": -4.499598503112793, "logps/rejected": -5.056746006011963, "loss": 0.0532, "rewards/accuracies": 0.65625, "rewards/chosen": -4.499598503112793, "rewards/margins": 0.557147204875946, "rewards/rejected": -5.056746006011963, "sft_loss": 4.259655952453613, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 0.6082587458332754, "learning_rate": 6.435673828105564e-07, "logits/chosen": -0.7298904657363892, "logits/rejected": -0.5628734230995178, "logps/chosen": -4.518126964569092, "logps/rejected": -5.07608699798584, "loss": 0.0534, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.518126964569092, "rewards/margins": 0.5579599142074585, "rewards/rejected": -5.07608699798584, "sft_loss": 4.245197296142578, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 0.8367099282361746, "learning_rate": 6.420748680351763e-07, "logits/chosen": -0.6793943643569946, "logits/rejected": -0.7437562942504883, "logps/chosen": -4.676467418670654, "logps/rejected": -5.091729164123535, "loss": 0.0542, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.676467418670654, "rewards/margins": 0.4152621626853943, "rewards/rejected": -5.091729164123535, "sft_loss": 4.483765125274658, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 0.3433494848599413, "learning_rate": 6.405809748488032e-07, "logits/chosen": -0.8351935148239136, "logits/rejected": -0.6800636053085327, "logps/chosen": -4.701047420501709, "logps/rejected": -5.2720818519592285, "loss": 0.0524, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.701047420501709, "rewards/margins": 0.571033775806427, "rewards/rejected": -5.2720818519592285, "sft_loss": 4.444343566894531, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 0.4795161285636052, "learning_rate": 6.390857177451956e-07, "logits/chosen": -0.8992173075675964, "logits/rejected": -0.6891031265258789, "logps/chosen": -4.538784027099609, "logps/rejected": -5.005777835845947, "loss": 0.0531, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.538784027099609, "rewards/margins": 0.46699291467666626, "rewards/rejected": -5.005777835845947, "sft_loss": 4.31063985824585, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 0.7016521627801586, "learning_rate": 6.375891112313445e-07, "logits/chosen": -0.8801708221435547, "logits/rejected": -0.7778578996658325, "logps/chosen": -4.37764310836792, "logps/rejected": -4.836381912231445, "loss": 0.0519, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.37764310836792, "rewards/margins": 0.4587384760379791, "rewards/rejected": -4.836381912231445, "sft_loss": 4.069215774536133, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 0.4657342923390584, "learning_rate": 6.360911698273326e-07, "logits/chosen": -0.7782545685768127, "logits/rejected": -0.6499952077865601, "logps/chosen": -4.533600807189941, "logps/rejected": -4.894816875457764, "loss": 0.0543, "rewards/accuracies": 0.65625, "rewards/chosen": -4.533600807189941, "rewards/margins": 0.36121657490730286, "rewards/rejected": -4.894816875457764, "sft_loss": 4.284762382507324, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 0.47112647697309784, "learning_rate": 6.345919080661944e-07, "logits/chosen": -0.9207308888435364, "logits/rejected": -0.8575568199157715, "logps/chosen": -4.626215934753418, "logps/rejected": -5.244388580322266, "loss": 0.0522, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.626215934753418, "rewards/margins": 0.6181727051734924, "rewards/rejected": -5.244388580322266, "sft_loss": 4.426272392272949, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 0.29671780119762087, "learning_rate": 6.330913404937737e-07, "logits/chosen": -0.9336298108100891, "logits/rejected": -0.7710980176925659, "logps/chosen": -4.727460861206055, "logps/rejected": -5.37843132019043, "loss": 0.0519, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.727460861206055, "rewards/margins": 0.6509709358215332, "rewards/rejected": -5.37843132019043, "sft_loss": 4.423439979553223, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 0.5791699497151932, "learning_rate": 6.315894816685838e-07, "logits/chosen": -0.7895804643630981, "logits/rejected": -0.6132604479789734, "logps/chosen": -4.567627906799316, "logps/rejected": -4.8765106201171875, "loss": 0.0527, "rewards/accuracies": 0.625, "rewards/chosen": -4.567627906799316, "rewards/margins": 0.3088833689689636, "rewards/rejected": -4.8765106201171875, "sft_loss": 4.217142105102539, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 0.5073327462343239, "learning_rate": 6.300863461616657e-07, "logits/chosen": -0.7717041969299316, "logits/rejected": -0.6730908155441284, "logps/chosen": -4.412945747375488, "logps/rejected": -4.921000003814697, "loss": 0.053, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.412945747375488, "rewards/margins": 0.5080535411834717, "rewards/rejected": -4.921000003814697, "sft_loss": 4.149726390838623, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 0.3239165109984895, "learning_rate": 6.285819485564465e-07, "logits/chosen": -0.9396502375602722, "logits/rejected": -0.7643145322799683, "logps/chosen": -4.451190948486328, "logps/rejected": -5.017029762268066, "loss": 0.0523, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.451190948486328, "rewards/margins": 0.5658388733863831, "rewards/rejected": -5.017029762268066, "sft_loss": 4.155117511749268, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 0.47727707504839423, "learning_rate": 6.270763034485986e-07, "logits/chosen": -0.8083940744400024, "logits/rejected": -0.6736945509910583, "logps/chosen": -4.661080360412598, "logps/rejected": -5.173447132110596, "loss": 0.0516, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.661080360412598, "rewards/margins": 0.5123669505119324, "rewards/rejected": -5.173447132110596, "sft_loss": 4.300747871398926, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 0.49975394521225786, "learning_rate": 6.255694254458972e-07, "logits/chosen": -0.9003822207450867, "logits/rejected": -0.7234520316123962, "logps/chosen": -4.687695503234863, "logps/rejected": -5.162412166595459, "loss": 0.0545, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.687695503234863, "rewards/margins": 0.474717378616333, "rewards/rejected": -5.162412166595459, "sft_loss": 4.253655433654785, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 0.7970279083312815, "learning_rate": 6.240613291680795e-07, "logits/chosen": -0.8419411778450012, "logits/rejected": -0.653814435005188, "logps/chosen": -4.679329872131348, "logps/rejected": -5.142355918884277, "loss": 0.0531, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.679329872131348, "rewards/margins": 0.4630259871482849, "rewards/rejected": -5.142355918884277, "sft_loss": 4.350041389465332, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 0.44742716716600295, "learning_rate": 6.225520292467021e-07, "logits/chosen": -0.8692100644111633, "logits/rejected": -0.6362851858139038, "logps/chosen": -4.339302062988281, "logps/rejected": -5.107569217681885, "loss": 0.05, "rewards/accuracies": 0.71875, "rewards/chosen": -4.339302062988281, "rewards/margins": 0.7682672739028931, "rewards/rejected": -5.107569217681885, "sft_loss": 4.058346748352051, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 0.5956502684124177, "learning_rate": 6.210415403249993e-07, "logits/chosen": -0.9499366879463196, "logits/rejected": -0.6553173065185547, "logps/chosen": -4.524402141571045, "logps/rejected": -5.320879936218262, "loss": 0.051, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.524402141571045, "rewards/margins": 0.7964780330657959, "rewards/rejected": -5.320879936218262, "sft_loss": 4.101487159729004, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 0.6211044159448419, "learning_rate": 6.195298770577415e-07, "logits/chosen": -0.7495101094245911, "logits/rejected": -0.7657794952392578, "logps/chosen": -4.553832054138184, "logps/rejected": -5.104920864105225, "loss": 0.0532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.553832054138184, "rewards/margins": 0.551088809967041, "rewards/rejected": -5.104920864105225, "sft_loss": 4.29389762878418, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 0.3685359376502068, "learning_rate": 6.180170541110923e-07, "logits/chosen": -0.82825767993927, "logits/rejected": -0.6391871571540833, "logps/chosen": -4.653820514678955, "logps/rejected": -5.102269649505615, "loss": 0.0534, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.653820514678955, "rewards/margins": 0.4484497010707855, "rewards/rejected": -5.102269649505615, "sft_loss": 4.338795185089111, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 0.36879136698007464, "learning_rate": 6.165030861624663e-07, "logits/chosen": -1.0215765237808228, "logits/rejected": -0.704619288444519, "logps/chosen": -4.500130653381348, "logps/rejected": -5.3099188804626465, "loss": 0.0514, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.500130653381348, "rewards/margins": 0.809788703918457, "rewards/rejected": -5.3099188804626465, "sft_loss": 4.320158004760742, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 0.35921794005610297, "learning_rate": 6.149879879003876e-07, "logits/chosen": -0.736519992351532, "logits/rejected": -0.7717832326889038, "logps/chosen": -4.527623176574707, "logps/rejected": -4.958807945251465, "loss": 0.0526, "rewards/accuracies": 0.6875, "rewards/chosen": -4.527623176574707, "rewards/margins": 0.431184858083725, "rewards/rejected": -4.958807945251465, "sft_loss": 4.252358436584473, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 0.4029998204084026, "learning_rate": 6.13471774024346e-07, "logits/chosen": -0.9023638963699341, "logits/rejected": -0.7861912250518799, "logps/chosen": -4.328760623931885, "logps/rejected": -4.899123191833496, "loss": 0.0513, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.328760623931885, "rewards/margins": 0.5703624486923218, "rewards/rejected": -4.899123191833496, "sft_loss": 4.081745624542236, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 0.3181867162536053, "learning_rate": 6.119544592446551e-07, "logits/chosen": -0.8461063504219055, "logits/rejected": -0.7370610237121582, "logps/chosen": -4.367936611175537, "logps/rejected": -4.853856086730957, "loss": 0.0524, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.367936611175537, "rewards/margins": 0.48591962456703186, "rewards/rejected": -4.853856086730957, "sft_loss": 4.085618019104004, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 0.4417312535342543, "learning_rate": 6.104360582823096e-07, "logits/chosen": -0.853473961353302, "logits/rejected": -0.7386851906776428, "logps/chosen": -4.7194013595581055, "logps/rejected": -5.207268714904785, "loss": 0.0531, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.7194013595581055, "rewards/margins": 0.4878672659397125, "rewards/rejected": -5.207268714904785, "sft_loss": 4.425844192504883, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 0.4267754873095855, "learning_rate": 6.089165858688423e-07, "logits/chosen": -0.7712498903274536, "logits/rejected": -0.5837336778640747, "logps/chosen": -4.546807289123535, "logps/rejected": -5.284149169921875, "loss": 0.0507, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.546807289123535, "rewards/margins": 0.7373424768447876, "rewards/rejected": -5.284149169921875, "sft_loss": 4.248849391937256, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 0.5243832632135247, "learning_rate": 6.073960567461811e-07, "logits/chosen": -0.7328991889953613, "logits/rejected": -0.5446587800979614, "logps/chosen": -4.3400068283081055, "logps/rejected": -5.033965110778809, "loss": 0.0511, "rewards/accuracies": 0.71875, "rewards/chosen": -4.3400068283081055, "rewards/margins": 0.6939582824707031, "rewards/rejected": -5.033965110778809, "sft_loss": 4.018763542175293, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 0.43775113695751916, "learning_rate": 6.058744856665065e-07, "logits/chosen": -0.7534220218658447, "logits/rejected": -0.6736117601394653, "logps/chosen": -4.569155693054199, "logps/rejected": -5.47993278503418, "loss": 0.0507, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.569155693054199, "rewards/margins": 0.9107775688171387, "rewards/rejected": -5.47993278503418, "sft_loss": 4.218738555908203, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 0.4717035016463936, "learning_rate": 6.043518873921074e-07, "logits/chosen": -0.7917760610580444, "logits/rejected": -0.657507061958313, "logps/chosen": -4.352883338928223, "logps/rejected": -4.968850135803223, "loss": 0.0514, "rewards/accuracies": 0.71875, "rewards/chosen": -4.352883338928223, "rewards/margins": 0.6159666776657104, "rewards/rejected": -4.968850135803223, "sft_loss": 3.9638657569885254, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 0.45414652768336783, "learning_rate": 6.028282766952393e-07, "logits/chosen": -0.7075755596160889, "logits/rejected": -0.6383055448532104, "logps/chosen": -4.6790876388549805, "logps/rejected": -5.391210556030273, "loss": 0.0507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.6790876388549805, "rewards/margins": 0.7121232151985168, "rewards/rejected": -5.391210556030273, "sft_loss": 4.254571914672852, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 0.5878041839400147, "learning_rate": 6.013036683579798e-07, "logits/chosen": -0.6836889386177063, "logits/rejected": -0.5880746841430664, "logps/chosen": -4.447044849395752, "logps/rejected": -5.132805824279785, "loss": 0.0517, "rewards/accuracies": 0.75, "rewards/chosen": -4.447044849395752, "rewards/margins": 0.6857603788375854, "rewards/rejected": -5.132805824279785, "sft_loss": 4.128294944763184, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 0.44709587630970354, "learning_rate": 5.997780771720854e-07, "logits/chosen": -0.8651409149169922, "logits/rejected": -0.6353691816329956, "logps/chosen": -4.558709144592285, "logps/rejected": -5.283412933349609, "loss": 0.0523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.558709144592285, "rewards/margins": 0.7247046232223511, "rewards/rejected": -5.283412933349609, "sft_loss": 4.31949520111084, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 0.4967270396229397, "learning_rate": 5.982515179388486e-07, "logits/chosen": -0.6980944871902466, "logits/rejected": -0.5798202753067017, "logps/chosen": -4.641325950622559, "logps/rejected": -5.210552215576172, "loss": 0.0527, "rewards/accuracies": 0.6875, "rewards/chosen": -4.641325950622559, "rewards/margins": 0.5692263841629028, "rewards/rejected": -5.210552215576172, "sft_loss": 4.369772911071777, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 0.414039866021628, "learning_rate": 5.967240054689541e-07, "logits/chosen": -0.8741037249565125, "logits/rejected": -0.8427600860595703, "logps/chosen": -4.335209846496582, "logps/rejected": -4.899285316467285, "loss": 0.0525, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.335209846496582, "rewards/margins": 0.5640758872032166, "rewards/rejected": -4.899285316467285, "sft_loss": 4.151376247406006, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 0.32893641730688494, "learning_rate": 5.951955545823342e-07, "logits/chosen": -0.8423898816108704, "logits/rejected": -0.7706656455993652, "logps/chosen": -4.534244537353516, "logps/rejected": -5.130573272705078, "loss": 0.0535, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.534244537353516, "rewards/margins": 0.5963292717933655, "rewards/rejected": -5.130573272705078, "sft_loss": 4.373254776000977, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 0.2899564595789967, "learning_rate": 5.936661801080263e-07, "logits/chosen": -0.8321715593338013, "logits/rejected": -0.7527654767036438, "logps/chosen": -4.561938285827637, "logps/rejected": -5.114630222320557, "loss": 0.0524, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.561938285827637, "rewards/margins": 0.5526921153068542, "rewards/rejected": -5.114630222320557, "sft_loss": 4.300882339477539, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 0.39218345035640906, "learning_rate": 5.92135896884028e-07, "logits/chosen": -0.8997092247009277, "logits/rejected": -0.7471407055854797, "logps/chosen": -4.617938041687012, "logps/rejected": -5.369228839874268, "loss": 0.0519, "rewards/accuracies": 0.71875, "rewards/chosen": -4.617938041687012, "rewards/margins": 0.7512913346290588, "rewards/rejected": -5.369228839874268, "sft_loss": 4.363840579986572, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 0.5000524915500977, "learning_rate": 5.906047197571541e-07, "logits/chosen": -0.7601592540740967, "logits/rejected": -0.8226611018180847, "logps/chosen": -4.56928014755249, "logps/rejected": -4.973417282104492, "loss": 0.0548, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.56928014755249, "rewards/margins": 0.4041372835636139, "rewards/rejected": -4.973417282104492, "sft_loss": 4.3844685554504395, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 0.4501939041101977, "learning_rate": 5.890726635828919e-07, "logits/chosen": -0.6789978742599487, "logits/rejected": -0.7200356721878052, "logps/chosen": -4.450923919677734, "logps/rejected": -4.913413047790527, "loss": 0.0536, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.450923919677734, "rewards/margins": 0.46248936653137207, "rewards/rejected": -4.913413047790527, "sft_loss": 4.257745265960693, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 0.4064590009353923, "learning_rate": 5.875397432252569e-07, "logits/chosen": -0.8166916966438293, "logits/rejected": -0.7958462834358215, "logps/chosen": -4.582040786743164, "logps/rejected": -5.0602827072143555, "loss": 0.0522, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.582040786743164, "rewards/margins": 0.47824162244796753, "rewards/rejected": -5.0602827072143555, "sft_loss": 4.298692226409912, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": -0.38490617275238037, "eval_logits/rejected": -0.29844847321510315, "eval_logps/chosen": -4.544572830200195, "eval_logps/rejected": -5.137356758117676, "eval_loss": 0.051060404628515244, "eval_rewards/accuracies": 0.6802670359611511, "eval_rewards/chosen": -4.544572830200195, "eval_rewards/margins": 0.5927836298942566, "eval_rewards/rejected": -5.137356758117676, "eval_runtime": 43.3873, "eval_samples_per_second": 31.0, "eval_sft_loss": 4.224380970001221, "eval_steps_per_second": 7.767, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 0.5857737743279292, "learning_rate": 5.860059735566491e-07, "logits/chosen": -0.986019492149353, "logits/rejected": -0.7938674688339233, "logps/chosen": -4.5381388664245605, "logps/rejected": -5.097907066345215, "loss": 0.0522, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.5381388664245605, "rewards/margins": 0.5597677230834961, "rewards/rejected": -5.097907066345215, "sft_loss": 4.319216728210449, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 0.5663653190019956, "learning_rate": 5.844713694577087e-07, "logits/chosen": -0.7099121809005737, "logits/rejected": -0.6400435566902161, "logps/chosen": -4.539187431335449, "logps/rejected": -5.13986873626709, "loss": 0.0518, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.539187431335449, "rewards/margins": 0.6006811857223511, "rewards/rejected": -5.13986873626709, "sft_loss": 4.334919452667236, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 0.8834082325413514, "learning_rate": 5.829359458171714e-07, "logits/chosen": -0.6759532690048218, "logits/rejected": -0.5866900682449341, "logps/chosen": -4.316249847412109, "logps/rejected": -5.01406192779541, "loss": 0.0513, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.316249847412109, "rewards/margins": 0.6978114247322083, "rewards/rejected": -5.01406192779541, "sft_loss": 4.148660659790039, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 0.7366505641829346, "learning_rate": 5.81399717531724e-07, "logits/chosen": -0.7597302198410034, "logits/rejected": -0.5598629713058472, "logps/chosen": -4.290135860443115, "logps/rejected": -4.8505539894104, "loss": 0.0532, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.290135860443115, "rewards/margins": 0.5604175329208374, "rewards/rejected": -4.8505539894104, "sft_loss": 4.115383148193359, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 0.4483292065178676, "learning_rate": 5.798626995058602e-07, "logits/chosen": -0.9454687833786011, "logits/rejected": -0.7220587730407715, "logps/chosen": -4.53515100479126, "logps/rejected": -5.261549949645996, "loss": 0.0525, "rewards/accuracies": 0.71875, "rewards/chosen": -4.53515100479126, "rewards/margins": 0.7263993620872498, "rewards/rejected": -5.261549949645996, "sft_loss": 4.332110404968262, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 0.5592988396281663, "learning_rate": 5.783249066517354e-07, "logits/chosen": -0.8145958781242371, "logits/rejected": -0.7614253163337708, "logps/chosen": -4.5657124519348145, "logps/rejected": -5.182963848114014, "loss": 0.0503, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.5657124519348145, "rewards/margins": 0.6172509789466858, "rewards/rejected": -5.182963848114014, "sft_loss": 4.231114387512207, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 0.632303409364965, "learning_rate": 5.767863538890228e-07, "logits/chosen": -0.8183963894844055, "logits/rejected": -0.6835195422172546, "logps/chosen": -4.389924049377441, "logps/rejected": -5.1250081062316895, "loss": 0.0517, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.389924049377441, "rewards/margins": 0.7350836992263794, "rewards/rejected": -5.1250081062316895, "sft_loss": 4.124279499053955, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 0.541651076628437, "learning_rate": 5.75247056144768e-07, "logits/chosen": -0.8295953869819641, "logits/rejected": -0.8045207858085632, "logps/chosen": -4.505274772644043, "logps/rejected": -4.960772514343262, "loss": 0.0536, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.505274772644043, "rewards/margins": 0.4554976522922516, "rewards/rejected": -4.960772514343262, "sft_loss": 4.231078147888184, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 0.40328763918897, "learning_rate": 5.737070283532444e-07, "logits/chosen": -0.9527303576469421, "logits/rejected": -0.83274906873703, "logps/chosen": -4.756344795227051, "logps/rejected": -5.247215270996094, "loss": 0.0533, "rewards/accuracies": 0.65625, "rewards/chosen": -4.756344795227051, "rewards/margins": 0.49087056517601013, "rewards/rejected": -5.247215270996094, "sft_loss": 4.509295463562012, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 0.5501878255143199, "learning_rate": 5.721662854558084e-07, "logits/chosen": -0.9534880518913269, "logits/rejected": -0.9018028378486633, "logps/chosen": -4.588342666625977, "logps/rejected": -5.2513628005981445, "loss": 0.0512, "rewards/accuracies": 0.78125, "rewards/chosen": -4.588342666625977, "rewards/margins": 0.6630192995071411, "rewards/rejected": -5.2513628005981445, "sft_loss": 4.240243911743164, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 0.4812915607269805, "learning_rate": 5.706248424007545e-07, "logits/chosen": -0.9855610728263855, "logits/rejected": -0.7704046964645386, "logps/chosen": -4.3409504890441895, "logps/rejected": -4.86018180847168, "loss": 0.0522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.3409504890441895, "rewards/margins": 0.5192316174507141, "rewards/rejected": -4.86018180847168, "sft_loss": 4.120865821838379, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 0.45233228772754347, "learning_rate": 5.690827141431699e-07, "logits/chosen": -1.017059326171875, "logits/rejected": -0.7848398089408875, "logps/chosen": -4.593303680419922, "logps/rejected": -5.0092973709106445, "loss": 0.0524, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.593303680419922, "rewards/margins": 0.4159931242465973, "rewards/rejected": -5.0092973709106445, "sft_loss": 4.261613368988037, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 0.6506887003976763, "learning_rate": 5.675399156447897e-07, "logits/chosen": -1.0267689228057861, "logits/rejected": -0.8769267797470093, "logps/chosen": -4.629855155944824, "logps/rejected": -5.098114967346191, "loss": 0.0534, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.629855155944824, "rewards/margins": 0.4682607054710388, "rewards/rejected": -5.098114967346191, "sft_loss": 4.358528137207031, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 0.4798389011577337, "learning_rate": 5.659964618738515e-07, "logits/chosen": -0.9780286550521851, "logits/rejected": -0.8610752820968628, "logps/chosen": -4.556265830993652, "logps/rejected": -5.136031150817871, "loss": 0.0515, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.556265830993652, "rewards/margins": 0.5797653794288635, "rewards/rejected": -5.136031150817871, "sft_loss": 4.165777206420898, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 0.5422032747817679, "learning_rate": 5.644523678049509e-07, "logits/chosen": -0.9687239527702332, "logits/rejected": -0.8813830614089966, "logps/chosen": -4.5033183097839355, "logps/rejected": -4.969407558441162, "loss": 0.0513, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.5033183097839355, "rewards/margins": 0.4660890996456146, "rewards/rejected": -4.969407558441162, "sft_loss": 4.0725483894348145, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 0.5901973530137451, "learning_rate": 5.629076484188952e-07, "logits/chosen": -0.7941701412200928, "logits/rejected": -0.7139784693717957, "logps/chosen": -4.662365913391113, "logps/rejected": -5.221032619476318, "loss": 0.0526, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.662365913391113, "rewards/margins": 0.5586673617362976, "rewards/rejected": -5.221032619476318, "sft_loss": 4.298089027404785, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 0.364967962468647, "learning_rate": 5.613623187025587e-07, "logits/chosen": -0.8740784525871277, "logits/rejected": -0.7707508206367493, "logps/chosen": -4.610237121582031, "logps/rejected": -5.228066444396973, "loss": 0.0518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.610237121582031, "rewards/margins": 0.617829442024231, "rewards/rejected": -5.228066444396973, "sft_loss": 4.25256872177124, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 0.479594255068986, "learning_rate": 5.598163936487369e-07, "logits/chosen": -0.8517980575561523, "logits/rejected": -0.6656395196914673, "logps/chosen": -4.480307102203369, "logps/rejected": -5.1197099685668945, "loss": 0.0518, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.480307102203369, "rewards/margins": 0.6394029259681702, "rewards/rejected": -5.1197099685668945, "sft_loss": 4.178659439086914, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 0.38971936523868217, "learning_rate": 5.582698882560017e-07, "logits/chosen": -0.8685995936393738, "logits/rejected": -0.692309558391571, "logps/chosen": -4.6656389236450195, "logps/rejected": -5.286332130432129, "loss": 0.0529, "rewards/accuracies": 0.6875, "rewards/chosen": -4.6656389236450195, "rewards/margins": 0.6206930875778198, "rewards/rejected": -5.286332130432129, "sft_loss": 4.3696160316467285, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 0.44187326201186605, "learning_rate": 5.567228175285549e-07, "logits/chosen": -0.706171452999115, "logits/rejected": -0.6334139108657837, "logps/chosen": -4.3591508865356445, "logps/rejected": -4.947355270385742, "loss": 0.0502, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.3591508865356445, "rewards/margins": 0.5882046222686768, "rewards/rejected": -4.947355270385742, "sft_loss": 3.9303412437438965, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 0.525767187888611, "learning_rate": 5.551751964760838e-07, "logits/chosen": -0.6887973546981812, "logits/rejected": -0.7219077348709106, "logps/chosen": -4.609784126281738, "logps/rejected": -5.058163642883301, "loss": 0.0531, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.609784126281738, "rewards/margins": 0.4483796954154968, "rewards/rejected": -5.058163642883301, "sft_loss": 4.3274431228637695, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 0.354163445464275, "learning_rate": 5.536270401136145e-07, "logits/chosen": -0.8383262753486633, "logits/rejected": -0.7376397252082825, "logps/chosen": -4.493884086608887, "logps/rejected": -5.036890029907227, "loss": 0.0514, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.493884086608887, "rewards/margins": 0.5430058836936951, "rewards/rejected": -5.036890029907227, "sft_loss": 4.180073261260986, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 0.4674214316597324, "learning_rate": 5.520783634613667e-07, "logits/chosen": -0.7461773753166199, "logits/rejected": -0.522574245929718, "logps/chosen": -4.679784774780273, "logps/rejected": -5.226053714752197, "loss": 0.0524, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.679784774780273, "rewards/margins": 0.5462688207626343, "rewards/rejected": -5.226053714752197, "sft_loss": 4.342376708984375, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 1.2543196580616705, "learning_rate": 5.505291815446082e-07, "logits/chosen": -0.749670147895813, "logits/rejected": -0.6325221657752991, "logps/chosen": -4.405003547668457, "logps/rejected": -5.0354204177856445, "loss": 0.0529, "rewards/accuracies": 0.71875, "rewards/chosen": -4.405003547668457, "rewards/margins": 0.6304169297218323, "rewards/rejected": -5.0354204177856445, "sft_loss": 4.188388347625732, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 0.5222585233235869, "learning_rate": 5.489795093935089e-07, "logits/chosen": -0.7664873600006104, "logits/rejected": -0.679275631904602, "logps/chosen": -4.5715131759643555, "logps/rejected": -5.1358561515808105, "loss": 0.0519, "rewards/accuracies": 0.65625, "rewards/chosen": -4.5715131759643555, "rewards/margins": 0.5643435716629028, "rewards/rejected": -5.1358561515808105, "sft_loss": 4.250007629394531, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 0.3737733749118266, "learning_rate": 5.474293620429946e-07, "logits/chosen": -0.9465273022651672, "logits/rejected": -0.7799872159957886, "logps/chosen": -4.570036888122559, "logps/rejected": -5.4389424324035645, "loss": 0.0506, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.570036888122559, "rewards/margins": 0.8689058423042297, "rewards/rejected": -5.4389424324035645, "sft_loss": 4.252045631408691, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 0.5110524346117336, "learning_rate": 5.458787545326018e-07, "logits/chosen": -0.8712457418441772, "logits/rejected": -0.7498850226402283, "logps/chosen": -4.555843830108643, "logps/rejected": -5.041597366333008, "loss": 0.0534, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.555843830108643, "rewards/margins": 0.4857536256313324, "rewards/rejected": -5.041597366333008, "sft_loss": 4.287815570831299, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 0.4193933937806095, "learning_rate": 5.443277019063311e-07, "logits/chosen": -0.8290327787399292, "logits/rejected": -0.6746954917907715, "logps/chosen": -4.528292655944824, "logps/rejected": -5.201540470123291, "loss": 0.0523, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.528292655944824, "rewards/margins": 0.6732484698295593, "rewards/rejected": -5.201540470123291, "sft_loss": 4.318000793457031, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 0.7984256655785995, "learning_rate": 5.427762192125023e-07, "logits/chosen": -0.7302804589271545, "logits/rejected": -0.6393738985061646, "logps/chosen": -4.297086238861084, "logps/rejected": -4.874281406402588, "loss": 0.0518, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.297086238861084, "rewards/margins": 0.5771942734718323, "rewards/rejected": -4.874281406402588, "sft_loss": 4.018523693084717, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 0.5139984575136903, "learning_rate": 5.41224321503607e-07, "logits/chosen": -0.8039859533309937, "logits/rejected": -0.5436501502990723, "logps/chosen": -4.477331638336182, "logps/rejected": -5.2113447189331055, "loss": 0.0512, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.477331638336182, "rewards/margins": 0.7340143322944641, "rewards/rejected": -5.2113447189331055, "sft_loss": 4.270988464355469, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 0.5318063905528159, "learning_rate": 5.396720238361637e-07, "logits/chosen": -0.7131645679473877, "logits/rejected": -0.5724089741706848, "logps/chosen": -4.619770526885986, "logps/rejected": -5.129917621612549, "loss": 0.0521, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.619770526885986, "rewards/margins": 0.510147213935852, "rewards/rejected": -5.129917621612549, "sft_loss": 4.325153350830078, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 0.6270796231497009, "learning_rate": 5.381193412705711e-07, "logits/chosen": -0.8254559636116028, "logits/rejected": -0.6606593728065491, "logps/chosen": -4.438187122344971, "logps/rejected": -5.023268222808838, "loss": 0.0521, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.438187122344971, "rewards/margins": 0.5850812196731567, "rewards/rejected": -5.023268222808838, "sft_loss": 4.149109840393066, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 0.3628875605274503, "learning_rate": 5.365662888709622e-07, "logits/chosen": -0.8125503659248352, "logits/rejected": -0.650180459022522, "logps/chosen": -4.428151607513428, "logps/rejected": -5.005454063415527, "loss": 0.0524, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.428151607513428, "rewards/margins": 0.5773029923439026, "rewards/rejected": -5.005454063415527, "sft_loss": 4.177712917327881, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 0.4408978324808102, "learning_rate": 5.350128817050585e-07, "logits/chosen": -0.8711091876029968, "logits/rejected": -0.6683140993118286, "logps/chosen": -4.5739264488220215, "logps/rejected": -5.2639336585998535, "loss": 0.0511, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.5739264488220215, "rewards/margins": 0.690007209777832, "rewards/rejected": -5.2639336585998535, "sft_loss": 4.209588050842285, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 0.634029397334541, "learning_rate": 5.334591348440229e-07, "logits/chosen": -0.8157389760017395, "logits/rejected": -0.6465741395950317, "logps/chosen": -4.551390171051025, "logps/rejected": -5.187037944793701, "loss": 0.0526, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.551390171051025, "rewards/margins": 0.6356481313705444, "rewards/rejected": -5.187037944793701, "sft_loss": 4.29290771484375, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 0.4523710769588691, "learning_rate": 5.319050633623141e-07, "logits/chosen": -0.8819277882575989, "logits/rejected": -0.7007160782814026, "logps/chosen": -4.540030002593994, "logps/rejected": -5.0742316246032715, "loss": 0.0525, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.540030002593994, "rewards/margins": 0.5342013239860535, "rewards/rejected": -5.0742316246032715, "sft_loss": 4.217968463897705, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 0.5121523451628749, "learning_rate": 5.303506823375409e-07, "logits/chosen": -0.8410658836364746, "logits/rejected": -0.6168532371520996, "logps/chosen": -4.4125237464904785, "logps/rejected": -5.1476545333862305, "loss": 0.0514, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.4125237464904785, "rewards/margins": 0.7351310849189758, "rewards/rejected": -5.1476545333862305, "sft_loss": 4.170558929443359, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 0.4242550067163679, "learning_rate": 5.287960068503143e-07, "logits/chosen": -0.8414437174797058, "logits/rejected": -0.6301363110542297, "logps/chosen": -4.453303337097168, "logps/rejected": -5.125790596008301, "loss": 0.0521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.453303337097168, "rewards/margins": 0.6724871397018433, "rewards/rejected": -5.125790596008301, "sft_loss": 4.234655857086182, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 0.6150856743479923, "learning_rate": 5.272410519841032e-07, "logits/chosen": -0.7048937678337097, "logits/rejected": -0.6164297461509705, "logps/chosen": -4.490935325622559, "logps/rejected": -5.310682773590088, "loss": 0.0503, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.490935325622559, "rewards/margins": 0.819747805595398, "rewards/rejected": -5.310682773590088, "sft_loss": 4.104347229003906, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 0.44023608513343926, "learning_rate": 5.256858328250861e-07, "logits/chosen": -0.8334380388259888, "logits/rejected": -0.6159783601760864, "logps/chosen": -4.609116554260254, "logps/rejected": -5.1678619384765625, "loss": 0.052, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.609116554260254, "rewards/margins": 0.5587445497512817, "rewards/rejected": -5.1678619384765625, "sft_loss": 4.265917778015137, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 0.6277862194810916, "learning_rate": 5.241303644620063e-07, "logits/chosen": -0.9017225503921509, "logits/rejected": -0.6855077147483826, "logps/chosen": -4.580887794494629, "logps/rejected": -5.039603233337402, "loss": 0.0538, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.580887794494629, "rewards/margins": 0.4587152898311615, "rewards/rejected": -5.039603233337402, "sft_loss": 4.258471488952637, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 0.3973293347863802, "learning_rate": 5.225746619860248e-07, "logits/chosen": -0.9179320335388184, "logits/rejected": -0.7894536852836609, "logps/chosen": -4.528168678283691, "logps/rejected": -5.234195709228516, "loss": 0.0532, "rewards/accuracies": 0.65625, "rewards/chosen": -4.528168678283691, "rewards/margins": 0.7060272097587585, "rewards/rejected": -5.234195709228516, "sft_loss": 4.231138229370117, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 0.4052136199043096, "learning_rate": 5.210187404905735e-07, "logits/chosen": -0.7160056829452515, "logits/rejected": -0.6443689465522766, "logps/chosen": -4.8330841064453125, "logps/rejected": -5.205080986022949, "loss": 0.0539, "rewards/accuracies": 0.625, "rewards/chosen": -4.8330841064453125, "rewards/margins": 0.3719966411590576, "rewards/rejected": -5.205080986022949, "sft_loss": 4.528097152709961, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 0.327549476063512, "learning_rate": 5.194626150712098e-07, "logits/chosen": -0.9027112722396851, "logits/rejected": -0.7617613077163696, "logps/chosen": -4.600415229797363, "logps/rejected": -5.1396894454956055, "loss": 0.0531, "rewards/accuracies": 0.65625, "rewards/chosen": -4.600415229797363, "rewards/margins": 0.539274275302887, "rewards/rejected": -5.1396894454956055, "sft_loss": 4.4282941818237305, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 0.3866774364589384, "learning_rate": 5.179063008254695e-07, "logits/chosen": -0.8924050331115723, "logits/rejected": -0.692057728767395, "logps/chosen": -4.390013694763184, "logps/rejected": -4.872861385345459, "loss": 0.0522, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.390013694763184, "rewards/margins": 0.48284751176834106, "rewards/rejected": -4.872861385345459, "sft_loss": 4.109368324279785, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 0.5603972389461411, "learning_rate": 5.163498128527199e-07, "logits/chosen": -0.8043405413627625, "logits/rejected": -0.6585147976875305, "logps/chosen": -4.4106950759887695, "logps/rejected": -4.972418308258057, "loss": 0.0524, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.4106950759887695, "rewards/margins": 0.5617232322692871, "rewards/rejected": -4.972418308258057, "sft_loss": 4.162978172302246, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 0.5226850952217248, "learning_rate": 5.147931662540144e-07, "logits/chosen": -0.7547642588615417, "logits/rejected": -0.6366636753082275, "logps/chosen": -4.469838619232178, "logps/rejected": -4.992844581604004, "loss": 0.0511, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.469838619232178, "rewards/margins": 0.523006021976471, "rewards/rejected": -4.992844581604004, "sft_loss": 4.131374359130859, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 0.5621502773295101, "learning_rate": 5.132363761319449e-07, "logits/chosen": -0.8233652114868164, "logits/rejected": -0.7582941651344299, "logps/chosen": -4.578380584716797, "logps/rejected": -5.224091053009033, "loss": 0.0516, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.578380584716797, "rewards/margins": 0.6457099914550781, "rewards/rejected": -5.224091053009033, "sft_loss": 4.231387615203857, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 0.5384394600398351, "learning_rate": 5.116794575904962e-07, "logits/chosen": -0.6828616261482239, "logits/rejected": -0.6190515756607056, "logps/chosen": -4.537975788116455, "logps/rejected": -5.16754150390625, "loss": 0.0512, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.537975788116455, "rewards/margins": 0.6295658946037292, "rewards/rejected": -5.16754150390625, "sft_loss": 4.14799690246582, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 0.3570274089495265, "learning_rate": 5.101224257348987e-07, "logits/chosen": -0.7118021845817566, "logits/rejected": -0.6111310124397278, "logps/chosen": -4.623868942260742, "logps/rejected": -5.227258682250977, "loss": 0.051, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.623868942260742, "rewards/margins": 0.6033896207809448, "rewards/rejected": -5.227258682250977, "sft_loss": 4.14676570892334, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 0.41812252336808753, "learning_rate": 5.085652956714823e-07, "logits/chosen": -0.8265964388847351, "logits/rejected": -0.6396560072898865, "logps/chosen": -4.667733669281006, "logps/rejected": -5.240296840667725, "loss": 0.0534, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.667733669281006, "rewards/margins": 0.572563111782074, "rewards/rejected": -5.240296840667725, "sft_loss": 4.416812419891357, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 0.4141573304369751, "learning_rate": 5.070080825075298e-07, "logits/chosen": -0.6839703321456909, "logits/rejected": -0.4868551194667816, "logps/chosen": -4.377912998199463, "logps/rejected": -5.105005741119385, "loss": 0.0525, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.377912998199463, "rewards/margins": 0.7270928025245667, "rewards/rejected": -5.105005741119385, "sft_loss": 4.107635974884033, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 0.5889041881610368, "learning_rate": 5.0545080135113e-07, "logits/chosen": -0.6704410910606384, "logits/rejected": -0.5773349404335022, "logps/chosen": -4.510235786437988, "logps/rejected": -5.127143383026123, "loss": 0.0534, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.510235786437988, "rewards/margins": 0.6169074177742004, "rewards/rejected": -5.127143383026123, "sft_loss": 4.2573747634887695, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 0.5491081889665882, "learning_rate": 5.038934673110316e-07, "logits/chosen": -0.7946319580078125, "logits/rejected": -0.6988022327423096, "logps/chosen": -4.589323043823242, "logps/rejected": -5.201617240905762, "loss": 0.0526, "rewards/accuracies": 0.625, "rewards/chosen": -4.589323043823242, "rewards/margins": 0.6122941374778748, "rewards/rejected": -5.201617240905762, "sft_loss": 4.306293487548828, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 0.40370148653990523, "learning_rate": 5.023360954964963e-07, "logits/chosen": -0.8991419076919556, "logits/rejected": -0.8406192660331726, "logps/chosen": -4.753389358520508, "logps/rejected": -5.2947564125061035, "loss": 0.0527, "rewards/accuracies": 0.65625, "rewards/chosen": -4.753389358520508, "rewards/margins": 0.5413663983345032, "rewards/rejected": -5.2947564125061035, "sft_loss": 4.370591640472412, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 0.4667987266950202, "learning_rate": 5.007787010171524e-07, "logits/chosen": -0.926565945148468, "logits/rejected": -0.7107774615287781, "logps/chosen": -4.401252746582031, "logps/rejected": -5.139235019683838, "loss": 0.0512, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.401252746582031, "rewards/margins": 0.7379823923110962, "rewards/rejected": -5.139235019683838, "sft_loss": 4.14643669128418, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 0.28977187648312863, "learning_rate": 4.992212989828477e-07, "logits/chosen": -0.7511423826217651, "logits/rejected": -0.7302947044372559, "logps/chosen": -4.628520965576172, "logps/rejected": -5.08123779296875, "loss": 0.0533, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.628520965576172, "rewards/margins": 0.45271721482276917, "rewards/rejected": -5.08123779296875, "sft_loss": 4.332284927368164, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 0.5128071746578113, "learning_rate": 4.976639045035036e-07, "logits/chosen": -0.7045882940292358, "logits/rejected": -0.651595413684845, "logps/chosen": -4.456996440887451, "logps/rejected": -4.9166107177734375, "loss": 0.0533, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.456996440887451, "rewards/margins": 0.45961475372314453, "rewards/rejected": -4.9166107177734375, "sft_loss": 4.227812767028809, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 0.7439117424034836, "learning_rate": 4.961065326889683e-07, "logits/chosen": -0.7401078939437866, "logits/rejected": -0.5558581948280334, "logps/chosen": -4.522768974304199, "logps/rejected": -5.026648044586182, "loss": 0.0539, "rewards/accuracies": 0.65625, "rewards/chosen": -4.522768974304199, "rewards/margins": 0.5038790702819824, "rewards/rejected": -5.026648044586182, "sft_loss": 4.227447032928467, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 0.44974777016917117, "learning_rate": 4.9454919864887e-07, "logits/chosen": -0.8705316781997681, "logits/rejected": -0.7544268369674683, "logps/chosen": -4.580806732177734, "logps/rejected": -5.142989158630371, "loss": 0.0526, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.580806732177734, "rewards/margins": 0.5621822476387024, "rewards/rejected": -5.142989158630371, "sft_loss": 4.323142051696777, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 0.60351015808472, "learning_rate": 4.929919174924701e-07, "logits/chosen": -0.9598020315170288, "logits/rejected": -0.7137739062309265, "logps/chosen": -4.588383674621582, "logps/rejected": -5.193715572357178, "loss": 0.0527, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.588383674621582, "rewards/margins": 0.6053324937820435, "rewards/rejected": -5.193715572357178, "sft_loss": 4.364377498626709, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 0.39383948253171636, "learning_rate": 4.914347043285177e-07, "logits/chosen": -0.7553201913833618, "logits/rejected": -0.5945597290992737, "logps/chosen": -4.58227014541626, "logps/rejected": -5.167197227478027, "loss": 0.0515, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.58227014541626, "rewards/margins": 0.5849268436431885, "rewards/rejected": -5.167197227478027, "sft_loss": 4.150500297546387, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 0.5947338725544065, "learning_rate": 4.898775742651013e-07, "logits/chosen": -0.6695328950881958, "logits/rejected": -0.5480534434318542, "logps/chosen": -4.455540657043457, "logps/rejected": -5.015857696533203, "loss": 0.0514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.455540657043457, "rewards/margins": 0.5603172183036804, "rewards/rejected": -5.015857696533203, "sft_loss": 4.052641868591309, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 0.3739608220920118, "learning_rate": 4.883205424095037e-07, "logits/chosen": -0.8755930066108704, "logits/rejected": -0.6813184022903442, "logps/chosen": -4.374234199523926, "logps/rejected": -5.120595455169678, "loss": 0.0516, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.374234199523926, "rewards/margins": 0.7463610172271729, "rewards/rejected": -5.120595455169678, "sft_loss": 4.135296821594238, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 0.47999415721425, "learning_rate": 4.86763623868055e-07, "logits/chosen": -0.7871851921081543, "logits/rejected": -0.6408876180648804, "logps/chosen": -4.715963363647461, "logps/rejected": -5.292166233062744, "loss": 0.0522, "rewards/accuracies": 0.65625, "rewards/chosen": -4.715963363647461, "rewards/margins": 0.5762028098106384, "rewards/rejected": -5.292166233062744, "sft_loss": 4.366208076477051, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 0.36852219394257385, "learning_rate": 4.852068337459856e-07, "logits/chosen": -0.7195180654525757, "logits/rejected": -0.532809853553772, "logps/chosen": -4.462833404541016, "logps/rejected": -5.156439304351807, "loss": 0.0508, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.462833404541016, "rewards/margins": 0.6936055421829224, "rewards/rejected": -5.156439304351807, "sft_loss": 4.119335174560547, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 0.40507193582200285, "learning_rate": 4.8365018714728e-07, "logits/chosen": -0.7378911375999451, "logits/rejected": -0.6666657328605652, "logps/chosen": -4.564920425415039, "logps/rejected": -5.0844197273254395, "loss": 0.0536, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.564920425415039, "rewards/margins": 0.5194991827011108, "rewards/rejected": -5.0844197273254395, "sft_loss": 4.275947093963623, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 0.31724766919051245, "learning_rate": 4.820936991745304e-07, "logits/chosen": -1.0948458909988403, "logits/rejected": -0.9697257876396179, "logps/chosen": -4.6035237312316895, "logps/rejected": -5.034295082092285, "loss": 0.0525, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.6035237312316895, "rewards/margins": 0.43077144026756287, "rewards/rejected": -5.034295082092285, "sft_loss": 4.244277000427246, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 0.5310039889396487, "learning_rate": 4.8053738492879e-07, "logits/chosen": -0.8384534120559692, "logits/rejected": -0.7171124815940857, "logps/chosen": -4.526313781738281, "logps/rejected": -5.261399269104004, "loss": 0.0515, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.526313781738281, "rewards/margins": 0.7350856065750122, "rewards/rejected": -5.261399269104004, "sft_loss": 4.204371452331543, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 0.5335564765964186, "learning_rate": 4.789812595094265e-07, "logits/chosen": -0.9797199368476868, "logits/rejected": -0.8858574032783508, "logps/chosen": -4.503859519958496, "logps/rejected": -5.087957382202148, "loss": 0.052, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.503859519958496, "rewards/margins": 0.5840980410575867, "rewards/rejected": -5.087957382202148, "sft_loss": 4.208001613616943, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 0.42218902416314297, "learning_rate": 4.774253380139752e-07, "logits/chosen": -0.9380515813827515, "logits/rejected": -0.8422372937202454, "logps/chosen": -4.567145347595215, "logps/rejected": -5.0723161697387695, "loss": 0.0527, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.567145347595215, "rewards/margins": 0.5051710605621338, "rewards/rejected": -5.0723161697387695, "sft_loss": 4.258059501647949, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 0.48659534318798275, "learning_rate": 4.758696355379936e-07, "logits/chosen": -0.6386554837226868, "logits/rejected": -0.7299365401268005, "logps/chosen": -4.623234272003174, "logps/rejected": -5.053004264831543, "loss": 0.0529, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.623234272003174, "rewards/margins": 0.429770290851593, "rewards/rejected": -5.053004264831543, "sft_loss": 4.323592662811279, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 0.4845276460062366, "learning_rate": 4.743141671749138e-07, "logits/chosen": -0.9454347491264343, "logits/rejected": -0.794151782989502, "logps/chosen": -4.372522354125977, "logps/rejected": -4.8364152908325195, "loss": 0.0535, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.372522354125977, "rewards/margins": 0.46389341354370117, "rewards/rejected": -4.8364152908325195, "sft_loss": 4.175771236419678, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 0.3380040956576745, "learning_rate": 4.727589480158968e-07, "logits/chosen": -0.8804110288619995, "logits/rejected": -0.7924290299415588, "logps/chosen": -4.5793280601501465, "logps/rejected": -5.261588096618652, "loss": 0.0515, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.5793280601501465, "rewards/margins": 0.6822600960731506, "rewards/rejected": -5.261588096618652, "sft_loss": 4.343472003936768, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 0.4022845360049597, "learning_rate": 4.712039931496855e-07, "logits/chosen": -0.8952493667602539, "logits/rejected": -0.7143739461898804, "logps/chosen": -4.63895845413208, "logps/rejected": -5.060461521148682, "loss": 0.0524, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.63895845413208, "rewards/margins": 0.42150363326072693, "rewards/rejected": -5.060461521148682, "sft_loss": 4.36241340637207, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 0.35743884470081916, "learning_rate": 4.6964931766245905e-07, "logits/chosen": -0.7012723684310913, "logits/rejected": -0.6638824343681335, "logps/chosen": -4.4255876541137695, "logps/rejected": -4.9864630699157715, "loss": 0.0537, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.4255876541137695, "rewards/margins": 0.5608752369880676, "rewards/rejected": -4.9864630699157715, "sft_loss": 4.170031547546387, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 0.388598888460869, "learning_rate": 4.6809493663768575e-07, "logits/chosen": -0.7917588353157043, "logits/rejected": -0.7726391553878784, "logps/chosen": -4.632717132568359, "logps/rejected": -4.928259372711182, "loss": 0.0523, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -4.632717132568359, "rewards/margins": 0.29554182291030884, "rewards/rejected": -4.928259372711182, "sft_loss": 4.243005275726318, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 0.43334284868642725, "learning_rate": 4.6654086515597716e-07, "logits/chosen": -0.9887200593948364, "logits/rejected": -0.7736762762069702, "logps/chosen": -4.42282772064209, "logps/rejected": -5.094423294067383, "loss": 0.0517, "rewards/accuracies": 0.6875, "rewards/chosen": -4.42282772064209, "rewards/margins": 0.6715952754020691, "rewards/rejected": -5.094423294067383, "sft_loss": 4.217164516448975, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 0.551121726681534, "learning_rate": 4.6498711829494154e-07, "logits/chosen": -0.9799394607543945, "logits/rejected": -0.8590563535690308, "logps/chosen": -4.606871604919434, "logps/rejected": -5.045818328857422, "loss": 0.0537, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.606871604919434, "rewards/margins": 0.43894606828689575, "rewards/rejected": -5.045818328857422, "sft_loss": 4.364339351654053, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 0.36960789827653123, "learning_rate": 4.6343371112903777e-07, "logits/chosen": -0.8044673204421997, "logits/rejected": -0.6007072329521179, "logps/chosen": -4.615143299102783, "logps/rejected": -5.214327335357666, "loss": 0.053, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.615143299102783, "rewards/margins": 0.5991836786270142, "rewards/rejected": -5.214327335357666, "sft_loss": 4.31845760345459, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": -0.29473957419395447, "eval_logits/rejected": -0.2032015025615692, "eval_logps/chosen": -4.496006011962891, "eval_logps/rejected": -5.107307434082031, "eval_loss": 0.050797030329704285, "eval_rewards/accuracies": 0.6691394448280334, "eval_rewards/chosen": -4.496006011962891, "eval_rewards/margins": 0.6113012433052063, "eval_rewards/rejected": -5.107307434082031, "eval_runtime": 43.3125, "eval_samples_per_second": 31.053, "eval_sft_loss": 4.119299411773682, "eval_steps_per_second": 7.781, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 0.44427092334138696, "learning_rate": 4.618806587294291e-07, "logits/chosen": -0.87445068359375, "logits/rejected": -0.7708224058151245, "logps/chosen": -4.504208564758301, "logps/rejected": -5.129702568054199, "loss": 0.0519, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.504208564758301, "rewards/margins": 0.6254942417144775, "rewards/rejected": -5.129702568054199, "sft_loss": 4.297804832458496, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 0.3724267286377576, "learning_rate": 4.603279761638365e-07, "logits/chosen": -0.8704094886779785, "logits/rejected": -0.723287045955658, "logps/chosen": -4.470217227935791, "logps/rejected": -5.0563645362854, "loss": 0.0526, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.470217227935791, "rewards/margins": 0.5861474871635437, "rewards/rejected": -5.0563645362854, "sft_loss": 4.2170891761779785, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 0.41128865897944417, "learning_rate": 4.5877567849639315e-07, "logits/chosen": -0.8630671501159668, "logits/rejected": -0.7631199955940247, "logps/chosen": -4.550658226013184, "logps/rejected": -5.094977378845215, "loss": 0.0526, "rewards/accuracies": 0.6875, "rewards/chosen": -4.550658226013184, "rewards/margins": 0.5443195700645447, "rewards/rejected": -5.094977378845215, "sft_loss": 4.317914009094238, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 0.6568465025902127, "learning_rate": 4.572237807874979e-07, "logits/chosen": -1.0142710208892822, "logits/rejected": -0.6812065243721008, "logps/chosen": -4.546741008758545, "logps/rejected": -5.181439399719238, "loss": 0.0516, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.546741008758545, "rewards/margins": 0.6346983909606934, "rewards/rejected": -5.181439399719238, "sft_loss": 4.28690242767334, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 0.43508632949046916, "learning_rate": 4.5567229809366895e-07, "logits/chosen": -0.8844995498657227, "logits/rejected": -0.7484878301620483, "logps/chosen": -4.440615653991699, "logps/rejected": -4.99686336517334, "loss": 0.0521, "rewards/accuracies": 0.65625, "rewards/chosen": -4.440615653991699, "rewards/margins": 0.5562475919723511, "rewards/rejected": -4.99686336517334, "sft_loss": 4.153700351715088, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 0.4122279782736568, "learning_rate": 4.541212454673984e-07, "logits/chosen": -0.9238206148147583, "logits/rejected": -0.7181123495101929, "logps/chosen": -4.5585713386535645, "logps/rejected": -5.414360523223877, "loss": 0.0515, "rewards/accuracies": 0.6875, "rewards/chosen": -4.5585713386535645, "rewards/margins": 0.8557893633842468, "rewards/rejected": -5.414360523223877, "sft_loss": 4.298130989074707, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 0.4026073166205477, "learning_rate": 4.525706379570055e-07, "logits/chosen": -0.9051357507705688, "logits/rejected": -0.832781195640564, "logps/chosen": -4.346682548522949, "logps/rejected": -4.964540481567383, "loss": 0.0521, "rewards/accuracies": 0.71875, "rewards/chosen": -4.346682548522949, "rewards/margins": 0.6178587675094604, "rewards/rejected": -4.964540481567383, "sft_loss": 4.1737565994262695, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 0.47424442428266617, "learning_rate": 4.510204906064911e-07, "logits/chosen": -0.8570457696914673, "logits/rejected": -0.7040773630142212, "logps/chosen": -4.4648237228393555, "logps/rejected": -5.1131768226623535, "loss": 0.0513, "rewards/accuracies": 0.75, "rewards/chosen": -4.4648237228393555, "rewards/margins": 0.6483533978462219, "rewards/rejected": -5.1131768226623535, "sft_loss": 4.171679496765137, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 1.0429902624418217, "learning_rate": 4.4947081845539177e-07, "logits/chosen": -0.9616962671279907, "logits/rejected": -0.8357839584350586, "logps/chosen": -4.420161247253418, "logps/rejected": -5.0600433349609375, "loss": 0.052, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.420161247253418, "rewards/margins": 0.6398815512657166, "rewards/rejected": -5.0600433349609375, "sft_loss": 4.109945297241211, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 0.40018549193747066, "learning_rate": 4.479216365386333e-07, "logits/chosen": -0.772409200668335, "logits/rejected": -0.5954066514968872, "logps/chosen": -4.221619129180908, "logps/rejected": -5.036656856536865, "loss": 0.0503, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.221619129180908, "rewards/margins": 0.8150378465652466, "rewards/rejected": -5.036656856536865, "sft_loss": 3.9400768280029297, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 0.37909844451042285, "learning_rate": 4.4637295988638555e-07, "logits/chosen": -0.8761689066886902, "logits/rejected": -0.8001937866210938, "logps/chosen": -4.484442710876465, "logps/rejected": -5.098120212554932, "loss": 0.0517, "rewards/accuracies": 0.6875, "rewards/chosen": -4.484442710876465, "rewards/margins": 0.6136777400970459, "rewards/rejected": -5.098120212554932, "sft_loss": 4.218336582183838, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 0.6816501770865704, "learning_rate": 4.4482480352391623e-07, "logits/chosen": -0.8988161087036133, "logits/rejected": -0.7821734547615051, "logps/chosen": -4.642391204833984, "logps/rejected": -5.23095703125, "loss": 0.0531, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.642391204833984, "rewards/margins": 0.5885659456253052, "rewards/rejected": -5.23095703125, "sft_loss": 4.404733657836914, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 0.5455705596230104, "learning_rate": 4.4327718247144507e-07, "logits/chosen": -0.7707226872444153, "logits/rejected": -0.6102782487869263, "logps/chosen": -4.860251426696777, "logps/rejected": -5.384507656097412, "loss": 0.0536, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.860251426696777, "rewards/margins": 0.5242565870285034, "rewards/rejected": -5.384507656097412, "sft_loss": 4.573927879333496, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 0.5162517294345276, "learning_rate": 4.417301117439984e-07, "logits/chosen": -0.9343627095222473, "logits/rejected": -0.7939955592155457, "logps/chosen": -4.480029106140137, "logps/rejected": -4.994220733642578, "loss": 0.0529, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.480029106140137, "rewards/margins": 0.5141913294792175, "rewards/rejected": -4.994220733642578, "sft_loss": 4.212777137756348, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 0.4073109145029346, "learning_rate": 4.401836063512631e-07, "logits/chosen": -1.001089334487915, "logits/rejected": -0.6555830240249634, "logps/chosen": -4.451521873474121, "logps/rejected": -5.125973224639893, "loss": 0.0512, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.451521873474121, "rewards/margins": 0.6744511723518372, "rewards/rejected": -5.125973224639893, "sft_loss": 4.141744136810303, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 0.5772050447307934, "learning_rate": 4.386376812974413e-07, "logits/chosen": -0.9038535952568054, "logits/rejected": -0.8890512585639954, "logps/chosen": -4.574075698852539, "logps/rejected": -5.134883880615234, "loss": 0.0521, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.574075698852539, "rewards/margins": 0.5608078837394714, "rewards/rejected": -5.134883880615234, "sft_loss": 4.309540748596191, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 0.40686850263594554, "learning_rate": 4.370923515811048e-07, "logits/chosen": -0.9050912857055664, "logits/rejected": -0.6649254560470581, "logps/chosen": -4.3815155029296875, "logps/rejected": -5.013126373291016, "loss": 0.0513, "rewards/accuracies": 0.65625, "rewards/chosen": -4.3815155029296875, "rewards/margins": 0.6316103935241699, "rewards/rejected": -5.013126373291016, "sft_loss": 4.075606346130371, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 0.39012420613123405, "learning_rate": 4.35547632195049e-07, "logits/chosen": -0.8044376373291016, "logits/rejected": -0.7201283574104309, "logps/chosen": -4.696098327636719, "logps/rejected": -5.314190864562988, "loss": 0.0524, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.696098327636719, "rewards/margins": 0.6180926561355591, "rewards/rejected": -5.314190864562988, "sft_loss": 4.33831787109375, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 0.7351191098993615, "learning_rate": 4.340035381261484e-07, "logits/chosen": -0.7771255373954773, "logits/rejected": -0.7277175784111023, "logps/chosen": -4.453921318054199, "logps/rejected": -5.145596504211426, "loss": 0.0511, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.453921318054199, "rewards/margins": 0.6916751861572266, "rewards/rejected": -5.145596504211426, "sft_loss": 4.129612922668457, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 0.35475089495116524, "learning_rate": 4.324600843552104e-07, "logits/chosen": -0.9506170153617859, "logits/rejected": -0.8384958505630493, "logps/chosen": -4.479653358459473, "logps/rejected": -5.169857978820801, "loss": 0.0513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.479653358459473, "rewards/margins": 0.6902052760124207, "rewards/rejected": -5.169857978820801, "sft_loss": 4.17246150970459, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 0.3659908245134801, "learning_rate": 4.309172858568302e-07, "logits/chosen": -0.937427818775177, "logits/rejected": -0.7483657598495483, "logps/chosen": -4.533991813659668, "logps/rejected": -5.1667327880859375, "loss": 0.0515, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.533991813659668, "rewards/margins": 0.6327404975891113, "rewards/rejected": -5.1667327880859375, "sft_loss": 4.246024131774902, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 0.6137087486247289, "learning_rate": 4.293751575992455e-07, "logits/chosen": -0.7637144327163696, "logits/rejected": -0.7552968859672546, "logps/chosen": -4.631504535675049, "logps/rejected": -5.132235527038574, "loss": 0.053, "rewards/accuracies": 0.6875, "rewards/chosen": -4.631504535675049, "rewards/margins": 0.5007305145263672, "rewards/rejected": -5.132235527038574, "sft_loss": 4.390534400939941, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 0.4352540545382265, "learning_rate": 4.278337145441916e-07, "logits/chosen": -0.886570155620575, "logits/rejected": -0.7136012315750122, "logps/chosen": -4.416377067565918, "logps/rejected": -5.063299179077148, "loss": 0.0515, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.416377067565918, "rewards/margins": 0.6469219326972961, "rewards/rejected": -5.063299179077148, "sft_loss": 4.168293476104736, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 0.3831451426766769, "learning_rate": 4.262929716467556e-07, "logits/chosen": -0.8285180926322937, "logits/rejected": -0.5632731318473816, "logps/chosen": -4.392007827758789, "logps/rejected": -5.139756679534912, "loss": 0.052, "rewards/accuracies": 0.6875, "rewards/chosen": -4.392007827758789, "rewards/margins": 0.7477489709854126, "rewards/rejected": -5.139756679534912, "sft_loss": 4.160672187805176, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 0.39530178751687783, "learning_rate": 4.247529438552321e-07, "logits/chosen": -0.9192399978637695, "logits/rejected": -0.7035531997680664, "logps/chosen": -4.617929458618164, "logps/rejected": -5.078191757202148, "loss": 0.0532, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.617929458618164, "rewards/margins": 0.4602627754211426, "rewards/rejected": -5.078191757202148, "sft_loss": 4.380537986755371, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 0.44253110919207433, "learning_rate": 4.232136461109773e-07, "logits/chosen": -0.8389490842819214, "logits/rejected": -0.7441287040710449, "logps/chosen": -4.518656253814697, "logps/rejected": -5.131009101867676, "loss": 0.0514, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.518656253814697, "rewards/margins": 0.6123533248901367, "rewards/rejected": -5.131009101867676, "sft_loss": 4.230744361877441, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 0.4149420144966067, "learning_rate": 4.216750933482646e-07, "logits/chosen": -0.8691641092300415, "logits/rejected": -0.6906719207763672, "logps/chosen": -4.620790958404541, "logps/rejected": -5.059487342834473, "loss": 0.053, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.620790958404541, "rewards/margins": 0.4386964440345764, "rewards/rejected": -5.059487342834473, "sft_loss": 4.235035419464111, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 0.5139969806101036, "learning_rate": 4.2013730049413986e-07, "logits/chosen": -0.8421649932861328, "logits/rejected": -0.6403535604476929, "logps/chosen": -4.331358432769775, "logps/rejected": -5.270960807800293, "loss": 0.0505, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.331358432769775, "rewards/margins": 0.9396018981933594, "rewards/rejected": -5.270960807800293, "sft_loss": 4.097912788391113, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 0.30430095720378014, "learning_rate": 4.1860028246827594e-07, "logits/chosen": -0.8284981846809387, "logits/rejected": -0.5819277167320251, "logps/chosen": -4.4769392013549805, "logps/rejected": -5.007475852966309, "loss": 0.0518, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.4769392013549805, "rewards/margins": 0.5305370092391968, "rewards/rejected": -5.007475852966309, "sft_loss": 4.225216388702393, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 0.4921011154236414, "learning_rate": 4.170640541828285e-07, "logits/chosen": -0.9329277873039246, "logits/rejected": -0.7737506031990051, "logps/chosen": -4.421766757965088, "logps/rejected": -5.002078056335449, "loss": 0.0526, "rewards/accuracies": 0.6875, "rewards/chosen": -4.421766757965088, "rewards/margins": 0.5803118348121643, "rewards/rejected": -5.002078056335449, "sft_loss": 4.226648807525635, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 0.43199463592891063, "learning_rate": 4.1552863054229116e-07, "logits/chosen": -0.7047543525695801, "logits/rejected": -0.6609756350517273, "logps/chosen": -4.531655788421631, "logps/rejected": -5.140510559082031, "loss": 0.0521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.531655788421631, "rewards/margins": 0.6088550686836243, "rewards/rejected": -5.140510559082031, "sft_loss": 4.227361679077148, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 0.5426284178434481, "learning_rate": 4.139940264433508e-07, "logits/chosen": -0.7945530414581299, "logits/rejected": -0.5765786170959473, "logps/chosen": -4.41995906829834, "logps/rejected": -5.090672016143799, "loss": 0.0517, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.41995906829834, "rewards/margins": 0.6707130074501038, "rewards/rejected": -5.090672016143799, "sft_loss": 4.104195594787598, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 0.3120787417194607, "learning_rate": 4.1246025677474303e-07, "logits/chosen": -0.8796011805534363, "logits/rejected": -0.6404326558113098, "logps/chosen": -4.570914268493652, "logps/rejected": -5.127249240875244, "loss": 0.0531, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.570914268493652, "rewards/margins": 0.5563352108001709, "rewards/rejected": -5.127249240875244, "sft_loss": 4.335757255554199, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 0.3718432668133724, "learning_rate": 4.10927336417108e-07, "logits/chosen": -0.8479071855545044, "logits/rejected": -0.6178969740867615, "logps/chosen": -4.612782955169678, "logps/rejected": -5.014036655426025, "loss": 0.0534, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.612782955169678, "rewards/margins": 0.40125328302383423, "rewards/rejected": -5.014036655426025, "sft_loss": 4.301538944244385, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 0.6878982267884615, "learning_rate": 4.093952802428457e-07, "logits/chosen": -0.6994894742965698, "logits/rejected": -0.6803777813911438, "logps/chosen": -4.756828308105469, "logps/rejected": -5.290118217468262, "loss": 0.0534, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.756828308105469, "rewards/margins": 0.5332905054092407, "rewards/rejected": -5.290118217468262, "sft_loss": 4.457432746887207, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 0.589099141329664, "learning_rate": 4.0786410311597184e-07, "logits/chosen": -0.9029039144515991, "logits/rejected": -0.6868539452552795, "logps/chosen": -4.576333522796631, "logps/rejected": -5.159743309020996, "loss": 0.052, "rewards/accuracies": 0.6875, "rewards/chosen": -4.576333522796631, "rewards/margins": 0.5834100842475891, "rewards/rejected": -5.159743309020996, "sft_loss": 4.252318382263184, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 0.3999674824957761, "learning_rate": 4.063338198919737e-07, "logits/chosen": -0.8113029599189758, "logits/rejected": -0.846416175365448, "logps/chosen": -4.430078506469727, "logps/rejected": -4.966437339782715, "loss": 0.0528, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.430078506469727, "rewards/margins": 0.5363594889640808, "rewards/rejected": -4.966437339782715, "sft_loss": 4.174395561218262, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 0.38273738849063044, "learning_rate": 4.0480444541766575e-07, "logits/chosen": -0.8961130380630493, "logits/rejected": -0.7447247505187988, "logps/chosen": -4.732843399047852, "logps/rejected": -5.325850486755371, "loss": 0.0529, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.732843399047852, "rewards/margins": 0.5930072665214539, "rewards/rejected": -5.325850486755371, "sft_loss": 4.414688587188721, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 0.5147586922678519, "learning_rate": 4.0327599453104606e-07, "logits/chosen": -0.9704347848892212, "logits/rejected": -0.780985951423645, "logps/chosen": -4.527245998382568, "logps/rejected": -5.088501930236816, "loss": 0.0509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.527245998382568, "rewards/margins": 0.5612561106681824, "rewards/rejected": -5.088501930236816, "sft_loss": 4.192348480224609, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 0.5005840786352599, "learning_rate": 4.017484820611514e-07, "logits/chosen": -0.7550173997879028, "logits/rejected": -0.6032567620277405, "logps/chosen": -4.417336463928223, "logps/rejected": -5.080150127410889, "loss": 0.0511, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.417336463928223, "rewards/margins": 0.6628138422966003, "rewards/rejected": -5.080150127410889, "sft_loss": 4.079850673675537, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 0.8229748724775394, "learning_rate": 4.002219228279148e-07, "logits/chosen": -0.8199543952941895, "logits/rejected": -0.6911274194717407, "logps/chosen": -4.341639518737793, "logps/rejected": -4.903729438781738, "loss": 0.0517, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.341639518737793, "rewards/margins": 0.5620898008346558, "rewards/rejected": -4.903729438781738, "sft_loss": 4.065591812133789, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 0.4876509117776437, "learning_rate": 3.9869633164202045e-07, "logits/chosen": -0.8162205815315247, "logits/rejected": -0.52099609375, "logps/chosen": -4.461337089538574, "logps/rejected": -5.199841022491455, "loss": 0.051, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.461337089538574, "rewards/margins": 0.7385039925575256, "rewards/rejected": -5.199841022491455, "sft_loss": 4.167636394500732, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 0.7065591387395672, "learning_rate": 3.9717172330476077e-07, "logits/chosen": -0.9044772982597351, "logits/rejected": -0.7785995006561279, "logps/chosen": -4.586330413818359, "logps/rejected": -5.2429423332214355, "loss": 0.0522, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.586330413818359, "rewards/margins": 0.6566125154495239, "rewards/rejected": -5.2429423332214355, "sft_loss": 4.347871780395508, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 0.5075784931382683, "learning_rate": 3.956481126078927e-07, "logits/chosen": -0.6835981607437134, "logits/rejected": -0.5895021557807922, "logps/chosen": -4.792338848114014, "logps/rejected": -5.461629867553711, "loss": 0.0543, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.792338848114014, "rewards/margins": 0.6692904829978943, "rewards/rejected": -5.461629867553711, "sft_loss": 4.541243553161621, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 0.537368618423302, "learning_rate": 3.941255143334937e-07, "logits/chosen": -0.7706578969955444, "logits/rejected": -0.7533372640609741, "logps/chosen": -4.472137451171875, "logps/rejected": -5.051485538482666, "loss": 0.0514, "rewards/accuracies": 0.6875, "rewards/chosen": -4.472137451171875, "rewards/margins": 0.5793476700782776, "rewards/rejected": -5.051485538482666, "sft_loss": 4.173643589019775, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 0.35854577541397925, "learning_rate": 3.9260394325381895e-07, "logits/chosen": -0.7394391894340515, "logits/rejected": -0.6220877766609192, "logps/chosen": -4.258572578430176, "logps/rejected": -5.175926685333252, "loss": 0.0503, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.258572578430176, "rewards/margins": 0.9173545837402344, "rewards/rejected": -5.175926685333252, "sft_loss": 3.9583396911621094, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 0.41343245811909185, "learning_rate": 3.9108341413115784e-07, "logits/chosen": -0.7238286733627319, "logits/rejected": -0.6091488003730774, "logps/chosen": -4.33715295791626, "logps/rejected": -4.925466060638428, "loss": 0.0516, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.33715295791626, "rewards/margins": 0.5883134603500366, "rewards/rejected": -4.925466060638428, "sft_loss": 3.97629976272583, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 0.41918254444927916, "learning_rate": 3.895639417176905e-07, "logits/chosen": -0.8348200917243958, "logits/rejected": -0.707625687122345, "logps/chosen": -4.645773887634277, "logps/rejected": -5.314620018005371, "loss": 0.0542, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.645773887634277, "rewards/margins": 0.668846607208252, "rewards/rejected": -5.314620018005371, "sft_loss": 4.392010688781738, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 0.42246506471392314, "learning_rate": 3.8804554075534497e-07, "logits/chosen": -0.8349407315254211, "logits/rejected": -0.5855430960655212, "logps/chosen": -4.490998268127441, "logps/rejected": -5.2729082107543945, "loss": 0.0517, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.490998268127441, "rewards/margins": 0.7819093465805054, "rewards/rejected": -5.2729082107543945, "sft_loss": 4.270726203918457, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 0.6390590052604445, "learning_rate": 3.8652822597565403e-07, "logits/chosen": -0.8645042181015015, "logits/rejected": -0.6598786115646362, "logps/chosen": -4.601441383361816, "logps/rejected": -5.257026672363281, "loss": 0.0523, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.601441383361816, "rewards/margins": 0.6555854678153992, "rewards/rejected": -5.257026672363281, "sft_loss": 4.356138229370117, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 0.39388686843240567, "learning_rate": 3.850120120996123e-07, "logits/chosen": -0.8234783411026001, "logits/rejected": -0.5680242776870728, "logps/chosen": -4.540238857269287, "logps/rejected": -5.173902988433838, "loss": 0.052, "rewards/accuracies": 0.6875, "rewards/chosen": -4.540238857269287, "rewards/margins": 0.633664071559906, "rewards/rejected": -5.173902988433838, "sft_loss": 4.335484504699707, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 0.4499215399446686, "learning_rate": 3.8349691383753356e-07, "logits/chosen": -0.6719542145729065, "logits/rejected": -0.5450443029403687, "logps/chosen": -4.431046962738037, "logps/rejected": -5.082084655761719, "loss": 0.0518, "rewards/accuracies": 0.71875, "rewards/chosen": -4.431046962738037, "rewards/margins": 0.6510375142097473, "rewards/rejected": -5.082084655761719, "sft_loss": 4.082747459411621, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 0.3778904426429339, "learning_rate": 3.819829458889078e-07, "logits/chosen": -0.7996759414672852, "logits/rejected": -0.7100083231925964, "logps/chosen": -4.312413215637207, "logps/rejected": -4.77877950668335, "loss": 0.0522, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.312413215637207, "rewards/margins": 0.4663669466972351, "rewards/rejected": -4.77877950668335, "sft_loss": 4.042895317077637, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 0.5604460188611351, "learning_rate": 3.804701229422585e-07, "logits/chosen": -0.8722193837165833, "logits/rejected": -0.8069941401481628, "logps/chosen": -4.671416282653809, "logps/rejected": -5.111981391906738, "loss": 0.0542, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.671416282653809, "rewards/margins": 0.4405645728111267, "rewards/rejected": -5.111981391906738, "sft_loss": 4.324137210845947, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 0.46389515283591515, "learning_rate": 3.789584596750007e-07, "logits/chosen": -0.8924866914749146, "logits/rejected": -0.86207515001297, "logps/chosen": -4.578545570373535, "logps/rejected": -5.2069993019104, "loss": 0.0525, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.578545570373535, "rewards/margins": 0.62845379114151, "rewards/rejected": -5.2069993019104, "sft_loss": 4.352547645568848, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 0.3101780013628597, "learning_rate": 3.77447970753298e-07, "logits/chosen": -0.8054901957511902, "logits/rejected": -0.8184728622436523, "logps/chosen": -4.611974716186523, "logps/rejected": -5.077882766723633, "loss": 0.0536, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.611974716186523, "rewards/margins": 0.4659079909324646, "rewards/rejected": -5.077882766723633, "sft_loss": 4.400869369506836, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 0.4254690093635414, "learning_rate": 3.7593867083192057e-07, "logits/chosen": -0.8784816861152649, "logits/rejected": -0.7168563604354858, "logps/chosen": -4.520941257476807, "logps/rejected": -5.078371524810791, "loss": 0.0522, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.520941257476807, "rewards/margins": 0.5574301481246948, "rewards/rejected": -5.078371524810791, "sft_loss": 4.297972679138184, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 0.48612755028753435, "learning_rate": 3.7443057455410276e-07, "logits/chosen": -0.7678418159484863, "logits/rejected": -0.6967147588729858, "logps/chosen": -4.190707206726074, "logps/rejected": -5.020942211151123, "loss": 0.0509, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.190707206726074, "rewards/margins": 0.8302351832389832, "rewards/rejected": -5.020942211151123, "sft_loss": 4.027964115142822, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 0.357070124593642, "learning_rate": 3.7292369655140145e-07, "logits/chosen": -1.0313518047332764, "logits/rejected": -0.8303739428520203, "logps/chosen": -4.52823543548584, "logps/rejected": -5.011662006378174, "loss": 0.054, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.52823543548584, "rewards/margins": 0.4834265112876892, "rewards/rejected": -5.011662006378174, "sft_loss": 4.35888147354126, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 0.35556182344734116, "learning_rate": 3.714180514435534e-07, "logits/chosen": -0.8014265298843384, "logits/rejected": -0.6077925562858582, "logps/chosen": -4.6149725914001465, "logps/rejected": -5.177509307861328, "loss": 0.0527, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.6149725914001465, "rewards/margins": 0.562536895275116, "rewards/rejected": -5.177509307861328, "sft_loss": 4.300775527954102, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 0.3562554445522406, "learning_rate": 3.6991365383833426e-07, "logits/chosen": -0.9311081171035767, "logits/rejected": -0.7546035647392273, "logps/chosen": -4.536148548126221, "logps/rejected": -5.134714126586914, "loss": 0.0516, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.536148548126221, "rewards/margins": 0.5985656976699829, "rewards/rejected": -5.134714126586914, "sft_loss": 4.202921390533447, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 0.4070594382170017, "learning_rate": 3.684105183314162e-07, "logits/chosen": -0.9334009289741516, "logits/rejected": -0.8775346875190735, "logps/chosen": -4.4893903732299805, "logps/rejected": -5.005202293395996, "loss": 0.0522, "rewards/accuracies": 0.65625, "rewards/chosen": -4.4893903732299805, "rewards/margins": 0.5158116221427917, "rewards/rejected": -5.005202293395996, "sft_loss": 4.235560894012451, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 0.42486172086349194, "learning_rate": 3.669086595062263e-07, "logits/chosen": -0.8683802485466003, "logits/rejected": -0.6446768641471863, "logps/chosen": -4.424868583679199, "logps/rejected": -5.10919713973999, "loss": 0.0517, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.424868583679199, "rewards/margins": 0.6843288540840149, "rewards/rejected": -5.10919713973999, "sft_loss": 4.203177452087402, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 0.3631473467602663, "learning_rate": 3.654080919338056e-07, "logits/chosen": -0.9535540342330933, "logits/rejected": -0.7772972583770752, "logps/chosen": -4.488328456878662, "logps/rejected": -5.187651634216309, "loss": 0.0518, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.488328456878662, "rewards/margins": 0.6993231773376465, "rewards/rejected": -5.187651634216309, "sft_loss": 4.2527360916137695, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 1.108956434138034, "learning_rate": 3.639088301726673e-07, "logits/chosen": -0.8254637718200684, "logits/rejected": -0.6029265522956848, "logps/chosen": -4.488307476043701, "logps/rejected": -5.122195720672607, "loss": 0.0526, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.488307476043701, "rewards/margins": 0.633887529373169, "rewards/rejected": -5.122195720672607, "sft_loss": 4.214524745941162, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 0.297083487299301, "learning_rate": 3.624108887686556e-07, "logits/chosen": -0.8513669967651367, "logits/rejected": -0.8039869070053101, "logps/chosen": -4.6177592277526855, "logps/rejected": -5.003042697906494, "loss": 0.053, "rewards/accuracies": 0.625, "rewards/chosen": -4.6177592277526855, "rewards/margins": 0.38528352975845337, "rewards/rejected": -5.003042697906494, "sft_loss": 4.321106910705566, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 0.3894391188758439, "learning_rate": 3.6091428225480433e-07, "logits/chosen": -1.000799536705017, "logits/rejected": -0.8797224164009094, "logps/chosen": -4.493613243103027, "logps/rejected": -5.126000881195068, "loss": 0.0513, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.493613243103027, "rewards/margins": 0.632387638092041, "rewards/rejected": -5.126000881195068, "sft_loss": 4.193148612976074, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 0.38449769948061835, "learning_rate": 3.5941902515119674e-07, "logits/chosen": -1.034938097000122, "logits/rejected": -0.7592099905014038, "logps/chosen": -4.570082664489746, "logps/rejected": -5.047337532043457, "loss": 0.0533, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.570082664489746, "rewards/margins": 0.47725504636764526, "rewards/rejected": -5.047337532043457, "sft_loss": 4.295851707458496, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 0.5234235514253388, "learning_rate": 3.5792513196482373e-07, "logits/chosen": -1.1635518074035645, "logits/rejected": -0.8318096399307251, "logps/chosen": -4.4717559814453125, "logps/rejected": -5.075552463531494, "loss": 0.0503, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.4717559814453125, "rewards/margins": 0.6037967205047607, "rewards/rejected": -5.075552463531494, "sft_loss": 4.137124538421631, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 0.33314820472642775, "learning_rate": 3.5643261718944346e-07, "logits/chosen": -0.8819448351860046, "logits/rejected": -0.7907723188400269, "logps/chosen": -4.5302557945251465, "logps/rejected": -4.951289176940918, "loss": 0.0536, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.5302557945251465, "rewards/margins": 0.4210330843925476, "rewards/rejected": -4.951289176940918, "sft_loss": 4.2429728507995605, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 0.5263923862202771, "learning_rate": 3.5494149530544087e-07, "logits/chosen": -1.0538041591644287, "logits/rejected": -0.9239507913589478, "logps/chosen": -4.604650497436523, "logps/rejected": -5.217805862426758, "loss": 0.0524, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.604650497436523, "rewards/margins": 0.6131556034088135, "rewards/rejected": -5.217805862426758, "sft_loss": 4.297959327697754, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 0.5020939907325735, "learning_rate": 3.534517807796871e-07, "logits/chosen": -0.9725322723388672, "logits/rejected": -0.8672218322753906, "logps/chosen": -4.604799747467041, "logps/rejected": -5.175978183746338, "loss": 0.0523, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.604799747467041, "rewards/margins": 0.571178674697876, "rewards/rejected": -5.175978183746338, "sft_loss": 4.33532190322876, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 0.43958453397135727, "learning_rate": 3.519634880653988e-07, "logits/chosen": -0.841769814491272, "logits/rejected": -0.7862639427185059, "logps/chosen": -4.502523422241211, "logps/rejected": -5.1410603523254395, "loss": 0.0508, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.502523422241211, "rewards/margins": 0.6385367512702942, "rewards/rejected": -5.1410603523254395, "sft_loss": 4.152388572692871, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 0.4603995201061191, "learning_rate": 3.504766316019987e-07, "logits/chosen": -0.8893829584121704, "logits/rejected": -0.6943017840385437, "logps/chosen": -4.210341453552246, "logps/rejected": -4.964727401733398, "loss": 0.0497, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.210341453552246, "rewards/margins": 0.7543860673904419, "rewards/rejected": -4.964727401733398, "sft_loss": 3.8542957305908203, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 0.4166426736441857, "learning_rate": 3.489912258149745e-07, "logits/chosen": -0.7692733407020569, "logits/rejected": -0.6670472025871277, "logps/chosen": -4.290536403656006, "logps/rejected": -5.004633903503418, "loss": 0.0513, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.290536403656006, "rewards/margins": 0.7140974998474121, "rewards/rejected": -5.004633903503418, "sft_loss": 3.9590721130371094, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 0.4202601946525645, "learning_rate": 3.475072851157397e-07, "logits/chosen": -0.8163237571716309, "logits/rejected": -0.7968477010726929, "logps/chosen": -4.64694881439209, "logps/rejected": -5.253317356109619, "loss": 0.0523, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.64694881439209, "rewards/margins": 0.6063681840896606, "rewards/rejected": -5.253317356109619, "sft_loss": 4.27789306640625, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 0.48439923094521753, "learning_rate": 3.460248239014936e-07, "logits/chosen": -0.7363717555999756, "logits/rejected": -0.7516336441040039, "logps/chosen": -4.712521553039551, "logps/rejected": -5.380669593811035, "loss": 0.052, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.712521553039551, "rewards/margins": 0.6681480407714844, "rewards/rejected": -5.380669593811035, "sft_loss": 4.458998203277588, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 1.2607721506545888, "learning_rate": 3.4454385655508134e-07, "logits/chosen": -0.7563374042510986, "logits/rejected": -0.7634093165397644, "logps/chosen": -4.5644426345825195, "logps/rejected": -5.0168280601501465, "loss": 0.0544, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.5644426345825195, "rewards/margins": 0.45238548517227173, "rewards/rejected": -5.0168280601501465, "sft_loss": 4.332982540130615, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 0.5141778910520323, "learning_rate": 3.4306439744485447e-07, "logits/chosen": -0.9014706611633301, "logits/rejected": -0.6987265348434448, "logps/chosen": -4.463529586791992, "logps/rejected": -5.1967034339904785, "loss": 0.0517, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.463529586791992, "rewards/margins": 0.7331734895706177, "rewards/rejected": -5.1967034339904785, "sft_loss": 4.167792320251465, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 0.7170772833367152, "learning_rate": 3.415864609245322e-07, "logits/chosen": -0.8435789942741394, "logits/rejected": -0.6109930276870728, "logps/chosen": -4.614388465881348, "logps/rejected": -5.201871871948242, "loss": 0.0538, "rewards/accuracies": 0.59375, "rewards/chosen": -4.614388465881348, "rewards/margins": 0.5874830484390259, "rewards/rejected": -5.201871871948242, "sft_loss": 4.346115589141846, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": -0.32916495203971863, "eval_logits/rejected": -0.2475946843624115, "eval_logps/chosen": -4.419311046600342, "eval_logps/rejected": -5.063793659210205, "eval_loss": 0.050544675439596176, "eval_rewards/accuracies": 0.6847180724143982, "eval_rewards/chosen": -4.419311046600342, "eval_rewards/margins": 0.6444829702377319, "eval_rewards/rejected": -5.063793659210205, "eval_runtime": 43.3773, "eval_samples_per_second": 31.007, "eval_sft_loss": 4.043381214141846, "eval_steps_per_second": 7.769, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 0.4483136535653781, "learning_rate": 3.401100613330605e-07, "logits/chosen": -0.9276592135429382, "logits/rejected": -0.9318366050720215, "logps/chosen": -4.496493339538574, "logps/rejected": -4.951340675354004, "loss": 0.0527, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.496493339538574, "rewards/margins": 0.45484742522239685, "rewards/rejected": -4.951340675354004, "sft_loss": 4.23056173324585, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 0.31593869243204614, "learning_rate": 3.3863521299447514e-07, "logits/chosen": -0.9971421360969543, "logits/rejected": -0.8217166662216187, "logps/chosen": -4.537590980529785, "logps/rejected": -5.150124549865723, "loss": 0.0522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.537590980529785, "rewards/margins": 0.6125333905220032, "rewards/rejected": -5.150124549865723, "sft_loss": 4.283602714538574, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 0.40521049552058575, "learning_rate": 3.371619302177609e-07, "logits/chosen": -0.8417550921440125, "logits/rejected": -0.700288712978363, "logps/chosen": -4.353780269622803, "logps/rejected": -4.941786766052246, "loss": 0.0529, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.353780269622803, "rewards/margins": 0.5880061388015747, "rewards/rejected": -4.941786766052246, "sft_loss": 4.163086891174316, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 0.5389906830610456, "learning_rate": 3.3569022729671393e-07, "logits/chosen": -0.9811625480651855, "logits/rejected": -0.8910226821899414, "logps/chosen": -4.600131511688232, "logps/rejected": -5.100063800811768, "loss": 0.0536, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.600131511688232, "rewards/margins": 0.4999319016933441, "rewards/rejected": -5.100063800811768, "sft_loss": 4.426603317260742, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 0.4394386395821723, "learning_rate": 3.342201185098024e-07, "logits/chosen": -0.8498247265815735, "logits/rejected": -0.9369813203811646, "logps/chosen": -4.571401119232178, "logps/rejected": -4.972775936126709, "loss": 0.0524, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.571401119232178, "rewards/margins": 0.4013746380805969, "rewards/rejected": -4.972775936126709, "sft_loss": 4.325285911560059, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 0.38906540416917657, "learning_rate": 3.3275161812002807e-07, "logits/chosen": -0.9747118949890137, "logits/rejected": -0.9587947726249695, "logps/chosen": -4.523557662963867, "logps/rejected": -5.116204261779785, "loss": 0.0529, "rewards/accuracies": 0.65625, "rewards/chosen": -4.523557662963867, "rewards/margins": 0.5926466584205627, "rewards/rejected": -5.116204261779785, "sft_loss": 4.328993797302246, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 0.37734258771151635, "learning_rate": 3.312847403747883e-07, "logits/chosen": -1.0632799863815308, "logits/rejected": -0.9446859359741211, "logps/chosen": -4.447041988372803, "logps/rejected": -5.011303424835205, "loss": 0.0519, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.447041988372803, "rewards/margins": 0.5642611384391785, "rewards/rejected": -5.011303424835205, "sft_loss": 4.248290061950684, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 0.3448461543727165, "learning_rate": 3.2981949950573733e-07, "logits/chosen": -0.950855553150177, "logits/rejected": -0.9059017300605774, "logps/chosen": -4.567697048187256, "logps/rejected": -4.874037742614746, "loss": 0.0533, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.567697048187256, "rewards/margins": 0.3063400685787201, "rewards/rejected": -4.874037742614746, "sft_loss": 4.307459831237793, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 0.3138553731803001, "learning_rate": 3.283559097286486e-07, "logits/chosen": -0.9531084895133972, "logits/rejected": -0.8209444284439087, "logps/chosen": -4.5451555252075195, "logps/rejected": -5.012529373168945, "loss": 0.0524, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.5451555252075195, "rewards/margins": 0.4673749506473541, "rewards/rejected": -5.012529373168945, "sft_loss": 4.314141273498535, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 0.4786368381669681, "learning_rate": 3.268939852432765e-07, "logits/chosen": -1.0215755701065063, "logits/rejected": -0.9069620370864868, "logps/chosen": -4.601208686828613, "logps/rejected": -4.959532737731934, "loss": 0.054, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.601208686828613, "rewards/margins": 0.3583240211009979, "rewards/rejected": -4.959532737731934, "sft_loss": 4.377420425415039, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 0.560950653630944, "learning_rate": 3.254337402332187e-07, "logits/chosen": -0.8691121935844421, "logits/rejected": -0.7701493501663208, "logps/chosen": -4.460612773895264, "logps/rejected": -5.0053582191467285, "loss": 0.052, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.460612773895264, "rewards/margins": 0.5447447299957275, "rewards/rejected": -5.0053582191467285, "sft_loss": 4.138766288757324, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 0.7040297008633517, "learning_rate": 3.239751888657788e-07, "logits/chosen": -0.9016326665878296, "logits/rejected": -0.7556005120277405, "logps/chosen": -4.512946605682373, "logps/rejected": -5.122220993041992, "loss": 0.0524, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.512946605682373, "rewards/margins": 0.6092746257781982, "rewards/rejected": -5.122220993041992, "sft_loss": 4.221314907073975, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 0.4938417120856117, "learning_rate": 3.2251834529182856e-07, "logits/chosen": -0.9205803871154785, "logits/rejected": -0.8506709337234497, "logps/chosen": -4.682583332061768, "logps/rejected": -5.245231628417969, "loss": 0.052, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.682583332061768, "rewards/margins": 0.5626480579376221, "rewards/rejected": -5.245231628417969, "sft_loss": 4.3715009689331055, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 0.6781558880822852, "learning_rate": 3.2106322364567075e-07, "logits/chosen": -0.9177119135856628, "logits/rejected": -0.7638968825340271, "logps/chosen": -4.520151138305664, "logps/rejected": -5.164031028747559, "loss": 0.0528, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.520151138305664, "rewards/margins": 0.6438802480697632, "rewards/rejected": -5.164031028747559, "sft_loss": 4.398355484008789, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 0.43076111918754245, "learning_rate": 3.1960983804490183e-07, "logits/chosen": -0.8420068621635437, "logits/rejected": -0.6891841888427734, "logps/chosen": -4.482913017272949, "logps/rejected": -5.048392295837402, "loss": 0.053, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.482913017272949, "rewards/margins": 0.5654793977737427, "rewards/rejected": -5.048392295837402, "sft_loss": 4.185931205749512, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 0.47109133227895944, "learning_rate": 3.1815820259027537e-07, "logits/chosen": -0.8213092088699341, "logits/rejected": -0.7251837849617004, "logps/chosen": -4.373992919921875, "logps/rejected": -4.963822841644287, "loss": 0.051, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.373992919921875, "rewards/margins": 0.5898297429084778, "rewards/rejected": -4.963822841644287, "sft_loss": 4.038293838500977, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 0.4587038973394695, "learning_rate": 3.16708331365565e-07, "logits/chosen": -0.8804284334182739, "logits/rejected": -0.7934475541114807, "logps/chosen": -4.585146903991699, "logps/rejected": -5.182665824890137, "loss": 0.0523, "rewards/accuracies": 0.6875, "rewards/chosen": -4.585146903991699, "rewards/margins": 0.5975189805030823, "rewards/rejected": -5.182665824890137, "sft_loss": 4.350895881652832, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 0.4353653760418129, "learning_rate": 3.152602384374275e-07, "logits/chosen": -0.8628435134887695, "logits/rejected": -0.6308306455612183, "logps/chosen": -4.509026527404785, "logps/rejected": -5.229661464691162, "loss": 0.0514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.509026527404785, "rewards/margins": 0.720634937286377, "rewards/rejected": -5.229661464691162, "sft_loss": 4.181239128112793, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 0.4775253211912713, "learning_rate": 3.1381393785526697e-07, "logits/chosen": -0.8128175735473633, "logits/rejected": -0.7428448796272278, "logps/chosen": -4.55011510848999, "logps/rejected": -5.149205207824707, "loss": 0.0518, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.55011510848999, "rewards/margins": 0.5990898013114929, "rewards/rejected": -5.149205207824707, "sft_loss": 4.240631103515625, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 0.4083359155614466, "learning_rate": 3.123694436510979e-07, "logits/chosen": -0.7938219308853149, "logits/rejected": -0.6490747928619385, "logps/chosen": -4.4928998947143555, "logps/rejected": -5.018836498260498, "loss": 0.0531, "rewards/accuracies": 0.65625, "rewards/chosen": -4.4928998947143555, "rewards/margins": 0.5259365439414978, "rewards/rejected": -5.018836498260498, "sft_loss": 4.249953269958496, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 0.4099432277759763, "learning_rate": 3.1092676983940946e-07, "logits/chosen": -0.8014167547225952, "logits/rejected": -0.7310387492179871, "logps/chosen": -4.47890567779541, "logps/rejected": -5.038219928741455, "loss": 0.0518, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.47890567779541, "rewards/margins": 0.5593137145042419, "rewards/rejected": -5.038219928741455, "sft_loss": 4.150893211364746, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 0.3350153632175618, "learning_rate": 3.094859304170293e-07, "logits/chosen": -0.6266270875930786, "logits/rejected": -0.6359224915504456, "logps/chosen": -4.500874042510986, "logps/rejected": -5.046133995056152, "loss": 0.0523, "rewards/accuracies": 0.65625, "rewards/chosen": -4.500874042510986, "rewards/margins": 0.5452605485916138, "rewards/rejected": -5.046133995056152, "sft_loss": 4.219162940979004, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 0.3536268621775762, "learning_rate": 3.0804693936298795e-07, "logits/chosen": -0.8250945210456848, "logits/rejected": -0.7901517152786255, "logps/chosen": -4.515018463134766, "logps/rejected": -5.176741600036621, "loss": 0.052, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.515018463134766, "rewards/margins": 0.6617237329483032, "rewards/rejected": -5.176741600036621, "sft_loss": 4.3591413497924805, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 0.40375392263119575, "learning_rate": 3.066098106383826e-07, "logits/chosen": -0.8308378458023071, "logits/rejected": -0.7185466289520264, "logps/chosen": -4.537173271179199, "logps/rejected": -5.094812870025635, "loss": 0.0517, "rewards/accuracies": 0.6875, "rewards/chosen": -4.537173271179199, "rewards/margins": 0.5576392412185669, "rewards/rejected": -5.094812870025635, "sft_loss": 4.137363433837891, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 0.48157595238238843, "learning_rate": 3.0517455818624263e-07, "logits/chosen": -0.8573415875434875, "logits/rejected": -0.7897814512252808, "logps/chosen": -4.485922813415527, "logps/rejected": -5.143821716308594, "loss": 0.0521, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.485922813415527, "rewards/margins": 0.6578995585441589, "rewards/rejected": -5.143821716308594, "sft_loss": 4.253503799438477, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 0.4845596241734785, "learning_rate": 3.037411959313936e-07, "logits/chosen": -0.7231793403625488, "logits/rejected": -0.5665433406829834, "logps/chosen": -4.50700044631958, "logps/rejected": -5.043286323547363, "loss": 0.0527, "rewards/accuracies": 0.6875, "rewards/chosen": -4.50700044631958, "rewards/margins": 0.5362854599952698, "rewards/rejected": -5.043286323547363, "sft_loss": 4.2585368156433105, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 1.072754132396118, "learning_rate": 3.023097377803224e-07, "logits/chosen": -0.7113522887229919, "logits/rejected": -0.6499849557876587, "logps/chosen": -4.468755722045898, "logps/rejected": -5.006053447723389, "loss": 0.0533, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.468755722045898, "rewards/margins": 0.5372985005378723, "rewards/rejected": -5.006053447723389, "sft_loss": 4.215538024902344, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 0.4718352567577755, "learning_rate": 3.008801976210423e-07, "logits/chosen": -0.7235435843467712, "logits/rejected": -0.6755806803703308, "logps/chosen": -4.443969249725342, "logps/rejected": -4.865131855010986, "loss": 0.053, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.443969249725342, "rewards/margins": 0.42116230726242065, "rewards/rejected": -4.865131855010986, "sft_loss": 4.133395195007324, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 0.45285574666734346, "learning_rate": 2.994525893229581e-07, "logits/chosen": -0.7920453548431396, "logits/rejected": -0.7268325090408325, "logps/chosen": -4.540145397186279, "logps/rejected": -5.200549125671387, "loss": 0.0507, "rewards/accuracies": 0.75, "rewards/chosen": -4.540145397186279, "rewards/margins": 0.6604036688804626, "rewards/rejected": -5.200549125671387, "sft_loss": 4.260335445404053, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 0.4211428890658203, "learning_rate": 2.98026926736732e-07, "logits/chosen": -0.8909608721733093, "logits/rejected": -0.7725866436958313, "logps/chosen": -4.55959415435791, "logps/rejected": -5.068253040313721, "loss": 0.0526, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.55959415435791, "rewards/margins": 0.5086590647697449, "rewards/rejected": -5.068253040313721, "sft_loss": 4.258984088897705, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 0.4308395644884328, "learning_rate": 2.9660322369414846e-07, "logits/chosen": -0.8293676376342773, "logits/rejected": -0.6661375761032104, "logps/chosen": -4.575963973999023, "logps/rejected": -5.214879989624023, "loss": 0.0526, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.575963973999023, "rewards/margins": 0.638916015625, "rewards/rejected": -5.214879989624023, "sft_loss": 4.343644618988037, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 0.2876446754182886, "learning_rate": 2.9518149400798063e-07, "logits/chosen": -0.850312352180481, "logits/rejected": -0.8274194002151489, "logps/chosen": -4.500282287597656, "logps/rejected": -5.1650567054748535, "loss": 0.0517, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.500282287597656, "rewards/margins": 0.6647745370864868, "rewards/rejected": -5.1650567054748535, "sft_loss": 4.275763511657715, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 0.39282700418404237, "learning_rate": 2.9376175147185633e-07, "logits/chosen": -0.844325065612793, "logits/rejected": -0.5920781493186951, "logps/chosen": -4.502379894256592, "logps/rejected": -5.151594638824463, "loss": 0.0511, "rewards/accuracies": 0.6875, "rewards/chosen": -4.502379894256592, "rewards/margins": 0.649213969707489, "rewards/rejected": -5.151594638824463, "sft_loss": 4.227784633636475, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 0.42074838124805847, "learning_rate": 2.9234400986012376e-07, "logits/chosen": -0.8818314671516418, "logits/rejected": -0.6762841939926147, "logps/chosen": -4.428236961364746, "logps/rejected": -5.1282124519348145, "loss": 0.051, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.428236961364746, "rewards/margins": 0.699974775314331, "rewards/rejected": -5.1282124519348145, "sft_loss": 4.132842063903809, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 0.4956780345069873, "learning_rate": 2.9092828292771817e-07, "logits/chosen": -0.7929924130439758, "logits/rejected": -0.7674404382705688, "logps/chosen": -4.421003818511963, "logps/rejected": -5.078767776489258, "loss": 0.0515, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.421003818511963, "rewards/margins": 0.6577636003494263, "rewards/rejected": -5.078767776489258, "sft_loss": 4.150858402252197, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 0.3934700234275401, "learning_rate": 2.8951458441002875e-07, "logits/chosen": -0.698670506477356, "logits/rejected": -0.7248165011405945, "logps/chosen": -4.2970147132873535, "logps/rejected": -4.952702522277832, "loss": 0.0513, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.2970147132873535, "rewards/margins": 0.655687689781189, "rewards/rejected": -4.952702522277832, "sft_loss": 4.0070271492004395, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 0.514669682771898, "learning_rate": 2.881029280227643e-07, "logits/chosen": -0.8328143358230591, "logits/rejected": -0.6541591882705688, "logps/chosen": -4.35631799697876, "logps/rejected": -5.068148612976074, "loss": 0.0509, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.35631799697876, "rewards/margins": 0.7118302583694458, "rewards/rejected": -5.068148612976074, "sft_loss": 4.113364219665527, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 0.5041909432877334, "learning_rate": 2.8669332746182177e-07, "logits/chosen": -0.9019726514816284, "logits/rejected": -0.7289915084838867, "logps/chosen": -4.497717380523682, "logps/rejected": -5.219419002532959, "loss": 0.0522, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.497717380523682, "rewards/margins": 0.7217018604278564, "rewards/rejected": -5.219419002532959, "sft_loss": 4.304531097412109, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 0.39504962692275486, "learning_rate": 2.8528579640315156e-07, "logits/chosen": -0.7584611177444458, "logits/rejected": -0.7723220586776733, "logps/chosen": -4.636214733123779, "logps/rejected": -5.066348075866699, "loss": 0.0523, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.636214733123779, "rewards/margins": 0.43013325333595276, "rewards/rejected": -5.066348075866699, "sft_loss": 4.299112796783447, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 0.670166400454059, "learning_rate": 2.8388034850262646e-07, "logits/chosen": -0.7787135243415833, "logits/rejected": -0.6269806623458862, "logps/chosen": -4.497548580169678, "logps/rejected": -5.134943962097168, "loss": 0.0522, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.497548580169678, "rewards/margins": 0.637395441532135, "rewards/rejected": -5.134943962097168, "sft_loss": 4.2226152420043945, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 0.41224442261042604, "learning_rate": 2.824769973959079e-07, "logits/chosen": -0.770029604434967, "logits/rejected": -0.6335395574569702, "logps/chosen": -4.504012584686279, "logps/rejected": -5.04500150680542, "loss": 0.0524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.504012584686279, "rewards/margins": 0.5409889221191406, "rewards/rejected": -5.04500150680542, "sft_loss": 4.187629222869873, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 0.44522007655542895, "learning_rate": 2.81075756698315e-07, "logits/chosen": -0.615141749382019, "logits/rejected": -0.5446062088012695, "logps/chosen": -4.580349922180176, "logps/rejected": -5.290804862976074, "loss": 0.0515, "rewards/accuracies": 0.71875, "rewards/chosen": -4.580349922180176, "rewards/margins": 0.7104545831680298, "rewards/rejected": -5.290804862976074, "sft_loss": 4.226175785064697, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 0.36982501701742904, "learning_rate": 2.7967664000469035e-07, "logits/chosen": -0.8423372507095337, "logits/rejected": -0.7574399709701538, "logps/chosen": -4.433310031890869, "logps/rejected": -5.036944389343262, "loss": 0.0507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.433310031890869, "rewards/margins": 0.6036348342895508, "rewards/rejected": -5.036944389343262, "sft_loss": 4.126814842224121, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 0.4483686878186856, "learning_rate": 2.7827966088927095e-07, "logits/chosen": -0.8709455728530884, "logits/rejected": -0.6395670175552368, "logps/chosen": -4.461734294891357, "logps/rejected": -5.337707042694092, "loss": 0.051, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.461734294891357, "rewards/margins": 0.8759723901748657, "rewards/rejected": -5.337707042694092, "sft_loss": 4.206548690795898, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 0.5525955846494557, "learning_rate": 2.768848329055538e-07, "logits/chosen": -0.7681508660316467, "logits/rejected": -0.725921630859375, "logps/chosen": -4.424405574798584, "logps/rejected": -5.064883232116699, "loss": 0.0523, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.424405574798584, "rewards/margins": 0.6404775977134705, "rewards/rejected": -5.064883232116699, "sft_loss": 4.222466945648193, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 0.6304674537764908, "learning_rate": 2.7549216958616657e-07, "logits/chosen": -0.8746339678764343, "logits/rejected": -0.734113872051239, "logps/chosen": -4.404292106628418, "logps/rejected": -5.094416618347168, "loss": 0.0508, "rewards/accuracies": 0.65625, "rewards/chosen": -4.404292106628418, "rewards/margins": 0.6901249885559082, "rewards/rejected": -5.094416618347168, "sft_loss": 4.037627220153809, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 0.5009731987661138, "learning_rate": 2.741016844427344e-07, "logits/chosen": -0.7817445993423462, "logits/rejected": -0.601385235786438, "logps/chosen": -4.428588390350342, "logps/rejected": -5.240962028503418, "loss": 0.0516, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.428588390350342, "rewards/margins": 0.8123737573623657, "rewards/rejected": -5.240962028503418, "sft_loss": 4.188836097717285, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 0.6299463597143712, "learning_rate": 2.7271339096575073e-07, "logits/chosen": -0.7218152284622192, "logits/rejected": -0.577179491519928, "logps/chosen": -4.575904846191406, "logps/rejected": -5.296813011169434, "loss": 0.0523, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.575904846191406, "rewards/margins": 0.7209089994430542, "rewards/rejected": -5.296813011169434, "sft_loss": 4.2866058349609375, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 0.4385588051996929, "learning_rate": 2.713273026244446e-07, "logits/chosen": -0.8641475439071655, "logits/rejected": -0.6361261606216431, "logps/chosen": -4.40885066986084, "logps/rejected": -5.324732303619385, "loss": 0.0502, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.40885066986084, "rewards/margins": 0.9158821105957031, "rewards/rejected": -5.324732303619385, "sft_loss": 4.178675651550293, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 0.4168058120641021, "learning_rate": 2.6994343286665156e-07, "logits/chosen": -0.8316490054130554, "logits/rejected": -0.619785726070404, "logps/chosen": -4.314603328704834, "logps/rejected": -5.067779064178467, "loss": 0.0518, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.314603328704834, "rewards/margins": 0.7531753778457642, "rewards/rejected": -5.067779064178467, "sft_loss": 4.156607627868652, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 0.5075649961997, "learning_rate": 2.6856179511868156e-07, "logits/chosen": -0.7296597957611084, "logits/rejected": -0.5145146250724792, "logps/chosen": -4.354414463043213, "logps/rejected": -5.239541530609131, "loss": 0.0511, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.354414463043213, "rewards/margins": 0.8851274251937866, "rewards/rejected": -5.239541530609131, "sft_loss": 4.124449253082275, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 1.1238836661784852, "learning_rate": 2.6718240278519056e-07, "logits/chosen": -0.6999574899673462, "logits/rejected": -0.5747981071472168, "logps/chosen": -4.261233329772949, "logps/rejected": -5.282289981842041, "loss": 0.0517, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.261233329772949, "rewards/margins": 1.021056056022644, "rewards/rejected": -5.282289981842041, "sft_loss": 4.051290035247803, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 0.4627375151319719, "learning_rate": 2.6580526924904866e-07, "logits/chosen": -0.8693877458572388, "logits/rejected": -0.7199397087097168, "logps/chosen": -4.682175636291504, "logps/rejected": -5.245555400848389, "loss": 0.0524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.682175636291504, "rewards/margins": 0.5633805990219116, "rewards/rejected": -5.245555400848389, "sft_loss": 4.376290321350098, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 0.5160298725641026, "learning_rate": 2.6443040787121186e-07, "logits/chosen": -0.9046756029129028, "logits/rejected": -0.8257226943969727, "logps/chosen": -4.422331809997559, "logps/rejected": -5.142786979675293, "loss": 0.051, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.422331809997559, "rewards/margins": 0.720454752445221, "rewards/rejected": -5.142786979675293, "sft_loss": 4.169841289520264, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 0.5638296933110125, "learning_rate": 2.6305783199059084e-07, "logits/chosen": -0.7855437397956848, "logits/rejected": -0.7157570123672485, "logps/chosen": -4.402723789215088, "logps/rejected": -4.955962181091309, "loss": 0.053, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.402723789215088, "rewards/margins": 0.5532382726669312, "rewards/rejected": -4.955962181091309, "sft_loss": 4.0784478187561035, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 0.38165204325516433, "learning_rate": 2.6168755492392324e-07, "logits/chosen": -0.7893201112747192, "logits/rejected": -0.637191653251648, "logps/chosen": -4.291540145874023, "logps/rejected": -5.171633720397949, "loss": 0.0489, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.291540145874023, "rewards/margins": 0.8800934553146362, "rewards/rejected": -5.171633720397949, "sft_loss": 3.9720988273620605, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 0.6583163664132554, "learning_rate": 2.6031958996564274e-07, "logits/chosen": -0.8037883043289185, "logits/rejected": -0.7117749452590942, "logps/chosen": -4.445651054382324, "logps/rejected": -5.30559778213501, "loss": 0.0525, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.445651054382324, "rewards/margins": 0.859946608543396, "rewards/rejected": -5.30559778213501, "sft_loss": 4.167903900146484, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 0.4715461336878274, "learning_rate": 2.589539503877518e-07, "logits/chosen": -0.7256428599357605, "logits/rejected": -0.6271122097969055, "logps/chosen": -4.560754776000977, "logps/rejected": -5.249648094177246, "loss": 0.0515, "rewards/accuracies": 0.6875, "rewards/chosen": -4.560754776000977, "rewards/margins": 0.6888927817344666, "rewards/rejected": -5.249648094177246, "sft_loss": 4.257047176361084, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 0.5647077173254447, "learning_rate": 2.5759064943969125e-07, "logits/chosen": -0.8433957099914551, "logits/rejected": -0.6119260787963867, "logps/chosen": -4.4959588050842285, "logps/rejected": -5.253961086273193, "loss": 0.0503, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.4959588050842285, "rewards/margins": 0.7580021023750305, "rewards/rejected": -5.253961086273193, "sft_loss": 4.209029197692871, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 0.3465141084373431, "learning_rate": 2.562297003482131e-07, "logits/chosen": -0.660112202167511, "logits/rejected": -0.6809910535812378, "logps/chosen": -4.294495582580566, "logps/rejected": -5.017926216125488, "loss": 0.0508, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.294495582580566, "rewards/margins": 0.7234315872192383, "rewards/rejected": -5.017926216125488, "sft_loss": 4.066096782684326, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 0.3979943305801828, "learning_rate": 2.548711163172512e-07, "logits/chosen": -0.700593888759613, "logits/rejected": -0.64739990234375, "logps/chosen": -4.466081142425537, "logps/rejected": -4.965723514556885, "loss": 0.0534, "rewards/accuracies": 0.6875, "rewards/chosen": -4.466081142425537, "rewards/margins": 0.49964267015457153, "rewards/rejected": -4.965723514556885, "sft_loss": 4.193602561950684, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 0.42928400117151966, "learning_rate": 2.53514910527794e-07, "logits/chosen": -0.727473795413971, "logits/rejected": -0.5996404886245728, "logps/chosen": -4.518040180206299, "logps/rejected": -5.068653583526611, "loss": 0.0521, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.518040180206299, "rewards/margins": 0.5506137609481812, "rewards/rejected": -5.068653583526611, "sft_loss": 4.262098789215088, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 0.37355707424726675, "learning_rate": 2.5216109613775573e-07, "logits/chosen": -0.8052509427070618, "logits/rejected": -0.6294256448745728, "logps/chosen": -4.585465431213379, "logps/rejected": -5.209853649139404, "loss": 0.0529, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.585465431213379, "rewards/margins": 0.6243882179260254, "rewards/rejected": -5.209853649139404, "sft_loss": 4.374361991882324, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 0.48572956394977185, "learning_rate": 2.5080968628184993e-07, "logits/chosen": -0.7453235387802124, "logits/rejected": -0.5784560441970825, "logps/chosen": -4.471060752868652, "logps/rejected": -5.356838226318359, "loss": 0.0507, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.471060752868652, "rewards/margins": 0.8857781291007996, "rewards/rejected": -5.356838226318359, "sft_loss": 4.242326736450195, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 0.6413231575213167, "learning_rate": 2.494606940714605e-07, "logits/chosen": -0.694869875907898, "logits/rejected": -0.641283392906189, "logps/chosen": -4.33608865737915, "logps/rejected": -5.122622489929199, "loss": 0.0509, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.33608865737915, "rewards/margins": 0.7865341305732727, "rewards/rejected": -5.122622489929199, "sft_loss": 4.0861897468566895, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 0.6794536127536217, "learning_rate": 2.4811413259451625e-07, "logits/chosen": -0.8177544474601746, "logits/rejected": -0.6390523910522461, "logps/chosen": -4.208698272705078, "logps/rejected": -4.995844841003418, "loss": 0.051, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.208698272705078, "rewards/margins": 0.7871468663215637, "rewards/rejected": -4.995844841003418, "sft_loss": 4.0283613204956055, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 0.635316600850136, "learning_rate": 2.46770014915362e-07, "logits/chosen": -0.7075114846229553, "logits/rejected": -0.6430591344833374, "logps/chosen": -4.4350762367248535, "logps/rejected": -5.117311000823975, "loss": 0.053, "rewards/accuracies": 0.6875, "rewards/chosen": -4.4350762367248535, "rewards/margins": 0.6822348833084106, "rewards/rejected": -5.117311000823975, "sft_loss": 4.175159454345703, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 0.5542245200634366, "learning_rate": 2.45428354074634e-07, "logits/chosen": -0.7162960171699524, "logits/rejected": -0.6563288569450378, "logps/chosen": -4.496886730194092, "logps/rejected": -5.226065635681152, "loss": 0.0516, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.496886730194092, "rewards/margins": 0.7291792631149292, "rewards/rejected": -5.226065635681152, "sft_loss": 4.221349716186523, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 0.6123482841719484, "learning_rate": 2.4408916308913105e-07, "logits/chosen": -0.7921692132949829, "logits/rejected": -0.5897735357284546, "logps/chosen": -4.634129524230957, "logps/rejected": -5.069488525390625, "loss": 0.0544, "rewards/accuracies": 0.5625, "rewards/chosen": -4.634129524230957, "rewards/margins": 0.4353586733341217, "rewards/rejected": -5.069488525390625, "sft_loss": 4.371542930603027, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 0.4166900257171104, "learning_rate": 2.4275245495169025e-07, "logits/chosen": -0.694656491279602, "logits/rejected": -0.5563549399375916, "logps/chosen": -4.529080867767334, "logps/rejected": -5.242933750152588, "loss": 0.0516, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.529080867767334, "rewards/margins": 0.7138529419898987, "rewards/rejected": -5.242933750152588, "sft_loss": 4.234822750091553, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 0.4856713048710508, "learning_rate": 2.414182426310597e-07, "logits/chosen": -0.7959322333335876, "logits/rejected": -0.769672691822052, "logps/chosen": -4.510166168212891, "logps/rejected": -5.311366081237793, "loss": 0.0515, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.510166168212891, "rewards/margins": 0.801199734210968, "rewards/rejected": -5.311366081237793, "sft_loss": 4.296000003814697, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 0.6243875236957093, "learning_rate": 2.400865390717734e-07, "logits/chosen": -0.7460509538650513, "logits/rejected": -0.6690871119499207, "logps/chosen": -4.389871120452881, "logps/rejected": -5.315016746520996, "loss": 0.0516, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.389871120452881, "rewards/margins": 0.9251459240913391, "rewards/rejected": -5.315016746520996, "sft_loss": 4.183564186096191, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 0.5658353195120891, "learning_rate": 2.3875735719402475e-07, "logits/chosen": -0.7674288749694824, "logits/rejected": -0.634276807308197, "logps/chosen": -4.3222336769104, "logps/rejected": -5.103603839874268, "loss": 0.0507, "rewards/accuracies": 0.71875, "rewards/chosen": -4.3222336769104, "rewards/margins": 0.7813698649406433, "rewards/rejected": -5.103603839874268, "sft_loss": 4.120206356048584, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 0.5640306652342747, "learning_rate": 2.3743070989354258e-07, "logits/chosen": -0.7015902996063232, "logits/rejected": -0.6395735740661621, "logps/chosen": -4.416680335998535, "logps/rejected": -5.131161689758301, "loss": 0.0512, "rewards/accuracies": 0.625, "rewards/chosen": -4.416680335998535, "rewards/margins": 0.7144818305969238, "rewards/rejected": -5.131161689758301, "sft_loss": 4.159177780151367, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 0.8818729398968036, "learning_rate": 2.3610661004146454e-07, "logits/chosen": -0.6776810884475708, "logits/rejected": -0.5723994374275208, "logps/chosen": -4.257053375244141, "logps/rejected": -4.910529136657715, "loss": 0.0505, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.257053375244141, "rewards/margins": 0.6534755229949951, "rewards/rejected": -4.910529136657715, "sft_loss": 3.978513240814209, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 0.42266618942146955, "learning_rate": 2.3478507048421314e-07, "logits/chosen": -0.7549012899398804, "logits/rejected": -0.6445156335830688, "logps/chosen": -4.521634578704834, "logps/rejected": -5.223923683166504, "loss": 0.0515, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.521634578704834, "rewards/margins": 0.7022888660430908, "rewards/rejected": -5.223923683166504, "sft_loss": 4.206809997558594, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 0.46043866853182125, "learning_rate": 2.334661040433713e-07, "logits/chosen": -0.8515769243240356, "logits/rejected": -0.7206335663795471, "logps/chosen": -4.5367536544799805, "logps/rejected": -5.210167407989502, "loss": 0.0517, "rewards/accuracies": 0.6875, "rewards/chosen": -4.5367536544799805, "rewards/margins": 0.6734140515327454, "rewards/rejected": -5.210167407989502, "sft_loss": 4.247954368591309, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 0.5358993416695645, "learning_rate": 2.321497235155568e-07, "logits/chosen": -0.8475052118301392, "logits/rejected": -0.688243567943573, "logps/chosen": -4.262228965759277, "logps/rejected": -5.043619632720947, "loss": 0.0517, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.262228965759277, "rewards/margins": 0.7813904881477356, "rewards/rejected": -5.043619632720947, "sft_loss": 4.01896858215332, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 0.4150651391308651, "learning_rate": 2.3083594167229965e-07, "logits/chosen": -0.8656677007675171, "logits/rejected": -0.5582382678985596, "logps/chosen": -4.472632884979248, "logps/rejected": -5.255242347717285, "loss": 0.0509, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.472632884979248, "rewards/margins": 0.7826098203659058, "rewards/rejected": -5.255242347717285, "sft_loss": 4.183423042297363, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 0.4563166112067603, "learning_rate": 2.295247712599167e-07, "logits/chosen": -0.7128655910491943, "logits/rejected": -0.6650645136833191, "logps/chosen": -4.469484806060791, "logps/rejected": -5.218777179718018, "loss": 0.0504, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.469484806060791, "rewards/margins": 0.7492923736572266, "rewards/rejected": -5.218777179718018, "sft_loss": 4.08309268951416, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": -0.2918679714202881, "eval_logits/rejected": -0.21033975481987, "eval_logps/chosen": -4.464621543884277, "eval_logps/rejected": -5.165764808654785, "eval_loss": 0.050455424934625626, "eval_rewards/accuracies": 0.6839762330055237, "eval_rewards/chosen": -4.464621543884277, "eval_rewards/margins": 0.7011440396308899, "eval_rewards/rejected": -5.165764808654785, "eval_runtime": 43.4608, "eval_samples_per_second": 30.947, "eval_sft_loss": 4.058549404144287, "eval_steps_per_second": 7.754, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 0.6727805412244077, "learning_rate": 2.2821622499938948e-07, "logits/chosen": -0.7068689465522766, "logits/rejected": -0.47361889481544495, "logps/chosen": -4.4505085945129395, "logps/rejected": -5.127585411071777, "loss": 0.0508, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.4505085945129395, "rewards/margins": 0.6770769357681274, "rewards/rejected": -5.127585411071777, "sft_loss": 4.1432719230651855, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 0.32224127814306514, "learning_rate": 2.269103155862391e-07, "logits/chosen": -0.8249640464782715, "logits/rejected": -0.7345031499862671, "logps/chosen": -4.459612846374512, "logps/rejected": -5.158649444580078, "loss": 0.05, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.459612846374512, "rewards/margins": 0.6990362405776978, "rewards/rejected": -5.158649444580078, "sft_loss": 4.074860095977783, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 0.5506638370672858, "learning_rate": 2.2560705569040483e-07, "logits/chosen": -0.7756333351135254, "logits/rejected": -0.47715824842453003, "logps/chosen": -4.469395637512207, "logps/rejected": -5.134026527404785, "loss": 0.0527, "rewards/accuracies": 0.65625, "rewards/chosen": -4.469395637512207, "rewards/margins": 0.6646314859390259, "rewards/rejected": -5.134026527404785, "sft_loss": 4.218961238861084, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 0.3443569983012991, "learning_rate": 2.2430645795611963e-07, "logits/chosen": -0.8699603080749512, "logits/rejected": -0.7662721276283264, "logps/chosen": -4.326430320739746, "logps/rejected": -5.181424140930176, "loss": 0.0499, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.326430320739746, "rewards/margins": 0.854993999004364, "rewards/rejected": -5.181424140930176, "sft_loss": 4.081225395202637, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 0.46911133108933106, "learning_rate": 2.230085350017884e-07, "logits/chosen": -0.7873638868331909, "logits/rejected": -0.6871055364608765, "logps/chosen": -4.547414302825928, "logps/rejected": -5.233478546142578, "loss": 0.0521, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.547414302825928, "rewards/margins": 0.6860648393630981, "rewards/rejected": -5.233478546142578, "sft_loss": 4.271318435668945, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 0.3329126350332984, "learning_rate": 2.2171329941986554e-07, "logits/chosen": -0.805925726890564, "logits/rejected": -0.7726612091064453, "logps/chosen": -4.549063682556152, "logps/rejected": -5.230555057525635, "loss": 0.0496, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.549063682556152, "rewards/margins": 0.6814913153648376, "rewards/rejected": -5.230555057525635, "sft_loss": 4.1083083152771, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 0.35451698745934257, "learning_rate": 2.2042076377673202e-07, "logits/chosen": -0.7483960390090942, "logits/rejected": -0.8004158139228821, "logps/chosen": -4.400989055633545, "logps/rejected": -4.926598072052002, "loss": 0.052, "rewards/accuracies": 0.65625, "rewards/chosen": -4.400989055633545, "rewards/margins": 0.5256087183952332, "rewards/rejected": -4.926598072052002, "sft_loss": 4.080153465270996, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 0.3835866808895003, "learning_rate": 2.1913094061257476e-07, "logits/chosen": -0.7398586869239807, "logits/rejected": -0.7862161993980408, "logps/chosen": -4.51585054397583, "logps/rejected": -5.192282676696777, "loss": 0.0509, "rewards/accuracies": 0.75, "rewards/chosen": -4.51585054397583, "rewards/margins": 0.6764318346977234, "rewards/rejected": -5.192282676696777, "sft_loss": 4.22283935546875, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 0.5104943845035439, "learning_rate": 2.178438424412633e-07, "logits/chosen": -0.7213504314422607, "logits/rejected": -0.576121985912323, "logps/chosen": -4.430332183837891, "logps/rejected": -5.1058149337768555, "loss": 0.0516, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.430332183837891, "rewards/margins": 0.6754826307296753, "rewards/rejected": -5.1058149337768555, "sft_loss": 4.154946804046631, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 0.4674501996338367, "learning_rate": 2.165594817502302e-07, "logits/chosen": -0.8302208781242371, "logits/rejected": -0.6873995065689087, "logps/chosen": -4.677610397338867, "logps/rejected": -5.197269916534424, "loss": 0.0549, "rewards/accuracies": 0.625, "rewards/chosen": -4.677610397338867, "rewards/margins": 0.5196588635444641, "rewards/rejected": -5.197269916534424, "sft_loss": 4.4770989418029785, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 0.7933969079589734, "learning_rate": 2.1527787100034806e-07, "logits/chosen": -0.646473228931427, "logits/rejected": -0.5958948135375977, "logps/chosen": -4.4888916015625, "logps/rejected": -5.003674030303955, "loss": 0.0522, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.4888916015625, "rewards/margins": 0.5147822499275208, "rewards/rejected": -5.003674030303955, "sft_loss": 4.261991024017334, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 0.4931963118357809, "learning_rate": 2.1399902262581037e-07, "logits/chosen": -0.7125464677810669, "logits/rejected": -0.5157756209373474, "logps/chosen": -4.63405704498291, "logps/rejected": -5.100339412689209, "loss": 0.0534, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.63405704498291, "rewards/margins": 0.4662818908691406, "rewards/rejected": -5.100339412689209, "sft_loss": 4.32558012008667, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 0.4731138425152067, "learning_rate": 2.127229490340094e-07, "logits/chosen": -0.8338751792907715, "logits/rejected": -0.767744243144989, "logps/chosen": -4.563037872314453, "logps/rejected": -5.240099906921387, "loss": 0.0512, "rewards/accuracies": 0.71875, "rewards/chosen": -4.563037872314453, "rewards/margins": 0.6770623922348022, "rewards/rejected": -5.240099906921387, "sft_loss": 4.253331661224365, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 0.6017968419842186, "learning_rate": 2.1144966260541698e-07, "logits/chosen": -0.7297824025154114, "logits/rejected": -0.48667675256729126, "logps/chosen": -4.502832412719727, "logps/rejected": -5.1569108963012695, "loss": 0.0516, "rewards/accuracies": 0.6875, "rewards/chosen": -4.502832412719727, "rewards/margins": 0.6540783047676086, "rewards/rejected": -5.1569108963012695, "sft_loss": 4.205613613128662, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 0.605056601130136, "learning_rate": 2.1017917569346332e-07, "logits/chosen": -0.7562541961669922, "logits/rejected": -0.5662790536880493, "logps/chosen": -4.355158805847168, "logps/rejected": -5.044077396392822, "loss": 0.0498, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.355158805847168, "rewards/margins": 0.6889182329177856, "rewards/rejected": -5.044077396392822, "sft_loss": 4.024393081665039, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 0.4321712990803465, "learning_rate": 2.0891150062441837e-07, "logits/chosen": -0.7883024215698242, "logits/rejected": -0.6575873494148254, "logps/chosen": -4.471930503845215, "logps/rejected": -5.290343284606934, "loss": 0.0507, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.471930503845215, "rewards/margins": 0.8184129595756531, "rewards/rejected": -5.290343284606934, "sft_loss": 4.168588161468506, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 0.5782685093734775, "learning_rate": 2.0764664969727086e-07, "logits/chosen": -0.6935213804244995, "logits/rejected": -0.6597651243209839, "logps/chosen": -4.432676792144775, "logps/rejected": -5.141415596008301, "loss": 0.0518, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.432676792144775, "rewards/margins": 0.708739161491394, "rewards/rejected": -5.141415596008301, "sft_loss": 4.131624221801758, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 0.4630350649187951, "learning_rate": 2.0638463518361033e-07, "logits/chosen": -0.8443444967269897, "logits/rejected": -0.6547593474388123, "logps/chosen": -4.442532539367676, "logps/rejected": -5.188077449798584, "loss": 0.0509, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.442532539367676, "rewards/margins": 0.7455454468727112, "rewards/rejected": -5.188077449798584, "sft_loss": 4.115714073181152, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 0.38394800610862523, "learning_rate": 2.0512546932750702e-07, "logits/chosen": -0.8786581158638, "logits/rejected": -0.8057855367660522, "logps/chosen": -4.48517370223999, "logps/rejected": -5.181807041168213, "loss": 0.0506, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.48517370223999, "rewards/margins": 0.6966326236724854, "rewards/rejected": -5.181807041168213, "sft_loss": 4.173416614532471, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 0.6202912991395683, "learning_rate": 2.0386916434539343e-07, "logits/chosen": -0.7296054363250732, "logits/rejected": -0.5482798218727112, "logps/chosen": -4.548017501831055, "logps/rejected": -5.236302375793457, "loss": 0.0511, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.548017501831055, "rewards/margins": 0.6882843971252441, "rewards/rejected": -5.236302375793457, "sft_loss": 4.268096446990967, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 0.3488816624853031, "learning_rate": 2.0261573242594627e-07, "logits/chosen": -0.795524001121521, "logits/rejected": -0.5811234712600708, "logps/chosen": -4.477652072906494, "logps/rejected": -5.104580879211426, "loss": 0.051, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.477652072906494, "rewards/margins": 0.6269285678863525, "rewards/rejected": -5.104580879211426, "sft_loss": 4.149083137512207, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 0.6510997768380627, "learning_rate": 2.0136518572996724e-07, "logits/chosen": -0.7480728030204773, "logits/rejected": -0.5544711351394653, "logps/chosen": -4.376706600189209, "logps/rejected": -5.170914173126221, "loss": 0.0506, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.376706600189209, "rewards/margins": 0.7942078709602356, "rewards/rejected": -5.170914173126221, "sft_loss": 4.084542274475098, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 0.7380670951006642, "learning_rate": 2.0011753639026617e-07, "logits/chosen": -0.722009003162384, "logits/rejected": -0.6640048027038574, "logps/chosen": -4.3249921798706055, "logps/rejected": -5.086312294006348, "loss": 0.0513, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.3249921798706055, "rewards/margins": 0.7613199949264526, "rewards/rejected": -5.086312294006348, "sft_loss": 4.073752403259277, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 0.6413941733357926, "learning_rate": 1.988727965115421e-07, "logits/chosen": -0.7592009902000427, "logits/rejected": -0.6666856408119202, "logps/chosen": -4.346851348876953, "logps/rejected": -5.01780366897583, "loss": 0.0516, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.346851348876953, "rewards/margins": 0.6709519028663635, "rewards/rejected": -5.01780366897583, "sft_loss": 4.029229640960693, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 0.513048432496639, "learning_rate": 1.9763097817026713e-07, "logits/chosen": -0.8778330087661743, "logits/rejected": -0.6732766032218933, "logps/chosen": -4.389376640319824, "logps/rejected": -5.263741493225098, "loss": 0.0501, "rewards/accuracies": 0.65625, "rewards/chosen": -4.389376640319824, "rewards/margins": 0.8743650317192078, "rewards/rejected": -5.263741493225098, "sft_loss": 4.129411697387695, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 0.6110412211607691, "learning_rate": 1.9639209341456796e-07, "logits/chosen": -0.713258683681488, "logits/rejected": -0.632881760597229, "logps/chosen": -4.497132778167725, "logps/rejected": -5.232693195343018, "loss": 0.0516, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.497132778167725, "rewards/margins": 0.7355600595474243, "rewards/rejected": -5.232693195343018, "sft_loss": 4.2595624923706055, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 0.6504637140199618, "learning_rate": 1.951561542641102e-07, "logits/chosen": -0.6895782947540283, "logits/rejected": -0.7371370792388916, "logps/chosen": -4.693241119384766, "logps/rejected": -5.284863471984863, "loss": 0.0533, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.693241119384766, "rewards/margins": 0.5916224718093872, "rewards/rejected": -5.284863471984863, "sft_loss": 4.434107780456543, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 0.4543352377658658, "learning_rate": 1.939231727099806e-07, "logits/chosen": -0.9554224014282227, "logits/rejected": -0.9246322512626648, "logps/chosen": -4.4521613121032715, "logps/rejected": -5.085289001464844, "loss": 0.0527, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.4521613121032715, "rewards/margins": 0.6331278085708618, "rewards/rejected": -5.085289001464844, "sft_loss": 4.227784633636475, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 0.4284689952652344, "learning_rate": 1.926931607145719e-07, "logits/chosen": -0.6809648871421814, "logits/rejected": -0.5421003103256226, "logps/chosen": -4.518945693969727, "logps/rejected": -5.160582065582275, "loss": 0.0519, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.518945693969727, "rewards/margins": 0.6416367292404175, "rewards/rejected": -5.160582065582275, "sft_loss": 4.263835906982422, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 0.43424034162181147, "learning_rate": 1.9146613021146564e-07, "logits/chosen": -0.7884284257888794, "logits/rejected": -0.680195152759552, "logps/chosen": -4.412850379943848, "logps/rejected": -5.129262924194336, "loss": 0.0519, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.412850379943848, "rewards/margins": 0.7164131999015808, "rewards/rejected": -5.129262924194336, "sft_loss": 4.188337326049805, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 0.4962627859609203, "learning_rate": 1.9024209310531736e-07, "logits/chosen": -0.7695158123970032, "logits/rejected": -0.7895294427871704, "logps/chosen": -4.313672065734863, "logps/rejected": -4.9209136962890625, "loss": 0.0509, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.313672065734863, "rewards/margins": 0.6072418689727783, "rewards/rejected": -4.9209136962890625, "sft_loss": 3.9769134521484375, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 0.481643698172507, "learning_rate": 1.890210612717401e-07, "logits/chosen": -0.7807799577713013, "logits/rejected": -0.6693819165229797, "logps/chosen": -4.398434638977051, "logps/rejected": -5.057561874389648, "loss": 0.0518, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.398434638977051, "rewards/margins": 0.6591275334358215, "rewards/rejected": -5.057561874389648, "sft_loss": 4.108274936676025, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 0.7413910120393763, "learning_rate": 1.8780304655719054e-07, "logits/chosen": -0.8045433163642883, "logits/rejected": -0.6661940813064575, "logps/chosen": -4.460057258605957, "logps/rejected": -5.185713291168213, "loss": 0.0513, "rewards/accuracies": 0.71875, "rewards/chosen": -4.460057258605957, "rewards/margins": 0.725655734539032, "rewards/rejected": -5.185713291168213, "sft_loss": 4.1655497550964355, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 0.4732959746390582, "learning_rate": 1.865880607788523e-07, "logits/chosen": -0.6616698503494263, "logits/rejected": -0.5911990404129028, "logps/chosen": -4.5712995529174805, "logps/rejected": -5.2418389320373535, "loss": 0.0523, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.5712995529174805, "rewards/margins": 0.6705387234687805, "rewards/rejected": -5.2418389320373535, "sft_loss": 4.364117622375488, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 0.8094323886363942, "learning_rate": 1.8537611572452316e-07, "logits/chosen": -0.794582724571228, "logits/rejected": -0.7341277599334717, "logps/chosen": -4.37639856338501, "logps/rejected": -5.012908935546875, "loss": 0.0517, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.37639856338501, "rewards/margins": 0.6365109086036682, "rewards/rejected": -5.012908935546875, "sft_loss": 4.13184118270874, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 0.4541511078659696, "learning_rate": 1.84167223152499e-07, "logits/chosen": -0.7890401482582092, "logits/rejected": -0.5393490791320801, "logps/chosen": -4.449801921844482, "logps/rejected": -5.224527835845947, "loss": 0.0524, "rewards/accuracies": 0.71875, "rewards/chosen": -4.449801921844482, "rewards/margins": 0.7747262120246887, "rewards/rejected": -5.224527835845947, "sft_loss": 4.180199146270752, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 0.48971091863328464, "learning_rate": 1.8296139479146112e-07, "logits/chosen": -0.733080267906189, "logits/rejected": -0.7528313398361206, "logps/chosen": -4.379557132720947, "logps/rejected": -4.981081008911133, "loss": 0.0507, "rewards/accuracies": 0.6875, "rewards/chosen": -4.379557132720947, "rewards/margins": 0.6015235781669617, "rewards/rejected": -4.981081008911133, "sft_loss": 4.055811405181885, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 0.4429743125528724, "learning_rate": 1.8175864234036132e-07, "logits/chosen": -0.5849838852882385, "logits/rejected": -0.5171209573745728, "logps/chosen": -4.486813545227051, "logps/rejected": -5.217012405395508, "loss": 0.0529, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.486813545227051, "rewards/margins": 0.7301994562149048, "rewards/rejected": -5.217012405395508, "sft_loss": 4.215363502502441, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 0.3529342606467267, "learning_rate": 1.805589774683094e-07, "logits/chosen": -0.8781973719596863, "logits/rejected": -0.7594443559646606, "logps/chosen": -4.630221366882324, "logps/rejected": -5.19809627532959, "loss": 0.0514, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.630221366882324, "rewards/margins": 0.5678743124008179, "rewards/rejected": -5.19809627532959, "sft_loss": 4.259702205657959, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 0.7539970445961938, "learning_rate": 1.79362411814459e-07, "logits/chosen": -0.5950525999069214, "logits/rejected": -0.6381107568740845, "logps/chosen": -4.614771842956543, "logps/rejected": -5.1353583335876465, "loss": 0.0539, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.614771842956543, "rewards/margins": 0.5205863118171692, "rewards/rejected": -5.1353583335876465, "sft_loss": 4.3355183601379395, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 0.47071934411282107, "learning_rate": 1.7816895698789552e-07, "logits/chosen": -0.8427804112434387, "logits/rejected": -0.7817627191543579, "logps/chosen": -4.487260341644287, "logps/rejected": -5.036301612854004, "loss": 0.0517, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.487260341644287, "rewards/margins": 0.5490409135818481, "rewards/rejected": -5.036301612854004, "sft_loss": 4.125479698181152, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 0.5406217374875316, "learning_rate": 1.7697862456752271e-07, "logits/chosen": -0.8153530955314636, "logits/rejected": -0.690669059753418, "logps/chosen": -4.521656036376953, "logps/rejected": -5.366150856018066, "loss": 0.0511, "rewards/accuracies": 0.75, "rewards/chosen": -4.521656036376953, "rewards/margins": 0.8444948196411133, "rewards/rejected": -5.366150856018066, "sft_loss": 4.276778221130371, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 0.5810986187677788, "learning_rate": 1.7579142610195124e-07, "logits/chosen": -0.7868109941482544, "logits/rejected": -0.6204288601875305, "logps/chosen": -4.511598587036133, "logps/rejected": -5.312621116638184, "loss": 0.0517, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.511598587036133, "rewards/margins": 0.8010231256484985, "rewards/rejected": -5.312621116638184, "sft_loss": 4.231732368469238, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 0.3654874885769355, "learning_rate": 1.7460737310938568e-07, "logits/chosen": -0.8439321517944336, "logits/rejected": -0.6458622217178345, "logps/chosen": -4.324358940124512, "logps/rejected": -5.175654411315918, "loss": 0.05, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.324358940124512, "rewards/margins": 0.8512958288192749, "rewards/rejected": -5.175654411315918, "sft_loss": 4.1335344314575195, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 0.3690452777813944, "learning_rate": 1.734264770775133e-07, "logits/chosen": -0.8422131538391113, "logits/rejected": -0.5545368790626526, "logps/chosen": -4.361656665802002, "logps/rejected": -5.022474765777588, "loss": 0.0523, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.361656665802002, "rewards/margins": 0.660818338394165, "rewards/rejected": -5.022474765777588, "sft_loss": 4.097499847412109, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 0.40414939716552284, "learning_rate": 1.7224874946339241e-07, "logits/chosen": -0.799502968788147, "logits/rejected": -0.7204964756965637, "logps/chosen": -4.439566612243652, "logps/rejected": -5.170541763305664, "loss": 0.0505, "rewards/accuracies": 0.65625, "rewards/chosen": -4.439566612243652, "rewards/margins": 0.7309752106666565, "rewards/rejected": -5.170541763305664, "sft_loss": 4.066781044006348, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 0.5952940841735112, "learning_rate": 1.7107420169334186e-07, "logits/chosen": -0.7833027243614197, "logits/rejected": -0.7465739846229553, "logps/chosen": -4.582724094390869, "logps/rejected": -5.141078472137451, "loss": 0.0518, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.582724094390869, "rewards/margins": 0.5583544969558716, "rewards/rejected": -5.141078472137451, "sft_loss": 4.265813827514648, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 0.4398041156053364, "learning_rate": 1.6990284516282893e-07, "logits/chosen": -0.7631226778030396, "logits/rejected": -0.6925583481788635, "logps/chosen": -4.404193878173828, "logps/rejected": -5.143115520477295, "loss": 0.05, "rewards/accuracies": 0.6875, "rewards/chosen": -4.404193878173828, "rewards/margins": 0.7389219403266907, "rewards/rejected": -5.143115520477295, "sft_loss": 4.091065883636475, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 0.5284040359104598, "learning_rate": 1.687346912363602e-07, "logits/chosen": -0.8135977983474731, "logits/rejected": -0.6727418303489685, "logps/chosen": -4.361421585083008, "logps/rejected": -5.143485069274902, "loss": 0.0502, "rewards/accuracies": 0.75, "rewards/chosen": -4.361421585083008, "rewards/margins": 0.7820636630058289, "rewards/rejected": -5.143485069274902, "sft_loss": 4.039944648742676, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 0.427930060724418, "learning_rate": 1.675697512473697e-07, "logits/chosen": -0.7885524034500122, "logits/rejected": -0.5956434011459351, "logps/chosen": -4.521124362945557, "logps/rejected": -5.362552642822266, "loss": 0.0496, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.521124362945557, "rewards/margins": 0.8414284586906433, "rewards/rejected": -5.362552642822266, "sft_loss": 4.1074538230896, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 0.4548490650516944, "learning_rate": 1.6640803649811087e-07, "logits/chosen": -0.806907057762146, "logits/rejected": -0.5140247344970703, "logps/chosen": -4.435779571533203, "logps/rejected": -5.372751712799072, "loss": 0.0502, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.435779571533203, "rewards/margins": 0.9369718432426453, "rewards/rejected": -5.372751712799072, "sft_loss": 4.071789741516113, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 0.433676020311226, "learning_rate": 1.6524955825954472e-07, "logits/chosen": -0.7338653802871704, "logits/rejected": -0.675475001335144, "logps/chosen": -4.471889019012451, "logps/rejected": -5.032140254974365, "loss": 0.0531, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.471889019012451, "rewards/margins": 0.5602517127990723, "rewards/rejected": -5.032140254974365, "sft_loss": 4.22143030166626, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 0.44709679685057646, "learning_rate": 1.6409432777123277e-07, "logits/chosen": -0.8582879304885864, "logits/rejected": -0.7037560939788818, "logps/chosen": -4.3130292892456055, "logps/rejected": -5.295225143432617, "loss": 0.049, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.3130292892456055, "rewards/margins": 0.9821959733963013, "rewards/rejected": -5.295225143432617, "sft_loss": 4.0433030128479, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 0.6454731179033166, "learning_rate": 1.6294235624122577e-07, "logits/chosen": -0.6963884234428406, "logits/rejected": -0.4233129620552063, "logps/chosen": -4.356287956237793, "logps/rejected": -5.090372085571289, "loss": 0.0517, "rewards/accuracies": 0.65625, "rewards/chosen": -4.356287956237793, "rewards/margins": 0.7340839505195618, "rewards/rejected": -5.090372085571289, "sft_loss": 4.082320213317871, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 0.4092375456675233, "learning_rate": 1.6179365484595697e-07, "logits/chosen": -0.7554842233657837, "logits/rejected": -0.6546878218650818, "logps/chosen": -4.4962849617004395, "logps/rejected": -5.201930999755859, "loss": 0.052, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.4962849617004395, "rewards/margins": 0.7056463360786438, "rewards/rejected": -5.201930999755859, "sft_loss": 4.25368595123291, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 0.7796030085865294, "learning_rate": 1.60648234730132e-07, "logits/chosen": -0.8003331422805786, "logits/rejected": -0.7303368449211121, "logps/chosen": -4.419107913970947, "logps/rejected": -5.249577522277832, "loss": 0.05, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.419107913970947, "rewards/margins": 0.8304697275161743, "rewards/rejected": -5.249577522277832, "sft_loss": 4.124005317687988, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 0.5726370771240188, "learning_rate": 1.595061070066222e-07, "logits/chosen": -0.6955457925796509, "logits/rejected": -0.7629978060722351, "logps/chosen": -4.541815757751465, "logps/rejected": -5.194106101989746, "loss": 0.0516, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.541815757751465, "rewards/margins": 0.6522905826568604, "rewards/rejected": -5.194106101989746, "sft_loss": 4.267590522766113, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 0.47755328394996693, "learning_rate": 1.5836728275635542e-07, "logits/chosen": -0.8143297433853149, "logits/rejected": -0.6354633569717407, "logps/chosen": -4.457322120666504, "logps/rejected": -5.215367317199707, "loss": 0.051, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.457322120666504, "rewards/margins": 0.7580451965332031, "rewards/rejected": -5.215367317199707, "sft_loss": 4.198925971984863, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 0.3876912944758544, "learning_rate": 1.5723177302820984e-07, "logits/chosen": -0.8010802268981934, "logits/rejected": -0.7300946712493896, "logps/chosen": -4.525595664978027, "logps/rejected": -5.123916149139404, "loss": 0.0525, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.525595664978027, "rewards/margins": 0.5983200073242188, "rewards/rejected": -5.123916149139404, "sft_loss": 4.291536331176758, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 0.4036539451488429, "learning_rate": 1.5609958883890544e-07, "logits/chosen": -0.6942816376686096, "logits/rejected": -0.5985075235366821, "logps/chosen": -4.369370937347412, "logps/rejected": -5.014688014984131, "loss": 0.0506, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.369370937347412, "rewards/margins": 0.6453171968460083, "rewards/rejected": -5.014688014984131, "sft_loss": 4.008292198181152, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 0.5265430776252449, "learning_rate": 1.5497074117289865e-07, "logits/chosen": -0.753690242767334, "logits/rejected": -0.634030282497406, "logps/chosen": -4.488044738769531, "logps/rejected": -5.264333724975586, "loss": 0.0513, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.488044738769531, "rewards/margins": 0.7762894034385681, "rewards/rejected": -5.264333724975586, "sft_loss": 4.194975852966309, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 0.3790394544769755, "learning_rate": 1.5384524098227402e-07, "logits/chosen": -0.7402902841567993, "logits/rejected": -0.5147100687026978, "logps/chosen": -4.349900245666504, "logps/rejected": -5.224468231201172, "loss": 0.0508, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.349900245666504, "rewards/margins": 0.8745684623718262, "rewards/rejected": -5.224468231201172, "sft_loss": 4.091513156890869, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 0.5268560797622209, "learning_rate": 1.5272309918663974e-07, "logits/chosen": -0.7533577084541321, "logits/rejected": -0.5765639543533325, "logps/chosen": -4.552016735076904, "logps/rejected": -5.05606746673584, "loss": 0.0529, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.552016735076904, "rewards/margins": 0.5040509104728699, "rewards/rejected": -5.05606746673584, "sft_loss": 4.245696544647217, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 0.497137273147704, "learning_rate": 1.516043266730201e-07, "logits/chosen": -0.7740308046340942, "logits/rejected": -0.6450439095497131, "logps/chosen": -4.40801477432251, "logps/rejected": -5.11243200302124, "loss": 0.0523, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.40801477432251, "rewards/margins": 0.7044172286987305, "rewards/rejected": -5.11243200302124, "sft_loss": 4.104895114898682, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 0.5593452729007138, "learning_rate": 1.504889342957512e-07, "logits/chosen": -0.7910572290420532, "logits/rejected": -0.5866572260856628, "logps/chosen": -4.533371448516846, "logps/rejected": -5.306161403656006, "loss": 0.052, "rewards/accuracies": 0.6875, "rewards/chosen": -4.533371448516846, "rewards/margins": 0.7727904319763184, "rewards/rejected": -5.306161403656006, "sft_loss": 4.273785591125488, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 0.4308516372542784, "learning_rate": 1.4937693287637453e-07, "logits/chosen": -0.811202883720398, "logits/rejected": -0.6923697590827942, "logps/chosen": -4.52161169052124, "logps/rejected": -5.191643238067627, "loss": 0.0514, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.52161169052124, "rewards/margins": 0.670030951499939, "rewards/rejected": -5.191643238067627, "sft_loss": 4.221057415008545, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 0.37138887048768443, "learning_rate": 1.4826833320353305e-07, "logits/chosen": -0.7932590246200562, "logits/rejected": -0.7206074595451355, "logps/chosen": -4.624764919281006, "logps/rejected": -5.276534557342529, "loss": 0.0515, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.624764919281006, "rewards/margins": 0.6517696976661682, "rewards/rejected": -5.276534557342529, "sft_loss": 4.188200950622559, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 0.4345851380582179, "learning_rate": 1.4716314603286528e-07, "logits/chosen": -0.8649484515190125, "logits/rejected": -0.634868323802948, "logps/chosen": -4.535511493682861, "logps/rejected": -5.249735355377197, "loss": 0.0517, "rewards/accuracies": 0.6875, "rewards/chosen": -4.535511493682861, "rewards/margins": 0.7142241597175598, "rewards/rejected": -5.249735355377197, "sft_loss": 4.229328632354736, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 0.5104610877999559, "learning_rate": 1.4606138208690233e-07, "logits/chosen": -0.7997044324874878, "logits/rejected": -0.7385850548744202, "logps/chosen": -4.5416460037231445, "logps/rejected": -5.044976711273193, "loss": 0.052, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.5416460037231445, "rewards/margins": 0.5033308267593384, "rewards/rejected": -5.044976711273193, "sft_loss": 4.25277853012085, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 0.3642995428464966, "learning_rate": 1.4496305205496251e-07, "logits/chosen": -0.7962000966072083, "logits/rejected": -0.749233067035675, "logps/chosen": -4.49139404296875, "logps/rejected": -5.204357624053955, "loss": 0.0511, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.49139404296875, "rewards/margins": 0.7129632830619812, "rewards/rejected": -5.204357624053955, "sft_loss": 4.271798133850098, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 0.45592266136726567, "learning_rate": 1.4386816659304895e-07, "logits/chosen": -0.9152060747146606, "logits/rejected": -0.7515543699264526, "logps/chosen": -4.481812477111816, "logps/rejected": -5.05482292175293, "loss": 0.0526, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.481812477111816, "rewards/margins": 0.5730103254318237, "rewards/rejected": -5.05482292175293, "sft_loss": 4.24124002456665, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 0.43698490211025665, "learning_rate": 1.4277673632374492e-07, "logits/chosen": -0.8096843957901001, "logits/rejected": -0.5594863891601562, "logps/chosen": -4.307278156280518, "logps/rejected": -5.064208984375, "loss": 0.0505, "rewards/accuracies": 0.71875, "rewards/chosen": -4.307278156280518, "rewards/margins": 0.7569302320480347, "rewards/rejected": -5.064208984375, "sft_loss": 4.024901866912842, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 0.40380334550713853, "learning_rate": 1.416887718361119e-07, "logits/chosen": -0.7265334725379944, "logits/rejected": -0.7379759550094604, "logps/chosen": -4.446936130523682, "logps/rejected": -5.041685581207275, "loss": 0.0532, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.446936130523682, "rewards/margins": 0.5947496294975281, "rewards/rejected": -5.041685581207275, "sft_loss": 4.248701095581055, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 0.4955611851621508, "learning_rate": 1.406042836855859e-07, "logits/chosen": -0.74248206615448, "logits/rejected": -0.6353031396865845, "logps/chosen": -4.549657344818115, "logps/rejected": -5.308493137359619, "loss": 0.0517, "rewards/accuracies": 0.75, "rewards/chosen": -4.549657344818115, "rewards/margins": 0.7588354349136353, "rewards/rejected": -5.308493137359619, "sft_loss": 4.329131603240967, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 0.37714745130252286, "learning_rate": 1.3952328239387595e-07, "logits/chosen": -0.8745020627975464, "logits/rejected": -0.6099725365638733, "logps/chosen": -4.425809860229492, "logps/rejected": -5.203674793243408, "loss": 0.0517, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.425809860229492, "rewards/margins": 0.7778650522232056, "rewards/rejected": -5.203674793243408, "sft_loss": 4.267220973968506, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 0.4350403366163347, "learning_rate": 1.3844577844886109e-07, "logits/chosen": -0.8680629730224609, "logits/rejected": -0.635954737663269, "logps/chosen": -4.190207004547119, "logps/rejected": -5.01309871673584, "loss": 0.0503, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.190207004547119, "rewards/margins": 0.8228910565376282, "rewards/rejected": -5.01309871673584, "sft_loss": 4.028267860412598, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 0.5090011205479518, "learning_rate": 1.3737178230448955e-07, "logits/chosen": -0.8712724447250366, "logits/rejected": -0.7374725341796875, "logps/chosen": -4.453272819519043, "logps/rejected": -5.0184431076049805, "loss": 0.0518, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.453272819519043, "rewards/margins": 0.5651699900627136, "rewards/rejected": -5.0184431076049805, "sft_loss": 4.107565879821777, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 0.44470986998643264, "learning_rate": 1.363013043806764e-07, "logits/chosen": -0.8503999710083008, "logits/rejected": -0.7448652386665344, "logps/chosen": -4.280064105987549, "logps/rejected": -4.982614994049072, "loss": 0.0512, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.280064105987549, "rewards/margins": 0.7025504112243652, "rewards/rejected": -4.982614994049072, "sft_loss": 4.109427452087402, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 0.7032192811206371, "learning_rate": 1.352343550632034e-07, "logits/chosen": -0.8059431910514832, "logits/rejected": -0.6396316885948181, "logps/chosen": -4.340572357177734, "logps/rejected": -5.229712009429932, "loss": 0.0513, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.340572357177734, "rewards/margins": 0.8891401290893555, "rewards/rejected": -5.229712009429932, "sft_loss": 4.127902030944824, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 0.39015834229113866, "learning_rate": 1.3417094470361722e-07, "logits/chosen": -0.8471466302871704, "logits/rejected": -0.6921052932739258, "logps/chosen": -4.541820049285889, "logps/rejected": -5.182088375091553, "loss": 0.053, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.541820049285889, "rewards/margins": 0.6402683258056641, "rewards/rejected": -5.182088375091553, "sft_loss": 4.28817081451416, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": -0.3632008731365204, "eval_logits/rejected": -0.285022109746933, "eval_logps/chosen": -4.476656436920166, "eval_logps/rejected": -5.172234535217285, "eval_loss": 0.050458874553442, "eval_rewards/accuracies": 0.6839762330055237, "eval_rewards/chosen": -4.476656436920166, "eval_rewards/margins": 0.6955785751342773, "eval_rewards/rejected": -5.172234535217285, "eval_runtime": 43.4365, "eval_samples_per_second": 30.965, "eval_sft_loss": 4.090471267700195, "eval_steps_per_second": 7.758, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 0.41993569063387165, "learning_rate": 1.3311108361913015e-07, "logits/chosen": -0.899549663066864, "logits/rejected": -0.9365586042404175, "logps/chosen": -4.511231422424316, "logps/rejected": -5.088454246520996, "loss": 0.0522, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.511231422424316, "rewards/margins": 0.5772226452827454, "rewards/rejected": -5.088454246520996, "sft_loss": 4.286912441253662, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 0.43817622591238337, "learning_rate": 1.3205478209251874e-07, "logits/chosen": -0.7712717056274414, "logits/rejected": -0.7058557868003845, "logps/chosen": -4.5705180168151855, "logps/rejected": -5.281630039215088, "loss": 0.0519, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.5705180168151855, "rewards/margins": 0.711111307144165, "rewards/rejected": -5.281630039215088, "sft_loss": 4.32778787612915, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 0.4837368965922269, "learning_rate": 1.310020503720254e-07, "logits/chosen": -0.793159008026123, "logits/rejected": -0.62529456615448, "logps/chosen": -4.411933898925781, "logps/rejected": -5.071666240692139, "loss": 0.0518, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.411933898925781, "rewards/margins": 0.6597329378128052, "rewards/rejected": -5.071666240692139, "sft_loss": 4.156750679016113, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 0.4008401185189836, "learning_rate": 1.2995289867125752e-07, "logits/chosen": -0.823562741279602, "logits/rejected": -0.7247136831283569, "logps/chosen": -4.479001045227051, "logps/rejected": -5.045378684997559, "loss": 0.0518, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.479001045227051, "rewards/margins": 0.5663775205612183, "rewards/rejected": -5.045378684997559, "sft_loss": 4.197256565093994, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 0.3267884216984576, "learning_rate": 1.2890733716908986e-07, "logits/chosen": -0.7804628014564514, "logits/rejected": -0.7346547842025757, "logps/chosen": -4.284626483917236, "logps/rejected": -4.872819423675537, "loss": 0.0528, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.284626483917236, "rewards/margins": 0.5881929397583008, "rewards/rejected": -4.872819423675537, "sft_loss": 4.072215557098389, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 0.4383847142525197, "learning_rate": 1.2786537600956454e-07, "logits/chosen": -0.8587814569473267, "logits/rejected": -0.6906665563583374, "logps/chosen": -4.313345432281494, "logps/rejected": -5.195896148681641, "loss": 0.0487, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.313345432281494, "rewards/margins": 0.8825508952140808, "rewards/rejected": -5.195896148681641, "sft_loss": 3.9952876567840576, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 0.45896207390131155, "learning_rate": 1.268270253017933e-07, "logits/chosen": -0.854796290397644, "logits/rejected": -0.6479583382606506, "logps/chosen": -4.433968544006348, "logps/rejected": -5.119320869445801, "loss": 0.0518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.433968544006348, "rewards/margins": 0.6853523850440979, "rewards/rejected": -5.119320869445801, "sft_loss": 4.195663928985596, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 0.44999129974127655, "learning_rate": 1.257922951198591e-07, "logits/chosen": -0.9596928358078003, "logits/rejected": -0.657541036605835, "logps/chosen": -4.332813262939453, "logps/rejected": -5.0844526290893555, "loss": 0.0507, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.332813262939453, "rewards/margins": 0.7516393065452576, "rewards/rejected": -5.0844526290893555, "sft_loss": 4.116638660430908, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 0.5436196141333159, "learning_rate": 1.24761195502719e-07, "logits/chosen": -0.8771514892578125, "logits/rejected": -0.6101125478744507, "logps/chosen": -4.607757091522217, "logps/rejected": -5.213659763336182, "loss": 0.0508, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.607757091522217, "rewards/margins": 0.6059027910232544, "rewards/rejected": -5.213659763336182, "sft_loss": 4.251969337463379, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 0.7332787813082794, "learning_rate": 1.2373373645410573e-07, "logits/chosen": -0.762103259563446, "logits/rejected": -0.624140739440918, "logps/chosen": -4.644742012023926, "logps/rejected": -5.275030136108398, "loss": 0.0522, "rewards/accuracies": 0.6875, "rewards/chosen": -4.644742012023926, "rewards/margins": 0.6302889585494995, "rewards/rejected": -5.275030136108398, "sft_loss": 4.331053733825684, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 0.4631029256293614, "learning_rate": 1.2270992794243175e-07, "logits/chosen": -0.8583146333694458, "logits/rejected": -0.7607332468032837, "logps/chosen": -4.444066524505615, "logps/rejected": -5.150336265563965, "loss": 0.0508, "rewards/accuracies": 0.6875, "rewards/chosen": -4.444066524505615, "rewards/margins": 0.7062696218490601, "rewards/rejected": -5.150336265563965, "sft_loss": 4.155452728271484, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 0.8308078175306337, "learning_rate": 1.2168977990069147e-07, "logits/chosen": -0.8691369891166687, "logits/rejected": -0.6420689821243286, "logps/chosen": -4.282433032989502, "logps/rejected": -5.122197151184082, "loss": 0.0517, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.282433032989502, "rewards/margins": 0.839765191078186, "rewards/rejected": -5.122197151184082, "sft_loss": 3.9892375469207764, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 0.3763620124349001, "learning_rate": 1.206733022263659e-07, "logits/chosen": -0.8173881769180298, "logits/rejected": -0.6439284086227417, "logps/chosen": -4.300229549407959, "logps/rejected": -5.05617094039917, "loss": 0.0501, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.300229549407959, "rewards/margins": 0.7559418678283691, "rewards/rejected": -5.05617094039917, "sft_loss": 3.9857208728790283, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 0.5485172489909115, "learning_rate": 1.1966050478132572e-07, "logits/chosen": -0.7517123222351074, "logits/rejected": -0.6935483813285828, "logps/chosen": -4.504305362701416, "logps/rejected": -5.118971347808838, "loss": 0.0525, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.504305362701416, "rewards/margins": 0.6146660447120667, "rewards/rejected": -5.118971347808838, "sft_loss": 4.205090522766113, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 0.42943747081626427, "learning_rate": 1.1865139739173635e-07, "logits/chosen": -0.8406542539596558, "logits/rejected": -0.6299400329589844, "logps/chosen": -4.632096290588379, "logps/rejected": -5.228485584259033, "loss": 0.0515, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.632096290588379, "rewards/margins": 0.5963901281356812, "rewards/rejected": -5.228485584259033, "sft_loss": 4.2738494873046875, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 0.4209416780986708, "learning_rate": 1.1764598984796187e-07, "logits/chosen": -0.9492759704589844, "logits/rejected": -0.8266083598136902, "logps/chosen": -4.4401984214782715, "logps/rejected": -5.127415657043457, "loss": 0.0508, "rewards/accuracies": 0.71875, "rewards/chosen": -4.4401984214782715, "rewards/margins": 0.6872166395187378, "rewards/rejected": -5.127415657043457, "sft_loss": 4.163792133331299, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 0.404322929379254, "learning_rate": 1.1664429190447095e-07, "logits/chosen": -0.7948734760284424, "logits/rejected": -0.7301923036575317, "logps/chosen": -4.572030544281006, "logps/rejected": -5.346141338348389, "loss": 0.0515, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.572030544281006, "rewards/margins": 0.7741105556488037, "rewards/rejected": -5.346141338348389, "sft_loss": 4.311251640319824, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 0.3949020633907638, "learning_rate": 1.1564631327974122e-07, "logits/chosen": -0.9128694534301758, "logits/rejected": -0.6827823519706726, "logps/chosen": -4.563012599945068, "logps/rejected": -5.230254173278809, "loss": 0.052, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.563012599945068, "rewards/margins": 0.6672413349151611, "rewards/rejected": -5.230254173278809, "sft_loss": 4.242611408233643, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 0.553553569804726, "learning_rate": 1.1465206365616587e-07, "logits/chosen": -0.9420261383056641, "logits/rejected": -0.7209168672561646, "logps/chosen": -4.519663333892822, "logps/rejected": -5.303266525268555, "loss": 0.0504, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.519663333892822, "rewards/margins": 0.7836031913757324, "rewards/rejected": -5.303266525268555, "sft_loss": 4.19036340713501, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 0.3632794095375088, "learning_rate": 1.1366155267995887e-07, "logits/chosen": -0.7734476327896118, "logits/rejected": -0.7625142931938171, "logps/chosen": -4.501950263977051, "logps/rejected": -5.078164100646973, "loss": 0.0516, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.501950263977051, "rewards/margins": 0.5762136578559875, "rewards/rejected": -5.078164100646973, "sft_loss": 4.153041839599609, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 0.45684321809705974, "learning_rate": 1.1267478996106228e-07, "logits/chosen": -0.8146098852157593, "logits/rejected": -0.6006115078926086, "logps/chosen": -4.422109127044678, "logps/rejected": -5.165881156921387, "loss": 0.0518, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.422109127044678, "rewards/margins": 0.7437718510627747, "rewards/rejected": -5.165881156921387, "sft_loss": 4.101393699645996, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 0.4183010395909284, "learning_rate": 1.116917850730521e-07, "logits/chosen": -0.8815043568611145, "logits/rejected": -0.7283543348312378, "logps/chosen": -4.545543193817139, "logps/rejected": -5.047765254974365, "loss": 0.0528, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.545543193817139, "rewards/margins": 0.5022218823432922, "rewards/rejected": -5.047765254974365, "sft_loss": 4.220363616943359, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 0.505742763058506, "learning_rate": 1.1071254755304637e-07, "logits/chosen": -0.8288267850875854, "logits/rejected": -0.7838674187660217, "logps/chosen": -4.297197341918945, "logps/rejected": -5.042912483215332, "loss": 0.0494, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.297197341918945, "rewards/margins": 0.7457149028778076, "rewards/rejected": -5.042912483215332, "sft_loss": 3.972548007965088, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 0.4752082402543453, "learning_rate": 1.0973708690161143e-07, "logits/chosen": -0.8543336987495422, "logits/rejected": -0.74104243516922, "logps/chosen": -4.514694690704346, "logps/rejected": -5.1261162757873535, "loss": 0.0511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.514694690704346, "rewards/margins": 0.6114215850830078, "rewards/rejected": -5.1261162757873535, "sft_loss": 4.206650257110596, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 0.47586666938227407, "learning_rate": 1.0876541258267119e-07, "logits/chosen": -0.8609308004379272, "logits/rejected": -0.6737977266311646, "logps/chosen": -4.337949275970459, "logps/rejected": -5.158361911773682, "loss": 0.0505, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.337949275970459, "rewards/margins": 0.8204119801521301, "rewards/rejected": -5.158361911773682, "sft_loss": 4.102989196777344, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 0.7712988555674866, "learning_rate": 1.0779753402341379e-07, "logits/chosen": -0.9207685589790344, "logits/rejected": -0.84931480884552, "logps/chosen": -4.338393688201904, "logps/rejected": -5.032431602478027, "loss": 0.0521, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.338393688201904, "rewards/margins": 0.6940376162528992, "rewards/rejected": -5.032431602478027, "sft_loss": 4.081472396850586, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 0.5783647376843744, "learning_rate": 1.0683346061420157e-07, "logits/chosen": -0.7379239797592163, "logits/rejected": -0.6506637334823608, "logps/chosen": -4.515606880187988, "logps/rejected": -5.266798973083496, "loss": 0.0526, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.515606880187988, "rewards/margins": 0.7511924505233765, "rewards/rejected": -5.266798973083496, "sft_loss": 4.281412124633789, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 0.44600935802431835, "learning_rate": 1.0587320170847874e-07, "logits/chosen": -0.8168829083442688, "logits/rejected": -0.7206194400787354, "logps/chosen": -4.545700550079346, "logps/rejected": -5.104142189025879, "loss": 0.0533, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.545700550079346, "rewards/margins": 0.5584414601325989, "rewards/rejected": -5.104142189025879, "sft_loss": 4.292700290679932, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 0.36067335080276597, "learning_rate": 1.0491676662268156e-07, "logits/chosen": -0.7393335103988647, "logits/rejected": -0.6619399785995483, "logps/chosen": -4.518947601318359, "logps/rejected": -5.257824897766113, "loss": 0.0506, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.518947601318359, "rewards/margins": 0.7388771176338196, "rewards/rejected": -5.257824897766113, "sft_loss": 4.184815406799316, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 0.5409398505388431, "learning_rate": 1.0396416463614732e-07, "logits/chosen": -0.8931136131286621, "logits/rejected": -0.8121291995048523, "logps/chosen": -4.453886985778809, "logps/rejected": -5.1217451095581055, "loss": 0.052, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.453886985778809, "rewards/margins": 0.667857825756073, "rewards/rejected": -5.1217451095581055, "sft_loss": 4.168035507202148, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 1.0388002306851687, "learning_rate": 1.0301540499102479e-07, "logits/chosen": -0.7921608686447144, "logits/rejected": -0.7310789227485657, "logps/chosen": -4.489283561706543, "logps/rejected": -5.021276950836182, "loss": 0.0532, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.489283561706543, "rewards/margins": 0.5319927334785461, "rewards/rejected": -5.021276950836182, "sft_loss": 4.189752101898193, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 0.55059593910096, "learning_rate": 1.0207049689218405e-07, "logits/chosen": -0.9317811131477356, "logits/rejected": -0.6625012755393982, "logps/chosen": -4.478198051452637, "logps/rejected": -5.189414978027344, "loss": 0.0521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.478198051452637, "rewards/margins": 0.7112170457839966, "rewards/rejected": -5.189414978027344, "sft_loss": 4.156213760375977, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 0.3662390057934976, "learning_rate": 1.0112944950712782e-07, "logits/chosen": -0.8565780520439148, "logits/rejected": -0.7790525555610657, "logps/chosen": -4.4673357009887695, "logps/rejected": -5.257838249206543, "loss": 0.0502, "rewards/accuracies": 0.71875, "rewards/chosen": -4.4673357009887695, "rewards/margins": 0.7905027866363525, "rewards/rejected": -5.257838249206543, "sft_loss": 4.1379194259643555, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 0.59163711890118, "learning_rate": 1.0019227196590174e-07, "logits/chosen": -0.8295857310295105, "logits/rejected": -0.6748948097229004, "logps/chosen": -4.575309753417969, "logps/rejected": -5.304391384124756, "loss": 0.0519, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.575309753417969, "rewards/margins": 0.7290816903114319, "rewards/rejected": -5.304391384124756, "sft_loss": 4.324819087982178, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 0.3863191151206876, "learning_rate": 9.925897336100664e-08, "logits/chosen": -0.7855613827705383, "logits/rejected": -0.7414531111717224, "logps/chosen": -4.514120101928711, "logps/rejected": -5.226920127868652, "loss": 0.0522, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.514120101928711, "rewards/margins": 0.7128003239631653, "rewards/rejected": -5.226920127868652, "sft_loss": 4.242648124694824, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 0.5313135323825595, "learning_rate": 9.832956274730946e-08, "logits/chosen": -0.7910794019699097, "logits/rejected": -0.7502156496047974, "logps/chosen": -4.682957172393799, "logps/rejected": -5.110443115234375, "loss": 0.0529, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.682957172393799, "rewards/margins": 0.42748576402664185, "rewards/rejected": -5.110443115234375, "sft_loss": 4.3094916343688965, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 0.6164310535748478, "learning_rate": 9.740404914195633e-08, "logits/chosen": -0.8025332689285278, "logits/rejected": -0.6623189449310303, "logps/chosen": -4.397765159606934, "logps/rejected": -5.139482021331787, "loss": 0.0511, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.397765159606934, "rewards/margins": 0.7417163252830505, "rewards/rejected": -5.139482021331787, "sft_loss": 4.196730613708496, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 0.5277575602538304, "learning_rate": 9.648244152428392e-08, "logits/chosen": -0.8694614171981812, "logits/rejected": -0.7217963933944702, "logps/chosen": -4.478517055511475, "logps/rejected": -4.989750862121582, "loss": 0.0529, "rewards/accuracies": 0.65625, "rewards/chosen": -4.478517055511475, "rewards/margins": 0.5112346410751343, "rewards/rejected": -4.989750862121582, "sft_loss": 4.229212760925293, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 0.3465526549100429, "learning_rate": 9.556474883573379e-08, "logits/chosen": -0.8793339729309082, "logits/rejected": -0.7501915693283081, "logps/chosen": -4.311750411987305, "logps/rejected": -5.096708297729492, "loss": 0.0523, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.311750411987305, "rewards/margins": 0.7849579453468323, "rewards/rejected": -5.096708297729492, "sft_loss": 4.058889389038086, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 0.6362567491428798, "learning_rate": 9.465097997976412e-08, "logits/chosen": -0.8457925915718079, "logits/rejected": -0.6267693042755127, "logps/chosen": -4.335329532623291, "logps/rejected": -5.132666110992432, "loss": 0.0509, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.335329532623291, "rewards/margins": 0.7973363399505615, "rewards/rejected": -5.132666110992432, "sft_loss": 4.092240333557129, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 0.5709648129829065, "learning_rate": 9.374114382176457e-08, "logits/chosen": -0.8717010617256165, "logits/rejected": -0.6866291761398315, "logps/chosen": -4.533945560455322, "logps/rejected": -5.346493721008301, "loss": 0.0503, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.533945560455322, "rewards/margins": 0.8125476837158203, "rewards/rejected": -5.346493721008301, "sft_loss": 4.200273036956787, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 0.41256938888295447, "learning_rate": 9.283524918896945e-08, "logits/chosen": -0.9007472991943359, "logits/rejected": -0.7356555461883545, "logps/chosen": -4.5387139320373535, "logps/rejected": -5.191751956939697, "loss": 0.0522, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.5387139320373535, "rewards/margins": 0.6530376076698303, "rewards/rejected": -5.191751956939697, "sft_loss": 4.2472429275512695, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 0.46700861734247096, "learning_rate": 9.193330487037232e-08, "logits/chosen": -0.8507378697395325, "logits/rejected": -0.6919487714767456, "logps/chosen": -4.463172912597656, "logps/rejected": -5.266392230987549, "loss": 0.0517, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.463172912597656, "rewards/margins": 0.8032194972038269, "rewards/rejected": -5.266392230987549, "sft_loss": 4.272570610046387, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 0.4668769656142938, "learning_rate": 9.103531961664118e-08, "logits/chosen": -0.7989420890808105, "logits/rejected": -0.6214526295661926, "logps/chosen": -4.455315113067627, "logps/rejected": -5.171011447906494, "loss": 0.0501, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.455315113067627, "rewards/margins": 0.7156961560249329, "rewards/rejected": -5.171011447906494, "sft_loss": 4.132755279541016, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 0.47924486796798166, "learning_rate": 9.014130214003269e-08, "logits/chosen": -0.8464191555976868, "logits/rejected": -0.8963597416877747, "logps/chosen": -4.2732834815979, "logps/rejected": -4.967657089233398, "loss": 0.051, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.2732834815979, "rewards/margins": 0.6943733096122742, "rewards/rejected": -4.967657089233398, "sft_loss": 3.9865031242370605, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 0.32552256494364223, "learning_rate": 8.925126111430848e-08, "logits/chosen": -0.7193098068237305, "logits/rejected": -0.675899088382721, "logps/chosen": -4.4091105461120605, "logps/rejected": -5.017752647399902, "loss": 0.051, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.4091105461120605, "rewards/margins": 0.6086419224739075, "rewards/rejected": -5.017752647399902, "sft_loss": 4.017719268798828, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 0.5496220681748125, "learning_rate": 8.83652051746504e-08, "logits/chosen": -0.689524233341217, "logits/rejected": -0.5122482180595398, "logps/chosen": -4.472263813018799, "logps/rejected": -5.335396766662598, "loss": 0.051, "rewards/accuracies": 0.6875, "rewards/chosen": -4.472263813018799, "rewards/margins": 0.8631328344345093, "rewards/rejected": -5.335396766662598, "sft_loss": 4.2152910232543945, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 0.31765867467775627, "learning_rate": 8.748314291757696e-08, "logits/chosen": -0.7355210781097412, "logits/rejected": -0.6390531659126282, "logps/chosen": -4.5320844650268555, "logps/rejected": -5.168013572692871, "loss": 0.0521, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.5320844650268555, "rewards/margins": 0.635929524898529, "rewards/rejected": -5.168013572692871, "sft_loss": 4.182335376739502, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 0.5771252870993228, "learning_rate": 8.660508290086032e-08, "logits/chosen": -0.7999386191368103, "logits/rejected": -0.6518659591674805, "logps/chosen": -4.393448829650879, "logps/rejected": -5.0774006843566895, "loss": 0.0514, "rewards/accuracies": 0.71875, "rewards/chosen": -4.393448829650879, "rewards/margins": 0.6839522123336792, "rewards/rejected": -5.0774006843566895, "sft_loss": 4.168224811553955, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 0.5150308708468381, "learning_rate": 8.573103364344231e-08, "logits/chosen": -0.8791343569755554, "logits/rejected": -0.6491331458091736, "logps/chosen": -4.423648834228516, "logps/rejected": -5.2099127769470215, "loss": 0.0502, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.423648834228516, "rewards/margins": 0.7862640619277954, "rewards/rejected": -5.2099127769470215, "sft_loss": 4.084849834442139, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 0.3661674336980175, "learning_rate": 8.486100362535292e-08, "logits/chosen": -0.8177053332328796, "logits/rejected": -0.6740037202835083, "logps/chosen": -4.4994401931762695, "logps/rejected": -5.0591583251953125, "loss": 0.0517, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.4994401931762695, "rewards/margins": 0.5597187280654907, "rewards/rejected": -5.0591583251953125, "sft_loss": 4.204558372497559, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 0.5140280133573151, "learning_rate": 8.399500128762693e-08, "logits/chosen": -0.8273393511772156, "logits/rejected": -0.7141873240470886, "logps/chosen": -4.562862396240234, "logps/rejected": -5.185477256774902, "loss": 0.0509, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.562862396240234, "rewards/margins": 0.6226149201393127, "rewards/rejected": -5.185477256774902, "sft_loss": 4.275196552276611, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 0.42529640095647614, "learning_rate": 8.313303503222313e-08, "logits/chosen": -0.8213428258895874, "logits/rejected": -0.7776002883911133, "logps/chosen": -4.575163841247559, "logps/rejected": -5.10188102722168, "loss": 0.0522, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.575163841247559, "rewards/margins": 0.5267173051834106, "rewards/rejected": -5.10188102722168, "sft_loss": 4.26285982131958, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 0.4007308001400934, "learning_rate": 8.227511322194164e-08, "logits/chosen": -0.7592180371284485, "logits/rejected": -0.637819766998291, "logps/chosen": -4.361072063446045, "logps/rejected": -4.973891258239746, "loss": 0.0527, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.361072063446045, "rewards/margins": 0.6128195524215698, "rewards/rejected": -4.973891258239746, "sft_loss": 4.0938920974731445, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 0.40321531518970055, "learning_rate": 8.142124418034385e-08, "logits/chosen": -0.7266249656677246, "logits/rejected": -0.5235757827758789, "logps/chosen": -4.5698018074035645, "logps/rejected": -5.1615800857543945, "loss": 0.0522, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.5698018074035645, "rewards/margins": 0.5917780995368958, "rewards/rejected": -5.1615800857543945, "sft_loss": 4.234898567199707, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 0.39867424326365275, "learning_rate": 8.057143619167073e-08, "logits/chosen": -0.7164444923400879, "logits/rejected": -0.5923494696617126, "logps/chosen": -4.497181415557861, "logps/rejected": -5.251903533935547, "loss": 0.0515, "rewards/accuracies": 0.6875, "rewards/chosen": -4.497181415557861, "rewards/margins": 0.7547226548194885, "rewards/rejected": -5.251903533935547, "sft_loss": 4.232115745544434, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 0.5680624236093563, "learning_rate": 7.97256975007633e-08, "logits/chosen": -0.7831182479858398, "logits/rejected": -0.565954327583313, "logps/chosen": -4.234208106994629, "logps/rejected": -5.022221565246582, "loss": 0.0509, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.234208106994629, "rewards/margins": 0.7880129814147949, "rewards/rejected": -5.022221565246582, "sft_loss": 4.004078388214111, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 0.3881217825976462, "learning_rate": 7.888403631298186e-08, "logits/chosen": -0.6825178861618042, "logits/rejected": -0.6623591184616089, "logps/chosen": -4.475518226623535, "logps/rejected": -5.0694780349731445, "loss": 0.0526, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.475518226623535, "rewards/margins": 0.593960165977478, "rewards/rejected": -5.0694780349731445, "sft_loss": 4.17282247543335, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 0.4290121459253616, "learning_rate": 7.804646079412719e-08, "logits/chosen": -0.7428678274154663, "logits/rejected": -0.5591611266136169, "logps/chosen": -4.560364246368408, "logps/rejected": -5.201272964477539, "loss": 0.0516, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.560364246368408, "rewards/margins": 0.6409088969230652, "rewards/rejected": -5.201272964477539, "sft_loss": 4.295758247375488, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 0.41012547747397127, "learning_rate": 7.72129790703604e-08, "logits/chosen": -0.8270618319511414, "logits/rejected": -0.6811308264732361, "logps/chosen": -4.522173881530762, "logps/rejected": -5.09686803817749, "loss": 0.0522, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.522173881530762, "rewards/margins": 0.5746942758560181, "rewards/rejected": -5.09686803817749, "sft_loss": 4.222050666809082, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 0.3557932248875748, "learning_rate": 7.638359922812504e-08, "logits/chosen": -0.7023937702178955, "logits/rejected": -0.6598516702651978, "logps/chosen": -4.509191989898682, "logps/rejected": -5.237663269042969, "loss": 0.0502, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.509191989898682, "rewards/margins": 0.7284715175628662, "rewards/rejected": -5.237663269042969, "sft_loss": 4.194537162780762, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 0.5817691255500771, "learning_rate": 7.555832931406774e-08, "logits/chosen": -0.8006316423416138, "logits/rejected": -0.5774275064468384, "logps/chosen": -4.520382881164551, "logps/rejected": -5.25664758682251, "loss": 0.0519, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.520382881164551, "rewards/margins": 0.7362645864486694, "rewards/rejected": -5.25664758682251, "sft_loss": 4.245955467224121, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 0.3995963858342414, "learning_rate": 7.47371773349611e-08, "logits/chosen": -0.7040305733680725, "logits/rejected": -0.7341457605361938, "logps/chosen": -4.4279961585998535, "logps/rejected": -5.22509241104126, "loss": 0.0496, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.4279961585998535, "rewards/margins": 0.7970967888832092, "rewards/rejected": -5.22509241104126, "sft_loss": 4.074034690856934, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 0.3769854449599952, "learning_rate": 7.392015125762496e-08, "logits/chosen": -0.8018277883529663, "logits/rejected": -0.6637119054794312, "logps/chosen": -4.422866344451904, "logps/rejected": -5.083056449890137, "loss": 0.0506, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.422866344451904, "rewards/margins": 0.6601905822753906, "rewards/rejected": -5.083056449890137, "sft_loss": 4.099396705627441, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 0.4314105030286715, "learning_rate": 7.310725900885018e-08, "logits/chosen": -0.8106725811958313, "logits/rejected": -0.795625627040863, "logps/chosen": -4.441800594329834, "logps/rejected": -4.995269298553467, "loss": 0.0534, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.441800594329834, "rewards/margins": 0.5534688234329224, "rewards/rejected": -4.995269298553467, "sft_loss": 4.204402923583984, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 0.5168731561480471, "learning_rate": 7.229850847532076e-08, "logits/chosen": -0.7228676080703735, "logits/rejected": -0.5544254183769226, "logps/chosen": -4.531182765960693, "logps/rejected": -5.202092170715332, "loss": 0.051, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.531182765960693, "rewards/margins": 0.6709097623825073, "rewards/rejected": -5.202092170715332, "sft_loss": 4.208251953125, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 0.7606664409858177, "learning_rate": 7.149390750353779e-08, "logits/chosen": -0.6411561965942383, "logits/rejected": -0.7863910794258118, "logps/chosen": -4.404947757720947, "logps/rejected": -4.9736504554748535, "loss": 0.0511, "rewards/accuracies": 0.71875, "rewards/chosen": -4.404947757720947, "rewards/margins": 0.5687026381492615, "rewards/rejected": -4.9736504554748535, "sft_loss": 4.057023048400879, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 0.39633161239377634, "learning_rate": 7.069346389974374e-08, "logits/chosen": -0.8106769323348999, "logits/rejected": -0.6289903521537781, "logps/chosen": -4.468173027038574, "logps/rejected": -5.0915846824646, "loss": 0.0526, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.468173027038574, "rewards/margins": 0.623412013053894, "rewards/rejected": -5.0915846824646, "sft_loss": 4.241403102874756, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 0.41054122270576154, "learning_rate": 6.989718542984563e-08, "logits/chosen": -0.7652484774589539, "logits/rejected": -0.7496183514595032, "logps/chosen": -4.5117692947387695, "logps/rejected": -5.081242084503174, "loss": 0.0521, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.5117692947387695, "rewards/margins": 0.5694732666015625, "rewards/rejected": -5.081242084503174, "sft_loss": 4.277536869049072, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 0.6095427658221313, "learning_rate": 6.9105079819341e-08, "logits/chosen": -0.708454430103302, "logits/rejected": -0.4635780453681946, "logps/chosen": -4.330394744873047, "logps/rejected": -5.344074249267578, "loss": 0.0498, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.330394744873047, "rewards/margins": 1.0136792659759521, "rewards/rejected": -5.344074249267578, "sft_loss": 4.1580681800842285, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 0.40158170861414555, "learning_rate": 6.831715475324163e-08, "logits/chosen": -0.8929897546768188, "logits/rejected": -0.6932271718978882, "logps/chosen": -4.430346488952637, "logps/rejected": -5.198345184326172, "loss": 0.0516, "rewards/accuracies": 0.6875, "rewards/chosen": -4.430346488952637, "rewards/margins": 0.7679981589317322, "rewards/rejected": -5.198345184326172, "sft_loss": 4.236446857452393, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 0.9430362390645314, "learning_rate": 6.753341787600026e-08, "logits/chosen": -0.8296493291854858, "logits/rejected": -0.733299970626831, "logps/chosen": -4.424052715301514, "logps/rejected": -5.159595489501953, "loss": 0.0519, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.424052715301514, "rewards/margins": 0.7355419397354126, "rewards/rejected": -5.159595489501953, "sft_loss": 4.191681861877441, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 0.5193369946732992, "learning_rate": 6.67538767914353e-08, "logits/chosen": -0.7756398916244507, "logits/rejected": -0.5511414408683777, "logps/chosen": -4.501200199127197, "logps/rejected": -5.170102596282959, "loss": 0.0543, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.501200199127197, "rewards/margins": 0.668903112411499, "rewards/rejected": -5.170102596282959, "sft_loss": 4.344412326812744, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 0.625624192281797, "learning_rate": 6.597853906265793e-08, "logits/chosen": -0.7071702480316162, "logits/rejected": -0.5840771794319153, "logps/chosen": -4.405869483947754, "logps/rejected": -5.307399749755859, "loss": 0.0507, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.405869483947754, "rewards/margins": 0.901530385017395, "rewards/rejected": -5.307399749755859, "sft_loss": 4.106754779815674, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 0.44989926116378653, "learning_rate": 6.5207412211998e-08, "logits/chosen": -0.6850845217704773, "logits/rejected": -0.5953121185302734, "logps/chosen": -4.426508903503418, "logps/rejected": -5.078802108764648, "loss": 0.0524, "rewards/accuracies": 0.65625, "rewards/chosen": -4.426508903503418, "rewards/margins": 0.6522935628890991, "rewards/rejected": -5.078802108764648, "sft_loss": 4.111783027648926, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 0.3869486131303834, "learning_rate": 6.444050372093186e-08, "logits/chosen": -0.7650526165962219, "logits/rejected": -0.6805169582366943, "logps/chosen": -4.382111549377441, "logps/rejected": -5.045238971710205, "loss": 0.0513, "rewards/accuracies": 0.71875, "rewards/chosen": -4.382111549377441, "rewards/margins": 0.6631268858909607, "rewards/rejected": -5.045238971710205, "sft_loss": 4.1749267578125, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 0.5142574862060383, "learning_rate": 6.367782103000873e-08, "logits/chosen": -0.7648038268089294, "logits/rejected": -0.7223659753799438, "logps/chosen": -4.393990993499756, "logps/rejected": -4.955014228820801, "loss": 0.0526, "rewards/accuracies": 0.65625, "rewards/chosen": -4.393990993499756, "rewards/margins": 0.5610231161117554, "rewards/rejected": -4.955014228820801, "sft_loss": 4.111966133117676, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 0.4258157669046418, "learning_rate": 6.29193715387798e-08, "logits/chosen": -0.8260219693183899, "logits/rejected": -0.7080743312835693, "logps/chosen": -4.401946544647217, "logps/rejected": -5.105975151062012, "loss": 0.0525, "rewards/accuracies": 0.6875, "rewards/chosen": -4.401946544647217, "rewards/margins": 0.7040285468101501, "rewards/rejected": -5.105975151062012, "sft_loss": 4.163133144378662, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 0.6189933185149098, "learning_rate": 6.216516260572502e-08, "logits/chosen": -0.7578010559082031, "logits/rejected": -0.657882809638977, "logps/chosen": -4.521510124206543, "logps/rejected": -5.196859359741211, "loss": 0.0518, "rewards/accuracies": 0.71875, "rewards/chosen": -4.521510124206543, "rewards/margins": 0.6753486394882202, "rewards/rejected": -5.196859359741211, "sft_loss": 4.304436683654785, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 0.567926027513947, "learning_rate": 6.141520154818297e-08, "logits/chosen": -0.7929534912109375, "logits/rejected": -0.6780120134353638, "logps/chosen": -4.525482654571533, "logps/rejected": -5.13308048248291, "loss": 0.0525, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.525482654571533, "rewards/margins": 0.6075973510742188, "rewards/rejected": -5.13308048248291, "sft_loss": 4.307214736938477, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": -0.2741285264492035, "eval_logits/rejected": -0.18904297053813934, "eval_logps/chosen": -4.448305130004883, "eval_logps/rejected": -5.142607688903809, "eval_loss": 0.05042795091867447, "eval_rewards/accuracies": 0.6832343935966492, "eval_rewards/chosen": -4.448305130004883, "eval_rewards/margins": 0.6943021416664124, "eval_rewards/rejected": -5.142607688903809, "eval_runtime": 43.661, "eval_samples_per_second": 30.805, "eval_sft_loss": 4.069952011108398, "eval_steps_per_second": 7.719, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 0.36636097203702034, "learning_rate": 6.066949564227897e-08, "logits/chosen": -0.7946752905845642, "logits/rejected": -0.716626763343811, "logps/chosen": -4.3803510665893555, "logps/rejected": -5.064523696899414, "loss": 0.0515, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.3803510665893555, "rewards/margins": 0.6841726303100586, "rewards/rejected": -5.064523696899414, "sft_loss": 4.091052055358887, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 0.7677640156297736, "learning_rate": 5.992805212285523e-08, "logits/chosen": -0.7754315137863159, "logits/rejected": -0.7091799378395081, "logps/chosen": -4.335581302642822, "logps/rejected": -5.124228477478027, "loss": 0.0511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.335581302642822, "rewards/margins": 0.7886467576026917, "rewards/rejected": -5.124228477478027, "sft_loss": 4.099038600921631, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 0.5748646729931978, "learning_rate": 5.9190878183399684e-08, "logits/chosen": -0.7570444345474243, "logits/rejected": -0.613994300365448, "logps/chosen": -4.440479755401611, "logps/rejected": -5.164405822753906, "loss": 0.052, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.440479755401611, "rewards/margins": 0.7239259481430054, "rewards/rejected": -5.164405822753906, "sft_loss": 4.215497016906738, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 0.4191146750687918, "learning_rate": 5.845798097597748e-08, "logits/chosen": -0.7721574306488037, "logits/rejected": -0.6697301268577576, "logps/chosen": -4.579713821411133, "logps/rejected": -5.104306221008301, "loss": 0.0529, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.579713821411133, "rewards/margins": 0.5245919227600098, "rewards/rejected": -5.104306221008301, "sft_loss": 4.305607795715332, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 0.45022059402292774, "learning_rate": 5.772936761116026e-08, "logits/chosen": -0.7701061964035034, "logits/rejected": -0.6287974715232849, "logps/chosen": -4.400467395782471, "logps/rejected": -5.098750114440918, "loss": 0.051, "rewards/accuracies": 0.71875, "rewards/chosen": -4.400467395782471, "rewards/margins": 0.6982828378677368, "rewards/rejected": -5.098750114440918, "sft_loss": 4.131747245788574, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 0.4799590961567342, "learning_rate": 5.700504515795829e-08, "logits/chosen": -0.8211865425109863, "logits/rejected": -0.6615251302719116, "logps/chosen": -4.437475681304932, "logps/rejected": -5.135467529296875, "loss": 0.0513, "rewards/accuracies": 0.71875, "rewards/chosen": -4.437475681304932, "rewards/margins": 0.6979917287826538, "rewards/rejected": -5.135467529296875, "sft_loss": 4.257898330688477, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 0.42527193100918037, "learning_rate": 5.628502064375101e-08, "logits/chosen": -0.8720036745071411, "logits/rejected": -0.6801129579544067, "logps/chosen": -4.475311279296875, "logps/rejected": -5.252869129180908, "loss": 0.0507, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.475311279296875, "rewards/margins": 0.777558445930481, "rewards/rejected": -5.252869129180908, "sft_loss": 4.221250057220459, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 0.5115033248703027, "learning_rate": 5.55693010542197e-08, "logits/chosen": -0.7869556546211243, "logits/rejected": -0.5591322779655457, "logps/chosen": -4.349687099456787, "logps/rejected": -5.18341064453125, "loss": 0.0485, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.349687099456787, "rewards/margins": 0.8337236642837524, "rewards/rejected": -5.18341064453125, "sft_loss": 3.9441981315612793, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 0.7067733700435774, "learning_rate": 5.485789333327856e-08, "logits/chosen": -0.7598763108253479, "logits/rejected": -0.7221094369888306, "logps/chosen": -4.49146842956543, "logps/rejected": -5.119671821594238, "loss": 0.0522, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.49146842956543, "rewards/margins": 0.628203809261322, "rewards/rejected": -5.119671821594238, "sft_loss": 4.2429094314575195, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 0.45260597716363027, "learning_rate": 5.4150804383008675e-08, "logits/chosen": -0.910417914390564, "logits/rejected": -0.7389578819274902, "logps/chosen": -4.464565277099609, "logps/rejected": -5.181889533996582, "loss": 0.053, "rewards/accuracies": 0.65625, "rewards/chosen": -4.464565277099609, "rewards/margins": 0.7173237204551697, "rewards/rejected": -5.181889533996582, "sft_loss": 4.224325656890869, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 0.5973972322326072, "learning_rate": 5.344804106359002e-08, "logits/chosen": -0.6693606376647949, "logits/rejected": -0.5169059634208679, "logps/chosen": -4.443115711212158, "logps/rejected": -5.097043037414551, "loss": 0.0524, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.443115711212158, "rewards/margins": 0.653927206993103, "rewards/rejected": -5.097043037414551, "sft_loss": 4.195808410644531, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 0.5755470171232112, "learning_rate": 5.274961019323559e-08, "logits/chosen": -0.7551354169845581, "logits/rejected": -0.6369063854217529, "logps/chosen": -4.399575710296631, "logps/rejected": -5.089932918548584, "loss": 0.0528, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.399575710296631, "rewards/margins": 0.6903573870658875, "rewards/rejected": -5.089932918548584, "sft_loss": 4.221864700317383, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 0.4840213634050733, "learning_rate": 5.205551854812451e-08, "logits/chosen": -0.8901827931404114, "logits/rejected": -0.8028414845466614, "logps/chosen": -4.3761115074157715, "logps/rejected": -5.150932312011719, "loss": 0.0504, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.3761115074157715, "rewards/margins": 0.7748211026191711, "rewards/rejected": -5.150932312011719, "sft_loss": 4.163114070892334, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 0.8958131855189024, "learning_rate": 5.1365772862337177e-08, "logits/chosen": -0.6926102638244629, "logits/rejected": -0.6045705080032349, "logps/chosen": -4.361238956451416, "logps/rejected": -5.185271263122559, "loss": 0.0509, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.361238956451416, "rewards/margins": 0.8240326642990112, "rewards/rejected": -5.185271263122559, "sft_loss": 4.081574440002441, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 0.6988608914200577, "learning_rate": 5.068037982778905e-08, "logits/chosen": -0.6733156442642212, "logits/rejected": -0.5771912336349487, "logps/chosen": -4.478516101837158, "logps/rejected": -5.19637393951416, "loss": 0.0516, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.478516101837158, "rewards/margins": 0.7178576588630676, "rewards/rejected": -5.19637393951416, "sft_loss": 4.19520902633667, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 0.5200460676604345, "learning_rate": 4.999934609416656e-08, "logits/chosen": -0.7352452278137207, "logits/rejected": -0.6223322153091431, "logps/chosen": -4.333428859710693, "logps/rejected": -5.200177192687988, "loss": 0.051, "rewards/accuracies": 0.6875, "rewards/chosen": -4.333428859710693, "rewards/margins": 0.8667477369308472, "rewards/rejected": -5.200177192687988, "sft_loss": 4.1180853843688965, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 0.5114132721276469, "learning_rate": 4.932267826886183e-08, "logits/chosen": -0.6905040144920349, "logits/rejected": -0.6525358557701111, "logps/chosen": -4.4714531898498535, "logps/rejected": -5.135130882263184, "loss": 0.0526, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.4714531898498535, "rewards/margins": 0.6636782884597778, "rewards/rejected": -5.135130882263184, "sft_loss": 4.293010711669922, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 0.3541675492702616, "learning_rate": 4.8650382916909206e-08, "logits/chosen": -0.8800719976425171, "logits/rejected": -0.6866661310195923, "logps/chosen": -4.409409523010254, "logps/rejected": -5.073835849761963, "loss": 0.052, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.409409523010254, "rewards/margins": 0.6644265055656433, "rewards/rejected": -5.073835849761963, "sft_loss": 4.160528182983398, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 0.5684592362716069, "learning_rate": 4.7982466560920976e-08, "logits/chosen": -0.7659167051315308, "logits/rejected": -0.7323800921440125, "logps/chosen": -4.4443440437316895, "logps/rejected": -5.052102088928223, "loss": 0.052, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.4443440437316895, "rewards/margins": 0.6077579259872437, "rewards/rejected": -5.052102088928223, "sft_loss": 4.219048976898193, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 0.6689392592042074, "learning_rate": 4.7318935681024685e-08, "logits/chosen": -0.7107598185539246, "logits/rejected": -0.5510147213935852, "logps/chosen": -4.424412250518799, "logps/rejected": -5.242027282714844, "loss": 0.0511, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.424412250518799, "rewards/margins": 0.8176156282424927, "rewards/rejected": -5.242027282714844, "sft_loss": 4.165490627288818, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 0.5722256802272633, "learning_rate": 4.6659796714799745e-08, "logits/chosen": -0.7005605697631836, "logits/rejected": -0.576447606086731, "logps/chosen": -4.384562969207764, "logps/rejected": -5.328797817230225, "loss": 0.0493, "rewards/accuracies": 0.71875, "rewards/chosen": -4.384562969207764, "rewards/margins": 0.9442348480224609, "rewards/rejected": -5.328797817230225, "sft_loss": 4.149122714996338, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 0.5219836508629162, "learning_rate": 4.60050560572155e-08, "logits/chosen": -0.7817445993423462, "logits/rejected": -0.8545292615890503, "logps/chosen": -4.5263991355896, "logps/rejected": -5.273439884185791, "loss": 0.0523, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.5263991355896, "rewards/margins": 0.7470411658287048, "rewards/rejected": -5.273439884185791, "sft_loss": 4.291499137878418, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 0.38646571365518173, "learning_rate": 4.535472006056834e-08, "logits/chosen": -0.7773014903068542, "logits/rejected": -0.585507333278656, "logps/chosen": -4.419951438903809, "logps/rejected": -5.040675640106201, "loss": 0.0516, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.419951438903809, "rewards/margins": 0.6207249164581299, "rewards/rejected": -5.040675640106201, "sft_loss": 4.128361225128174, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 0.4415088424078639, "learning_rate": 4.470879503442132e-08, "logits/chosen": -0.7938684225082397, "logits/rejected": -0.6925408244132996, "logps/chosen": -4.400835990905762, "logps/rejected": -5.0848212242126465, "loss": 0.051, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.400835990905762, "rewards/margins": 0.6839861869812012, "rewards/rejected": -5.0848212242126465, "sft_loss": 4.152151584625244, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 0.5438520820103692, "learning_rate": 4.406728724554154e-08, "logits/chosen": -0.9053106307983398, "logits/rejected": -0.6270970106124878, "logps/chosen": -4.350544452667236, "logps/rejected": -5.182711601257324, "loss": 0.0503, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.350544452667236, "rewards/margins": 0.8321673274040222, "rewards/rejected": -5.182711601257324, "sft_loss": 4.118407249450684, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 0.40830510209473897, "learning_rate": 4.3430202917840664e-08, "logits/chosen": -0.7182999849319458, "logits/rejected": -0.5021204352378845, "logps/chosen": -4.545349597930908, "logps/rejected": -5.3235087394714355, "loss": 0.0518, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.545349597930908, "rewards/margins": 0.7781594395637512, "rewards/rejected": -5.3235087394714355, "sft_loss": 4.261865615844727, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 0.46964279049381513, "learning_rate": 4.279754823231346e-08, "logits/chosen": -0.776440441608429, "logits/rejected": -0.5842413902282715, "logps/chosen": -4.306726932525635, "logps/rejected": -5.086239337921143, "loss": 0.0504, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.306726932525635, "rewards/margins": 0.7795121073722839, "rewards/rejected": -5.086239337921143, "sft_loss": 3.9869303703308105, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 0.49471229779296944, "learning_rate": 4.216932932697859e-08, "logits/chosen": -0.832381546497345, "logits/rejected": -0.7716125249862671, "logps/chosen": -4.284350395202637, "logps/rejected": -4.896607398986816, "loss": 0.0511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.284350395202637, "rewards/margins": 0.6122564077377319, "rewards/rejected": -4.896607398986816, "sft_loss": 4.027619361877441, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 0.5063133009359136, "learning_rate": 4.154555229681844e-08, "logits/chosen": -0.7936744093894958, "logits/rejected": -0.5885189175605774, "logps/chosen": -4.427206993103027, "logps/rejected": -5.143578052520752, "loss": 0.0509, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.427206993103027, "rewards/margins": 0.7163704633712769, "rewards/rejected": -5.143578052520752, "sft_loss": 4.04074764251709, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 0.5395838576200975, "learning_rate": 4.092622319372069e-08, "logits/chosen": -0.7596436142921448, "logits/rejected": -0.5754319429397583, "logps/chosen": -4.363346099853516, "logps/rejected": -5.0439653396606445, "loss": 0.0518, "rewards/accuracies": 0.71875, "rewards/chosen": -4.363346099853516, "rewards/margins": 0.6806186437606812, "rewards/rejected": -5.0439653396606445, "sft_loss": 4.0527753829956055, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 0.4273690281432811, "learning_rate": 4.031134802641889e-08, "logits/chosen": -0.754179060459137, "logits/rejected": -0.7639588713645935, "logps/chosen": -4.647191524505615, "logps/rejected": -5.165177345275879, "loss": 0.0515, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.647191524505615, "rewards/margins": 0.5179857015609741, "rewards/rejected": -5.165177345275879, "sft_loss": 4.326084136962891, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 0.4601641742802231, "learning_rate": 3.970093276043468e-08, "logits/chosen": -0.6982657313346863, "logits/rejected": -0.6274687051773071, "logps/chosen": -4.353320121765137, "logps/rejected": -5.106142997741699, "loss": 0.052, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.353320121765137, "rewards/margins": 0.7528237700462341, "rewards/rejected": -5.106142997741699, "sft_loss": 4.145398139953613, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 0.4035041863470644, "learning_rate": 3.9094983318019584e-08, "logits/chosen": -0.8743463754653931, "logits/rejected": -0.7296475172042847, "logps/chosen": -4.384496212005615, "logps/rejected": -5.0877203941345215, "loss": 0.0517, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.384496212005615, "rewards/margins": 0.7032240629196167, "rewards/rejected": -5.0877203941345215, "sft_loss": 4.198531150817871, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 0.4512012847530197, "learning_rate": 3.849350557809789e-08, "logits/chosen": -0.6640986204147339, "logits/rejected": -0.653388500213623, "logps/chosen": -4.503409385681152, "logps/rejected": -5.102181434631348, "loss": 0.0502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.503409385681152, "rewards/margins": 0.5987719297409058, "rewards/rejected": -5.102181434631348, "sft_loss": 4.060650825500488, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 0.6901863489404572, "learning_rate": 3.789650537620903e-08, "logits/chosen": -0.688258945941925, "logits/rejected": -0.6832031011581421, "logps/chosen": -4.540500164031982, "logps/rejected": -5.2547407150268555, "loss": 0.0513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.540500164031982, "rewards/margins": 0.7142406105995178, "rewards/rejected": -5.2547407150268555, "sft_loss": 4.2646074295043945, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 0.42588906837966245, "learning_rate": 3.730398850445182e-08, "logits/chosen": -0.6220318078994751, "logits/rejected": -0.6261580586433411, "logps/chosen": -4.496462821960449, "logps/rejected": -5.189952373504639, "loss": 0.052, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.496462821960449, "rewards/margins": 0.6934901475906372, "rewards/rejected": -5.189952373504639, "sft_loss": 4.15579080581665, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 0.4940453819815409, "learning_rate": 3.671596071142735e-08, "logits/chosen": -0.7255562543869019, "logits/rejected": -0.5626224875450134, "logps/chosen": -4.4464030265808105, "logps/rejected": -5.3239946365356445, "loss": 0.0508, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.4464030265808105, "rewards/margins": 0.8775907754898071, "rewards/rejected": -5.3239946365356445, "sft_loss": 4.130679130554199, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 0.533231803665976, "learning_rate": 3.6132427702183996e-08, "logits/chosen": -0.898454487323761, "logits/rejected": -0.7337983846664429, "logps/chosen": -4.380088806152344, "logps/rejected": -5.301820278167725, "loss": 0.0502, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.380088806152344, "rewards/margins": 0.9217315912246704, "rewards/rejected": -5.301820278167725, "sft_loss": 4.153786659240723, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 0.4560585718830667, "learning_rate": 3.555339513816147e-08, "logits/chosen": -0.8258197903633118, "logits/rejected": -0.8702704310417175, "logps/chosen": -4.535338401794434, "logps/rejected": -5.168300151824951, "loss": 0.0529, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.535338401794434, "rewards/margins": 0.6329620480537415, "rewards/rejected": -5.168300151824951, "sft_loss": 4.283738136291504, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 0.4913083081311584, "learning_rate": 3.497886863713639e-08, "logits/chosen": -0.7957427501678467, "logits/rejected": -0.7848941087722778, "logps/chosen": -4.596715927124023, "logps/rejected": -5.212932586669922, "loss": 0.052, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.596715927124023, "rewards/margins": 0.6162165403366089, "rewards/rejected": -5.212932586669922, "sft_loss": 4.317381381988525, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 0.5146051345946518, "learning_rate": 3.440885377316721e-08, "logits/chosen": -0.7044495344161987, "logits/rejected": -0.6470869779586792, "logps/chosen": -4.40933895111084, "logps/rejected": -4.96150016784668, "loss": 0.0517, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.40933895111084, "rewards/margins": 0.5521610975265503, "rewards/rejected": -4.96150016784668, "sft_loss": 4.148724555969238, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 0.5127153318494577, "learning_rate": 3.384335607654082e-08, "logits/chosen": -0.662913978099823, "logits/rejected": -0.6405649185180664, "logps/chosen": -4.492884159088135, "logps/rejected": -5.25106143951416, "loss": 0.0509, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.492884159088135, "rewards/margins": 0.7581772804260254, "rewards/rejected": -5.25106143951416, "sft_loss": 4.1412672996521, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 0.49440876947556955, "learning_rate": 3.328238103371811e-08, "logits/chosen": -0.755716860294342, "logits/rejected": -0.7262305021286011, "logps/chosen": -4.506894111633301, "logps/rejected": -5.315282344818115, "loss": 0.0503, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.506894111633301, "rewards/margins": 0.8083890676498413, "rewards/rejected": -5.315282344818115, "sft_loss": 4.149899482727051, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 0.5390351293879576, "learning_rate": 3.272593408728169e-08, "logits/chosen": -0.8326853513717651, "logits/rejected": -0.5620445013046265, "logps/chosen": -4.3972320556640625, "logps/rejected": -5.159958839416504, "loss": 0.0506, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.3972320556640625, "rewards/margins": 0.7627268433570862, "rewards/rejected": -5.159958839416504, "sft_loss": 4.1451311111450195, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 0.4278067734510489, "learning_rate": 3.217402063588204e-08, "logits/chosen": -0.8494710922241211, "logits/rejected": -0.6724889874458313, "logps/chosen": -4.459986686706543, "logps/rejected": -5.137322902679443, "loss": 0.0525, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.459986686706543, "rewards/margins": 0.6773372888565063, "rewards/rejected": -5.137322902679443, "sft_loss": 4.2519850730896, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 0.5935683504662128, "learning_rate": 3.162664603418608e-08, "logits/chosen": -0.7728666067123413, "logits/rejected": -0.7169617414474487, "logps/chosen": -4.379161357879639, "logps/rejected": -5.187068939208984, "loss": 0.0511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.379161357879639, "rewards/margins": 0.807907223701477, "rewards/rejected": -5.187068939208984, "sft_loss": 4.1201324462890625, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 0.38628884775927563, "learning_rate": 3.1083815592824416e-08, "logits/chosen": -0.752131462097168, "logits/rejected": -0.6836836934089661, "logps/chosen": -4.516790866851807, "logps/rejected": -5.249283790588379, "loss": 0.0515, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.516790866851807, "rewards/margins": 0.7324928045272827, "rewards/rejected": -5.249283790588379, "sft_loss": 4.276049613952637, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 0.5871576493438839, "learning_rate": 3.054553457834053e-08, "logits/chosen": -0.6166585087776184, "logits/rejected": -0.6771060824394226, "logps/chosen": -4.626035213470459, "logps/rejected": -5.179619789123535, "loss": 0.0527, "rewards/accuracies": 0.625, "rewards/chosen": -4.626035213470459, "rewards/margins": 0.5535842180252075, "rewards/rejected": -5.179619789123535, "sft_loss": 4.302709102630615, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 0.6160064556957795, "learning_rate": 3.0011808213139036e-08, "logits/chosen": -0.6177735328674316, "logits/rejected": -0.6710564494132996, "logps/chosen": -4.500868320465088, "logps/rejected": -5.045173168182373, "loss": 0.0515, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.500868320465088, "rewards/margins": 0.5443046689033508, "rewards/rejected": -5.045173168182373, "sft_loss": 4.213364124298096, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 0.6859916514482486, "learning_rate": 2.948264167543568e-08, "logits/chosen": -0.7522085905075073, "logits/rejected": -0.6835848093032837, "logps/chosen": -4.500675678253174, "logps/rejected": -5.0596089363098145, "loss": 0.0518, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.500675678253174, "rewards/margins": 0.5589330792427063, "rewards/rejected": -5.0596089363098145, "sft_loss": 4.171204566955566, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 0.4799906826336578, "learning_rate": 2.8958040099206216e-08, "logits/chosen": -0.8550014495849609, "logits/rejected": -0.7934740781784058, "logps/chosen": -4.4887800216674805, "logps/rejected": -5.191119194030762, "loss": 0.0512, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.4887800216674805, "rewards/margins": 0.7023388743400574, "rewards/rejected": -5.191119194030762, "sft_loss": 4.201326847076416, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 0.7859840677597958, "learning_rate": 2.843800857413775e-08, "logits/chosen": -0.6860078573226929, "logits/rejected": -0.654365062713623, "logps/chosen": -4.445427417755127, "logps/rejected": -4.997714996337891, "loss": 0.0529, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.445427417755127, "rewards/margins": 0.5522874593734741, "rewards/rejected": -4.997714996337891, "sft_loss": 4.158383846282959, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 0.4526236800027281, "learning_rate": 2.7922552145578203e-08, "logits/chosen": -0.7144232988357544, "logits/rejected": -0.45682835578918457, "logps/chosen": -4.429794788360596, "logps/rejected": -5.128900051116943, "loss": 0.0511, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.429794788360596, "rewards/margins": 0.6991047263145447, "rewards/rejected": -5.128900051116943, "sft_loss": 4.1685028076171875, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 0.4394735274325847, "learning_rate": 2.7411675814488277e-08, "logits/chosen": -0.697012722492218, "logits/rejected": -0.56499183177948, "logps/chosen": -4.419262886047363, "logps/rejected": -5.056243896484375, "loss": 0.0507, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.419262886047363, "rewards/margins": 0.6369813680648804, "rewards/rejected": -5.056243896484375, "sft_loss": 4.206353664398193, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 0.5891818486629351, "learning_rate": 2.690538453739216e-08, "logits/chosen": -0.7441337704658508, "logits/rejected": -0.706308901309967, "logps/chosen": -4.351699352264404, "logps/rejected": -4.927661895751953, "loss": 0.0531, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.351699352264404, "rewards/margins": 0.5759629011154175, "rewards/rejected": -4.927661895751953, "sft_loss": 4.168587684631348, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 0.43729963617249973, "learning_rate": 2.6403683226330298e-08, "logits/chosen": -0.8585704565048218, "logits/rejected": -0.6886411309242249, "logps/chosen": -4.4380598068237305, "logps/rejected": -5.095976829528809, "loss": 0.0517, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.4380598068237305, "rewards/margins": 0.6579168438911438, "rewards/rejected": -5.095976829528809, "sft_loss": 4.219568729400635, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 0.45646069193158206, "learning_rate": 2.5906576748810804e-08, "logits/chosen": -0.8393117189407349, "logits/rejected": -0.7262551784515381, "logps/chosen": -4.536147117614746, "logps/rejected": -5.36475944519043, "loss": 0.051, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.536147117614746, "rewards/margins": 0.8286125063896179, "rewards/rejected": -5.36475944519043, "sft_loss": 4.3527655601501465, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 0.40349095053106443, "learning_rate": 2.5414069927763016e-08, "logits/chosen": -0.9187151789665222, "logits/rejected": -0.7296867370605469, "logps/chosen": -4.362442970275879, "logps/rejected": -5.214807033538818, "loss": 0.0498, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -4.362442970275879, "rewards/margins": 0.8523637056350708, "rewards/rejected": -5.214807033538818, "sft_loss": 4.102611064910889, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 0.31381315759075395, "learning_rate": 2.4926167541490185e-08, "logits/chosen": -0.941425621509552, "logits/rejected": -0.7067974805831909, "logps/chosen": -4.382145404815674, "logps/rejected": -5.232102870941162, "loss": 0.0508, "rewards/accuracies": 0.71875, "rewards/chosen": -4.382145404815674, "rewards/margins": 0.849958062171936, "rewards/rejected": -5.232102870941162, "sft_loss": 4.168827056884766, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 0.4666743335275937, "learning_rate": 2.4442874323623574e-08, "logits/chosen": -0.7078942060470581, "logits/rejected": -0.5640963912010193, "logps/chosen": -4.456545829772949, "logps/rejected": -5.237743377685547, "loss": 0.0516, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.456545829772949, "rewards/margins": 0.7811979055404663, "rewards/rejected": -5.237743377685547, "sft_loss": 4.171876430511475, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 0.5009611946781228, "learning_rate": 2.396419496307589e-08, "logits/chosen": -0.7781413197517395, "logits/rejected": -0.5991483330726624, "logps/chosen": -4.574313163757324, "logps/rejected": -5.172600746154785, "loss": 0.051, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.574313163757324, "rewards/margins": 0.5982874631881714, "rewards/rejected": -5.172600746154785, "sft_loss": 4.223634243011475, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 0.40995838518576144, "learning_rate": 2.349013410399653e-08, "logits/chosen": -0.769730269908905, "logits/rejected": -0.6913172006607056, "logps/chosen": -4.531946182250977, "logps/rejected": -5.277296543121338, "loss": 0.0505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.531946182250977, "rewards/margins": 0.7453504800796509, "rewards/rejected": -5.277296543121338, "sft_loss": 4.089993476867676, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 0.4386421790875959, "learning_rate": 2.3020696345725954e-08, "logits/chosen": -0.8437735438346863, "logits/rejected": -0.6403151750564575, "logps/chosen": -4.284999847412109, "logps/rejected": -5.3409104347229, "loss": 0.0495, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.284999847412109, "rewards/margins": 1.0559107065200806, "rewards/rejected": -5.3409104347229, "sft_loss": 4.055171966552734, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 0.6096353214247594, "learning_rate": 2.2555886242751398e-08, "logits/chosen": -0.7667422294616699, "logits/rejected": -0.7132569551467896, "logps/chosen": -4.329689025878906, "logps/rejected": -5.106996059417725, "loss": 0.0513, "rewards/accuracies": 0.78125, "rewards/chosen": -4.329689025878906, "rewards/margins": 0.7773071527481079, "rewards/rejected": -5.106996059417725, "sft_loss": 4.15250825881958, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 0.739154952389392, "learning_rate": 2.2095708304662453e-08, "logits/chosen": -0.8948475122451782, "logits/rejected": -0.6626867055892944, "logps/chosen": -4.5080671310424805, "logps/rejected": -5.133249282836914, "loss": 0.0513, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.5080671310424805, "rewards/margins": 0.6251822710037231, "rewards/rejected": -5.133249282836914, "sft_loss": 4.2104692459106445, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 0.40146489525306356, "learning_rate": 2.16401669961076e-08, "logits/chosen": -0.8997477293014526, "logits/rejected": -0.6744705438613892, "logps/chosen": -4.385453701019287, "logps/rejected": -5.082536220550537, "loss": 0.0517, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.385453701019287, "rewards/margins": 0.6970824003219604, "rewards/rejected": -5.082536220550537, "sft_loss": 4.138113975524902, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 0.5550146219227535, "learning_rate": 2.1189266736750532e-08, "logits/chosen": -0.6366299390792847, "logits/rejected": -0.6059743165969849, "logps/chosen": -4.567104816436768, "logps/rejected": -5.097182273864746, "loss": 0.0529, "rewards/accuracies": 0.625, "rewards/chosen": -4.567104816436768, "rewards/margins": 0.5300775766372681, "rewards/rejected": -5.097182273864746, "sft_loss": 4.227593898773193, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 0.7238548964288543, "learning_rate": 2.0743011901227623e-08, "logits/chosen": -0.7072452902793884, "logits/rejected": -0.5823957920074463, "logps/chosen": -4.160729885101318, "logps/rejected": -4.983979225158691, "loss": 0.0506, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.160729885101318, "rewards/margins": 0.8232491612434387, "rewards/rejected": -4.983979225158691, "sft_loss": 3.919672727584839, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 0.6313164538400142, "learning_rate": 2.030140681910508e-08, "logits/chosen": -0.7149503827095032, "logits/rejected": -0.5912491083145142, "logps/chosen": -4.391037464141846, "logps/rejected": -5.0824480056762695, "loss": 0.051, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.391037464141846, "rewards/margins": 0.6914108991622925, "rewards/rejected": -5.0824480056762695, "sft_loss": 4.110098838806152, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 0.36572611483915707, "learning_rate": 1.986445577483753e-08, "logits/chosen": -0.7998876571655273, "logits/rejected": -0.6524479985237122, "logps/chosen": -4.311178207397461, "logps/rejected": -5.040547847747803, "loss": 0.0511, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.311178207397461, "rewards/margins": 0.7293696999549866, "rewards/rejected": -5.040547847747803, "sft_loss": 4.074983596801758, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 0.40350920608337093, "learning_rate": 1.9432163007725765e-08, "logits/chosen": -0.8454571962356567, "logits/rejected": -0.7627595663070679, "logps/chosen": -4.490768909454346, "logps/rejected": -5.0736775398254395, "loss": 0.0522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.490768909454346, "rewards/margins": 0.5829084515571594, "rewards/rejected": -5.0736775398254395, "sft_loss": 4.289803504943848, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 0.35743425580419486, "learning_rate": 1.9004532711876297e-08, "logits/chosen": -0.7591687440872192, "logits/rejected": -0.7784875631332397, "logps/chosen": -4.46417760848999, "logps/rejected": -5.038629531860352, "loss": 0.0519, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.46417760848999, "rewards/margins": 0.5744518637657166, "rewards/rejected": -5.038629531860352, "sft_loss": 4.209836483001709, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 0.40974843728329524, "learning_rate": 1.8581569036159928e-08, "logits/chosen": -0.7757205367088318, "logits/rejected": -0.5620787739753723, "logps/chosen": -4.352284908294678, "logps/rejected": -5.135004997253418, "loss": 0.0511, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.352284908294678, "rewards/margins": 0.7827197909355164, "rewards/rejected": -5.135004997253418, "sft_loss": 4.145803928375244, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 0.38651979743253223, "learning_rate": 1.8163276084172285e-08, "logits/chosen": -0.7541450262069702, "logits/rejected": -0.6550502181053162, "logps/chosen": -4.484699249267578, "logps/rejected": -5.219845771789551, "loss": 0.0504, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.484699249267578, "rewards/margins": 0.7351458072662354, "rewards/rejected": -5.219845771789551, "sft_loss": 4.190016746520996, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 0.4746599739406412, "learning_rate": 1.7749657914193194e-08, "logits/chosen": -0.7308141589164734, "logits/rejected": -0.6888738870620728, "logps/chosen": -4.515814304351807, "logps/rejected": -5.2973856925964355, "loss": 0.0499, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.515814304351807, "rewards/margins": 0.7815715074539185, "rewards/rejected": -5.2973856925964355, "sft_loss": 4.202316761016846, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 0.4964614722849225, "learning_rate": 1.7340718539148203e-08, "logits/chosen": -0.6771430969238281, "logits/rejected": -0.6231086850166321, "logps/chosen": -4.451423168182373, "logps/rejected": -5.105251789093018, "loss": 0.0525, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.451423168182373, "rewards/margins": 0.653827965259552, "rewards/rejected": -5.105251789093018, "sft_loss": 4.232422351837158, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 0.38468316076352804, "learning_rate": 1.6936461926568724e-08, "logits/chosen": -0.7071312665939331, "logits/rejected": -0.5591712594032288, "logps/chosen": -4.469587802886963, "logps/rejected": -5.192925453186035, "loss": 0.0518, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.469587802886963, "rewards/margins": 0.7233376502990723, "rewards/rejected": -5.192925453186035, "sft_loss": 4.177231788635254, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 0.44091666285478454, "learning_rate": 1.6536891998554346e-08, "logits/chosen": -0.8431358337402344, "logits/rejected": -0.6786155700683594, "logps/chosen": -4.418313980102539, "logps/rejected": -5.12216854095459, "loss": 0.0504, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.418313980102539, "rewards/margins": 0.7038545608520508, "rewards/rejected": -5.12216854095459, "sft_loss": 4.129395961761475, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 0.4200872795921329, "learning_rate": 1.6142012631734093e-08, "logits/chosen": -0.6793020963668823, "logits/rejected": -0.5708988904953003, "logps/chosen": -4.42996883392334, "logps/rejected": -5.1513671875, "loss": 0.0499, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.42996883392334, "rewards/margins": 0.7213989496231079, "rewards/rejected": -5.1513671875, "sft_loss": 4.167457580566406, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 0.3583087592611993, "learning_rate": 1.575182765722949e-08, "logits/chosen": -0.8591778874397278, "logits/rejected": -0.6734490990638733, "logps/chosen": -4.4382429122924805, "logps/rejected": -5.177542209625244, "loss": 0.0509, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.4382429122924805, "rewards/margins": 0.7392994165420532, "rewards/rejected": -5.177542209625244, "sft_loss": 4.069396018981934, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": -0.2376498281955719, "eval_logits/rejected": -0.15155045688152313, "eval_logps/chosen": -4.39319372177124, "eval_logps/rejected": -5.099259376525879, "eval_loss": 0.05041056498885155, "eval_rewards/accuracies": 0.6854599118232727, "eval_rewards/chosen": -4.39319372177124, "eval_rewards/margins": 0.70606529712677, "eval_rewards/rejected": -5.099259376525879, "eval_runtime": 43.6092, "eval_samples_per_second": 30.842, "eval_sft_loss": 4.013496398925781, "eval_steps_per_second": 7.728, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 0.6691376618275345, "learning_rate": 1.536634086061672e-08, "logits/chosen": -0.7162854075431824, "logits/rejected": -0.7120442986488342, "logps/chosen": -4.272818088531494, "logps/rejected": -5.1587724685668945, "loss": 0.0498, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.272818088531494, "rewards/margins": 0.8859542012214661, "rewards/rejected": -5.1587724685668945, "sft_loss": 4.018543243408203, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 0.6331576336219715, "learning_rate": 1.4985555981890495e-08, "logits/chosen": -0.770778477191925, "logits/rejected": -0.6682685017585754, "logps/chosen": -4.637447357177734, "logps/rejected": -5.280395030975342, "loss": 0.0523, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.637447357177734, "rewards/margins": 0.6429480314254761, "rewards/rejected": -5.280395030975342, "sft_loss": 4.285575866699219, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 0.45844722316500985, "learning_rate": 1.4609476715427226e-08, "logits/chosen": -0.7665554285049438, "logits/rejected": -0.7006998658180237, "logps/chosen": -4.294949531555176, "logps/rejected": -5.111362457275391, "loss": 0.05, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.294949531555176, "rewards/margins": 0.8164127469062805, "rewards/rejected": -5.111362457275391, "sft_loss": 4.0890374183654785, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 0.46167473933351355, "learning_rate": 1.4238106709949792e-08, "logits/chosen": -0.7603719234466553, "logits/rejected": -0.704023003578186, "logps/chosen": -4.407052993774414, "logps/rejected": -5.269985198974609, "loss": 0.0501, "rewards/accuracies": 0.75, "rewards/chosen": -4.407052993774414, "rewards/margins": 0.8629329800605774, "rewards/rejected": -5.269985198974609, "sft_loss": 4.130857467651367, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 0.6465361159968553, "learning_rate": 1.3871449568491511e-08, "logits/chosen": -0.6923054456710815, "logits/rejected": -0.5152607560157776, "logps/chosen": -4.521544456481934, "logps/rejected": -5.11342716217041, "loss": 0.0532, "rewards/accuracies": 0.6875, "rewards/chosen": -4.521544456481934, "rewards/margins": 0.5918827056884766, "rewards/rejected": -5.11342716217041, "sft_loss": 4.219715118408203, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 0.5335497897965382, "learning_rate": 1.3509508848361606e-08, "logits/chosen": -0.8408746719360352, "logits/rejected": -0.7014719843864441, "logps/chosen": -4.4506120681762695, "logps/rejected": -5.232138633728027, "loss": 0.0495, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.4506120681762695, "rewards/margins": 0.7815271615982056, "rewards/rejected": -5.232138633728027, "sft_loss": 4.041218280792236, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 0.4340640550491573, "learning_rate": 1.3152288061110517e-08, "logits/chosen": -0.8067764043807983, "logits/rejected": -0.6543148159980774, "logps/chosen": -4.338916778564453, "logps/rejected": -5.174741744995117, "loss": 0.0499, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.338916778564453, "rewards/margins": 0.8358249664306641, "rewards/rejected": -5.174741744995117, "sft_loss": 4.055007457733154, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 0.4702838319568216, "learning_rate": 1.2799790672495814e-08, "logits/chosen": -0.8347498178482056, "logits/rejected": -0.6073392629623413, "logps/chosen": -4.421563148498535, "logps/rejected": -5.192359447479248, "loss": 0.0507, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.421563148498535, "rewards/margins": 0.7707957029342651, "rewards/rejected": -5.192359447479248, "sft_loss": 4.199678897857666, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 0.40117546884872646, "learning_rate": 1.2452020102448835e-08, "logits/chosen": -0.7439224123954773, "logits/rejected": -0.6960216760635376, "logps/chosen": -4.4497785568237305, "logps/rejected": -5.058270454406738, "loss": 0.0525, "rewards/accuracies": 0.6875, "rewards/chosen": -4.4497785568237305, "rewards/margins": 0.6084924340248108, "rewards/rejected": -5.058270454406738, "sft_loss": 4.284776210784912, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 0.4585374000452793, "learning_rate": 1.2108979725041103e-08, "logits/chosen": -0.8621365427970886, "logits/rejected": -0.7040331363677979, "logps/chosen": -4.3691277503967285, "logps/rejected": -5.159308910369873, "loss": 0.051, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.3691277503967285, "rewards/margins": 0.7901814579963684, "rewards/rejected": -5.159308910369873, "sft_loss": 4.125119686126709, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 0.5149332418443064, "learning_rate": 1.1770672868451958e-08, "logits/chosen": -0.786795973777771, "logits/rejected": -0.5567874908447266, "logps/chosen": -4.3395161628723145, "logps/rejected": -5.155367374420166, "loss": 0.0511, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.3395161628723145, "rewards/margins": 0.8158513307571411, "rewards/rejected": -5.155367374420166, "sft_loss": 4.167263984680176, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 0.4589040615922559, "learning_rate": 1.1437102814935872e-08, "logits/chosen": -0.7088602781295776, "logits/rejected": -0.6603358387947083, "logps/chosen": -4.456506252288818, "logps/rejected": -5.139347076416016, "loss": 0.0537, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.456506252288818, "rewards/margins": 0.6828408241271973, "rewards/rejected": -5.139347076416016, "sft_loss": 4.205148220062256, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 0.41054547983463774, "learning_rate": 1.1108272800791018e-08, "logits/chosen": -0.8841502070426941, "logits/rejected": -0.6486460566520691, "logps/chosen": -4.255162239074707, "logps/rejected": -5.279638290405273, "loss": 0.0492, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.255162239074707, "rewards/margins": 1.024476408958435, "rewards/rejected": -5.279638290405273, "sft_loss": 4.01407527923584, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 0.3870327970812567, "learning_rate": 1.078418601632769e-08, "logits/chosen": -0.72391676902771, "logits/rejected": -0.5991867780685425, "logps/chosen": -4.367644309997559, "logps/rejected": -5.057244777679443, "loss": 0.0517, "rewards/accuracies": 0.75, "rewards/chosen": -4.367644309997559, "rewards/margins": 0.689600944519043, "rewards/rejected": -5.057244777679443, "sft_loss": 4.147953033447266, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 0.49824908401362694, "learning_rate": 1.0464845605837159e-08, "logits/chosen": -0.729150116443634, "logits/rejected": -0.6012585163116455, "logps/chosen": -4.400598049163818, "logps/rejected": -5.120515823364258, "loss": 0.0515, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.400598049163818, "rewards/margins": 0.7199177742004395, "rewards/rejected": -5.120515823364258, "sft_loss": 4.194445610046387, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 0.37218461921657003, "learning_rate": 1.0150254667561642e-08, "logits/chosen": -0.7861107587814331, "logits/rejected": -0.5771303176879883, "logps/chosen": -4.436859130859375, "logps/rejected": -5.288681507110596, "loss": 0.0522, "rewards/accuracies": 0.75, "rewards/chosen": -4.436859130859375, "rewards/margins": 0.851822018623352, "rewards/rejected": -5.288681507110596, "sft_loss": 4.093539237976074, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 0.37692200660419406, "learning_rate": 9.840416253663719e-09, "logits/chosen": -0.8008987307548523, "logits/rejected": -0.6927303075790405, "logps/chosen": -4.387301445007324, "logps/rejected": -5.200824737548828, "loss": 0.0511, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.387301445007324, "rewards/margins": 0.8135232925415039, "rewards/rejected": -5.200824737548828, "sft_loss": 4.164353847503662, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 0.43110480795076206, "learning_rate": 9.535333370197074e-09, "logits/chosen": -0.7683074474334717, "logits/rejected": -0.6206791996955872, "logps/chosen": -4.306056976318359, "logps/rejected": -5.059770107269287, "loss": 0.0504, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.306056976318359, "rewards/margins": 0.7537132501602173, "rewards/rejected": -5.059770107269287, "sft_loss": 4.051373481750488, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 0.39348380949619655, "learning_rate": 9.23500897707713e-09, "logits/chosen": -0.8012930750846863, "logits/rejected": -0.6044802069664001, "logps/chosen": -4.543567180633545, "logps/rejected": -5.1686201095581055, "loss": 0.0516, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.543567180633545, "rewards/margins": 0.6250527501106262, "rewards/rejected": -5.1686201095581055, "sft_loss": 4.221306800842285, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 0.6265290893699348, "learning_rate": 8.939445988052574e-09, "logits/chosen": -0.7233133912086487, "logits/rejected": -0.6620453596115112, "logps/chosen": -4.482752799987793, "logps/rejected": -5.183930397033691, "loss": 0.0505, "rewards/accuracies": 0.71875, "rewards/chosen": -4.482752799987793, "rewards/margins": 0.7011777758598328, "rewards/rejected": -5.183930397033691, "sft_loss": 4.141602516174316, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 0.5143935043837382, "learning_rate": 8.648647270676656e-09, "logits/chosen": -0.6966943740844727, "logits/rejected": -0.601551353931427, "logps/chosen": -4.2645697593688965, "logps/rejected": -5.0711989402771, "loss": 0.0501, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.2645697593688965, "rewards/margins": 0.8066291809082031, "rewards/rejected": -5.0711989402771, "sft_loss": 3.9466934204101562, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 0.5979309433728134, "learning_rate": 8.362615646279991e-09, "logits/chosen": -0.918570876121521, "logits/rejected": -0.6350168585777283, "logps/chosen": -4.436649322509766, "logps/rejected": -5.3039045333862305, "loss": 0.0509, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.436649322509766, "rewards/margins": 0.8672553300857544, "rewards/rejected": -5.3039045333862305, "sft_loss": 4.19779109954834, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 0.5780580944954342, "learning_rate": 8.081353889942466e-09, "logits/chosen": -0.6806483268737793, "logits/rejected": -0.500455915927887, "logps/chosen": -4.434041500091553, "logps/rejected": -5.082981586456299, "loss": 0.0522, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.434041500091553, "rewards/margins": 0.6489400267601013, "rewards/rejected": -5.082981586456299, "sft_loss": 4.191941261291504, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 0.3981472388972543, "learning_rate": 7.804864730467042e-09, "logits/chosen": -0.6645959615707397, "logits/rejected": -0.6632333397865295, "logps/chosen": -4.3376359939575195, "logps/rejected": -5.054964542388916, "loss": 0.0506, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.3376359939575195, "rewards/margins": 0.7173280715942383, "rewards/rejected": -5.054964542388916, "sft_loss": 4.088263988494873, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 0.5496295929814721, "learning_rate": 7.533150850352665e-09, "logits/chosen": -0.6918113231658936, "logits/rejected": -0.5065929889678955, "logps/chosen": -4.538696765899658, "logps/rejected": -5.3134541511535645, "loss": 0.0503, "rewards/accuracies": 0.75, "rewards/chosen": -4.538696765899658, "rewards/margins": 0.7747570872306824, "rewards/rejected": -5.3134541511535645, "sft_loss": 4.243317604064941, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 0.5330143026502275, "learning_rate": 7.2662148857686175e-09, "logits/chosen": -0.6605895757675171, "logits/rejected": -0.5792001485824585, "logps/chosen": -4.514766693115234, "logps/rejected": -5.122227668762207, "loss": 0.0518, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.514766693115234, "rewards/margins": 0.6074615120887756, "rewards/rejected": -5.122227668762207, "sft_loss": 4.139904022216797, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 0.4151414572497506, "learning_rate": 7.0040594265287635e-09, "logits/chosen": -0.5867056846618652, "logits/rejected": -0.68376225233078, "logps/chosen": -4.496993064880371, "logps/rejected": -4.990506172180176, "loss": 0.0528, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -4.496993064880371, "rewards/margins": 0.49351271986961365, "rewards/rejected": -4.990506172180176, "sft_loss": 4.215310096740723, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 0.42968389745954066, "learning_rate": 6.746687016066566e-09, "logits/chosen": -0.6900774240493774, "logits/rejected": -0.6579724550247192, "logps/chosen": -4.339878559112549, "logps/rejected": -4.979432106018066, "loss": 0.052, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.339878559112549, "rewards/margins": 0.6395532488822937, "rewards/rejected": -4.979432106018066, "sft_loss": 4.117095470428467, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 0.5137218352574986, "learning_rate": 6.494100151410276e-09, "logits/chosen": -0.897404670715332, "logits/rejected": -0.6970912218093872, "logps/chosen": -4.338336944580078, "logps/rejected": -5.060499668121338, "loss": 0.0512, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.338336944580078, "rewards/margins": 0.7221625447273254, "rewards/rejected": -5.060499668121338, "sft_loss": 4.119289875030518, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 0.9439486911375388, "learning_rate": 6.246301283158728e-09, "logits/chosen": -0.6498326063156128, "logits/rejected": -0.6877025365829468, "logps/chosen": -4.418894290924072, "logps/rejected": -5.056708812713623, "loss": 0.0525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.418894290924072, "rewards/margins": 0.6378144025802612, "rewards/rejected": -5.056708812713623, "sft_loss": 4.190070152282715, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 0.5415123457240395, "learning_rate": 6.0032928154576944e-09, "logits/chosen": -0.7772369980812073, "logits/rejected": -0.7227431535720825, "logps/chosen": -4.4435014724731445, "logps/rejected": -5.072482585906982, "loss": 0.0532, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.4435014724731445, "rewards/margins": 0.6289812326431274, "rewards/rejected": -5.072482585906982, "sft_loss": 4.237886905670166, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 0.6985459324311037, "learning_rate": 5.76507710597629e-09, "logits/chosen": -0.8166434168815613, "logits/rejected": -0.6002271175384521, "logps/chosen": -4.332741737365723, "logps/rejected": -5.127420425415039, "loss": 0.0508, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.332741737365723, "rewards/margins": 0.7946786284446716, "rewards/rejected": -5.127420425415039, "sft_loss": 4.032086372375488, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 0.3729066256025123, "learning_rate": 5.531656465884438e-09, "logits/chosen": -0.8140283823013306, "logits/rejected": -0.6739929914474487, "logps/chosen": -4.36183500289917, "logps/rejected": -5.198390960693359, "loss": 0.0496, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.36183500289917, "rewards/margins": 0.8365559577941895, "rewards/rejected": -5.198390960693359, "sft_loss": 4.06744384765625, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 0.7101122740083048, "learning_rate": 5.303033159830217e-09, "logits/chosen": -0.6431114673614502, "logits/rejected": -0.6394237279891968, "logps/chosen": -4.446755409240723, "logps/rejected": -4.919232368469238, "loss": 0.0521, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.446755409240723, "rewards/margins": 0.4724767804145813, "rewards/rejected": -4.919232368469238, "sft_loss": 4.0875115394592285, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 0.4761441037980727, "learning_rate": 5.079209405917939e-09, "logits/chosen": -0.7186871767044067, "logits/rejected": -0.6644418835639954, "logps/chosen": -4.400277614593506, "logps/rejected": -5.278224945068359, "loss": 0.0512, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.400277614593506, "rewards/margins": 0.8779473304748535, "rewards/rejected": -5.278224945068359, "sft_loss": 4.172907829284668, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 0.44965344183186384, "learning_rate": 4.860187375686664e-09, "logits/chosen": -0.840808093547821, "logits/rejected": -0.5630152821540833, "logps/chosen": -4.355297088623047, "logps/rejected": -5.174862384796143, "loss": 0.0504, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.355297088623047, "rewards/margins": 0.8195658922195435, "rewards/rejected": -5.174862384796143, "sft_loss": 4.117367744445801, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 0.48662423395902865, "learning_rate": 4.64596919408905e-09, "logits/chosen": -0.6709896922111511, "logits/rejected": -0.6175445914268494, "logps/chosen": -4.350368976593018, "logps/rejected": -4.9599738121032715, "loss": 0.0524, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.350368976593018, "rewards/margins": 0.6096046566963196, "rewards/rejected": -4.9599738121032715, "sft_loss": 4.05443811416626, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 0.5097574203945324, "learning_rate": 4.436556939470814e-09, "logits/chosen": -0.8031808733940125, "logits/rejected": -0.6307480335235596, "logps/chosen": -4.699088096618652, "logps/rejected": -5.155211448669434, "loss": 0.0536, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.699088096618652, "rewards/margins": 0.456122487783432, "rewards/rejected": -5.155211448669434, "sft_loss": 4.452386856079102, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 0.4438377766535619, "learning_rate": 4.23195264355064e-09, "logits/chosen": -0.935401439666748, "logits/rejected": -0.6709798574447632, "logps/chosen": -4.372194766998291, "logps/rejected": -5.0931525230407715, "loss": 0.0513, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.372194766998291, "rewards/margins": 0.7209590077400208, "rewards/rejected": -5.0931525230407715, "sft_loss": 4.168568134307861, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 0.3898996829524812, "learning_rate": 4.032158291400245e-09, "logits/chosen": -0.7936094403266907, "logits/rejected": -0.5639082193374634, "logps/chosen": -4.286824703216553, "logps/rejected": -5.411582946777344, "loss": 0.0481, "rewards/accuracies": 0.75, "rewards/chosen": -4.286824703216553, "rewards/margins": 1.1247583627700806, "rewards/rejected": -5.411582946777344, "sft_loss": 3.914653778076172, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 0.37144975755492143, "learning_rate": 3.837175821425398e-09, "logits/chosen": -0.6788768172264099, "logits/rejected": -0.6351056098937988, "logps/chosen": -4.575131416320801, "logps/rejected": -5.201429843902588, "loss": 0.0522, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.575131416320801, "rewards/margins": 0.6262980103492737, "rewards/rejected": -5.201429843902588, "sft_loss": 4.266793251037598, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 0.4819502320454787, "learning_rate": 3.6470071253467683e-09, "logits/chosen": -0.7794539332389832, "logits/rejected": -0.6866748332977295, "logps/chosen": -4.382658958435059, "logps/rejected": -5.218075752258301, "loss": 0.0499, "rewards/accuracies": 0.6875, "rewards/chosen": -4.382658958435059, "rewards/margins": 0.8354169130325317, "rewards/rejected": -5.218075752258301, "sft_loss": 4.06662654876709, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 0.4675477289758297, "learning_rate": 3.461654048181939e-09, "logits/chosen": -0.8172122836112976, "logits/rejected": -0.5970250964164734, "logps/chosen": -4.448482513427734, "logps/rejected": -5.097105979919434, "loss": 0.0525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.448482513427734, "rewards/margins": 0.6486231088638306, "rewards/rejected": -5.097105979919434, "sft_loss": 4.25862979888916, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 0.481561756284469, "learning_rate": 3.281118388227255e-09, "logits/chosen": -0.7290286421775818, "logits/rejected": -0.6467230319976807, "logps/chosen": -4.468303680419922, "logps/rejected": -5.0093994140625, "loss": 0.0538, "rewards/accuracies": 0.65625, "rewards/chosen": -4.468303680419922, "rewards/margins": 0.5410959124565125, "rewards/rejected": -5.0093994140625, "sft_loss": 4.1863837242126465, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 0.5204386750591277, "learning_rate": 3.1054018970405048e-09, "logits/chosen": -0.7600063681602478, "logits/rejected": -0.6038556694984436, "logps/chosen": -4.390046119689941, "logps/rejected": -5.161233901977539, "loss": 0.05, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.390046119689941, "rewards/margins": 0.7711877822875977, "rewards/rejected": -5.161233901977539, "sft_loss": 4.058727741241455, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 0.3750239879457203, "learning_rate": 2.9345062794238207e-09, "logits/chosen": -0.7968858480453491, "logits/rejected": -0.5854495763778687, "logps/chosen": -4.418655872344971, "logps/rejected": -5.176025390625, "loss": 0.0513, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.418655872344971, "rewards/margins": 0.757369875907898, "rewards/rejected": -5.176025390625, "sft_loss": 4.210319519042969, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 0.42673124272737256, "learning_rate": 2.7684331934072492e-09, "logits/chosen": -0.8364018201828003, "logits/rejected": -0.774222731590271, "logps/chosen": -4.359699726104736, "logps/rejected": -5.043485164642334, "loss": 0.0519, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.359699726104736, "rewards/margins": 0.6837862133979797, "rewards/rejected": -5.043485164642334, "sft_loss": 4.126735210418701, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 0.52408152068924, "learning_rate": 2.6071842502326526e-09, "logits/chosen": -0.8465268015861511, "logits/rejected": -0.6868315935134888, "logps/chosen": -4.515316963195801, "logps/rejected": -5.097006797790527, "loss": 0.0523, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.515316963195801, "rewards/margins": 0.5816894173622131, "rewards/rejected": -5.097006797790527, "sft_loss": 4.289347171783447, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 0.5496966834501419, "learning_rate": 2.450761014337888e-09, "logits/chosen": -0.5394278168678284, "logits/rejected": -0.48762258887290955, "logps/chosen": -4.465736389160156, "logps/rejected": -5.261734962463379, "loss": 0.0527, "rewards/accuracies": 0.65625, "rewards/chosen": -4.465736389160156, "rewards/margins": 0.7959989905357361, "rewards/rejected": -5.261734962463379, "sft_loss": 4.236559867858887, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 0.7378399710882302, "learning_rate": 2.299165003341985e-09, "logits/chosen": -0.5353703498840332, "logits/rejected": -0.4766581058502197, "logps/chosen": -4.363390922546387, "logps/rejected": -5.103070259094238, "loss": 0.0514, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.363390922546387, "rewards/margins": 0.739679217338562, "rewards/rejected": -5.103070259094238, "sft_loss": 4.137202262878418, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 0.5686744201567238, "learning_rate": 2.1523976880299945e-09, "logits/chosen": -0.8194225430488586, "logits/rejected": -0.6335167288780212, "logps/chosen": -4.4221296310424805, "logps/rejected": -4.990933895111084, "loss": 0.0535, "rewards/accuracies": 0.6875, "rewards/chosen": -4.4221296310424805, "rewards/margins": 0.5688046813011169, "rewards/rejected": -4.990933895111084, "sft_loss": 4.245485782623291, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 0.5578072935373403, "learning_rate": 2.010460492339161e-09, "logits/chosen": -0.7293864488601685, "logits/rejected": -0.5793944597244263, "logps/chosen": -4.39241886138916, "logps/rejected": -5.086266994476318, "loss": 0.0501, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.39241886138916, "rewards/margins": 0.6938482522964478, "rewards/rejected": -5.086266994476318, "sft_loss": 4.0339789390563965, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 0.5421431657374898, "learning_rate": 1.8733547933446614e-09, "logits/chosen": -0.823691189289093, "logits/rejected": -0.5740182995796204, "logps/chosen": -4.491018295288086, "logps/rejected": -5.095160007476807, "loss": 0.0519, "rewards/accuracies": 0.65625, "rewards/chosen": -4.491018295288086, "rewards/margins": 0.6041414737701416, "rewards/rejected": -5.095160007476807, "sft_loss": 4.210727691650391, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 0.5262232764119357, "learning_rate": 1.7410819212467231e-09, "logits/chosen": -0.7801578640937805, "logits/rejected": -0.7355565428733826, "logps/chosen": -4.562183856964111, "logps/rejected": -5.0614728927612305, "loss": 0.0543, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.562183856964111, "rewards/margins": 0.49928945302963257, "rewards/rejected": -5.0614728927612305, "sft_loss": 4.305178642272949, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 0.33633377390369845, "learning_rate": 1.613643159357192e-09, "logits/chosen": -0.6578032374382019, "logits/rejected": -0.7792307734489441, "logps/chosen": -4.518008232116699, "logps/rejected": -5.044106483459473, "loss": 0.0525, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.518008232116699, "rewards/margins": 0.5260982513427734, "rewards/rejected": -5.044106483459473, "sft_loss": 4.252413749694824, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 0.5280603635941091, "learning_rate": 1.4910397440875967e-09, "logits/chosen": -0.7603357434272766, "logits/rejected": -0.6517842411994934, "logps/chosen": -4.375611305236816, "logps/rejected": -5.041906833648682, "loss": 0.0512, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.375611305236816, "rewards/margins": 0.6662946939468384, "rewards/rejected": -5.041906833648682, "sft_loss": 4.093026161193848, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 0.43302048339241966, "learning_rate": 1.3732728649368253e-09, "logits/chosen": -0.6951078176498413, "logits/rejected": -0.5088014602661133, "logps/chosen": -4.342538356781006, "logps/rejected": -5.075597763061523, "loss": 0.0511, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.342538356781006, "rewards/margins": 0.7330597639083862, "rewards/rejected": -5.075597763061523, "sft_loss": 4.080550193786621, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 0.4333874844955521, "learning_rate": 1.260343664479524e-09, "logits/chosen": -0.7748027443885803, "logits/rejected": -0.7607916593551636, "logps/chosen": -4.558234214782715, "logps/rejected": -5.103475570678711, "loss": 0.0514, "rewards/accuracies": 0.625, "rewards/chosen": -4.558234214782715, "rewards/margins": 0.5452412962913513, "rewards/rejected": -5.103475570678711, "sft_loss": 4.141097068786621, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 0.5357566836246764, "learning_rate": 1.1522532383554384e-09, "logits/chosen": -0.8556427955627441, "logits/rejected": -0.5980334281921387, "logps/chosen": -4.472742557525635, "logps/rejected": -5.171594142913818, "loss": 0.0519, "rewards/accuracies": 0.71875, "rewards/chosen": -4.472742557525635, "rewards/margins": 0.6988515257835388, "rewards/rejected": -5.171594142913818, "sft_loss": 4.282252311706543, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 0.298521633055877, "learning_rate": 1.049002635258256e-09, "logits/chosen": -0.6451120972633362, "logits/rejected": -0.5836285352706909, "logps/chosen": -4.546640872955322, "logps/rejected": -5.189908027648926, "loss": 0.0516, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.546640872955322, "rewards/margins": 0.6432673335075378, "rewards/rejected": -5.189908027648926, "sft_loss": 4.219212532043457, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 0.47740541750846544, "learning_rate": 9.505928569258358e-10, "logits/chosen": -0.6692383289337158, "logits/rejected": -0.6824567317962646, "logps/chosen": -4.47029972076416, "logps/rejected": -5.136569499969482, "loss": 0.0515, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.47029972076416, "rewards/margins": 0.6662701368331909, "rewards/rejected": -5.136569499969482, "sft_loss": 4.262817859649658, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 0.49326359074807213, "learning_rate": 8.57024858130273e-10, "logits/chosen": -0.7504460215568542, "logits/rejected": -0.5905870199203491, "logps/chosen": -4.473368167877197, "logps/rejected": -5.392681121826172, "loss": 0.0514, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.473368167877197, "rewards/margins": 0.9193128347396851, "rewards/rejected": -5.392681121826172, "sft_loss": 4.297789096832275, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 0.39847456506363377, "learning_rate": 7.682995466686826e-10, "logits/chosen": -0.8444706201553345, "logits/rejected": -0.6898127794265747, "logps/chosen": -4.56125545501709, "logps/rejected": -5.338624000549316, "loss": 0.0503, "rewards/accuracies": 0.65625, "rewards/chosen": -4.56125545501709, "rewards/margins": 0.7773683071136475, "rewards/rejected": -5.338624000549316, "sft_loss": 4.119109153747559, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 0.4569463801314364, "learning_rate": 6.844177833543741e-10, "logits/chosen": -0.752629816532135, "logits/rejected": -0.7100390791893005, "logps/chosen": -4.399282932281494, "logps/rejected": -5.064263343811035, "loss": 0.0513, "rewards/accuracies": 0.6875, "rewards/chosen": -4.399282932281494, "rewards/margins": 0.6649808287620544, "rewards/rejected": -5.064263343811035, "sft_loss": 4.0922088623046875, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 0.5095330277701998, "learning_rate": 6.053803820087467e-10, "logits/chosen": -0.7440576553344727, "logits/rejected": -0.5561671257019043, "logps/chosen": -4.5473103523254395, "logps/rejected": -5.268017768859863, "loss": 0.0532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.5473103523254395, "rewards/margins": 0.7207077145576477, "rewards/rejected": -5.268017768859863, "sft_loss": 4.374567985534668, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 0.5418342484558206, "learning_rate": 5.311881094528514e-10, "logits/chosen": -0.894533634185791, "logits/rejected": -0.6330394744873047, "logps/chosen": -4.468113422393799, "logps/rejected": -5.109396934509277, "loss": 0.0515, "rewards/accuracies": 0.71875, "rewards/chosen": -4.468113422393799, "rewards/margins": 0.6412833333015442, "rewards/rejected": -5.109396934509277, "sft_loss": 4.15500545501709, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 0.7560684828912863, "learning_rate": 4.6184168550050806e-10, "logits/chosen": -0.7792457342147827, "logits/rejected": -0.7371629476547241, "logps/chosen": -4.565438270568848, "logps/rejected": -5.105252742767334, "loss": 0.0537, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.565438270568848, "rewards/margins": 0.539814829826355, "rewards/rejected": -5.105252742767334, "sft_loss": 4.340537071228027, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 0.4911869174076437, "learning_rate": 3.973417829510328e-10, "logits/chosen": -0.8998439908027649, "logits/rejected": -0.7843471765518188, "logps/chosen": -4.363260746002197, "logps/rejected": -5.053696632385254, "loss": 0.0513, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.363260746002197, "rewards/margins": 0.6904358267784119, "rewards/rejected": -5.053696632385254, "sft_loss": 4.130491256713867, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 0.625561234732518, "learning_rate": 3.3768902758274377e-10, "logits/chosen": -0.7709188461303711, "logits/rejected": -0.6807830929756165, "logps/chosen": -4.335400104522705, "logps/rejected": -5.092108726501465, "loss": 0.051, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.335400104522705, "rewards/margins": 0.7567082643508911, "rewards/rejected": -5.092108726501465, "sft_loss": 4.07431173324585, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 0.40367028372201813, "learning_rate": 2.8288399814691e-10, "logits/chosen": -0.6472212672233582, "logits/rejected": -0.6156342625617981, "logps/chosen": -4.514806270599365, "logps/rejected": -5.145084381103516, "loss": 0.0518, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.514806270599365, "rewards/margins": 0.6302779912948608, "rewards/rejected": -5.145084381103516, "sft_loss": 4.210458278656006, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 0.38644244219702006, "learning_rate": 2.3292722636220066e-10, "logits/chosen": -0.7775002717971802, "logits/rejected": -0.5841083526611328, "logps/chosen": -4.504853248596191, "logps/rejected": -5.351606845855713, "loss": 0.0505, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.504853248596191, "rewards/margins": 0.8467535972595215, "rewards/rejected": -5.351606845855713, "sft_loss": 4.162545680999756, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 0.525091240664735, "learning_rate": 1.8781919690946668e-10, "logits/chosen": -0.7250782251358032, "logits/rejected": -0.7407873868942261, "logps/chosen": -4.452993392944336, "logps/rejected": -5.02133846282959, "loss": 0.0516, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.452993392944336, "rewards/margins": 0.5683449506759644, "rewards/rejected": -5.02133846282959, "sft_loss": 4.157462120056152, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 0.46121490745190485, "learning_rate": 1.4756034742696711e-10, "logits/chosen": -0.8873132467269897, "logits/rejected": -0.7985233068466187, "logps/chosen": -4.419460773468018, "logps/rejected": -5.141184329986572, "loss": 0.052, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.419460773468018, "rewards/margins": 0.7217229604721069, "rewards/rejected": -5.141184329986572, "sft_loss": 4.184345245361328, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 0.5079173509922432, "learning_rate": 1.12151068506261e-10, "logits/chosen": -0.7679609060287476, "logits/rejected": -0.6635790467262268, "logps/chosen": -4.502461910247803, "logps/rejected": -5.3459858894348145, "loss": 0.0493, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.502461910247803, "rewards/margins": 0.8435236215591431, "rewards/rejected": -5.3459858894348145, "sft_loss": 4.08714485168457, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 0.6251494874129625, "learning_rate": 8.159170368826629e-11, "logits/chosen": -0.778547465801239, "logits/rejected": -0.6166914701461792, "logps/chosen": -4.4004130363464355, "logps/rejected": -5.13525390625, "loss": 0.0509, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.4004130363464355, "rewards/margins": 0.734841525554657, "rewards/rejected": -5.13525390625, "sft_loss": 4.074005603790283, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 0.4323529956911697, "learning_rate": 5.588254946015114e-11, "logits/chosen": -0.937720775604248, "logits/rejected": -0.6476465463638306, "logps/chosen": -4.396031856536865, "logps/rejected": -5.199591636657715, "loss": 0.0505, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.396031856536865, "rewards/margins": 0.8035598993301392, "rewards/rejected": -5.199591636657715, "sft_loss": 4.089269161224365, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 0.4034195581734263, "learning_rate": 3.502385525216978e-11, "logits/chosen": -0.8808103799819946, "logits/rejected": -0.7039750814437866, "logps/chosen": -4.423071384429932, "logps/rejected": -5.164847373962402, "loss": 0.0515, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.423071384429932, "rewards/margins": 0.7417756915092468, "rewards/rejected": -5.164847373962402, "sft_loss": 4.177689552307129, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 0.3570840544619443, "learning_rate": 1.901582343555308e-11, "logits/chosen": -0.7723742127418518, "logits/rejected": -0.709536075592041, "logps/chosen": -4.62722635269165, "logps/rejected": -5.315423488616943, "loss": 0.0525, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.62722635269165, "rewards/margins": 0.688197135925293, "rewards/rejected": -5.315423488616943, "sft_loss": 4.275882720947266, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 0.4923520610414012, "learning_rate": 7.858609320232634e-12, "logits/chosen": -0.8053900599479675, "logits/rejected": -0.6184626221656799, "logps/chosen": -4.50331974029541, "logps/rejected": -5.1998724937438965, "loss": 0.0515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.50331974029541, "rewards/margins": 0.696553111076355, "rewards/rejected": -5.1998724937438965, "sft_loss": 4.270722389221191, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 0.5386939604422419, "learning_rate": 1.5523211535639624e-12, "logits/chosen": -0.8037877082824707, "logits/rejected": -0.6862096190452576, "logps/chosen": -4.483442306518555, "logps/rejected": -5.466944694519043, "loss": 0.0504, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.483442306518555, "rewards/margins": 0.983502209186554, "rewards/rejected": -5.466944694519043, "sft_loss": 4.2200493812561035, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": -0.3687087595462799, "eval_logits/rejected": -0.2954810559749603, "eval_logps/chosen": -4.423083305358887, "eval_logps/rejected": -5.141829967498779, "eval_loss": 0.05035361275076866, "eval_rewards/accuracies": 0.6862017512321472, "eval_rewards/chosen": -4.423083305358887, "eval_rewards/margins": 0.7187467217445374, "eval_rewards/rejected": -5.141829967498779, "eval_runtime": 44.0385, "eval_samples_per_second": 30.541, "eval_sft_loss": 4.028059959411621, "eval_steps_per_second": 7.652, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 0.0619825580417259, "train_runtime": 31817.9407, "train_samples_per_second": 5.637, "train_steps_per_second": 0.176 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }