diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18200 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999297541394882, + "eval_steps": 400, + "global_step": 5604, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002676032781401572, + "grad_norm": 4.125313107067299, + "learning_rate": 8.9126559714795e-09, + "logits/chosen": -0.060312606394290924, + "logits/rejected": 0.15203741192817688, + "logps/chosen": -1.7157971858978271, + "logps/rejected": -1.8896640539169312, + "loss": 0.2582, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.7157971858978271, + "rewards/margins": 0.17386700212955475, + "rewards/rejected": -1.8896640539169312, + "sft_loss": 1.4683139324188232, + "step": 5 + }, + { + "epoch": 0.005352065562803144, + "grad_norm": 2.7467738156031922, + "learning_rate": 1.7825311942959e-08, + "logits/chosen": 0.011781789362430573, + "logits/rejected": 0.13588806986808777, + "logps/chosen": -1.8027633428573608, + "logps/rejected": -1.8470537662506104, + "loss": 0.2651, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.8027633428573608, + "rewards/margins": 0.044290412217378616, + "rewards/rejected": -1.8470537662506104, + "sft_loss": 1.5084987878799438, + "step": 10 + }, + { + "epoch": 0.008028098344204716, + "grad_norm": 2.953214775219304, + "learning_rate": 2.67379679144385e-08, + "logits/chosen": -0.03865582123398781, + "logits/rejected": 0.061098456382751465, + "logps/chosen": -1.6350253820419312, + "logps/rejected": -1.7651439905166626, + "loss": 0.302, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.6350253820419312, + "rewards/margins": 0.13011865317821503, + "rewards/rejected": -1.7651439905166626, + "sft_loss": 1.5002126693725586, + "step": 15 + }, + { + "epoch": 0.010704131125606288, + "grad_norm": 3.875440232060317, + "learning_rate": 3.5650623885918e-08, + "logits/chosen": -0.04312217980623245, + "logits/rejected": 0.044587552547454834, + "logps/chosen": -1.7249486446380615, + "logps/rejected": -1.8060178756713867, + "loss": 0.2933, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.7249486446380615, + "rewards/margins": 0.08106913417577744, + "rewards/rejected": -1.8060178756713867, + "sft_loss": 1.500407338142395, + "step": 20 + }, + { + "epoch": 0.013380163907007862, + "grad_norm": 3.683341007905961, + "learning_rate": 4.45632798573975e-08, + "logits/chosen": -0.07231198251247406, + "logits/rejected": 0.015074786730110645, + "logps/chosen": -1.8695526123046875, + "logps/rejected": -1.7800153493881226, + "loss": 0.322, + "rewards/accuracies": 0.3812499940395355, + "rewards/chosen": -1.8695526123046875, + "rewards/margins": -0.08953739702701569, + "rewards/rejected": -1.7800153493881226, + "sft_loss": 1.5455690622329712, + "step": 25 + }, + { + "epoch": 0.016056196688409432, + "grad_norm": 2.8411343337921076, + "learning_rate": 5.3475935828877e-08, + "logits/chosen": -0.0865975096821785, + "logits/rejected": 0.009093428030610085, + "logps/chosen": -1.9094641208648682, + "logps/rejected": -1.8325374126434326, + "loss": 0.264, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.9094641208648682, + "rewards/margins": -0.07692664116621017, + "rewards/rejected": -1.8325374126434326, + "sft_loss": 1.6473287343978882, + "step": 30 + }, + { + "epoch": 0.018732229469811006, + "grad_norm": 3.7871810202252782, + "learning_rate": 6.23885918003565e-08, + "logits/chosen": -0.04631795734167099, + "logits/rejected": 0.11639624834060669, + "logps/chosen": -1.8485195636749268, + "logps/rejected": -1.9989744424819946, + "loss": 0.2781, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.8485195636749268, + "rewards/margins": 0.15045490860939026, + "rewards/rejected": -1.9989744424819946, + "sft_loss": 1.5623446702957153, + "step": 35 + }, + { + "epoch": 0.021408262251212576, + "grad_norm": 3.209277493286425, + "learning_rate": 7.1301247771836e-08, + "logits/chosen": 0.03658987209200859, + "logits/rejected": 0.2132900059223175, + "logps/chosen": -1.8844735622406006, + "logps/rejected": -1.7460263967514038, + "loss": 0.2934, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -1.8844735622406006, + "rewards/margins": -0.1384473741054535, + "rewards/rejected": -1.7460263967514038, + "sft_loss": 1.5194506645202637, + "step": 40 + }, + { + "epoch": 0.02408429503261415, + "grad_norm": 3.7608064055241694, + "learning_rate": 8.021390374331551e-08, + "logits/chosen": 0.019204024225473404, + "logits/rejected": 0.21857735514640808, + "logps/chosen": -1.8422836065292358, + "logps/rejected": -1.8763787746429443, + "loss": 0.285, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.8422836065292358, + "rewards/margins": 0.034095339477062225, + "rewards/rejected": -1.8763787746429443, + "sft_loss": 1.538379430770874, + "step": 45 + }, + { + "epoch": 0.026760327814015723, + "grad_norm": 3.4310065887447876, + "learning_rate": 8.9126559714795e-08, + "logits/chosen": -0.05229135602712631, + "logits/rejected": 0.09997323900461197, + "logps/chosen": -1.908278226852417, + "logps/rejected": -1.7853820323944092, + "loss": 0.2806, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.908278226852417, + "rewards/margins": -0.12289615720510483, + "rewards/rejected": -1.7853820323944092, + "sft_loss": 1.5872937440872192, + "step": 50 + }, + { + "epoch": 0.029436360595417294, + "grad_norm": 3.1486928978797013, + "learning_rate": 9.80392156862745e-08, + "logits/chosen": -0.11598268896341324, + "logits/rejected": 0.10807422548532486, + "logps/chosen": -1.846299409866333, + "logps/rejected": -1.8794721364974976, + "loss": 0.2676, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.846299409866333, + "rewards/margins": 0.03317270055413246, + "rewards/rejected": -1.8794721364974976, + "sft_loss": 1.5888155698776245, + "step": 55 + }, + { + "epoch": 0.032112393376818864, + "grad_norm": 3.225575628852993, + "learning_rate": 1.06951871657754e-07, + "logits/chosen": -0.08910714834928513, + "logits/rejected": 0.10314790904521942, + "logps/chosen": -1.8055875301361084, + "logps/rejected": -1.9120228290557861, + "loss": 0.2595, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.8055875301361084, + "rewards/margins": 0.1064353734254837, + "rewards/rejected": -1.9120228290557861, + "sft_loss": 1.5487940311431885, + "step": 60 + }, + { + "epoch": 0.03478842615822044, + "grad_norm": 3.0501140875467243, + "learning_rate": 1.158645276292335e-07, + "logits/chosen": -0.02393939718604088, + "logits/rejected": 0.128209188580513, + "logps/chosen": -1.6527059078216553, + "logps/rejected": -1.7865177392959595, + "loss": 0.2825, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.6527059078216553, + "rewards/margins": 0.13381178677082062, + "rewards/rejected": -1.7865177392959595, + "sft_loss": 1.482107400894165, + "step": 65 + }, + { + "epoch": 0.03746445893962201, + "grad_norm": 4.556150358888877, + "learning_rate": 1.24777183600713e-07, + "logits/chosen": -0.06528159230947495, + "logits/rejected": 0.09094108641147614, + "logps/chosen": -1.7894785404205322, + "logps/rejected": -1.8382999897003174, + "loss": 0.2873, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -1.7894785404205322, + "rewards/margins": 0.04882138594985008, + "rewards/rejected": -1.8382999897003174, + "sft_loss": 1.6435940265655518, + "step": 70 + }, + { + "epoch": 0.04014049172102358, + "grad_norm": 2.8190971503848, + "learning_rate": 1.3368983957219251e-07, + "logits/chosen": -0.0369485542178154, + "logits/rejected": 0.15109845995903015, + "logps/chosen": -1.8229849338531494, + "logps/rejected": -2.091214179992676, + "loss": 0.2536, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.8229849338531494, + "rewards/margins": 0.268229216337204, + "rewards/rejected": -2.091214179992676, + "sft_loss": 1.584989309310913, + "step": 75 + }, + { + "epoch": 0.04281652450242515, + "grad_norm": 2.640922353276211, + "learning_rate": 1.42602495543672e-07, + "logits/chosen": -0.008784117177128792, + "logits/rejected": 0.0963423103094101, + "logps/chosen": -1.772080421447754, + "logps/rejected": -1.804764986038208, + "loss": 0.2817, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.772080421447754, + "rewards/margins": 0.0326843187212944, + "rewards/rejected": -1.804764986038208, + "sft_loss": 1.5509886741638184, + "step": 80 + }, + { + "epoch": 0.04549255728382673, + "grad_norm": 2.71350748289859, + "learning_rate": 1.5151515151515152e-07, + "logits/chosen": -0.1522648185491562, + "logits/rejected": 0.09689263254404068, + "logps/chosen": -1.8555856943130493, + "logps/rejected": -2.046074867248535, + "loss": 0.2782, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.8555856943130493, + "rewards/margins": 0.1904892474412918, + "rewards/rejected": -2.046074867248535, + "sft_loss": 1.5202927589416504, + "step": 85 + }, + { + "epoch": 0.0481685900652283, + "grad_norm": 2.422361599261421, + "learning_rate": 1.6042780748663102e-07, + "logits/chosen": 0.09719739854335785, + "logits/rejected": 0.05986147001385689, + "logps/chosen": -1.8296709060668945, + "logps/rejected": -1.8398548364639282, + "loss": 0.2889, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.8296709060668945, + "rewards/margins": 0.010183680802583694, + "rewards/rejected": -1.8398548364639282, + "sft_loss": 1.4819883108139038, + "step": 90 + }, + { + "epoch": 0.05084462284662987, + "grad_norm": 2.418905862195348, + "learning_rate": 1.693404634581105e-07, + "logits/chosen": -0.07108329236507416, + "logits/rejected": 0.0822470635175705, + "logps/chosen": -1.9167404174804688, + "logps/rejected": -1.9951963424682617, + "loss": 0.2626, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.9167404174804688, + "rewards/margins": 0.07845588028430939, + "rewards/rejected": -1.9951963424682617, + "sft_loss": 1.5648051500320435, + "step": 95 + }, + { + "epoch": 0.05352065562803145, + "grad_norm": 2.3559369369237277, + "learning_rate": 1.7825311942959e-07, + "logits/chosen": -0.03189660981297493, + "logits/rejected": 0.03379274904727936, + "logps/chosen": -1.7909519672393799, + "logps/rejected": -1.903272032737732, + "loss": 0.263, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.7909519672393799, + "rewards/margins": 0.1123199313879013, + "rewards/rejected": -1.903272032737732, + "sft_loss": 1.5315051078796387, + "step": 100 + }, + { + "epoch": 0.05619668840943302, + "grad_norm": 2.272132842693275, + "learning_rate": 1.8716577540106952e-07, + "logits/chosen": 0.038053132593631744, + "logits/rejected": 0.06628037244081497, + "logps/chosen": -1.7722151279449463, + "logps/rejected": -1.9358274936676025, + "loss": 0.2647, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.7722151279449463, + "rewards/margins": 0.16361233592033386, + "rewards/rejected": -1.9358274936676025, + "sft_loss": 1.4914172887802124, + "step": 105 + }, + { + "epoch": 0.05887272119083459, + "grad_norm": 2.3939211561963982, + "learning_rate": 1.96078431372549e-07, + "logits/chosen": 0.019141273573040962, + "logits/rejected": 0.1181107759475708, + "logps/chosen": -1.8616676330566406, + "logps/rejected": -1.9208825826644897, + "loss": 0.2764, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.8616676330566406, + "rewards/margins": 0.05921504646539688, + "rewards/rejected": -1.9208825826644897, + "sft_loss": 1.5512902736663818, + "step": 110 + }, + { + "epoch": 0.06154875397223616, + "grad_norm": 2.3627131217942043, + "learning_rate": 2.049910873440285e-07, + "logits/chosen": 0.07100260257720947, + "logits/rejected": 0.29280149936676025, + "logps/chosen": -1.8520358800888062, + "logps/rejected": -2.1854333877563477, + "loss": 0.2304, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.8520358800888062, + "rewards/margins": 0.33339765667915344, + "rewards/rejected": -2.1854333877563477, + "sft_loss": 1.6704508066177368, + "step": 115 + }, + { + "epoch": 0.06422478675363773, + "grad_norm": 1.735165157614565, + "learning_rate": 2.13903743315508e-07, + "logits/chosen": -0.07888107746839523, + "logits/rejected": 0.10008995234966278, + "logps/chosen": -1.9981311559677124, + "logps/rejected": -2.1442179679870605, + "loss": 0.2481, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.9981311559677124, + "rewards/margins": 0.14608684182167053, + "rewards/rejected": -2.1442179679870605, + "sft_loss": 1.6943069696426392, + "step": 120 + }, + { + "epoch": 0.0669008195350393, + "grad_norm": 2.786593260731286, + "learning_rate": 2.2281639928698751e-07, + "logits/chosen": -0.05085619166493416, + "logits/rejected": 0.08655449748039246, + "logps/chosen": -1.8938229084014893, + "logps/rejected": -1.8082813024520874, + "loss": 0.281, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.8938229084014893, + "rewards/margins": -0.08554168045520782, + "rewards/rejected": -1.8082813024520874, + "sft_loss": 1.606369972229004, + "step": 125 + }, + { + "epoch": 0.06957685231644088, + "grad_norm": 2.5772159985228655, + "learning_rate": 2.31729055258467e-07, + "logits/chosen": 0.04295843839645386, + "logits/rejected": 0.18473029136657715, + "logps/chosen": -1.9629977941513062, + "logps/rejected": -2.086339235305786, + "loss": 0.2481, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.9629977941513062, + "rewards/margins": 0.12334122508764267, + "rewards/rejected": -2.086339235305786, + "sft_loss": 1.6907141208648682, + "step": 130 + }, + { + "epoch": 0.07225288509784245, + "grad_norm": 1.9943010581072478, + "learning_rate": 2.406417112299465e-07, + "logits/chosen": -0.0350356251001358, + "logits/rejected": 0.08792857825756073, + "logps/chosen": -2.0462894439697266, + "logps/rejected": -2.0182530879974365, + "loss": 0.2578, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.0462894439697266, + "rewards/margins": -0.02803630754351616, + "rewards/rejected": -2.0182530879974365, + "sft_loss": 1.6577552556991577, + "step": 135 + }, + { + "epoch": 0.07492891787924402, + "grad_norm": 2.8486191540947807, + "learning_rate": 2.49554367201426e-07, + "logits/chosen": -0.02075616642832756, + "logits/rejected": 0.1543327271938324, + "logps/chosen": -2.015401601791382, + "logps/rejected": -2.2778303623199463, + "loss": 0.2269, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.015401601791382, + "rewards/margins": 0.2624287009239197, + "rewards/rejected": -2.2778303623199463, + "sft_loss": 1.6974109411239624, + "step": 140 + }, + { + "epoch": 0.0776049506606456, + "grad_norm": 2.1225048898899246, + "learning_rate": 2.5846702317290554e-07, + "logits/chosen": 0.0070198155008256435, + "logits/rejected": 0.17295916378498077, + "logps/chosen": -1.9902257919311523, + "logps/rejected": -2.140160083770752, + "loss": 0.2439, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.9902257919311523, + "rewards/margins": 0.1499340534210205, + "rewards/rejected": -2.140160083770752, + "sft_loss": 1.631656289100647, + "step": 145 + }, + { + "epoch": 0.08028098344204716, + "grad_norm": 2.338788349764335, + "learning_rate": 2.6737967914438503e-07, + "logits/chosen": -0.03971818834543228, + "logits/rejected": 0.13490387797355652, + "logps/chosen": -1.9857639074325562, + "logps/rejected": -1.9886020421981812, + "loss": 0.2651, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.9857639074325562, + "rewards/margins": 0.002838182495906949, + "rewards/rejected": -1.9886020421981812, + "sft_loss": 1.5006240606307983, + "step": 150 + }, + { + "epoch": 0.08295701622344874, + "grad_norm": 2.1835505399539104, + "learning_rate": 2.762923351158645e-07, + "logits/chosen": -0.027233857661485672, + "logits/rejected": 0.025476187467575073, + "logps/chosen": -2.1108012199401855, + "logps/rejected": -2.1566002368927, + "loss": 0.2523, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -2.1108012199401855, + "rewards/margins": 0.04579881578683853, + "rewards/rejected": -2.1566002368927, + "sft_loss": 1.6765267848968506, + "step": 155 + }, + { + "epoch": 0.0856330490048503, + "grad_norm": 2.040498416917074, + "learning_rate": 2.85204991087344e-07, + "logits/chosen": -0.11586171388626099, + "logits/rejected": 0.03955882042646408, + "logps/chosen": -2.320681095123291, + "logps/rejected": -2.297348976135254, + "loss": 0.2415, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -2.320681095123291, + "rewards/margins": -0.023332182317972183, + "rewards/rejected": -2.297348976135254, + "sft_loss": 1.771780014038086, + "step": 160 + }, + { + "epoch": 0.08830908178625188, + "grad_norm": 1.9928740456508027, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -0.04006613790988922, + "logits/rejected": 0.14414557814598083, + "logps/chosen": -2.0469436645507812, + "logps/rejected": -2.3669915199279785, + "loss": 0.2436, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.0469436645507812, + "rewards/margins": 0.3200477659702301, + "rewards/rejected": -2.3669915199279785, + "sft_loss": 1.6215006113052368, + "step": 165 + }, + { + "epoch": 0.09098511456765346, + "grad_norm": 1.9534296167766696, + "learning_rate": 3.0303030303030305e-07, + "logits/chosen": -0.06098126247525215, + "logits/rejected": 0.001214376068674028, + "logps/chosen": -2.3339781761169434, + "logps/rejected": -2.305957555770874, + "loss": 0.2285, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -2.3339781761169434, + "rewards/margins": -0.028020773082971573, + "rewards/rejected": -2.305957555770874, + "sft_loss": 1.726961374282837, + "step": 170 + }, + { + "epoch": 0.09366114734905502, + "grad_norm": 1.9588967180799672, + "learning_rate": 3.1194295900178254e-07, + "logits/chosen": 0.09518565982580185, + "logits/rejected": 0.09727490693330765, + "logps/chosen": -2.214961290359497, + "logps/rejected": -2.278738498687744, + "loss": 0.2654, + "rewards/accuracies": 0.46875, + "rewards/chosen": -2.214961290359497, + "rewards/margins": 0.06377717852592468, + "rewards/rejected": -2.278738498687744, + "sft_loss": 1.7609399557113647, + "step": 175 + }, + { + "epoch": 0.0963371801304566, + "grad_norm": 1.6589936848533324, + "learning_rate": 3.2085561497326203e-07, + "logits/chosen": 0.03179093450307846, + "logits/rejected": 0.037368230521678925, + "logps/chosen": -2.3154728412628174, + "logps/rejected": -2.2699761390686035, + "loss": 0.236, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.3154728412628174, + "rewards/margins": -0.04549693316221237, + "rewards/rejected": -2.2699761390686035, + "sft_loss": 1.7420244216918945, + "step": 180 + }, + { + "epoch": 0.09901321291185818, + "grad_norm": 1.9967555737580138, + "learning_rate": 3.297682709447415e-07, + "logits/chosen": -0.12511086463928223, + "logits/rejected": -0.02812013030052185, + "logps/chosen": -2.2482380867004395, + "logps/rejected": -2.3399927616119385, + "loss": 0.2525, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.2482380867004395, + "rewards/margins": 0.09175457060337067, + "rewards/rejected": -2.3399927616119385, + "sft_loss": 1.711207389831543, + "step": 185 + }, + { + "epoch": 0.10168924569325974, + "grad_norm": 2.678787481783515, + "learning_rate": 3.38680926916221e-07, + "logits/chosen": -0.03826170042157173, + "logits/rejected": 0.10330984741449356, + "logps/chosen": -2.804680347442627, + "logps/rejected": -2.641824245452881, + "loss": 0.2148, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -2.804680347442627, + "rewards/margins": -0.162856325507164, + "rewards/rejected": -2.641824245452881, + "sft_loss": 1.9906642436981201, + "step": 190 + }, + { + "epoch": 0.10436527847466132, + "grad_norm": 2.0306729180588396, + "learning_rate": 3.475935828877005e-07, + "logits/chosen": 0.04664776846766472, + "logits/rejected": 0.22064730525016785, + "logps/chosen": -2.1520609855651855, + "logps/rejected": -2.210716724395752, + "loss": 0.2296, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.1520609855651855, + "rewards/margins": 0.058656178414821625, + "rewards/rejected": -2.210716724395752, + "sft_loss": 1.615627646446228, + "step": 195 + }, + { + "epoch": 0.1070413112560629, + "grad_norm": 3.1931495872144007, + "learning_rate": 3.5650623885918e-07, + "logits/chosen": -0.03578418493270874, + "logits/rejected": 0.12399481236934662, + "logps/chosen": -2.6481387615203857, + "logps/rejected": -2.3530163764953613, + "loss": 0.2331, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.6481387615203857, + "rewards/margins": -0.2951226830482483, + "rewards/rejected": -2.3530163764953613, + "sft_loss": 1.9002326726913452, + "step": 200 + }, + { + "epoch": 0.10971734403746446, + "grad_norm": 2.640144568717103, + "learning_rate": 3.654188948306595e-07, + "logits/chosen": -0.021364711225032806, + "logits/rejected": 0.15079109370708466, + "logps/chosen": -3.106703281402588, + "logps/rejected": -2.7240376472473145, + "loss": 0.2138, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.106703281402588, + "rewards/margins": -0.3826655149459839, + "rewards/rejected": -2.7240376472473145, + "sft_loss": 1.900535225868225, + "step": 205 + }, + { + "epoch": 0.11239337681886603, + "grad_norm": 2.186333179804364, + "learning_rate": 3.7433155080213904e-07, + "logits/chosen": -0.11891081184148788, + "logits/rejected": 0.10270519554615021, + "logps/chosen": -2.9581754207611084, + "logps/rejected": -3.388826847076416, + "loss": 0.1741, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.9581754207611084, + "rewards/margins": 0.4306512773036957, + "rewards/rejected": -3.388826847076416, + "sft_loss": 2.0287399291992188, + "step": 210 + }, + { + "epoch": 0.1150694096002676, + "grad_norm": 2.0176626763649, + "learning_rate": 3.8324420677361853e-07, + "logits/chosen": -0.13966991007328033, + "logits/rejected": 0.140414297580719, + "logps/chosen": -2.749617099761963, + "logps/rejected": -2.8892064094543457, + "loss": 0.1857, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.749617099761963, + "rewards/margins": 0.13958922028541565, + "rewards/rejected": -2.8892064094543457, + "sft_loss": 2.041731357574463, + "step": 215 + }, + { + "epoch": 0.11774544238166917, + "grad_norm": 1.998903196870583, + "learning_rate": 3.92156862745098e-07, + "logits/chosen": 0.0680420845746994, + "logits/rejected": 0.18458662927150726, + "logps/chosen": -3.2915711402893066, + "logps/rejected": -3.790569305419922, + "loss": 0.1668, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.2915711402893066, + "rewards/margins": 0.4989985525608063, + "rewards/rejected": -3.790569305419922, + "sft_loss": 2.265310764312744, + "step": 220 + }, + { + "epoch": 0.12042147516307075, + "grad_norm": 1.9590568738460437, + "learning_rate": 4.010695187165775e-07, + "logits/chosen": -0.07913367450237274, + "logits/rejected": 0.1182810515165329, + "logps/chosen": -3.0919346809387207, + "logps/rejected": -3.1701674461364746, + "loss": 0.1653, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.0919346809387207, + "rewards/margins": 0.07823298126459122, + "rewards/rejected": -3.1701674461364746, + "sft_loss": 1.9699609279632568, + "step": 225 + }, + { + "epoch": 0.12309750794447231, + "grad_norm": 1.8737010294999485, + "learning_rate": 4.09982174688057e-07, + "logits/chosen": 0.037992849946022034, + "logits/rejected": 0.13747188448905945, + "logps/chosen": -3.8204619884490967, + "logps/rejected": -3.797633647918701, + "loss": 0.1635, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.8204619884490967, + "rewards/margins": -0.022828320041298866, + "rewards/rejected": -3.797633647918701, + "sft_loss": 2.1811490058898926, + "step": 230 + }, + { + "epoch": 0.1257735407258739, + "grad_norm": 1.8191581628112006, + "learning_rate": 4.188948306595365e-07, + "logits/chosen": 0.046239614486694336, + "logits/rejected": 0.22374077141284943, + "logps/chosen": -3.5340847969055176, + "logps/rejected": -3.8707435131073, + "loss": 0.1426, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.5340847969055176, + "rewards/margins": 0.3366585373878479, + "rewards/rejected": -3.8707435131073, + "sft_loss": 2.135493516921997, + "step": 235 + }, + { + "epoch": 0.12844957350727546, + "grad_norm": 2.15850506003128, + "learning_rate": 4.27807486631016e-07, + "logits/chosen": -0.020359747111797333, + "logits/rejected": 0.12103061378002167, + "logps/chosen": -3.8267650604248047, + "logps/rejected": -3.854151487350464, + "loss": 0.1603, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -3.8267650604248047, + "rewards/margins": 0.02738667093217373, + "rewards/rejected": -3.854151487350464, + "sft_loss": 2.468606472015381, + "step": 240 + }, + { + "epoch": 0.13112560628867703, + "grad_norm": 1.9869901093405427, + "learning_rate": 4.3672014260249554e-07, + "logits/chosen": 0.09430526196956635, + "logits/rejected": 0.2459847629070282, + "logps/chosen": -3.5051181316375732, + "logps/rejected": -4.297321319580078, + "loss": 0.1457, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -3.5051181316375732, + "rewards/margins": 0.792202889919281, + "rewards/rejected": -4.297321319580078, + "sft_loss": 2.459380626678467, + "step": 245 + }, + { + "epoch": 0.1338016390700786, + "grad_norm": 1.9104173001829199, + "learning_rate": 4.4563279857397503e-07, + "logits/chosen": 0.02481432631611824, + "logits/rejected": 0.22242239117622375, + "logps/chosen": -5.418574333190918, + "logps/rejected": -5.279926300048828, + "loss": 0.1462, + "rewards/accuracies": 0.53125, + "rewards/chosen": -5.418574333190918, + "rewards/margins": -0.13864776492118835, + "rewards/rejected": -5.279926300048828, + "sft_loss": 2.6364169120788574, + "step": 250 + }, + { + "epoch": 0.1364776718514802, + "grad_norm": 1.2454643359237576, + "learning_rate": 4.545454545454545e-07, + "logits/chosen": 0.06115083023905754, + "logits/rejected": 0.2540772557258606, + "logps/chosen": -3.8281192779541016, + "logps/rejected": -4.610691547393799, + "loss": 0.1384, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.8281192779541016, + "rewards/margins": 0.7825719714164734, + "rewards/rejected": -4.610691547393799, + "sft_loss": 2.2860593795776367, + "step": 255 + }, + { + "epoch": 0.13915370463288176, + "grad_norm": 0.9849368225532316, + "learning_rate": 4.63458110516934e-07, + "logits/chosen": -0.10867585986852646, + "logits/rejected": 0.02872631885111332, + "logps/chosen": -5.133664131164551, + "logps/rejected": -4.367121696472168, + "loss": 0.1178, + "rewards/accuracies": 0.53125, + "rewards/chosen": -5.133664131164551, + "rewards/margins": -0.7665426135063171, + "rewards/rejected": -4.367121696472168, + "sft_loss": 2.7160372734069824, + "step": 260 + }, + { + "epoch": 0.1418297374142833, + "grad_norm": 0.8961988680228047, + "learning_rate": 4.723707664884135e-07, + "logits/chosen": 0.07030217349529266, + "logits/rejected": 0.17124192416667938, + "logps/chosen": -6.0838727951049805, + "logps/rejected": -4.811110019683838, + "loss": 0.1367, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -6.0838727951049805, + "rewards/margins": -1.2727632522583008, + "rewards/rejected": -4.811110019683838, + "sft_loss": 3.739053726196289, + "step": 265 + }, + { + "epoch": 0.1445057701956849, + "grad_norm": 0.8941719839014858, + "learning_rate": 4.81283422459893e-07, + "logits/chosen": 0.017831971868872643, + "logits/rejected": 0.20090460777282715, + "logps/chosen": -4.764120578765869, + "logps/rejected": -5.134100437164307, + "loss": 0.1264, + "rewards/accuracies": 0.53125, + "rewards/chosen": -4.764120578765869, + "rewards/margins": 0.3699795603752136, + "rewards/rejected": -5.134100437164307, + "sft_loss": 2.7194724082946777, + "step": 270 + }, + { + "epoch": 0.14718180297708647, + "grad_norm": 1.222685081948667, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": 0.16087068617343903, + "logits/rejected": 0.2754586338996887, + "logps/chosen": -5.333785057067871, + "logps/rejected": -6.004696846008301, + "loss": 0.1436, + "rewards/accuracies": 0.59375, + "rewards/chosen": -5.333785057067871, + "rewards/margins": 0.6709117293357849, + "rewards/rejected": -6.004696846008301, + "sft_loss": 3.1917669773101807, + "step": 275 + }, + { + "epoch": 0.14985783575848804, + "grad_norm": 0.7370626970557951, + "learning_rate": 4.99108734402852e-07, + "logits/chosen": 0.007818743586540222, + "logits/rejected": 0.22343873977661133, + "logps/chosen": -6.119328498840332, + "logps/rejected": -5.818517208099365, + "loss": 0.1211, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -6.119328498840332, + "rewards/margins": -0.3008107542991638, + "rewards/rejected": -5.818517208099365, + "sft_loss": 3.9426021575927734, + "step": 280 + }, + { + "epoch": 0.15253386853988962, + "grad_norm": 1.233957256562051, + "learning_rate": 5.080213903743315e-07, + "logits/chosen": 0.041990119963884354, + "logits/rejected": 0.22771665453910828, + "logps/chosen": -5.575260162353516, + "logps/rejected": -5.49068546295166, + "loss": 0.1275, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -5.575260162353516, + "rewards/margins": -0.08457436412572861, + "rewards/rejected": -5.49068546295166, + "sft_loss": 2.8716347217559814, + "step": 285 + }, + { + "epoch": 0.1552099013212912, + "grad_norm": 0.6072672871705348, + "learning_rate": 5.169340463458111e-07, + "logits/chosen": -0.026344675570726395, + "logits/rejected": 0.3459742069244385, + "logps/chosen": -4.510623931884766, + "logps/rejected": -5.7433624267578125, + "loss": 0.0906, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.510623931884766, + "rewards/margins": 1.2327378988265991, + "rewards/rejected": -5.7433624267578125, + "sft_loss": 2.9556336402893066, + "step": 290 + }, + { + "epoch": 0.15788593410269275, + "grad_norm": 0.7208828481537657, + "learning_rate": 5.258467023172905e-07, + "logits/chosen": 0.09221816062927246, + "logits/rejected": 0.17328932881355286, + "logps/chosen": -6.130520820617676, + "logps/rejected": -5.456192970275879, + "loss": 0.1207, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -6.130520820617676, + "rewards/margins": -0.6743276715278625, + "rewards/rejected": -5.456192970275879, + "sft_loss": 3.7058982849121094, + "step": 295 + }, + { + "epoch": 0.16056196688409433, + "grad_norm": 1.340847333736133, + "learning_rate": 5.347593582887701e-07, + "logits/chosen": 0.009460541419684887, + "logits/rejected": 0.23661403357982635, + "logps/chosen": -6.522822380065918, + "logps/rejected": -6.587447166442871, + "loss": 0.1156, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -6.522822380065918, + "rewards/margins": 0.06462571769952774, + "rewards/rejected": -6.587447166442871, + "sft_loss": 3.30534029006958, + "step": 300 + }, + { + "epoch": 0.1632379996654959, + "grad_norm": 1.9457407077227884, + "learning_rate": 5.436720142602496e-07, + "logits/chosen": 0.06960954517126083, + "logits/rejected": 0.16367551684379578, + "logps/chosen": -5.572967052459717, + "logps/rejected": -5.5532331466674805, + "loss": 0.1133, + "rewards/accuracies": 0.46875, + "rewards/chosen": -5.572967052459717, + "rewards/margins": -0.01973416842520237, + "rewards/rejected": -5.5532331466674805, + "sft_loss": 3.6321425437927246, + "step": 305 + }, + { + "epoch": 0.16591403244689748, + "grad_norm": 6.893463954081525, + "learning_rate": 5.52584670231729e-07, + "logits/chosen": -0.13142789900302887, + "logits/rejected": 0.0010722450679168105, + "logps/chosen": -6.735803127288818, + "logps/rejected": -6.812127590179443, + "loss": 0.0959, + "rewards/accuracies": 0.5, + "rewards/chosen": -6.735803127288818, + "rewards/margins": 0.07632424682378769, + "rewards/rejected": -6.812127590179443, + "sft_loss": 4.3729987144470215, + "step": 310 + }, + { + "epoch": 0.16859006522829906, + "grad_norm": 1.2119659575882744, + "learning_rate": 5.614973262032086e-07, + "logits/chosen": 0.05507947877049446, + "logits/rejected": 0.24547457695007324, + "logps/chosen": -7.327678680419922, + "logps/rejected": -7.448760986328125, + "loss": 0.0735, + "rewards/accuracies": 0.5625, + "rewards/chosen": -7.327678680419922, + "rewards/margins": 0.12108228355646133, + "rewards/rejected": -7.448760986328125, + "sft_loss": 5.221084117889404, + "step": 315 + }, + { + "epoch": 0.1712660980097006, + "grad_norm": 1.421852572662818, + "learning_rate": 5.70409982174688e-07, + "logits/chosen": -0.0505533441901207, + "logits/rejected": 0.09875977784395218, + "logps/chosen": -7.0575737953186035, + "logps/rejected": -6.7295331954956055, + "loss": 0.0729, + "rewards/accuracies": 0.5, + "rewards/chosen": -7.0575737953186035, + "rewards/margins": -0.32803958654403687, + "rewards/rejected": -6.7295331954956055, + "sft_loss": 5.3815484046936035, + "step": 320 + }, + { + "epoch": 0.17394213079110218, + "grad_norm": 2.1768410941997747, + "learning_rate": 5.793226381461676e-07, + "logits/chosen": -0.06135901063680649, + "logits/rejected": 0.12024674564599991, + "logps/chosen": -8.038436889648438, + "logps/rejected": -8.41388988494873, + "loss": 0.0679, + "rewards/accuracies": 0.53125, + "rewards/chosen": -8.038436889648438, + "rewards/margins": 0.3754529356956482, + "rewards/rejected": -8.41388988494873, + "sft_loss": 5.565326690673828, + "step": 325 + }, + { + "epoch": 0.17661816357250376, + "grad_norm": 0.9252033224489392, + "learning_rate": 5.88235294117647e-07, + "logits/chosen": 0.034027762711048126, + "logits/rejected": 0.23385238647460938, + "logps/chosen": -6.080555438995361, + "logps/rejected": -7.770529270172119, + "loss": 0.0713, + "rewards/accuracies": 0.59375, + "rewards/chosen": -6.080555438995361, + "rewards/margins": 1.6899728775024414, + "rewards/rejected": -7.770529270172119, + "sft_loss": 5.016576290130615, + "step": 330 + }, + { + "epoch": 0.17929419635390534, + "grad_norm": 2.0989591544928925, + "learning_rate": 5.971479500891266e-07, + "logits/chosen": 0.01920177973806858, + "logits/rejected": 0.206724613904953, + "logps/chosen": -7.403994560241699, + "logps/rejected": -7.278214931488037, + "loss": 0.0695, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -7.403994560241699, + "rewards/margins": -0.12578034400939941, + "rewards/rejected": -7.278214931488037, + "sft_loss": 4.916281223297119, + "step": 335 + }, + { + "epoch": 0.18197022913530692, + "grad_norm": 1.893897803965405, + "learning_rate": 6.060606060606061e-07, + "logits/chosen": 0.055364273488521576, + "logits/rejected": 0.2726927697658539, + "logps/chosen": -6.973310947418213, + "logps/rejected": -7.325900077819824, + "loss": 0.0663, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -6.973310947418213, + "rewards/margins": 0.35258910059928894, + "rewards/rejected": -7.325900077819824, + "sft_loss": 5.119781494140625, + "step": 340 + }, + { + "epoch": 0.1846462619167085, + "grad_norm": 2.836372712749154, + "learning_rate": 6.149732620320855e-07, + "logits/chosen": 0.06766609847545624, + "logits/rejected": 0.1415514200925827, + "logps/chosen": -6.7723259925842285, + "logps/rejected": -6.848855495452881, + "loss": 0.0667, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -6.7723259925842285, + "rewards/margins": 0.07653047144412994, + "rewards/rejected": -6.848855495452881, + "sft_loss": 5.026312828063965, + "step": 345 + }, + { + "epoch": 0.18732229469811004, + "grad_norm": 2.265144184050898, + "learning_rate": 6.238859180035651e-07, + "logits/chosen": -0.15822748839855194, + "logits/rejected": 0.0024463594891130924, + "logps/chosen": -6.168185234069824, + "logps/rejected": -5.750182151794434, + "loss": 0.0671, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -6.168185234069824, + "rewards/margins": -0.4180033802986145, + "rewards/rejected": -5.750182151794434, + "sft_loss": 4.6768975257873535, + "step": 350 + }, + { + "epoch": 0.18999832747951162, + "grad_norm": 2.614388773693835, + "learning_rate": 6.327985739750445e-07, + "logits/chosen": -0.14725571870803833, + "logits/rejected": 0.13961216807365417, + "logps/chosen": -6.235496520996094, + "logps/rejected": -6.224798202514648, + "loss": 0.0614, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -6.235496520996094, + "rewards/margins": -0.01069814246147871, + "rewards/rejected": -6.224798202514648, + "sft_loss": 5.157872200012207, + "step": 355 + }, + { + "epoch": 0.1926743602609132, + "grad_norm": 2.7692613642785884, + "learning_rate": 6.417112299465241e-07, + "logits/chosen": -0.1557673215866089, + "logits/rejected": -0.048311877995729446, + "logps/chosen": -5.508719444274902, + "logps/rejected": -5.668200492858887, + "loss": 0.061, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -5.508719444274902, + "rewards/margins": 0.15948060154914856, + "rewards/rejected": -5.668200492858887, + "sft_loss": 4.47888708114624, + "step": 360 + }, + { + "epoch": 0.19535039304231477, + "grad_norm": 1.759183414717174, + "learning_rate": 6.506238859180035e-07, + "logits/chosen": -0.1632741391658783, + "logits/rejected": -0.04335709661245346, + "logps/chosen": -5.9066925048828125, + "logps/rejected": -5.570250988006592, + "loss": 0.0622, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -5.9066925048828125, + "rewards/margins": -0.33644169569015503, + "rewards/rejected": -5.570250988006592, + "sft_loss": 5.030333518981934, + "step": 365 + }, + { + "epoch": 0.19802642582371635, + "grad_norm": 2.2296092123630236, + "learning_rate": 6.59536541889483e-07, + "logits/chosen": -0.3708917498588562, + "logits/rejected": -0.20164895057678223, + "logps/chosen": -5.268651485443115, + "logps/rejected": -5.234259128570557, + "loss": 0.062, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -5.268651485443115, + "rewards/margins": -0.03439173102378845, + "rewards/rejected": -5.234259128570557, + "sft_loss": 4.815543174743652, + "step": 370 + }, + { + "epoch": 0.2007024586051179, + "grad_norm": 2.0146802870823928, + "learning_rate": 6.684491978609626e-07, + "logits/chosen": -0.47356781363487244, + "logits/rejected": -0.21013717353343964, + "logps/chosen": -5.339799880981445, + "logps/rejected": -5.731418609619141, + "loss": 0.0567, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -5.339799880981445, + "rewards/margins": 0.39161843061447144, + "rewards/rejected": -5.731418609619141, + "sft_loss": 4.969027996063232, + "step": 375 + }, + { + "epoch": 0.20337849138651948, + "grad_norm": 1.3381881149332266, + "learning_rate": 6.77361853832442e-07, + "logits/chosen": -0.5766822695732117, + "logits/rejected": -0.4294039309024811, + "logps/chosen": -4.788380146026611, + "logps/rejected": -4.977846622467041, + "loss": 0.0566, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.788380146026611, + "rewards/margins": 0.18946652114391327, + "rewards/rejected": -4.977846622467041, + "sft_loss": 4.430079460144043, + "step": 380 + }, + { + "epoch": 0.20605452416792105, + "grad_norm": 0.9198991724416198, + "learning_rate": 6.862745098039216e-07, + "logits/chosen": -0.48545369505882263, + "logits/rejected": -0.3366524577140808, + "logps/chosen": -5.007147789001465, + "logps/rejected": -5.114348888397217, + "loss": 0.0569, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -5.007147789001465, + "rewards/margins": 0.10720244795084, + "rewards/rejected": -5.114348888397217, + "sft_loss": 4.7221221923828125, + "step": 385 + }, + { + "epoch": 0.20873055694932263, + "grad_norm": 2.7786654323372857, + "learning_rate": 6.95187165775401e-07, + "logits/chosen": -0.1661207228899002, + "logits/rejected": 0.11756277084350586, + "logps/chosen": -5.295405387878418, + "logps/rejected": -5.372384548187256, + "loss": 0.0601, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -5.295405387878418, + "rewards/margins": 0.07697971165180206, + "rewards/rejected": -5.372384548187256, + "sft_loss": 4.778872489929199, + "step": 390 + }, + { + "epoch": 0.2114065897307242, + "grad_norm": 1.0788168546978547, + "learning_rate": 7.040998217468806e-07, + "logits/chosen": -0.4957035183906555, + "logits/rejected": -0.25873202085494995, + "logps/chosen": -4.942808151245117, + "logps/rejected": -5.0668792724609375, + "loss": 0.0572, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.942808151245117, + "rewards/margins": 0.1240713819861412, + "rewards/rejected": -5.0668792724609375, + "sft_loss": 4.497957706451416, + "step": 395 + }, + { + "epoch": 0.2140826225121258, + "grad_norm": 1.2369650710490778, + "learning_rate": 7.1301247771836e-07, + "logits/chosen": -0.5010001063346863, + "logits/rejected": -0.32428526878356934, + "logps/chosen": -5.017227649688721, + "logps/rejected": -5.143941879272461, + "loss": 0.0548, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -5.017227649688721, + "rewards/margins": 0.12671446800231934, + "rewards/rejected": -5.143941879272461, + "sft_loss": 4.438155174255371, + "step": 400 + }, + { + "epoch": 0.2140826225121258, + "eval_logits/chosen": -0.02770337089896202, + "eval_logits/rejected": 0.10948384553194046, + "eval_logps/chosen": -5.346701622009277, + "eval_logps/rejected": -5.472263813018799, + "eval_loss": 0.05566617101430893, + "eval_rewards/accuracies": 0.5326409339904785, + "eval_rewards/chosen": -5.346701622009277, + "eval_rewards/margins": 0.12556174397468567, + "eval_rewards/rejected": -5.472263813018799, + "eval_runtime": 43.8831, + "eval_samples_per_second": 30.65, + "eval_sft_loss": 4.829505443572998, + "eval_steps_per_second": 7.679, + "step": 400 + }, + { + "epoch": 0.21675865529352734, + "grad_norm": 0.8680376616287312, + "learning_rate": 7.219251336898395e-07, + "logits/chosen": -0.5568983554840088, + "logits/rejected": -0.42599543929100037, + "logps/chosen": -5.262287616729736, + "logps/rejected": -5.478768825531006, + "loss": 0.0577, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -5.262287616729736, + "rewards/margins": 0.21648113429546356, + "rewards/rejected": -5.478768825531006, + "sft_loss": 4.965443134307861, + "step": 405 + }, + { + "epoch": 0.2194346880749289, + "grad_norm": 1.3304348425737549, + "learning_rate": 7.30837789661319e-07, + "logits/chosen": -0.4703024923801422, + "logits/rejected": -0.2515341639518738, + "logps/chosen": -4.820174694061279, + "logps/rejected": -5.074994087219238, + "loss": 0.055, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.820174694061279, + "rewards/margins": 0.25481972098350525, + "rewards/rejected": -5.074994087219238, + "sft_loss": 4.33463191986084, + "step": 410 + }, + { + "epoch": 0.2221107208563305, + "grad_norm": 1.5135280531396909, + "learning_rate": 7.397504456327985e-07, + "logits/chosen": -0.4607006907463074, + "logits/rejected": -0.3690333366394043, + "logps/chosen": -5.324876308441162, + "logps/rejected": -5.247057914733887, + "loss": 0.0574, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -5.324876308441162, + "rewards/margins": -0.07781883329153061, + "rewards/rejected": -5.247057914733887, + "sft_loss": 4.834041595458984, + "step": 415 + }, + { + "epoch": 0.22478675363773207, + "grad_norm": 2.6272102535641886, + "learning_rate": 7.486631016042781e-07, + "logits/chosen": -0.5340300798416138, + "logits/rejected": -0.14234408736228943, + "logps/chosen": -4.852800369262695, + "logps/rejected": -5.007115840911865, + "loss": 0.0559, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -4.852800369262695, + "rewards/margins": 0.15431593358516693, + "rewards/rejected": -5.007115840911865, + "sft_loss": 4.486388206481934, + "step": 420 + }, + { + "epoch": 0.22746278641913364, + "grad_norm": 1.9315381406431007, + "learning_rate": 7.575757575757575e-07, + "logits/chosen": -0.43069228529930115, + "logits/rejected": -0.1704765111207962, + "logps/chosen": -4.909597396850586, + "logps/rejected": -5.0253095626831055, + "loss": 0.0551, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.909597396850586, + "rewards/margins": 0.11571161448955536, + "rewards/rejected": -5.0253095626831055, + "sft_loss": 4.460376739501953, + "step": 425 + }, + { + "epoch": 0.2301388192005352, + "grad_norm": 1.137232786466204, + "learning_rate": 7.664884135472371e-07, + "logits/chosen": -0.5641778707504272, + "logits/rejected": -0.2592639625072479, + "logps/chosen": -4.983081817626953, + "logps/rejected": -5.339555740356445, + "loss": 0.0554, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.983081817626953, + "rewards/margins": 0.35647445917129517, + "rewards/rejected": -5.339555740356445, + "sft_loss": 4.552998065948486, + "step": 430 + }, + { + "epoch": 0.23281485198193677, + "grad_norm": 1.1748183885688712, + "learning_rate": 7.754010695187165e-07, + "logits/chosen": -0.403970867395401, + "logits/rejected": -0.2805110514163971, + "logps/chosen": -5.1232590675354, + "logps/rejected": -5.035851001739502, + "loss": 0.0569, + "rewards/accuracies": 0.46875, + "rewards/chosen": -5.1232590675354, + "rewards/margins": -0.08740736544132233, + "rewards/rejected": -5.035851001739502, + "sft_loss": 4.523171901702881, + "step": 435 + }, + { + "epoch": 0.23549088476333835, + "grad_norm": 1.9134674045638693, + "learning_rate": 7.84313725490196e-07, + "logits/chosen": -0.39476364850997925, + "logits/rejected": -0.21744295954704285, + "logps/chosen": -4.964127540588379, + "logps/rejected": -5.1540703773498535, + "loss": 0.0566, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.964127540588379, + "rewards/margins": 0.18994323909282684, + "rewards/rejected": -5.1540703773498535, + "sft_loss": 4.714345455169678, + "step": 440 + }, + { + "epoch": 0.23816691754473993, + "grad_norm": 4.445589266572552, + "learning_rate": 7.932263814616755e-07, + "logits/chosen": -0.3707229495048523, + "logits/rejected": -0.2078903168439865, + "logps/chosen": -5.059138298034668, + "logps/rejected": -5.273739814758301, + "loss": 0.0563, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -5.059138298034668, + "rewards/margins": 0.21460111439228058, + "rewards/rejected": -5.273739814758301, + "sft_loss": 4.584336280822754, + "step": 445 + }, + { + "epoch": 0.2408429503261415, + "grad_norm": 1.7787613623310712, + "learning_rate": 8.02139037433155e-07, + "logits/chosen": -0.3272281289100647, + "logits/rejected": -0.1507757008075714, + "logps/chosen": -4.958096027374268, + "logps/rejected": -5.078334808349609, + "loss": 0.0555, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -4.958096027374268, + "rewards/margins": 0.12023873627185822, + "rewards/rejected": -5.078334808349609, + "sft_loss": 4.501555442810059, + "step": 450 + }, + { + "epoch": 0.24351898310754308, + "grad_norm": 0.9480957363325464, + "learning_rate": 8.110516934046346e-07, + "logits/chosen": -0.3552139103412628, + "logits/rejected": -0.19632229208946228, + "logps/chosen": -4.692941188812256, + "logps/rejected": -5.041573524475098, + "loss": 0.0543, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.692941188812256, + "rewards/margins": 0.3486325144767761, + "rewards/rejected": -5.041573524475098, + "sft_loss": 4.325136184692383, + "step": 455 + }, + { + "epoch": 0.24619501588894463, + "grad_norm": 1.9429007679640615, + "learning_rate": 8.19964349376114e-07, + "logits/chosen": -0.5261731743812561, + "logits/rejected": -0.28895407915115356, + "logps/chosen": -5.142203330993652, + "logps/rejected": -5.269199371337891, + "loss": 0.0574, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -5.142203330993652, + "rewards/margins": 0.12699629366397858, + "rewards/rejected": -5.269199371337891, + "sft_loss": 4.904362201690674, + "step": 460 + }, + { + "epoch": 0.2488710486703462, + "grad_norm": 1.719077122753352, + "learning_rate": 8.288770053475936e-07, + "logits/chosen": -0.347450852394104, + "logits/rejected": -0.24443945288658142, + "logps/chosen": -4.623368263244629, + "logps/rejected": -4.905174255371094, + "loss": 0.0569, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.623368263244629, + "rewards/margins": 0.28180620074272156, + "rewards/rejected": -4.905174255371094, + "sft_loss": 4.362107753753662, + "step": 465 + }, + { + "epoch": 0.2515470814517478, + "grad_norm": 3.797351690873165, + "learning_rate": 8.37789661319073e-07, + "logits/chosen": -0.23248498141765594, + "logits/rejected": -0.33860307931900024, + "logps/chosen": -5.063205718994141, + "logps/rejected": -5.013503074645996, + "loss": 0.0581, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -5.063205718994141, + "rewards/margins": -0.049702536314725876, + "rewards/rejected": -5.013503074645996, + "sft_loss": 4.805975437164307, + "step": 470 + }, + { + "epoch": 0.25422311423314936, + "grad_norm": 1.0037746775043357, + "learning_rate": 8.467023172905525e-07, + "logits/chosen": -0.4212326109409332, + "logits/rejected": -0.16105397045612335, + "logps/chosen": -4.638871669769287, + "logps/rejected": -5.172748565673828, + "loss": 0.0534, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.638871669769287, + "rewards/margins": 0.5338773131370544, + "rewards/rejected": -5.172748565673828, + "sft_loss": 4.322568893432617, + "step": 475 + }, + { + "epoch": 0.2568991470145509, + "grad_norm": 1.5393574024949208, + "learning_rate": 8.55614973262032e-07, + "logits/chosen": -0.36025190353393555, + "logits/rejected": -0.09443429112434387, + "logps/chosen": -4.921746730804443, + "logps/rejected": -5.215901851654053, + "loss": 0.0555, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.921746730804443, + "rewards/margins": 0.2941551208496094, + "rewards/rejected": -5.215901851654053, + "sft_loss": 4.582496166229248, + "step": 480 + }, + { + "epoch": 0.2595751797959525, + "grad_norm": 2.359407535802298, + "learning_rate": 8.645276292335115e-07, + "logits/chosen": -0.3606078624725342, + "logits/rejected": -0.2609712481498718, + "logps/chosen": -5.037476062774658, + "logps/rejected": -5.087932586669922, + "loss": 0.0559, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -5.037476062774658, + "rewards/margins": 0.05045701935887337, + "rewards/rejected": -5.087932586669922, + "sft_loss": 4.643454551696777, + "step": 485 + }, + { + "epoch": 0.26225121257735406, + "grad_norm": 0.9031378126507257, + "learning_rate": 8.734402852049911e-07, + "logits/chosen": -0.20180901885032654, + "logits/rejected": -0.12481292337179184, + "logps/chosen": -4.759294033050537, + "logps/rejected": -4.890738010406494, + "loss": 0.0552, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -4.759294033050537, + "rewards/margins": 0.13144339621067047, + "rewards/rejected": -4.890738010406494, + "sft_loss": 4.474959850311279, + "step": 490 + }, + { + "epoch": 0.26492724535875567, + "grad_norm": 0.864766969516503, + "learning_rate": 8.823529411764705e-07, + "logits/chosen": -0.29709574580192566, + "logits/rejected": -0.2729634642601013, + "logps/chosen": -4.82390022277832, + "logps/rejected": -4.918272972106934, + "loss": 0.0563, + "rewards/accuracies": 0.46875, + "rewards/chosen": -4.82390022277832, + "rewards/margins": 0.0943729504942894, + "rewards/rejected": -4.918272972106934, + "sft_loss": 4.600865840911865, + "step": 495 + }, + { + "epoch": 0.2676032781401572, + "grad_norm": 0.6588994171949467, + "learning_rate": 8.912655971479501e-07, + "logits/chosen": -0.37265440821647644, + "logits/rejected": -0.23569945991039276, + "logps/chosen": -4.7590532302856445, + "logps/rejected": -4.979560375213623, + "loss": 0.0546, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -4.7590532302856445, + "rewards/margins": 0.22050723433494568, + "rewards/rejected": -4.979560375213623, + "sft_loss": 4.484901428222656, + "step": 500 + }, + { + "epoch": 0.27027931092155877, + "grad_norm": 0.9600651230526771, + "learning_rate": 9.001782531194295e-07, + "logits/chosen": -0.3922001123428345, + "logits/rejected": -0.2255508154630661, + "logps/chosen": -4.7787580490112305, + "logps/rejected": -4.795541286468506, + "loss": 0.0567, + "rewards/accuracies": 0.53125, + "rewards/chosen": -4.7787580490112305, + "rewards/margins": 0.016783803701400757, + "rewards/rejected": -4.795541286468506, + "sft_loss": 4.484111785888672, + "step": 505 + }, + { + "epoch": 0.2729553437029604, + "grad_norm": 0.8779860995345065, + "learning_rate": 9.09090909090909e-07, + "logits/chosen": -0.25639277696609497, + "logits/rejected": -0.17562855780124664, + "logps/chosen": -4.838879585266113, + "logps/rejected": -5.002278804779053, + "loss": 0.0552, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -4.838879585266113, + "rewards/margins": 0.16339975595474243, + "rewards/rejected": -5.002278804779053, + "sft_loss": 4.533398628234863, + "step": 510 + }, + { + "epoch": 0.2756313764843619, + "grad_norm": 1.6340742571733686, + "learning_rate": 9.180035650623885e-07, + "logits/chosen": -0.29652127623558044, + "logits/rejected": -0.1382482796907425, + "logps/chosen": -4.777513027191162, + "logps/rejected": -4.839566230773926, + "loss": 0.0554, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -4.777513027191162, + "rewards/margins": 0.06205293536186218, + "rewards/rejected": -4.839566230773926, + "sft_loss": 4.462241172790527, + "step": 515 + }, + { + "epoch": 0.27830740926576353, + "grad_norm": 1.467892207273564, + "learning_rate": 9.26916221033868e-07, + "logits/chosen": -0.39073318243026733, + "logits/rejected": -0.17723903059959412, + "logps/chosen": -4.805098533630371, + "logps/rejected": -4.994345188140869, + "loss": 0.055, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.805098533630371, + "rewards/margins": 0.18924656510353088, + "rewards/rejected": -4.994345188140869, + "sft_loss": 4.511242866516113, + "step": 520 + }, + { + "epoch": 0.2809834420471651, + "grad_norm": 0.769155240820834, + "learning_rate": 9.358288770053476e-07, + "logits/chosen": -0.2005387246608734, + "logits/rejected": -0.061540864408016205, + "logps/chosen": -4.82773494720459, + "logps/rejected": -5.0809125900268555, + "loss": 0.0548, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.82773494720459, + "rewards/margins": 0.25317686796188354, + "rewards/rejected": -5.0809125900268555, + "sft_loss": 4.457303047180176, + "step": 525 + }, + { + "epoch": 0.2836594748285666, + "grad_norm": 1.1120839605961124, + "learning_rate": 9.44741532976827e-07, + "logits/chosen": -0.3281249403953552, + "logits/rejected": -0.25439485907554626, + "logps/chosen": -4.870813369750977, + "logps/rejected": -5.039546012878418, + "loss": 0.0562, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.870813369750977, + "rewards/margins": 0.16873237490653992, + "rewards/rejected": -5.039546012878418, + "sft_loss": 4.659360408782959, + "step": 530 + }, + { + "epoch": 0.28633550760996823, + "grad_norm": 2.0952234650377193, + "learning_rate": 9.536541889483066e-07, + "logits/chosen": -0.5139321684837341, + "logits/rejected": -0.1076575517654419, + "logps/chosen": -4.730218410491943, + "logps/rejected": -4.953000068664551, + "loss": 0.0544, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.730218410491943, + "rewards/margins": 0.22278185188770294, + "rewards/rejected": -4.953000068664551, + "sft_loss": 4.485461235046387, + "step": 535 + }, + { + "epoch": 0.2890115403913698, + "grad_norm": 0.9124179505228884, + "learning_rate": 9.62566844919786e-07, + "logits/chosen": -0.3589113652706146, + "logits/rejected": -0.20875516533851624, + "logps/chosen": -4.740080833435059, + "logps/rejected": -4.862841606140137, + "loss": 0.0563, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.740080833435059, + "rewards/margins": 0.12276099622249603, + "rewards/rejected": -4.862841606140137, + "sft_loss": 4.531832218170166, + "step": 540 + }, + { + "epoch": 0.2916875731727714, + "grad_norm": 1.040263178598027, + "learning_rate": 9.714795008912655e-07, + "logits/chosen": -0.39433753490448, + "logits/rejected": -0.11539351940155029, + "logps/chosen": -4.739431858062744, + "logps/rejected": -4.895320415496826, + "loss": 0.0544, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.739431858062744, + "rewards/margins": 0.15588879585266113, + "rewards/rejected": -4.895320415496826, + "sft_loss": 4.4855875968933105, + "step": 545 + }, + { + "epoch": 0.29436360595417294, + "grad_norm": 1.5371896765442068, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": -0.26903194189071655, + "logits/rejected": -0.18110091984272003, + "logps/chosen": -4.590166091918945, + "logps/rejected": -4.735904216766357, + "loss": 0.056, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -4.590166091918945, + "rewards/margins": 0.14573858678340912, + "rewards/rejected": -4.735904216766357, + "sft_loss": 4.3490447998046875, + "step": 550 + }, + { + "epoch": 0.2970396387355745, + "grad_norm": 1.4915643897189883, + "learning_rate": 9.893048128342244e-07, + "logits/chosen": -0.36765116453170776, + "logits/rejected": -0.19273436069488525, + "logps/chosen": -5.064404487609863, + "logps/rejected": -5.279716491699219, + "loss": 0.0561, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -5.064404487609863, + "rewards/margins": 0.2153124362230301, + "rewards/rejected": -5.279716491699219, + "sft_loss": 4.768413543701172, + "step": 555 + }, + { + "epoch": 0.2997156715169761, + "grad_norm": 1.2834538492458512, + "learning_rate": 9.98217468805704e-07, + "logits/chosen": -0.3625454008579254, + "logits/rejected": -0.3016476631164551, + "logps/chosen": -4.831945896148682, + "logps/rejected": -4.9502854347229, + "loss": 0.0557, + "rewards/accuracies": 0.53125, + "rewards/chosen": -4.831945896148682, + "rewards/margins": 0.11833939701318741, + "rewards/rejected": -4.9502854347229, + "sft_loss": 4.503427028656006, + "step": 560 + }, + { + "epoch": 0.30239170429837764, + "grad_norm": 1.6223027628936535, + "learning_rate": 9.999984476788462e-07, + "logits/chosen": -0.3643776476383209, + "logits/rejected": -0.2353856861591339, + "logps/chosen": -4.69890832901001, + "logps/rejected": -4.904583930969238, + "loss": 0.0556, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.69890832901001, + "rewards/margins": 0.20567627251148224, + "rewards/rejected": -4.904583930969238, + "sft_loss": 4.513741493225098, + "step": 565 + }, + { + "epoch": 0.30506773707977924, + "grad_norm": 0.9717056305058467, + "learning_rate": 9.999921413906797e-07, + "logits/chosen": -0.4494122564792633, + "logits/rejected": -0.12291695922613144, + "logps/chosen": -4.675474643707275, + "logps/rejected": -4.861818790435791, + "loss": 0.0549, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.675474643707275, + "rewards/margins": 0.18634450435638428, + "rewards/rejected": -4.861818790435791, + "sft_loss": 4.422656059265137, + "step": 570 + }, + { + "epoch": 0.3077437698611808, + "grad_norm": 0.8102835169481186, + "learning_rate": 9.999809841765644e-07, + "logits/chosen": -0.5083755850791931, + "logits/rejected": -0.468639612197876, + "logps/chosen": -4.939351558685303, + "logps/rejected": -5.073581218719482, + "loss": 0.0562, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.939351558685303, + "rewards/margins": 0.13423016667366028, + "rewards/rejected": -5.073581218719482, + "sft_loss": 4.74433708190918, + "step": 575 + }, + { + "epoch": 0.3104198026425824, + "grad_norm": 0.484057830475299, + "learning_rate": 9.999649761447477e-07, + "logits/chosen": -0.47592979669570923, + "logits/rejected": -0.19933168590068817, + "logps/chosen": -4.55141544342041, + "logps/rejected": -4.823070049285889, + "loss": 0.0549, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.55141544342041, + "rewards/margins": 0.27165549993515015, + "rewards/rejected": -4.823070049285889, + "sft_loss": 4.323666572570801, + "step": 580 + }, + { + "epoch": 0.31309583542398395, + "grad_norm": 0.4954728704364703, + "learning_rate": 9.999441174505398e-07, + "logits/chosen": -0.5497112274169922, + "logits/rejected": -0.40136200189590454, + "logps/chosen": -4.930845737457275, + "logps/rejected": -5.082024574279785, + "loss": 0.0548, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.930845737457275, + "rewards/margins": 0.15117934346199036, + "rewards/rejected": -5.082024574279785, + "sft_loss": 4.561702251434326, + "step": 585 + }, + { + "epoch": 0.3157718682053855, + "grad_norm": 0.8929142904448083, + "learning_rate": 9.999184082963116e-07, + "logits/chosen": -0.5046008825302124, + "logits/rejected": -0.3427330255508423, + "logps/chosen": -4.885519981384277, + "logps/rejected": -5.006654739379883, + "loss": 0.0551, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.885519981384277, + "rewards/margins": 0.12113462388515472, + "rewards/rejected": -5.006654739379883, + "sft_loss": 4.5403523445129395, + "step": 590 + }, + { + "epoch": 0.3184479009867871, + "grad_norm": 0.9240145223243589, + "learning_rate": 9.998878489314937e-07, + "logits/chosen": -0.46767106652259827, + "logits/rejected": -0.23591558635234833, + "logps/chosen": -4.70677375793457, + "logps/rejected": -4.994019508361816, + "loss": 0.055, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.70677375793457, + "rewards/margins": 0.28724604845046997, + "rewards/rejected": -4.994019508361816, + "sft_loss": 4.432908058166504, + "step": 595 + }, + { + "epoch": 0.32112393376818865, + "grad_norm": 1.0018417636595902, + "learning_rate": 9.99852439652573e-07, + "logits/chosen": -0.5504830479621887, + "logits/rejected": -0.3432347774505615, + "logps/chosen": -4.664198875427246, + "logps/rejected": -4.872433662414551, + "loss": 0.0546, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.664198875427246, + "rewards/margins": 0.20823463797569275, + "rewards/rejected": -4.872433662414551, + "sft_loss": 4.430227756500244, + "step": 600 + }, + { + "epoch": 0.32379996654959026, + "grad_norm": 0.6275109758749665, + "learning_rate": 9.998121808030904e-07, + "logits/chosen": -0.6691871881484985, + "logits/rejected": -0.5714275240898132, + "logps/chosen": -4.999661445617676, + "logps/rejected": -5.115338325500488, + "loss": 0.0558, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.999661445617676, + "rewards/margins": 0.11567720025777817, + "rewards/rejected": -5.115338325500488, + "sft_loss": 4.749647617340088, + "step": 605 + }, + { + "epoch": 0.3264759993309918, + "grad_norm": 1.9533627210971916, + "learning_rate": 9.997670727736379e-07, + "logits/chosen": -0.5287994146347046, + "logits/rejected": -0.2409917414188385, + "logps/chosen": -4.611300468444824, + "logps/rejected": -4.752462387084961, + "loss": 0.0548, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.611300468444824, + "rewards/margins": 0.14116191864013672, + "rewards/rejected": -4.752462387084961, + "sft_loss": 4.276651382446289, + "step": 610 + }, + { + "epoch": 0.32915203211239336, + "grad_norm": 1.0181190866876273, + "learning_rate": 9.99717116001853e-07, + "logits/chosen": -0.5561403036117554, + "logits/rejected": -0.4006883502006531, + "logps/chosen": -4.733808517456055, + "logps/rejected": -4.987280368804932, + "loss": 0.0541, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.733808517456055, + "rewards/margins": 0.25347140431404114, + "rewards/rejected": -4.987280368804932, + "sft_loss": 4.515638828277588, + "step": 615 + }, + { + "epoch": 0.33182806489379496, + "grad_norm": 1.8286375874490535, + "learning_rate": 9.996623109724173e-07, + "logits/chosen": -0.36191409826278687, + "logits/rejected": -0.23526597023010254, + "logps/chosen": -4.690080165863037, + "logps/rejected": -4.8197832107543945, + "loss": 0.0553, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.690080165863037, + "rewards/margins": 0.12970253825187683, + "rewards/rejected": -4.8197832107543945, + "sft_loss": 4.448857307434082, + "step": 620 + }, + { + "epoch": 0.3345040976751965, + "grad_norm": 0.8430739768632581, + "learning_rate": 9.996026582170488e-07, + "logits/chosen": -0.40729236602783203, + "logits/rejected": -0.15933868288993835, + "logps/chosen": -4.661129474639893, + "logps/rejected": -5.0198845863342285, + "loss": 0.0534, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.661129474639893, + "rewards/margins": 0.3587549328804016, + "rewards/rejected": -5.0198845863342285, + "sft_loss": 4.314611434936523, + "step": 625 + }, + { + "epoch": 0.3371801304565981, + "grad_norm": 0.41172622641288736, + "learning_rate": 9.995381583144996e-07, + "logits/chosen": -0.41392308473587036, + "logits/rejected": -0.23915371298789978, + "logps/chosen": -4.795115947723389, + "logps/rejected": -5.012903690338135, + "loss": 0.0544, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.795115947723389, + "rewards/margins": 0.2177872359752655, + "rewards/rejected": -5.012903690338135, + "sft_loss": 4.513781547546387, + "step": 630 + }, + { + "epoch": 0.33985616323799966, + "grad_norm": 1.3659635196177828, + "learning_rate": 9.994688118905471e-07, + "logits/chosen": -0.40934377908706665, + "logits/rejected": -0.06164498254656792, + "logps/chosen": -4.684535980224609, + "logps/rejected": -5.066915512084961, + "loss": 0.0539, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.684535980224609, + "rewards/margins": 0.38237953186035156, + "rewards/rejected": -5.066915512084961, + "sft_loss": 4.450982093811035, + "step": 635 + }, + { + "epoch": 0.3425321960194012, + "grad_norm": 0.7655809286311621, + "learning_rate": 9.993946196179912e-07, + "logits/chosen": -0.4428192973136902, + "logits/rejected": -0.13932375609874725, + "logps/chosen": -4.84511661529541, + "logps/rejected": -5.081221103668213, + "loss": 0.055, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.84511661529541, + "rewards/margins": 0.2361036241054535, + "rewards/rejected": -5.081221103668213, + "sft_loss": 4.533552646636963, + "step": 640 + }, + { + "epoch": 0.3452082288008028, + "grad_norm": 0.753608439880354, + "learning_rate": 9.993155822166455e-07, + "logits/chosen": -0.4735592305660248, + "logits/rejected": -0.3594627380371094, + "logps/chosen": -4.647860527038574, + "logps/rejected": -4.909556865692139, + "loss": 0.0545, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.647860527038574, + "rewards/margins": 0.2616958022117615, + "rewards/rejected": -4.909556865692139, + "sft_loss": 4.432980537414551, + "step": 645 + }, + { + "epoch": 0.34788426158220437, + "grad_norm": 0.7532799343685316, + "learning_rate": 9.992317004533313e-07, + "logits/chosen": -0.3656768798828125, + "logits/rejected": -0.24436573684215546, + "logps/chosen": -4.652246952056885, + "logps/rejected": -4.921938896179199, + "loss": 0.0551, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.652246952056885, + "rewards/margins": 0.26969173550605774, + "rewards/rejected": -4.921938896179199, + "sft_loss": 4.465109825134277, + "step": 650 + }, + { + "epoch": 0.350560294363606, + "grad_norm": 0.8544184685749583, + "learning_rate": 9.991429751418696e-07, + "logits/chosen": -0.38019418716430664, + "logits/rejected": -0.35641294717788696, + "logps/chosen": -4.640519618988037, + "logps/rejected": -4.896888732910156, + "loss": 0.0548, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.640519618988037, + "rewards/margins": 0.256369024515152, + "rewards/rejected": -4.896888732910156, + "sft_loss": 4.424017429351807, + "step": 655 + }, + { + "epoch": 0.3532363271450075, + "grad_norm": 0.7903309832011153, + "learning_rate": 9.99049407143074e-07, + "logits/chosen": -0.462789922952652, + "logits/rejected": -0.25584983825683594, + "logps/chosen": -5.043513298034668, + "logps/rejected": -5.086087703704834, + "loss": 0.0565, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -5.043513298034668, + "rewards/margins": 0.04257440194487572, + "rewards/rejected": -5.086087703704834, + "sft_loss": 4.764005661010742, + "step": 660 + }, + { + "epoch": 0.35591235992640907, + "grad_norm": 0.609573225241613, + "learning_rate": 9.989509973647416e-07, + "logits/chosen": -0.44240602850914, + "logits/rejected": -0.2531934976577759, + "logps/chosen": -4.6058125495910645, + "logps/rejected": -4.872105121612549, + "loss": 0.054, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.6058125495910645, + "rewards/margins": 0.26629284024238586, + "rewards/rejected": -4.872105121612549, + "sft_loss": 4.4222412109375, + "step": 665 + }, + { + "epoch": 0.3585883927078107, + "grad_norm": 0.4364736762052332, + "learning_rate": 9.988477467616445e-07, + "logits/chosen": -0.5186976194381714, + "logits/rejected": -0.2430642545223236, + "logps/chosen": -4.617299556732178, + "logps/rejected": -4.82781982421875, + "loss": 0.0548, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.617299556732178, + "rewards/margins": 0.21052002906799316, + "rewards/rejected": -4.82781982421875, + "sft_loss": 4.397097110748291, + "step": 670 + }, + { + "epoch": 0.3612644254892122, + "grad_norm": 1.137938867391629, + "learning_rate": 9.987396563355205e-07, + "logits/chosen": -0.517059326171875, + "logits/rejected": -0.4032202661037445, + "logps/chosen": -4.8206658363342285, + "logps/rejected": -5.0536298751831055, + "loss": 0.0554, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.8206658363342285, + "rewards/margins": 0.2329637110233307, + "rewards/rejected": -5.0536298751831055, + "sft_loss": 4.577258110046387, + "step": 675 + }, + { + "epoch": 0.36394045827061383, + "grad_norm": 1.2897163566434429, + "learning_rate": 9.986267271350631e-07, + "logits/chosen": -0.440659761428833, + "logits/rejected": -0.23207931220531464, + "logps/chosen": -4.67221736907959, + "logps/rejected": -4.837889194488525, + "loss": 0.0558, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.67221736907959, + "rewards/margins": 0.1656724214553833, + "rewards/rejected": -4.837889194488525, + "sft_loss": 4.413691997528076, + "step": 680 + }, + { + "epoch": 0.3666164910520154, + "grad_norm": 0.539702639060486, + "learning_rate": 9.985089602559123e-07, + "logits/chosen": -0.4820149838924408, + "logits/rejected": -0.21975748240947723, + "logps/chosen": -4.768852233886719, + "logps/rejected": -5.050930976867676, + "loss": 0.054, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.768852233886719, + "rewards/margins": 0.28207892179489136, + "rewards/rejected": -5.050930976867676, + "sft_loss": 4.496817588806152, + "step": 685 + }, + { + "epoch": 0.369292523833417, + "grad_norm": 1.5387946865898154, + "learning_rate": 9.983863568406428e-07, + "logits/chosen": -0.35290640592575073, + "logits/rejected": -0.29900383949279785, + "logps/chosen": -4.636789798736572, + "logps/rejected": -4.796220302581787, + "loss": 0.0552, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.636789798736572, + "rewards/margins": 0.1594308465719223, + "rewards/rejected": -4.796220302581787, + "sft_loss": 4.371980667114258, + "step": 690 + }, + { + "epoch": 0.37196855661481854, + "grad_norm": 0.6503780378168439, + "learning_rate": 9.982589180787532e-07, + "logits/chosen": -0.4263625741004944, + "logits/rejected": -0.2828146815299988, + "logps/chosen": -4.904356002807617, + "logps/rejected": -5.096182823181152, + "loss": 0.0549, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.904356002807617, + "rewards/margins": 0.19182677567005157, + "rewards/rejected": -5.096182823181152, + "sft_loss": 4.617617130279541, + "step": 695 + }, + { + "epoch": 0.3746445893962201, + "grad_norm": 0.5447784030210343, + "learning_rate": 9.981266452066553e-07, + "logits/chosen": -0.5558158755302429, + "logits/rejected": -0.32897305488586426, + "logps/chosen": -4.607967853546143, + "logps/rejected": -4.868782043457031, + "loss": 0.0542, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.607967853546143, + "rewards/margins": 0.2608141601085663, + "rewards/rejected": -4.868782043457031, + "sft_loss": 4.378183841705322, + "step": 700 + }, + { + "epoch": 0.3773206221776217, + "grad_norm": 0.6795933899283331, + "learning_rate": 9.979895395076608e-07, + "logits/chosen": -0.502457857131958, + "logits/rejected": -0.19821786880493164, + "logps/chosen": -4.631689071655273, + "logps/rejected": -4.896653175354004, + "loss": 0.0539, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.631689071655273, + "rewards/margins": 0.26496395468711853, + "rewards/rejected": -4.896653175354004, + "sft_loss": 4.34788703918457, + "step": 705 + }, + { + "epoch": 0.37999665495902324, + "grad_norm": 0.6873470967068471, + "learning_rate": 9.9784760231197e-07, + "logits/chosen": -0.5005318522453308, + "logits/rejected": -0.3380716145038605, + "logps/chosen": -4.7272210121154785, + "logps/rejected": -4.948512554168701, + "loss": 0.0545, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.7272210121154785, + "rewards/margins": 0.22129102051258087, + "rewards/rejected": -4.948512554168701, + "sft_loss": 4.455728054046631, + "step": 710 + }, + { + "epoch": 0.38267268774042484, + "grad_norm": 0.38741905047989544, + "learning_rate": 9.97700834996658e-07, + "logits/chosen": -0.5377181768417358, + "logits/rejected": -0.27840954065322876, + "logps/chosen": -4.827449798583984, + "logps/rejected": -5.016432285308838, + "loss": 0.0545, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.827449798583984, + "rewards/margins": 0.18898220360279083, + "rewards/rejected": -5.016432285308838, + "sft_loss": 4.430556297302246, + "step": 715 + }, + { + "epoch": 0.3853487205218264, + "grad_norm": 0.6920780216870869, + "learning_rate": 9.97549238985662e-07, + "logits/chosen": -0.49541395902633667, + "logits/rejected": -0.19519221782684326, + "logps/chosen": -4.899749755859375, + "logps/rejected": -5.18363094329834, + "loss": 0.0544, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.899749755859375, + "rewards/margins": 0.28388163447380066, + "rewards/rejected": -5.18363094329834, + "sft_loss": 4.593430519104004, + "step": 720 + }, + { + "epoch": 0.38802475330322794, + "grad_norm": 0.6183567658360065, + "learning_rate": 9.973928157497674e-07, + "logits/chosen": -0.7071774005889893, + "logits/rejected": -0.45961588621139526, + "logps/chosen": -4.625585079193115, + "logps/rejected": -4.892023086547852, + "loss": 0.0537, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.625585079193115, + "rewards/margins": 0.2664377987384796, + "rewards/rejected": -4.892023086547852, + "sft_loss": 4.314015865325928, + "step": 725 + }, + { + "epoch": 0.39070078608462955, + "grad_norm": 0.8093467088520951, + "learning_rate": 9.972315668065927e-07, + "logits/chosen": -0.6060682535171509, + "logits/rejected": -0.4073333740234375, + "logps/chosen": -4.745782375335693, + "logps/rejected": -4.918501377105713, + "loss": 0.0541, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -4.745782375335693, + "rewards/margins": 0.17271855473518372, + "rewards/rejected": -4.918501377105713, + "sft_loss": 4.408670902252197, + "step": 730 + }, + { + "epoch": 0.3933768188660311, + "grad_norm": 0.5190267369651727, + "learning_rate": 9.97065493720576e-07, + "logits/chosen": -0.6034917235374451, + "logits/rejected": -0.43217557668685913, + "logps/chosen": -4.639141082763672, + "logps/rejected": -4.880422115325928, + "loss": 0.0547, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.639141082763672, + "rewards/margins": 0.24128136038780212, + "rewards/rejected": -4.880422115325928, + "sft_loss": 4.418794631958008, + "step": 735 + }, + { + "epoch": 0.3960528516474327, + "grad_norm": 0.7299713178783463, + "learning_rate": 9.968945981029594e-07, + "logits/chosen": -0.5461141467094421, + "logits/rejected": -0.2863614559173584, + "logps/chosen": -4.790996551513672, + "logps/rejected": -5.0908708572387695, + "loss": 0.0531, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.790996551513672, + "rewards/margins": 0.299874484539032, + "rewards/rejected": -5.0908708572387695, + "sft_loss": 4.418035984039307, + "step": 740 + }, + { + "epoch": 0.39872888442883425, + "grad_norm": 1.357858967463249, + "learning_rate": 9.967188816117726e-07, + "logits/chosen": -0.4381503462791443, + "logits/rejected": -0.24060969054698944, + "logps/chosen": -4.734060287475586, + "logps/rejected": -5.104639530181885, + "loss": 0.0556, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.734060287475586, + "rewards/margins": 0.3705799877643585, + "rewards/rejected": -5.104639530181885, + "sft_loss": 4.420103549957275, + "step": 745 + }, + { + "epoch": 0.4014049172102358, + "grad_norm": 0.5571020275754393, + "learning_rate": 9.965383459518179e-07, + "logits/chosen": -0.6509796977043152, + "logits/rejected": -0.3502160310745239, + "logps/chosen": -5.010281085968018, + "logps/rejected": -5.239097595214844, + "loss": 0.0542, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -5.010281085968018, + "rewards/margins": 0.22881627082824707, + "rewards/rejected": -5.239097595214844, + "sft_loss": 4.609499931335449, + "step": 750 + }, + { + "epoch": 0.4040809499916374, + "grad_norm": 0.5180996933056597, + "learning_rate": 9.963529928746533e-07, + "logits/chosen": -0.5796464681625366, + "logits/rejected": -0.3192821145057678, + "logps/chosen": -4.6411871910095215, + "logps/rejected": -4.946473598480225, + "loss": 0.0545, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.6411871910095215, + "rewards/margins": 0.305286169052124, + "rewards/rejected": -4.946473598480225, + "sft_loss": 4.233515739440918, + "step": 755 + }, + { + "epoch": 0.40675698277303896, + "grad_norm": 0.383125627873452, + "learning_rate": 9.961628241785746e-07, + "logits/chosen": -0.7499212622642517, + "logits/rejected": -0.5933259129524231, + "logps/chosen": -4.820398807525635, + "logps/rejected": -5.086698055267334, + "loss": 0.0541, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.820398807525635, + "rewards/margins": 0.266299307346344, + "rewards/rejected": -5.086698055267334, + "sft_loss": 4.4908623695373535, + "step": 760 + }, + { + "epoch": 0.40943301555444056, + "grad_norm": 0.7665403862710062, + "learning_rate": 9.959678417085998e-07, + "logits/chosen": -0.684373140335083, + "logits/rejected": -0.5438544154167175, + "logps/chosen": -4.870017051696777, + "logps/rejected": -5.046212196350098, + "loss": 0.0552, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.870017051696777, + "rewards/margins": 0.17619472742080688, + "rewards/rejected": -5.046212196350098, + "sft_loss": 4.622311115264893, + "step": 765 + }, + { + "epoch": 0.4121090483358421, + "grad_norm": 0.41415913294882845, + "learning_rate": 9.957680473564493e-07, + "logits/chosen": -0.5799676775932312, + "logits/rejected": -0.32691285014152527, + "logps/chosen": -4.706719398498535, + "logps/rejected": -4.918424129486084, + "loss": 0.0541, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.706719398498535, + "rewards/margins": 0.21170508861541748, + "rewards/rejected": -4.918424129486084, + "sft_loss": 4.271069526672363, + "step": 770 + }, + { + "epoch": 0.41478508111724366, + "grad_norm": 0.443777235736431, + "learning_rate": 9.95563443060529e-07, + "logits/chosen": -0.7746785879135132, + "logits/rejected": -0.47731414437294006, + "logps/chosen": -4.810762405395508, + "logps/rejected": -5.0177717208862305, + "loss": 0.0541, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.810762405395508, + "rewards/margins": 0.2070087492465973, + "rewards/rejected": -5.0177717208862305, + "sft_loss": 4.463230609893799, + "step": 775 + }, + { + "epoch": 0.41746111389864526, + "grad_norm": 0.5265592870739666, + "learning_rate": 9.95354030805911e-07, + "logits/chosen": -0.7950371503829956, + "logits/rejected": -0.5439642071723938, + "logps/chosen": -4.769791603088379, + "logps/rejected": -4.9892578125, + "loss": 0.054, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.769791603088379, + "rewards/margins": 0.21946604549884796, + "rewards/rejected": -4.9892578125, + "sft_loss": 4.466793537139893, + "step": 780 + }, + { + "epoch": 0.4201371466800468, + "grad_norm": 0.8208738399246265, + "learning_rate": 9.951398126243133e-07, + "logits/chosen": -0.6324111223220825, + "logits/rejected": -0.486175000667572, + "logps/chosen": -4.708271026611328, + "logps/rejected": -4.979205131530762, + "loss": 0.0544, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.708271026611328, + "rewards/margins": 0.2709343433380127, + "rewards/rejected": -4.979205131530762, + "sft_loss": 4.414024829864502, + "step": 785 + }, + { + "epoch": 0.4228131794614484, + "grad_norm": 0.7140836228407493, + "learning_rate": 9.94920790594082e-07, + "logits/chosen": -0.6593544483184814, + "logits/rejected": -0.48353734612464905, + "logps/chosen": -4.598227500915527, + "logps/rejected": -4.899634838104248, + "loss": 0.0535, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.598227500915527, + "rewards/margins": 0.30140742659568787, + "rewards/rejected": -4.899634838104248, + "sft_loss": 4.312748908996582, + "step": 790 + }, + { + "epoch": 0.42548921224284997, + "grad_norm": 0.4014903013901931, + "learning_rate": 9.946969668401696e-07, + "logits/chosen": -0.6135523915290833, + "logits/rejected": -0.32322195172309875, + "logps/chosen": -4.550875663757324, + "logps/rejected": -4.979404926300049, + "loss": 0.0535, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.550875663757324, + "rewards/margins": 0.4285293519496918, + "rewards/rejected": -4.979404926300049, + "sft_loss": 4.304534912109375, + "step": 795 + }, + { + "epoch": 0.4281652450242516, + "grad_norm": 1.1777799079408109, + "learning_rate": 9.944683435341155e-07, + "logits/chosen": -0.433992862701416, + "logits/rejected": -0.33607035875320435, + "logps/chosen": -4.869369029998779, + "logps/rejected": -5.201943397521973, + "loss": 0.0537, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.869369029998779, + "rewards/margins": 0.3325735628604889, + "rewards/rejected": -5.201943397521973, + "sft_loss": 4.4167160987854, + "step": 800 + }, + { + "epoch": 0.4281652450242516, + "eval_logits/chosen": 0.0762772411108017, + "eval_logits/rejected": 0.21884751319885254, + "eval_logps/chosen": -4.661361217498779, + "eval_logps/rejected": -4.99029541015625, + "eval_loss": 0.05288328602910042, + "eval_rewards/accuracies": 0.6023738980293274, + "eval_rewards/chosen": -4.661361217498779, + "eval_rewards/margins": 0.32893452048301697, + "eval_rewards/rejected": -4.99029541015625, + "eval_runtime": 43.6891, + "eval_samples_per_second": 30.786, + "eval_sft_loss": 4.132972717285156, + "eval_steps_per_second": 7.714, + "step": 800 + }, + { + "epoch": 0.4308412778056531, + "grad_norm": 1.019461476829721, + "learning_rate": 9.942349228940236e-07, + "logits/chosen": -0.5534158945083618, + "logits/rejected": -0.2658959627151489, + "logps/chosen": -4.665492057800293, + "logps/rejected": -5.222234725952148, + "loss": 0.053, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.665492057800293, + "rewards/margins": 0.5567426681518555, + "rewards/rejected": -5.222234725952148, + "sft_loss": 4.35896635055542, + "step": 805 + }, + { + "epoch": 0.43351731058705467, + "grad_norm": 0.6905383851627591, + "learning_rate": 9.939967071845424e-07, + "logits/chosen": -0.5316272974014282, + "logits/rejected": -0.44879570603370667, + "logps/chosen": -4.868673324584961, + "logps/rejected": -5.094768524169922, + "loss": 0.0559, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.868673324584961, + "rewards/margins": 0.22609524428844452, + "rewards/rejected": -5.094768524169922, + "sft_loss": 4.565918922424316, + "step": 810 + }, + { + "epoch": 0.4361933433684563, + "grad_norm": 0.9640958718568238, + "learning_rate": 9.937536987168413e-07, + "logits/chosen": -0.5573722124099731, + "logits/rejected": -0.34027716517448425, + "logps/chosen": -4.796445846557617, + "logps/rejected": -5.075738430023193, + "loss": 0.0535, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.796445846557617, + "rewards/margins": 0.27929285168647766, + "rewards/rejected": -5.075738430023193, + "sft_loss": 4.393874645233154, + "step": 815 + }, + { + "epoch": 0.4388693761498578, + "grad_norm": 0.45255104653719846, + "learning_rate": 9.935058998485896e-07, + "logits/chosen": -0.5388621687889099, + "logits/rejected": -0.5094455480575562, + "logps/chosen": -4.702640533447266, + "logps/rejected": -4.948770523071289, + "loss": 0.0544, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.702640533447266, + "rewards/margins": 0.24612972140312195, + "rewards/rejected": -4.948770523071289, + "sft_loss": 4.344797134399414, + "step": 820 + }, + { + "epoch": 0.44154540893125943, + "grad_norm": 1.1306242091401009, + "learning_rate": 9.932533129839333e-07, + "logits/chosen": -0.7148083448410034, + "logits/rejected": -0.5382518172264099, + "logps/chosen": -4.7975382804870605, + "logps/rejected": -4.953667640686035, + "loss": 0.0552, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.7975382804870605, + "rewards/margins": 0.15612894296646118, + "rewards/rejected": -4.953667640686035, + "sft_loss": 4.56071138381958, + "step": 825 + }, + { + "epoch": 0.444221441712661, + "grad_norm": 0.6894798373046558, + "learning_rate": 9.929959405734711e-07, + "logits/chosen": -0.533441424369812, + "logits/rejected": -0.3360450863838196, + "logps/chosen": -4.676575660705566, + "logps/rejected": -4.950590133666992, + "loss": 0.0543, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.676575660705566, + "rewards/margins": 0.2740144729614258, + "rewards/rejected": -4.950590133666992, + "sft_loss": 4.489292621612549, + "step": 830 + }, + { + "epoch": 0.44689747449406253, + "grad_norm": 0.5615731774657157, + "learning_rate": 9.927337851142314e-07, + "logits/chosen": -0.535950779914856, + "logits/rejected": -0.3678244650363922, + "logps/chosen": -4.804347991943359, + "logps/rejected": -4.954916954040527, + "loss": 0.0554, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.804347991943359, + "rewards/margins": 0.1505691409111023, + "rewards/rejected": -4.954916954040527, + "sft_loss": 4.574199199676514, + "step": 835 + }, + { + "epoch": 0.44957350727546413, + "grad_norm": 0.5138053605954034, + "learning_rate": 9.924668491496474e-07, + "logits/chosen": -0.598753809928894, + "logits/rejected": -0.30034974217414856, + "logps/chosen": -4.644226551055908, + "logps/rejected": -4.914678573608398, + "loss": 0.0542, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.644226551055908, + "rewards/margins": 0.270452082157135, + "rewards/rejected": -4.914678573608398, + "sft_loss": 4.4098100662231445, + "step": 840 + }, + { + "epoch": 0.4522495400568657, + "grad_norm": 0.3953000760486427, + "learning_rate": 9.92195135269533e-07, + "logits/chosen": -0.571025013923645, + "logits/rejected": -0.4940189719200134, + "logps/chosen": -4.623247146606445, + "logps/rejected": -4.7912702560424805, + "loss": 0.0547, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.623247146606445, + "rewards/margins": 0.16802339255809784, + "rewards/rejected": -4.7912702560424805, + "sft_loss": 4.395881175994873, + "step": 845 + }, + { + "epoch": 0.4549255728382673, + "grad_norm": 0.9784287629098288, + "learning_rate": 9.919186461100574e-07, + "logits/chosen": -0.7064191699028015, + "logits/rejected": -0.5537311434745789, + "logps/chosen": -4.647214889526367, + "logps/rejected": -4.877057075500488, + "loss": 0.0542, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.647214889526367, + "rewards/margins": 0.22984282672405243, + "rewards/rejected": -4.877057075500488, + "sft_loss": 4.406468391418457, + "step": 850 + }, + { + "epoch": 0.45760160561966884, + "grad_norm": 0.6202522928914502, + "learning_rate": 9.9163738435372e-07, + "logits/chosen": -0.6961187124252319, + "logits/rejected": -0.4674125611782074, + "logps/chosen": -4.66948938369751, + "logps/rejected": -5.008526802062988, + "loss": 0.0547, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.66948938369751, + "rewards/margins": 0.33903709053993225, + "rewards/rejected": -5.008526802062988, + "sft_loss": 4.422011852264404, + "step": 855 + }, + { + "epoch": 0.4602776384010704, + "grad_norm": 1.1274872046878186, + "learning_rate": 9.913513527293234e-07, + "logits/chosen": -0.8537279367446899, + "logits/rejected": -0.6118448376655579, + "logps/chosen": -4.752366065979004, + "logps/rejected": -5.068416595458984, + "loss": 0.0541, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.752366065979004, + "rewards/margins": 0.3160504400730133, + "rewards/rejected": -5.068416595458984, + "sft_loss": 4.435427665710449, + "step": 860 + }, + { + "epoch": 0.462953671182472, + "grad_norm": 0.9165527333766942, + "learning_rate": 9.910605540119474e-07, + "logits/chosen": -0.7939995527267456, + "logits/rejected": -0.6034280061721802, + "logps/chosen": -4.908832550048828, + "logps/rejected": -5.1738457679748535, + "loss": 0.0545, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.908832550048828, + "rewards/margins": 0.2650133967399597, + "rewards/rejected": -5.1738457679748535, + "sft_loss": 4.486456394195557, + "step": 865 + }, + { + "epoch": 0.46562970396387354, + "grad_norm": 0.42446011187865473, + "learning_rate": 9.907649910229227e-07, + "logits/chosen": -0.9538064002990723, + "logits/rejected": -0.5534366965293884, + "logps/chosen": -4.690494537353516, + "logps/rejected": -4.987832546234131, + "loss": 0.0538, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.690494537353516, + "rewards/margins": 0.2973388135433197, + "rewards/rejected": -4.987832546234131, + "sft_loss": 4.383624076843262, + "step": 870 + }, + { + "epoch": 0.46830573674527515, + "grad_norm": 0.5446443287919899, + "learning_rate": 9.90464666629803e-07, + "logits/chosen": -0.6814507246017456, + "logits/rejected": -0.545750081539154, + "logps/chosen": -4.67910099029541, + "logps/rejected": -4.943177223205566, + "loss": 0.0551, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.67910099029541, + "rewards/margins": 0.26407718658447266, + "rewards/rejected": -4.943177223205566, + "sft_loss": 4.388939380645752, + "step": 875 + }, + { + "epoch": 0.4709817695266767, + "grad_norm": 0.7810222448754378, + "learning_rate": 9.901595837463363e-07, + "logits/chosen": -0.6461896896362305, + "logits/rejected": -0.390037477016449, + "logps/chosen": -4.738075256347656, + "logps/rejected": -5.075900554656982, + "loss": 0.0543, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.738075256347656, + "rewards/margins": 0.33782586455345154, + "rewards/rejected": -5.075900554656982, + "sft_loss": 4.471536159515381, + "step": 880 + }, + { + "epoch": 0.47365780230807825, + "grad_norm": 0.46368875012727806, + "learning_rate": 9.898497453324384e-07, + "logits/chosen": -0.6110280752182007, + "logits/rejected": -0.501623809337616, + "logps/chosen": -4.8805389404296875, + "logps/rejected": -5.093916893005371, + "loss": 0.054, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.8805389404296875, + "rewards/margins": 0.21337802708148956, + "rewards/rejected": -5.093916893005371, + "sft_loss": 4.527132034301758, + "step": 885 + }, + { + "epoch": 0.47633383508947985, + "grad_norm": 0.5772927826805968, + "learning_rate": 9.895351543941628e-07, + "logits/chosen": -0.6308537721633911, + "logits/rejected": -0.45382413268089294, + "logps/chosen": -4.536052703857422, + "logps/rejected": -4.7554030418396, + "loss": 0.0549, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -4.536052703857422, + "rewards/margins": 0.21935030817985535, + "rewards/rejected": -4.7554030418396, + "sft_loss": 4.236863136291504, + "step": 890 + }, + { + "epoch": 0.4790098678708814, + "grad_norm": 0.5909700614108395, + "learning_rate": 9.892158139836724e-07, + "logits/chosen": -0.667130172252655, + "logits/rejected": -0.5236655473709106, + "logps/chosen": -5.009900093078613, + "logps/rejected": -5.105405330657959, + "loss": 0.0549, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -5.009900093078613, + "rewards/margins": 0.09550575166940689, + "rewards/rejected": -5.105405330657959, + "sft_loss": 4.719082355499268, + "step": 895 + }, + { + "epoch": 0.481685900652283, + "grad_norm": 1.2509012937984993, + "learning_rate": 9.88891727199209e-07, + "logits/chosen": -0.7147120833396912, + "logits/rejected": -0.5922271609306335, + "logps/chosen": -4.690701961517334, + "logps/rejected": -4.896805763244629, + "loss": 0.0538, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.690701961517334, + "rewards/margins": 0.20610396564006805, + "rewards/rejected": -4.896805763244629, + "sft_loss": 4.374290466308594, + "step": 900 + }, + { + "epoch": 0.48436193343368455, + "grad_norm": 0.5020956780576539, + "learning_rate": 9.885628971850641e-07, + "logits/chosen": -0.6695514917373657, + "logits/rejected": -0.40942034125328064, + "logps/chosen": -4.461915016174316, + "logps/rejected": -4.818450927734375, + "loss": 0.0538, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.461915016174316, + "rewards/margins": 0.35653623938560486, + "rewards/rejected": -4.818450927734375, + "sft_loss": 4.175963878631592, + "step": 905 + }, + { + "epoch": 0.48703796621508616, + "grad_norm": 0.2706776295092486, + "learning_rate": 9.882293271315481e-07, + "logits/chosen": -0.7963489294052124, + "logits/rejected": -0.6448882818222046, + "logps/chosen": -4.85056209564209, + "logps/rejected": -5.1240363121032715, + "loss": 0.0541, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.85056209564209, + "rewards/margins": 0.2734738886356354, + "rewards/rejected": -5.1240363121032715, + "sft_loss": 4.56730842590332, + "step": 910 + }, + { + "epoch": 0.4897139989964877, + "grad_norm": 0.7830577955619996, + "learning_rate": 9.878910202749589e-07, + "logits/chosen": -0.7318392992019653, + "logits/rejected": -0.4440035820007324, + "logps/chosen": -4.606733322143555, + "logps/rejected": -4.9441142082214355, + "loss": 0.0535, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.606733322143555, + "rewards/margins": 0.33738085627555847, + "rewards/rejected": -4.9441142082214355, + "sft_loss": 4.38753080368042, + "step": 915 + }, + { + "epoch": 0.49239003177788926, + "grad_norm": 0.5249861703336051, + "learning_rate": 9.875479798975512e-07, + "logits/chosen": -0.5288979411125183, + "logits/rejected": -0.25817009806632996, + "logps/chosen": -4.555031776428223, + "logps/rejected": -4.973117828369141, + "loss": 0.0536, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.555031776428223, + "rewards/margins": 0.41808605194091797, + "rewards/rejected": -4.973117828369141, + "sft_loss": 4.3153204917907715, + "step": 920 + }, + { + "epoch": 0.49506606455929086, + "grad_norm": 0.6958792681399926, + "learning_rate": 9.87200209327504e-07, + "logits/chosen": -0.6519374847412109, + "logits/rejected": -0.3630174994468689, + "logps/chosen": -4.784842014312744, + "logps/rejected": -4.913394927978516, + "loss": 0.0546, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -4.784842014312744, + "rewards/margins": 0.12855303287506104, + "rewards/rejected": -4.913394927978516, + "sft_loss": 4.431909561157227, + "step": 925 + }, + { + "epoch": 0.4977420973406924, + "grad_norm": 0.6857385655509702, + "learning_rate": 9.868477119388894e-07, + "logits/chosen": -0.6547890901565552, + "logits/rejected": -0.5493338704109192, + "logps/chosen": -4.803074836730957, + "logps/rejected": -5.25410795211792, + "loss": 0.0538, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.803074836730957, + "rewards/margins": 0.45103350281715393, + "rewards/rejected": -5.25410795211792, + "sft_loss": 4.5226616859436035, + "step": 930 + }, + { + "epoch": 0.500418130122094, + "grad_norm": 0.7494384327463705, + "learning_rate": 9.864904911516383e-07, + "logits/chosen": -0.5585234761238098, + "logits/rejected": -0.4689217209815979, + "logps/chosen": -4.659033298492432, + "logps/rejected": -4.939300537109375, + "loss": 0.0542, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.659033298492432, + "rewards/margins": 0.28026682138442993, + "rewards/rejected": -4.939300537109375, + "sft_loss": 4.326298713684082, + "step": 935 + }, + { + "epoch": 0.5030941629034956, + "grad_norm": 0.5667236667391294, + "learning_rate": 9.861285504315084e-07, + "logits/chosen": -0.549680233001709, + "logits/rejected": -0.4027118682861328, + "logps/chosen": -4.72601318359375, + "logps/rejected": -5.003291130065918, + "loss": 0.0547, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.72601318359375, + "rewards/margins": 0.2772785723209381, + "rewards/rejected": -5.003291130065918, + "sft_loss": 4.4270710945129395, + "step": 940 + }, + { + "epoch": 0.5057701956848971, + "grad_norm": 0.3704541163727195, + "learning_rate": 9.857618932900502e-07, + "logits/chosen": -0.6717230081558228, + "logits/rejected": -0.4291006624698639, + "logps/chosen": -4.774092197418213, + "logps/rejected": -5.1116228103637695, + "loss": 0.0542, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.774092197418213, + "rewards/margins": 0.3375304341316223, + "rewards/rejected": -5.1116228103637695, + "sft_loss": 4.614851474761963, + "step": 945 + }, + { + "epoch": 0.5084462284662987, + "grad_norm": 1.080406869888111, + "learning_rate": 9.853905232845727e-07, + "logits/chosen": -0.5478076934814453, + "logits/rejected": -0.3193608224391937, + "logps/chosen": -4.520052909851074, + "logps/rejected": -4.752264976501465, + "loss": 0.0547, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.520052909851074, + "rewards/margins": 0.23221249878406525, + "rewards/rejected": -4.752264976501465, + "sft_loss": 4.331027507781982, + "step": 950 + }, + { + "epoch": 0.5111222612477003, + "grad_norm": 0.47093914637526024, + "learning_rate": 9.850144440181095e-07, + "logits/chosen": -0.48740583658218384, + "logits/rejected": -0.2183370292186737, + "logps/chosen": -4.712460994720459, + "logps/rejected": -4.964376926422119, + "loss": 0.0543, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.712460994720459, + "rewards/margins": 0.25191575288772583, + "rewards/rejected": -4.964376926422119, + "sft_loss": 4.447497367858887, + "step": 955 + }, + { + "epoch": 0.5137982940291018, + "grad_norm": 0.9253745426059107, + "learning_rate": 9.846336591393832e-07, + "logits/chosen": -0.45807933807373047, + "logits/rejected": -0.2740600109100342, + "logps/chosen": -4.678889751434326, + "logps/rejected": -4.9867753982543945, + "loss": 0.055, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.678889751434326, + "rewards/margins": 0.30788561701774597, + "rewards/rejected": -4.9867753982543945, + "sft_loss": 4.41696834564209, + "step": 960 + }, + { + "epoch": 0.5164743268105034, + "grad_norm": 0.5562158528705252, + "learning_rate": 9.842481723427704e-07, + "logits/chosen": -0.43384265899658203, + "logits/rejected": -0.3705459535121918, + "logps/chosen": -4.858570098876953, + "logps/rejected": -5.0885772705078125, + "loss": 0.0552, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -4.858570098876953, + "rewards/margins": 0.23000743985176086, + "rewards/rejected": -5.0885772705078125, + "sft_loss": 4.645135402679443, + "step": 965 + }, + { + "epoch": 0.519150359591905, + "grad_norm": 0.46758288708109713, + "learning_rate": 9.838579873682658e-07, + "logits/chosen": -0.4290435314178467, + "logits/rejected": -0.42416420578956604, + "logps/chosen": -4.561893463134766, + "logps/rejected": -4.798999786376953, + "loss": 0.0547, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.561893463134766, + "rewards/margins": 0.2371063530445099, + "rewards/rejected": -4.798999786376953, + "sft_loss": 4.215584754943848, + "step": 970 + }, + { + "epoch": 0.5218263923733065, + "grad_norm": 0.9168000242182311, + "learning_rate": 9.834631080014457e-07, + "logits/chosen": -0.6926376223564148, + "logits/rejected": -0.4112038016319275, + "logps/chosen": -4.897233486175537, + "logps/rejected": -5.223374366760254, + "loss": 0.0536, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.897233486175537, + "rewards/margins": 0.3261413872241974, + "rewards/rejected": -5.223374366760254, + "sft_loss": 4.6407694816589355, + "step": 975 + }, + { + "epoch": 0.5245024251547081, + "grad_norm": 0.8592004725850547, + "learning_rate": 9.830635380734312e-07, + "logits/chosen": -0.6455878615379333, + "logits/rejected": -0.4143894612789154, + "logps/chosen": -4.61000919342041, + "logps/rejected": -4.840395450592041, + "loss": 0.0545, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.61000919342041, + "rewards/margins": 0.23038557171821594, + "rewards/rejected": -4.840395450592041, + "sft_loss": 4.316131591796875, + "step": 980 + }, + { + "epoch": 0.5271784579361097, + "grad_norm": 0.4155059102576832, + "learning_rate": 9.826592814608517e-07, + "logits/chosen": -0.5832679867744446, + "logits/rejected": -0.30334407091140747, + "logps/chosen": -4.680327415466309, + "logps/rejected": -4.880679130554199, + "loss": 0.0543, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.680327415466309, + "rewards/margins": 0.20035116374492645, + "rewards/rejected": -4.880679130554199, + "sft_loss": 4.373396873474121, + "step": 985 + }, + { + "epoch": 0.5298544907175113, + "grad_norm": 0.6151022174046136, + "learning_rate": 9.822503420858067e-07, + "logits/chosen": -0.5774039030075073, + "logits/rejected": -0.5873244404792786, + "logps/chosen": -4.89186429977417, + "logps/rejected": -5.018913269042969, + "loss": 0.0546, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -4.89186429977417, + "rewards/margins": 0.12704893946647644, + "rewards/rejected": -5.018913269042969, + "sft_loss": 4.576397895812988, + "step": 990 + }, + { + "epoch": 0.5325305234989128, + "grad_norm": 0.9617406539039807, + "learning_rate": 9.818367239158277e-07, + "logits/chosen": -0.4993751645088196, + "logits/rejected": -0.41326743364334106, + "logps/chosen": -4.793762683868408, + "logps/rejected": -4.96630334854126, + "loss": 0.0542, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.793762683868408, + "rewards/margins": 0.1725403368473053, + "rewards/rejected": -4.96630334854126, + "sft_loss": 4.41725492477417, + "step": 995 + }, + { + "epoch": 0.5352065562803144, + "grad_norm": 0.9060064144194804, + "learning_rate": 9.8141843096384e-07, + "logits/chosen": -0.5664402842521667, + "logits/rejected": -0.34098461270332336, + "logps/chosen": -4.702371120452881, + "logps/rejected": -5.04431676864624, + "loss": 0.0542, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.702371120452881, + "rewards/margins": 0.3419460654258728, + "rewards/rejected": -5.04431676864624, + "sft_loss": 4.421466827392578, + "step": 1000 + }, + { + "epoch": 0.537882589061716, + "grad_norm": 0.5086244800614527, + "learning_rate": 9.809954672881237e-07, + "logits/chosen": -0.46155256032943726, + "logits/rejected": -0.2267765998840332, + "logps/chosen": -4.836634159088135, + "logps/rejected": -5.148859024047852, + "loss": 0.0543, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.836634159088135, + "rewards/margins": 0.3122252821922302, + "rewards/rejected": -5.148859024047852, + "sft_loss": 4.532876968383789, + "step": 1005 + }, + { + "epoch": 0.5405586218431175, + "grad_norm": 0.5176592683524385, + "learning_rate": 9.80567836992274e-07, + "logits/chosen": -0.47870248556137085, + "logits/rejected": -0.22733981907367706, + "logps/chosen": -4.524051666259766, + "logps/rejected": -4.881123065948486, + "loss": 0.0549, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.524051666259766, + "rewards/margins": 0.35707104206085205, + "rewards/rejected": -4.881123065948486, + "sft_loss": 4.257218837738037, + "step": 1010 + }, + { + "epoch": 0.5432346546245191, + "grad_norm": 0.6158973094099502, + "learning_rate": 9.801355442251625e-07, + "logits/chosen": -0.7883400321006775, + "logits/rejected": -0.5459173917770386, + "logps/chosen": -4.803179740905762, + "logps/rejected": -5.041607856750488, + "loss": 0.0545, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.803179740905762, + "rewards/margins": 0.23842862248420715, + "rewards/rejected": -5.041607856750488, + "sft_loss": 4.608723163604736, + "step": 1015 + }, + { + "epoch": 0.5459106874059207, + "grad_norm": 0.38857303823842887, + "learning_rate": 9.796985931808949e-07, + "logits/chosen": -0.8481922149658203, + "logits/rejected": -0.5922120809555054, + "logps/chosen": -4.708874702453613, + "logps/rejected": -4.960658073425293, + "loss": 0.0537, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.708874702453613, + "rewards/margins": 0.25178369879722595, + "rewards/rejected": -4.960658073425293, + "sft_loss": 4.406599521636963, + "step": 1020 + }, + { + "epoch": 0.5485867201873222, + "grad_norm": 0.37995724426168664, + "learning_rate": 9.792569880987724e-07, + "logits/chosen": -0.7771550416946411, + "logits/rejected": -0.6147995591163635, + "logps/chosen": -4.492480754852295, + "logps/rejected": -4.806388854980469, + "loss": 0.0536, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.492480754852295, + "rewards/margins": 0.3139081597328186, + "rewards/rejected": -4.806388854980469, + "sft_loss": 4.236155986785889, + "step": 1025 + }, + { + "epoch": 0.5512627529687238, + "grad_norm": 0.5433034173994814, + "learning_rate": 9.788107332632493e-07, + "logits/chosen": -0.7106307744979858, + "logits/rejected": -0.619476318359375, + "logps/chosen": -4.824548244476318, + "logps/rejected": -4.9400177001953125, + "loss": 0.0569, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.824548244476318, + "rewards/margins": 0.11546945571899414, + "rewards/rejected": -4.9400177001953125, + "sft_loss": 4.580724239349365, + "step": 1030 + }, + { + "epoch": 0.5539387857501255, + "grad_norm": 0.39975295156815127, + "learning_rate": 9.783598330038924e-07, + "logits/chosen": -0.7352392077445984, + "logits/rejected": -0.5586301684379578, + "logps/chosen": -4.770339488983154, + "logps/rejected": -4.9732537269592285, + "loss": 0.0541, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.770339488983154, + "rewards/margins": 0.20291383564472198, + "rewards/rejected": -4.9732537269592285, + "sft_loss": 4.535826206207275, + "step": 1035 + }, + { + "epoch": 0.5566148185315271, + "grad_norm": 0.5679160526366398, + "learning_rate": 9.779042916953376e-07, + "logits/chosen": -0.6250237226486206, + "logits/rejected": -0.35187873244285583, + "logps/chosen": -4.4509196281433105, + "logps/rejected": -4.852322578430176, + "loss": 0.0538, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.4509196281433105, + "rewards/margins": 0.4014025330543518, + "rewards/rejected": -4.852322578430176, + "sft_loss": 4.283907413482666, + "step": 1040 + }, + { + "epoch": 0.5592908513129285, + "grad_norm": 0.45573423362566345, + "learning_rate": 9.774441137572487e-07, + "logits/chosen": -0.6780288219451904, + "logits/rejected": -0.4629104733467102, + "logps/chosen": -4.64642858505249, + "logps/rejected": -4.976282596588135, + "loss": 0.0532, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.64642858505249, + "rewards/margins": 0.32985401153564453, + "rewards/rejected": -4.976282596588135, + "sft_loss": 4.352892875671387, + "step": 1045 + }, + { + "epoch": 0.5619668840943302, + "grad_norm": 0.422512298556522, + "learning_rate": 9.76979303654274e-07, + "logits/chosen": -0.6532023549079895, + "logits/rejected": -0.502606213092804, + "logps/chosen": -4.712841033935547, + "logps/rejected": -5.147298336029053, + "loss": 0.0539, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.712841033935547, + "rewards/margins": 0.434457391500473, + "rewards/rejected": -5.147298336029053, + "sft_loss": 4.4879865646362305, + "step": 1050 + }, + { + "epoch": 0.5646429168757318, + "grad_norm": 0.8131359180672422, + "learning_rate": 9.765098658960035e-07, + "logits/chosen": -0.5284560918807983, + "logits/rejected": -0.4684422016143799, + "logps/chosen": -4.553152561187744, + "logps/rejected": -4.902011871337891, + "loss": 0.0535, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.553152561187744, + "rewards/margins": 0.3488597273826599, + "rewards/rejected": -4.902011871337891, + "sft_loss": 4.281155586242676, + "step": 1055 + }, + { + "epoch": 0.5673189496571333, + "grad_norm": 0.9015355821027234, + "learning_rate": 9.76035805036924e-07, + "logits/chosen": -0.518097996711731, + "logits/rejected": -0.28143590688705444, + "logps/chosen": -4.709748268127441, + "logps/rejected": -5.019460201263428, + "loss": 0.0537, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.709748268127441, + "rewards/margins": 0.30971240997314453, + "rewards/rejected": -5.019460201263428, + "sft_loss": 4.399357795715332, + "step": 1060 + }, + { + "epoch": 0.5699949824385349, + "grad_norm": 0.749882232299179, + "learning_rate": 9.755571256763764e-07, + "logits/chosen": -0.4936675429344177, + "logits/rejected": -0.32630711793899536, + "logps/chosen": -4.695061683654785, + "logps/rejected": -5.094540596008301, + "loss": 0.0531, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.695061683654785, + "rewards/margins": 0.3994784355163574, + "rewards/rejected": -5.094540596008301, + "sft_loss": 4.374762535095215, + "step": 1065 + }, + { + "epoch": 0.5726710152199365, + "grad_norm": 0.4335609502427654, + "learning_rate": 9.750738324585097e-07, + "logits/chosen": -0.5365744233131409, + "logits/rejected": -0.21823947131633759, + "logps/chosen": -4.555490970611572, + "logps/rejected": -4.938295364379883, + "loss": 0.0536, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.555490970611572, + "rewards/margins": 0.3828045725822449, + "rewards/rejected": -4.938295364379883, + "sft_loss": 4.294363975524902, + "step": 1070 + }, + { + "epoch": 0.5753470480013381, + "grad_norm": 0.5841385822714422, + "learning_rate": 9.74585930072237e-07, + "logits/chosen": -0.5023024082183838, + "logits/rejected": -0.32627633213996887, + "logps/chosen": -4.687676906585693, + "logps/rejected": -5.107883453369141, + "loss": 0.0534, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.687676906585693, + "rewards/margins": 0.42020702362060547, + "rewards/rejected": -5.107883453369141, + "sft_loss": 4.394619464874268, + "step": 1075 + }, + { + "epoch": 0.5780230807827396, + "grad_norm": 0.3573018957478717, + "learning_rate": 9.740934232511892e-07, + "logits/chosen": -0.6413004398345947, + "logits/rejected": -0.5170631408691406, + "logps/chosen": -4.733739376068115, + "logps/rejected": -4.969666957855225, + "loss": 0.0544, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.733739376068115, + "rewards/margins": 0.23592762649059296, + "rewards/rejected": -4.969666957855225, + "sft_loss": 4.418457984924316, + "step": 1080 + }, + { + "epoch": 0.5806991135641412, + "grad_norm": 0.9387854678624943, + "learning_rate": 9.735963167736698e-07, + "logits/chosen": -0.5804794430732727, + "logits/rejected": -0.3907211422920227, + "logps/chosen": -4.788337707519531, + "logps/rejected": -5.068944454193115, + "loss": 0.0541, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.788337707519531, + "rewards/margins": 0.28060680627822876, + "rewards/rejected": -5.068944454193115, + "sft_loss": 4.502654075622559, + "step": 1085 + }, + { + "epoch": 0.5833751463455428, + "grad_norm": 0.574041624870192, + "learning_rate": 9.730946154626078e-07, + "logits/chosen": -0.5329641699790955, + "logits/rejected": -0.40476202964782715, + "logps/chosen": -4.6259355545043945, + "logps/rejected": -4.907242774963379, + "loss": 0.0537, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.6259355545043945, + "rewards/margins": 0.2813071608543396, + "rewards/rejected": -4.907242774963379, + "sft_loss": 4.300456523895264, + "step": 1090 + }, + { + "epoch": 0.5860511791269443, + "grad_norm": 0.7365756470251096, + "learning_rate": 9.725883241855117e-07, + "logits/chosen": -0.7071422338485718, + "logits/rejected": -0.5051488876342773, + "logps/chosen": -4.796841621398926, + "logps/rejected": -5.196400165557861, + "loss": 0.0542, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.796841621398926, + "rewards/margins": 0.39955854415893555, + "rewards/rejected": -5.196400165557861, + "sft_loss": 4.557826995849609, + "step": 1095 + }, + { + "epoch": 0.5887272119083459, + "grad_norm": 0.5917366709488491, + "learning_rate": 9.720774478544218e-07, + "logits/chosen": -0.4832594394683838, + "logits/rejected": -0.29662543535232544, + "logps/chosen": -4.595888614654541, + "logps/rejected": -4.967526435852051, + "loss": 0.0533, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.595888614654541, + "rewards/margins": 0.3716380000114441, + "rewards/rejected": -4.967526435852051, + "sft_loss": 4.249319553375244, + "step": 1100 + }, + { + "epoch": 0.5914032446897475, + "grad_norm": 0.3847051965590612, + "learning_rate": 9.715619914258624e-07, + "logits/chosen": -0.6779504418373108, + "logits/rejected": -0.5655364394187927, + "logps/chosen": -4.718003273010254, + "logps/rejected": -4.955427646636963, + "loss": 0.0536, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.718003273010254, + "rewards/margins": 0.2374243438243866, + "rewards/rejected": -4.955427646636963, + "sft_loss": 4.384034156799316, + "step": 1105 + }, + { + "epoch": 0.594079277471149, + "grad_norm": 0.543269055034267, + "learning_rate": 9.710419599007937e-07, + "logits/chosen": -0.592439591884613, + "logits/rejected": -0.3935183882713318, + "logps/chosen": -4.704367637634277, + "logps/rejected": -4.9490461349487305, + "loss": 0.0542, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.704367637634277, + "rewards/margins": 0.24467873573303223, + "rewards/rejected": -4.9490461349487305, + "sft_loss": 4.478358745574951, + "step": 1110 + }, + { + "epoch": 0.5967553102525506, + "grad_norm": 0.40955078166194536, + "learning_rate": 9.705173583245643e-07, + "logits/chosen": -0.6035395860671997, + "logits/rejected": -0.35157328844070435, + "logps/chosen": -4.598028659820557, + "logps/rejected": -4.925137042999268, + "loss": 0.0535, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.598028659820557, + "rewards/margins": 0.32710808515548706, + "rewards/rejected": -4.925137042999268, + "sft_loss": 4.3422746658325195, + "step": 1115 + }, + { + "epoch": 0.5994313430339522, + "grad_norm": 0.2948789244920242, + "learning_rate": 9.699881917868609e-07, + "logits/chosen": -0.6719862222671509, + "logits/rejected": -0.5192317962646484, + "logps/chosen": -4.481719970703125, + "logps/rejected": -4.777043342590332, + "loss": 0.0535, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.481719970703125, + "rewards/margins": 0.29532328248023987, + "rewards/rejected": -4.777043342590332, + "sft_loss": 4.248114585876465, + "step": 1120 + }, + { + "epoch": 0.6021073758153538, + "grad_norm": 0.8245398173478702, + "learning_rate": 9.694544654216594e-07, + "logits/chosen": -0.6878781318664551, + "logits/rejected": -0.4318612515926361, + "logps/chosen": -4.79897403717041, + "logps/rejected": -5.210911750793457, + "loss": 0.0535, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.79897403717041, + "rewards/margins": 0.4119381308555603, + "rewards/rejected": -5.210911750793457, + "sft_loss": 4.552307605743408, + "step": 1125 + }, + { + "epoch": 0.6047834085967553, + "grad_norm": 1.14084627054799, + "learning_rate": 9.689161844071755e-07, + "logits/chosen": -0.3799501657485962, + "logits/rejected": -0.2871240973472595, + "logps/chosen": -4.454643726348877, + "logps/rejected": -4.7599263191223145, + "loss": 0.0536, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.454643726348877, + "rewards/margins": 0.3052830696105957, + "rewards/rejected": -4.7599263191223145, + "sft_loss": 4.177380084991455, + "step": 1130 + }, + { + "epoch": 0.6074594413781569, + "grad_norm": 0.8660037391087473, + "learning_rate": 9.683733539658138e-07, + "logits/chosen": -0.5278415083885193, + "logits/rejected": -0.268408864736557, + "logps/chosen": -4.686205863952637, + "logps/rejected": -5.040558815002441, + "loss": 0.0532, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.686205863952637, + "rewards/margins": 0.3543532192707062, + "rewards/rejected": -5.040558815002441, + "sft_loss": 4.330956935882568, + "step": 1135 + }, + { + "epoch": 0.6101354741595585, + "grad_norm": 0.4970819248895511, + "learning_rate": 9.678259793641178e-07, + "logits/chosen": -0.48409318923950195, + "logits/rejected": -0.44155415892601013, + "logps/chosen": -4.872367858886719, + "logps/rejected": -5.101282119750977, + "loss": 0.0547, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.872367858886719, + "rewards/margins": 0.22891390323638916, + "rewards/rejected": -5.101282119750977, + "sft_loss": 4.596518039703369, + "step": 1140 + }, + { + "epoch": 0.61281150694096, + "grad_norm": 0.5812923017369568, + "learning_rate": 9.672740659127183e-07, + "logits/chosen": -0.5782763361930847, + "logits/rejected": -0.40878796577453613, + "logps/chosen": -4.521733283996582, + "logps/rejected": -4.903876304626465, + "loss": 0.0529, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.521733283996582, + "rewards/margins": 0.382142573595047, + "rewards/rejected": -4.903876304626465, + "sft_loss": 4.200924396514893, + "step": 1145 + }, + { + "epoch": 0.6154875397223616, + "grad_norm": 0.44409718097577794, + "learning_rate": 9.667176189662818e-07, + "logits/chosen": -0.5531161427497864, + "logits/rejected": -0.3877137005329132, + "logps/chosen": -4.67879581451416, + "logps/rejected": -4.9793853759765625, + "loss": 0.0538, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.67879581451416, + "rewards/margins": 0.3005899488925934, + "rewards/rejected": -4.9793853759765625, + "sft_loss": 4.385974407196045, + "step": 1150 + }, + { + "epoch": 0.6181635725037632, + "grad_norm": 0.5761368695976793, + "learning_rate": 9.661566439234592e-07, + "logits/chosen": -0.4771701395511627, + "logits/rejected": -0.3725683093070984, + "logps/chosen": -4.723606586456299, + "logps/rejected": -4.984592914581299, + "loss": 0.0541, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.723606586456299, + "rewards/margins": 0.26098623871803284, + "rewards/rejected": -4.984592914581299, + "sft_loss": 4.454628944396973, + "step": 1155 + }, + { + "epoch": 0.6208396052851648, + "grad_norm": 0.5556713770938857, + "learning_rate": 9.655911462268327e-07, + "logits/chosen": -0.4481441378593445, + "logits/rejected": -0.30789104104042053, + "logps/chosen": -4.546295166015625, + "logps/rejected": -4.988908767700195, + "loss": 0.0526, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.546295166015625, + "rewards/margins": 0.44261303544044495, + "rewards/rejected": -4.988908767700195, + "sft_loss": 4.256467819213867, + "step": 1160 + }, + { + "epoch": 0.6235156380665663, + "grad_norm": 0.4875769722274501, + "learning_rate": 9.650211313628636e-07, + "logits/chosen": -0.5960179567337036, + "logits/rejected": -0.4614837169647217, + "logps/chosen": -4.707175254821777, + "logps/rejected": -4.891497611999512, + "loss": 0.0553, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.707175254821777, + "rewards/margins": 0.18432240188121796, + "rewards/rejected": -4.891497611999512, + "sft_loss": 4.4733428955078125, + "step": 1165 + }, + { + "epoch": 0.6261916708479679, + "grad_norm": 0.4014279228736453, + "learning_rate": 9.644466048618386e-07, + "logits/chosen": -0.6922046542167664, + "logits/rejected": -0.4939608573913574, + "logps/chosen": -5.006098747253418, + "logps/rejected": -5.234212398529053, + "loss": 0.0542, + "rewards/accuracies": 0.59375, + "rewards/chosen": -5.006098747253418, + "rewards/margins": 0.22811400890350342, + "rewards/rejected": -5.234212398529053, + "sft_loss": 4.620914459228516, + "step": 1170 + }, + { + "epoch": 0.6288677036293695, + "grad_norm": 0.8535156242979435, + "learning_rate": 9.63867572297816e-07, + "logits/chosen": -0.705421507358551, + "logits/rejected": -0.4823324680328369, + "logps/chosen": -4.5951361656188965, + "logps/rejected": -4.936031818389893, + "loss": 0.0543, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.5951361656188965, + "rewards/margins": 0.34089553356170654, + "rewards/rejected": -4.936031818389893, + "sft_loss": 4.360302925109863, + "step": 1175 + }, + { + "epoch": 0.631543736410771, + "grad_norm": 0.30103834357953974, + "learning_rate": 9.632840392885727e-07, + "logits/chosen": -0.7017086744308472, + "logits/rejected": -0.4853813648223877, + "logps/chosen": -4.485077857971191, + "logps/rejected": -4.886071681976318, + "loss": 0.0536, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.485077857971191, + "rewards/margins": 0.4009944498538971, + "rewards/rejected": -4.886071681976318, + "sft_loss": 4.283857822418213, + "step": 1180 + }, + { + "epoch": 0.6342197691921726, + "grad_norm": 0.4847167733676787, + "learning_rate": 9.626960114955483e-07, + "logits/chosen": -0.6630807518959045, + "logits/rejected": -0.46446362137794495, + "logps/chosen": -4.6696085929870605, + "logps/rejected": -5.147528648376465, + "loss": 0.0533, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.6696085929870605, + "rewards/margins": 0.47792062163352966, + "rewards/rejected": -5.147528648376465, + "sft_loss": 4.391916751861572, + "step": 1185 + }, + { + "epoch": 0.6368958019735742, + "grad_norm": 0.6465226432916064, + "learning_rate": 9.621034946237909e-07, + "logits/chosen": -0.7046906352043152, + "logits/rejected": -0.514801025390625, + "logps/chosen": -4.797091007232666, + "logps/rejected": -5.1934494972229, + "loss": 0.0538, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.797091007232666, + "rewards/margins": 0.3963584899902344, + "rewards/rejected": -5.1934494972229, + "sft_loss": 4.548532485961914, + "step": 1190 + }, + { + "epoch": 0.6395718347549757, + "grad_norm": 0.6053610691894898, + "learning_rate": 9.615064944219021e-07, + "logits/chosen": -0.5024127960205078, + "logits/rejected": -0.33732375502586365, + "logps/chosen": -4.484908103942871, + "logps/rejected": -4.776598930358887, + "loss": 0.0528, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.484908103942871, + "rewards/margins": 0.2916913628578186, + "rewards/rejected": -4.776598930358887, + "sft_loss": 4.1641082763671875, + "step": 1195 + }, + { + "epoch": 0.6422478675363773, + "grad_norm": 1.0983117178191424, + "learning_rate": 9.609050166819803e-07, + "logits/chosen": -0.49139395356178284, + "logits/rejected": -0.4269639551639557, + "logps/chosen": -4.553410530090332, + "logps/rejected": -4.853245735168457, + "loss": 0.0545, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.553410530090332, + "rewards/margins": 0.29983487725257874, + "rewards/rejected": -4.853245735168457, + "sft_loss": 4.232519149780273, + "step": 1200 + }, + { + "epoch": 0.6422478675363773, + "eval_logits/chosen": -0.025698307901620865, + "eval_logits/rejected": 0.09142318367958069, + "eval_logps/chosen": -4.657962799072266, + "eval_logps/rejected": -5.0485663414001465, + "eval_loss": 0.05232247710227966, + "eval_rewards/accuracies": 0.6350148320198059, + "eval_rewards/chosen": -4.657962799072266, + "eval_rewards/margins": 0.390603631734848, + "eval_rewards/rejected": -5.0485663414001465, + "eval_runtime": 43.3819, + "eval_samples_per_second": 31.004, + "eval_sft_loss": 4.285591125488281, + "eval_steps_per_second": 7.768, + "step": 1200 + }, + { + "epoch": 0.6449239003177789, + "grad_norm": 0.6598117160164223, + "learning_rate": 9.602990672395653e-07, + "logits/chosen": -0.6095829010009766, + "logits/rejected": -0.38074302673339844, + "logps/chosen": -4.750426292419434, + "logps/rejected": -5.1771368980407715, + "loss": 0.0529, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.750426292419434, + "rewards/margins": 0.42671045660972595, + "rewards/rejected": -5.1771368980407715, + "sft_loss": 4.473883628845215, + "step": 1205 + }, + { + "epoch": 0.6475999330991805, + "grad_norm": 0.5694728776522204, + "learning_rate": 9.59688651973581e-07, + "logits/chosen": -0.5579411387443542, + "logits/rejected": -0.27562081813812256, + "logps/chosen": -4.6332831382751465, + "logps/rejected": -5.050353050231934, + "loss": 0.0541, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.6332831382751465, + "rewards/margins": 0.4170694351196289, + "rewards/rejected": -5.050353050231934, + "sft_loss": 4.405447959899902, + "step": 1210 + }, + { + "epoch": 0.650275965880582, + "grad_norm": 1.128531677554023, + "learning_rate": 9.590737768062792e-07, + "logits/chosen": -0.5781430602073669, + "logits/rejected": -0.4014016091823578, + "logps/chosen": -4.605533599853516, + "logps/rejected": -4.950179100036621, + "loss": 0.0537, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.605533599853516, + "rewards/margins": 0.34464550018310547, + "rewards/rejected": -4.950179100036621, + "sft_loss": 4.295781135559082, + "step": 1215 + }, + { + "epoch": 0.6529519986619836, + "grad_norm": 0.44594614543377137, + "learning_rate": 9.584544477031816e-07, + "logits/chosen": -0.3928416967391968, + "logits/rejected": -0.23103955388069153, + "logps/chosen": -4.714247703552246, + "logps/rejected": -5.023430347442627, + "loss": 0.0537, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.714247703552246, + "rewards/margins": 0.30918246507644653, + "rewards/rejected": -5.023430347442627, + "sft_loss": 4.452761650085449, + "step": 1220 + }, + { + "epoch": 0.6556280314433852, + "grad_norm": 0.5839455517038359, + "learning_rate": 9.578306706730215e-07, + "logits/chosen": -0.6410695314407349, + "logits/rejected": -0.3730124831199646, + "logps/chosen": -4.631039619445801, + "logps/rejected": -4.9336748123168945, + "loss": 0.0531, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.631039619445801, + "rewards/margins": 0.3026350438594818, + "rewards/rejected": -4.9336748123168945, + "sft_loss": 4.3163886070251465, + "step": 1225 + }, + { + "epoch": 0.6583040642247867, + "grad_norm": 0.643742164156604, + "learning_rate": 9.572024517676865e-07, + "logits/chosen": -0.5921844840049744, + "logits/rejected": -0.46048134565353394, + "logps/chosen": -4.5794901847839355, + "logps/rejected": -4.9018235206604, + "loss": 0.055, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.5794901847839355, + "rewards/margins": 0.3223329186439514, + "rewards/rejected": -4.9018235206604, + "sft_loss": 4.405655860900879, + "step": 1230 + }, + { + "epoch": 0.6609800970061883, + "grad_norm": 0.45905634809888224, + "learning_rate": 9.565697970821593e-07, + "logits/chosen": -0.6735567450523376, + "logits/rejected": -0.4286714196205139, + "logps/chosen": -4.814480781555176, + "logps/rejected": -5.108050346374512, + "loss": 0.0536, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.814480781555176, + "rewards/margins": 0.2935686707496643, + "rewards/rejected": -5.108050346374512, + "sft_loss": 4.5338850021362305, + "step": 1235 + }, + { + "epoch": 0.6636561297875899, + "grad_norm": 0.3718124611962775, + "learning_rate": 9.559327127544585e-07, + "logits/chosen": -0.7341528534889221, + "logits/rejected": -0.5741795897483826, + "logps/chosen": -4.655917167663574, + "logps/rejected": -4.946690559387207, + "loss": 0.0536, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.655917167663574, + "rewards/margins": 0.2907727062702179, + "rewards/rejected": -4.946690559387207, + "sft_loss": 4.393622398376465, + "step": 1240 + }, + { + "epoch": 0.6663321625689914, + "grad_norm": 0.47313857382521357, + "learning_rate": 9.552912049655789e-07, + "logits/chosen": -0.5752710103988647, + "logits/rejected": -0.3442533612251282, + "logps/chosen": -4.453244209289551, + "logps/rejected": -4.790841102600098, + "loss": 0.0541, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.453244209289551, + "rewards/margins": 0.3375973701477051, + "rewards/rejected": -4.790841102600098, + "sft_loss": 4.221789836883545, + "step": 1245 + }, + { + "epoch": 0.669008195350393, + "grad_norm": 0.3080289767371704, + "learning_rate": 9.546452799394315e-07, + "logits/chosen": -0.6375377178192139, + "logits/rejected": -0.35768041014671326, + "logps/chosen": -4.7251386642456055, + "logps/rejected": -5.06402063369751, + "loss": 0.0537, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.7251386642456055, + "rewards/margins": 0.3388821482658386, + "rewards/rejected": -5.06402063369751, + "sft_loss": 4.505545616149902, + "step": 1250 + }, + { + "epoch": 0.6716842281317946, + "grad_norm": 0.46157795630326126, + "learning_rate": 9.539949439427846e-07, + "logits/chosen": -0.5481323003768921, + "logits/rejected": -0.40522176027297974, + "logps/chosen": -4.713761329650879, + "logps/rejected": -5.103206157684326, + "loss": 0.0543, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.713761329650879, + "rewards/margins": 0.38944488763809204, + "rewards/rejected": -5.103206157684326, + "sft_loss": 4.457167148590088, + "step": 1255 + }, + { + "epoch": 0.6743602609131962, + "grad_norm": 0.7554972971844888, + "learning_rate": 9.533402032852002e-07, + "logits/chosen": -0.5324119329452515, + "logits/rejected": -0.3285521864891052, + "logps/chosen": -4.660325050354004, + "logps/rejected": -5.11639404296875, + "loss": 0.053, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.660325050354004, + "rewards/margins": 0.45606860518455505, + "rewards/rejected": -5.11639404296875, + "sft_loss": 4.358373641967773, + "step": 1260 + }, + { + "epoch": 0.6770362936945977, + "grad_norm": 0.5334990626640641, + "learning_rate": 9.526810643189754e-07, + "logits/chosen": -0.4682994484901428, + "logits/rejected": -0.20002928376197815, + "logps/chosen": -4.4835896492004395, + "logps/rejected": -4.984886646270752, + "loss": 0.053, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.4835896492004395, + "rewards/margins": 0.501296877861023, + "rewards/rejected": -4.984886646270752, + "sft_loss": 4.2979960441589355, + "step": 1265 + }, + { + "epoch": 0.6797123264759993, + "grad_norm": 0.43311466275245447, + "learning_rate": 9.52017533439079e-07, + "logits/chosen": -0.6111350655555725, + "logits/rejected": -0.5189183950424194, + "logps/chosen": -4.642319679260254, + "logps/rejected": -4.935871601104736, + "loss": 0.055, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.642319679260254, + "rewards/margins": 0.29355183243751526, + "rewards/rejected": -4.935871601104736, + "sft_loss": 4.443874359130859, + "step": 1270 + }, + { + "epoch": 0.6823883592574009, + "grad_norm": 0.4523237714480055, + "learning_rate": 9.513496170830909e-07, + "logits/chosen": -0.7188066244125366, + "logits/rejected": -0.6016985177993774, + "logps/chosen": -4.738122463226318, + "logps/rejected": -4.9969658851623535, + "loss": 0.0537, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.738122463226318, + "rewards/margins": 0.25884348154067993, + "rewards/rejected": -4.9969658851623535, + "sft_loss": 4.435435771942139, + "step": 1275 + }, + { + "epoch": 0.6850643920388024, + "grad_norm": 0.39885392574273837, + "learning_rate": 9.506773217311382e-07, + "logits/chosen": -0.6342407464981079, + "logits/rejected": -0.4105973243713379, + "logps/chosen": -4.633099555969238, + "logps/rejected": -4.917786121368408, + "loss": 0.0534, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.633099555969238, + "rewards/margins": 0.2846868932247162, + "rewards/rejected": -4.917786121368408, + "sft_loss": 4.4025044441223145, + "step": 1280 + }, + { + "epoch": 0.687740424820204, + "grad_norm": 0.46859664979499294, + "learning_rate": 9.500006539058334e-07, + "logits/chosen": -0.5705159902572632, + "logits/rejected": -0.32873377203941345, + "logps/chosen": -4.4862871170043945, + "logps/rejected": -4.728259086608887, + "loss": 0.0534, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.4862871170043945, + "rewards/margins": 0.24197213351726532, + "rewards/rejected": -4.728259086608887, + "sft_loss": 4.215309143066406, + "step": 1285 + }, + { + "epoch": 0.6904164576016056, + "grad_norm": 0.5566937884089165, + "learning_rate": 9.493196201722109e-07, + "logits/chosen": -0.6300413012504578, + "logits/rejected": -0.4329058527946472, + "logps/chosen": -4.681478500366211, + "logps/rejected": -4.944789886474609, + "loss": 0.0537, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.681478500366211, + "rewards/margins": 0.26331159472465515, + "rewards/rejected": -4.944789886474609, + "sft_loss": 4.417520999908447, + "step": 1290 + }, + { + "epoch": 0.6930924903830072, + "grad_norm": 0.5005083892393065, + "learning_rate": 9.486342271376628e-07, + "logits/chosen": -0.5651583671569824, + "logits/rejected": -0.6070525050163269, + "logps/chosen": -4.731893539428711, + "logps/rejected": -5.015305519104004, + "loss": 0.0538, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.731893539428711, + "rewards/margins": 0.28341221809387207, + "rewards/rejected": -5.015305519104004, + "sft_loss": 4.412493705749512, + "step": 1295 + }, + { + "epoch": 0.6957685231644087, + "grad_norm": 0.4047781839205759, + "learning_rate": 9.479444814518755e-07, + "logits/chosen": -0.5466286540031433, + "logits/rejected": -0.2215103805065155, + "logps/chosen": -4.484042644500732, + "logps/rejected": -5.004049301147461, + "loss": 0.0522, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.484042644500732, + "rewards/margins": 0.5200066566467285, + "rewards/rejected": -5.004049301147461, + "sft_loss": 4.189947605133057, + "step": 1300 + }, + { + "epoch": 0.6984445559458103, + "grad_norm": 0.4919221601673572, + "learning_rate": 9.472503898067645e-07, + "logits/chosen": -0.3866714835166931, + "logits/rejected": -0.3418964743614197, + "logps/chosen": -4.764806747436523, + "logps/rejected": -4.945952892303467, + "loss": 0.0548, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.764806747436523, + "rewards/margins": 0.18114568293094635, + "rewards/rejected": -4.945952892303467, + "sft_loss": 4.389309883117676, + "step": 1305 + }, + { + "epoch": 0.701120588727212, + "grad_norm": 0.6165938817684519, + "learning_rate": 9.465519589364099e-07, + "logits/chosen": -0.43105238676071167, + "logits/rejected": -0.34996071457862854, + "logps/chosen": -4.986048698425293, + "logps/rejected": -5.196448802947998, + "loss": 0.0539, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.986048698425293, + "rewards/margins": 0.2104000300168991, + "rewards/rejected": -5.196448802947998, + "sft_loss": 4.601518154144287, + "step": 1310 + }, + { + "epoch": 0.7037966215086134, + "grad_norm": 0.7147540339684912, + "learning_rate": 9.458491956169914e-07, + "logits/chosen": -0.5200138688087463, + "logits/rejected": -0.30936622619628906, + "logps/chosen": -4.600648403167725, + "logps/rejected": -4.895071983337402, + "loss": 0.0534, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.600648403167725, + "rewards/margins": 0.29442328214645386, + "rewards/rejected": -4.895071983337402, + "sft_loss": 4.186682224273682, + "step": 1315 + }, + { + "epoch": 0.706472654290015, + "grad_norm": 0.5222909906022749, + "learning_rate": 9.451421066667215e-07, + "logits/chosen": -0.6883410215377808, + "logits/rejected": -0.45219460129737854, + "logps/chosen": -4.589325904846191, + "logps/rejected": -5.036540508270264, + "loss": 0.0527, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.589325904846191, + "rewards/margins": 0.4472144544124603, + "rewards/rejected": -5.036540508270264, + "sft_loss": 4.3194098472595215, + "step": 1320 + }, + { + "epoch": 0.7091486870714167, + "grad_norm": 0.4776830565945863, + "learning_rate": 9.444306989457805e-07, + "logits/chosen": -0.46004819869995117, + "logits/rejected": -0.3241944909095764, + "logps/chosen": -4.868544101715088, + "logps/rejected": -5.153488636016846, + "loss": 0.0542, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.868544101715088, + "rewards/margins": 0.2849445641040802, + "rewards/rejected": -5.153488636016846, + "sft_loss": 4.496047019958496, + "step": 1325 + }, + { + "epoch": 0.7118247198528181, + "grad_norm": 0.4294909802110886, + "learning_rate": 9.437149793562489e-07, + "logits/chosen": -0.5180003046989441, + "logits/rejected": -0.3660891652107239, + "logps/chosen": -4.651247501373291, + "logps/rejected": -4.924403190612793, + "loss": 0.054, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.651247501373291, + "rewards/margins": 0.2731553614139557, + "rewards/rejected": -4.924403190612793, + "sft_loss": 4.393563270568848, + "step": 1330 + }, + { + "epoch": 0.7145007526342197, + "grad_norm": 0.39095469809878836, + "learning_rate": 9.429949548420417e-07, + "logits/chosen": -0.5731022357940674, + "logits/rejected": -0.4370029866695404, + "logps/chosen": -4.695939064025879, + "logps/rejected": -4.961706638336182, + "loss": 0.0536, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.695939064025879, + "rewards/margins": 0.2657679617404938, + "rewards/rejected": -4.961706638336182, + "sft_loss": 4.3769636154174805, + "step": 1335 + }, + { + "epoch": 0.7171767854156214, + "grad_norm": 0.430223860508395, + "learning_rate": 9.422706323888396e-07, + "logits/chosen": -0.5357168912887573, + "logits/rejected": -0.49134665727615356, + "logps/chosen": -4.6197052001953125, + "logps/rejected": -4.928733825683594, + "loss": 0.0535, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.6197052001953125, + "rewards/margins": 0.3090288043022156, + "rewards/rejected": -4.928733825683594, + "sft_loss": 4.343020439147949, + "step": 1340 + }, + { + "epoch": 0.719852818197023, + "grad_norm": 0.5805561587916378, + "learning_rate": 9.415420190240225e-07, + "logits/chosen": -0.5039713382720947, + "logits/rejected": -0.23104509711265564, + "logps/chosen": -4.593451976776123, + "logps/rejected": -5.125218868255615, + "loss": 0.0519, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.593451976776123, + "rewards/margins": 0.5317668914794922, + "rewards/rejected": -5.125218868255615, + "sft_loss": 4.35361385345459, + "step": 1345 + }, + { + "epoch": 0.7225288509784245, + "grad_norm": 0.794948230843, + "learning_rate": 9.408091218166002e-07, + "logits/chosen": -0.4368966519832611, + "logits/rejected": -0.349046915769577, + "logps/chosen": -4.646870136260986, + "logps/rejected": -4.846858024597168, + "loss": 0.0545, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.646870136260986, + "rewards/margins": 0.19998829066753387, + "rewards/rejected": -4.846858024597168, + "sft_loss": 4.331336498260498, + "step": 1350 + }, + { + "epoch": 0.7252048837598261, + "grad_norm": 0.4322649148913923, + "learning_rate": 9.400719478771449e-07, + "logits/chosen": -0.4928915500640869, + "logits/rejected": -0.10693871974945068, + "logps/chosen": -4.772175312042236, + "logps/rejected": -5.275957107543945, + "loss": 0.0523, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.772175312042236, + "rewards/margins": 0.5037820935249329, + "rewards/rejected": -5.275957107543945, + "sft_loss": 4.457993507385254, + "step": 1355 + }, + { + "epoch": 0.7278809165412277, + "grad_norm": 0.4188247735094768, + "learning_rate": 9.393305043577209e-07, + "logits/chosen": -0.4668586850166321, + "logits/rejected": -0.3280408978462219, + "logps/chosen": -4.57827615737915, + "logps/rejected": -4.987173080444336, + "loss": 0.0538, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.57827615737915, + "rewards/margins": 0.40889644622802734, + "rewards/rejected": -4.987173080444336, + "sft_loss": 4.273009300231934, + "step": 1360 + }, + { + "epoch": 0.7305569493226292, + "grad_norm": 0.43986121858771965, + "learning_rate": 9.38584798451817e-07, + "logits/chosen": -0.5263036489486694, + "logits/rejected": -0.32945355772972107, + "logps/chosen": -4.534844398498535, + "logps/rejected": -4.883469581604004, + "loss": 0.0528, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.534844398498535, + "rewards/margins": 0.3486255705356598, + "rewards/rejected": -4.883469581604004, + "sft_loss": 4.275891304016113, + "step": 1365 + }, + { + "epoch": 0.7332329821040308, + "grad_norm": 0.5027182729571974, + "learning_rate": 9.37834837394275e-07, + "logits/chosen": -0.41481298208236694, + "logits/rejected": -0.24069428443908691, + "logps/chosen": -4.659480094909668, + "logps/rejected": -5.198307514190674, + "loss": 0.0531, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.659480094909668, + "rewards/margins": 0.5388270616531372, + "rewards/rejected": -5.198307514190674, + "sft_loss": 4.420654296875, + "step": 1370 + }, + { + "epoch": 0.7359090148854324, + "grad_norm": 0.5596253221586619, + "learning_rate": 9.370806284612203e-07, + "logits/chosen": -0.4726240038871765, + "logits/rejected": -0.2989794909954071, + "logps/chosen": -4.519133567810059, + "logps/rejected": -5.032240867614746, + "loss": 0.053, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.519133567810059, + "rewards/margins": 0.5131076574325562, + "rewards/rejected": -5.032240867614746, + "sft_loss": 4.242257118225098, + "step": 1375 + }, + { + "epoch": 0.738585047666834, + "grad_norm": 0.639067801858824, + "learning_rate": 9.363221789699912e-07, + "logits/chosen": -0.5229755640029907, + "logits/rejected": -0.3458749055862427, + "logps/chosen": -4.645009517669678, + "logps/rejected": -5.047579765319824, + "loss": 0.0535, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.645009517669678, + "rewards/margins": 0.4025697112083435, + "rewards/rejected": -5.047579765319824, + "sft_loss": 4.33743953704834, + "step": 1380 + }, + { + "epoch": 0.7412610804482355, + "grad_norm": 0.45340914139326294, + "learning_rate": 9.355594962790682e-07, + "logits/chosen": -0.5707454681396484, + "logits/rejected": -0.39737915992736816, + "logps/chosen": -4.724990367889404, + "logps/rejected": -5.140081405639648, + "loss": 0.0531, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.724990367889404, + "rewards/margins": 0.4150908589363098, + "rewards/rejected": -5.140081405639648, + "sft_loss": 4.456721305847168, + "step": 1385 + }, + { + "epoch": 0.7439371132296371, + "grad_norm": 0.7726334686248777, + "learning_rate": 9.34792587788002e-07, + "logits/chosen": -0.41615360975265503, + "logits/rejected": -0.25168564915657043, + "logps/chosen": -4.589061260223389, + "logps/rejected": -4.944598197937012, + "loss": 0.0538, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.589061260223389, + "rewards/margins": 0.3555375933647156, + "rewards/rejected": -4.944598197937012, + "sft_loss": 4.252989292144775, + "step": 1390 + }, + { + "epoch": 0.7466131460110387, + "grad_norm": 0.9713420234440161, + "learning_rate": 9.34021460937342e-07, + "logits/chosen": -0.54308021068573, + "logits/rejected": -0.46706587076187134, + "logps/chosen": -4.68400764465332, + "logps/rejected": -4.97832727432251, + "loss": 0.0542, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.68400764465332, + "rewards/margins": 0.2943192422389984, + "rewards/rejected": -4.97832727432251, + "sft_loss": 4.438172817230225, + "step": 1395 + }, + { + "epoch": 0.7492891787924402, + "grad_norm": 0.7646700933687028, + "learning_rate": 9.332461232085646e-07, + "logits/chosen": -0.7934912443161011, + "logits/rejected": -0.5852801203727722, + "logps/chosen": -4.717600345611572, + "logps/rejected": -4.994270324707031, + "loss": 0.0542, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.717600345611572, + "rewards/margins": 0.27667081356048584, + "rewards/rejected": -4.994270324707031, + "sft_loss": 4.516864776611328, + "step": 1400 + }, + { + "epoch": 0.7519652115738418, + "grad_norm": 1.1584729658858994, + "learning_rate": 9.324665821239998e-07, + "logits/chosen": -0.7201007604598999, + "logits/rejected": -0.4515800476074219, + "logps/chosen": -4.434556484222412, + "logps/rejected": -4.892245292663574, + "loss": 0.0539, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.434556484222412, + "rewards/margins": 0.45768898725509644, + "rewards/rejected": -4.892245292663574, + "sft_loss": 4.247842788696289, + "step": 1405 + }, + { + "epoch": 0.7546412443552434, + "grad_norm": 0.6140739611213539, + "learning_rate": 9.316828452467583e-07, + "logits/chosen": -0.6372144222259521, + "logits/rejected": -0.42637553811073303, + "logps/chosen": -4.58826208114624, + "logps/rejected": -5.025907516479492, + "loss": 0.0531, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.58826208114624, + "rewards/margins": 0.4376456141471863, + "rewards/rejected": -5.025907516479492, + "sft_loss": 4.415618896484375, + "step": 1410 + }, + { + "epoch": 0.7573172771366449, + "grad_norm": 0.6626048352879617, + "learning_rate": 9.30894920180659e-07, + "logits/chosen": -0.47773098945617676, + "logits/rejected": -0.34003114700317383, + "logps/chosen": -4.645096778869629, + "logps/rejected": -4.930912971496582, + "loss": 0.0528, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.645096778869629, + "rewards/margins": 0.2858158051967621, + "rewards/rejected": -4.930912971496582, + "sft_loss": 4.292731285095215, + "step": 1415 + }, + { + "epoch": 0.7599933099180465, + "grad_norm": 0.46790800138866595, + "learning_rate": 9.301028145701543e-07, + "logits/chosen": -0.4381941854953766, + "logits/rejected": -0.23344704508781433, + "logps/chosen": -4.556414604187012, + "logps/rejected": -5.132447242736816, + "loss": 0.053, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.556414604187012, + "rewards/margins": 0.5760326385498047, + "rewards/rejected": -5.132447242736816, + "sft_loss": 4.284694671630859, + "step": 1420 + }, + { + "epoch": 0.7626693426994481, + "grad_norm": 0.4538761637030669, + "learning_rate": 9.293065361002563e-07, + "logits/chosen": -0.46119099855422974, + "logits/rejected": -0.28453510999679565, + "logps/chosen": -4.576390266418457, + "logps/rejected": -5.065820217132568, + "loss": 0.055, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.576390266418457, + "rewards/margins": 0.4894295632839203, + "rewards/rejected": -5.065820217132568, + "sft_loss": 4.269384860992432, + "step": 1425 + }, + { + "epoch": 0.7653453754808497, + "grad_norm": 0.5262247782905854, + "learning_rate": 9.285060924964622e-07, + "logits/chosen": -0.5923141241073608, + "logits/rejected": -0.43897438049316406, + "logps/chosen": -4.758378028869629, + "logps/rejected": -5.130233287811279, + "loss": 0.0531, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.758378028869629, + "rewards/margins": 0.3718549311161041, + "rewards/rejected": -5.130233287811279, + "sft_loss": 4.387657165527344, + "step": 1430 + }, + { + "epoch": 0.7680214082622512, + "grad_norm": 0.8738310108131122, + "learning_rate": 9.277014915246792e-07, + "logits/chosen": -0.4979740083217621, + "logits/rejected": -0.4129610061645508, + "logps/chosen": -4.627547740936279, + "logps/rejected": -5.047521114349365, + "loss": 0.0538, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.627547740936279, + "rewards/margins": 0.41997361183166504, + "rewards/rejected": -5.047521114349365, + "sft_loss": 4.41188907623291, + "step": 1435 + }, + { + "epoch": 0.7706974410436528, + "grad_norm": 0.3665660050498204, + "learning_rate": 9.268927409911498e-07, + "logits/chosen": -0.6547388434410095, + "logits/rejected": -0.5283665060997009, + "logps/chosen": -4.4866533279418945, + "logps/rejected": -4.788092136383057, + "loss": 0.0533, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.4866533279418945, + "rewards/margins": 0.3014386296272278, + "rewards/rejected": -4.788092136383057, + "sft_loss": 4.192168235778809, + "step": 1440 + }, + { + "epoch": 0.7733734738250544, + "grad_norm": 0.5777401513346766, + "learning_rate": 9.260798487423749e-07, + "logits/chosen": -0.8416692614555359, + "logits/rejected": -0.5481675863265991, + "logps/chosen": -4.734822750091553, + "logps/rejected": -5.159518241882324, + "loss": 0.0529, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.734822750091553, + "rewards/margins": 0.42469555139541626, + "rewards/rejected": -5.159518241882324, + "sft_loss": 4.497926235198975, + "step": 1445 + }, + { + "epoch": 0.7760495066064559, + "grad_norm": 0.31040149478224743, + "learning_rate": 9.252628226650389e-07, + "logits/chosen": -0.6390944719314575, + "logits/rejected": -0.5294175744056702, + "logps/chosen": -4.691042423248291, + "logps/rejected": -4.936129093170166, + "loss": 0.0535, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.691042423248291, + "rewards/margins": 0.24508705735206604, + "rewards/rejected": -4.936129093170166, + "sft_loss": 4.376203536987305, + "step": 1450 + }, + { + "epoch": 0.7787255393878575, + "grad_norm": 0.8324745102838857, + "learning_rate": 9.244416706859321e-07, + "logits/chosen": -0.6490924954414368, + "logits/rejected": -0.43776410818099976, + "logps/chosen": -4.428481578826904, + "logps/rejected": -4.945733070373535, + "loss": 0.0533, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.428481578826904, + "rewards/margins": 0.5172508955001831, + "rewards/rejected": -4.945733070373535, + "sft_loss": 4.210099220275879, + "step": 1455 + }, + { + "epoch": 0.7814015721692591, + "grad_norm": 0.4524607634156159, + "learning_rate": 9.23616400771875e-07, + "logits/chosen": -0.6316944360733032, + "logits/rejected": -0.37926986813545227, + "logps/chosen": -4.5279340744018555, + "logps/rejected": -4.91841459274292, + "loss": 0.0521, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.5279340744018555, + "rewards/margins": 0.39048075675964355, + "rewards/rejected": -4.91841459274292, + "sft_loss": 4.214154243469238, + "step": 1460 + }, + { + "epoch": 0.7840776049506607, + "grad_norm": 0.45011613171946563, + "learning_rate": 9.227870209296395e-07, + "logits/chosen": -0.5129834413528442, + "logits/rejected": -0.3363080322742462, + "logps/chosen": -4.668064117431641, + "logps/rejected": -5.000769138336182, + "loss": 0.0538, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.668064117431641, + "rewards/margins": 0.3327048420906067, + "rewards/rejected": -5.000769138336182, + "sft_loss": 4.383709907531738, + "step": 1465 + }, + { + "epoch": 0.7867536377320622, + "grad_norm": 0.7776345772417559, + "learning_rate": 9.219535392058728e-07, + "logits/chosen": -0.5440296530723572, + "logits/rejected": -0.5481287837028503, + "logps/chosen": -4.869925022125244, + "logps/rejected": -5.206021308898926, + "loss": 0.054, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.869925022125244, + "rewards/margins": 0.33609622716903687, + "rewards/rejected": -5.206021308898926, + "sft_loss": 4.53743314743042, + "step": 1470 + }, + { + "epoch": 0.7894296705134638, + "grad_norm": 0.72464037163324, + "learning_rate": 9.211159636870181e-07, + "logits/chosen": -0.5129449367523193, + "logits/rejected": -0.3094898760318756, + "logps/chosen": -4.535016059875488, + "logps/rejected": -4.995938777923584, + "loss": 0.0526, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.535016059875488, + "rewards/margins": 0.46092361211776733, + "rewards/rejected": -4.995938777923584, + "sft_loss": 4.218871116638184, + "step": 1475 + }, + { + "epoch": 0.7921057032948654, + "grad_norm": 0.43282860953379676, + "learning_rate": 9.202743024992367e-07, + "logits/chosen": -0.29366233944892883, + "logits/rejected": -0.19437307119369507, + "logps/chosen": -4.601809978485107, + "logps/rejected": -4.955982208251953, + "loss": 0.054, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.601809978485107, + "rewards/margins": 0.35417240858078003, + "rewards/rejected": -4.955982208251953, + "sft_loss": 4.2811689376831055, + "step": 1480 + }, + { + "epoch": 0.7947817360762669, + "grad_norm": 0.6048286155475231, + "learning_rate": 9.194285638083293e-07, + "logits/chosen": -0.4645848870277405, + "logits/rejected": -0.2716544270515442, + "logps/chosen": -4.692347049713135, + "logps/rejected": -5.111114501953125, + "loss": 0.0521, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.692347049713135, + "rewards/margins": 0.4187680780887604, + "rewards/rejected": -5.111114501953125, + "sft_loss": 4.267924785614014, + "step": 1485 + }, + { + "epoch": 0.7974577688576685, + "grad_norm": 0.7623333749730745, + "learning_rate": 9.185787558196562e-07, + "logits/chosen": -0.5028216242790222, + "logits/rejected": -0.36704540252685547, + "logps/chosen": -4.721834182739258, + "logps/rejected": -5.188471794128418, + "loss": 0.0525, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.721834182739258, + "rewards/margins": 0.466637521982193, + "rewards/rejected": -5.188471794128418, + "sft_loss": 4.410996437072754, + "step": 1490 + }, + { + "epoch": 0.8001338016390701, + "grad_norm": 0.5986005442913178, + "learning_rate": 9.177248867780583e-07, + "logits/chosen": -0.38116052746772766, + "logits/rejected": -0.2935437262058258, + "logps/chosen": -4.649445533752441, + "logps/rejected": -4.961583137512207, + "loss": 0.0549, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.649445533752441, + "rewards/margins": 0.3121368885040283, + "rewards/rejected": -4.961583137512207, + "sft_loss": 4.2997026443481445, + "step": 1495 + }, + { + "epoch": 0.8028098344204716, + "grad_norm": 0.5435666495008531, + "learning_rate": 9.168669649677769e-07, + "logits/chosen": -0.6052119135856628, + "logits/rejected": -0.4620954394340515, + "logps/chosen": -4.813775539398193, + "logps/rejected": -5.131553649902344, + "loss": 0.0548, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.813775539398193, + "rewards/margins": 0.31777825951576233, + "rewards/rejected": -5.131553649902344, + "sft_loss": 4.5045881271362305, + "step": 1500 + }, + { + "epoch": 0.8054858672018732, + "grad_norm": 0.47022443877330955, + "learning_rate": 9.16004998712373e-07, + "logits/chosen": -0.5376960039138794, + "logits/rejected": -0.4372090697288513, + "logps/chosen": -4.671207904815674, + "logps/rejected": -4.986814498901367, + "loss": 0.0534, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.671207904815674, + "rewards/margins": 0.3156066834926605, + "rewards/rejected": -4.986814498901367, + "sft_loss": 4.38083553314209, + "step": 1505 + }, + { + "epoch": 0.8081618999832748, + "grad_norm": 0.6199400338313545, + "learning_rate": 9.151389963746472e-07, + "logits/chosen": -0.5899304747581482, + "logits/rejected": -0.18980534374713898, + "logps/chosen": -4.4528584480285645, + "logps/rejected": -5.0324931144714355, + "loss": 0.0524, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.4528584480285645, + "rewards/margins": 0.579634964466095, + "rewards/rejected": -5.0324931144714355, + "sft_loss": 4.2412800788879395, + "step": 1510 + }, + { + "epoch": 0.8108379327646764, + "grad_norm": 0.5555948687982465, + "learning_rate": 9.142689663565577e-07, + "logits/chosen": -0.5152315497398376, + "logits/rejected": -0.436471164226532, + "logps/chosen": -4.5293426513671875, + "logps/rejected": -4.902256965637207, + "loss": 0.0535, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.5293426513671875, + "rewards/margins": 0.3729146420955658, + "rewards/rejected": -4.902256965637207, + "sft_loss": 4.266984462738037, + "step": 1515 + }, + { + "epoch": 0.8135139655460779, + "grad_norm": 0.45142612803831084, + "learning_rate": 9.133949170991397e-07, + "logits/chosen": -0.53922039270401, + "logits/rejected": -0.4263296127319336, + "logps/chosen": -4.764470100402832, + "logps/rejected": -5.035752296447754, + "loss": 0.0537, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.764470100402832, + "rewards/margins": 0.27128297090530396, + "rewards/rejected": -5.035752296447754, + "sft_loss": 4.539823055267334, + "step": 1520 + }, + { + "epoch": 0.8161899983274795, + "grad_norm": 0.41254383948397666, + "learning_rate": 9.125168570824231e-07, + "logits/chosen": -0.6185752153396606, + "logits/rejected": -0.3851194977760315, + "logps/chosen": -4.496891021728516, + "logps/rejected": -4.882938861846924, + "loss": 0.0531, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.496891021728516, + "rewards/margins": 0.3860477805137634, + "rewards/rejected": -4.882938861846924, + "sft_loss": 4.258194923400879, + "step": 1525 + }, + { + "epoch": 0.8188660311088811, + "grad_norm": 0.7151117766432287, + "learning_rate": 9.116347948253496e-07, + "logits/chosen": -0.5224160552024841, + "logits/rejected": -0.3366536498069763, + "logps/chosen": -4.449323654174805, + "logps/rejected": -4.829797267913818, + "loss": 0.0529, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.449323654174805, + "rewards/margins": 0.3804740905761719, + "rewards/rejected": -4.829797267913818, + "sft_loss": 4.185049533843994, + "step": 1530 + }, + { + "epoch": 0.8215420638902826, + "grad_norm": 0.583199273167788, + "learning_rate": 9.107487388856916e-07, + "logits/chosen": -0.5745779275894165, + "logits/rejected": -0.3310237228870392, + "logps/chosen": -4.638999938964844, + "logps/rejected": -5.003554344177246, + "loss": 0.0529, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.638999938964844, + "rewards/margins": 0.3645547330379486, + "rewards/rejected": -5.003554344177246, + "sft_loss": 4.2992844581604, + "step": 1535 + }, + { + "epoch": 0.8242180966716842, + "grad_norm": 1.0175227991306681, + "learning_rate": 9.098586978599673e-07, + "logits/chosen": -0.4655645489692688, + "logits/rejected": -0.27722448110580444, + "logps/chosen": -4.643935680389404, + "logps/rejected": -5.168910503387451, + "loss": 0.0526, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.643935680389404, + "rewards/margins": 0.524974524974823, + "rewards/rejected": -5.168910503387451, + "sft_loss": 4.284567832946777, + "step": 1540 + }, + { + "epoch": 0.8268941294530858, + "grad_norm": 1.0535335408279394, + "learning_rate": 9.089646803833588e-07, + "logits/chosen": -0.48223942518234253, + "logits/rejected": -0.29267364740371704, + "logps/chosen": -4.699334621429443, + "logps/rejected": -5.175503730773926, + "loss": 0.0529, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.699334621429443, + "rewards/margins": 0.47616925835609436, + "rewards/rejected": -5.175503730773926, + "sft_loss": 4.3690996170043945, + "step": 1545 + }, + { + "epoch": 0.8295701622344873, + "grad_norm": 1.225203498147348, + "learning_rate": 9.080666951296276e-07, + "logits/chosen": -0.5685257315635681, + "logits/rejected": -0.23230977356433868, + "logps/chosen": -4.532471656799316, + "logps/rejected": -5.177938938140869, + "loss": 0.0525, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.532471656799316, + "rewards/margins": 0.6454674601554871, + "rewards/rejected": -5.177938938140869, + "sft_loss": 4.264753818511963, + "step": 1550 + }, + { + "epoch": 0.8322461950158889, + "grad_norm": 0.3571080799880975, + "learning_rate": 9.071647508110305e-07, + "logits/chosen": -0.5444567203521729, + "logits/rejected": -0.20953519642353058, + "logps/chosen": -4.405203819274902, + "logps/rejected": -5.111395359039307, + "loss": 0.0518, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.405203819274902, + "rewards/margins": 0.706192135810852, + "rewards/rejected": -5.111395359039307, + "sft_loss": 4.1048126220703125, + "step": 1555 + }, + { + "epoch": 0.8349222277972905, + "grad_norm": 0.4647677566083696, + "learning_rate": 9.062588561782354e-07, + "logits/chosen": -0.3920247256755829, + "logits/rejected": -0.31407880783081055, + "logps/chosen": -4.9274492263793945, + "logps/rejected": -5.254195690155029, + "loss": 0.0543, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.9274492263793945, + "rewards/margins": 0.32674673199653625, + "rewards/rejected": -5.254195690155029, + "sft_loss": 4.650223731994629, + "step": 1560 + }, + { + "epoch": 0.8375982605786921, + "grad_norm": 0.33147523191362094, + "learning_rate": 9.053490200202358e-07, + "logits/chosen": -0.4448007047176361, + "logits/rejected": -0.37099045515060425, + "logps/chosen": -4.541591167449951, + "logps/rejected": -4.890256404876709, + "loss": 0.0546, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.541591167449951, + "rewards/margins": 0.3486659526824951, + "rewards/rejected": -4.890256404876709, + "sft_loss": 4.340108871459961, + "step": 1565 + }, + { + "epoch": 0.8402742933600936, + "grad_norm": 0.666464713418103, + "learning_rate": 9.044352511642661e-07, + "logits/chosen": -0.4679701328277588, + "logits/rejected": -0.3994170129299164, + "logps/chosen": -4.796227931976318, + "logps/rejected": -5.057922840118408, + "loss": 0.0545, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.796227931976318, + "rewards/margins": 0.2616943120956421, + "rewards/rejected": -5.057922840118408, + "sft_loss": 4.517434597015381, + "step": 1570 + }, + { + "epoch": 0.8429503261414952, + "grad_norm": 0.6293748246790352, + "learning_rate": 9.03517558475716e-07, + "logits/chosen": -0.5948684811592102, + "logits/rejected": -0.4503151774406433, + "logps/chosen": -4.529299736022949, + "logps/rejected": -4.76159143447876, + "loss": 0.056, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.529299736022949, + "rewards/margins": 0.23229138553142548, + "rewards/rejected": -4.76159143447876, + "sft_loss": 4.296181678771973, + "step": 1575 + }, + { + "epoch": 0.8456263589228968, + "grad_norm": 0.4250320609206886, + "learning_rate": 9.025959508580436e-07, + "logits/chosen": -0.6207831501960754, + "logits/rejected": -0.3034132719039917, + "logps/chosen": -4.6496171951293945, + "logps/rejected": -5.029486656188965, + "loss": 0.053, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.6496171951293945, + "rewards/margins": 0.3798690736293793, + "rewards/rejected": -5.029486656188965, + "sft_loss": 4.412766456604004, + "step": 1580 + }, + { + "epoch": 0.8483023917042983, + "grad_norm": 0.3411551452075428, + "learning_rate": 9.016704372526905e-07, + "logits/chosen": -0.6650777459144592, + "logits/rejected": -0.4226778447628021, + "logps/chosen": -4.5687479972839355, + "logps/rejected": -5.018956661224365, + "loss": 0.0531, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.5687479972839355, + "rewards/margins": 0.4502086639404297, + "rewards/rejected": -5.018956661224365, + "sft_loss": 4.3290324211120605, + "step": 1585 + }, + { + "epoch": 0.8509784244856999, + "grad_norm": 0.3485149666875214, + "learning_rate": 9.007410266389934e-07, + "logits/chosen": -0.6740955114364624, + "logits/rejected": -0.6200405359268188, + "logps/chosen": -4.6393561363220215, + "logps/rejected": -4.842759132385254, + "loss": 0.0538, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.6393561363220215, + "rewards/margins": 0.20340339839458466, + "rewards/rejected": -4.842759132385254, + "sft_loss": 4.307949066162109, + "step": 1590 + }, + { + "epoch": 0.8536544572671015, + "grad_norm": 0.6993713590336478, + "learning_rate": 8.998077280340981e-07, + "logits/chosen": -0.5477308034896851, + "logits/rejected": -0.4848629832267761, + "logps/chosen": -4.8374176025390625, + "logps/rejected": -5.147733688354492, + "loss": 0.0537, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.8374176025390625, + "rewards/margins": 0.31031590700149536, + "rewards/rejected": -5.147733688354492, + "sft_loss": 4.451597690582275, + "step": 1595 + }, + { + "epoch": 0.8563304900485031, + "grad_norm": 0.5857090427241004, + "learning_rate": 8.988705504928722e-07, + "logits/chosen": -0.593254029750824, + "logits/rejected": -0.3634767532348633, + "logps/chosen": -4.616876125335693, + "logps/rejected": -5.206143856048584, + "loss": 0.0518, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.616876125335693, + "rewards/margins": 0.5892679691314697, + "rewards/rejected": -5.206143856048584, + "sft_loss": 4.2967424392700195, + "step": 1600 + }, + { + "epoch": 0.8563304900485031, + "eval_logits/chosen": -0.02904585748910904, + "eval_logits/rejected": 0.07823801785707474, + "eval_logps/chosen": -4.5006561279296875, + "eval_logps/rejected": -4.917550086975098, + "eval_loss": 0.051895011216402054, + "eval_rewards/accuracies": 0.6313056349754333, + "eval_rewards/chosen": -4.5006561279296875, + "eval_rewards/margins": 0.41689401865005493, + "eval_rewards/rejected": -4.917550086975098, + "eval_runtime": 43.4471, + "eval_samples_per_second": 30.957, + "eval_sft_loss": 4.063570499420166, + "eval_steps_per_second": 7.757, + "step": 1600 + }, + { + "epoch": 0.8590065228299046, + "grad_norm": 0.4116631557826821, + "learning_rate": 8.979295031078157e-07, + "logits/chosen": -0.5330547094345093, + "logits/rejected": -0.25229379534721375, + "logps/chosen": -4.346199035644531, + "logps/rejected": -4.8608598709106445, + "loss": 0.0518, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.346199035644531, + "rewards/margins": 0.5146608948707581, + "rewards/rejected": -4.8608598709106445, + "sft_loss": 4.030482292175293, + "step": 1605 + }, + { + "epoch": 0.8616825556113062, + "grad_norm": 0.38214100986360633, + "learning_rate": 8.969845950089751e-07, + "logits/chosen": -0.5902666449546814, + "logits/rejected": -0.3709181249141693, + "logps/chosen": -4.7403693199157715, + "logps/rejected": -5.202174186706543, + "loss": 0.0538, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.7403693199157715, + "rewards/margins": 0.4618045389652252, + "rewards/rejected": -5.202174186706543, + "sft_loss": 4.481407165527344, + "step": 1610 + }, + { + "epoch": 0.8643585883927078, + "grad_norm": 0.41539489108804767, + "learning_rate": 8.960358353638526e-07, + "logits/chosen": -0.4401358664035797, + "logits/rejected": -0.2931682765483856, + "logps/chosen": -4.8379106521606445, + "logps/rejected": -5.323012828826904, + "loss": 0.0535, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.8379106521606445, + "rewards/margins": 0.48510226607322693, + "rewards/rejected": -5.323012828826904, + "sft_loss": 4.449693202972412, + "step": 1615 + }, + { + "epoch": 0.8670346211741093, + "grad_norm": 0.9036399070841549, + "learning_rate": 8.950832333773184e-07, + "logits/chosen": -0.45871132612228394, + "logits/rejected": -0.25480368733406067, + "logps/chosen": -4.593859672546387, + "logps/rejected": -4.8868608474731445, + "loss": 0.0541, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.593859672546387, + "rewards/margins": 0.2930009961128235, + "rewards/rejected": -4.8868608474731445, + "sft_loss": 4.267394065856934, + "step": 1620 + }, + { + "epoch": 0.869710653955511, + "grad_norm": 0.4427581928743918, + "learning_rate": 8.941267982915213e-07, + "logits/chosen": -0.45720523595809937, + "logits/rejected": -0.39475345611572266, + "logps/chosen": -4.652650356292725, + "logps/rejected": -4.871710777282715, + "loss": 0.0551, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.652650356292725, + "rewards/margins": 0.2190600335597992, + "rewards/rejected": -4.871710777282715, + "sft_loss": 4.429585933685303, + "step": 1625 + }, + { + "epoch": 0.8723866867369126, + "grad_norm": 0.6495835714729182, + "learning_rate": 8.931665393857983e-07, + "logits/chosen": -0.5988067388534546, + "logits/rejected": -0.41781100630760193, + "logps/chosen": -4.665750026702881, + "logps/rejected": -5.207381725311279, + "loss": 0.0533, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.665750026702881, + "rewards/margins": 0.5416315793991089, + "rewards/rejected": -5.207381725311279, + "sft_loss": 4.4793195724487305, + "step": 1630 + }, + { + "epoch": 0.875062719518314, + "grad_norm": 0.41623413128943515, + "learning_rate": 8.922024659765861e-07, + "logits/chosen": -0.7399147748947144, + "logits/rejected": -0.5550050139427185, + "logps/chosen": -4.498051643371582, + "logps/rejected": -4.938418865203857, + "loss": 0.0534, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.498051643371582, + "rewards/margins": 0.44036778807640076, + "rewards/rejected": -4.938418865203857, + "sft_loss": 4.29925012588501, + "step": 1635 + }, + { + "epoch": 0.8777387522997157, + "grad_norm": 0.5313489161329956, + "learning_rate": 8.912345874173288e-07, + "logits/chosen": -0.7936776876449585, + "logits/rejected": -0.5909181237220764, + "logps/chosen": -4.6866841316223145, + "logps/rejected": -5.111433506011963, + "loss": 0.0532, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.6866841316223145, + "rewards/margins": 0.4247500002384186, + "rewards/rejected": -5.111433506011963, + "sft_loss": 4.379754543304443, + "step": 1640 + }, + { + "epoch": 0.8804147850811173, + "grad_norm": 0.6359924799554241, + "learning_rate": 8.902629130983885e-07, + "logits/chosen": -0.7347515821456909, + "logits/rejected": -0.6840382814407349, + "logps/chosen": -4.671685695648193, + "logps/rejected": -4.900129795074463, + "loss": 0.0551, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.671685695648193, + "rewards/margins": 0.22844386100769043, + "rewards/rejected": -4.900129795074463, + "sft_loss": 4.440484046936035, + "step": 1645 + }, + { + "epoch": 0.8830908178625189, + "grad_norm": 0.5805478038553051, + "learning_rate": 8.892874524469537e-07, + "logits/chosen": -0.5966772437095642, + "logits/rejected": -0.5708019137382507, + "logps/chosen": -4.531647682189941, + "logps/rejected": -4.784287452697754, + "loss": 0.0534, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.531647682189941, + "rewards/margins": 0.25264012813568115, + "rewards/rejected": -4.784287452697754, + "sft_loss": 4.222924709320068, + "step": 1650 + }, + { + "epoch": 0.8857668506439204, + "grad_norm": 0.4154036618062291, + "learning_rate": 8.883082149269478e-07, + "logits/chosen": -0.7492375373840332, + "logits/rejected": -0.625057578086853, + "logps/chosen": -4.6985764503479, + "logps/rejected": -5.03641414642334, + "loss": 0.0533, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.6985764503479, + "rewards/margins": 0.33783769607543945, + "rewards/rejected": -5.03641414642334, + "sft_loss": 4.449755668640137, + "step": 1655 + }, + { + "epoch": 0.888442883425322, + "grad_norm": 0.6942420678365312, + "learning_rate": 8.873252100389377e-07, + "logits/chosen": -0.6865184307098389, + "logits/rejected": -0.6350366473197937, + "logps/chosen": -4.639318466186523, + "logps/rejected": -4.9327192306518555, + "loss": 0.0541, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.639318466186523, + "rewards/margins": 0.2934008240699768, + "rewards/rejected": -4.9327192306518555, + "sft_loss": 4.407034873962402, + "step": 1660 + }, + { + "epoch": 0.8911189162067236, + "grad_norm": 0.8556379469701134, + "learning_rate": 8.863384473200411e-07, + "logits/chosen": -0.5269213914871216, + "logits/rejected": -0.47171592712402344, + "logps/chosen": -4.581943511962891, + "logps/rejected": -4.845519065856934, + "loss": 0.054, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.581943511962891, + "rewards/margins": 0.2635752558708191, + "rewards/rejected": -4.845519065856934, + "sft_loss": 4.290720462799072, + "step": 1665 + }, + { + "epoch": 0.8937949489881251, + "grad_norm": 0.38175959218553795, + "learning_rate": 8.853479363438342e-07, + "logits/chosen": -0.5232313871383667, + "logits/rejected": -0.3149745464324951, + "logps/chosen": -4.757014751434326, + "logps/rejected": -5.126891136169434, + "loss": 0.0536, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.757014751434326, + "rewards/margins": 0.3698762059211731, + "rewards/rejected": -5.126891136169434, + "sft_loss": 4.466257572174072, + "step": 1670 + }, + { + "epoch": 0.8964709817695267, + "grad_norm": 0.7736549454767151, + "learning_rate": 8.843536867202588e-07, + "logits/chosen": -0.5337976217269897, + "logits/rejected": -0.30675259232521057, + "logps/chosen": -4.592353343963623, + "logps/rejected": -5.095428943634033, + "loss": 0.0539, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.592353343963623, + "rewards/margins": 0.5030753016471863, + "rewards/rejected": -5.095428943634033, + "sft_loss": 4.437192440032959, + "step": 1675 + }, + { + "epoch": 0.8991470145509283, + "grad_norm": 0.5225751200668408, + "learning_rate": 8.833557080955292e-07, + "logits/chosen": -0.668506383895874, + "logits/rejected": -0.5236512422561646, + "logps/chosen": -4.579380035400391, + "logps/rejected": -4.8987932205200195, + "loss": 0.0548, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.579380035400391, + "rewards/margins": 0.3194130063056946, + "rewards/rejected": -4.8987932205200195, + "sft_loss": 4.3848490715026855, + "step": 1680 + }, + { + "epoch": 0.9018230473323299, + "grad_norm": 0.3776357485013768, + "learning_rate": 8.823540101520381e-07, + "logits/chosen": -0.6779472231864929, + "logits/rejected": -0.41766971349716187, + "logps/chosen": -4.687167644500732, + "logps/rejected": -5.149572372436523, + "loss": 0.0527, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.687167644500732, + "rewards/margins": 0.4624043107032776, + "rewards/rejected": -5.149572372436523, + "sft_loss": 4.447117805480957, + "step": 1685 + }, + { + "epoch": 0.9044990801137314, + "grad_norm": 0.40393689961090434, + "learning_rate": 8.813486026082637e-07, + "logits/chosen": -0.6170944571495056, + "logits/rejected": -0.3785988688468933, + "logps/chosen": -4.538905143737793, + "logps/rejected": -5.0261921882629395, + "loss": 0.052, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.538905143737793, + "rewards/margins": 0.48728686571121216, + "rewards/rejected": -5.0261921882629395, + "sft_loss": 4.21726131439209, + "step": 1690 + }, + { + "epoch": 0.907175112895133, + "grad_norm": 0.6594051353912328, + "learning_rate": 8.803394952186742e-07, + "logits/chosen": -0.6432759165763855, + "logits/rejected": -0.46522051095962524, + "logps/chosen": -4.532651424407959, + "logps/rejected": -4.9046783447265625, + "loss": 0.0529, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.532651424407959, + "rewards/margins": 0.37202686071395874, + "rewards/rejected": -4.9046783447265625, + "sft_loss": 4.1833367347717285, + "step": 1695 + }, + { + "epoch": 0.9098511456765346, + "grad_norm": 0.7064046912842677, + "learning_rate": 8.793266977736342e-07, + "logits/chosen": -0.5009998083114624, + "logits/rejected": -0.6162468791007996, + "logps/chosen": -4.827506065368652, + "logps/rejected": -4.991827964782715, + "loss": 0.0557, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -4.827506065368652, + "rewards/margins": 0.16432145237922668, + "rewards/rejected": -4.991827964782715, + "sft_loss": 4.603185653686523, + "step": 1700 + }, + { + "epoch": 0.9125271784579361, + "grad_norm": 0.5206622644102082, + "learning_rate": 8.783102200993085e-07, + "logits/chosen": -0.5576699376106262, + "logits/rejected": -0.43255311250686646, + "logps/chosen": -4.724583625793457, + "logps/rejected": -5.145751953125, + "loss": 0.053, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.724583625793457, + "rewards/margins": 0.42116886377334595, + "rewards/rejected": -5.145751953125, + "sft_loss": 4.429846286773682, + "step": 1705 + }, + { + "epoch": 0.9152032112393377, + "grad_norm": 0.44476556944134527, + "learning_rate": 8.772900720575683e-07, + "logits/chosen": -0.5629488229751587, + "logits/rejected": -0.43112772703170776, + "logps/chosen": -4.643448829650879, + "logps/rejected": -4.956890106201172, + "loss": 0.0532, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.643448829650879, + "rewards/margins": 0.3134412169456482, + "rewards/rejected": -4.956890106201172, + "sft_loss": 4.35292911529541, + "step": 1710 + }, + { + "epoch": 0.9178792440207393, + "grad_norm": 0.3480510172874056, + "learning_rate": 8.762662635458944e-07, + "logits/chosen": -0.5707327127456665, + "logits/rejected": -0.3803301453590393, + "logps/chosen": -4.5035834312438965, + "logps/rejected": -4.909355640411377, + "loss": 0.053, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.5035834312438965, + "rewards/margins": 0.4057716429233551, + "rewards/rejected": -4.909355640411377, + "sft_loss": 4.233164310455322, + "step": 1715 + }, + { + "epoch": 0.9205552768021408, + "grad_norm": 0.5164062533786418, + "learning_rate": 8.752388044972811e-07, + "logits/chosen": -0.484900563955307, + "logits/rejected": -0.4002462923526764, + "logps/chosen": -4.537683963775635, + "logps/rejected": -4.98577356338501, + "loss": 0.0527, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.537683963775635, + "rewards/margins": 0.448089599609375, + "rewards/rejected": -4.98577356338501, + "sft_loss": 4.265603542327881, + "step": 1720 + }, + { + "epoch": 0.9232313095835424, + "grad_norm": 1.3750125559284796, + "learning_rate": 8.74207704880141e-07, + "logits/chosen": -0.38507509231567383, + "logits/rejected": -0.2651823163032532, + "logps/chosen": -4.553491115570068, + "logps/rejected": -5.053616523742676, + "loss": 0.0524, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.553491115570068, + "rewards/margins": 0.5001254081726074, + "rewards/rejected": -5.053616523742676, + "sft_loss": 4.2198309898376465, + "step": 1725 + }, + { + "epoch": 0.925907342364944, + "grad_norm": 0.6479414735451374, + "learning_rate": 8.731729746982068e-07, + "logits/chosen": -0.4087442457675934, + "logits/rejected": -0.3513071537017822, + "logps/chosen": -4.680237293243408, + "logps/rejected": -5.0000081062316895, + "loss": 0.0537, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.680237293243408, + "rewards/margins": 0.319771409034729, + "rewards/rejected": -5.0000081062316895, + "sft_loss": 4.4230875968933105, + "step": 1730 + }, + { + "epoch": 0.9285833751463456, + "grad_norm": 0.5320410614211661, + "learning_rate": 8.721346239904355e-07, + "logits/chosen": -0.6621562242507935, + "logits/rejected": -0.38414087891578674, + "logps/chosen": -4.588827610015869, + "logps/rejected": -5.170763969421387, + "loss": 0.0532, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.588827610015869, + "rewards/margins": 0.5819366574287415, + "rewards/rejected": -5.170763969421387, + "sft_loss": 4.343879699707031, + "step": 1735 + }, + { + "epoch": 0.9312594079277471, + "grad_norm": 0.7707100372795159, + "learning_rate": 8.710926628309101e-07, + "logits/chosen": -0.6921442151069641, + "logits/rejected": -0.4576943814754486, + "logps/chosen": -4.3977508544921875, + "logps/rejected": -4.836879730224609, + "loss": 0.0523, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.3977508544921875, + "rewards/margins": 0.4391290545463562, + "rewards/rejected": -4.836879730224609, + "sft_loss": 4.144339561462402, + "step": 1740 + }, + { + "epoch": 0.9339354407091487, + "grad_norm": 0.40155269984358594, + "learning_rate": 8.700471013287424e-07, + "logits/chosen": -0.5236523747444153, + "logits/rejected": -0.5326138734817505, + "logps/chosen": -4.680853366851807, + "logps/rejected": -4.947844982147217, + "loss": 0.0542, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.680853366851807, + "rewards/margins": 0.266991525888443, + "rewards/rejected": -4.947844982147217, + "sft_loss": 4.386083126068115, + "step": 1745 + }, + { + "epoch": 0.9366114734905503, + "grad_norm": 0.5819037718837893, + "learning_rate": 8.689979496279746e-07, + "logits/chosen": -0.7221859693527222, + "logits/rejected": -0.6695482730865479, + "logps/chosen": -4.943053722381592, + "logps/rejected": -5.197215557098389, + "loss": 0.0556, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.943053722381592, + "rewards/margins": 0.2541615962982178, + "rewards/rejected": -5.197215557098389, + "sft_loss": 4.715271949768066, + "step": 1750 + }, + { + "epoch": 0.9392875062719518, + "grad_norm": 0.4677346992925345, + "learning_rate": 8.679452179074811e-07, + "logits/chosen": -0.7230926752090454, + "logits/rejected": -0.5767534375190735, + "logps/chosen": -4.456429481506348, + "logps/rejected": -4.846767425537109, + "loss": 0.0529, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.456429481506348, + "rewards/margins": 0.39033815264701843, + "rewards/rejected": -4.846767425537109, + "sft_loss": 4.245125770568848, + "step": 1755 + }, + { + "epoch": 0.9419635390533534, + "grad_norm": 0.9766948042968379, + "learning_rate": 8.668889163808698e-07, + "logits/chosen": -0.6952082514762878, + "logits/rejected": -0.5020134449005127, + "logps/chosen": -4.320244312286377, + "logps/rejected": -4.711088180541992, + "loss": 0.0531, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.320244312286377, + "rewards/margins": 0.3908434510231018, + "rewards/rejected": -4.711088180541992, + "sft_loss": 4.1057281494140625, + "step": 1760 + }, + { + "epoch": 0.944639571834755, + "grad_norm": 0.8113564262746916, + "learning_rate": 8.658290552963827e-07, + "logits/chosen": -0.6360453963279724, + "logits/rejected": -0.5571495294570923, + "logps/chosen": -4.789154529571533, + "logps/rejected": -5.147796154022217, + "loss": 0.0547, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.789154529571533, + "rewards/margins": 0.35864168405532837, + "rewards/rejected": -5.147796154022217, + "sft_loss": 4.56979513168335, + "step": 1765 + }, + { + "epoch": 0.9473156046161565, + "grad_norm": 0.40121347821380127, + "learning_rate": 8.647656449367966e-07, + "logits/chosen": -0.6600741147994995, + "logits/rejected": -0.48249635100364685, + "logps/chosen": -4.793465614318848, + "logps/rejected": -5.12413215637207, + "loss": 0.0543, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.793465614318848, + "rewards/margins": 0.33066678047180176, + "rewards/rejected": -5.12413215637207, + "sft_loss": 4.610315799713135, + "step": 1770 + }, + { + "epoch": 0.9499916373975581, + "grad_norm": 0.4609934021865174, + "learning_rate": 8.636986956193235e-07, + "logits/chosen": -0.7527320981025696, + "logits/rejected": -0.6049268841743469, + "logps/chosen": -4.484477519989014, + "logps/rejected": -4.966238975524902, + "loss": 0.0527, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.484477519989014, + "rewards/margins": 0.4817616045475006, + "rewards/rejected": -4.966238975524902, + "sft_loss": 4.206968307495117, + "step": 1775 + }, + { + "epoch": 0.9526676701789597, + "grad_norm": 0.5990607543889642, + "learning_rate": 8.626282176955104e-07, + "logits/chosen": -0.7125250697135925, + "logits/rejected": -0.5920495390892029, + "logps/chosen": -4.532000541687012, + "logps/rejected": -4.994868278503418, + "loss": 0.053, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.532000541687012, + "rewards/margins": 0.46286827325820923, + "rewards/rejected": -4.994868278503418, + "sft_loss": 4.308735370635986, + "step": 1780 + }, + { + "epoch": 0.9553437029603613, + "grad_norm": 0.6096852450418404, + "learning_rate": 8.615542215511389e-07, + "logits/chosen": -0.6793350577354431, + "logits/rejected": -0.6178755760192871, + "logps/chosen": -4.636038303375244, + "logps/rejected": -4.855711460113525, + "loss": 0.0548, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.636038303375244, + "rewards/margins": 0.21967339515686035, + "rewards/rejected": -4.855711460113525, + "sft_loss": 4.38653564453125, + "step": 1785 + }, + { + "epoch": 0.9580197357417628, + "grad_norm": 0.37934684130563323, + "learning_rate": 8.604767176061241e-07, + "logits/chosen": -0.6598840951919556, + "logits/rejected": -0.5501774549484253, + "logps/chosen": -4.648791313171387, + "logps/rejected": -5.084364414215088, + "loss": 0.0527, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.648791313171387, + "rewards/margins": 0.4355725347995758, + "rewards/rejected": -5.084364414215088, + "sft_loss": 4.3693437576293945, + "step": 1790 + }, + { + "epoch": 0.9606957685231644, + "grad_norm": 0.3543703405808491, + "learning_rate": 8.593957163144141e-07, + "logits/chosen": -0.8177486658096313, + "logits/rejected": -0.6528640985488892, + "logps/chosen": -4.524102210998535, + "logps/rejected": -5.0155134201049805, + "loss": 0.0526, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.524102210998535, + "rewards/margins": 0.4914116859436035, + "rewards/rejected": -5.0155134201049805, + "sft_loss": 4.297591686248779, + "step": 1795 + }, + { + "epoch": 0.963371801304566, + "grad_norm": 0.587837761567623, + "learning_rate": 8.58311228163888e-07, + "logits/chosen": -0.7575830817222595, + "logits/rejected": -0.6973570585250854, + "logps/chosen": -4.548133850097656, + "logps/rejected": -4.882990837097168, + "loss": 0.0534, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.548133850097656, + "rewards/margins": 0.3348572254180908, + "rewards/rejected": -4.882990837097168, + "sft_loss": 4.285576343536377, + "step": 1800 + }, + { + "epoch": 0.9660478340859675, + "grad_norm": 0.7831749014712782, + "learning_rate": 8.57223263676255e-07, + "logits/chosen": -0.827855110168457, + "logits/rejected": -0.6802206039428711, + "logps/chosen": -4.526045799255371, + "logps/rejected": -5.129607200622559, + "loss": 0.0524, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.526045799255371, + "rewards/margins": 0.603561282157898, + "rewards/rejected": -5.129607200622559, + "sft_loss": 4.310399055480957, + "step": 1805 + }, + { + "epoch": 0.9687238668673691, + "grad_norm": 0.4646618216970684, + "learning_rate": 8.561318334069511e-07, + "logits/chosen": -0.695126473903656, + "logits/rejected": -0.5704531073570251, + "logps/chosen": -4.636171340942383, + "logps/rejected": -5.058096408843994, + "loss": 0.0528, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.636171340942383, + "rewards/margins": 0.42192497849464417, + "rewards/rejected": -5.058096408843994, + "sft_loss": 4.393563747406006, + "step": 1810 + }, + { + "epoch": 0.9713998996487707, + "grad_norm": 0.4712524042717296, + "learning_rate": 8.550369479450375e-07, + "logits/chosen": -0.5622086524963379, + "logits/rejected": -0.40890851616859436, + "logps/chosen": -4.376356601715088, + "logps/rejected": -4.874255657196045, + "loss": 0.0527, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.376356601715088, + "rewards/margins": 0.4978991448879242, + "rewards/rejected": -4.874255657196045, + "sft_loss": 4.152787685394287, + "step": 1815 + }, + { + "epoch": 0.9740759324301723, + "grad_norm": 0.34063083871666683, + "learning_rate": 8.539386179130977e-07, + "logits/chosen": -0.49509549140930176, + "logits/rejected": -0.4550551474094391, + "logps/chosen": -4.642989158630371, + "logps/rejected": -5.026967525482178, + "loss": 0.053, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.642989158630371, + "rewards/margins": 0.3839784264564514, + "rewards/rejected": -5.026967525482178, + "sft_loss": 4.345905303955078, + "step": 1820 + }, + { + "epoch": 0.9767519652115738, + "grad_norm": 0.3872496478526205, + "learning_rate": 8.528368539671347e-07, + "logits/chosen": -0.5788191556930542, + "logits/rejected": -0.3689228594303131, + "logps/chosen": -4.652213096618652, + "logps/rejected": -5.309985160827637, + "loss": 0.0531, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.652213096618652, + "rewards/margins": 0.6577725410461426, + "rewards/rejected": -5.309985160827637, + "sft_loss": 4.4433393478393555, + "step": 1825 + }, + { + "epoch": 0.9794279979929754, + "grad_norm": 0.9287089525439055, + "learning_rate": 8.51731666796467e-07, + "logits/chosen": -0.3933059573173523, + "logits/rejected": -0.328712522983551, + "logps/chosen": -4.452856540679932, + "logps/rejected": -4.835246562957764, + "loss": 0.0532, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.452856540679932, + "rewards/margins": 0.38238975405693054, + "rewards/rejected": -4.835246562957764, + "sft_loss": 4.1753621101379395, + "step": 1830 + }, + { + "epoch": 0.982104030774377, + "grad_norm": 0.5941262114362317, + "learning_rate": 8.506230671236254e-07, + "logits/chosen": -0.5021569132804871, + "logits/rejected": -0.39401504397392273, + "logps/chosen": -4.731034278869629, + "logps/rejected": -5.092568397521973, + "loss": 0.0532, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.731034278869629, + "rewards/margins": 0.3615338206291199, + "rewards/rejected": -5.092568397521973, + "sft_loss": 4.504293441772461, + "step": 1835 + }, + { + "epoch": 0.9847800635557785, + "grad_norm": 0.34993440604791326, + "learning_rate": 8.495110657042488e-07, + "logits/chosen": -0.47997450828552246, + "logits/rejected": -0.2725691795349121, + "logps/chosen": -4.54381799697876, + "logps/rejected": -5.032081127166748, + "loss": 0.0538, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.54381799697876, + "rewards/margins": 0.4882632791996002, + "rewards/rejected": -5.032081127166748, + "sft_loss": 4.36675500869751, + "step": 1840 + }, + { + "epoch": 0.9874560963371801, + "grad_norm": 0.4308703258389709, + "learning_rate": 8.483956733269799e-07, + "logits/chosen": -0.4384092688560486, + "logits/rejected": -0.3426581919193268, + "logps/chosen": -4.591732978820801, + "logps/rejected": -4.909899711608887, + "loss": 0.0531, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.591732978820801, + "rewards/margins": 0.31816643476486206, + "rewards/rejected": -4.909899711608887, + "sft_loss": 4.256570816040039, + "step": 1845 + }, + { + "epoch": 0.9901321291185817, + "grad_norm": 0.3473658903289487, + "learning_rate": 8.472769008133602e-07, + "logits/chosen": -0.6959007978439331, + "logits/rejected": -0.5554211735725403, + "logps/chosen": -4.5757036209106445, + "logps/rejected": -4.93710994720459, + "loss": 0.0537, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.5757036209106445, + "rewards/margins": 0.3614066243171692, + "rewards/rejected": -4.93710994720459, + "sft_loss": 4.2838358879089355, + "step": 1850 + }, + { + "epoch": 0.9928081618999832, + "grad_norm": 0.4555695972184036, + "learning_rate": 8.461547590177259e-07, + "logits/chosen": -0.557471513748169, + "logits/rejected": -0.37859243154525757, + "logps/chosen": -4.733495712280273, + "logps/rejected": -5.202332973480225, + "loss": 0.0515, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.733495712280273, + "rewards/margins": 0.4688374996185303, + "rewards/rejected": -5.202332973480225, + "sft_loss": 4.25360631942749, + "step": 1855 + }, + { + "epoch": 0.9954841946813848, + "grad_norm": 0.43279035633127416, + "learning_rate": 8.450292588271014e-07, + "logits/chosen": -0.604584276676178, + "logits/rejected": -0.4833999276161194, + "logps/chosen": -4.606520652770996, + "logps/rejected": -5.053445339202881, + "loss": 0.0523, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.606520652770996, + "rewards/margins": 0.44692516326904297, + "rewards/rejected": -5.053445339202881, + "sft_loss": 4.216244220733643, + "step": 1860 + }, + { + "epoch": 0.9981602274627864, + "grad_norm": 0.5329839703813494, + "learning_rate": 8.439004111610945e-07, + "logits/chosen": -0.5827174782752991, + "logits/rejected": -0.5101505517959595, + "logps/chosen": -4.813737869262695, + "logps/rejected": -5.12241268157959, + "loss": 0.0533, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.813737869262695, + "rewards/margins": 0.30867481231689453, + "rewards/rejected": -5.12241268157959, + "sft_loss": 4.392908573150635, + "step": 1865 + }, + { + "epoch": 1.000836260244188, + "grad_norm": 0.3367694694066443, + "learning_rate": 8.427682269717901e-07, + "logits/chosen": -0.6478679776191711, + "logits/rejected": -0.4914635717868805, + "logps/chosen": -4.561735153198242, + "logps/rejected": -5.053101539611816, + "loss": 0.0525, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.561735153198242, + "rewards/margins": 0.49136656522750854, + "rewards/rejected": -5.053101539611816, + "sft_loss": 4.242537498474121, + "step": 1870 + }, + { + "epoch": 1.0035122930255895, + "grad_norm": 0.5834097301481967, + "learning_rate": 8.416327172436446e-07, + "logits/chosen": -0.8005746603012085, + "logits/rejected": -0.6261088848114014, + "logps/chosen": -4.756998538970947, + "logps/rejected": -5.0415496826171875, + "loss": 0.0542, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.756998538970947, + "rewards/margins": 0.28455111384391785, + "rewards/rejected": -5.0415496826171875, + "sft_loss": 4.461302757263184, + "step": 1875 + }, + { + "epoch": 1.0061883258069912, + "grad_norm": 0.3973798220767986, + "learning_rate": 8.404938929933778e-07, + "logits/chosen": -0.6104685664176941, + "logits/rejected": -0.4688517153263092, + "logps/chosen": -4.621617794036865, + "logps/rejected": -5.099360466003418, + "loss": 0.052, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.621617794036865, + "rewards/margins": 0.47774267196655273, + "rewards/rejected": -5.099360466003418, + "sft_loss": 4.235894203186035, + "step": 1880 + }, + { + "epoch": 1.0088643585883927, + "grad_norm": 0.3540909470534527, + "learning_rate": 8.39351765269868e-07, + "logits/chosen": -0.6760979890823364, + "logits/rejected": -0.609889805316925, + "logps/chosen": -4.65677547454834, + "logps/rejected": -5.0093488693237305, + "loss": 0.0539, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.65677547454834, + "rewards/margins": 0.3525732159614563, + "rewards/rejected": -5.0093488693237305, + "sft_loss": 4.3292412757873535, + "step": 1885 + }, + { + "epoch": 1.0115403913697942, + "grad_norm": 0.7267303197089038, + "learning_rate": 8.382063451540431e-07, + "logits/chosen": -0.7155488729476929, + "logits/rejected": -0.45979467034339905, + "logps/chosen": -4.453604698181152, + "logps/rejected": -4.879714488983154, + "loss": 0.0533, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.453604698181152, + "rewards/margins": 0.4261098802089691, + "rewards/rejected": -4.879714488983154, + "sft_loss": 4.265443801879883, + "step": 1890 + }, + { + "epoch": 1.014216424151196, + "grad_norm": 0.4939611442573637, + "learning_rate": 8.370576437587742e-07, + "logits/chosen": -0.6435521841049194, + "logits/rejected": -0.6209043264389038, + "logps/chosen": -4.716995716094971, + "logps/rejected": -5.006557464599609, + "loss": 0.0531, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.716995716094971, + "rewards/margins": 0.28956204652786255, + "rewards/rejected": -5.006557464599609, + "sft_loss": 4.35626220703125, + "step": 1895 + }, + { + "epoch": 1.0168924569325974, + "grad_norm": 0.41701206009092195, + "learning_rate": 8.359056722287674e-07, + "logits/chosen": -0.7903778553009033, + "logits/rejected": -0.4564805030822754, + "logps/chosen": -4.639558792114258, + "logps/rejected": -5.135783672332764, + "loss": 0.0524, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.639558792114258, + "rewards/margins": 0.49622488021850586, + "rewards/rejected": -5.135783672332764, + "sft_loss": 4.399038791656494, + "step": 1900 + }, + { + "epoch": 1.019568489713999, + "grad_norm": 0.7172297640459512, + "learning_rate": 8.347504417404553e-07, + "logits/chosen": -0.5581150054931641, + "logits/rejected": -0.4032842218875885, + "logps/chosen": -4.544151306152344, + "logps/rejected": -4.86648416519165, + "loss": 0.0542, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.544151306152344, + "rewards/margins": 0.32233288884162903, + "rewards/rejected": -4.86648416519165, + "sft_loss": 4.188483238220215, + "step": 1905 + }, + { + "epoch": 1.0222445224954007, + "grad_norm": 0.5879181967847867, + "learning_rate": 8.335919635018893e-07, + "logits/chosen": -0.7893685102462769, + "logits/rejected": -0.6554333567619324, + "logps/chosen": -4.7575788497924805, + "logps/rejected": -5.040469169616699, + "loss": 0.0538, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.7575788497924805, + "rewards/margins": 0.2828896641731262, + "rewards/rejected": -5.040469169616699, + "sft_loss": 4.464777946472168, + "step": 1910 + }, + { + "epoch": 1.0249205552768021, + "grad_norm": 0.43231433946425346, + "learning_rate": 8.324302487526303e-07, + "logits/chosen": -0.7919416427612305, + "logits/rejected": -0.6639026403427124, + "logps/chosen": -4.676050186157227, + "logps/rejected": -5.0417985916137695, + "loss": 0.0529, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.676050186157227, + "rewards/margins": 0.36574864387512207, + "rewards/rejected": -5.0417985916137695, + "sft_loss": 4.378249168395996, + "step": 1915 + }, + { + "epoch": 1.0275965880582036, + "grad_norm": 0.37404157282655504, + "learning_rate": 8.312653087636398e-07, + "logits/chosen": -0.7558452486991882, + "logits/rejected": -0.6526741981506348, + "logps/chosen": -4.579874515533447, + "logps/rejected": -4.968623161315918, + "loss": 0.0534, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.579874515533447, + "rewards/margins": 0.38874801993370056, + "rewards/rejected": -4.968623161315918, + "sft_loss": 4.276757717132568, + "step": 1920 + }, + { + "epoch": 1.0302726208396054, + "grad_norm": 0.36303918679572256, + "learning_rate": 8.300971548371711e-07, + "logits/chosen": -0.883111834526062, + "logits/rejected": -0.602270781993866, + "logps/chosen": -4.452870845794678, + "logps/rejected": -4.911247730255127, + "loss": 0.052, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.452870845794678, + "rewards/margins": 0.45837679505348206, + "rewards/rejected": -4.911247730255127, + "sft_loss": 4.131048202514648, + "step": 1925 + }, + { + "epoch": 1.0329486536210069, + "grad_norm": 0.490560671457728, + "learning_rate": 8.289257983066582e-07, + "logits/chosen": -0.7516878843307495, + "logits/rejected": -0.5508066415786743, + "logps/chosen": -4.64019775390625, + "logps/rejected": -5.095065593719482, + "loss": 0.0528, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.64019775390625, + "rewards/margins": 0.4548683166503906, + "rewards/rejected": -5.095065593719482, + "sft_loss": 4.342806339263916, + "step": 1930 + }, + { + "epoch": 1.0356246864024083, + "grad_norm": 0.2893276344274073, + "learning_rate": 8.277512505366077e-07, + "logits/chosen": -0.8057788014411926, + "logits/rejected": -0.5403832197189331, + "logps/chosen": -4.649588108062744, + "logps/rejected": -5.086615085601807, + "loss": 0.0531, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.649588108062744, + "rewards/margins": 0.4370269179344177, + "rewards/rejected": -5.086615085601807, + "sft_loss": 4.376861095428467, + "step": 1935 + }, + { + "epoch": 1.03830071918381, + "grad_norm": 0.7350489462981791, + "learning_rate": 8.265735229224868e-07, + "logits/chosen": -0.6775953769683838, + "logits/rejected": -0.5745862722396851, + "logps/chosen": -4.5659379959106445, + "logps/rejected": -5.098782062530518, + "loss": 0.052, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.5659379959106445, + "rewards/margins": 0.5328438878059387, + "rewards/rejected": -5.098782062530518, + "sft_loss": 4.174136161804199, + "step": 1940 + }, + { + "epoch": 1.0409767519652116, + "grad_norm": 0.4516735988201357, + "learning_rate": 8.253926268906144e-07, + "logits/chosen": -0.7164157629013062, + "logits/rejected": -0.5405601859092712, + "logps/chosen": -4.749704837799072, + "logps/rejected": -5.371539115905762, + "loss": 0.0527, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.749704837799072, + "rewards/margins": 0.6218348145484924, + "rewards/rejected": -5.371539115905762, + "sft_loss": 4.391979694366455, + "step": 1945 + }, + { + "epoch": 1.043652784746613, + "grad_norm": 0.3510630604415655, + "learning_rate": 8.242085738980487e-07, + "logits/chosen": -0.5420058965682983, + "logits/rejected": -0.2874351739883423, + "logps/chosen": -4.528346061706543, + "logps/rejected": -5.030544281005859, + "loss": 0.0531, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.528346061706543, + "rewards/margins": 0.5021986961364746, + "rewards/rejected": -5.030544281005859, + "sft_loss": 4.249731540679932, + "step": 1950 + }, + { + "epoch": 1.0463288175280148, + "grad_norm": 0.5538265414870637, + "learning_rate": 8.230213754324772e-07, + "logits/chosen": -0.6377596259117126, + "logits/rejected": -0.5630909204483032, + "logps/chosen": -4.5644450187683105, + "logps/rejected": -4.965079307556152, + "loss": 0.0539, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.5644450187683105, + "rewards/margins": 0.40063467621803284, + "rewards/rejected": -4.965079307556152, + "sft_loss": 4.360657215118408, + "step": 1955 + }, + { + "epoch": 1.0490048503094163, + "grad_norm": 0.2949614545223873, + "learning_rate": 8.218310430121045e-07, + "logits/chosen": -0.5675156712532043, + "logits/rejected": -0.5481168031692505, + "logps/chosen": -4.707152366638184, + "logps/rejected": -4.992379188537598, + "loss": 0.0539, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.707152366638184, + "rewards/margins": 0.2852264642715454, + "rewards/rejected": -4.992379188537598, + "sft_loss": 4.478783130645752, + "step": 1960 + }, + { + "epoch": 1.051680883090818, + "grad_norm": 0.5543113807704609, + "learning_rate": 8.20637588185541e-07, + "logits/chosen": -0.5608028173446655, + "logits/rejected": -0.46988582611083984, + "logps/chosen": -4.442027568817139, + "logps/rejected": -4.943470478057861, + "loss": 0.0531, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.442027568817139, + "rewards/margins": 0.5014427900314331, + "rewards/rejected": -4.943470478057861, + "sft_loss": 4.245416164398193, + "step": 1965 + }, + { + "epoch": 1.0543569158722195, + "grad_norm": 0.2866117529311254, + "learning_rate": 8.194410225316906e-07, + "logits/chosen": -0.7090498208999634, + "logits/rejected": -0.5142877101898193, + "logps/chosen": -4.616601467132568, + "logps/rejected": -5.014645576477051, + "loss": 0.0529, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.616601467132568, + "rewards/margins": 0.39804330468177795, + "rewards/rejected": -5.014645576477051, + "sft_loss": 4.298870086669922, + "step": 1970 + }, + { + "epoch": 1.057032948653621, + "grad_norm": 0.35295190473633326, + "learning_rate": 8.182413576596385e-07, + "logits/chosen": -0.5405132174491882, + "logits/rejected": -0.48180103302001953, + "logps/chosen": -4.607327938079834, + "logps/rejected": -5.084053039550781, + "loss": 0.0532, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.607327938079834, + "rewards/margins": 0.4767250418663025, + "rewards/rejected": -5.084053039550781, + "sft_loss": 4.384486675262451, + "step": 1975 + }, + { + "epoch": 1.0597089814350227, + "grad_norm": 0.3776636814624771, + "learning_rate": 8.170386052085389e-07, + "logits/chosen": -0.5723873376846313, + "logits/rejected": -0.44267210364341736, + "logps/chosen": -4.660275459289551, + "logps/rejected": -5.051526069641113, + "loss": 0.0534, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.660275459289551, + "rewards/margins": 0.39125025272369385, + "rewards/rejected": -5.051526069641113, + "sft_loss": 4.37464714050293, + "step": 1980 + }, + { + "epoch": 1.0623850142164242, + "grad_norm": 0.5159615230619127, + "learning_rate": 8.158327768475008e-07, + "logits/chosen": -0.6213208436965942, + "logits/rejected": -0.4670190215110779, + "logps/chosen": -4.665583610534668, + "logps/rejected": -4.967374801635742, + "loss": 0.0529, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.665583610534668, + "rewards/margins": 0.30179107189178467, + "rewards/rejected": -4.967374801635742, + "sft_loss": 4.264622211456299, + "step": 1985 + }, + { + "epoch": 1.0650610469978257, + "grad_norm": 0.44400302792011925, + "learning_rate": 8.146238842754767e-07, + "logits/chosen": -0.6752325296401978, + "logits/rejected": -0.5508004426956177, + "logps/chosen": -4.6823554039001465, + "logps/rejected": -5.140440940856934, + "loss": 0.0533, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.6823554039001465, + "rewards/margins": 0.45808520913124084, + "rewards/rejected": -5.140440940856934, + "sft_loss": 4.321152210235596, + "step": 1990 + }, + { + "epoch": 1.0677370797792274, + "grad_norm": 0.7057418884353844, + "learning_rate": 8.134119392211476e-07, + "logits/chosen": -0.5761233568191528, + "logits/rejected": -0.38899824023246765, + "logps/chosen": -4.757379055023193, + "logps/rejected": -5.293605804443359, + "loss": 0.0528, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.757379055023193, + "rewards/margins": 0.5362268686294556, + "rewards/rejected": -5.293605804443359, + "sft_loss": 4.403469085693359, + "step": 1995 + }, + { + "epoch": 1.0704131125606289, + "grad_norm": 0.49240902562783034, + "learning_rate": 8.121969534428094e-07, + "logits/chosen": -0.7272932529449463, + "logits/rejected": -0.5523107051849365, + "logps/chosen": -4.743283271789551, + "logps/rejected": -5.234428882598877, + "loss": 0.0537, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.743283271789551, + "rewards/margins": 0.4911455512046814, + "rewards/rejected": -5.234428882598877, + "sft_loss": 4.387631416320801, + "step": 2000 + }, + { + "epoch": 1.0704131125606289, + "eval_logits/chosen": -0.23999834060668945, + "eval_logits/rejected": -0.15501756966114044, + "eval_logps/chosen": -4.426951885223389, + "eval_logps/rejected": -4.892354488372803, + "eval_loss": 0.051702603697776794, + "eval_rewards/accuracies": 0.6468842625617981, + "eval_rewards/chosen": -4.426951885223389, + "eval_rewards/margins": 0.4654025435447693, + "eval_rewards/rejected": -4.892354488372803, + "eval_runtime": 43.2906, + "eval_samples_per_second": 31.069, + "eval_sft_loss": 3.9661672115325928, + "eval_steps_per_second": 7.785, + "step": 2000 + }, + { + "epoch": 1.0730891453420304, + "grad_norm": 0.4600706793047822, + "learning_rate": 8.109789387282599e-07, + "logits/chosen": -0.6717931032180786, + "logits/rejected": -0.6304608583450317, + "logps/chosen": -4.575514793395996, + "logps/rejected": -4.920340538024902, + "loss": 0.0535, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.575514793395996, + "rewards/margins": 0.3448256552219391, + "rewards/rejected": -4.920340538024902, + "sft_loss": 4.244610786437988, + "step": 2005 + }, + { + "epoch": 1.075765178123432, + "grad_norm": 0.3970379766667623, + "learning_rate": 8.097579068946827e-07, + "logits/chosen": -0.7990378141403198, + "logits/rejected": -0.6704200506210327, + "logps/chosen": -4.71152400970459, + "logps/rejected": -5.189643859863281, + "loss": 0.0522, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.71152400970459, + "rewards/margins": 0.4781200885772705, + "rewards/rejected": -5.189643859863281, + "sft_loss": 4.398639678955078, + "step": 2010 + }, + { + "epoch": 1.0784412109048336, + "grad_norm": 0.5944728609383647, + "learning_rate": 8.085338697885344e-07, + "logits/chosen": -0.7603497505187988, + "logits/rejected": -0.6357791423797607, + "logps/chosen": -4.704463481903076, + "logps/rejected": -5.05355978012085, + "loss": 0.0532, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.704463481903076, + "rewards/margins": 0.3490960896015167, + "rewards/rejected": -5.05355978012085, + "sft_loss": 4.395869255065918, + "step": 2015 + }, + { + "epoch": 1.081117243686235, + "grad_norm": 0.37605343362394694, + "learning_rate": 8.073068392854282e-07, + "logits/chosen": -0.826943039894104, + "logits/rejected": -0.5462093949317932, + "logps/chosen": -4.333527565002441, + "logps/rejected": -4.886292457580566, + "loss": 0.0514, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.333527565002441, + "rewards/margins": 0.5527652502059937, + "rewards/rejected": -4.886292457580566, + "sft_loss": 4.111708641052246, + "step": 2020 + }, + { + "epoch": 1.0837932764676368, + "grad_norm": 0.7179662433757041, + "learning_rate": 8.060768272900193e-07, + "logits/chosen": -0.6332185864448547, + "logits/rejected": -0.4485379755496979, + "logps/chosen": -4.430613040924072, + "logps/rejected": -4.984936237335205, + "loss": 0.0525, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.430613040924072, + "rewards/margins": 0.5543233156204224, + "rewards/rejected": -4.984936237335205, + "sft_loss": 4.166750907897949, + "step": 2025 + }, + { + "epoch": 1.0864693092490383, + "grad_norm": 0.35942591017215597, + "learning_rate": 8.0484384573589e-07, + "logits/chosen": -0.6678962111473083, + "logits/rejected": -0.6808444857597351, + "logps/chosen": -4.590447425842285, + "logps/rejected": -4.973687648773193, + "loss": 0.0534, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.590447425842285, + "rewards/margins": 0.3832399249076843, + "rewards/rejected": -4.973687648773193, + "sft_loss": 4.327738285064697, + "step": 2030 + }, + { + "epoch": 1.0891453420304398, + "grad_norm": 0.3819578692381291, + "learning_rate": 8.03607906585432e-07, + "logits/chosen": -0.7403281331062317, + "logits/rejected": -0.5643646121025085, + "logps/chosen": -4.80985164642334, + "logps/rejected": -5.229228496551514, + "loss": 0.0534, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.80985164642334, + "rewards/margins": 0.4193764328956604, + "rewards/rejected": -5.229228496551514, + "sft_loss": 4.544407844543457, + "step": 2035 + }, + { + "epoch": 1.0918213748118415, + "grad_norm": 0.4776251797363461, + "learning_rate": 8.023690218297329e-07, + "logits/chosen": -0.6995172500610352, + "logits/rejected": -0.6782066226005554, + "logps/chosen": -4.589086532592773, + "logps/rejected": -4.949246406555176, + "loss": 0.0522, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.589086532592773, + "rewards/margins": 0.36015990376472473, + "rewards/rejected": -4.949246406555176, + "sft_loss": 4.224146842956543, + "step": 2040 + }, + { + "epoch": 1.094497407593243, + "grad_norm": 0.7479474801927108, + "learning_rate": 8.01127203488458e-07, + "logits/chosen": -0.5383197665214539, + "logits/rejected": -0.48996859788894653, + "logps/chosen": -4.540228366851807, + "logps/rejected": -5.0145792961120605, + "loss": 0.0537, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.540228366851807, + "rewards/margins": 0.47435110807418823, + "rewards/rejected": -5.0145792961120605, + "sft_loss": 4.176971435546875, + "step": 2045 + }, + { + "epoch": 1.0971734403746445, + "grad_norm": 0.4776403747608541, + "learning_rate": 7.998824636097339e-07, + "logits/chosen": -0.6831840872764587, + "logits/rejected": -0.5318336486816406, + "logps/chosen": -4.693176746368408, + "logps/rejected": -5.142681121826172, + "loss": 0.0541, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.693176746368408, + "rewards/margins": 0.44950443506240845, + "rewards/rejected": -5.142681121826172, + "sft_loss": 4.48852014541626, + "step": 2050 + }, + { + "epoch": 1.0998494731560462, + "grad_norm": 0.5140746201417399, + "learning_rate": 7.986348142700328e-07, + "logits/chosen": -0.7236698269844055, + "logits/rejected": -0.5705364346504211, + "logps/chosen": -4.719154357910156, + "logps/rejected": -5.148931980133057, + "loss": 0.0535, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.719154357910156, + "rewards/margins": 0.42977744340896606, + "rewards/rejected": -5.148931980133057, + "sft_loss": 4.52241325378418, + "step": 2055 + }, + { + "epoch": 1.1025255059374477, + "grad_norm": 0.4179464858196893, + "learning_rate": 7.973842675740539e-07, + "logits/chosen": -0.609150767326355, + "logits/rejected": -0.5410366058349609, + "logps/chosen": -4.482192039489746, + "logps/rejected": -4.9732208251953125, + "loss": 0.052, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.482192039489746, + "rewards/margins": 0.49102896451950073, + "rewards/rejected": -4.9732208251953125, + "sft_loss": 4.188296318054199, + "step": 2060 + }, + { + "epoch": 1.1052015387188494, + "grad_norm": 0.5180053361561886, + "learning_rate": 7.961308356546066e-07, + "logits/chosen": -0.7119373083114624, + "logits/rejected": -0.5679913759231567, + "logps/chosen": -4.229228973388672, + "logps/rejected": -4.735593318939209, + "loss": 0.0523, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.229228973388672, + "rewards/margins": 0.5063642263412476, + "rewards/rejected": -4.735593318939209, + "sft_loss": 3.9967262744903564, + "step": 2065 + }, + { + "epoch": 1.107877571500251, + "grad_norm": 0.46718871726056116, + "learning_rate": 7.948745306724931e-07, + "logits/chosen": -0.7739778757095337, + "logits/rejected": -0.6170490980148315, + "logps/chosen": -4.635968208312988, + "logps/rejected": -5.1852192878723145, + "loss": 0.0518, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.635968208312988, + "rewards/margins": 0.549250602722168, + "rewards/rejected": -5.1852192878723145, + "sft_loss": 4.369393825531006, + "step": 2070 + }, + { + "epoch": 1.1105536042816524, + "grad_norm": 0.3984587230629792, + "learning_rate": 7.936153648163897e-07, + "logits/chosen": -0.8065184354782104, + "logits/rejected": -0.7064257860183716, + "logps/chosen": -4.752967834472656, + "logps/rejected": -5.092961311340332, + "loss": 0.0534, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.752967834472656, + "rewards/margins": 0.33999359607696533, + "rewards/rejected": -5.092961311340332, + "sft_loss": 4.481800079345703, + "step": 2075 + }, + { + "epoch": 1.1132296370630541, + "grad_norm": 0.5695032759468597, + "learning_rate": 7.92353350302729e-07, + "logits/chosen": -0.7206646800041199, + "logits/rejected": -0.548992931842804, + "logps/chosen": -4.335890293121338, + "logps/rejected": -4.820664882659912, + "loss": 0.0531, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.335890293121338, + "rewards/margins": 0.48477450013160706, + "rewards/rejected": -4.820664882659912, + "sft_loss": 4.050368785858154, + "step": 2080 + }, + { + "epoch": 1.1159056698444556, + "grad_norm": 0.5829877306762072, + "learning_rate": 7.910884993755816e-07, + "logits/chosen": -0.7600020170211792, + "logits/rejected": -0.6812421679496765, + "logps/chosen": -4.553534984588623, + "logps/rejected": -5.294806480407715, + "loss": 0.0522, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.553534984588623, + "rewards/margins": 0.741270899772644, + "rewards/rejected": -5.294806480407715, + "sft_loss": 4.326718807220459, + "step": 2085 + }, + { + "epoch": 1.118581702625857, + "grad_norm": 0.8262505274696786, + "learning_rate": 7.898208243065367e-07, + "logits/chosen": -0.7206606864929199, + "logits/rejected": -0.7447828054428101, + "logps/chosen": -4.787585258483887, + "logps/rejected": -5.069046497344971, + "loss": 0.0538, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.787585258483887, + "rewards/margins": 0.2814616560935974, + "rewards/rejected": -5.069046497344971, + "sft_loss": 4.394356727600098, + "step": 2090 + }, + { + "epoch": 1.1212577354072588, + "grad_norm": 0.5172306289125376, + "learning_rate": 7.88550337394583e-07, + "logits/chosen": -0.8299128413200378, + "logits/rejected": -0.6698902249336243, + "logps/chosen": -4.428835391998291, + "logps/rejected": -4.9404144287109375, + "loss": 0.0525, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.428835391998291, + "rewards/margins": 0.5115790367126465, + "rewards/rejected": -4.9404144287109375, + "sft_loss": 4.22437047958374, + "step": 2095 + }, + { + "epoch": 1.1239337681886603, + "grad_norm": 0.3443635655516794, + "learning_rate": 7.872770509659905e-07, + "logits/chosen": -0.6373104453086853, + "logits/rejected": -0.5988988876342773, + "logps/chosen": -4.619525909423828, + "logps/rejected": -4.976244926452637, + "loss": 0.0525, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.619525909423828, + "rewards/margins": 0.3567189574241638, + "rewards/rejected": -4.976244926452637, + "sft_loss": 4.2509589195251465, + "step": 2100 + }, + { + "epoch": 1.1266098009700618, + "grad_norm": 0.3976039731943874, + "learning_rate": 7.860009773741896e-07, + "logits/chosen": -0.5825362801551819, + "logits/rejected": -0.4224637448787689, + "logps/chosen": -4.629540920257568, + "logps/rejected": -5.204381465911865, + "loss": 0.0522, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.629540920257568, + "rewards/margins": 0.5748408436775208, + "rewards/rejected": -5.204381465911865, + "sft_loss": 4.34817361831665, + "step": 2105 + }, + { + "epoch": 1.1292858337514635, + "grad_norm": 0.39212041229796935, + "learning_rate": 7.84722128999652e-07, + "logits/chosen": -0.6080501675605774, + "logits/rejected": -0.45994147658348083, + "logps/chosen": -4.713656425476074, + "logps/rejected": -5.265693187713623, + "loss": 0.0538, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.713656425476074, + "rewards/margins": 0.5520361661911011, + "rewards/rejected": -5.265693187713623, + "sft_loss": 4.4130964279174805, + "step": 2110 + }, + { + "epoch": 1.131961866532865, + "grad_norm": 1.2673650048653755, + "learning_rate": 7.834405182497699e-07, + "logits/chosen": -0.49251309037208557, + "logits/rejected": -0.4597319960594177, + "logps/chosen": -4.524658203125, + "logps/rejected": -4.95531702041626, + "loss": 0.0544, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.524658203125, + "rewards/margins": 0.43065857887268066, + "rewards/rejected": -4.95531702041626, + "sft_loss": 4.2953691482543945, + "step": 2115 + }, + { + "epoch": 1.1346378993142665, + "grad_norm": 0.5472533304467926, + "learning_rate": 7.821561575587368e-07, + "logits/chosen": -0.632826566696167, + "logits/rejected": -0.6065362095832825, + "logps/chosen": -4.617916584014893, + "logps/rejected": -4.967799663543701, + "loss": 0.0539, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.617916584014893, + "rewards/margins": 0.34988293051719666, + "rewards/rejected": -4.967799663543701, + "sft_loss": 4.453722953796387, + "step": 2120 + }, + { + "epoch": 1.1373139320956682, + "grad_norm": 0.277069241699628, + "learning_rate": 7.808690593874254e-07, + "logits/chosen": -0.7417432069778442, + "logits/rejected": -0.6385709047317505, + "logps/chosen": -4.6788010597229, + "logps/rejected": -5.170589923858643, + "loss": 0.0548, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.6788010597229, + "rewards/margins": 0.491788774728775, + "rewards/rejected": -5.170589923858643, + "sft_loss": 4.516565322875977, + "step": 2125 + }, + { + "epoch": 1.1399899648770697, + "grad_norm": 0.47122826813421975, + "learning_rate": 7.79579236223268e-07, + "logits/chosen": -0.6222350001335144, + "logits/rejected": -0.3234289288520813, + "logps/chosen": -4.440484523773193, + "logps/rejected": -4.997524261474609, + "loss": 0.0512, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.440484523773193, + "rewards/margins": 0.5570399165153503, + "rewards/rejected": -4.997524261474609, + "sft_loss": 4.1651482582092285, + "step": 2130 + }, + { + "epoch": 1.1426659976584714, + "grad_norm": 0.44261863106415245, + "learning_rate": 7.782867005801346e-07, + "logits/chosen": -0.6503061056137085, + "logits/rejected": -0.42080339789390564, + "logps/chosen": -4.290419578552246, + "logps/rejected": -4.850780963897705, + "loss": 0.0529, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.290419578552246, + "rewards/margins": 0.560361385345459, + "rewards/rejected": -4.850780963897705, + "sft_loss": 4.068234443664551, + "step": 2135 + }, + { + "epoch": 1.145342030439873, + "grad_norm": 0.6257258204652094, + "learning_rate": 7.769914649982117e-07, + "logits/chosen": -0.6751757264137268, + "logits/rejected": -0.5070086717605591, + "logps/chosen": -4.67188024520874, + "logps/rejected": -5.1316142082214355, + "loss": 0.0526, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.67188024520874, + "rewards/margins": 0.4597338140010834, + "rewards/rejected": -5.1316142082214355, + "sft_loss": 4.425201416015625, + "step": 2140 + }, + { + "epoch": 1.1480180632212744, + "grad_norm": 0.5302938713487861, + "learning_rate": 7.756935420438803e-07, + "logits/chosen": -0.6334782838821411, + "logits/rejected": -0.5502496957778931, + "logps/chosen": -4.623457908630371, + "logps/rejected": -5.174169540405273, + "loss": 0.0517, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.623457908630371, + "rewards/margins": 0.5507121086120605, + "rewards/rejected": -5.174169540405273, + "sft_loss": 4.292878150939941, + "step": 2145 + }, + { + "epoch": 1.1506940960026761, + "grad_norm": 0.6070596328759367, + "learning_rate": 7.743929443095951e-07, + "logits/chosen": -0.6257916688919067, + "logits/rejected": -0.5801526308059692, + "logps/chosen": -4.365163326263428, + "logps/rejected": -4.9333815574646, + "loss": 0.0531, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.365163326263428, + "rewards/margins": 0.568219006061554, + "rewards/rejected": -4.9333815574646, + "sft_loss": 4.0889482498168945, + "step": 2150 + }, + { + "epoch": 1.1533701287840776, + "grad_norm": 0.6839084267850211, + "learning_rate": 7.730896844137609e-07, + "logits/chosen": -0.6672028303146362, + "logits/rejected": -0.5766544342041016, + "logps/chosen": -4.858401775360107, + "logps/rejected": -5.171161651611328, + "loss": 0.0538, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.858401775360107, + "rewards/margins": 0.3127599358558655, + "rewards/rejected": -5.171161651611328, + "sft_loss": 4.545053005218506, + "step": 2155 + }, + { + "epoch": 1.1560461615654791, + "grad_norm": 0.3568908807824978, + "learning_rate": 7.717837750006106e-07, + "logits/chosen": -0.7871010303497314, + "logits/rejected": -0.7272329330444336, + "logps/chosen": -4.782454490661621, + "logps/rejected": -5.240635871887207, + "loss": 0.0531, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.782454490661621, + "rewards/margins": 0.45818084478378296, + "rewards/rejected": -5.240635871887207, + "sft_loss": 4.542524337768555, + "step": 2160 + }, + { + "epoch": 1.1587221943468808, + "grad_norm": 0.5990662498663287, + "learning_rate": 7.704752287400832e-07, + "logits/chosen": -0.73150235414505, + "logits/rejected": -0.5180791020393372, + "logps/chosen": -4.3465166091918945, + "logps/rejected": -4.793034076690674, + "loss": 0.0532, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.3465166091918945, + "rewards/margins": 0.44651785492897034, + "rewards/rejected": -4.793034076690674, + "sft_loss": 4.138575553894043, + "step": 2165 + }, + { + "epoch": 1.1613982271282823, + "grad_norm": 0.27564186929080403, + "learning_rate": 7.691640583277004e-07, + "logits/chosen": -0.6673997640609741, + "logits/rejected": -0.48911604285240173, + "logps/chosen": -4.416393756866455, + "logps/rejected": -4.9265360832214355, + "loss": 0.0521, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.416393756866455, + "rewards/margins": 0.5101426839828491, + "rewards/rejected": -4.9265360832214355, + "sft_loss": 4.156350135803223, + "step": 2170 + }, + { + "epoch": 1.1640742599096838, + "grad_norm": 0.4813794132035719, + "learning_rate": 7.678502764844433e-07, + "logits/chosen": -0.7979347705841064, + "logits/rejected": -0.5554312467575073, + "logps/chosen": -4.576869964599609, + "logps/rejected": -4.963016986846924, + "loss": 0.0526, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.576869964599609, + "rewards/margins": 0.3861469328403473, + "rewards/rejected": -4.963016986846924, + "sft_loss": 4.301800727844238, + "step": 2175 + }, + { + "epoch": 1.1667502926910855, + "grad_norm": 0.3189556223539962, + "learning_rate": 7.665338959566288e-07, + "logits/chosen": -0.68720543384552, + "logits/rejected": -0.6039665937423706, + "logps/chosen": -4.594615459442139, + "logps/rejected": -5.138156414031982, + "loss": 0.0517, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.594615459442139, + "rewards/margins": 0.5435405969619751, + "rewards/rejected": -5.138156414031982, + "sft_loss": 4.311945915222168, + "step": 2180 + }, + { + "epoch": 1.169426325472487, + "grad_norm": 0.7522816530936208, + "learning_rate": 7.652149295157868e-07, + "logits/chosen": -0.5866094827651978, + "logits/rejected": -0.40514469146728516, + "logps/chosen": -4.510667324066162, + "logps/rejected": -4.878499984741211, + "loss": 0.053, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.510667324066162, + "rewards/margins": 0.3678319752216339, + "rewards/rejected": -4.878499984741211, + "sft_loss": 4.197674751281738, + "step": 2185 + }, + { + "epoch": 1.1721023582538885, + "grad_norm": 0.4463112427262321, + "learning_rate": 7.638933899585354e-07, + "logits/chosen": -0.44559282064437866, + "logits/rejected": -0.45199769735336304, + "logps/chosen": -4.532196998596191, + "logps/rejected": -5.0780205726623535, + "loss": 0.0524, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.532196998596191, + "rewards/margins": 0.5458240509033203, + "rewards/rejected": -5.0780205726623535, + "sft_loss": 4.232436180114746, + "step": 2190 + }, + { + "epoch": 1.1747783910352902, + "grad_norm": 0.457643428777109, + "learning_rate": 7.625692901064573e-07, + "logits/chosen": -0.6123597025871277, + "logits/rejected": -0.5176479816436768, + "logps/chosen": -4.659605026245117, + "logps/rejected": -5.072868347167969, + "loss": 0.0551, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -4.659605026245117, + "rewards/margins": 0.41326403617858887, + "rewards/rejected": -5.072868347167969, + "sft_loss": 4.42981481552124, + "step": 2195 + }, + { + "epoch": 1.1774544238166917, + "grad_norm": 0.39494702241247864, + "learning_rate": 7.61242642805975e-07, + "logits/chosen": -0.7008353471755981, + "logits/rejected": -0.7373430132865906, + "logps/chosen": -4.8912858963012695, + "logps/rejected": -5.152310848236084, + "loss": 0.0543, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -4.8912858963012695, + "rewards/margins": 0.2610251009464264, + "rewards/rejected": -5.152310848236084, + "sft_loss": 4.551990509033203, + "step": 2200 + }, + { + "epoch": 1.1801304565980932, + "grad_norm": 0.3579528644218798, + "learning_rate": 7.599134609282266e-07, + "logits/chosen": -0.8311487436294556, + "logits/rejected": -0.5965268015861511, + "logps/chosen": -4.442378044128418, + "logps/rejected": -4.958862781524658, + "loss": 0.053, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.442378044128418, + "rewards/margins": 0.516484797000885, + "rewards/rejected": -4.958862781524658, + "sft_loss": 4.260416030883789, + "step": 2205 + }, + { + "epoch": 1.182806489379495, + "grad_norm": 0.44236643382684815, + "learning_rate": 7.585817573689402e-07, + "logits/chosen": -0.7978760600090027, + "logits/rejected": -0.6854046583175659, + "logps/chosen": -4.419394016265869, + "logps/rejected": -5.014368057250977, + "loss": 0.0518, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.419394016265869, + "rewards/margins": 0.5949746370315552, + "rewards/rejected": -5.014368057250977, + "sft_loss": 4.203858852386475, + "step": 2210 + }, + { + "epoch": 1.1854825221608964, + "grad_norm": 0.4060038941710105, + "learning_rate": 7.572475450483098e-07, + "logits/chosen": -0.704419732093811, + "logits/rejected": -0.5912919044494629, + "logps/chosen": -4.481686115264893, + "logps/rejected": -5.0178141593933105, + "loss": 0.0522, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.481686115264893, + "rewards/margins": 0.5361284017562866, + "rewards/rejected": -5.0178141593933105, + "sft_loss": 4.2278852462768555, + "step": 2215 + }, + { + "epoch": 1.188158554942298, + "grad_norm": 0.5070512938760626, + "learning_rate": 7.559108369108689e-07, + "logits/chosen": -0.7109116315841675, + "logits/rejected": -0.5894996523857117, + "logps/chosen": -4.390748500823975, + "logps/rejected": -4.943523406982422, + "loss": 0.0529, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.390748500823975, + "rewards/margins": 0.5527750253677368, + "rewards/rejected": -4.943523406982422, + "sft_loss": 4.148125648498535, + "step": 2220 + }, + { + "epoch": 1.1908345877236997, + "grad_norm": 0.5159637234340495, + "learning_rate": 7.54571645925366e-07, + "logits/chosen": -0.8002594709396362, + "logits/rejected": -0.5314058661460876, + "logps/chosen": -4.7372260093688965, + "logps/rejected": -5.4383015632629395, + "loss": 0.0517, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.7372260093688965, + "rewards/margins": 0.7010757327079773, + "rewards/rejected": -5.4383015632629395, + "sft_loss": 4.354250907897949, + "step": 2225 + }, + { + "epoch": 1.1935106205051011, + "grad_norm": 0.45824704280534556, + "learning_rate": 7.532299850846378e-07, + "logits/chosen": -0.7148901224136353, + "logits/rejected": -0.5222693681716919, + "logps/chosen": -4.647156715393066, + "logps/rejected": -5.444609642028809, + "loss": 0.0521, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.647156715393066, + "rewards/margins": 0.7974528074264526, + "rewards/rejected": -5.444609642028809, + "sft_loss": 4.288125038146973, + "step": 2230 + }, + { + "epoch": 1.1961866532865026, + "grad_norm": 0.6411370012537118, + "learning_rate": 7.518858674054838e-07, + "logits/chosen": -0.658902645111084, + "logits/rejected": -0.44083452224731445, + "logps/chosen": -4.602587699890137, + "logps/rejected": -5.086310386657715, + "loss": 0.0529, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.602587699890137, + "rewards/margins": 0.48372262716293335, + "rewards/rejected": -5.086310386657715, + "sft_loss": 4.2280964851379395, + "step": 2235 + }, + { + "epoch": 1.1988626860679044, + "grad_norm": 0.4630768053585857, + "learning_rate": 7.505393059285394e-07, + "logits/chosen": -0.7535982728004456, + "logits/rejected": -0.5658230781555176, + "logps/chosen": -4.64790153503418, + "logps/rejected": -5.012650966644287, + "loss": 0.0535, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.64790153503418, + "rewards/margins": 0.36474916338920593, + "rewards/rejected": -5.012650966644287, + "sft_loss": 4.332772254943848, + "step": 2240 + }, + { + "epoch": 1.2015387188493059, + "grad_norm": 0.47993843494898725, + "learning_rate": 7.491903137181501e-07, + "logits/chosen": -0.7029468417167664, + "logits/rejected": -0.6749275326728821, + "logps/chosen": -4.604931831359863, + "logps/rejected": -4.9890336990356445, + "loss": 0.0536, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.604931831359863, + "rewards/margins": 0.3841017186641693, + "rewards/rejected": -4.9890336990356445, + "sft_loss": 4.398990631103516, + "step": 2245 + }, + { + "epoch": 1.2042147516307076, + "grad_norm": 0.41159973312127757, + "learning_rate": 7.478389038622441e-07, + "logits/chosen": -0.6323720216751099, + "logits/rejected": -0.6110953092575073, + "logps/chosen": -4.512185096740723, + "logps/rejected": -4.970728874206543, + "loss": 0.0522, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.512185096740723, + "rewards/margins": 0.45854368805885315, + "rewards/rejected": -4.970728874206543, + "sft_loss": 4.247210502624512, + "step": 2250 + }, + { + "epoch": 1.206890784412109, + "grad_norm": 0.4682841990616749, + "learning_rate": 7.46485089472206e-07, + "logits/chosen": -0.7587865591049194, + "logits/rejected": -0.7158206701278687, + "logps/chosen": -4.485775947570801, + "logps/rejected": -4.8737568855285645, + "loss": 0.0537, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.485775947570801, + "rewards/margins": 0.3879804015159607, + "rewards/rejected": -4.8737568855285645, + "sft_loss": 4.248816013336182, + "step": 2255 + }, + { + "epoch": 1.2095668171935106, + "grad_norm": 0.44708077693442627, + "learning_rate": 7.451288836827487e-07, + "logits/chosen": -0.7529920339584351, + "logits/rejected": -0.7776705622673035, + "logps/chosen": -4.722033500671387, + "logps/rejected": -5.016490459442139, + "loss": 0.0538, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.722033500671387, + "rewards/margins": 0.2944570481777191, + "rewards/rejected": -5.016490459442139, + "sft_loss": 4.490199089050293, + "step": 2260 + }, + { + "epoch": 1.2122428499749123, + "grad_norm": 0.5230816480612343, + "learning_rate": 7.437702996517869e-07, + "logits/chosen": -0.80247563123703, + "logits/rejected": -0.7050091028213501, + "logps/chosen": -4.602765083312988, + "logps/rejected": -4.979498863220215, + "loss": 0.0529, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.602765083312988, + "rewards/margins": 0.37673383951187134, + "rewards/rejected": -4.979498863220215, + "sft_loss": 4.365364074707031, + "step": 2265 + }, + { + "epoch": 1.2149188827563138, + "grad_norm": 0.4773651019537365, + "learning_rate": 7.424093505603087e-07, + "logits/chosen": -0.8925487399101257, + "logits/rejected": -0.6971911191940308, + "logps/chosen": -4.3991804122924805, + "logps/rejected": -4.9190897941589355, + "loss": 0.0517, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.3991804122924805, + "rewards/margins": 0.5199095010757446, + "rewards/rejected": -4.9190897941589355, + "sft_loss": 4.129302024841309, + "step": 2270 + }, + { + "epoch": 1.2175949155377153, + "grad_norm": 0.5743721034439623, + "learning_rate": 7.410460496122482e-07, + "logits/chosen": -0.7015866637229919, + "logits/rejected": -0.5496960878372192, + "logps/chosen": -4.501893043518066, + "logps/rejected": -5.030624866485596, + "loss": 0.0515, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.501893043518066, + "rewards/margins": 0.5287320017814636, + "rewards/rejected": -5.030624866485596, + "sft_loss": 4.109658718109131, + "step": 2275 + }, + { + "epoch": 1.220270948319117, + "grad_norm": 0.5284106323606012, + "learning_rate": 7.396804100343572e-07, + "logits/chosen": -0.8011897206306458, + "logits/rejected": -0.5881227254867554, + "logps/chosen": -4.559887409210205, + "logps/rejected": -5.120944023132324, + "loss": 0.0513, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.559887409210205, + "rewards/margins": 0.5610562562942505, + "rewards/rejected": -5.120944023132324, + "sft_loss": 4.201657295227051, + "step": 2280 + }, + { + "epoch": 1.2229469811005185, + "grad_norm": 0.4225976306052054, + "learning_rate": 7.383124450760768e-07, + "logits/chosen": -0.7284170985221863, + "logits/rejected": -0.5309914946556091, + "logps/chosen": -4.601205348968506, + "logps/rejected": -5.269949913024902, + "loss": 0.0518, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.601205348968506, + "rewards/margins": 0.668745219707489, + "rewards/rejected": -5.269949913024902, + "sft_loss": 4.3044843673706055, + "step": 2285 + }, + { + "epoch": 1.22562301388192, + "grad_norm": 0.7224182412559759, + "learning_rate": 7.369421680094091e-07, + "logits/chosen": -0.7828829884529114, + "logits/rejected": -0.6176687479019165, + "logps/chosen": -4.525793552398682, + "logps/rejected": -5.050825119018555, + "loss": 0.0525, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.525793552398682, + "rewards/margins": 0.5250317454338074, + "rewards/rejected": -5.050825119018555, + "sft_loss": 4.251994609832764, + "step": 2290 + }, + { + "epoch": 1.2282990466633217, + "grad_norm": 0.629047772424287, + "learning_rate": 7.355695921287881e-07, + "logits/chosen": -0.7749170064926147, + "logits/rejected": -0.6841250061988831, + "logps/chosen": -4.591757297515869, + "logps/rejected": -5.037539482116699, + "loss": 0.0536, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.591757297515869, + "rewards/margins": 0.4457823634147644, + "rewards/rejected": -5.037539482116699, + "sft_loss": 4.263556480407715, + "step": 2295 + }, + { + "epoch": 1.2309750794447232, + "grad_norm": 0.33053420351210816, + "learning_rate": 7.341947307509513e-07, + "logits/chosen": -0.7545934915542603, + "logits/rejected": -0.6486998796463013, + "logps/chosen": -4.742500305175781, + "logps/rejected": -5.1753973960876465, + "loss": 0.053, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.742500305175781, + "rewards/margins": 0.4328971803188324, + "rewards/rejected": -5.1753973960876465, + "sft_loss": 4.4836931228637695, + "step": 2300 + }, + { + "epoch": 1.233651112226125, + "grad_norm": 0.4071489993833283, + "learning_rate": 7.328175972148094e-07, + "logits/chosen": -0.7530182600021362, + "logits/rejected": -0.6168416142463684, + "logps/chosen": -4.746242523193359, + "logps/rejected": -5.2826247215271, + "loss": 0.0534, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.746242523193359, + "rewards/margins": 0.5363827347755432, + "rewards/rejected": -5.2826247215271, + "sft_loss": 4.459273815155029, + "step": 2305 + }, + { + "epoch": 1.2363271450075264, + "grad_norm": 0.5441597534623708, + "learning_rate": 7.314382048813185e-07, + "logits/chosen": -0.7168510556221008, + "logits/rejected": -0.43174856901168823, + "logps/chosen": -4.272377967834473, + "logps/rejected": -4.978228569030762, + "loss": 0.0509, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.272377967834473, + "rewards/margins": 0.7058510780334473, + "rewards/rejected": -4.978228569030762, + "sft_loss": 4.000433444976807, + "step": 2310 + }, + { + "epoch": 1.2390031777889279, + "grad_norm": 0.6252127343519807, + "learning_rate": 7.300565671333486e-07, + "logits/chosen": -0.8265350461006165, + "logits/rejected": -0.6041692495346069, + "logps/chosen": -4.5154876708984375, + "logps/rejected": -5.120383262634277, + "loss": 0.0516, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.5154876708984375, + "rewards/margins": 0.6048959493637085, + "rewards/rejected": -5.120383262634277, + "sft_loss": 4.2072625160217285, + "step": 2315 + }, + { + "epoch": 1.2416792105703296, + "grad_norm": 0.4222806528582792, + "learning_rate": 7.286726973755554e-07, + "logits/chosen": -0.7032849192619324, + "logits/rejected": -0.6845365762710571, + "logps/chosen": -4.55859375, + "logps/rejected": -5.039181709289551, + "loss": 0.0526, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.55859375, + "rewards/margins": 0.4805881083011627, + "rewards/rejected": -5.039181709289551, + "sft_loss": 4.184603691101074, + "step": 2320 + }, + { + "epoch": 1.244355243351731, + "grad_norm": 0.5106826580742952, + "learning_rate": 7.272866090342493e-07, + "logits/chosen": -0.5765025615692139, + "logits/rejected": -0.5348777770996094, + "logps/chosen": -4.575560569763184, + "logps/rejected": -5.200422286987305, + "loss": 0.0513, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.575560569763184, + "rewards/margins": 0.6248610615730286, + "rewards/rejected": -5.200422286987305, + "sft_loss": 4.18874979019165, + "step": 2325 + }, + { + "epoch": 1.2470312761331326, + "grad_norm": 0.36219022238136406, + "learning_rate": 7.258983155572656e-07, + "logits/chosen": -0.7583931684494019, + "logits/rejected": -0.6806284189224243, + "logps/chosen": -4.756449222564697, + "logps/rejected": -5.231799125671387, + "loss": 0.0529, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.756449222564697, + "rewards/margins": 0.4753497540950775, + "rewards/rejected": -5.231799125671387, + "sft_loss": 4.347174644470215, + "step": 2330 + }, + { + "epoch": 1.2497073089145343, + "grad_norm": 0.7028939653486833, + "learning_rate": 7.245078304138335e-07, + "logits/chosen": -0.676438570022583, + "logits/rejected": -0.617641270160675, + "logps/chosen": -4.50102424621582, + "logps/rejected": -5.152174472808838, + "loss": 0.052, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.50102424621582, + "rewards/margins": 0.6511501669883728, + "rewards/rejected": -5.152174472808838, + "sft_loss": 4.230152606964111, + "step": 2335 + }, + { + "epoch": 1.2523833416959358, + "grad_norm": 0.34441628678949693, + "learning_rate": 7.231151670944462e-07, + "logits/chosen": -0.9201499223709106, + "logits/rejected": -0.6691521406173706, + "logps/chosen": -4.548752307891846, + "logps/rejected": -5.115777969360352, + "loss": 0.0525, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.548752307891846, + "rewards/margins": 0.5670259594917297, + "rewards/rejected": -5.115777969360352, + "sft_loss": 4.256083965301514, + "step": 2340 + }, + { + "epoch": 1.2550593744773373, + "grad_norm": 0.36028138123982145, + "learning_rate": 7.217203391107291e-07, + "logits/chosen": -0.8337070345878601, + "logits/rejected": -0.6574937105178833, + "logps/chosen": -4.54742431640625, + "logps/rejected": -5.139768600463867, + "loss": 0.0526, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.54742431640625, + "rewards/margins": 0.5923444032669067, + "rewards/rejected": -5.139768600463867, + "sft_loss": 4.214510440826416, + "step": 2345 + }, + { + "epoch": 1.257735407258739, + "grad_norm": 0.5578957686131796, + "learning_rate": 7.203233599953096e-07, + "logits/chosen": -0.8540999293327332, + "logits/rejected": -0.6997717618942261, + "logps/chosen": -4.4829535484313965, + "logps/rejected": -5.033291816711426, + "loss": 0.0524, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.4829535484313965, + "rewards/margins": 0.5503381490707397, + "rewards/rejected": -5.033291816711426, + "sft_loss": 4.220416069030762, + "step": 2350 + }, + { + "epoch": 1.2604114400401405, + "grad_norm": 0.4751570065353926, + "learning_rate": 7.189242433016852e-07, + "logits/chosen": -0.778272271156311, + "logits/rejected": -0.6525672674179077, + "logps/chosen": -4.632129669189453, + "logps/rejected": -5.244833946228027, + "loss": 0.0526, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.632129669189453, + "rewards/margins": 0.6127038598060608, + "rewards/rejected": -5.244833946228027, + "sft_loss": 4.3556694984436035, + "step": 2355 + }, + { + "epoch": 1.263087472821542, + "grad_norm": 0.49060819947586853, + "learning_rate": 7.17523002604092e-07, + "logits/chosen": -0.8550931811332703, + "logits/rejected": -0.6443161368370056, + "logps/chosen": -4.523386001586914, + "logps/rejected": -5.068552017211914, + "loss": 0.0526, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.523386001586914, + "rewards/margins": 0.5451655983924866, + "rewards/rejected": -5.068552017211914, + "sft_loss": 4.249871253967285, + "step": 2360 + }, + { + "epoch": 1.2657635056029437, + "grad_norm": 0.8862204290910091, + "learning_rate": 7.161196514973734e-07, + "logits/chosen": -0.7608314752578735, + "logits/rejected": -0.5959222912788391, + "logps/chosen": -4.416409969329834, + "logps/rejected": -5.110037326812744, + "loss": 0.0538, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -4.416409969329834, + "rewards/margins": 0.6936279535293579, + "rewards/rejected": -5.110037326812744, + "sft_loss": 4.177985191345215, + "step": 2365 + }, + { + "epoch": 1.2684395383843452, + "grad_norm": 0.36966512161661286, + "learning_rate": 7.147142035968483e-07, + "logits/chosen": -0.7292311787605286, + "logits/rejected": -0.5311886668205261, + "logps/chosen": -4.622040748596191, + "logps/rejected": -5.120987415313721, + "loss": 0.053, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.622040748596191, + "rewards/margins": 0.49894601106643677, + "rewards/rejected": -5.120987415313721, + "sft_loss": 4.340848445892334, + "step": 2370 + }, + { + "epoch": 1.2711155711657467, + "grad_norm": 0.6284592603398197, + "learning_rate": 7.133066725381781e-07, + "logits/chosen": -0.9135788679122925, + "logits/rejected": -0.701241135597229, + "logps/chosen": -4.508200645446777, + "logps/rejected": -5.073288917541504, + "loss": 0.0527, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.508200645446777, + "rewards/margins": 0.565088152885437, + "rewards/rejected": -5.073288917541504, + "sft_loss": 4.304009437561035, + "step": 2375 + }, + { + "epoch": 1.2737916039471484, + "grad_norm": 0.4830072872353554, + "learning_rate": 7.118970719772354e-07, + "logits/chosen": -0.8243740200996399, + "logits/rejected": -0.6196537017822266, + "logps/chosen": -4.614374160766602, + "logps/rejected": -5.266146659851074, + "loss": 0.0523, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.614374160766602, + "rewards/margins": 0.651772141456604, + "rewards/rejected": -5.266146659851074, + "sft_loss": 4.351268768310547, + "step": 2380 + }, + { + "epoch": 1.27646763672855, + "grad_norm": 0.4899922503825756, + "learning_rate": 7.104854155899711e-07, + "logits/chosen": -0.676455020904541, + "logits/rejected": -0.5914028286933899, + "logps/chosen": -4.441977024078369, + "logps/rejected": -4.935439109802246, + "loss": 0.0524, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.441977024078369, + "rewards/margins": 0.4934620261192322, + "rewards/rejected": -4.935439109802246, + "sft_loss": 4.1658124923706055, + "step": 2385 + }, + { + "epoch": 1.2791436695099514, + "grad_norm": 0.4339700877673433, + "learning_rate": 7.090717170722817e-07, + "logits/chosen": -0.6420737504959106, + "logits/rejected": -0.6251760721206665, + "logps/chosen": -4.5333662033081055, + "logps/rejected": -5.1688055992126465, + "loss": 0.0515, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.5333662033081055, + "rewards/margins": 0.6354392766952515, + "rewards/rejected": -5.1688055992126465, + "sft_loss": 4.251795768737793, + "step": 2390 + }, + { + "epoch": 1.2818197022913531, + "grad_norm": 0.5061167164386356, + "learning_rate": 7.076559901398762e-07, + "logits/chosen": -0.7426393628120422, + "logits/rejected": -0.6046496629714966, + "logps/chosen": -4.450405120849609, + "logps/rejected": -4.918197154998779, + "loss": 0.0527, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.450405120849609, + "rewards/margins": 0.46779197454452515, + "rewards/rejected": -4.918197154998779, + "sft_loss": 4.215243339538574, + "step": 2395 + }, + { + "epoch": 1.2844957350727546, + "grad_norm": 0.5037524653154726, + "learning_rate": 7.062382485281436e-07, + "logits/chosen": -0.64927077293396, + "logits/rejected": -0.513473391532898, + "logps/chosen": -4.409026145935059, + "logps/rejected": -4.936314105987549, + "loss": 0.0533, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.409026145935059, + "rewards/margins": 0.5272881388664246, + "rewards/rejected": -4.936314105987549, + "sft_loss": 4.165338039398193, + "step": 2400 + }, + { + "epoch": 1.2844957350727546, + "eval_logits/chosen": -0.2459660917520523, + "eval_logits/rejected": -0.15563316643238068, + "eval_logps/chosen": -4.822916507720947, + "eval_logps/rejected": -5.425734519958496, + "eval_loss": 0.051439397037029266, + "eval_rewards/accuracies": 0.6632047295570374, + "eval_rewards/chosen": -4.822916507720947, + "eval_rewards/margins": 0.6028181910514832, + "eval_rewards/rejected": -5.425734519958496, + "eval_runtime": 43.403, + "eval_samples_per_second": 30.989, + "eval_sft_loss": 4.406880855560303, + "eval_steps_per_second": 7.764, + "step": 2400 + }, + { + "epoch": 1.287171767854156, + "grad_norm": 0.7512180519376704, + "learning_rate": 7.048185059920193e-07, + "logits/chosen": -0.6440736651420593, + "logits/rejected": -0.5108321905136108, + "logps/chosen": -4.895911693572998, + "logps/rejected": -5.490865230560303, + "loss": 0.0528, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.895911693572998, + "rewards/margins": 0.5949534773826599, + "rewards/rejected": -5.490865230560303, + "sft_loss": 4.582104206085205, + "step": 2405 + }, + { + "epoch": 1.2898478006355578, + "grad_norm": 1.0070888677024856, + "learning_rate": 7.033967763058516e-07, + "logits/chosen": -0.722671389579773, + "logits/rejected": -0.5478376150131226, + "logps/chosen": -4.615888595581055, + "logps/rejected": -5.002371788024902, + "loss": 0.0521, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.615888595581055, + "rewards/margins": 0.386482298374176, + "rewards/rejected": -5.002371788024902, + "sft_loss": 4.239572048187256, + "step": 2410 + }, + { + "epoch": 1.2925238334169593, + "grad_norm": 0.4258529079691424, + "learning_rate": 7.019730732632681e-07, + "logits/chosen": -0.5446482300758362, + "logits/rejected": -0.43018198013305664, + "logps/chosen": -4.422608375549316, + "logps/rejected": -5.125778675079346, + "loss": 0.0518, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.422608375549316, + "rewards/margins": 0.7031702995300293, + "rewards/rejected": -5.125778675079346, + "sft_loss": 4.148711681365967, + "step": 2415 + }, + { + "epoch": 1.2951998661983608, + "grad_norm": 0.35467819540066825, + "learning_rate": 7.005474106770418e-07, + "logits/chosen": -0.6596757173538208, + "logits/rejected": -0.5504968166351318, + "logps/chosen": -4.54642391204834, + "logps/rejected": -5.045216083526611, + "loss": 0.0521, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.54642391204834, + "rewards/margins": 0.4987919330596924, + "rewards/rejected": -5.045216083526611, + "sft_loss": 4.238245964050293, + "step": 2420 + }, + { + "epoch": 1.2978758989797625, + "grad_norm": 1.3599351436307947, + "learning_rate": 6.991198023789577e-07, + "logits/chosen": -0.5905810594558716, + "logits/rejected": -0.5072682499885559, + "logps/chosen": -4.447530269622803, + "logps/rejected": -4.882772445678711, + "loss": 0.0538, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.447530269622803, + "rewards/margins": 0.43524178862571716, + "rewards/rejected": -4.882772445678711, + "sft_loss": 4.207924842834473, + "step": 2425 + }, + { + "epoch": 1.300551931761164, + "grad_norm": 0.5025417009492776, + "learning_rate": 6.976902622196776e-07, + "logits/chosen": -0.6522349119186401, + "logits/rejected": -0.5629934072494507, + "logps/chosen": -4.689788818359375, + "logps/rejected": -5.103816509246826, + "loss": 0.0537, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.689788818359375, + "rewards/margins": 0.41402775049209595, + "rewards/rejected": -5.103816509246826, + "sft_loss": 4.374016761779785, + "step": 2430 + }, + { + "epoch": 1.3032279645425655, + "grad_norm": 0.33461264496064647, + "learning_rate": 6.962588040686064e-07, + "logits/chosen": -0.7623521089553833, + "logits/rejected": -0.5751533508300781, + "logps/chosen": -4.745035171508789, + "logps/rejected": -5.1734185218811035, + "loss": 0.0537, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.745035171508789, + "rewards/margins": 0.42838358879089355, + "rewards/rejected": -5.1734185218811035, + "sft_loss": 4.583779811859131, + "step": 2435 + }, + { + "epoch": 1.3059039973239672, + "grad_norm": 0.3503261781496817, + "learning_rate": 6.948254418137573e-07, + "logits/chosen": -0.712530255317688, + "logits/rejected": -0.5704913139343262, + "logps/chosen": -4.597674369812012, + "logps/rejected": -5.1268134117126465, + "loss": 0.0526, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.597674369812012, + "rewards/margins": 0.5291392803192139, + "rewards/rejected": -5.1268134117126465, + "sft_loss": 4.329079627990723, + "step": 2440 + }, + { + "epoch": 1.3085800301053687, + "grad_norm": 0.39441341051128054, + "learning_rate": 6.933901893616174e-07, + "logits/chosen": -0.7677714824676514, + "logits/rejected": -0.6126347184181213, + "logps/chosen": -4.438683032989502, + "logps/rejected": -5.042284965515137, + "loss": 0.0519, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.438683032989502, + "rewards/margins": 0.6036025881767273, + "rewards/rejected": -5.042284965515137, + "sft_loss": 4.119725227355957, + "step": 2445 + }, + { + "epoch": 1.3112560628867704, + "grad_norm": 0.3927956162770441, + "learning_rate": 6.919530606370121e-07, + "logits/chosen": -0.7254393696784973, + "logits/rejected": -0.5506623983383179, + "logps/chosen": -4.357143402099609, + "logps/rejected": -4.954403877258301, + "loss": 0.0517, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.357143402099609, + "rewards/margins": 0.5972608327865601, + "rewards/rejected": -4.954403877258301, + "sft_loss": 4.115630149841309, + "step": 2450 + }, + { + "epoch": 1.313932095668172, + "grad_norm": 0.3738986864645462, + "learning_rate": 6.905140695829706e-07, + "logits/chosen": -0.7687052488327026, + "logits/rejected": -0.46032652258872986, + "logps/chosen": -4.44050407409668, + "logps/rejected": -5.113463878631592, + "loss": 0.0514, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.44050407409668, + "rewards/margins": 0.6729599237442017, + "rewards/rejected": -5.113463878631592, + "sft_loss": 4.17454719543457, + "step": 2455 + }, + { + "epoch": 1.3166081284495736, + "grad_norm": 0.5571532095288572, + "learning_rate": 6.890732301605904e-07, + "logits/chosen": -0.6341904401779175, + "logits/rejected": -0.5448669195175171, + "logps/chosen": -4.6078996658325195, + "logps/rejected": -5.000533103942871, + "loss": 0.0531, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.6078996658325195, + "rewards/margins": 0.3926334083080292, + "rewards/rejected": -5.000533103942871, + "sft_loss": 4.291760444641113, + "step": 2460 + }, + { + "epoch": 1.3192841612309751, + "grad_norm": 0.3892123944114807, + "learning_rate": 6.876305563489021e-07, + "logits/chosen": -0.6441881060600281, + "logits/rejected": -0.5762056112289429, + "logps/chosen": -4.400938510894775, + "logps/rejected": -5.1529011726379395, + "loss": 0.0504, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.400938510894775, + "rewards/margins": 0.7519630193710327, + "rewards/rejected": -5.1529011726379395, + "sft_loss": 4.062161445617676, + "step": 2465 + }, + { + "epoch": 1.3219601940123766, + "grad_norm": 0.5043097198631741, + "learning_rate": 6.861860621447331e-07, + "logits/chosen": -0.7821289300918579, + "logits/rejected": -0.6575089693069458, + "logps/chosen": -4.7076544761657715, + "logps/rejected": -5.158883094787598, + "loss": 0.0539, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.7076544761657715, + "rewards/margins": 0.451228529214859, + "rewards/rejected": -5.158883094787598, + "sft_loss": 4.450681686401367, + "step": 2470 + }, + { + "epoch": 1.3246362267937783, + "grad_norm": 0.5241208894891419, + "learning_rate": 6.847397615625725e-07, + "logits/chosen": -0.6322156190872192, + "logits/rejected": -0.5924761295318604, + "logps/chosen": -4.701780796051025, + "logps/rejected": -5.140883922576904, + "loss": 0.0522, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.701780796051025, + "rewards/margins": 0.43910354375839233, + "rewards/rejected": -5.140883922576904, + "sft_loss": 4.368111610412598, + "step": 2475 + }, + { + "epoch": 1.3273122595751798, + "grad_norm": 0.5530610503614979, + "learning_rate": 6.83291668634435e-07, + "logits/chosen": -0.7902230024337769, + "logits/rejected": -0.5947554111480713, + "logps/chosen": -4.581168174743652, + "logps/rejected": -5.159932613372803, + "loss": 0.0522, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.581168174743652, + "rewards/margins": 0.5787646174430847, + "rewards/rejected": -5.159932613372803, + "sft_loss": 4.308629512786865, + "step": 2480 + }, + { + "epoch": 1.3299882923565813, + "grad_norm": 0.5053924424200537, + "learning_rate": 6.818417974097246e-07, + "logits/chosen": -0.5258646607398987, + "logits/rejected": -0.38841742277145386, + "logps/chosen": -4.452823638916016, + "logps/rejected": -5.15665864944458, + "loss": 0.0516, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.452823638916016, + "rewards/margins": 0.7038346529006958, + "rewards/rejected": -5.15665864944458, + "sft_loss": 4.177115440368652, + "step": 2485 + }, + { + "epoch": 1.332664325137983, + "grad_norm": 0.46316275553685127, + "learning_rate": 6.803901619550981e-07, + "logits/chosen": -0.6718374490737915, + "logits/rejected": -0.6161580085754395, + "logps/chosen": -4.422329902648926, + "logps/rejected": -4.936065673828125, + "loss": 0.0529, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.422329902648926, + "rewards/margins": 0.5137358903884888, + "rewards/rejected": -4.936065673828125, + "sft_loss": 4.147505283355713, + "step": 2490 + }, + { + "epoch": 1.3353403579193845, + "grad_norm": 0.5381914488734175, + "learning_rate": 6.789367763543292e-07, + "logits/chosen": -0.6015291213989258, + "logits/rejected": -0.5835751295089722, + "logps/chosen": -4.570321559906006, + "logps/rejected": -5.038084983825684, + "loss": 0.0544, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.570321559906006, + "rewards/margins": 0.467763751745224, + "rewards/rejected": -5.038084983825684, + "sft_loss": 4.3525590896606445, + "step": 2495 + }, + { + "epoch": 1.338016390700786, + "grad_norm": 0.7920108460285353, + "learning_rate": 6.774816547081714e-07, + "logits/chosen": -0.6614540815353394, + "logits/rejected": -0.4659046232700348, + "logps/chosen": -4.600603103637695, + "logps/rejected": -5.15115213394165, + "loss": 0.0535, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.600603103637695, + "rewards/margins": 0.5505493879318237, + "rewards/rejected": -5.15115213394165, + "sft_loss": 4.411740303039551, + "step": 2500 + }, + { + "epoch": 1.3406924234821878, + "grad_norm": 0.32826827986852447, + "learning_rate": 6.760248111342211e-07, + "logits/chosen": -0.7377767562866211, + "logits/rejected": -0.5531325340270996, + "logps/chosen": -4.688912868499756, + "logps/rejected": -5.238001823425293, + "loss": 0.0523, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.688912868499756, + "rewards/margins": 0.5490891337394714, + "rewards/rejected": -5.238001823425293, + "sft_loss": 4.432499885559082, + "step": 2505 + }, + { + "epoch": 1.3433684562635893, + "grad_norm": 0.6428193005965454, + "learning_rate": 6.745662597667813e-07, + "logits/chosen": -0.750898540019989, + "logits/rejected": -0.6026118993759155, + "logps/chosen": -4.439126014709473, + "logps/rejected": -4.934445381164551, + "loss": 0.0519, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.439126014709473, + "rewards/margins": 0.49531856179237366, + "rewards/rejected": -4.934445381164551, + "sft_loss": 4.148636817932129, + "step": 2510 + }, + { + "epoch": 1.3460444890449907, + "grad_norm": 0.38982972965941254, + "learning_rate": 6.731060147567236e-07, + "logits/chosen": -0.5894494652748108, + "logits/rejected": -0.5215369462966919, + "logps/chosen": -4.36592960357666, + "logps/rejected": -4.934881210327148, + "loss": 0.0519, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.36592960357666, + "rewards/margins": 0.5689516067504883, + "rewards/rejected": -4.934881210327148, + "sft_loss": 4.115760326385498, + "step": 2515 + }, + { + "epoch": 1.3487205218263925, + "grad_norm": 0.611393333794855, + "learning_rate": 6.716440902713515e-07, + "logits/chosen": -0.6964327096939087, + "logits/rejected": -0.6343324184417725, + "logps/chosen": -4.4690141677856445, + "logps/rejected": -4.8751397132873535, + "loss": 0.0527, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.4690141677856445, + "rewards/margins": 0.406125545501709, + "rewards/rejected": -4.8751397132873535, + "sft_loss": 4.147188663482666, + "step": 2520 + }, + { + "epoch": 1.351396554607794, + "grad_norm": 0.39953792525208676, + "learning_rate": 6.701805004942627e-07, + "logits/chosen": -0.6524925231933594, + "logits/rejected": -0.595403254032135, + "logps/chosen": -4.759545803070068, + "logps/rejected": -5.341307640075684, + "loss": 0.0532, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.759545803070068, + "rewards/margins": 0.5817619562149048, + "rewards/rejected": -5.341307640075684, + "sft_loss": 4.543875694274902, + "step": 2525 + }, + { + "epoch": 1.3540725873891954, + "grad_norm": 0.5767017487926428, + "learning_rate": 6.687152596252119e-07, + "logits/chosen": -0.7337282299995422, + "logits/rejected": -0.6855202913284302, + "logps/chosen": -4.838815212249756, + "logps/rejected": -5.192466735839844, + "loss": 0.0538, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.838815212249756, + "rewards/margins": 0.35365158319473267, + "rewards/rejected": -5.192466735839844, + "sft_loss": 4.54294490814209, + "step": 2530 + }, + { + "epoch": 1.3567486201705972, + "grad_norm": 0.4398488876945579, + "learning_rate": 6.672483818799722e-07, + "logits/chosen": -0.7373114824295044, + "logits/rejected": -0.5846437215805054, + "logps/chosen": -4.462458610534668, + "logps/rejected": -4.956521511077881, + "loss": 0.0528, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.462458610534668, + "rewards/margins": 0.4940629005432129, + "rewards/rejected": -4.956521511077881, + "sft_loss": 4.213451862335205, + "step": 2535 + }, + { + "epoch": 1.3594246529519987, + "grad_norm": 0.3887196561405752, + "learning_rate": 6.657798814901978e-07, + "logits/chosen": -0.7088289260864258, + "logits/rejected": -0.4987329840660095, + "logps/chosen": -4.533008098602295, + "logps/rejected": -4.962521553039551, + "loss": 0.0516, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.533008098602295, + "rewards/margins": 0.4295133650302887, + "rewards/rejected": -4.962521553039551, + "sft_loss": 4.149649143218994, + "step": 2540 + }, + { + "epoch": 1.3621006857334002, + "grad_norm": 0.51660624133589, + "learning_rate": 6.643097727032863e-07, + "logits/chosen": -0.7439020872116089, + "logits/rejected": -0.5362112522125244, + "logps/chosen": -4.448864936828613, + "logps/rejected": -5.117484092712402, + "loss": 0.0516, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.448864936828613, + "rewards/margins": 0.6686197519302368, + "rewards/rejected": -5.117484092712402, + "sft_loss": 4.22825288772583, + "step": 2545 + }, + { + "epoch": 1.3647767185148019, + "grad_norm": 0.5837046444529704, + "learning_rate": 6.628380697822392e-07, + "logits/chosen": -0.7480605244636536, + "logits/rejected": -0.5812298059463501, + "logps/chosen": -4.5380167961120605, + "logps/rejected": -5.00238561630249, + "loss": 0.0526, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.5380167961120605, + "rewards/margins": 0.46436864137649536, + "rewards/rejected": -5.00238561630249, + "sft_loss": 4.235762119293213, + "step": 2550 + }, + { + "epoch": 1.3674527512962034, + "grad_norm": 0.5264631149675114, + "learning_rate": 6.61364787005525e-07, + "logits/chosen": -0.7261337041854858, + "logits/rejected": -0.5831155180931091, + "logps/chosen": -4.57515287399292, + "logps/rejected": -5.160813331604004, + "loss": 0.0529, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.57515287399292, + "rewards/margins": 0.585660457611084, + "rewards/rejected": -5.160813331604004, + "sft_loss": 4.378745079040527, + "step": 2555 + }, + { + "epoch": 1.3701287840776049, + "grad_norm": 0.4918096353013463, + "learning_rate": 6.598899386669395e-07, + "logits/chosen": -0.7453008890151978, + "logits/rejected": -0.6159285306930542, + "logps/chosen": -4.560245513916016, + "logps/rejected": -5.077498435974121, + "loss": 0.0534, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.560245513916016, + "rewards/margins": 0.5172520279884338, + "rewards/rejected": -5.077498435974121, + "sft_loss": 4.279725074768066, + "step": 2560 + }, + { + "epoch": 1.3728048168590066, + "grad_norm": 0.44468809524085257, + "learning_rate": 6.584135390754679e-07, + "logits/chosen": -0.7995618581771851, + "logits/rejected": -0.6510564088821411, + "logps/chosen": -4.504146099090576, + "logps/rejected": -5.2412333488464355, + "loss": 0.0519, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.504146099090576, + "rewards/margins": 0.7370876669883728, + "rewards/rejected": -5.2412333488464355, + "sft_loss": 4.247662544250488, + "step": 2565 + }, + { + "epoch": 1.375480849640408, + "grad_norm": 0.38358992062706415, + "learning_rate": 6.569356025551454e-07, + "logits/chosen": -0.7912853956222534, + "logits/rejected": -0.7308619022369385, + "logps/chosen": -4.615150451660156, + "logps/rejected": -5.038762092590332, + "loss": 0.0521, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.615150451660156, + "rewards/margins": 0.42361217737197876, + "rewards/rejected": -5.038762092590332, + "sft_loss": 4.207141399383545, + "step": 2570 + }, + { + "epoch": 1.3781568824218096, + "grad_norm": 0.36018657240876495, + "learning_rate": 6.554561434449186e-07, + "logits/chosen": -0.9121583104133606, + "logits/rejected": -0.7354962825775146, + "logps/chosen": -4.590145587921143, + "logps/rejected": -5.142067909240723, + "loss": 0.0527, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.590145587921143, + "rewards/margins": 0.5519219636917114, + "rewards/rejected": -5.142067909240723, + "sft_loss": 4.361257076263428, + "step": 2575 + }, + { + "epoch": 1.3808329152032113, + "grad_norm": 0.41305806068904577, + "learning_rate": 6.539751760985063e-07, + "logits/chosen": -0.780017614364624, + "logits/rejected": -0.7046865224838257, + "logps/chosen": -4.54480504989624, + "logps/rejected": -4.859285831451416, + "loss": 0.0538, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.54480504989624, + "rewards/margins": 0.31448012590408325, + "rewards/rejected": -4.859285831451416, + "sft_loss": 4.202549934387207, + "step": 2580 + }, + { + "epoch": 1.3835089479846128, + "grad_norm": 0.3782339057679898, + "learning_rate": 6.524927148842602e-07, + "logits/chosen": -0.7151792049407959, + "logits/rejected": -0.5270150303840637, + "logps/chosen": -4.705697059631348, + "logps/rejected": -5.182741641998291, + "loss": 0.0519, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.705697059631348, + "rewards/margins": 0.47704464197158813, + "rewards/rejected": -5.182741641998291, + "sft_loss": 4.334061622619629, + "step": 2585 + }, + { + "epoch": 1.3861849807660143, + "grad_norm": 0.5868680617117082, + "learning_rate": 6.510087741850254e-07, + "logits/chosen": -0.7695601582527161, + "logits/rejected": -0.6490595936775208, + "logps/chosen": -4.606133460998535, + "logps/rejected": -5.070815086364746, + "loss": 0.0528, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.606133460998535, + "rewards/margins": 0.464682012796402, + "rewards/rejected": -5.070815086364746, + "sft_loss": 4.332033634185791, + "step": 2590 + }, + { + "epoch": 1.388861013547416, + "grad_norm": 0.31923571248340193, + "learning_rate": 6.495233683980012e-07, + "logits/chosen": -0.7917790412902832, + "logits/rejected": -0.757839560508728, + "logps/chosen": -4.3590850830078125, + "logps/rejected": -4.865941047668457, + "loss": 0.0524, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.3590850830078125, + "rewards/margins": 0.5068557858467102, + "rewards/rejected": -4.865941047668457, + "sft_loss": 4.141876220703125, + "step": 2595 + }, + { + "epoch": 1.3915370463288175, + "grad_norm": 0.6110778260580617, + "learning_rate": 6.480365119346011e-07, + "logits/chosen": -0.7061268091201782, + "logits/rejected": -0.5795341730117798, + "logps/chosen": -4.5417585372924805, + "logps/rejected": -4.974133491516113, + "loss": 0.0526, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.5417585372924805, + "rewards/margins": 0.4323754906654358, + "rewards/rejected": -4.974133491516113, + "sft_loss": 4.267592906951904, + "step": 2600 + }, + { + "epoch": 1.394213079110219, + "grad_norm": 0.3968746308711125, + "learning_rate": 6.465482192203129e-07, + "logits/chosen": -0.6922782063484192, + "logits/rejected": -0.6162427663803101, + "logps/chosen": -4.611149787902832, + "logps/rejected": -5.119748115539551, + "loss": 0.0529, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.611149787902832, + "rewards/margins": 0.508597731590271, + "rewards/rejected": -5.119748115539551, + "sft_loss": 4.388543605804443, + "step": 2605 + }, + { + "epoch": 1.3968891118916207, + "grad_norm": 0.43719125204764003, + "learning_rate": 6.45058504694559e-07, + "logits/chosen": -0.6123635768890381, + "logits/rejected": -0.5769099593162537, + "logps/chosen": -4.499598503112793, + "logps/rejected": -5.056746006011963, + "loss": 0.0532, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.499598503112793, + "rewards/margins": 0.557147204875946, + "rewards/rejected": -5.056746006011963, + "sft_loss": 4.259655952453613, + "step": 2610 + }, + { + "epoch": 1.3995651446730222, + "grad_norm": 0.6082587458332754, + "learning_rate": 6.435673828105564e-07, + "logits/chosen": -0.7298904657363892, + "logits/rejected": -0.5628734230995178, + "logps/chosen": -4.518126964569092, + "logps/rejected": -5.07608699798584, + "loss": 0.0534, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.518126964569092, + "rewards/margins": 0.5579599142074585, + "rewards/rejected": -5.07608699798584, + "sft_loss": 4.245197296142578, + "step": 2615 + }, + { + "epoch": 1.402241177454424, + "grad_norm": 0.8367099282361746, + "learning_rate": 6.420748680351763e-07, + "logits/chosen": -0.6793943643569946, + "logits/rejected": -0.7437562942504883, + "logps/chosen": -4.676467418670654, + "logps/rejected": -5.091729164123535, + "loss": 0.0542, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.676467418670654, + "rewards/margins": 0.4152621626853943, + "rewards/rejected": -5.091729164123535, + "sft_loss": 4.483765125274658, + "step": 2620 + }, + { + "epoch": 1.4049172102358254, + "grad_norm": 0.3433494848599413, + "learning_rate": 6.405809748488032e-07, + "logits/chosen": -0.8351935148239136, + "logits/rejected": -0.6800636053085327, + "logps/chosen": -4.701047420501709, + "logps/rejected": -5.2720818519592285, + "loss": 0.0524, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.701047420501709, + "rewards/margins": 0.571033775806427, + "rewards/rejected": -5.2720818519592285, + "sft_loss": 4.444343566894531, + "step": 2625 + }, + { + "epoch": 1.4075932430172269, + "grad_norm": 0.4795161285636052, + "learning_rate": 6.390857177451956e-07, + "logits/chosen": -0.8992173075675964, + "logits/rejected": -0.6891031265258789, + "logps/chosen": -4.538784027099609, + "logps/rejected": -5.005777835845947, + "loss": 0.0531, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.538784027099609, + "rewards/margins": 0.46699291467666626, + "rewards/rejected": -5.005777835845947, + "sft_loss": 4.31063985824585, + "step": 2630 + }, + { + "epoch": 1.4102692757986286, + "grad_norm": 0.7016521627801586, + "learning_rate": 6.375891112313445e-07, + "logits/chosen": -0.8801708221435547, + "logits/rejected": -0.7778578996658325, + "logps/chosen": -4.37764310836792, + "logps/rejected": -4.836381912231445, + "loss": 0.0519, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.37764310836792, + "rewards/margins": 0.4587384760379791, + "rewards/rejected": -4.836381912231445, + "sft_loss": 4.069215774536133, + "step": 2635 + }, + { + "epoch": 1.41294530858003, + "grad_norm": 0.4657342923390584, + "learning_rate": 6.360911698273326e-07, + "logits/chosen": -0.7782545685768127, + "logits/rejected": -0.6499952077865601, + "logps/chosen": -4.533600807189941, + "logps/rejected": -4.894816875457764, + "loss": 0.0543, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.533600807189941, + "rewards/margins": 0.36121657490730286, + "rewards/rejected": -4.894816875457764, + "sft_loss": 4.284762382507324, + "step": 2640 + }, + { + "epoch": 1.4156213413614318, + "grad_norm": 0.47112647697309784, + "learning_rate": 6.345919080661944e-07, + "logits/chosen": -0.9207308888435364, + "logits/rejected": -0.8575568199157715, + "logps/chosen": -4.626215934753418, + "logps/rejected": -5.244388580322266, + "loss": 0.0522, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.626215934753418, + "rewards/margins": 0.6181727051734924, + "rewards/rejected": -5.244388580322266, + "sft_loss": 4.426272392272949, + "step": 2645 + }, + { + "epoch": 1.4182973741428333, + "grad_norm": 0.29671780119762087, + "learning_rate": 6.330913404937737e-07, + "logits/chosen": -0.9336298108100891, + "logits/rejected": -0.7710980176925659, + "logps/chosen": -4.727460861206055, + "logps/rejected": -5.37843132019043, + "loss": 0.0519, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.727460861206055, + "rewards/margins": 0.6509709358215332, + "rewards/rejected": -5.37843132019043, + "sft_loss": 4.423439979553223, + "step": 2650 + }, + { + "epoch": 1.4209734069242348, + "grad_norm": 0.5791699497151932, + "learning_rate": 6.315894816685838e-07, + "logits/chosen": -0.7895804643630981, + "logits/rejected": -0.6132604479789734, + "logps/chosen": -4.567627906799316, + "logps/rejected": -4.8765106201171875, + "loss": 0.0527, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.567627906799316, + "rewards/margins": 0.3088833689689636, + "rewards/rejected": -4.8765106201171875, + "sft_loss": 4.217142105102539, + "step": 2655 + }, + { + "epoch": 1.4236494397056365, + "grad_norm": 0.5073327462343239, + "learning_rate": 6.300863461616657e-07, + "logits/chosen": -0.7717041969299316, + "logits/rejected": -0.6730908155441284, + "logps/chosen": -4.412945747375488, + "logps/rejected": -4.921000003814697, + "loss": 0.053, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.412945747375488, + "rewards/margins": 0.5080535411834717, + "rewards/rejected": -4.921000003814697, + "sft_loss": 4.149726390838623, + "step": 2660 + }, + { + "epoch": 1.426325472487038, + "grad_norm": 0.3239165109984895, + "learning_rate": 6.285819485564465e-07, + "logits/chosen": -0.9396502375602722, + "logits/rejected": -0.7643145322799683, + "logps/chosen": -4.451190948486328, + "logps/rejected": -5.017029762268066, + "loss": 0.0523, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.451190948486328, + "rewards/margins": 0.5658388733863831, + "rewards/rejected": -5.017029762268066, + "sft_loss": 4.155117511749268, + "step": 2665 + }, + { + "epoch": 1.4290015052684395, + "grad_norm": 0.47727707504839423, + "learning_rate": 6.270763034485986e-07, + "logits/chosen": -0.8083940744400024, + "logits/rejected": -0.6736945509910583, + "logps/chosen": -4.661080360412598, + "logps/rejected": -5.173447132110596, + "loss": 0.0516, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.661080360412598, + "rewards/margins": 0.5123669505119324, + "rewards/rejected": -5.173447132110596, + "sft_loss": 4.300747871398926, + "step": 2670 + }, + { + "epoch": 1.4316775380498412, + "grad_norm": 0.49975394521225786, + "learning_rate": 6.255694254458972e-07, + "logits/chosen": -0.9003822207450867, + "logits/rejected": -0.7234520316123962, + "logps/chosen": -4.687695503234863, + "logps/rejected": -5.162412166595459, + "loss": 0.0545, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.687695503234863, + "rewards/margins": 0.474717378616333, + "rewards/rejected": -5.162412166595459, + "sft_loss": 4.253655433654785, + "step": 2675 + }, + { + "epoch": 1.4343535708312427, + "grad_norm": 0.7970279083312815, + "learning_rate": 6.240613291680795e-07, + "logits/chosen": -0.8419411778450012, + "logits/rejected": -0.653814435005188, + "logps/chosen": -4.679329872131348, + "logps/rejected": -5.142355918884277, + "loss": 0.0531, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.679329872131348, + "rewards/margins": 0.4630259871482849, + "rewards/rejected": -5.142355918884277, + "sft_loss": 4.350041389465332, + "step": 2680 + }, + { + "epoch": 1.4370296036126442, + "grad_norm": 0.44742716716600295, + "learning_rate": 6.225520292467021e-07, + "logits/chosen": -0.8692100644111633, + "logits/rejected": -0.6362851858139038, + "logps/chosen": -4.339302062988281, + "logps/rejected": -5.107569217681885, + "loss": 0.05, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.339302062988281, + "rewards/margins": 0.7682672739028931, + "rewards/rejected": -5.107569217681885, + "sft_loss": 4.058346748352051, + "step": 2685 + }, + { + "epoch": 1.439705636394046, + "grad_norm": 0.5956502684124177, + "learning_rate": 6.210415403249993e-07, + "logits/chosen": -0.9499366879463196, + "logits/rejected": -0.6553173065185547, + "logps/chosen": -4.524402141571045, + "logps/rejected": -5.320879936218262, + "loss": 0.051, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.524402141571045, + "rewards/margins": 0.7964780330657959, + "rewards/rejected": -5.320879936218262, + "sft_loss": 4.101487159729004, + "step": 2690 + }, + { + "epoch": 1.4423816691754474, + "grad_norm": 0.6211044159448419, + "learning_rate": 6.195298770577415e-07, + "logits/chosen": -0.7495101094245911, + "logits/rejected": -0.7657794952392578, + "logps/chosen": -4.553832054138184, + "logps/rejected": -5.104920864105225, + "loss": 0.0532, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.553832054138184, + "rewards/margins": 0.551088809967041, + "rewards/rejected": -5.104920864105225, + "sft_loss": 4.29389762878418, + "step": 2695 + }, + { + "epoch": 1.445057701956849, + "grad_norm": 0.3685359376502068, + "learning_rate": 6.180170541110923e-07, + "logits/chosen": -0.82825767993927, + "logits/rejected": -0.6391871571540833, + "logps/chosen": -4.653820514678955, + "logps/rejected": -5.102269649505615, + "loss": 0.0534, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.653820514678955, + "rewards/margins": 0.4484497010707855, + "rewards/rejected": -5.102269649505615, + "sft_loss": 4.338795185089111, + "step": 2700 + }, + { + "epoch": 1.4477337347382506, + "grad_norm": 0.36879136698007464, + "learning_rate": 6.165030861624663e-07, + "logits/chosen": -1.0215765237808228, + "logits/rejected": -0.704619288444519, + "logps/chosen": -4.500130653381348, + "logps/rejected": -5.3099188804626465, + "loss": 0.0514, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.500130653381348, + "rewards/margins": 0.809788703918457, + "rewards/rejected": -5.3099188804626465, + "sft_loss": 4.320158004760742, + "step": 2705 + }, + { + "epoch": 1.4504097675196521, + "grad_norm": 0.35921794005610297, + "learning_rate": 6.149879879003876e-07, + "logits/chosen": -0.736519992351532, + "logits/rejected": -0.7717832326889038, + "logps/chosen": -4.527623176574707, + "logps/rejected": -4.958807945251465, + "loss": 0.0526, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.527623176574707, + "rewards/margins": 0.431184858083725, + "rewards/rejected": -4.958807945251465, + "sft_loss": 4.252358436584473, + "step": 2710 + }, + { + "epoch": 1.4530858003010536, + "grad_norm": 0.4029998204084026, + "learning_rate": 6.13471774024346e-07, + "logits/chosen": -0.9023638963699341, + "logits/rejected": -0.7861912250518799, + "logps/chosen": -4.328760623931885, + "logps/rejected": -4.899123191833496, + "loss": 0.0513, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.328760623931885, + "rewards/margins": 0.5703624486923218, + "rewards/rejected": -4.899123191833496, + "sft_loss": 4.081745624542236, + "step": 2715 + }, + { + "epoch": 1.4557618330824553, + "grad_norm": 0.3181867162536053, + "learning_rate": 6.119544592446551e-07, + "logits/chosen": -0.8461063504219055, + "logits/rejected": -0.7370610237121582, + "logps/chosen": -4.367936611175537, + "logps/rejected": -4.853856086730957, + "loss": 0.0524, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.367936611175537, + "rewards/margins": 0.48591962456703186, + "rewards/rejected": -4.853856086730957, + "sft_loss": 4.085618019104004, + "step": 2720 + }, + { + "epoch": 1.4584378658638568, + "grad_norm": 0.4417312535342543, + "learning_rate": 6.104360582823096e-07, + "logits/chosen": -0.853473961353302, + "logits/rejected": -0.7386851906776428, + "logps/chosen": -4.7194013595581055, + "logps/rejected": -5.207268714904785, + "loss": 0.0531, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.7194013595581055, + "rewards/margins": 0.4878672659397125, + "rewards/rejected": -5.207268714904785, + "sft_loss": 4.425844192504883, + "step": 2725 + }, + { + "epoch": 1.4611138986452583, + "grad_norm": 0.4267754873095855, + "learning_rate": 6.089165858688423e-07, + "logits/chosen": -0.7712498903274536, + "logits/rejected": -0.5837336778640747, + "logps/chosen": -4.546807289123535, + "logps/rejected": -5.284149169921875, + "loss": 0.0507, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.546807289123535, + "rewards/margins": 0.7373424768447876, + "rewards/rejected": -5.284149169921875, + "sft_loss": 4.248849391937256, + "step": 2730 + }, + { + "epoch": 1.46378993142666, + "grad_norm": 0.5243832632135247, + "learning_rate": 6.073960567461811e-07, + "logits/chosen": -0.7328991889953613, + "logits/rejected": -0.5446587800979614, + "logps/chosen": -4.3400068283081055, + "logps/rejected": -5.033965110778809, + "loss": 0.0511, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.3400068283081055, + "rewards/margins": 0.6939582824707031, + "rewards/rejected": -5.033965110778809, + "sft_loss": 4.018763542175293, + "step": 2735 + }, + { + "epoch": 1.4664659642080615, + "grad_norm": 0.43775113695751916, + "learning_rate": 6.058744856665065e-07, + "logits/chosen": -0.7534220218658447, + "logits/rejected": -0.6736117601394653, + "logps/chosen": -4.569155693054199, + "logps/rejected": -5.47993278503418, + "loss": 0.0507, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.569155693054199, + "rewards/margins": 0.9107775688171387, + "rewards/rejected": -5.47993278503418, + "sft_loss": 4.218738555908203, + "step": 2740 + }, + { + "epoch": 1.469141996989463, + "grad_norm": 0.4717035016463936, + "learning_rate": 6.043518873921074e-07, + "logits/chosen": -0.7917760610580444, + "logits/rejected": -0.657507061958313, + "logps/chosen": -4.352883338928223, + "logps/rejected": -4.968850135803223, + "loss": 0.0514, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.352883338928223, + "rewards/margins": 0.6159666776657104, + "rewards/rejected": -4.968850135803223, + "sft_loss": 3.9638657569885254, + "step": 2745 + }, + { + "epoch": 1.4718180297708647, + "grad_norm": 0.45414652768336783, + "learning_rate": 6.028282766952393e-07, + "logits/chosen": -0.7075755596160889, + "logits/rejected": -0.6383055448532104, + "logps/chosen": -4.6790876388549805, + "logps/rejected": -5.391210556030273, + "loss": 0.0507, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.6790876388549805, + "rewards/margins": 0.7121232151985168, + "rewards/rejected": -5.391210556030273, + "sft_loss": 4.254571914672852, + "step": 2750 + }, + { + "epoch": 1.4744940625522662, + "grad_norm": 0.5878041839400147, + "learning_rate": 6.013036683579798e-07, + "logits/chosen": -0.6836889386177063, + "logits/rejected": -0.5880746841430664, + "logps/chosen": -4.447044849395752, + "logps/rejected": -5.132805824279785, + "loss": 0.0517, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.447044849395752, + "rewards/margins": 0.6857603788375854, + "rewards/rejected": -5.132805824279785, + "sft_loss": 4.128294944763184, + "step": 2755 + }, + { + "epoch": 1.4771700953336677, + "grad_norm": 0.44709587630970354, + "learning_rate": 5.997780771720854e-07, + "logits/chosen": -0.8651409149169922, + "logits/rejected": -0.6353691816329956, + "logps/chosen": -4.558709144592285, + "logps/rejected": -5.283412933349609, + "loss": 0.0523, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.558709144592285, + "rewards/margins": 0.7247046232223511, + "rewards/rejected": -5.283412933349609, + "sft_loss": 4.31949520111084, + "step": 2760 + }, + { + "epoch": 1.4798461281150694, + "grad_norm": 0.4967270396229397, + "learning_rate": 5.982515179388486e-07, + "logits/chosen": -0.6980944871902466, + "logits/rejected": -0.5798202753067017, + "logps/chosen": -4.641325950622559, + "logps/rejected": -5.210552215576172, + "loss": 0.0527, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.641325950622559, + "rewards/margins": 0.5692263841629028, + "rewards/rejected": -5.210552215576172, + "sft_loss": 4.369772911071777, + "step": 2765 + }, + { + "epoch": 1.482522160896471, + "grad_norm": 0.414039866021628, + "learning_rate": 5.967240054689541e-07, + "logits/chosen": -0.8741037249565125, + "logits/rejected": -0.8427600860595703, + "logps/chosen": -4.335209846496582, + "logps/rejected": -4.899285316467285, + "loss": 0.0525, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.335209846496582, + "rewards/margins": 0.5640758872032166, + "rewards/rejected": -4.899285316467285, + "sft_loss": 4.151376247406006, + "step": 2770 + }, + { + "epoch": 1.4851981936778724, + "grad_norm": 0.32893641730688494, + "learning_rate": 5.951955545823342e-07, + "logits/chosen": -0.8423898816108704, + "logits/rejected": -0.7706656455993652, + "logps/chosen": -4.534244537353516, + "logps/rejected": -5.130573272705078, + "loss": 0.0535, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.534244537353516, + "rewards/margins": 0.5963292717933655, + "rewards/rejected": -5.130573272705078, + "sft_loss": 4.373254776000977, + "step": 2775 + }, + { + "epoch": 1.4878742264592741, + "grad_norm": 0.2899564595789967, + "learning_rate": 5.936661801080263e-07, + "logits/chosen": -0.8321715593338013, + "logits/rejected": -0.7527654767036438, + "logps/chosen": -4.561938285827637, + "logps/rejected": -5.114630222320557, + "loss": 0.0524, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.561938285827637, + "rewards/margins": 0.5526921153068542, + "rewards/rejected": -5.114630222320557, + "sft_loss": 4.300882339477539, + "step": 2780 + }, + { + "epoch": 1.4905502592406756, + "grad_norm": 0.39218345035640906, + "learning_rate": 5.92135896884028e-07, + "logits/chosen": -0.8997092247009277, + "logits/rejected": -0.7471407055854797, + "logps/chosen": -4.617938041687012, + "logps/rejected": -5.369228839874268, + "loss": 0.0519, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.617938041687012, + "rewards/margins": 0.7512913346290588, + "rewards/rejected": -5.369228839874268, + "sft_loss": 4.363840579986572, + "step": 2785 + }, + { + "epoch": 1.4932262920220774, + "grad_norm": 0.5000524915500977, + "learning_rate": 5.906047197571541e-07, + "logits/chosen": -0.7601592540740967, + "logits/rejected": -0.8226611018180847, + "logps/chosen": -4.56928014755249, + "logps/rejected": -4.973417282104492, + "loss": 0.0548, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.56928014755249, + "rewards/margins": 0.4041372835636139, + "rewards/rejected": -4.973417282104492, + "sft_loss": 4.3844685554504395, + "step": 2790 + }, + { + "epoch": 1.4959023248034788, + "grad_norm": 0.4501939041101977, + "learning_rate": 5.890726635828919e-07, + "logits/chosen": -0.6789978742599487, + "logits/rejected": -0.7200356721878052, + "logps/chosen": -4.450923919677734, + "logps/rejected": -4.913413047790527, + "loss": 0.0536, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.450923919677734, + "rewards/margins": 0.46248936653137207, + "rewards/rejected": -4.913413047790527, + "sft_loss": 4.257745265960693, + "step": 2795 + }, + { + "epoch": 1.4985783575848803, + "grad_norm": 0.4064590009353923, + "learning_rate": 5.875397432252569e-07, + "logits/chosen": -0.8166916966438293, + "logits/rejected": -0.7958462834358215, + "logps/chosen": -4.582040786743164, + "logps/rejected": -5.0602827072143555, + "loss": 0.0522, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.582040786743164, + "rewards/margins": 0.47824162244796753, + "rewards/rejected": -5.0602827072143555, + "sft_loss": 4.298692226409912, + "step": 2800 + }, + { + "epoch": 1.4985783575848803, + "eval_logits/chosen": -0.38490617275238037, + "eval_logits/rejected": -0.29844847321510315, + "eval_logps/chosen": -4.544572830200195, + "eval_logps/rejected": -5.137356758117676, + "eval_loss": 0.051060404628515244, + "eval_rewards/accuracies": 0.6802670359611511, + "eval_rewards/chosen": -4.544572830200195, + "eval_rewards/margins": 0.5927836298942566, + "eval_rewards/rejected": -5.137356758117676, + "eval_runtime": 43.3873, + "eval_samples_per_second": 31.0, + "eval_sft_loss": 4.224380970001221, + "eval_steps_per_second": 7.767, + "step": 2800 + }, + { + "epoch": 1.5012543903662818, + "grad_norm": 0.5857737743279292, + "learning_rate": 5.860059735566491e-07, + "logits/chosen": -0.986019492149353, + "logits/rejected": -0.7938674688339233, + "logps/chosen": -4.5381388664245605, + "logps/rejected": -5.097907066345215, + "loss": 0.0522, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.5381388664245605, + "rewards/margins": 0.5597677230834961, + "rewards/rejected": -5.097907066345215, + "sft_loss": 4.319216728210449, + "step": 2805 + }, + { + "epoch": 1.5039304231476835, + "grad_norm": 0.5663653190019956, + "learning_rate": 5.844713694577087e-07, + "logits/chosen": -0.7099121809005737, + "logits/rejected": -0.6400435566902161, + "logps/chosen": -4.539187431335449, + "logps/rejected": -5.13986873626709, + "loss": 0.0518, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.539187431335449, + "rewards/margins": 0.6006811857223511, + "rewards/rejected": -5.13986873626709, + "sft_loss": 4.334919452667236, + "step": 2810 + }, + { + "epoch": 1.5066064559290853, + "grad_norm": 0.8834082325413514, + "learning_rate": 5.829359458171714e-07, + "logits/chosen": -0.6759532690048218, + "logits/rejected": -0.5866900682449341, + "logps/chosen": -4.316249847412109, + "logps/rejected": -5.01406192779541, + "loss": 0.0513, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.316249847412109, + "rewards/margins": 0.6978114247322083, + "rewards/rejected": -5.01406192779541, + "sft_loss": 4.148660659790039, + "step": 2815 + }, + { + "epoch": 1.5092824887104868, + "grad_norm": 0.7366505641829346, + "learning_rate": 5.81399717531724e-07, + "logits/chosen": -0.7597302198410034, + "logits/rejected": -0.5598629713058472, + "logps/chosen": -4.290135860443115, + "logps/rejected": -4.8505539894104, + "loss": 0.0532, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.290135860443115, + "rewards/margins": 0.5604175329208374, + "rewards/rejected": -4.8505539894104, + "sft_loss": 4.115383148193359, + "step": 2820 + }, + { + "epoch": 1.5119585214918883, + "grad_norm": 0.4483292065178676, + "learning_rate": 5.798626995058602e-07, + "logits/chosen": -0.9454687833786011, + "logits/rejected": -0.7220587730407715, + "logps/chosen": -4.53515100479126, + "logps/rejected": -5.261549949645996, + "loss": 0.0525, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.53515100479126, + "rewards/margins": 0.7263993620872498, + "rewards/rejected": -5.261549949645996, + "sft_loss": 4.332110404968262, + "step": 2825 + }, + { + "epoch": 1.51463455427329, + "grad_norm": 0.5592988396281663, + "learning_rate": 5.783249066517354e-07, + "logits/chosen": -0.8145958781242371, + "logits/rejected": -0.7614253163337708, + "logps/chosen": -4.5657124519348145, + "logps/rejected": -5.182963848114014, + "loss": 0.0503, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.5657124519348145, + "rewards/margins": 0.6172509789466858, + "rewards/rejected": -5.182963848114014, + "sft_loss": 4.231114387512207, + "step": 2830 + }, + { + "epoch": 1.5173105870546915, + "grad_norm": 0.632303409364965, + "learning_rate": 5.767863538890228e-07, + "logits/chosen": -0.8183963894844055, + "logits/rejected": -0.6835195422172546, + "logps/chosen": -4.389924049377441, + "logps/rejected": -5.1250081062316895, + "loss": 0.0517, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.389924049377441, + "rewards/margins": 0.7350836992263794, + "rewards/rejected": -5.1250081062316895, + "sft_loss": 4.124279499053955, + "step": 2835 + }, + { + "epoch": 1.519986619836093, + "grad_norm": 0.541651076628437, + "learning_rate": 5.75247056144768e-07, + "logits/chosen": -0.8295953869819641, + "logits/rejected": -0.8045207858085632, + "logps/chosen": -4.505274772644043, + "logps/rejected": -4.960772514343262, + "loss": 0.0536, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.505274772644043, + "rewards/margins": 0.4554976522922516, + "rewards/rejected": -4.960772514343262, + "sft_loss": 4.231078147888184, + "step": 2840 + }, + { + "epoch": 1.5226626526174947, + "grad_norm": 0.40328763918897, + "learning_rate": 5.737070283532444e-07, + "logits/chosen": -0.9527303576469421, + "logits/rejected": -0.83274906873703, + "logps/chosen": -4.756344795227051, + "logps/rejected": -5.247215270996094, + "loss": 0.0533, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.756344795227051, + "rewards/margins": 0.49087056517601013, + "rewards/rejected": -5.247215270996094, + "sft_loss": 4.509295463562012, + "step": 2845 + }, + { + "epoch": 1.5253386853988962, + "grad_norm": 0.5501878255143199, + "learning_rate": 5.721662854558084e-07, + "logits/chosen": -0.9534880518913269, + "logits/rejected": -0.9018028378486633, + "logps/chosen": -4.588342666625977, + "logps/rejected": -5.2513628005981445, + "loss": 0.0512, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.588342666625977, + "rewards/margins": 0.6630192995071411, + "rewards/rejected": -5.2513628005981445, + "sft_loss": 4.240243911743164, + "step": 2850 + }, + { + "epoch": 1.5280147181802977, + "grad_norm": 0.4812915607269805, + "learning_rate": 5.706248424007545e-07, + "logits/chosen": -0.9855610728263855, + "logits/rejected": -0.7704046964645386, + "logps/chosen": -4.3409504890441895, + "logps/rejected": -4.86018180847168, + "loss": 0.0522, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.3409504890441895, + "rewards/margins": 0.5192316174507141, + "rewards/rejected": -4.86018180847168, + "sft_loss": 4.120865821838379, + "step": 2855 + }, + { + "epoch": 1.5306907509616994, + "grad_norm": 0.45233228772754347, + "learning_rate": 5.690827141431699e-07, + "logits/chosen": -1.017059326171875, + "logits/rejected": -0.7848398089408875, + "logps/chosen": -4.593303680419922, + "logps/rejected": -5.0092973709106445, + "loss": 0.0524, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.593303680419922, + "rewards/margins": 0.4159931242465973, + "rewards/rejected": -5.0092973709106445, + "sft_loss": 4.261613368988037, + "step": 2860 + }, + { + "epoch": 1.5333667837431009, + "grad_norm": 0.6506887003976763, + "learning_rate": 5.675399156447897e-07, + "logits/chosen": -1.0267689228057861, + "logits/rejected": -0.8769267797470093, + "logps/chosen": -4.629855155944824, + "logps/rejected": -5.098114967346191, + "loss": 0.0534, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.629855155944824, + "rewards/margins": 0.4682607054710388, + "rewards/rejected": -5.098114967346191, + "sft_loss": 4.358528137207031, + "step": 2865 + }, + { + "epoch": 1.5360428165245024, + "grad_norm": 0.4798389011577337, + "learning_rate": 5.659964618738515e-07, + "logits/chosen": -0.9780286550521851, + "logits/rejected": -0.8610752820968628, + "logps/chosen": -4.556265830993652, + "logps/rejected": -5.136031150817871, + "loss": 0.0515, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.556265830993652, + "rewards/margins": 0.5797653794288635, + "rewards/rejected": -5.136031150817871, + "sft_loss": 4.165777206420898, + "step": 2870 + }, + { + "epoch": 1.538718849305904, + "grad_norm": 0.5422032747817679, + "learning_rate": 5.644523678049509e-07, + "logits/chosen": -0.9687239527702332, + "logits/rejected": -0.8813830614089966, + "logps/chosen": -4.5033183097839355, + "logps/rejected": -4.969407558441162, + "loss": 0.0513, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.5033183097839355, + "rewards/margins": 0.4660890996456146, + "rewards/rejected": -4.969407558441162, + "sft_loss": 4.0725483894348145, + "step": 2875 + }, + { + "epoch": 1.5413948820873056, + "grad_norm": 0.5901973530137451, + "learning_rate": 5.629076484188952e-07, + "logits/chosen": -0.7941701412200928, + "logits/rejected": -0.7139784693717957, + "logps/chosen": -4.662365913391113, + "logps/rejected": -5.221032619476318, + "loss": 0.0526, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.662365913391113, + "rewards/margins": 0.5586673617362976, + "rewards/rejected": -5.221032619476318, + "sft_loss": 4.298089027404785, + "step": 2880 + }, + { + "epoch": 1.544070914868707, + "grad_norm": 0.364967962468647, + "learning_rate": 5.613623187025587e-07, + "logits/chosen": -0.8740784525871277, + "logits/rejected": -0.7707508206367493, + "logps/chosen": -4.610237121582031, + "logps/rejected": -5.228066444396973, + "loss": 0.0518, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.610237121582031, + "rewards/margins": 0.617829442024231, + "rewards/rejected": -5.228066444396973, + "sft_loss": 4.25256872177124, + "step": 2885 + }, + { + "epoch": 1.5467469476501088, + "grad_norm": 0.479594255068986, + "learning_rate": 5.598163936487369e-07, + "logits/chosen": -0.8517980575561523, + "logits/rejected": -0.6656395196914673, + "logps/chosen": -4.480307102203369, + "logps/rejected": -5.1197099685668945, + "loss": 0.0518, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.480307102203369, + "rewards/margins": 0.6394029259681702, + "rewards/rejected": -5.1197099685668945, + "sft_loss": 4.178659439086914, + "step": 2890 + }, + { + "epoch": 1.5494229804315103, + "grad_norm": 0.38971936523868217, + "learning_rate": 5.582698882560017e-07, + "logits/chosen": -0.8685995936393738, + "logits/rejected": -0.692309558391571, + "logps/chosen": -4.6656389236450195, + "logps/rejected": -5.286332130432129, + "loss": 0.0529, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.6656389236450195, + "rewards/margins": 0.6206930875778198, + "rewards/rejected": -5.286332130432129, + "sft_loss": 4.3696160316467285, + "step": 2895 + }, + { + "epoch": 1.5520990132129118, + "grad_norm": 0.44187326201186605, + "learning_rate": 5.567228175285549e-07, + "logits/chosen": -0.706171452999115, + "logits/rejected": -0.6334139108657837, + "logps/chosen": -4.3591508865356445, + "logps/rejected": -4.947355270385742, + "loss": 0.0502, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.3591508865356445, + "rewards/margins": 0.5882046222686768, + "rewards/rejected": -4.947355270385742, + "sft_loss": 3.9303412437438965, + "step": 2900 + }, + { + "epoch": 1.5547750459943135, + "grad_norm": 0.525767187888611, + "learning_rate": 5.551751964760838e-07, + "logits/chosen": -0.6887973546981812, + "logits/rejected": -0.7219077348709106, + "logps/chosen": -4.609784126281738, + "logps/rejected": -5.058163642883301, + "loss": 0.0531, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.609784126281738, + "rewards/margins": 0.4483796954154968, + "rewards/rejected": -5.058163642883301, + "sft_loss": 4.3274431228637695, + "step": 2905 + }, + { + "epoch": 1.557451078775715, + "grad_norm": 0.354163445464275, + "learning_rate": 5.536270401136145e-07, + "logits/chosen": -0.8383262753486633, + "logits/rejected": -0.7376397252082825, + "logps/chosen": -4.493884086608887, + "logps/rejected": -5.036890029907227, + "loss": 0.0514, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.493884086608887, + "rewards/margins": 0.5430058836936951, + "rewards/rejected": -5.036890029907227, + "sft_loss": 4.180073261260986, + "step": 2910 + }, + { + "epoch": 1.5601271115571165, + "grad_norm": 0.4674214316597324, + "learning_rate": 5.520783634613667e-07, + "logits/chosen": -0.7461773753166199, + "logits/rejected": -0.522574245929718, + "logps/chosen": -4.679784774780273, + "logps/rejected": -5.226053714752197, + "loss": 0.0524, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.679784774780273, + "rewards/margins": 0.5462688207626343, + "rewards/rejected": -5.226053714752197, + "sft_loss": 4.342376708984375, + "step": 2915 + }, + { + "epoch": 1.5628031443385182, + "grad_norm": 1.2543196580616705, + "learning_rate": 5.505291815446082e-07, + "logits/chosen": -0.749670147895813, + "logits/rejected": -0.6325221657752991, + "logps/chosen": -4.405003547668457, + "logps/rejected": -5.0354204177856445, + "loss": 0.0529, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.405003547668457, + "rewards/margins": 0.6304169297218323, + "rewards/rejected": -5.0354204177856445, + "sft_loss": 4.188388347625732, + "step": 2920 + }, + { + "epoch": 1.5654791771199197, + "grad_norm": 0.5222585233235869, + "learning_rate": 5.489795093935089e-07, + "logits/chosen": -0.7664873600006104, + "logits/rejected": -0.679275631904602, + "logps/chosen": -4.5715131759643555, + "logps/rejected": -5.1358561515808105, + "loss": 0.0519, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.5715131759643555, + "rewards/margins": 0.5643435716629028, + "rewards/rejected": -5.1358561515808105, + "sft_loss": 4.250007629394531, + "step": 2925 + }, + { + "epoch": 1.5681552099013212, + "grad_norm": 0.3737733749118266, + "learning_rate": 5.474293620429946e-07, + "logits/chosen": -0.9465273022651672, + "logits/rejected": -0.7799872159957886, + "logps/chosen": -4.570036888122559, + "logps/rejected": -5.4389424324035645, + "loss": 0.0506, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.570036888122559, + "rewards/margins": 0.8689058423042297, + "rewards/rejected": -5.4389424324035645, + "sft_loss": 4.252045631408691, + "step": 2930 + }, + { + "epoch": 1.570831242682723, + "grad_norm": 0.5110524346117336, + "learning_rate": 5.458787545326018e-07, + "logits/chosen": -0.8712457418441772, + "logits/rejected": -0.7498850226402283, + "logps/chosen": -4.555843830108643, + "logps/rejected": -5.041597366333008, + "loss": 0.0534, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.555843830108643, + "rewards/margins": 0.4857536256313324, + "rewards/rejected": -5.041597366333008, + "sft_loss": 4.287815570831299, + "step": 2935 + }, + { + "epoch": 1.5735072754641244, + "grad_norm": 0.4193933937806095, + "learning_rate": 5.443277019063311e-07, + "logits/chosen": -0.8290327787399292, + "logits/rejected": -0.6746954917907715, + "logps/chosen": -4.528292655944824, + "logps/rejected": -5.201540470123291, + "loss": 0.0523, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.528292655944824, + "rewards/margins": 0.6732484698295593, + "rewards/rejected": -5.201540470123291, + "sft_loss": 4.318000793457031, + "step": 2940 + }, + { + "epoch": 1.5761833082455259, + "grad_norm": 0.7984256655785995, + "learning_rate": 5.427762192125023e-07, + "logits/chosen": -0.7302804589271545, + "logits/rejected": -0.6393738985061646, + "logps/chosen": -4.297086238861084, + "logps/rejected": -4.874281406402588, + "loss": 0.0518, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.297086238861084, + "rewards/margins": 0.5771942734718323, + "rewards/rejected": -4.874281406402588, + "sft_loss": 4.018523693084717, + "step": 2945 + }, + { + "epoch": 1.5788593410269276, + "grad_norm": 0.5139984575136903, + "learning_rate": 5.41224321503607e-07, + "logits/chosen": -0.8039859533309937, + "logits/rejected": -0.5436501502990723, + "logps/chosen": -4.477331638336182, + "logps/rejected": -5.2113447189331055, + "loss": 0.0512, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.477331638336182, + "rewards/margins": 0.7340143322944641, + "rewards/rejected": -5.2113447189331055, + "sft_loss": 4.270988464355469, + "step": 2950 + }, + { + "epoch": 1.5815353738083293, + "grad_norm": 0.5318063905528159, + "learning_rate": 5.396720238361637e-07, + "logits/chosen": -0.7131645679473877, + "logits/rejected": -0.5724089741706848, + "logps/chosen": -4.619770526885986, + "logps/rejected": -5.129917621612549, + "loss": 0.0521, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.619770526885986, + "rewards/margins": 0.510147213935852, + "rewards/rejected": -5.129917621612549, + "sft_loss": 4.325153350830078, + "step": 2955 + }, + { + "epoch": 1.5842114065897306, + "grad_norm": 0.6270796231497009, + "learning_rate": 5.381193412705711e-07, + "logits/chosen": -0.8254559636116028, + "logits/rejected": -0.6606593728065491, + "logps/chosen": -4.438187122344971, + "logps/rejected": -5.023268222808838, + "loss": 0.0521, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.438187122344971, + "rewards/margins": 0.5850812196731567, + "rewards/rejected": -5.023268222808838, + "sft_loss": 4.149109840393066, + "step": 2960 + }, + { + "epoch": 1.5868874393711323, + "grad_norm": 0.3628875605274503, + "learning_rate": 5.365662888709622e-07, + "logits/chosen": -0.8125503659248352, + "logits/rejected": -0.650180459022522, + "logps/chosen": -4.428151607513428, + "logps/rejected": -5.005454063415527, + "loss": 0.0524, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.428151607513428, + "rewards/margins": 0.5773029923439026, + "rewards/rejected": -5.005454063415527, + "sft_loss": 4.177712917327881, + "step": 2965 + }, + { + "epoch": 1.589563472152534, + "grad_norm": 0.4408978324808102, + "learning_rate": 5.350128817050585e-07, + "logits/chosen": -0.8711091876029968, + "logits/rejected": -0.6683140993118286, + "logps/chosen": -4.5739264488220215, + "logps/rejected": -5.2639336585998535, + "loss": 0.0511, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.5739264488220215, + "rewards/margins": 0.690007209777832, + "rewards/rejected": -5.2639336585998535, + "sft_loss": 4.209588050842285, + "step": 2970 + }, + { + "epoch": 1.5922395049339353, + "grad_norm": 0.634029397334541, + "learning_rate": 5.334591348440229e-07, + "logits/chosen": -0.8157389760017395, + "logits/rejected": -0.6465741395950317, + "logps/chosen": -4.551390171051025, + "logps/rejected": -5.187037944793701, + "loss": 0.0526, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.551390171051025, + "rewards/margins": 0.6356481313705444, + "rewards/rejected": -5.187037944793701, + "sft_loss": 4.29290771484375, + "step": 2975 + }, + { + "epoch": 1.594915537715337, + "grad_norm": 0.4523710769588691, + "learning_rate": 5.319050633623141e-07, + "logits/chosen": -0.8819277882575989, + "logits/rejected": -0.7007160782814026, + "logps/chosen": -4.540030002593994, + "logps/rejected": -5.0742316246032715, + "loss": 0.0525, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.540030002593994, + "rewards/margins": 0.5342013239860535, + "rewards/rejected": -5.0742316246032715, + "sft_loss": 4.217968463897705, + "step": 2980 + }, + { + "epoch": 1.5975915704967387, + "grad_norm": 0.5121523451628749, + "learning_rate": 5.303506823375409e-07, + "logits/chosen": -0.8410658836364746, + "logits/rejected": -0.6168532371520996, + "logps/chosen": -4.4125237464904785, + "logps/rejected": -5.1476545333862305, + "loss": 0.0514, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.4125237464904785, + "rewards/margins": 0.7351310849189758, + "rewards/rejected": -5.1476545333862305, + "sft_loss": 4.170558929443359, + "step": 2985 + }, + { + "epoch": 1.60026760327814, + "grad_norm": 0.4242550067163679, + "learning_rate": 5.287960068503143e-07, + "logits/chosen": -0.8414437174797058, + "logits/rejected": -0.6301363110542297, + "logps/chosen": -4.453303337097168, + "logps/rejected": -5.125790596008301, + "loss": 0.0521, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.453303337097168, + "rewards/margins": 0.6724871397018433, + "rewards/rejected": -5.125790596008301, + "sft_loss": 4.234655857086182, + "step": 2990 + }, + { + "epoch": 1.6029436360595417, + "grad_norm": 0.6150856743479923, + "learning_rate": 5.272410519841032e-07, + "logits/chosen": -0.7048937678337097, + "logits/rejected": -0.6164297461509705, + "logps/chosen": -4.490935325622559, + "logps/rejected": -5.310682773590088, + "loss": 0.0503, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.490935325622559, + "rewards/margins": 0.819747805595398, + "rewards/rejected": -5.310682773590088, + "sft_loss": 4.104347229003906, + "step": 2995 + }, + { + "epoch": 1.6056196688409434, + "grad_norm": 0.44023608513343926, + "learning_rate": 5.256858328250861e-07, + "logits/chosen": -0.8334380388259888, + "logits/rejected": -0.6159783601760864, + "logps/chosen": -4.609116554260254, + "logps/rejected": -5.1678619384765625, + "loss": 0.052, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.609116554260254, + "rewards/margins": 0.5587445497512817, + "rewards/rejected": -5.1678619384765625, + "sft_loss": 4.265917778015137, + "step": 3000 + }, + { + "epoch": 1.608295701622345, + "grad_norm": 0.6277862194810916, + "learning_rate": 5.241303644620063e-07, + "logits/chosen": -0.9017225503921509, + "logits/rejected": -0.6855077147483826, + "logps/chosen": -4.580887794494629, + "logps/rejected": -5.039603233337402, + "loss": 0.0538, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.580887794494629, + "rewards/margins": 0.4587152898311615, + "rewards/rejected": -5.039603233337402, + "sft_loss": 4.258471488952637, + "step": 3005 + }, + { + "epoch": 1.6109717344037464, + "grad_norm": 0.3973293347863802, + "learning_rate": 5.225746619860248e-07, + "logits/chosen": -0.9179320335388184, + "logits/rejected": -0.7894536852836609, + "logps/chosen": -4.528168678283691, + "logps/rejected": -5.234195709228516, + "loss": 0.0532, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.528168678283691, + "rewards/margins": 0.7060272097587585, + "rewards/rejected": -5.234195709228516, + "sft_loss": 4.231138229370117, + "step": 3010 + }, + { + "epoch": 1.6136477671851481, + "grad_norm": 0.4052136199043096, + "learning_rate": 5.210187404905735e-07, + "logits/chosen": -0.7160056829452515, + "logits/rejected": -0.6443689465522766, + "logps/chosen": -4.8330841064453125, + "logps/rejected": -5.205080986022949, + "loss": 0.0539, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.8330841064453125, + "rewards/margins": 0.3719966411590576, + "rewards/rejected": -5.205080986022949, + "sft_loss": 4.528097152709961, + "step": 3015 + }, + { + "epoch": 1.6163237999665496, + "grad_norm": 0.327549476063512, + "learning_rate": 5.194626150712098e-07, + "logits/chosen": -0.9027112722396851, + "logits/rejected": -0.7617613077163696, + "logps/chosen": -4.600415229797363, + "logps/rejected": -5.1396894454956055, + "loss": 0.0531, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.600415229797363, + "rewards/margins": 0.539274275302887, + "rewards/rejected": -5.1396894454956055, + "sft_loss": 4.4282941818237305, + "step": 3020 + }, + { + "epoch": 1.6189998327479511, + "grad_norm": 0.3866774364589384, + "learning_rate": 5.179063008254695e-07, + "logits/chosen": -0.8924050331115723, + "logits/rejected": -0.692057728767395, + "logps/chosen": -4.390013694763184, + "logps/rejected": -4.872861385345459, + "loss": 0.0522, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.390013694763184, + "rewards/margins": 0.48284751176834106, + "rewards/rejected": -4.872861385345459, + "sft_loss": 4.109368324279785, + "step": 3025 + }, + { + "epoch": 1.6216758655293528, + "grad_norm": 0.5603972389461411, + "learning_rate": 5.163498128527199e-07, + "logits/chosen": -0.8043405413627625, + "logits/rejected": -0.6585147976875305, + "logps/chosen": -4.4106950759887695, + "logps/rejected": -4.972418308258057, + "loss": 0.0524, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.4106950759887695, + "rewards/margins": 0.5617232322692871, + "rewards/rejected": -4.972418308258057, + "sft_loss": 4.162978172302246, + "step": 3030 + }, + { + "epoch": 1.6243518983107543, + "grad_norm": 0.5226850952217248, + "learning_rate": 5.147931662540144e-07, + "logits/chosen": -0.7547642588615417, + "logits/rejected": -0.6366636753082275, + "logps/chosen": -4.469838619232178, + "logps/rejected": -4.992844581604004, + "loss": 0.0511, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.469838619232178, + "rewards/margins": 0.523006021976471, + "rewards/rejected": -4.992844581604004, + "sft_loss": 4.131374359130859, + "step": 3035 + }, + { + "epoch": 1.6270279310921558, + "grad_norm": 0.5621502773295101, + "learning_rate": 5.132363761319449e-07, + "logits/chosen": -0.8233652114868164, + "logits/rejected": -0.7582941651344299, + "logps/chosen": -4.578380584716797, + "logps/rejected": -5.224091053009033, + "loss": 0.0516, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.578380584716797, + "rewards/margins": 0.6457099914550781, + "rewards/rejected": -5.224091053009033, + "sft_loss": 4.231387615203857, + "step": 3040 + }, + { + "epoch": 1.6297039638735575, + "grad_norm": 0.5384394600398351, + "learning_rate": 5.116794575904962e-07, + "logits/chosen": -0.6828616261482239, + "logits/rejected": -0.6190515756607056, + "logps/chosen": -4.537975788116455, + "logps/rejected": -5.16754150390625, + "loss": 0.0512, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.537975788116455, + "rewards/margins": 0.6295658946037292, + "rewards/rejected": -5.16754150390625, + "sft_loss": 4.14799690246582, + "step": 3045 + }, + { + "epoch": 1.632379996654959, + "grad_norm": 0.3570274089495265, + "learning_rate": 5.101224257348987e-07, + "logits/chosen": -0.7118021845817566, + "logits/rejected": -0.6111310124397278, + "logps/chosen": -4.623868942260742, + "logps/rejected": -5.227258682250977, + "loss": 0.051, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.623868942260742, + "rewards/margins": 0.6033896207809448, + "rewards/rejected": -5.227258682250977, + "sft_loss": 4.14676570892334, + "step": 3050 + }, + { + "epoch": 1.6350560294363605, + "grad_norm": 0.41812252336808753, + "learning_rate": 5.085652956714823e-07, + "logits/chosen": -0.8265964388847351, + "logits/rejected": -0.6396560072898865, + "logps/chosen": -4.667733669281006, + "logps/rejected": -5.240296840667725, + "loss": 0.0534, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.667733669281006, + "rewards/margins": 0.572563111782074, + "rewards/rejected": -5.240296840667725, + "sft_loss": 4.416812419891357, + "step": 3055 + }, + { + "epoch": 1.6377320622177622, + "grad_norm": 0.4141573304369751, + "learning_rate": 5.070080825075298e-07, + "logits/chosen": -0.6839703321456909, + "logits/rejected": -0.4868551194667816, + "logps/chosen": -4.377912998199463, + "logps/rejected": -5.105005741119385, + "loss": 0.0525, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.377912998199463, + "rewards/margins": 0.7270928025245667, + "rewards/rejected": -5.105005741119385, + "sft_loss": 4.107635974884033, + "step": 3060 + }, + { + "epoch": 1.6404080949991637, + "grad_norm": 0.5889041881610368, + "learning_rate": 5.0545080135113e-07, + "logits/chosen": -0.6704410910606384, + "logits/rejected": -0.5773349404335022, + "logps/chosen": -4.510235786437988, + "logps/rejected": -5.127143383026123, + "loss": 0.0534, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.510235786437988, + "rewards/margins": 0.6169074177742004, + "rewards/rejected": -5.127143383026123, + "sft_loss": 4.2573747634887695, + "step": 3065 + }, + { + "epoch": 1.6430841277805652, + "grad_norm": 0.5491081889665882, + "learning_rate": 5.038934673110316e-07, + "logits/chosen": -0.7946319580078125, + "logits/rejected": -0.6988022327423096, + "logps/chosen": -4.589323043823242, + "logps/rejected": -5.201617240905762, + "loss": 0.0526, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.589323043823242, + "rewards/margins": 0.6122941374778748, + "rewards/rejected": -5.201617240905762, + "sft_loss": 4.306293487548828, + "step": 3070 + }, + { + "epoch": 1.645760160561967, + "grad_norm": 0.40370148653990523, + "learning_rate": 5.023360954964963e-07, + "logits/chosen": -0.8991419076919556, + "logits/rejected": -0.8406192660331726, + "logps/chosen": -4.753389358520508, + "logps/rejected": -5.2947564125061035, + "loss": 0.0527, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.753389358520508, + "rewards/margins": 0.5413663983345032, + "rewards/rejected": -5.2947564125061035, + "sft_loss": 4.370591640472412, + "step": 3075 + }, + { + "epoch": 1.6484361933433684, + "grad_norm": 0.4667987266950202, + "learning_rate": 5.007787010171524e-07, + "logits/chosen": -0.926565945148468, + "logits/rejected": -0.7107774615287781, + "logps/chosen": -4.401252746582031, + "logps/rejected": -5.139235019683838, + "loss": 0.0512, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.401252746582031, + "rewards/margins": 0.7379823923110962, + "rewards/rejected": -5.139235019683838, + "sft_loss": 4.14643669128418, + "step": 3080 + }, + { + "epoch": 1.65111222612477, + "grad_norm": 0.28977187648312863, + "learning_rate": 4.992212989828477e-07, + "logits/chosen": -0.7511423826217651, + "logits/rejected": -0.7302947044372559, + "logps/chosen": -4.628520965576172, + "logps/rejected": -5.08123779296875, + "loss": 0.0533, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.628520965576172, + "rewards/margins": 0.45271721482276917, + "rewards/rejected": -5.08123779296875, + "sft_loss": 4.332284927368164, + "step": 3085 + }, + { + "epoch": 1.6537882589061716, + "grad_norm": 0.5128071746578113, + "learning_rate": 4.976639045035036e-07, + "logits/chosen": -0.7045882940292358, + "logits/rejected": -0.651595413684845, + "logps/chosen": -4.456996440887451, + "logps/rejected": -4.9166107177734375, + "loss": 0.0533, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.456996440887451, + "rewards/margins": 0.45961475372314453, + "rewards/rejected": -4.9166107177734375, + "sft_loss": 4.227812767028809, + "step": 3090 + }, + { + "epoch": 1.6564642916875731, + "grad_norm": 0.7439117424034836, + "learning_rate": 4.961065326889683e-07, + "logits/chosen": -0.7401078939437866, + "logits/rejected": -0.5558581948280334, + "logps/chosen": -4.522768974304199, + "logps/rejected": -5.026648044586182, + "loss": 0.0539, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.522768974304199, + "rewards/margins": 0.5038790702819824, + "rewards/rejected": -5.026648044586182, + "sft_loss": 4.227447032928467, + "step": 3095 + }, + { + "epoch": 1.6591403244689746, + "grad_norm": 0.44974777016917117, + "learning_rate": 4.9454919864887e-07, + "logits/chosen": -0.8705316781997681, + "logits/rejected": -0.7544268369674683, + "logps/chosen": -4.580806732177734, + "logps/rejected": -5.142989158630371, + "loss": 0.0526, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.580806732177734, + "rewards/margins": 0.5621822476387024, + "rewards/rejected": -5.142989158630371, + "sft_loss": 4.323142051696777, + "step": 3100 + }, + { + "epoch": 1.6618163572503764, + "grad_norm": 0.60351015808472, + "learning_rate": 4.929919174924701e-07, + "logits/chosen": -0.9598020315170288, + "logits/rejected": -0.7137739062309265, + "logps/chosen": -4.588383674621582, + "logps/rejected": -5.193715572357178, + "loss": 0.0527, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.588383674621582, + "rewards/margins": 0.6053324937820435, + "rewards/rejected": -5.193715572357178, + "sft_loss": 4.364377498626709, + "step": 3105 + }, + { + "epoch": 1.6644923900317778, + "grad_norm": 0.39383948253171636, + "learning_rate": 4.914347043285177e-07, + "logits/chosen": -0.7553201913833618, + "logits/rejected": -0.5945597290992737, + "logps/chosen": -4.58227014541626, + "logps/rejected": -5.167197227478027, + "loss": 0.0515, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.58227014541626, + "rewards/margins": 0.5849268436431885, + "rewards/rejected": -5.167197227478027, + "sft_loss": 4.150500297546387, + "step": 3110 + }, + { + "epoch": 1.6671684228131793, + "grad_norm": 0.5947338725544065, + "learning_rate": 4.898775742651013e-07, + "logits/chosen": -0.6695328950881958, + "logits/rejected": -0.5480534434318542, + "logps/chosen": -4.455540657043457, + "logps/rejected": -5.015857696533203, + "loss": 0.0514, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.455540657043457, + "rewards/margins": 0.5603172183036804, + "rewards/rejected": -5.015857696533203, + "sft_loss": 4.052641868591309, + "step": 3115 + }, + { + "epoch": 1.669844455594581, + "grad_norm": 0.3739608220920118, + "learning_rate": 4.883205424095037e-07, + "logits/chosen": -0.8755930066108704, + "logits/rejected": -0.6813184022903442, + "logps/chosen": -4.374234199523926, + "logps/rejected": -5.120595455169678, + "loss": 0.0516, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.374234199523926, + "rewards/margins": 0.7463610172271729, + "rewards/rejected": -5.120595455169678, + "sft_loss": 4.135296821594238, + "step": 3120 + }, + { + "epoch": 1.6725204883759828, + "grad_norm": 0.47999415721425, + "learning_rate": 4.86763623868055e-07, + "logits/chosen": -0.7871851921081543, + "logits/rejected": -0.6408876180648804, + "logps/chosen": -4.715963363647461, + "logps/rejected": -5.292166233062744, + "loss": 0.0522, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.715963363647461, + "rewards/margins": 0.5762028098106384, + "rewards/rejected": -5.292166233062744, + "sft_loss": 4.366208076477051, + "step": 3125 + }, + { + "epoch": 1.675196521157384, + "grad_norm": 0.36852219394257385, + "learning_rate": 4.852068337459856e-07, + "logits/chosen": -0.7195180654525757, + "logits/rejected": -0.532809853553772, + "logps/chosen": -4.462833404541016, + "logps/rejected": -5.156439304351807, + "loss": 0.0508, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.462833404541016, + "rewards/margins": 0.6936055421829224, + "rewards/rejected": -5.156439304351807, + "sft_loss": 4.119335174560547, + "step": 3130 + }, + { + "epoch": 1.6778725539387858, + "grad_norm": 0.40507193582200285, + "learning_rate": 4.8365018714728e-07, + "logits/chosen": -0.7378911375999451, + "logits/rejected": -0.6666657328605652, + "logps/chosen": -4.564920425415039, + "logps/rejected": -5.0844197273254395, + "loss": 0.0536, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.564920425415039, + "rewards/margins": 0.5194991827011108, + "rewards/rejected": -5.0844197273254395, + "sft_loss": 4.275947093963623, + "step": 3135 + }, + { + "epoch": 1.6805485867201875, + "grad_norm": 0.31724766919051245, + "learning_rate": 4.820936991745304e-07, + "logits/chosen": -1.0948458909988403, + "logits/rejected": -0.9697257876396179, + "logps/chosen": -4.6035237312316895, + "logps/rejected": -5.034295082092285, + "loss": 0.0525, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.6035237312316895, + "rewards/margins": 0.43077144026756287, + "rewards/rejected": -5.034295082092285, + "sft_loss": 4.244277000427246, + "step": 3140 + }, + { + "epoch": 1.6832246195015887, + "grad_norm": 0.5310039889396487, + "learning_rate": 4.8053738492879e-07, + "logits/chosen": -0.8384534120559692, + "logits/rejected": -0.7171124815940857, + "logps/chosen": -4.526313781738281, + "logps/rejected": -5.261399269104004, + "loss": 0.0515, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.526313781738281, + "rewards/margins": 0.7350856065750122, + "rewards/rejected": -5.261399269104004, + "sft_loss": 4.204371452331543, + "step": 3145 + }, + { + "epoch": 1.6859006522829905, + "grad_norm": 0.5335564765964186, + "learning_rate": 4.789812595094265e-07, + "logits/chosen": -0.9797199368476868, + "logits/rejected": -0.8858574032783508, + "logps/chosen": -4.503859519958496, + "logps/rejected": -5.087957382202148, + "loss": 0.052, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.503859519958496, + "rewards/margins": 0.5840980410575867, + "rewards/rejected": -5.087957382202148, + "sft_loss": 4.208001613616943, + "step": 3150 + }, + { + "epoch": 1.6885766850643922, + "grad_norm": 0.42218902416314297, + "learning_rate": 4.774253380139752e-07, + "logits/chosen": -0.9380515813827515, + "logits/rejected": -0.8422372937202454, + "logps/chosen": -4.567145347595215, + "logps/rejected": -5.0723161697387695, + "loss": 0.0527, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.567145347595215, + "rewards/margins": 0.5051710605621338, + "rewards/rejected": -5.0723161697387695, + "sft_loss": 4.258059501647949, + "step": 3155 + }, + { + "epoch": 1.6912527178457935, + "grad_norm": 0.48659534318798275, + "learning_rate": 4.758696355379936e-07, + "logits/chosen": -0.6386554837226868, + "logits/rejected": -0.7299365401268005, + "logps/chosen": -4.623234272003174, + "logps/rejected": -5.053004264831543, + "loss": 0.0529, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.623234272003174, + "rewards/margins": 0.429770290851593, + "rewards/rejected": -5.053004264831543, + "sft_loss": 4.323592662811279, + "step": 3160 + }, + { + "epoch": 1.6939287506271952, + "grad_norm": 0.4845276460062366, + "learning_rate": 4.743141671749138e-07, + "logits/chosen": -0.9454347491264343, + "logits/rejected": -0.794151782989502, + "logps/chosen": -4.372522354125977, + "logps/rejected": -4.8364152908325195, + "loss": 0.0535, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.372522354125977, + "rewards/margins": 0.46389341354370117, + "rewards/rejected": -4.8364152908325195, + "sft_loss": 4.175771236419678, + "step": 3165 + }, + { + "epoch": 1.6966047834085969, + "grad_norm": 0.3380040956576745, + "learning_rate": 4.727589480158968e-07, + "logits/chosen": -0.8804110288619995, + "logits/rejected": -0.7924290299415588, + "logps/chosen": -4.5793280601501465, + "logps/rejected": -5.261588096618652, + "loss": 0.0515, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.5793280601501465, + "rewards/margins": 0.6822600960731506, + "rewards/rejected": -5.261588096618652, + "sft_loss": 4.343472003936768, + "step": 3170 + }, + { + "epoch": 1.6992808161899984, + "grad_norm": 0.4022845360049597, + "learning_rate": 4.712039931496855e-07, + "logits/chosen": -0.8952493667602539, + "logits/rejected": -0.7143739461898804, + "logps/chosen": -4.63895845413208, + "logps/rejected": -5.060461521148682, + "loss": 0.0524, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.63895845413208, + "rewards/margins": 0.42150363326072693, + "rewards/rejected": -5.060461521148682, + "sft_loss": 4.36241340637207, + "step": 3175 + }, + { + "epoch": 1.7019568489713999, + "grad_norm": 0.35743884470081916, + "learning_rate": 4.6964931766245905e-07, + "logits/chosen": -0.7012723684310913, + "logits/rejected": -0.6638824343681335, + "logps/chosen": -4.4255876541137695, + "logps/rejected": -4.9864630699157715, + "loss": 0.0537, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.4255876541137695, + "rewards/margins": 0.5608752369880676, + "rewards/rejected": -4.9864630699157715, + "sft_loss": 4.170031547546387, + "step": 3180 + }, + { + "epoch": 1.7046328817528016, + "grad_norm": 0.388598888460869, + "learning_rate": 4.6809493663768575e-07, + "logits/chosen": -0.7917588353157043, + "logits/rejected": -0.7726391553878784, + "logps/chosen": -4.632717132568359, + "logps/rejected": -4.928259372711182, + "loss": 0.0523, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -4.632717132568359, + "rewards/margins": 0.29554182291030884, + "rewards/rejected": -4.928259372711182, + "sft_loss": 4.243005275726318, + "step": 3185 + }, + { + "epoch": 1.707308914534203, + "grad_norm": 0.43334284868642725, + "learning_rate": 4.6654086515597716e-07, + "logits/chosen": -0.9887200593948364, + "logits/rejected": -0.7736762762069702, + "logps/chosen": -4.42282772064209, + "logps/rejected": -5.094423294067383, + "loss": 0.0517, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.42282772064209, + "rewards/margins": 0.6715952754020691, + "rewards/rejected": -5.094423294067383, + "sft_loss": 4.217164516448975, + "step": 3190 + }, + { + "epoch": 1.7099849473156046, + "grad_norm": 0.551121726681534, + "learning_rate": 4.6498711829494154e-07, + "logits/chosen": -0.9799394607543945, + "logits/rejected": -0.8590563535690308, + "logps/chosen": -4.606871604919434, + "logps/rejected": -5.045818328857422, + "loss": 0.0537, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.606871604919434, + "rewards/margins": 0.43894606828689575, + "rewards/rejected": -5.045818328857422, + "sft_loss": 4.364339351654053, + "step": 3195 + }, + { + "epoch": 1.7126609800970063, + "grad_norm": 0.36960789827653123, + "learning_rate": 4.6343371112903777e-07, + "logits/chosen": -0.8044673204421997, + "logits/rejected": -0.6007072329521179, + "logps/chosen": -4.615143299102783, + "logps/rejected": -5.214327335357666, + "loss": 0.053, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.615143299102783, + "rewards/margins": 0.5991836786270142, + "rewards/rejected": -5.214327335357666, + "sft_loss": 4.31845760345459, + "step": 3200 + }, + { + "epoch": 1.7126609800970063, + "eval_logits/chosen": -0.29473957419395447, + "eval_logits/rejected": -0.2032015025615692, + "eval_logps/chosen": -4.496006011962891, + "eval_logps/rejected": -5.107307434082031, + "eval_loss": 0.050797030329704285, + "eval_rewards/accuracies": 0.6691394448280334, + "eval_rewards/chosen": -4.496006011962891, + "eval_rewards/margins": 0.6113012433052063, + "eval_rewards/rejected": -5.107307434082031, + "eval_runtime": 43.3125, + "eval_samples_per_second": 31.053, + "eval_sft_loss": 4.119299411773682, + "eval_steps_per_second": 7.781, + "step": 3200 + }, + { + "epoch": 1.7153370128784078, + "grad_norm": 0.44427092334138696, + "learning_rate": 4.618806587294291e-07, + "logits/chosen": -0.87445068359375, + "logits/rejected": -0.7708224058151245, + "logps/chosen": -4.504208564758301, + "logps/rejected": -5.129702568054199, + "loss": 0.0519, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.504208564758301, + "rewards/margins": 0.6254942417144775, + "rewards/rejected": -5.129702568054199, + "sft_loss": 4.297804832458496, + "step": 3205 + }, + { + "epoch": 1.7180130456598093, + "grad_norm": 0.3724267286377576, + "learning_rate": 4.603279761638365e-07, + "logits/chosen": -0.8704094886779785, + "logits/rejected": -0.723287045955658, + "logps/chosen": -4.470217227935791, + "logps/rejected": -5.0563645362854, + "loss": 0.0526, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.470217227935791, + "rewards/margins": 0.5861474871635437, + "rewards/rejected": -5.0563645362854, + "sft_loss": 4.2170891761779785, + "step": 3210 + }, + { + "epoch": 1.720689078441211, + "grad_norm": 0.41128865897944417, + "learning_rate": 4.5877567849639315e-07, + "logits/chosen": -0.8630671501159668, + "logits/rejected": -0.7631199955940247, + "logps/chosen": -4.550658226013184, + "logps/rejected": -5.094977378845215, + "loss": 0.0526, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.550658226013184, + "rewards/margins": 0.5443195700645447, + "rewards/rejected": -5.094977378845215, + "sft_loss": 4.317914009094238, + "step": 3215 + }, + { + "epoch": 1.7233651112226125, + "grad_norm": 0.6568465025902127, + "learning_rate": 4.572237807874979e-07, + "logits/chosen": -1.0142710208892822, + "logits/rejected": -0.6812065243721008, + "logps/chosen": -4.546741008758545, + "logps/rejected": -5.181439399719238, + "loss": 0.0516, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.546741008758545, + "rewards/margins": 0.6346983909606934, + "rewards/rejected": -5.181439399719238, + "sft_loss": 4.28690242767334, + "step": 3220 + }, + { + "epoch": 1.726041144004014, + "grad_norm": 0.43508632949046916, + "learning_rate": 4.5567229809366895e-07, + "logits/chosen": -0.8844995498657227, + "logits/rejected": -0.7484878301620483, + "logps/chosen": -4.440615653991699, + "logps/rejected": -4.99686336517334, + "loss": 0.0521, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.440615653991699, + "rewards/margins": 0.5562475919723511, + "rewards/rejected": -4.99686336517334, + "sft_loss": 4.153700351715088, + "step": 3225 + }, + { + "epoch": 1.7287171767854157, + "grad_norm": 0.4122279782736568, + "learning_rate": 4.541212454673984e-07, + "logits/chosen": -0.9238206148147583, + "logits/rejected": -0.7181123495101929, + "logps/chosen": -4.5585713386535645, + "logps/rejected": -5.414360523223877, + "loss": 0.0515, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.5585713386535645, + "rewards/margins": 0.8557893633842468, + "rewards/rejected": -5.414360523223877, + "sft_loss": 4.298130989074707, + "step": 3230 + }, + { + "epoch": 1.7313932095668172, + "grad_norm": 0.4026073166205477, + "learning_rate": 4.525706379570055e-07, + "logits/chosen": -0.9051357507705688, + "logits/rejected": -0.832781195640564, + "logps/chosen": -4.346682548522949, + "logps/rejected": -4.964540481567383, + "loss": 0.0521, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.346682548522949, + "rewards/margins": 0.6178587675094604, + "rewards/rejected": -4.964540481567383, + "sft_loss": 4.1737565994262695, + "step": 3235 + }, + { + "epoch": 1.7340692423482187, + "grad_norm": 0.47424442428266617, + "learning_rate": 4.510204906064911e-07, + "logits/chosen": -0.8570457696914673, + "logits/rejected": -0.7040773630142212, + "logps/chosen": -4.4648237228393555, + "logps/rejected": -5.1131768226623535, + "loss": 0.0513, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.4648237228393555, + "rewards/margins": 0.6483533978462219, + "rewards/rejected": -5.1131768226623535, + "sft_loss": 4.171679496765137, + "step": 3240 + }, + { + "epoch": 1.7367452751296204, + "grad_norm": 1.0429902624418217, + "learning_rate": 4.4947081845539177e-07, + "logits/chosen": -0.9616962671279907, + "logits/rejected": -0.8357839584350586, + "logps/chosen": -4.420161247253418, + "logps/rejected": -5.0600433349609375, + "loss": 0.052, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.420161247253418, + "rewards/margins": 0.6398815512657166, + "rewards/rejected": -5.0600433349609375, + "sft_loss": 4.109945297241211, + "step": 3245 + }, + { + "epoch": 1.739421307911022, + "grad_norm": 0.40018549193747066, + "learning_rate": 4.479216365386333e-07, + "logits/chosen": -0.772409200668335, + "logits/rejected": -0.5954066514968872, + "logps/chosen": -4.221619129180908, + "logps/rejected": -5.036656856536865, + "loss": 0.0503, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.221619129180908, + "rewards/margins": 0.8150378465652466, + "rewards/rejected": -5.036656856536865, + "sft_loss": 3.9400768280029297, + "step": 3250 + }, + { + "epoch": 1.7420973406924234, + "grad_norm": 0.37909844451042285, + "learning_rate": 4.4637295988638555e-07, + "logits/chosen": -0.8761689066886902, + "logits/rejected": -0.8001937866210938, + "logps/chosen": -4.484442710876465, + "logps/rejected": -5.098120212554932, + "loss": 0.0517, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.484442710876465, + "rewards/margins": 0.6136777400970459, + "rewards/rejected": -5.098120212554932, + "sft_loss": 4.218336582183838, + "step": 3255 + }, + { + "epoch": 1.744773373473825, + "grad_norm": 0.6816501770865704, + "learning_rate": 4.4482480352391623e-07, + "logits/chosen": -0.8988161087036133, + "logits/rejected": -0.7821734547615051, + "logps/chosen": -4.642391204833984, + "logps/rejected": -5.23095703125, + "loss": 0.0531, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.642391204833984, + "rewards/margins": 0.5885659456253052, + "rewards/rejected": -5.23095703125, + "sft_loss": 4.404733657836914, + "step": 3260 + }, + { + "epoch": 1.7474494062552266, + "grad_norm": 0.5455705596230104, + "learning_rate": 4.4327718247144507e-07, + "logits/chosen": -0.7707226872444153, + "logits/rejected": -0.6102782487869263, + "logps/chosen": -4.860251426696777, + "logps/rejected": -5.384507656097412, + "loss": 0.0536, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.860251426696777, + "rewards/margins": 0.5242565870285034, + "rewards/rejected": -5.384507656097412, + "sft_loss": 4.573927879333496, + "step": 3265 + }, + { + "epoch": 1.750125439036628, + "grad_norm": 0.5162517294345276, + "learning_rate": 4.417301117439984e-07, + "logits/chosen": -0.9343627095222473, + "logits/rejected": -0.7939955592155457, + "logps/chosen": -4.480029106140137, + "logps/rejected": -4.994220733642578, + "loss": 0.0529, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.480029106140137, + "rewards/margins": 0.5141913294792175, + "rewards/rejected": -4.994220733642578, + "sft_loss": 4.212777137756348, + "step": 3270 + }, + { + "epoch": 1.7528014718180298, + "grad_norm": 0.4073109145029346, + "learning_rate": 4.401836063512631e-07, + "logits/chosen": -1.001089334487915, + "logits/rejected": -0.6555830240249634, + "logps/chosen": -4.451521873474121, + "logps/rejected": -5.125973224639893, + "loss": 0.0512, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.451521873474121, + "rewards/margins": 0.6744511723518372, + "rewards/rejected": -5.125973224639893, + "sft_loss": 4.141744136810303, + "step": 3275 + }, + { + "epoch": 1.7554775045994313, + "grad_norm": 0.5772050447307934, + "learning_rate": 4.386376812974413e-07, + "logits/chosen": -0.9038535952568054, + "logits/rejected": -0.8890512585639954, + "logps/chosen": -4.574075698852539, + "logps/rejected": -5.134883880615234, + "loss": 0.0521, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.574075698852539, + "rewards/margins": 0.5608078837394714, + "rewards/rejected": -5.134883880615234, + "sft_loss": 4.309540748596191, + "step": 3280 + }, + { + "epoch": 1.7581535373808328, + "grad_norm": 0.40686850263594554, + "learning_rate": 4.370923515811048e-07, + "logits/chosen": -0.9050912857055664, + "logits/rejected": -0.6649254560470581, + "logps/chosen": -4.3815155029296875, + "logps/rejected": -5.013126373291016, + "loss": 0.0513, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.3815155029296875, + "rewards/margins": 0.6316103935241699, + "rewards/rejected": -5.013126373291016, + "sft_loss": 4.075606346130371, + "step": 3285 + }, + { + "epoch": 1.7608295701622345, + "grad_norm": 0.39012420613123405, + "learning_rate": 4.35547632195049e-07, + "logits/chosen": -0.8044376373291016, + "logits/rejected": -0.7201283574104309, + "logps/chosen": -4.696098327636719, + "logps/rejected": -5.314190864562988, + "loss": 0.0524, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.696098327636719, + "rewards/margins": 0.6180926561355591, + "rewards/rejected": -5.314190864562988, + "sft_loss": 4.33831787109375, + "step": 3290 + }, + { + "epoch": 1.763505602943636, + "grad_norm": 0.7351191098993615, + "learning_rate": 4.340035381261484e-07, + "logits/chosen": -0.7771255373954773, + "logits/rejected": -0.7277175784111023, + "logps/chosen": -4.453921318054199, + "logps/rejected": -5.145596504211426, + "loss": 0.0511, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.453921318054199, + "rewards/margins": 0.6916751861572266, + "rewards/rejected": -5.145596504211426, + "sft_loss": 4.129612922668457, + "step": 3295 + }, + { + "epoch": 1.7661816357250375, + "grad_norm": 0.35475089495116524, + "learning_rate": 4.324600843552104e-07, + "logits/chosen": -0.9506170153617859, + "logits/rejected": -0.8384958505630493, + "logps/chosen": -4.479653358459473, + "logps/rejected": -5.169857978820801, + "loss": 0.0513, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.479653358459473, + "rewards/margins": 0.6902052760124207, + "rewards/rejected": -5.169857978820801, + "sft_loss": 4.17246150970459, + "step": 3300 + }, + { + "epoch": 1.7688576685064392, + "grad_norm": 0.3659908245134801, + "learning_rate": 4.309172858568302e-07, + "logits/chosen": -0.937427818775177, + "logits/rejected": -0.7483657598495483, + "logps/chosen": -4.533991813659668, + "logps/rejected": -5.1667327880859375, + "loss": 0.0515, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.533991813659668, + "rewards/margins": 0.6327404975891113, + "rewards/rejected": -5.1667327880859375, + "sft_loss": 4.246024131774902, + "step": 3305 + }, + { + "epoch": 1.771533701287841, + "grad_norm": 0.6137087486247289, + "learning_rate": 4.293751575992455e-07, + "logits/chosen": -0.7637144327163696, + "logits/rejected": -0.7552968859672546, + "logps/chosen": -4.631504535675049, + "logps/rejected": -5.132235527038574, + "loss": 0.053, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.631504535675049, + "rewards/margins": 0.5007305145263672, + "rewards/rejected": -5.132235527038574, + "sft_loss": 4.390534400939941, + "step": 3310 + }, + { + "epoch": 1.7742097340692422, + "grad_norm": 0.4352540545382265, + "learning_rate": 4.278337145441916e-07, + "logits/chosen": -0.886570155620575, + "logits/rejected": -0.7136012315750122, + "logps/chosen": -4.416377067565918, + "logps/rejected": -5.063299179077148, + "loss": 0.0515, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.416377067565918, + "rewards/margins": 0.6469219326972961, + "rewards/rejected": -5.063299179077148, + "sft_loss": 4.168293476104736, + "step": 3315 + }, + { + "epoch": 1.776885766850644, + "grad_norm": 0.3831451426766769, + "learning_rate": 4.262929716467556e-07, + "logits/chosen": -0.8285180926322937, + "logits/rejected": -0.5632731318473816, + "logps/chosen": -4.392007827758789, + "logps/rejected": -5.139756679534912, + "loss": 0.052, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.392007827758789, + "rewards/margins": 0.7477489709854126, + "rewards/rejected": -5.139756679534912, + "sft_loss": 4.160672187805176, + "step": 3320 + }, + { + "epoch": 1.7795617996320456, + "grad_norm": 0.39530178751687783, + "learning_rate": 4.247529438552321e-07, + "logits/chosen": -0.9192399978637695, + "logits/rejected": -0.7035531997680664, + "logps/chosen": -4.617929458618164, + "logps/rejected": -5.078191757202148, + "loss": 0.0532, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.617929458618164, + "rewards/margins": 0.4602627754211426, + "rewards/rejected": -5.078191757202148, + "sft_loss": 4.380537986755371, + "step": 3325 + }, + { + "epoch": 1.782237832413447, + "grad_norm": 0.44253110919207433, + "learning_rate": 4.232136461109773e-07, + "logits/chosen": -0.8389490842819214, + "logits/rejected": -0.7441287040710449, + "logps/chosen": -4.518656253814697, + "logps/rejected": -5.131009101867676, + "loss": 0.0514, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.518656253814697, + "rewards/margins": 0.6123533248901367, + "rewards/rejected": -5.131009101867676, + "sft_loss": 4.230744361877441, + "step": 3330 + }, + { + "epoch": 1.7849138651948486, + "grad_norm": 0.4149420144966067, + "learning_rate": 4.216750933482646e-07, + "logits/chosen": -0.8691641092300415, + "logits/rejected": -0.6906719207763672, + "logps/chosen": -4.620790958404541, + "logps/rejected": -5.059487342834473, + "loss": 0.053, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.620790958404541, + "rewards/margins": 0.4386964440345764, + "rewards/rejected": -5.059487342834473, + "sft_loss": 4.235035419464111, + "step": 3335 + }, + { + "epoch": 1.7875898979762503, + "grad_norm": 0.5139969806101036, + "learning_rate": 4.2013730049413986e-07, + "logits/chosen": -0.8421649932861328, + "logits/rejected": -0.6403535604476929, + "logps/chosen": -4.331358432769775, + "logps/rejected": -5.270960807800293, + "loss": 0.0505, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.331358432769775, + "rewards/margins": 0.9396018981933594, + "rewards/rejected": -5.270960807800293, + "sft_loss": 4.097912788391113, + "step": 3340 + }, + { + "epoch": 1.7902659307576518, + "grad_norm": 0.30430095720378014, + "learning_rate": 4.1860028246827594e-07, + "logits/chosen": -0.8284981846809387, + "logits/rejected": -0.5819277167320251, + "logps/chosen": -4.4769392013549805, + "logps/rejected": -5.007475852966309, + "loss": 0.0518, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.4769392013549805, + "rewards/margins": 0.5305370092391968, + "rewards/rejected": -5.007475852966309, + "sft_loss": 4.225216388702393, + "step": 3345 + }, + { + "epoch": 1.7929419635390533, + "grad_norm": 0.4921011154236414, + "learning_rate": 4.170640541828285e-07, + "logits/chosen": -0.9329277873039246, + "logits/rejected": -0.7737506031990051, + "logps/chosen": -4.421766757965088, + "logps/rejected": -5.002078056335449, + "loss": 0.0526, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.421766757965088, + "rewards/margins": 0.5803118348121643, + "rewards/rejected": -5.002078056335449, + "sft_loss": 4.226648807525635, + "step": 3350 + }, + { + "epoch": 1.795617996320455, + "grad_norm": 0.43199463592891063, + "learning_rate": 4.1552863054229116e-07, + "logits/chosen": -0.7047543525695801, + "logits/rejected": -0.6609756350517273, + "logps/chosen": -4.531655788421631, + "logps/rejected": -5.140510559082031, + "loss": 0.0521, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.531655788421631, + "rewards/margins": 0.6088550686836243, + "rewards/rejected": -5.140510559082031, + "sft_loss": 4.227361679077148, + "step": 3355 + }, + { + "epoch": 1.7982940291018565, + "grad_norm": 0.5426284178434481, + "learning_rate": 4.139940264433508e-07, + "logits/chosen": -0.7945530414581299, + "logits/rejected": -0.5765786170959473, + "logps/chosen": -4.41995906829834, + "logps/rejected": -5.090672016143799, + "loss": 0.0517, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.41995906829834, + "rewards/margins": 0.6707130074501038, + "rewards/rejected": -5.090672016143799, + "sft_loss": 4.104195594787598, + "step": 3360 + }, + { + "epoch": 1.800970061883258, + "grad_norm": 0.3120787417194607, + "learning_rate": 4.1246025677474303e-07, + "logits/chosen": -0.8796011805534363, + "logits/rejected": -0.6404326558113098, + "logps/chosen": -4.570914268493652, + "logps/rejected": -5.127249240875244, + "loss": 0.0531, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.570914268493652, + "rewards/margins": 0.5563352108001709, + "rewards/rejected": -5.127249240875244, + "sft_loss": 4.335757255554199, + "step": 3365 + }, + { + "epoch": 1.8036460946646597, + "grad_norm": 0.3718432668133724, + "learning_rate": 4.10927336417108e-07, + "logits/chosen": -0.8479071855545044, + "logits/rejected": -0.6178969740867615, + "logps/chosen": -4.612782955169678, + "logps/rejected": -5.014036655426025, + "loss": 0.0534, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.612782955169678, + "rewards/margins": 0.40125328302383423, + "rewards/rejected": -5.014036655426025, + "sft_loss": 4.301538944244385, + "step": 3370 + }, + { + "epoch": 1.8063221274460612, + "grad_norm": 0.6878982267884615, + "learning_rate": 4.093952802428457e-07, + "logits/chosen": -0.6994894742965698, + "logits/rejected": -0.6803777813911438, + "logps/chosen": -4.756828308105469, + "logps/rejected": -5.290118217468262, + "loss": 0.0534, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.756828308105469, + "rewards/margins": 0.5332905054092407, + "rewards/rejected": -5.290118217468262, + "sft_loss": 4.457432746887207, + "step": 3375 + }, + { + "epoch": 1.8089981602274627, + "grad_norm": 0.589099141329664, + "learning_rate": 4.0786410311597184e-07, + "logits/chosen": -0.9029039144515991, + "logits/rejected": -0.6868539452552795, + "logps/chosen": -4.576333522796631, + "logps/rejected": -5.159743309020996, + "loss": 0.052, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.576333522796631, + "rewards/margins": 0.5834100842475891, + "rewards/rejected": -5.159743309020996, + "sft_loss": 4.252318382263184, + "step": 3380 + }, + { + "epoch": 1.8116741930088645, + "grad_norm": 0.3999674824957761, + "learning_rate": 4.063338198919737e-07, + "logits/chosen": -0.8113029599189758, + "logits/rejected": -0.846416175365448, + "logps/chosen": -4.430078506469727, + "logps/rejected": -4.966437339782715, + "loss": 0.0528, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.430078506469727, + "rewards/margins": 0.5363594889640808, + "rewards/rejected": -4.966437339782715, + "sft_loss": 4.174395561218262, + "step": 3385 + }, + { + "epoch": 1.814350225790266, + "grad_norm": 0.38273738849063044, + "learning_rate": 4.0480444541766575e-07, + "logits/chosen": -0.8961130380630493, + "logits/rejected": -0.7447247505187988, + "logps/chosen": -4.732843399047852, + "logps/rejected": -5.325850486755371, + "loss": 0.0529, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.732843399047852, + "rewards/margins": 0.5930072665214539, + "rewards/rejected": -5.325850486755371, + "sft_loss": 4.414688587188721, + "step": 3390 + }, + { + "epoch": 1.8170262585716674, + "grad_norm": 0.5147586922678519, + "learning_rate": 4.0327599453104606e-07, + "logits/chosen": -0.9704347848892212, + "logits/rejected": -0.780985951423645, + "logps/chosen": -4.527245998382568, + "logps/rejected": -5.088501930236816, + "loss": 0.0509, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.527245998382568, + "rewards/margins": 0.5612561106681824, + "rewards/rejected": -5.088501930236816, + "sft_loss": 4.192348480224609, + "step": 3395 + }, + { + "epoch": 1.8197022913530692, + "grad_norm": 0.5005840786352599, + "learning_rate": 4.017484820611514e-07, + "logits/chosen": -0.7550173997879028, + "logits/rejected": -0.6032567620277405, + "logps/chosen": -4.417336463928223, + "logps/rejected": -5.080150127410889, + "loss": 0.0511, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.417336463928223, + "rewards/margins": 0.6628138422966003, + "rewards/rejected": -5.080150127410889, + "sft_loss": 4.079850673675537, + "step": 3400 + }, + { + "epoch": 1.8223783241344707, + "grad_norm": 0.8229748724775394, + "learning_rate": 4.002219228279148e-07, + "logits/chosen": -0.8199543952941895, + "logits/rejected": -0.6911274194717407, + "logps/chosen": -4.341639518737793, + "logps/rejected": -4.903729438781738, + "loss": 0.0517, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.341639518737793, + "rewards/margins": 0.5620898008346558, + "rewards/rejected": -4.903729438781738, + "sft_loss": 4.065591812133789, + "step": 3405 + }, + { + "epoch": 1.8250543569158721, + "grad_norm": 0.4876509117776437, + "learning_rate": 3.9869633164202045e-07, + "logits/chosen": -0.8162205815315247, + "logits/rejected": -0.52099609375, + "logps/chosen": -4.461337089538574, + "logps/rejected": -5.199841022491455, + "loss": 0.051, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.461337089538574, + "rewards/margins": 0.7385039925575256, + "rewards/rejected": -5.199841022491455, + "sft_loss": 4.167636394500732, + "step": 3410 + }, + { + "epoch": 1.8277303896972739, + "grad_norm": 0.7065591387395672, + "learning_rate": 3.9717172330476077e-07, + "logits/chosen": -0.9044772982597351, + "logits/rejected": -0.7785995006561279, + "logps/chosen": -4.586330413818359, + "logps/rejected": -5.2429423332214355, + "loss": 0.0522, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.586330413818359, + "rewards/margins": 0.6566125154495239, + "rewards/rejected": -5.2429423332214355, + "sft_loss": 4.347871780395508, + "step": 3415 + }, + { + "epoch": 1.8304064224786754, + "grad_norm": 0.5075784931382683, + "learning_rate": 3.956481126078927e-07, + "logits/chosen": -0.6835981607437134, + "logits/rejected": -0.5895021557807922, + "logps/chosen": -4.792338848114014, + "logps/rejected": -5.461629867553711, + "loss": 0.0543, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.792338848114014, + "rewards/margins": 0.6692904829978943, + "rewards/rejected": -5.461629867553711, + "sft_loss": 4.541243553161621, + "step": 3420 + }, + { + "epoch": 1.8330824552600768, + "grad_norm": 0.537368618423302, + "learning_rate": 3.941255143334937e-07, + "logits/chosen": -0.7706578969955444, + "logits/rejected": -0.7533372640609741, + "logps/chosen": -4.472137451171875, + "logps/rejected": -5.051485538482666, + "loss": 0.0514, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.472137451171875, + "rewards/margins": 0.5793476700782776, + "rewards/rejected": -5.051485538482666, + "sft_loss": 4.173643589019775, + "step": 3425 + }, + { + "epoch": 1.8357584880414786, + "grad_norm": 0.35854577541397925, + "learning_rate": 3.9260394325381895e-07, + "logits/chosen": -0.7394391894340515, + "logits/rejected": -0.6220877766609192, + "logps/chosen": -4.258572578430176, + "logps/rejected": -5.175926685333252, + "loss": 0.0503, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.258572578430176, + "rewards/margins": 0.9173545837402344, + "rewards/rejected": -5.175926685333252, + "sft_loss": 3.9583396911621094, + "step": 3430 + }, + { + "epoch": 1.83843452082288, + "grad_norm": 0.41343245811909185, + "learning_rate": 3.9108341413115784e-07, + "logits/chosen": -0.7238286733627319, + "logits/rejected": -0.6091488003730774, + "logps/chosen": -4.33715295791626, + "logps/rejected": -4.925466060638428, + "loss": 0.0516, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.33715295791626, + "rewards/margins": 0.5883134603500366, + "rewards/rejected": -4.925466060638428, + "sft_loss": 3.97629976272583, + "step": 3435 + }, + { + "epoch": 1.8411105536042816, + "grad_norm": 0.41918254444927916, + "learning_rate": 3.895639417176905e-07, + "logits/chosen": -0.8348200917243958, + "logits/rejected": -0.707625687122345, + "logps/chosen": -4.645773887634277, + "logps/rejected": -5.314620018005371, + "loss": 0.0542, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.645773887634277, + "rewards/margins": 0.668846607208252, + "rewards/rejected": -5.314620018005371, + "sft_loss": 4.392010688781738, + "step": 3440 + }, + { + "epoch": 1.8437865863856833, + "grad_norm": 0.42246506471392314, + "learning_rate": 3.8804554075534497e-07, + "logits/chosen": -0.8349407315254211, + "logits/rejected": -0.5855430960655212, + "logps/chosen": -4.490998268127441, + "logps/rejected": -5.2729082107543945, + "loss": 0.0517, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.490998268127441, + "rewards/margins": 0.7819093465805054, + "rewards/rejected": -5.2729082107543945, + "sft_loss": 4.270726203918457, + "step": 3445 + }, + { + "epoch": 1.8464626191670848, + "grad_norm": 0.6390590052604445, + "learning_rate": 3.8652822597565403e-07, + "logits/chosen": -0.8645042181015015, + "logits/rejected": -0.6598786115646362, + "logps/chosen": -4.601441383361816, + "logps/rejected": -5.257026672363281, + "loss": 0.0523, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.601441383361816, + "rewards/margins": 0.6555854678153992, + "rewards/rejected": -5.257026672363281, + "sft_loss": 4.356138229370117, + "step": 3450 + }, + { + "epoch": 1.8491386519484863, + "grad_norm": 0.39388686843240567, + "learning_rate": 3.850120120996123e-07, + "logits/chosen": -0.8234783411026001, + "logits/rejected": -0.5680242776870728, + "logps/chosen": -4.540238857269287, + "logps/rejected": -5.173902988433838, + "loss": 0.052, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.540238857269287, + "rewards/margins": 0.633664071559906, + "rewards/rejected": -5.173902988433838, + "sft_loss": 4.335484504699707, + "step": 3455 + }, + { + "epoch": 1.851814684729888, + "grad_norm": 0.4499215399446686, + "learning_rate": 3.8349691383753356e-07, + "logits/chosen": -0.6719542145729065, + "logits/rejected": -0.5450443029403687, + "logps/chosen": -4.431046962738037, + "logps/rejected": -5.082084655761719, + "loss": 0.0518, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.431046962738037, + "rewards/margins": 0.6510375142097473, + "rewards/rejected": -5.082084655761719, + "sft_loss": 4.082747459411621, + "step": 3460 + }, + { + "epoch": 1.8544907175112895, + "grad_norm": 0.3778904426429339, + "learning_rate": 3.819829458889078e-07, + "logits/chosen": -0.7996759414672852, + "logits/rejected": -0.7100083231925964, + "logps/chosen": -4.312413215637207, + "logps/rejected": -4.77877950668335, + "loss": 0.0522, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.312413215637207, + "rewards/margins": 0.4663669466972351, + "rewards/rejected": -4.77877950668335, + "sft_loss": 4.042895317077637, + "step": 3465 + }, + { + "epoch": 1.857166750292691, + "grad_norm": 0.5604460188611351, + "learning_rate": 3.804701229422585e-07, + "logits/chosen": -0.8722193837165833, + "logits/rejected": -0.8069941401481628, + "logps/chosen": -4.671416282653809, + "logps/rejected": -5.111981391906738, + "loss": 0.0542, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.671416282653809, + "rewards/margins": 0.4405645728111267, + "rewards/rejected": -5.111981391906738, + "sft_loss": 4.324137210845947, + "step": 3470 + }, + { + "epoch": 1.8598427830740927, + "grad_norm": 0.46389515283591515, + "learning_rate": 3.789584596750007e-07, + "logits/chosen": -0.8924866914749146, + "logits/rejected": -0.86207515001297, + "logps/chosen": -4.578545570373535, + "logps/rejected": -5.2069993019104, + "loss": 0.0525, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.578545570373535, + "rewards/margins": 0.62845379114151, + "rewards/rejected": -5.2069993019104, + "sft_loss": 4.352547645568848, + "step": 3475 + }, + { + "epoch": 1.8625188158554944, + "grad_norm": 0.3101780013628597, + "learning_rate": 3.77447970753298e-07, + "logits/chosen": -0.8054901957511902, + "logits/rejected": -0.8184728622436523, + "logps/chosen": -4.611974716186523, + "logps/rejected": -5.077882766723633, + "loss": 0.0536, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.611974716186523, + "rewards/margins": 0.4659079909324646, + "rewards/rejected": -5.077882766723633, + "sft_loss": 4.400869369506836, + "step": 3480 + }, + { + "epoch": 1.8651948486368957, + "grad_norm": 0.4254690093635414, + "learning_rate": 3.7593867083192057e-07, + "logits/chosen": -0.8784816861152649, + "logits/rejected": -0.7168563604354858, + "logps/chosen": -4.520941257476807, + "logps/rejected": -5.078371524810791, + "loss": 0.0522, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.520941257476807, + "rewards/margins": 0.5574301481246948, + "rewards/rejected": -5.078371524810791, + "sft_loss": 4.297972679138184, + "step": 3485 + }, + { + "epoch": 1.8678708814182974, + "grad_norm": 0.48612755028753435, + "learning_rate": 3.7443057455410276e-07, + "logits/chosen": -0.7678418159484863, + "logits/rejected": -0.6967147588729858, + "logps/chosen": -4.190707206726074, + "logps/rejected": -5.020942211151123, + "loss": 0.0509, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.190707206726074, + "rewards/margins": 0.8302351832389832, + "rewards/rejected": -5.020942211151123, + "sft_loss": 4.027964115142822, + "step": 3490 + }, + { + "epoch": 1.870546914199699, + "grad_norm": 0.357070124593642, + "learning_rate": 3.7292369655140145e-07, + "logits/chosen": -1.0313518047332764, + "logits/rejected": -0.8303739428520203, + "logps/chosen": -4.52823543548584, + "logps/rejected": -5.011662006378174, + "loss": 0.054, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.52823543548584, + "rewards/margins": 0.4834265112876892, + "rewards/rejected": -5.011662006378174, + "sft_loss": 4.35888147354126, + "step": 3495 + }, + { + "epoch": 1.8732229469811004, + "grad_norm": 0.35556182344734116, + "learning_rate": 3.714180514435534e-07, + "logits/chosen": -0.8014265298843384, + "logits/rejected": -0.6077925562858582, + "logps/chosen": -4.6149725914001465, + "logps/rejected": -5.177509307861328, + "loss": 0.0527, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.6149725914001465, + "rewards/margins": 0.562536895275116, + "rewards/rejected": -5.177509307861328, + "sft_loss": 4.300775527954102, + "step": 3500 + }, + { + "epoch": 1.875898979762502, + "grad_norm": 0.3562554445522406, + "learning_rate": 3.6991365383833426e-07, + "logits/chosen": -0.9311081171035767, + "logits/rejected": -0.7546035647392273, + "logps/chosen": -4.536148548126221, + "logps/rejected": -5.134714126586914, + "loss": 0.0516, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.536148548126221, + "rewards/margins": 0.5985656976699829, + "rewards/rejected": -5.134714126586914, + "sft_loss": 4.202921390533447, + "step": 3505 + }, + { + "epoch": 1.8785750125439038, + "grad_norm": 0.4070594382170017, + "learning_rate": 3.684105183314162e-07, + "logits/chosen": -0.9334009289741516, + "logits/rejected": -0.8775346875190735, + "logps/chosen": -4.4893903732299805, + "logps/rejected": -5.005202293395996, + "loss": 0.0522, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.4893903732299805, + "rewards/margins": 0.5158116221427917, + "rewards/rejected": -5.005202293395996, + "sft_loss": 4.235560894012451, + "step": 3510 + }, + { + "epoch": 1.881251045325305, + "grad_norm": 0.42486172086349194, + "learning_rate": 3.669086595062263e-07, + "logits/chosen": -0.8683802485466003, + "logits/rejected": -0.6446768641471863, + "logps/chosen": -4.424868583679199, + "logps/rejected": -5.10919713973999, + "loss": 0.0517, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.424868583679199, + "rewards/margins": 0.6843288540840149, + "rewards/rejected": -5.10919713973999, + "sft_loss": 4.203177452087402, + "step": 3515 + }, + { + "epoch": 1.8839270781067068, + "grad_norm": 0.3631473467602663, + "learning_rate": 3.654080919338056e-07, + "logits/chosen": -0.9535540342330933, + "logits/rejected": -0.7772972583770752, + "logps/chosen": -4.488328456878662, + "logps/rejected": -5.187651634216309, + "loss": 0.0518, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.488328456878662, + "rewards/margins": 0.6993231773376465, + "rewards/rejected": -5.187651634216309, + "sft_loss": 4.2527360916137695, + "step": 3520 + }, + { + "epoch": 1.8866031108881085, + "grad_norm": 1.108956434138034, + "learning_rate": 3.639088301726673e-07, + "logits/chosen": -0.8254637718200684, + "logits/rejected": -0.6029265522956848, + "logps/chosen": -4.488307476043701, + "logps/rejected": -5.122195720672607, + "loss": 0.0526, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.488307476043701, + "rewards/margins": 0.633887529373169, + "rewards/rejected": -5.122195720672607, + "sft_loss": 4.214524745941162, + "step": 3525 + }, + { + "epoch": 1.88927914366951, + "grad_norm": 0.297083487299301, + "learning_rate": 3.624108887686556e-07, + "logits/chosen": -0.8513669967651367, + "logits/rejected": -0.8039869070053101, + "logps/chosen": -4.6177592277526855, + "logps/rejected": -5.003042697906494, + "loss": 0.053, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.6177592277526855, + "rewards/margins": 0.38528352975845337, + "rewards/rejected": -5.003042697906494, + "sft_loss": 4.321106910705566, + "step": 3530 + }, + { + "epoch": 1.8919551764509115, + "grad_norm": 0.3894391188758439, + "learning_rate": 3.6091428225480433e-07, + "logits/chosen": -1.000799536705017, + "logits/rejected": -0.8797224164009094, + "logps/chosen": -4.493613243103027, + "logps/rejected": -5.126000881195068, + "loss": 0.0513, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.493613243103027, + "rewards/margins": 0.632387638092041, + "rewards/rejected": -5.126000881195068, + "sft_loss": 4.193148612976074, + "step": 3535 + }, + { + "epoch": 1.8946312092323132, + "grad_norm": 0.38449769948061835, + "learning_rate": 3.5941902515119674e-07, + "logits/chosen": -1.034938097000122, + "logits/rejected": -0.7592099905014038, + "logps/chosen": -4.570082664489746, + "logps/rejected": -5.047337532043457, + "loss": 0.0533, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.570082664489746, + "rewards/margins": 0.47725504636764526, + "rewards/rejected": -5.047337532043457, + "sft_loss": 4.295851707458496, + "step": 3540 + }, + { + "epoch": 1.8973072420137147, + "grad_norm": 0.5234235514253388, + "learning_rate": 3.5792513196482373e-07, + "logits/chosen": -1.1635518074035645, + "logits/rejected": -0.8318096399307251, + "logps/chosen": -4.4717559814453125, + "logps/rejected": -5.075552463531494, + "loss": 0.0503, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.4717559814453125, + "rewards/margins": 0.6037967205047607, + "rewards/rejected": -5.075552463531494, + "sft_loss": 4.137124538421631, + "step": 3545 + }, + { + "epoch": 1.8999832747951162, + "grad_norm": 0.33314820472642775, + "learning_rate": 3.5643261718944346e-07, + "logits/chosen": -0.8819448351860046, + "logits/rejected": -0.7907723188400269, + "logps/chosen": -4.5302557945251465, + "logps/rejected": -4.951289176940918, + "loss": 0.0536, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.5302557945251465, + "rewards/margins": 0.4210330843925476, + "rewards/rejected": -4.951289176940918, + "sft_loss": 4.2429728507995605, + "step": 3550 + }, + { + "epoch": 1.902659307576518, + "grad_norm": 0.5263923862202771, + "learning_rate": 3.5494149530544087e-07, + "logits/chosen": -1.0538041591644287, + "logits/rejected": -0.9239507913589478, + "logps/chosen": -4.604650497436523, + "logps/rejected": -5.217805862426758, + "loss": 0.0524, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.604650497436523, + "rewards/margins": 0.6131556034088135, + "rewards/rejected": -5.217805862426758, + "sft_loss": 4.297959327697754, + "step": 3555 + }, + { + "epoch": 1.9053353403579194, + "grad_norm": 0.5020939907325735, + "learning_rate": 3.534517807796871e-07, + "logits/chosen": -0.9725322723388672, + "logits/rejected": -0.8672218322753906, + "logps/chosen": -4.604799747467041, + "logps/rejected": -5.175978183746338, + "loss": 0.0523, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.604799747467041, + "rewards/margins": 0.571178674697876, + "rewards/rejected": -5.175978183746338, + "sft_loss": 4.33532190322876, + "step": 3560 + }, + { + "epoch": 1.908011373139321, + "grad_norm": 0.43958453397135727, + "learning_rate": 3.519634880653988e-07, + "logits/chosen": -0.841769814491272, + "logits/rejected": -0.7862639427185059, + "logps/chosen": -4.502523422241211, + "logps/rejected": -5.1410603523254395, + "loss": 0.0508, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.502523422241211, + "rewards/margins": 0.6385367512702942, + "rewards/rejected": -5.1410603523254395, + "sft_loss": 4.152388572692871, + "step": 3565 + }, + { + "epoch": 1.9106874059207226, + "grad_norm": 0.4603995201061191, + "learning_rate": 3.504766316019987e-07, + "logits/chosen": -0.8893829584121704, + "logits/rejected": -0.6943017840385437, + "logps/chosen": -4.210341453552246, + "logps/rejected": -4.964727401733398, + "loss": 0.0497, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.210341453552246, + "rewards/margins": 0.7543860673904419, + "rewards/rejected": -4.964727401733398, + "sft_loss": 3.8542957305908203, + "step": 3570 + }, + { + "epoch": 1.913363438702124, + "grad_norm": 0.4166426736441857, + "learning_rate": 3.489912258149745e-07, + "logits/chosen": -0.7692733407020569, + "logits/rejected": -0.6670472025871277, + "logps/chosen": -4.290536403656006, + "logps/rejected": -5.004633903503418, + "loss": 0.0513, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.290536403656006, + "rewards/margins": 0.7140974998474121, + "rewards/rejected": -5.004633903503418, + "sft_loss": 3.9590721130371094, + "step": 3575 + }, + { + "epoch": 1.9160394714835256, + "grad_norm": 0.4202601946525645, + "learning_rate": 3.475072851157397e-07, + "logits/chosen": -0.8163237571716309, + "logits/rejected": -0.7968477010726929, + "logps/chosen": -4.64694881439209, + "logps/rejected": -5.253317356109619, + "loss": 0.0523, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.64694881439209, + "rewards/margins": 0.6063681840896606, + "rewards/rejected": -5.253317356109619, + "sft_loss": 4.27789306640625, + "step": 3580 + }, + { + "epoch": 1.9187155042649273, + "grad_norm": 0.48439923094521753, + "learning_rate": 3.460248239014936e-07, + "logits/chosen": -0.7363717555999756, + "logits/rejected": -0.7516336441040039, + "logps/chosen": -4.712521553039551, + "logps/rejected": -5.380669593811035, + "loss": 0.052, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.712521553039551, + "rewards/margins": 0.6681480407714844, + "rewards/rejected": -5.380669593811035, + "sft_loss": 4.458998203277588, + "step": 3585 + }, + { + "epoch": 1.9213915370463288, + "grad_norm": 1.2607721506545888, + "learning_rate": 3.4454385655508134e-07, + "logits/chosen": -0.7563374042510986, + "logits/rejected": -0.7634093165397644, + "logps/chosen": -4.5644426345825195, + "logps/rejected": -5.0168280601501465, + "loss": 0.0544, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -4.5644426345825195, + "rewards/margins": 0.45238548517227173, + "rewards/rejected": -5.0168280601501465, + "sft_loss": 4.332982540130615, + "step": 3590 + }, + { + "epoch": 1.9240675698277303, + "grad_norm": 0.5141778910520323, + "learning_rate": 3.4306439744485447e-07, + "logits/chosen": -0.9014706611633301, + "logits/rejected": -0.6987265348434448, + "logps/chosen": -4.463529586791992, + "logps/rejected": -5.1967034339904785, + "loss": 0.0517, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.463529586791992, + "rewards/margins": 0.7331734895706177, + "rewards/rejected": -5.1967034339904785, + "sft_loss": 4.167792320251465, + "step": 3595 + }, + { + "epoch": 1.926743602609132, + "grad_norm": 0.7170772833367152, + "learning_rate": 3.415864609245322e-07, + "logits/chosen": -0.8435789942741394, + "logits/rejected": -0.6109930276870728, + "logps/chosen": -4.614388465881348, + "logps/rejected": -5.201871871948242, + "loss": 0.0538, + "rewards/accuracies": 0.59375, + "rewards/chosen": -4.614388465881348, + "rewards/margins": 0.5874830484390259, + "rewards/rejected": -5.201871871948242, + "sft_loss": 4.346115589141846, + "step": 3600 + }, + { + "epoch": 1.926743602609132, + "eval_logits/chosen": -0.32916495203971863, + "eval_logits/rejected": -0.2475946843624115, + "eval_logps/chosen": -4.419311046600342, + "eval_logps/rejected": -5.063793659210205, + "eval_loss": 0.050544675439596176, + "eval_rewards/accuracies": 0.6847180724143982, + "eval_rewards/chosen": -4.419311046600342, + "eval_rewards/margins": 0.6444829702377319, + "eval_rewards/rejected": -5.063793659210205, + "eval_runtime": 43.3773, + "eval_samples_per_second": 31.007, + "eval_sft_loss": 4.043381214141846, + "eval_steps_per_second": 7.769, + "step": 3600 + }, + { + "epoch": 1.9294196353905335, + "grad_norm": 0.4483136535653781, + "learning_rate": 3.401100613330605e-07, + "logits/chosen": -0.9276592135429382, + "logits/rejected": -0.9318366050720215, + "logps/chosen": -4.496493339538574, + "logps/rejected": -4.951340675354004, + "loss": 0.0527, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.496493339538574, + "rewards/margins": 0.45484742522239685, + "rewards/rejected": -4.951340675354004, + "sft_loss": 4.23056173324585, + "step": 3605 + }, + { + "epoch": 1.932095668171935, + "grad_norm": 0.31593869243204614, + "learning_rate": 3.3863521299447514e-07, + "logits/chosen": -0.9971421360969543, + "logits/rejected": -0.8217166662216187, + "logps/chosen": -4.537590980529785, + "logps/rejected": -5.150124549865723, + "loss": 0.0522, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.537590980529785, + "rewards/margins": 0.6125333905220032, + "rewards/rejected": -5.150124549865723, + "sft_loss": 4.283602714538574, + "step": 3610 + }, + { + "epoch": 1.9347717009533367, + "grad_norm": 0.40521049552058575, + "learning_rate": 3.371619302177609e-07, + "logits/chosen": -0.8417550921440125, + "logits/rejected": -0.700288712978363, + "logps/chosen": -4.353780269622803, + "logps/rejected": -4.941786766052246, + "loss": 0.0529, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.353780269622803, + "rewards/margins": 0.5880061388015747, + "rewards/rejected": -4.941786766052246, + "sft_loss": 4.163086891174316, + "step": 3615 + }, + { + "epoch": 1.9374477337347382, + "grad_norm": 0.5389906830610456, + "learning_rate": 3.3569022729671393e-07, + "logits/chosen": -0.9811625480651855, + "logits/rejected": -0.8910226821899414, + "logps/chosen": -4.600131511688232, + "logps/rejected": -5.100063800811768, + "loss": 0.0536, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.600131511688232, + "rewards/margins": 0.4999319016933441, + "rewards/rejected": -5.100063800811768, + "sft_loss": 4.426603317260742, + "step": 3620 + }, + { + "epoch": 1.9401237665161397, + "grad_norm": 0.4394386395821723, + "learning_rate": 3.342201185098024e-07, + "logits/chosen": -0.8498247265815735, + "logits/rejected": -0.9369813203811646, + "logps/chosen": -4.571401119232178, + "logps/rejected": -4.972775936126709, + "loss": 0.0524, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.571401119232178, + "rewards/margins": 0.4013746380805969, + "rewards/rejected": -4.972775936126709, + "sft_loss": 4.325285911560059, + "step": 3625 + }, + { + "epoch": 1.9427997992975414, + "grad_norm": 0.38906540416917657, + "learning_rate": 3.3275161812002807e-07, + "logits/chosen": -0.9747118949890137, + "logits/rejected": -0.9587947726249695, + "logps/chosen": -4.523557662963867, + "logps/rejected": -5.116204261779785, + "loss": 0.0529, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.523557662963867, + "rewards/margins": 0.5926466584205627, + "rewards/rejected": -5.116204261779785, + "sft_loss": 4.328993797302246, + "step": 3630 + }, + { + "epoch": 1.945475832078943, + "grad_norm": 0.37734258771151635, + "learning_rate": 3.312847403747883e-07, + "logits/chosen": -1.0632799863815308, + "logits/rejected": -0.9446859359741211, + "logps/chosen": -4.447041988372803, + "logps/rejected": -5.011303424835205, + "loss": 0.0519, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.447041988372803, + "rewards/margins": 0.5642611384391785, + "rewards/rejected": -5.011303424835205, + "sft_loss": 4.248290061950684, + "step": 3635 + }, + { + "epoch": 1.9481518648603444, + "grad_norm": 0.3448461543727165, + "learning_rate": 3.2981949950573733e-07, + "logits/chosen": -0.950855553150177, + "logits/rejected": -0.9059017300605774, + "logps/chosen": -4.567697048187256, + "logps/rejected": -4.874037742614746, + "loss": 0.0533, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.567697048187256, + "rewards/margins": 0.3063400685787201, + "rewards/rejected": -4.874037742614746, + "sft_loss": 4.307459831237793, + "step": 3640 + }, + { + "epoch": 1.9508278976417461, + "grad_norm": 0.3138553731803001, + "learning_rate": 3.283559097286486e-07, + "logits/chosen": -0.9531084895133972, + "logits/rejected": -0.8209444284439087, + "logps/chosen": -4.5451555252075195, + "logps/rejected": -5.012529373168945, + "loss": 0.0524, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.5451555252075195, + "rewards/margins": 0.4673749506473541, + "rewards/rejected": -5.012529373168945, + "sft_loss": 4.314141273498535, + "step": 3645 + }, + { + "epoch": 1.9535039304231478, + "grad_norm": 0.4786368381669681, + "learning_rate": 3.268939852432765e-07, + "logits/chosen": -1.0215755701065063, + "logits/rejected": -0.9069620370864868, + "logps/chosen": -4.601208686828613, + "logps/rejected": -4.959532737731934, + "loss": 0.054, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -4.601208686828613, + "rewards/margins": 0.3583240211009979, + "rewards/rejected": -4.959532737731934, + "sft_loss": 4.377420425415039, + "step": 3650 + }, + { + "epoch": 1.9561799632045491, + "grad_norm": 0.560950653630944, + "learning_rate": 3.254337402332187e-07, + "logits/chosen": -0.8691121935844421, + "logits/rejected": -0.7701493501663208, + "logps/chosen": -4.460612773895264, + "logps/rejected": -5.0053582191467285, + "loss": 0.052, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.460612773895264, + "rewards/margins": 0.5447447299957275, + "rewards/rejected": -5.0053582191467285, + "sft_loss": 4.138766288757324, + "step": 3655 + }, + { + "epoch": 1.9588559959859508, + "grad_norm": 0.7040297008633517, + "learning_rate": 3.239751888657788e-07, + "logits/chosen": -0.9016326665878296, + "logits/rejected": -0.7556005120277405, + "logps/chosen": -4.512946605682373, + "logps/rejected": -5.122220993041992, + "loss": 0.0524, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.512946605682373, + "rewards/margins": 0.6092746257781982, + "rewards/rejected": -5.122220993041992, + "sft_loss": 4.221314907073975, + "step": 3660 + }, + { + "epoch": 1.9615320287673526, + "grad_norm": 0.4938417120856117, + "learning_rate": 3.2251834529182856e-07, + "logits/chosen": -0.9205803871154785, + "logits/rejected": -0.8506709337234497, + "logps/chosen": -4.682583332061768, + "logps/rejected": -5.245231628417969, + "loss": 0.052, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.682583332061768, + "rewards/margins": 0.5626480579376221, + "rewards/rejected": -5.245231628417969, + "sft_loss": 4.3715009689331055, + "step": 3665 + }, + { + "epoch": 1.9642080615487538, + "grad_norm": 0.6781558880822852, + "learning_rate": 3.2106322364567075e-07, + "logits/chosen": -0.9177119135856628, + "logits/rejected": -0.7638968825340271, + "logps/chosen": -4.520151138305664, + "logps/rejected": -5.164031028747559, + "loss": 0.0528, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.520151138305664, + "rewards/margins": 0.6438802480697632, + "rewards/rejected": -5.164031028747559, + "sft_loss": 4.398355484008789, + "step": 3670 + }, + { + "epoch": 1.9668840943301555, + "grad_norm": 0.43076111918754245, + "learning_rate": 3.1960983804490183e-07, + "logits/chosen": -0.8420068621635437, + "logits/rejected": -0.6891841888427734, + "logps/chosen": -4.482913017272949, + "logps/rejected": -5.048392295837402, + "loss": 0.053, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.482913017272949, + "rewards/margins": 0.5654793977737427, + "rewards/rejected": -5.048392295837402, + "sft_loss": 4.185931205749512, + "step": 3675 + }, + { + "epoch": 1.9695601271115573, + "grad_norm": 0.47109133227895944, + "learning_rate": 3.1815820259027537e-07, + "logits/chosen": -0.8213092088699341, + "logits/rejected": -0.7251837849617004, + "logps/chosen": -4.373992919921875, + "logps/rejected": -4.963822841644287, + "loss": 0.051, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.373992919921875, + "rewards/margins": 0.5898297429084778, + "rewards/rejected": -4.963822841644287, + "sft_loss": 4.038293838500977, + "step": 3680 + }, + { + "epoch": 1.9722361598929585, + "grad_norm": 0.4587038973394695, + "learning_rate": 3.16708331365565e-07, + "logits/chosen": -0.8804284334182739, + "logits/rejected": -0.7934475541114807, + "logps/chosen": -4.585146903991699, + "logps/rejected": -5.182665824890137, + "loss": 0.0523, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.585146903991699, + "rewards/margins": 0.5975189805030823, + "rewards/rejected": -5.182665824890137, + "sft_loss": 4.350895881652832, + "step": 3685 + }, + { + "epoch": 1.9749121926743602, + "grad_norm": 0.4353653760418129, + "learning_rate": 3.152602384374275e-07, + "logits/chosen": -0.8628435134887695, + "logits/rejected": -0.6308306455612183, + "logps/chosen": -4.509026527404785, + "logps/rejected": -5.229661464691162, + "loss": 0.0514, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.509026527404785, + "rewards/margins": 0.720634937286377, + "rewards/rejected": -5.229661464691162, + "sft_loss": 4.181239128112793, + "step": 3690 + }, + { + "epoch": 1.977588225455762, + "grad_norm": 0.4775253211912713, + "learning_rate": 3.1381393785526697e-07, + "logits/chosen": -0.8128175735473633, + "logits/rejected": -0.7428448796272278, + "logps/chosen": -4.55011510848999, + "logps/rejected": -5.149205207824707, + "loss": 0.0518, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.55011510848999, + "rewards/margins": 0.5990898013114929, + "rewards/rejected": -5.149205207824707, + "sft_loss": 4.240631103515625, + "step": 3695 + }, + { + "epoch": 1.9802642582371635, + "grad_norm": 0.4083359155614466, + "learning_rate": 3.123694436510979e-07, + "logits/chosen": -0.7938219308853149, + "logits/rejected": -0.6490747928619385, + "logps/chosen": -4.4928998947143555, + "logps/rejected": -5.018836498260498, + "loss": 0.0531, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.4928998947143555, + "rewards/margins": 0.5259365439414978, + "rewards/rejected": -5.018836498260498, + "sft_loss": 4.249953269958496, + "step": 3700 + }, + { + "epoch": 1.982940291018565, + "grad_norm": 0.4099432277759763, + "learning_rate": 3.1092676983940946e-07, + "logits/chosen": -0.8014167547225952, + "logits/rejected": -0.7310387492179871, + "logps/chosen": -4.47890567779541, + "logps/rejected": -5.038219928741455, + "loss": 0.0518, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.47890567779541, + "rewards/margins": 0.5593137145042419, + "rewards/rejected": -5.038219928741455, + "sft_loss": 4.150893211364746, + "step": 3705 + }, + { + "epoch": 1.9856163237999667, + "grad_norm": 0.3350153632175618, + "learning_rate": 3.094859304170293e-07, + "logits/chosen": -0.6266270875930786, + "logits/rejected": -0.6359224915504456, + "logps/chosen": -4.500874042510986, + "logps/rejected": -5.046133995056152, + "loss": 0.0523, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.500874042510986, + "rewards/margins": 0.5452605485916138, + "rewards/rejected": -5.046133995056152, + "sft_loss": 4.219162940979004, + "step": 3710 + }, + { + "epoch": 1.9882923565813682, + "grad_norm": 0.3536268621775762, + "learning_rate": 3.0804693936298795e-07, + "logits/chosen": -0.8250945210456848, + "logits/rejected": -0.7901517152786255, + "logps/chosen": -4.515018463134766, + "logps/rejected": -5.176741600036621, + "loss": 0.052, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.515018463134766, + "rewards/margins": 0.6617237329483032, + "rewards/rejected": -5.176741600036621, + "sft_loss": 4.3591413497924805, + "step": 3715 + }, + { + "epoch": 1.9909683893627697, + "grad_norm": 0.40375392263119575, + "learning_rate": 3.066098106383826e-07, + "logits/chosen": -0.8308378458023071, + "logits/rejected": -0.7185466289520264, + "logps/chosen": -4.537173271179199, + "logps/rejected": -5.094812870025635, + "loss": 0.0517, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.537173271179199, + "rewards/margins": 0.5576392412185669, + "rewards/rejected": -5.094812870025635, + "sft_loss": 4.137363433837891, + "step": 3720 + }, + { + "epoch": 1.9936444221441714, + "grad_norm": 0.48157595238238843, + "learning_rate": 3.0517455818624263e-07, + "logits/chosen": -0.8573415875434875, + "logits/rejected": -0.7897814512252808, + "logps/chosen": -4.485922813415527, + "logps/rejected": -5.143821716308594, + "loss": 0.0521, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.485922813415527, + "rewards/margins": 0.6578995585441589, + "rewards/rejected": -5.143821716308594, + "sft_loss": 4.253503799438477, + "step": 3725 + }, + { + "epoch": 1.9963204549255729, + "grad_norm": 0.4845596241734785, + "learning_rate": 3.037411959313936e-07, + "logits/chosen": -0.7231793403625488, + "logits/rejected": -0.5665433406829834, + "logps/chosen": -4.50700044631958, + "logps/rejected": -5.043286323547363, + "loss": 0.0527, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.50700044631958, + "rewards/margins": 0.5362854599952698, + "rewards/rejected": -5.043286323547363, + "sft_loss": 4.2585368156433105, + "step": 3730 + }, + { + "epoch": 1.9989964877069744, + "grad_norm": 1.072754132396118, + "learning_rate": 3.023097377803224e-07, + "logits/chosen": -0.7113522887229919, + "logits/rejected": -0.6499849557876587, + "logps/chosen": -4.468755722045898, + "logps/rejected": -5.006053447723389, + "loss": 0.0533, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.468755722045898, + "rewards/margins": 0.5372985005378723, + "rewards/rejected": -5.006053447723389, + "sft_loss": 4.215538024902344, + "step": 3735 + }, + { + "epoch": 2.001672520488376, + "grad_norm": 0.4718352567577755, + "learning_rate": 3.008801976210423e-07, + "logits/chosen": -0.7235435843467712, + "logits/rejected": -0.6755806803703308, + "logps/chosen": -4.443969249725342, + "logps/rejected": -4.865131855010986, + "loss": 0.053, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.443969249725342, + "rewards/margins": 0.42116230726242065, + "rewards/rejected": -4.865131855010986, + "sft_loss": 4.133395195007324, + "step": 3740 + }, + { + "epoch": 2.0043485532697773, + "grad_norm": 0.45285574666734346, + "learning_rate": 2.994525893229581e-07, + "logits/chosen": -0.7920453548431396, + "logits/rejected": -0.7268325090408325, + "logps/chosen": -4.540145397186279, + "logps/rejected": -5.200549125671387, + "loss": 0.0507, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.540145397186279, + "rewards/margins": 0.6604036688804626, + "rewards/rejected": -5.200549125671387, + "sft_loss": 4.260335445404053, + "step": 3745 + }, + { + "epoch": 2.007024586051179, + "grad_norm": 0.4211428890658203, + "learning_rate": 2.98026926736732e-07, + "logits/chosen": -0.8909608721733093, + "logits/rejected": -0.7725866436958313, + "logps/chosen": -4.55959415435791, + "logps/rejected": -5.068253040313721, + "loss": 0.0526, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.55959415435791, + "rewards/margins": 0.5086590647697449, + "rewards/rejected": -5.068253040313721, + "sft_loss": 4.258984088897705, + "step": 3750 + }, + { + "epoch": 2.0097006188325808, + "grad_norm": 0.4308395644884328, + "learning_rate": 2.9660322369414846e-07, + "logits/chosen": -0.8293676376342773, + "logits/rejected": -0.6661375761032104, + "logps/chosen": -4.575963973999023, + "logps/rejected": -5.214879989624023, + "loss": 0.0526, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.575963973999023, + "rewards/margins": 0.638916015625, + "rewards/rejected": -5.214879989624023, + "sft_loss": 4.343644618988037, + "step": 3755 + }, + { + "epoch": 2.0123766516139825, + "grad_norm": 0.2876446754182886, + "learning_rate": 2.9518149400798063e-07, + "logits/chosen": -0.850312352180481, + "logits/rejected": -0.8274194002151489, + "logps/chosen": -4.500282287597656, + "logps/rejected": -5.1650567054748535, + "loss": 0.0517, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.500282287597656, + "rewards/margins": 0.6647745370864868, + "rewards/rejected": -5.1650567054748535, + "sft_loss": 4.275763511657715, + "step": 3760 + }, + { + "epoch": 2.0150526843953838, + "grad_norm": 0.39282700418404237, + "learning_rate": 2.9376175147185633e-07, + "logits/chosen": -0.844325065612793, + "logits/rejected": -0.5920781493186951, + "logps/chosen": -4.502379894256592, + "logps/rejected": -5.151594638824463, + "loss": 0.0511, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.502379894256592, + "rewards/margins": 0.649213969707489, + "rewards/rejected": -5.151594638824463, + "sft_loss": 4.227784633636475, + "step": 3765 + }, + { + "epoch": 2.0177287171767855, + "grad_norm": 0.42074838124805847, + "learning_rate": 2.9234400986012376e-07, + "logits/chosen": -0.8818314671516418, + "logits/rejected": -0.6762841939926147, + "logps/chosen": -4.428236961364746, + "logps/rejected": -5.1282124519348145, + "loss": 0.051, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.428236961364746, + "rewards/margins": 0.699974775314331, + "rewards/rejected": -5.1282124519348145, + "sft_loss": 4.132842063903809, + "step": 3770 + }, + { + "epoch": 2.020404749958187, + "grad_norm": 0.4956780345069873, + "learning_rate": 2.9092828292771817e-07, + "logits/chosen": -0.7929924130439758, + "logits/rejected": -0.7674404382705688, + "logps/chosen": -4.421003818511963, + "logps/rejected": -5.078767776489258, + "loss": 0.0515, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.421003818511963, + "rewards/margins": 0.6577636003494263, + "rewards/rejected": -5.078767776489258, + "sft_loss": 4.150858402252197, + "step": 3775 + }, + { + "epoch": 2.0230807827395885, + "grad_norm": 0.3934700234275401, + "learning_rate": 2.8951458441002875e-07, + "logits/chosen": -0.698670506477356, + "logits/rejected": -0.7248165011405945, + "logps/chosen": -4.2970147132873535, + "logps/rejected": -4.952702522277832, + "loss": 0.0513, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.2970147132873535, + "rewards/margins": 0.655687689781189, + "rewards/rejected": -4.952702522277832, + "sft_loss": 4.0070271492004395, + "step": 3780 + }, + { + "epoch": 2.02575681552099, + "grad_norm": 0.514669682771898, + "learning_rate": 2.881029280227643e-07, + "logits/chosen": -0.8328143358230591, + "logits/rejected": -0.6541591882705688, + "logps/chosen": -4.35631799697876, + "logps/rejected": -5.068148612976074, + "loss": 0.0509, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.35631799697876, + "rewards/margins": 0.7118302583694458, + "rewards/rejected": -5.068148612976074, + "sft_loss": 4.113364219665527, + "step": 3785 + }, + { + "epoch": 2.028432848302392, + "grad_norm": 0.5041909432877334, + "learning_rate": 2.8669332746182177e-07, + "logits/chosen": -0.9019726514816284, + "logits/rejected": -0.7289915084838867, + "logps/chosen": -4.497717380523682, + "logps/rejected": -5.219419002532959, + "loss": 0.0522, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.497717380523682, + "rewards/margins": 0.7217018604278564, + "rewards/rejected": -5.219419002532959, + "sft_loss": 4.304531097412109, + "step": 3790 + }, + { + "epoch": 2.031108881083793, + "grad_norm": 0.39504962692275486, + "learning_rate": 2.8528579640315156e-07, + "logits/chosen": -0.7584611177444458, + "logits/rejected": -0.7723220586776733, + "logps/chosen": -4.636214733123779, + "logps/rejected": -5.066348075866699, + "loss": 0.0523, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.636214733123779, + "rewards/margins": 0.43013325333595276, + "rewards/rejected": -5.066348075866699, + "sft_loss": 4.299112796783447, + "step": 3795 + }, + { + "epoch": 2.033784913865195, + "grad_norm": 0.670166400454059, + "learning_rate": 2.8388034850262646e-07, + "logits/chosen": -0.7787135243415833, + "logits/rejected": -0.6269806623458862, + "logps/chosen": -4.497548580169678, + "logps/rejected": -5.134943962097168, + "loss": 0.0522, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.497548580169678, + "rewards/margins": 0.637395441532135, + "rewards/rejected": -5.134943962097168, + "sft_loss": 4.2226152420043945, + "step": 3800 + }, + { + "epoch": 2.0364609466465966, + "grad_norm": 0.41224442261042604, + "learning_rate": 2.824769973959079e-07, + "logits/chosen": -0.770029604434967, + "logits/rejected": -0.6335395574569702, + "logps/chosen": -4.504012584686279, + "logps/rejected": -5.04500150680542, + "loss": 0.0524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.504012584686279, + "rewards/margins": 0.5409889221191406, + "rewards/rejected": -5.04500150680542, + "sft_loss": 4.187629222869873, + "step": 3805 + }, + { + "epoch": 2.039136979427998, + "grad_norm": 0.44522007655542895, + "learning_rate": 2.81075756698315e-07, + "logits/chosen": -0.615141749382019, + "logits/rejected": -0.5446062088012695, + "logps/chosen": -4.580349922180176, + "logps/rejected": -5.290804862976074, + "loss": 0.0515, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.580349922180176, + "rewards/margins": 0.7104545831680298, + "rewards/rejected": -5.290804862976074, + "sft_loss": 4.226175785064697, + "step": 3810 + }, + { + "epoch": 2.0418130122093996, + "grad_norm": 0.36982501701742904, + "learning_rate": 2.7967664000469035e-07, + "logits/chosen": -0.8423372507095337, + "logits/rejected": -0.7574399709701538, + "logps/chosen": -4.433310031890869, + "logps/rejected": -5.036944389343262, + "loss": 0.0507, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.433310031890869, + "rewards/margins": 0.6036348342895508, + "rewards/rejected": -5.036944389343262, + "sft_loss": 4.126814842224121, + "step": 3815 + }, + { + "epoch": 2.0444890449908013, + "grad_norm": 0.4483686878186856, + "learning_rate": 2.7827966088927095e-07, + "logits/chosen": -0.8709455728530884, + "logits/rejected": -0.6395670175552368, + "logps/chosen": -4.461734294891357, + "logps/rejected": -5.337707042694092, + "loss": 0.051, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.461734294891357, + "rewards/margins": 0.8759723901748657, + "rewards/rejected": -5.337707042694092, + "sft_loss": 4.206548690795898, + "step": 3820 + }, + { + "epoch": 2.0471650777722026, + "grad_norm": 0.5525955846494557, + "learning_rate": 2.768848329055538e-07, + "logits/chosen": -0.7681508660316467, + "logits/rejected": -0.725921630859375, + "logps/chosen": -4.424405574798584, + "logps/rejected": -5.064883232116699, + "loss": 0.0523, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.424405574798584, + "rewards/margins": 0.6404775977134705, + "rewards/rejected": -5.064883232116699, + "sft_loss": 4.222466945648193, + "step": 3825 + }, + { + "epoch": 2.0498411105536043, + "grad_norm": 0.6304674537764908, + "learning_rate": 2.7549216958616657e-07, + "logits/chosen": -0.8746339678764343, + "logits/rejected": -0.734113872051239, + "logps/chosen": -4.404292106628418, + "logps/rejected": -5.094416618347168, + "loss": 0.0508, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.404292106628418, + "rewards/margins": 0.6901249885559082, + "rewards/rejected": -5.094416618347168, + "sft_loss": 4.037627220153809, + "step": 3830 + }, + { + "epoch": 2.052517143335006, + "grad_norm": 0.5009731987661138, + "learning_rate": 2.741016844427344e-07, + "logits/chosen": -0.7817445993423462, + "logits/rejected": -0.601385235786438, + "logps/chosen": -4.428588390350342, + "logps/rejected": -5.240962028503418, + "loss": 0.0516, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.428588390350342, + "rewards/margins": 0.8123737573623657, + "rewards/rejected": -5.240962028503418, + "sft_loss": 4.188836097717285, + "step": 3835 + }, + { + "epoch": 2.0551931761164073, + "grad_norm": 0.6299463597143712, + "learning_rate": 2.7271339096575073e-07, + "logits/chosen": -0.7218152284622192, + "logits/rejected": -0.577179491519928, + "logps/chosen": -4.575904846191406, + "logps/rejected": -5.296813011169434, + "loss": 0.0523, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.575904846191406, + "rewards/margins": 0.7209089994430542, + "rewards/rejected": -5.296813011169434, + "sft_loss": 4.2866058349609375, + "step": 3840 + }, + { + "epoch": 2.057869208897809, + "grad_norm": 0.4385588051996929, + "learning_rate": 2.713273026244446e-07, + "logits/chosen": -0.8641475439071655, + "logits/rejected": -0.6361261606216431, + "logps/chosen": -4.40885066986084, + "logps/rejected": -5.324732303619385, + "loss": 0.0502, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.40885066986084, + "rewards/margins": 0.9158821105957031, + "rewards/rejected": -5.324732303619385, + "sft_loss": 4.178675651550293, + "step": 3845 + }, + { + "epoch": 2.0605452416792107, + "grad_norm": 0.4168058120641021, + "learning_rate": 2.6994343286665156e-07, + "logits/chosen": -0.8316490054130554, + "logits/rejected": -0.619785726070404, + "logps/chosen": -4.314603328704834, + "logps/rejected": -5.067779064178467, + "loss": 0.0518, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.314603328704834, + "rewards/margins": 0.7531753778457642, + "rewards/rejected": -5.067779064178467, + "sft_loss": 4.156607627868652, + "step": 3850 + }, + { + "epoch": 2.063221274460612, + "grad_norm": 0.5075649961997, + "learning_rate": 2.6856179511868156e-07, + "logits/chosen": -0.7296597957611084, + "logits/rejected": -0.5145146250724792, + "logps/chosen": -4.354414463043213, + "logps/rejected": -5.239541530609131, + "loss": 0.0511, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.354414463043213, + "rewards/margins": 0.8851274251937866, + "rewards/rejected": -5.239541530609131, + "sft_loss": 4.124449253082275, + "step": 3855 + }, + { + "epoch": 2.0658973072420137, + "grad_norm": 1.1238836661784852, + "learning_rate": 2.6718240278519056e-07, + "logits/chosen": -0.6999574899673462, + "logits/rejected": -0.5747981071472168, + "logps/chosen": -4.261233329772949, + "logps/rejected": -5.282289981842041, + "loss": 0.0517, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.261233329772949, + "rewards/margins": 1.021056056022644, + "rewards/rejected": -5.282289981842041, + "sft_loss": 4.051290035247803, + "step": 3860 + }, + { + "epoch": 2.0685733400234154, + "grad_norm": 0.4627375151319719, + "learning_rate": 2.6580526924904866e-07, + "logits/chosen": -0.8693877458572388, + "logits/rejected": -0.7199397087097168, + "logps/chosen": -4.682175636291504, + "logps/rejected": -5.245555400848389, + "loss": 0.0524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.682175636291504, + "rewards/margins": 0.5633805990219116, + "rewards/rejected": -5.245555400848389, + "sft_loss": 4.376290321350098, + "step": 3865 + }, + { + "epoch": 2.0712493728048167, + "grad_norm": 0.5160298725641026, + "learning_rate": 2.6443040787121186e-07, + "logits/chosen": -0.9046756029129028, + "logits/rejected": -0.8257226943969727, + "logps/chosen": -4.422331809997559, + "logps/rejected": -5.142786979675293, + "loss": 0.051, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.422331809997559, + "rewards/margins": 0.720454752445221, + "rewards/rejected": -5.142786979675293, + "sft_loss": 4.169841289520264, + "step": 3870 + }, + { + "epoch": 2.0739254055862184, + "grad_norm": 0.5638296933110125, + "learning_rate": 2.6305783199059084e-07, + "logits/chosen": -0.7855437397956848, + "logits/rejected": -0.7157570123672485, + "logps/chosen": -4.402723789215088, + "logps/rejected": -4.955962181091309, + "loss": 0.053, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.402723789215088, + "rewards/margins": 0.5532382726669312, + "rewards/rejected": -4.955962181091309, + "sft_loss": 4.0784478187561035, + "step": 3875 + }, + { + "epoch": 2.07660143836762, + "grad_norm": 0.38165204325516433, + "learning_rate": 2.6168755492392324e-07, + "logits/chosen": -0.7893201112747192, + "logits/rejected": -0.637191653251648, + "logps/chosen": -4.291540145874023, + "logps/rejected": -5.171633720397949, + "loss": 0.0489, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.291540145874023, + "rewards/margins": 0.8800934553146362, + "rewards/rejected": -5.171633720397949, + "sft_loss": 3.9720988273620605, + "step": 3880 + }, + { + "epoch": 2.0792774711490214, + "grad_norm": 0.6583163664132554, + "learning_rate": 2.6031958996564274e-07, + "logits/chosen": -0.8037883043289185, + "logits/rejected": -0.7117749452590942, + "logps/chosen": -4.445651054382324, + "logps/rejected": -5.30559778213501, + "loss": 0.0525, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.445651054382324, + "rewards/margins": 0.859946608543396, + "rewards/rejected": -5.30559778213501, + "sft_loss": 4.167903900146484, + "step": 3885 + }, + { + "epoch": 2.081953503930423, + "grad_norm": 0.4715461336878274, + "learning_rate": 2.589539503877518e-07, + "logits/chosen": -0.7256428599357605, + "logits/rejected": -0.6271122097969055, + "logps/chosen": -4.560754776000977, + "logps/rejected": -5.249648094177246, + "loss": 0.0515, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.560754776000977, + "rewards/margins": 0.6888927817344666, + "rewards/rejected": -5.249648094177246, + "sft_loss": 4.257047176361084, + "step": 3890 + }, + { + "epoch": 2.084629536711825, + "grad_norm": 0.5647077173254447, + "learning_rate": 2.5759064943969125e-07, + "logits/chosen": -0.8433957099914551, + "logits/rejected": -0.6119260787963867, + "logps/chosen": -4.4959588050842285, + "logps/rejected": -5.253961086273193, + "loss": 0.0503, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.4959588050842285, + "rewards/margins": 0.7580021023750305, + "rewards/rejected": -5.253961086273193, + "sft_loss": 4.209029197692871, + "step": 3895 + }, + { + "epoch": 2.087305569493226, + "grad_norm": 0.3465141084373431, + "learning_rate": 2.562297003482131e-07, + "logits/chosen": -0.660112202167511, + "logits/rejected": -0.6809910535812378, + "logps/chosen": -4.294495582580566, + "logps/rejected": -5.017926216125488, + "loss": 0.0508, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.294495582580566, + "rewards/margins": 0.7234315872192383, + "rewards/rejected": -5.017926216125488, + "sft_loss": 4.066096782684326, + "step": 3900 + }, + { + "epoch": 2.089981602274628, + "grad_norm": 0.3979943305801828, + "learning_rate": 2.548711163172512e-07, + "logits/chosen": -0.700593888759613, + "logits/rejected": -0.64739990234375, + "logps/chosen": -4.466081142425537, + "logps/rejected": -4.965723514556885, + "loss": 0.0534, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.466081142425537, + "rewards/margins": 0.49964267015457153, + "rewards/rejected": -4.965723514556885, + "sft_loss": 4.193602561950684, + "step": 3905 + }, + { + "epoch": 2.0926576350560295, + "grad_norm": 0.42928400117151966, + "learning_rate": 2.53514910527794e-07, + "logits/chosen": -0.727473795413971, + "logits/rejected": -0.5996404886245728, + "logps/chosen": -4.518040180206299, + "logps/rejected": -5.068653583526611, + "loss": 0.0521, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.518040180206299, + "rewards/margins": 0.5506137609481812, + "rewards/rejected": -5.068653583526611, + "sft_loss": 4.262098789215088, + "step": 3910 + }, + { + "epoch": 2.095333667837431, + "grad_norm": 0.37355707424726675, + "learning_rate": 2.5216109613775573e-07, + "logits/chosen": -0.8052509427070618, + "logits/rejected": -0.6294256448745728, + "logps/chosen": -4.585465431213379, + "logps/rejected": -5.209853649139404, + "loss": 0.0529, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.585465431213379, + "rewards/margins": 0.6243882179260254, + "rewards/rejected": -5.209853649139404, + "sft_loss": 4.374361991882324, + "step": 3915 + }, + { + "epoch": 2.0980097006188325, + "grad_norm": 0.48572956394977185, + "learning_rate": 2.5080968628184993e-07, + "logits/chosen": -0.7453235387802124, + "logits/rejected": -0.5784560441970825, + "logps/chosen": -4.471060752868652, + "logps/rejected": -5.356838226318359, + "loss": 0.0507, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.471060752868652, + "rewards/margins": 0.8857781291007996, + "rewards/rejected": -5.356838226318359, + "sft_loss": 4.242326736450195, + "step": 3920 + }, + { + "epoch": 2.1006857334002342, + "grad_norm": 0.6413231575213167, + "learning_rate": 2.494606940714605e-07, + "logits/chosen": -0.694869875907898, + "logits/rejected": -0.641283392906189, + "logps/chosen": -4.33608865737915, + "logps/rejected": -5.122622489929199, + "loss": 0.0509, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.33608865737915, + "rewards/margins": 0.7865341305732727, + "rewards/rejected": -5.122622489929199, + "sft_loss": 4.0861897468566895, + "step": 3925 + }, + { + "epoch": 2.103361766181636, + "grad_norm": 0.6794536127536217, + "learning_rate": 2.4811413259451625e-07, + "logits/chosen": -0.8177544474601746, + "logits/rejected": -0.6390523910522461, + "logps/chosen": -4.208698272705078, + "logps/rejected": -4.995844841003418, + "loss": 0.051, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.208698272705078, + "rewards/margins": 0.7871468663215637, + "rewards/rejected": -4.995844841003418, + "sft_loss": 4.0283613204956055, + "step": 3930 + }, + { + "epoch": 2.106037798963037, + "grad_norm": 0.635316600850136, + "learning_rate": 2.46770014915362e-07, + "logits/chosen": -0.7075114846229553, + "logits/rejected": -0.6430591344833374, + "logps/chosen": -4.4350762367248535, + "logps/rejected": -5.117311000823975, + "loss": 0.053, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.4350762367248535, + "rewards/margins": 0.6822348833084106, + "rewards/rejected": -5.117311000823975, + "sft_loss": 4.175159454345703, + "step": 3935 + }, + { + "epoch": 2.108713831744439, + "grad_norm": 0.5542245200634366, + "learning_rate": 2.45428354074634e-07, + "logits/chosen": -0.7162960171699524, + "logits/rejected": -0.6563288569450378, + "logps/chosen": -4.496886730194092, + "logps/rejected": -5.226065635681152, + "loss": 0.0516, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.496886730194092, + "rewards/margins": 0.7291792631149292, + "rewards/rejected": -5.226065635681152, + "sft_loss": 4.221349716186523, + "step": 3940 + }, + { + "epoch": 2.1113898645258407, + "grad_norm": 0.6123482841719484, + "learning_rate": 2.4408916308913105e-07, + "logits/chosen": -0.7921692132949829, + "logits/rejected": -0.5897735357284546, + "logps/chosen": -4.634129524230957, + "logps/rejected": -5.069488525390625, + "loss": 0.0544, + "rewards/accuracies": 0.5625, + "rewards/chosen": -4.634129524230957, + "rewards/margins": 0.4353586733341217, + "rewards/rejected": -5.069488525390625, + "sft_loss": 4.371542930603027, + "step": 3945 + }, + { + "epoch": 2.114065897307242, + "grad_norm": 0.4166900257171104, + "learning_rate": 2.4275245495169025e-07, + "logits/chosen": -0.694656491279602, + "logits/rejected": -0.5563549399375916, + "logps/chosen": -4.529080867767334, + "logps/rejected": -5.242933750152588, + "loss": 0.0516, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.529080867767334, + "rewards/margins": 0.7138529419898987, + "rewards/rejected": -5.242933750152588, + "sft_loss": 4.234822750091553, + "step": 3950 + }, + { + "epoch": 2.1167419300886436, + "grad_norm": 0.4856713048710508, + "learning_rate": 2.414182426310597e-07, + "logits/chosen": -0.7959322333335876, + "logits/rejected": -0.769672691822052, + "logps/chosen": -4.510166168212891, + "logps/rejected": -5.311366081237793, + "loss": 0.0515, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.510166168212891, + "rewards/margins": 0.801199734210968, + "rewards/rejected": -5.311366081237793, + "sft_loss": 4.296000003814697, + "step": 3955 + }, + { + "epoch": 2.1194179628700454, + "grad_norm": 0.6243875236957093, + "learning_rate": 2.400865390717734e-07, + "logits/chosen": -0.7460509538650513, + "logits/rejected": -0.6690871119499207, + "logps/chosen": -4.389871120452881, + "logps/rejected": -5.315016746520996, + "loss": 0.0516, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.389871120452881, + "rewards/margins": 0.9251459240913391, + "rewards/rejected": -5.315016746520996, + "sft_loss": 4.183564186096191, + "step": 3960 + }, + { + "epoch": 2.1220939956514466, + "grad_norm": 0.5658353195120891, + "learning_rate": 2.3875735719402475e-07, + "logits/chosen": -0.7674288749694824, + "logits/rejected": -0.634276807308197, + "logps/chosen": -4.3222336769104, + "logps/rejected": -5.103603839874268, + "loss": 0.0507, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.3222336769104, + "rewards/margins": 0.7813698649406433, + "rewards/rejected": -5.103603839874268, + "sft_loss": 4.120206356048584, + "step": 3965 + }, + { + "epoch": 2.1247700284328483, + "grad_norm": 0.5640306652342747, + "learning_rate": 2.3743070989354258e-07, + "logits/chosen": -0.7015902996063232, + "logits/rejected": -0.6395735740661621, + "logps/chosen": -4.416680335998535, + "logps/rejected": -5.131161689758301, + "loss": 0.0512, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.416680335998535, + "rewards/margins": 0.7144818305969238, + "rewards/rejected": -5.131161689758301, + "sft_loss": 4.159177780151367, + "step": 3970 + }, + { + "epoch": 2.12744606121425, + "grad_norm": 0.8818729398968036, + "learning_rate": 2.3610661004146454e-07, + "logits/chosen": -0.6776810884475708, + "logits/rejected": -0.5723994374275208, + "logps/chosen": -4.257053375244141, + "logps/rejected": -4.910529136657715, + "loss": 0.0505, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.257053375244141, + "rewards/margins": 0.6534755229949951, + "rewards/rejected": -4.910529136657715, + "sft_loss": 3.978513240814209, + "step": 3975 + }, + { + "epoch": 2.1301220939956513, + "grad_norm": 0.42266618942146955, + "learning_rate": 2.3478507048421314e-07, + "logits/chosen": -0.7549012899398804, + "logits/rejected": -0.6445156335830688, + "logps/chosen": -4.521634578704834, + "logps/rejected": -5.223923683166504, + "loss": 0.0515, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.521634578704834, + "rewards/margins": 0.7022888660430908, + "rewards/rejected": -5.223923683166504, + "sft_loss": 4.206809997558594, + "step": 3980 + }, + { + "epoch": 2.132798126777053, + "grad_norm": 0.46043866853182125, + "learning_rate": 2.334661040433713e-07, + "logits/chosen": -0.8515769243240356, + "logits/rejected": -0.7206335663795471, + "logps/chosen": -4.5367536544799805, + "logps/rejected": -5.210167407989502, + "loss": 0.0517, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.5367536544799805, + "rewards/margins": 0.6734140515327454, + "rewards/rejected": -5.210167407989502, + "sft_loss": 4.247954368591309, + "step": 3985 + }, + { + "epoch": 2.1354741595584548, + "grad_norm": 0.5358993416695645, + "learning_rate": 2.321497235155568e-07, + "logits/chosen": -0.8475052118301392, + "logits/rejected": -0.688243567943573, + "logps/chosen": -4.262228965759277, + "logps/rejected": -5.043619632720947, + "loss": 0.0517, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.262228965759277, + "rewards/margins": 0.7813904881477356, + "rewards/rejected": -5.043619632720947, + "sft_loss": 4.01896858215332, + "step": 3990 + }, + { + "epoch": 2.138150192339856, + "grad_norm": 0.4150651391308651, + "learning_rate": 2.3083594167229965e-07, + "logits/chosen": -0.8656677007675171, + "logits/rejected": -0.5582382678985596, + "logps/chosen": -4.472632884979248, + "logps/rejected": -5.255242347717285, + "loss": 0.0509, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.472632884979248, + "rewards/margins": 0.7826098203659058, + "rewards/rejected": -5.255242347717285, + "sft_loss": 4.183423042297363, + "step": 3995 + }, + { + "epoch": 2.1408262251212578, + "grad_norm": 0.4563166112067603, + "learning_rate": 2.295247712599167e-07, + "logits/chosen": -0.7128655910491943, + "logits/rejected": -0.6650645136833191, + "logps/chosen": -4.469484806060791, + "logps/rejected": -5.218777179718018, + "loss": 0.0504, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.469484806060791, + "rewards/margins": 0.7492923736572266, + "rewards/rejected": -5.218777179718018, + "sft_loss": 4.08309268951416, + "step": 4000 + }, + { + "epoch": 2.1408262251212578, + "eval_logits/chosen": -0.2918679714202881, + "eval_logits/rejected": -0.21033975481987, + "eval_logps/chosen": -4.464621543884277, + "eval_logps/rejected": -5.165764808654785, + "eval_loss": 0.050455424934625626, + "eval_rewards/accuracies": 0.6839762330055237, + "eval_rewards/chosen": -4.464621543884277, + "eval_rewards/margins": 0.7011440396308899, + "eval_rewards/rejected": -5.165764808654785, + "eval_runtime": 43.4608, + "eval_samples_per_second": 30.947, + "eval_sft_loss": 4.058549404144287, + "eval_steps_per_second": 7.754, + "step": 4000 + }, + { + "epoch": 2.1435022579026595, + "grad_norm": 0.6727805412244077, + "learning_rate": 2.2821622499938948e-07, + "logits/chosen": -0.7068689465522766, + "logits/rejected": -0.47361889481544495, + "logps/chosen": -4.4505085945129395, + "logps/rejected": -5.127585411071777, + "loss": 0.0508, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.4505085945129395, + "rewards/margins": 0.6770769357681274, + "rewards/rejected": -5.127585411071777, + "sft_loss": 4.1432719230651855, + "step": 4005 + }, + { + "epoch": 2.1461782906840607, + "grad_norm": 0.32224127814306514, + "learning_rate": 2.269103155862391e-07, + "logits/chosen": -0.8249640464782715, + "logits/rejected": -0.7345031499862671, + "logps/chosen": -4.459612846374512, + "logps/rejected": -5.158649444580078, + "loss": 0.05, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.459612846374512, + "rewards/margins": 0.6990362405776978, + "rewards/rejected": -5.158649444580078, + "sft_loss": 4.074860095977783, + "step": 4010 + }, + { + "epoch": 2.1488543234654625, + "grad_norm": 0.5506638370672858, + "learning_rate": 2.2560705569040483e-07, + "logits/chosen": -0.7756333351135254, + "logits/rejected": -0.47715824842453003, + "logps/chosen": -4.469395637512207, + "logps/rejected": -5.134026527404785, + "loss": 0.0527, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.469395637512207, + "rewards/margins": 0.6646314859390259, + "rewards/rejected": -5.134026527404785, + "sft_loss": 4.218961238861084, + "step": 4015 + }, + { + "epoch": 2.151530356246864, + "grad_norm": 0.3443569983012991, + "learning_rate": 2.2430645795611963e-07, + "logits/chosen": -0.8699603080749512, + "logits/rejected": -0.7662721276283264, + "logps/chosen": -4.326430320739746, + "logps/rejected": -5.181424140930176, + "loss": 0.0499, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.326430320739746, + "rewards/margins": 0.854993999004364, + "rewards/rejected": -5.181424140930176, + "sft_loss": 4.081225395202637, + "step": 4020 + }, + { + "epoch": 2.1542063890282654, + "grad_norm": 0.46911133108933106, + "learning_rate": 2.230085350017884e-07, + "logits/chosen": -0.7873638868331909, + "logits/rejected": -0.6871055364608765, + "logps/chosen": -4.547414302825928, + "logps/rejected": -5.233478546142578, + "loss": 0.0521, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.547414302825928, + "rewards/margins": 0.6860648393630981, + "rewards/rejected": -5.233478546142578, + "sft_loss": 4.271318435668945, + "step": 4025 + }, + { + "epoch": 2.156882421809667, + "grad_norm": 0.3329126350332984, + "learning_rate": 2.2171329941986554e-07, + "logits/chosen": -0.805925726890564, + "logits/rejected": -0.7726612091064453, + "logps/chosen": -4.549063682556152, + "logps/rejected": -5.230555057525635, + "loss": 0.0496, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.549063682556152, + "rewards/margins": 0.6814913153648376, + "rewards/rejected": -5.230555057525635, + "sft_loss": 4.1083083152771, + "step": 4030 + }, + { + "epoch": 2.159558454591069, + "grad_norm": 0.35451698745934257, + "learning_rate": 2.2042076377673202e-07, + "logits/chosen": -0.7483960390090942, + "logits/rejected": -0.8004158139228821, + "logps/chosen": -4.400989055633545, + "logps/rejected": -4.926598072052002, + "loss": 0.052, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.400989055633545, + "rewards/margins": 0.5256087183952332, + "rewards/rejected": -4.926598072052002, + "sft_loss": 4.080153465270996, + "step": 4035 + }, + { + "epoch": 2.16223448737247, + "grad_norm": 0.3835866808895003, + "learning_rate": 2.1913094061257476e-07, + "logits/chosen": -0.7398586869239807, + "logits/rejected": -0.7862161993980408, + "logps/chosen": -4.51585054397583, + "logps/rejected": -5.192282676696777, + "loss": 0.0509, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.51585054397583, + "rewards/margins": 0.6764318346977234, + "rewards/rejected": -5.192282676696777, + "sft_loss": 4.22283935546875, + "step": 4040 + }, + { + "epoch": 2.164910520153872, + "grad_norm": 0.5104943845035439, + "learning_rate": 2.178438424412633e-07, + "logits/chosen": -0.7213504314422607, + "logits/rejected": -0.576121985912323, + "logps/chosen": -4.430332183837891, + "logps/rejected": -5.1058149337768555, + "loss": 0.0516, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.430332183837891, + "rewards/margins": 0.6754826307296753, + "rewards/rejected": -5.1058149337768555, + "sft_loss": 4.154946804046631, + "step": 4045 + }, + { + "epoch": 2.1675865529352736, + "grad_norm": 0.4674501996338367, + "learning_rate": 2.165594817502302e-07, + "logits/chosen": -0.8302208781242371, + "logits/rejected": -0.6873995065689087, + "logps/chosen": -4.677610397338867, + "logps/rejected": -5.197269916534424, + "loss": 0.0549, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.677610397338867, + "rewards/margins": 0.5196588635444641, + "rewards/rejected": -5.197269916534424, + "sft_loss": 4.4770989418029785, + "step": 4050 + }, + { + "epoch": 2.170262585716675, + "grad_norm": 0.7933969079589734, + "learning_rate": 2.1527787100034806e-07, + "logits/chosen": -0.646473228931427, + "logits/rejected": -0.5958948135375977, + "logps/chosen": -4.4888916015625, + "logps/rejected": -5.003674030303955, + "loss": 0.0522, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.4888916015625, + "rewards/margins": 0.5147822499275208, + "rewards/rejected": -5.003674030303955, + "sft_loss": 4.261991024017334, + "step": 4055 + }, + { + "epoch": 2.1729386184980766, + "grad_norm": 0.4931963118357809, + "learning_rate": 2.1399902262581037e-07, + "logits/chosen": -0.7125464677810669, + "logits/rejected": -0.5157756209373474, + "logps/chosen": -4.63405704498291, + "logps/rejected": -5.100339412689209, + "loss": 0.0534, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.63405704498291, + "rewards/margins": 0.4662818908691406, + "rewards/rejected": -5.100339412689209, + "sft_loss": 4.32558012008667, + "step": 4060 + }, + { + "epoch": 2.1756146512794783, + "grad_norm": 0.4731138425152067, + "learning_rate": 2.127229490340094e-07, + "logits/chosen": -0.8338751792907715, + "logits/rejected": -0.767744243144989, + "logps/chosen": -4.563037872314453, + "logps/rejected": -5.240099906921387, + "loss": 0.0512, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.563037872314453, + "rewards/margins": 0.6770623922348022, + "rewards/rejected": -5.240099906921387, + "sft_loss": 4.253331661224365, + "step": 4065 + }, + { + "epoch": 2.1782906840608796, + "grad_norm": 0.6017968419842186, + "learning_rate": 2.1144966260541698e-07, + "logits/chosen": -0.7297824025154114, + "logits/rejected": -0.48667675256729126, + "logps/chosen": -4.502832412719727, + "logps/rejected": -5.1569108963012695, + "loss": 0.0516, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.502832412719727, + "rewards/margins": 0.6540783047676086, + "rewards/rejected": -5.1569108963012695, + "sft_loss": 4.205613613128662, + "step": 4070 + }, + { + "epoch": 2.1809667168422813, + "grad_norm": 0.605056601130136, + "learning_rate": 2.1017917569346332e-07, + "logits/chosen": -0.7562541961669922, + "logits/rejected": -0.5662790536880493, + "logps/chosen": -4.355158805847168, + "logps/rejected": -5.044077396392822, + "loss": 0.0498, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.355158805847168, + "rewards/margins": 0.6889182329177856, + "rewards/rejected": -5.044077396392822, + "sft_loss": 4.024393081665039, + "step": 4075 + }, + { + "epoch": 2.183642749623683, + "grad_norm": 0.4321712990803465, + "learning_rate": 2.0891150062441837e-07, + "logits/chosen": -0.7883024215698242, + "logits/rejected": -0.6575873494148254, + "logps/chosen": -4.471930503845215, + "logps/rejected": -5.290343284606934, + "loss": 0.0507, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.471930503845215, + "rewards/margins": 0.8184129595756531, + "rewards/rejected": -5.290343284606934, + "sft_loss": 4.168588161468506, + "step": 4080 + }, + { + "epoch": 2.1863187824050843, + "grad_norm": 0.5782685093734775, + "learning_rate": 2.0764664969727086e-07, + "logits/chosen": -0.6935213804244995, + "logits/rejected": -0.6597651243209839, + "logps/chosen": -4.432676792144775, + "logps/rejected": -5.141415596008301, + "loss": 0.0518, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.432676792144775, + "rewards/margins": 0.708739161491394, + "rewards/rejected": -5.141415596008301, + "sft_loss": 4.131624221801758, + "step": 4085 + }, + { + "epoch": 2.188994815186486, + "grad_norm": 0.4630350649187951, + "learning_rate": 2.0638463518361033e-07, + "logits/chosen": -0.8443444967269897, + "logits/rejected": -0.6547593474388123, + "logps/chosen": -4.442532539367676, + "logps/rejected": -5.188077449798584, + "loss": 0.0509, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.442532539367676, + "rewards/margins": 0.7455454468727112, + "rewards/rejected": -5.188077449798584, + "sft_loss": 4.115714073181152, + "step": 4090 + }, + { + "epoch": 2.1916708479678877, + "grad_norm": 0.38394800610862523, + "learning_rate": 2.0512546932750702e-07, + "logits/chosen": -0.8786581158638, + "logits/rejected": -0.8057855367660522, + "logps/chosen": -4.48517370223999, + "logps/rejected": -5.181807041168213, + "loss": 0.0506, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.48517370223999, + "rewards/margins": 0.6966326236724854, + "rewards/rejected": -5.181807041168213, + "sft_loss": 4.173416614532471, + "step": 4095 + }, + { + "epoch": 2.194346880749289, + "grad_norm": 0.6202912991395683, + "learning_rate": 2.0386916434539343e-07, + "logits/chosen": -0.7296054363250732, + "logits/rejected": -0.5482798218727112, + "logps/chosen": -4.548017501831055, + "logps/rejected": -5.236302375793457, + "loss": 0.0511, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.548017501831055, + "rewards/margins": 0.6882843971252441, + "rewards/rejected": -5.236302375793457, + "sft_loss": 4.268096446990967, + "step": 4100 + }, + { + "epoch": 2.1970229135306907, + "grad_norm": 0.3488816624853031, + "learning_rate": 2.0261573242594627e-07, + "logits/chosen": -0.795524001121521, + "logits/rejected": -0.5811234712600708, + "logps/chosen": -4.477652072906494, + "logps/rejected": -5.104580879211426, + "loss": 0.051, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.477652072906494, + "rewards/margins": 0.6269285678863525, + "rewards/rejected": -5.104580879211426, + "sft_loss": 4.149083137512207, + "step": 4105 + }, + { + "epoch": 2.1996989463120924, + "grad_norm": 0.6510997768380627, + "learning_rate": 2.0136518572996724e-07, + "logits/chosen": -0.7480728030204773, + "logits/rejected": -0.5544711351394653, + "logps/chosen": -4.376706600189209, + "logps/rejected": -5.170914173126221, + "loss": 0.0506, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.376706600189209, + "rewards/margins": 0.7942078709602356, + "rewards/rejected": -5.170914173126221, + "sft_loss": 4.084542274475098, + "step": 4110 + }, + { + "epoch": 2.202374979093494, + "grad_norm": 0.7380670951006642, + "learning_rate": 2.0011753639026617e-07, + "logits/chosen": -0.722009003162384, + "logits/rejected": -0.6640048027038574, + "logps/chosen": -4.3249921798706055, + "logps/rejected": -5.086312294006348, + "loss": 0.0513, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.3249921798706055, + "rewards/margins": 0.7613199949264526, + "rewards/rejected": -5.086312294006348, + "sft_loss": 4.073752403259277, + "step": 4115 + }, + { + "epoch": 2.2050510118748954, + "grad_norm": 0.6413941733357926, + "learning_rate": 1.988727965115421e-07, + "logits/chosen": -0.7592009902000427, + "logits/rejected": -0.6666856408119202, + "logps/chosen": -4.346851348876953, + "logps/rejected": -5.01780366897583, + "loss": 0.0516, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.346851348876953, + "rewards/margins": 0.6709519028663635, + "rewards/rejected": -5.01780366897583, + "sft_loss": 4.029229640960693, + "step": 4120 + }, + { + "epoch": 2.207727044656297, + "grad_norm": 0.513048432496639, + "learning_rate": 1.9763097817026713e-07, + "logits/chosen": -0.8778330087661743, + "logits/rejected": -0.6732766032218933, + "logps/chosen": -4.389376640319824, + "logps/rejected": -5.263741493225098, + "loss": 0.0501, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.389376640319824, + "rewards/margins": 0.8743650317192078, + "rewards/rejected": -5.263741493225098, + "sft_loss": 4.129411697387695, + "step": 4125 + }, + { + "epoch": 2.210403077437699, + "grad_norm": 0.6110412211607691, + "learning_rate": 1.9639209341456796e-07, + "logits/chosen": -0.713258683681488, + "logits/rejected": -0.632881760597229, + "logps/chosen": -4.497132778167725, + "logps/rejected": -5.232693195343018, + "loss": 0.0516, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.497132778167725, + "rewards/margins": 0.7355600595474243, + "rewards/rejected": -5.232693195343018, + "sft_loss": 4.2595624923706055, + "step": 4130 + }, + { + "epoch": 2.2130791102191, + "grad_norm": 0.6504637140199618, + "learning_rate": 1.951561542641102e-07, + "logits/chosen": -0.6895782947540283, + "logits/rejected": -0.7371370792388916, + "logps/chosen": -4.693241119384766, + "logps/rejected": -5.284863471984863, + "loss": 0.0533, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.693241119384766, + "rewards/margins": 0.5916224718093872, + "rewards/rejected": -5.284863471984863, + "sft_loss": 4.434107780456543, + "step": 4135 + }, + { + "epoch": 2.215755143000502, + "grad_norm": 0.4543352377658658, + "learning_rate": 1.939231727099806e-07, + "logits/chosen": -0.9554224014282227, + "logits/rejected": -0.9246322512626648, + "logps/chosen": -4.4521613121032715, + "logps/rejected": -5.085289001464844, + "loss": 0.0527, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.4521613121032715, + "rewards/margins": 0.6331278085708618, + "rewards/rejected": -5.085289001464844, + "sft_loss": 4.227784633636475, + "step": 4140 + }, + { + "epoch": 2.2184311757819035, + "grad_norm": 0.4284689952652344, + "learning_rate": 1.926931607145719e-07, + "logits/chosen": -0.6809648871421814, + "logits/rejected": -0.5421003103256226, + "logps/chosen": -4.518945693969727, + "logps/rejected": -5.160582065582275, + "loss": 0.0519, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.518945693969727, + "rewards/margins": 0.6416367292404175, + "rewards/rejected": -5.160582065582275, + "sft_loss": 4.263835906982422, + "step": 4145 + }, + { + "epoch": 2.221107208563305, + "grad_norm": 0.43424034162181147, + "learning_rate": 1.9146613021146564e-07, + "logits/chosen": -0.7884284257888794, + "logits/rejected": -0.680195152759552, + "logps/chosen": -4.412850379943848, + "logps/rejected": -5.129262924194336, + "loss": 0.0519, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.412850379943848, + "rewards/margins": 0.7164131999015808, + "rewards/rejected": -5.129262924194336, + "sft_loss": 4.188337326049805, + "step": 4150 + }, + { + "epoch": 2.2237832413447065, + "grad_norm": 0.4962627859609203, + "learning_rate": 1.9024209310531736e-07, + "logits/chosen": -0.7695158123970032, + "logits/rejected": -0.7895294427871704, + "logps/chosen": -4.313672065734863, + "logps/rejected": -4.9209136962890625, + "loss": 0.0509, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.313672065734863, + "rewards/margins": 0.6072418689727783, + "rewards/rejected": -4.9209136962890625, + "sft_loss": 3.9769134521484375, + "step": 4155 + }, + { + "epoch": 2.2264592741261082, + "grad_norm": 0.481643698172507, + "learning_rate": 1.890210612717401e-07, + "logits/chosen": -0.7807799577713013, + "logits/rejected": -0.6693819165229797, + "logps/chosen": -4.398434638977051, + "logps/rejected": -5.057561874389648, + "loss": 0.0518, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.398434638977051, + "rewards/margins": 0.6591275334358215, + "rewards/rejected": -5.057561874389648, + "sft_loss": 4.108274936676025, + "step": 4160 + }, + { + "epoch": 2.2291353069075095, + "grad_norm": 0.7413910120393763, + "learning_rate": 1.8780304655719054e-07, + "logits/chosen": -0.8045433163642883, + "logits/rejected": -0.6661940813064575, + "logps/chosen": -4.460057258605957, + "logps/rejected": -5.185713291168213, + "loss": 0.0513, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.460057258605957, + "rewards/margins": 0.725655734539032, + "rewards/rejected": -5.185713291168213, + "sft_loss": 4.1655497550964355, + "step": 4165 + }, + { + "epoch": 2.231811339688911, + "grad_norm": 0.4732959746390582, + "learning_rate": 1.865880607788523e-07, + "logits/chosen": -0.6616698503494263, + "logits/rejected": -0.5911990404129028, + "logps/chosen": -4.5712995529174805, + "logps/rejected": -5.2418389320373535, + "loss": 0.0523, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.5712995529174805, + "rewards/margins": 0.6705387234687805, + "rewards/rejected": -5.2418389320373535, + "sft_loss": 4.364117622375488, + "step": 4170 + }, + { + "epoch": 2.234487372470313, + "grad_norm": 0.8094323886363942, + "learning_rate": 1.8537611572452316e-07, + "logits/chosen": -0.794582724571228, + "logits/rejected": -0.7341277599334717, + "logps/chosen": -4.37639856338501, + "logps/rejected": -5.012908935546875, + "loss": 0.0517, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.37639856338501, + "rewards/margins": 0.6365109086036682, + "rewards/rejected": -5.012908935546875, + "sft_loss": 4.13184118270874, + "step": 4175 + }, + { + "epoch": 2.237163405251714, + "grad_norm": 0.4541511078659696, + "learning_rate": 1.84167223152499e-07, + "logits/chosen": -0.7890401482582092, + "logits/rejected": -0.5393490791320801, + "logps/chosen": -4.449801921844482, + "logps/rejected": -5.224527835845947, + "loss": 0.0524, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.449801921844482, + "rewards/margins": 0.7747262120246887, + "rewards/rejected": -5.224527835845947, + "sft_loss": 4.180199146270752, + "step": 4180 + }, + { + "epoch": 2.239839438033116, + "grad_norm": 0.48971091863328464, + "learning_rate": 1.8296139479146112e-07, + "logits/chosen": -0.733080267906189, + "logits/rejected": -0.7528313398361206, + "logps/chosen": -4.379557132720947, + "logps/rejected": -4.981081008911133, + "loss": 0.0507, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.379557132720947, + "rewards/margins": 0.6015235781669617, + "rewards/rejected": -4.981081008911133, + "sft_loss": 4.055811405181885, + "step": 4185 + }, + { + "epoch": 2.2425154708145176, + "grad_norm": 0.4429743125528724, + "learning_rate": 1.8175864234036132e-07, + "logits/chosen": -0.5849838852882385, + "logits/rejected": -0.5171209573745728, + "logps/chosen": -4.486813545227051, + "logps/rejected": -5.217012405395508, + "loss": 0.0529, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.486813545227051, + "rewards/margins": 0.7301994562149048, + "rewards/rejected": -5.217012405395508, + "sft_loss": 4.215363502502441, + "step": 4190 + }, + { + "epoch": 2.245191503595919, + "grad_norm": 0.3529342606467267, + "learning_rate": 1.805589774683094e-07, + "logits/chosen": -0.8781973719596863, + "logits/rejected": -0.7594443559646606, + "logps/chosen": -4.630221366882324, + "logps/rejected": -5.19809627532959, + "loss": 0.0514, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.630221366882324, + "rewards/margins": 0.5678743124008179, + "rewards/rejected": -5.19809627532959, + "sft_loss": 4.259702205657959, + "step": 4195 + }, + { + "epoch": 2.2478675363773206, + "grad_norm": 0.7539970445961938, + "learning_rate": 1.79362411814459e-07, + "logits/chosen": -0.5950525999069214, + "logits/rejected": -0.6381107568740845, + "logps/chosen": -4.614771842956543, + "logps/rejected": -5.1353583335876465, + "loss": 0.0539, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.614771842956543, + "rewards/margins": 0.5205863118171692, + "rewards/rejected": -5.1353583335876465, + "sft_loss": 4.3355183601379395, + "step": 4200 + }, + { + "epoch": 2.2505435691587223, + "grad_norm": 0.47071934411282107, + "learning_rate": 1.7816895698789552e-07, + "logits/chosen": -0.8427804112434387, + "logits/rejected": -0.7817627191543579, + "logps/chosen": -4.487260341644287, + "logps/rejected": -5.036301612854004, + "loss": 0.0517, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.487260341644287, + "rewards/margins": 0.5490409135818481, + "rewards/rejected": -5.036301612854004, + "sft_loss": 4.125479698181152, + "step": 4205 + }, + { + "epoch": 2.2532196019401236, + "grad_norm": 0.5406217374875316, + "learning_rate": 1.7697862456752271e-07, + "logits/chosen": -0.8153530955314636, + "logits/rejected": -0.690669059753418, + "logps/chosen": -4.521656036376953, + "logps/rejected": -5.366150856018066, + "loss": 0.0511, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.521656036376953, + "rewards/margins": 0.8444948196411133, + "rewards/rejected": -5.366150856018066, + "sft_loss": 4.276778221130371, + "step": 4210 + }, + { + "epoch": 2.2558956347215253, + "grad_norm": 0.5810986187677788, + "learning_rate": 1.7579142610195124e-07, + "logits/chosen": -0.7868109941482544, + "logits/rejected": -0.6204288601875305, + "logps/chosen": -4.511598587036133, + "logps/rejected": -5.312621116638184, + "loss": 0.0517, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.511598587036133, + "rewards/margins": 0.8010231256484985, + "rewards/rejected": -5.312621116638184, + "sft_loss": 4.231732368469238, + "step": 4215 + }, + { + "epoch": 2.258571667502927, + "grad_norm": 0.3654874885769355, + "learning_rate": 1.7460737310938568e-07, + "logits/chosen": -0.8439321517944336, + "logits/rejected": -0.6458622217178345, + "logps/chosen": -4.324358940124512, + "logps/rejected": -5.175654411315918, + "loss": 0.05, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.324358940124512, + "rewards/margins": 0.8512958288192749, + "rewards/rejected": -5.175654411315918, + "sft_loss": 4.1335344314575195, + "step": 4220 + }, + { + "epoch": 2.2612477002843283, + "grad_norm": 0.3690452777813944, + "learning_rate": 1.734264770775133e-07, + "logits/chosen": -0.8422131538391113, + "logits/rejected": -0.5545368790626526, + "logps/chosen": -4.361656665802002, + "logps/rejected": -5.022474765777588, + "loss": 0.0523, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.361656665802002, + "rewards/margins": 0.660818338394165, + "rewards/rejected": -5.022474765777588, + "sft_loss": 4.097499847412109, + "step": 4225 + }, + { + "epoch": 2.26392373306573, + "grad_norm": 0.40414939716552284, + "learning_rate": 1.7224874946339241e-07, + "logits/chosen": -0.799502968788147, + "logits/rejected": -0.7204964756965637, + "logps/chosen": -4.439566612243652, + "logps/rejected": -5.170541763305664, + "loss": 0.0505, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.439566612243652, + "rewards/margins": 0.7309752106666565, + "rewards/rejected": -5.170541763305664, + "sft_loss": 4.066781044006348, + "step": 4230 + }, + { + "epoch": 2.2665997658471317, + "grad_norm": 0.5952940841735112, + "learning_rate": 1.7107420169334186e-07, + "logits/chosen": -0.7833027243614197, + "logits/rejected": -0.7465739846229553, + "logps/chosen": -4.582724094390869, + "logps/rejected": -5.141078472137451, + "loss": 0.0518, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.582724094390869, + "rewards/margins": 0.5583544969558716, + "rewards/rejected": -5.141078472137451, + "sft_loss": 4.265813827514648, + "step": 4235 + }, + { + "epoch": 2.269275798628533, + "grad_norm": 0.4398041156053364, + "learning_rate": 1.6990284516282893e-07, + "logits/chosen": -0.7631226778030396, + "logits/rejected": -0.6925583481788635, + "logps/chosen": -4.404193878173828, + "logps/rejected": -5.143115520477295, + "loss": 0.05, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.404193878173828, + "rewards/margins": 0.7389219403266907, + "rewards/rejected": -5.143115520477295, + "sft_loss": 4.091065883636475, + "step": 4240 + }, + { + "epoch": 2.2719518314099347, + "grad_norm": 0.5284040359104598, + "learning_rate": 1.687346912363602e-07, + "logits/chosen": -0.8135977983474731, + "logits/rejected": -0.6727418303489685, + "logps/chosen": -4.361421585083008, + "logps/rejected": -5.143485069274902, + "loss": 0.0502, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.361421585083008, + "rewards/margins": 0.7820636630058289, + "rewards/rejected": -5.143485069274902, + "sft_loss": 4.039944648742676, + "step": 4245 + }, + { + "epoch": 2.2746278641913364, + "grad_norm": 0.427930060724418, + "learning_rate": 1.675697512473697e-07, + "logits/chosen": -0.7885524034500122, + "logits/rejected": -0.5956434011459351, + "logps/chosen": -4.521124362945557, + "logps/rejected": -5.362552642822266, + "loss": 0.0496, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.521124362945557, + "rewards/margins": 0.8414284586906433, + "rewards/rejected": -5.362552642822266, + "sft_loss": 4.1074538230896, + "step": 4250 + }, + { + "epoch": 2.2773038969727377, + "grad_norm": 0.4548490650516944, + "learning_rate": 1.6640803649811087e-07, + "logits/chosen": -0.806907057762146, + "logits/rejected": -0.5140247344970703, + "logps/chosen": -4.435779571533203, + "logps/rejected": -5.372751712799072, + "loss": 0.0502, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.435779571533203, + "rewards/margins": 0.9369718432426453, + "rewards/rejected": -5.372751712799072, + "sft_loss": 4.071789741516113, + "step": 4255 + }, + { + "epoch": 2.2799799297541394, + "grad_norm": 0.433676020311226, + "learning_rate": 1.6524955825954472e-07, + "logits/chosen": -0.7338653802871704, + "logits/rejected": -0.675475001335144, + "logps/chosen": -4.471889019012451, + "logps/rejected": -5.032140254974365, + "loss": 0.0531, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -4.471889019012451, + "rewards/margins": 0.5602517127990723, + "rewards/rejected": -5.032140254974365, + "sft_loss": 4.22143030166626, + "step": 4260 + }, + { + "epoch": 2.282655962535541, + "grad_norm": 0.44709679685057646, + "learning_rate": 1.6409432777123277e-07, + "logits/chosen": -0.8582879304885864, + "logits/rejected": -0.7037560939788818, + "logps/chosen": -4.3130292892456055, + "logps/rejected": -5.295225143432617, + "loss": 0.049, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.3130292892456055, + "rewards/margins": 0.9821959733963013, + "rewards/rejected": -5.295225143432617, + "sft_loss": 4.0433030128479, + "step": 4265 + }, + { + "epoch": 2.285331995316943, + "grad_norm": 0.6454731179033166, + "learning_rate": 1.6294235624122577e-07, + "logits/chosen": -0.6963884234428406, + "logits/rejected": -0.4233129620552063, + "logps/chosen": -4.356287956237793, + "logps/rejected": -5.090372085571289, + "loss": 0.0517, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.356287956237793, + "rewards/margins": 0.7340839505195618, + "rewards/rejected": -5.090372085571289, + "sft_loss": 4.082320213317871, + "step": 4270 + }, + { + "epoch": 2.288008028098344, + "grad_norm": 0.4092375456675233, + "learning_rate": 1.6179365484595697e-07, + "logits/chosen": -0.7554842233657837, + "logits/rejected": -0.6546878218650818, + "logps/chosen": -4.4962849617004395, + "logps/rejected": -5.201930999755859, + "loss": 0.052, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.4962849617004395, + "rewards/margins": 0.7056463360786438, + "rewards/rejected": -5.201930999755859, + "sft_loss": 4.25368595123291, + "step": 4275 + }, + { + "epoch": 2.290684060879746, + "grad_norm": 0.7796030085865294, + "learning_rate": 1.60648234730132e-07, + "logits/chosen": -0.8003331422805786, + "logits/rejected": -0.7303368449211121, + "logps/chosen": -4.419107913970947, + "logps/rejected": -5.249577522277832, + "loss": 0.05, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.419107913970947, + "rewards/margins": 0.8304697275161743, + "rewards/rejected": -5.249577522277832, + "sft_loss": 4.124005317687988, + "step": 4280 + }, + { + "epoch": 2.293360093661147, + "grad_norm": 0.5726370771240188, + "learning_rate": 1.595061070066222e-07, + "logits/chosen": -0.6955457925796509, + "logits/rejected": -0.7629978060722351, + "logps/chosen": -4.541815757751465, + "logps/rejected": -5.194106101989746, + "loss": 0.0516, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.541815757751465, + "rewards/margins": 0.6522905826568604, + "rewards/rejected": -5.194106101989746, + "sft_loss": 4.267590522766113, + "step": 4285 + }, + { + "epoch": 2.296036126442549, + "grad_norm": 0.47755328394996693, + "learning_rate": 1.5836728275635542e-07, + "logits/chosen": -0.8143297433853149, + "logits/rejected": -0.6354633569717407, + "logps/chosen": -4.457322120666504, + "logps/rejected": -5.215367317199707, + "loss": 0.051, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.457322120666504, + "rewards/margins": 0.7580451965332031, + "rewards/rejected": -5.215367317199707, + "sft_loss": 4.198925971984863, + "step": 4290 + }, + { + "epoch": 2.2987121592239506, + "grad_norm": 0.3876912944758544, + "learning_rate": 1.5723177302820984e-07, + "logits/chosen": -0.8010802268981934, + "logits/rejected": -0.7300946712493896, + "logps/chosen": -4.525595664978027, + "logps/rejected": -5.123916149139404, + "loss": 0.0525, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.525595664978027, + "rewards/margins": 0.5983200073242188, + "rewards/rejected": -5.123916149139404, + "sft_loss": 4.291536331176758, + "step": 4295 + }, + { + "epoch": 2.3013881920053523, + "grad_norm": 0.4036539451488429, + "learning_rate": 1.5609958883890544e-07, + "logits/chosen": -0.6942816376686096, + "logits/rejected": -0.5985075235366821, + "logps/chosen": -4.369370937347412, + "logps/rejected": -5.014688014984131, + "loss": 0.0506, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.369370937347412, + "rewards/margins": 0.6453171968460083, + "rewards/rejected": -5.014688014984131, + "sft_loss": 4.008292198181152, + "step": 4300 + }, + { + "epoch": 2.3040642247867535, + "grad_norm": 0.5265430776252449, + "learning_rate": 1.5497074117289865e-07, + "logits/chosen": -0.753690242767334, + "logits/rejected": -0.634030282497406, + "logps/chosen": -4.488044738769531, + "logps/rejected": -5.264333724975586, + "loss": 0.0513, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.488044738769531, + "rewards/margins": 0.7762894034385681, + "rewards/rejected": -5.264333724975586, + "sft_loss": 4.194975852966309, + "step": 4305 + }, + { + "epoch": 2.3067402575681553, + "grad_norm": 0.3790394544769755, + "learning_rate": 1.5384524098227402e-07, + "logits/chosen": -0.7402902841567993, + "logits/rejected": -0.5147100687026978, + "logps/chosen": -4.349900245666504, + "logps/rejected": -5.224468231201172, + "loss": 0.0508, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.349900245666504, + "rewards/margins": 0.8745684623718262, + "rewards/rejected": -5.224468231201172, + "sft_loss": 4.091513156890869, + "step": 4310 + }, + { + "epoch": 2.3094162903495565, + "grad_norm": 0.5268560797622209, + "learning_rate": 1.5272309918663974e-07, + "logits/chosen": -0.7533577084541321, + "logits/rejected": -0.5765639543533325, + "logps/chosen": -4.552016735076904, + "logps/rejected": -5.05606746673584, + "loss": 0.0529, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.552016735076904, + "rewards/margins": 0.5040509104728699, + "rewards/rejected": -5.05606746673584, + "sft_loss": 4.245696544647217, + "step": 4315 + }, + { + "epoch": 2.3120923231309582, + "grad_norm": 0.497137273147704, + "learning_rate": 1.516043266730201e-07, + "logits/chosen": -0.7740308046340942, + "logits/rejected": -0.6450439095497131, + "logps/chosen": -4.40801477432251, + "logps/rejected": -5.11243200302124, + "loss": 0.0523, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.40801477432251, + "rewards/margins": 0.7044172286987305, + "rewards/rejected": -5.11243200302124, + "sft_loss": 4.104895114898682, + "step": 4320 + }, + { + "epoch": 2.31476835591236, + "grad_norm": 0.5593452729007138, + "learning_rate": 1.504889342957512e-07, + "logits/chosen": -0.7910572290420532, + "logits/rejected": -0.5866572260856628, + "logps/chosen": -4.533371448516846, + "logps/rejected": -5.306161403656006, + "loss": 0.052, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.533371448516846, + "rewards/margins": 0.7727904319763184, + "rewards/rejected": -5.306161403656006, + "sft_loss": 4.273785591125488, + "step": 4325 + }, + { + "epoch": 2.3174443886937617, + "grad_norm": 0.4308516372542784, + "learning_rate": 1.4937693287637453e-07, + "logits/chosen": -0.811202883720398, + "logits/rejected": -0.6923697590827942, + "logps/chosen": -4.52161169052124, + "logps/rejected": -5.191643238067627, + "loss": 0.0514, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.52161169052124, + "rewards/margins": 0.670030951499939, + "rewards/rejected": -5.191643238067627, + "sft_loss": 4.221057415008545, + "step": 4330 + }, + { + "epoch": 2.320120421475163, + "grad_norm": 0.37138887048768443, + "learning_rate": 1.4826833320353305e-07, + "logits/chosen": -0.7932590246200562, + "logits/rejected": -0.7206074595451355, + "logps/chosen": -4.624764919281006, + "logps/rejected": -5.276534557342529, + "loss": 0.0515, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.624764919281006, + "rewards/margins": 0.6517696976661682, + "rewards/rejected": -5.276534557342529, + "sft_loss": 4.188200950622559, + "step": 4335 + }, + { + "epoch": 2.3227964542565647, + "grad_norm": 0.4345851380582179, + "learning_rate": 1.4716314603286528e-07, + "logits/chosen": -0.8649484515190125, + "logits/rejected": -0.634868323802948, + "logps/chosen": -4.535511493682861, + "logps/rejected": -5.249735355377197, + "loss": 0.0517, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.535511493682861, + "rewards/margins": 0.7142241597175598, + "rewards/rejected": -5.249735355377197, + "sft_loss": 4.229328632354736, + "step": 4340 + }, + { + "epoch": 2.3254724870379664, + "grad_norm": 0.5104610877999559, + "learning_rate": 1.4606138208690233e-07, + "logits/chosen": -0.7997044324874878, + "logits/rejected": -0.7385850548744202, + "logps/chosen": -4.5416460037231445, + "logps/rejected": -5.044976711273193, + "loss": 0.052, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.5416460037231445, + "rewards/margins": 0.5033308267593384, + "rewards/rejected": -5.044976711273193, + "sft_loss": 4.25277853012085, + "step": 4345 + }, + { + "epoch": 2.3281485198193677, + "grad_norm": 0.3642995428464966, + "learning_rate": 1.4496305205496251e-07, + "logits/chosen": -0.7962000966072083, + "logits/rejected": -0.749233067035675, + "logps/chosen": -4.49139404296875, + "logps/rejected": -5.204357624053955, + "loss": 0.0511, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.49139404296875, + "rewards/margins": 0.7129632830619812, + "rewards/rejected": -5.204357624053955, + "sft_loss": 4.271798133850098, + "step": 4350 + }, + { + "epoch": 2.3308245526007694, + "grad_norm": 0.45592266136726567, + "learning_rate": 1.4386816659304895e-07, + "logits/chosen": -0.9152060747146606, + "logits/rejected": -0.7515543699264526, + "logps/chosen": -4.481812477111816, + "logps/rejected": -5.05482292175293, + "loss": 0.0526, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.481812477111816, + "rewards/margins": 0.5730103254318237, + "rewards/rejected": -5.05482292175293, + "sft_loss": 4.24124002456665, + "step": 4355 + }, + { + "epoch": 2.333500585382171, + "grad_norm": 0.43698490211025665, + "learning_rate": 1.4277673632374492e-07, + "logits/chosen": -0.8096843957901001, + "logits/rejected": -0.5594863891601562, + "logps/chosen": -4.307278156280518, + "logps/rejected": -5.064208984375, + "loss": 0.0505, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.307278156280518, + "rewards/margins": 0.7569302320480347, + "rewards/rejected": -5.064208984375, + "sft_loss": 4.024901866912842, + "step": 4360 + }, + { + "epoch": 2.3361766181635724, + "grad_norm": 0.40380334550713853, + "learning_rate": 1.416887718361119e-07, + "logits/chosen": -0.7265334725379944, + "logits/rejected": -0.7379759550094604, + "logps/chosen": -4.446936130523682, + "logps/rejected": -5.041685581207275, + "loss": 0.0532, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.446936130523682, + "rewards/margins": 0.5947496294975281, + "rewards/rejected": -5.041685581207275, + "sft_loss": 4.248701095581055, + "step": 4365 + }, + { + "epoch": 2.338852650944974, + "grad_norm": 0.4955611851621508, + "learning_rate": 1.406042836855859e-07, + "logits/chosen": -0.74248206615448, + "logits/rejected": -0.6353031396865845, + "logps/chosen": -4.549657344818115, + "logps/rejected": -5.308493137359619, + "loss": 0.0517, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.549657344818115, + "rewards/margins": 0.7588354349136353, + "rewards/rejected": -5.308493137359619, + "sft_loss": 4.329131603240967, + "step": 4370 + }, + { + "epoch": 2.341528683726376, + "grad_norm": 0.37714745130252286, + "learning_rate": 1.3952328239387595e-07, + "logits/chosen": -0.8745020627975464, + "logits/rejected": -0.6099725365638733, + "logps/chosen": -4.425809860229492, + "logps/rejected": -5.203674793243408, + "loss": 0.0517, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.425809860229492, + "rewards/margins": 0.7778650522232056, + "rewards/rejected": -5.203674793243408, + "sft_loss": 4.267220973968506, + "step": 4375 + }, + { + "epoch": 2.344204716507777, + "grad_norm": 0.4350403366163347, + "learning_rate": 1.3844577844886109e-07, + "logits/chosen": -0.8680629730224609, + "logits/rejected": -0.635954737663269, + "logps/chosen": -4.190207004547119, + "logps/rejected": -5.01309871673584, + "loss": 0.0503, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.190207004547119, + "rewards/margins": 0.8228910565376282, + "rewards/rejected": -5.01309871673584, + "sft_loss": 4.028267860412598, + "step": 4380 + }, + { + "epoch": 2.346880749289179, + "grad_norm": 0.5090011205479518, + "learning_rate": 1.3737178230448955e-07, + "logits/chosen": -0.8712724447250366, + "logits/rejected": -0.7374725341796875, + "logps/chosen": -4.453272819519043, + "logps/rejected": -5.0184431076049805, + "loss": 0.0518, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.453272819519043, + "rewards/margins": 0.5651699900627136, + "rewards/rejected": -5.0184431076049805, + "sft_loss": 4.107565879821777, + "step": 4385 + }, + { + "epoch": 2.3495567820705805, + "grad_norm": 0.44470986998643264, + "learning_rate": 1.363013043806764e-07, + "logits/chosen": -0.8503999710083008, + "logits/rejected": -0.7448652386665344, + "logps/chosen": -4.280064105987549, + "logps/rejected": -4.982614994049072, + "loss": 0.0512, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.280064105987549, + "rewards/margins": 0.7025504112243652, + "rewards/rejected": -4.982614994049072, + "sft_loss": 4.109427452087402, + "step": 4390 + }, + { + "epoch": 2.3522328148519818, + "grad_norm": 0.7032192811206371, + "learning_rate": 1.352343550632034e-07, + "logits/chosen": -0.8059431910514832, + "logits/rejected": -0.6396316885948181, + "logps/chosen": -4.340572357177734, + "logps/rejected": -5.229712009429932, + "loss": 0.0513, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.340572357177734, + "rewards/margins": 0.8891401290893555, + "rewards/rejected": -5.229712009429932, + "sft_loss": 4.127902030944824, + "step": 4395 + }, + { + "epoch": 2.3549088476333835, + "grad_norm": 0.39015834229113866, + "learning_rate": 1.3417094470361722e-07, + "logits/chosen": -0.8471466302871704, + "logits/rejected": -0.6921052932739258, + "logps/chosen": -4.541820049285889, + "logps/rejected": -5.182088375091553, + "loss": 0.053, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.541820049285889, + "rewards/margins": 0.6402683258056641, + "rewards/rejected": -5.182088375091553, + "sft_loss": 4.28817081451416, + "step": 4400 + }, + { + "epoch": 2.3549088476333835, + "eval_logits/chosen": -0.3632008731365204, + "eval_logits/rejected": -0.285022109746933, + "eval_logps/chosen": -4.476656436920166, + "eval_logps/rejected": -5.172234535217285, + "eval_loss": 0.050458874553442, + "eval_rewards/accuracies": 0.6839762330055237, + "eval_rewards/chosen": -4.476656436920166, + "eval_rewards/margins": 0.6955785751342773, + "eval_rewards/rejected": -5.172234535217285, + "eval_runtime": 43.4365, + "eval_samples_per_second": 30.965, + "eval_sft_loss": 4.090471267700195, + "eval_steps_per_second": 7.758, + "step": 4400 + }, + { + "epoch": 2.357584880414785, + "grad_norm": 0.41993569063387165, + "learning_rate": 1.3311108361913015e-07, + "logits/chosen": -0.899549663066864, + "logits/rejected": -0.9365586042404175, + "logps/chosen": -4.511231422424316, + "logps/rejected": -5.088454246520996, + "loss": 0.0522, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.511231422424316, + "rewards/margins": 0.5772226452827454, + "rewards/rejected": -5.088454246520996, + "sft_loss": 4.286912441253662, + "step": 4405 + }, + { + "epoch": 2.3602609131961865, + "grad_norm": 0.43817622591238337, + "learning_rate": 1.3205478209251874e-07, + "logits/chosen": -0.7712717056274414, + "logits/rejected": -0.7058557868003845, + "logps/chosen": -4.5705180168151855, + "logps/rejected": -5.281630039215088, + "loss": 0.0519, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.5705180168151855, + "rewards/margins": 0.711111307144165, + "rewards/rejected": -5.281630039215088, + "sft_loss": 4.32778787612915, + "step": 4410 + }, + { + "epoch": 2.362936945977588, + "grad_norm": 0.4837368965922269, + "learning_rate": 1.310020503720254e-07, + "logits/chosen": -0.793159008026123, + "logits/rejected": -0.62529456615448, + "logps/chosen": -4.411933898925781, + "logps/rejected": -5.071666240692139, + "loss": 0.0518, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.411933898925781, + "rewards/margins": 0.6597329378128052, + "rewards/rejected": -5.071666240692139, + "sft_loss": 4.156750679016113, + "step": 4415 + }, + { + "epoch": 2.36561297875899, + "grad_norm": 0.4008401185189836, + "learning_rate": 1.2995289867125752e-07, + "logits/chosen": -0.823562741279602, + "logits/rejected": -0.7247136831283569, + "logps/chosen": -4.479001045227051, + "logps/rejected": -5.045378684997559, + "loss": 0.0518, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.479001045227051, + "rewards/margins": 0.5663775205612183, + "rewards/rejected": -5.045378684997559, + "sft_loss": 4.197256565093994, + "step": 4420 + }, + { + "epoch": 2.368289011540391, + "grad_norm": 0.3267884216984576, + "learning_rate": 1.2890733716908986e-07, + "logits/chosen": -0.7804628014564514, + "logits/rejected": -0.7346547842025757, + "logps/chosen": -4.284626483917236, + "logps/rejected": -4.872819423675537, + "loss": 0.0528, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.284626483917236, + "rewards/margins": 0.5881929397583008, + "rewards/rejected": -4.872819423675537, + "sft_loss": 4.072215557098389, + "step": 4425 + }, + { + "epoch": 2.370965044321793, + "grad_norm": 0.4383847142525197, + "learning_rate": 1.2786537600956454e-07, + "logits/chosen": -0.8587814569473267, + "logits/rejected": -0.6906665563583374, + "logps/chosen": -4.313345432281494, + "logps/rejected": -5.195896148681641, + "loss": 0.0487, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.313345432281494, + "rewards/margins": 0.8825508952140808, + "rewards/rejected": -5.195896148681641, + "sft_loss": 3.9952876567840576, + "step": 4430 + }, + { + "epoch": 2.3736410771031946, + "grad_norm": 0.45896207390131155, + "learning_rate": 1.268270253017933e-07, + "logits/chosen": -0.854796290397644, + "logits/rejected": -0.6479583382606506, + "logps/chosen": -4.433968544006348, + "logps/rejected": -5.119320869445801, + "loss": 0.0518, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.433968544006348, + "rewards/margins": 0.6853523850440979, + "rewards/rejected": -5.119320869445801, + "sft_loss": 4.195663928985596, + "step": 4435 + }, + { + "epoch": 2.376317109884596, + "grad_norm": 0.44999129974127655, + "learning_rate": 1.257922951198591e-07, + "logits/chosen": -0.9596928358078003, + "logits/rejected": -0.657541036605835, + "logps/chosen": -4.332813262939453, + "logps/rejected": -5.0844526290893555, + "loss": 0.0507, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.332813262939453, + "rewards/margins": 0.7516393065452576, + "rewards/rejected": -5.0844526290893555, + "sft_loss": 4.116638660430908, + "step": 4440 + }, + { + "epoch": 2.3789931426659976, + "grad_norm": 0.5436196141333159, + "learning_rate": 1.24761195502719e-07, + "logits/chosen": -0.8771514892578125, + "logits/rejected": -0.6101125478744507, + "logps/chosen": -4.607757091522217, + "logps/rejected": -5.213659763336182, + "loss": 0.0508, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.607757091522217, + "rewards/margins": 0.6059027910232544, + "rewards/rejected": -5.213659763336182, + "sft_loss": 4.251969337463379, + "step": 4445 + }, + { + "epoch": 2.3816691754473993, + "grad_norm": 0.7332787813082794, + "learning_rate": 1.2373373645410573e-07, + "logits/chosen": -0.762103259563446, + "logits/rejected": -0.624140739440918, + "logps/chosen": -4.644742012023926, + "logps/rejected": -5.275030136108398, + "loss": 0.0522, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.644742012023926, + "rewards/margins": 0.6302889585494995, + "rewards/rejected": -5.275030136108398, + "sft_loss": 4.331053733825684, + "step": 4450 + }, + { + "epoch": 2.384345208228801, + "grad_norm": 0.4631029256293614, + "learning_rate": 1.2270992794243175e-07, + "logits/chosen": -0.8583146333694458, + "logits/rejected": -0.7607332468032837, + "logps/chosen": -4.444066524505615, + "logps/rejected": -5.150336265563965, + "loss": 0.0508, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.444066524505615, + "rewards/margins": 0.7062696218490601, + "rewards/rejected": -5.150336265563965, + "sft_loss": 4.155452728271484, + "step": 4455 + }, + { + "epoch": 2.3870212410102023, + "grad_norm": 0.8308078175306337, + "learning_rate": 1.2168977990069147e-07, + "logits/chosen": -0.8691369891166687, + "logits/rejected": -0.6420689821243286, + "logps/chosen": -4.282433032989502, + "logps/rejected": -5.122197151184082, + "loss": 0.0517, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.282433032989502, + "rewards/margins": 0.839765191078186, + "rewards/rejected": -5.122197151184082, + "sft_loss": 3.9892375469207764, + "step": 4460 + }, + { + "epoch": 2.389697273791604, + "grad_norm": 0.3763620124349001, + "learning_rate": 1.206733022263659e-07, + "logits/chosen": -0.8173881769180298, + "logits/rejected": -0.6439284086227417, + "logps/chosen": -4.300229549407959, + "logps/rejected": -5.05617094039917, + "loss": 0.0501, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.300229549407959, + "rewards/margins": 0.7559418678283691, + "rewards/rejected": -5.05617094039917, + "sft_loss": 3.9857208728790283, + "step": 4465 + }, + { + "epoch": 2.3923733065730053, + "grad_norm": 0.5485172489909115, + "learning_rate": 1.1966050478132572e-07, + "logits/chosen": -0.7517123222351074, + "logits/rejected": -0.6935483813285828, + "logps/chosen": -4.504305362701416, + "logps/rejected": -5.118971347808838, + "loss": 0.0525, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.504305362701416, + "rewards/margins": 0.6146660447120667, + "rewards/rejected": -5.118971347808838, + "sft_loss": 4.205090522766113, + "step": 4470 + }, + { + "epoch": 2.395049339354407, + "grad_norm": 0.42943747081626427, + "learning_rate": 1.1865139739173635e-07, + "logits/chosen": -0.8406542539596558, + "logits/rejected": -0.6299400329589844, + "logps/chosen": -4.632096290588379, + "logps/rejected": -5.228485584259033, + "loss": 0.0515, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.632096290588379, + "rewards/margins": 0.5963901281356812, + "rewards/rejected": -5.228485584259033, + "sft_loss": 4.2738494873046875, + "step": 4475 + }, + { + "epoch": 2.3977253721358087, + "grad_norm": 0.4209416780986708, + "learning_rate": 1.1764598984796187e-07, + "logits/chosen": -0.9492759704589844, + "logits/rejected": -0.8266083598136902, + "logps/chosen": -4.4401984214782715, + "logps/rejected": -5.127415657043457, + "loss": 0.0508, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.4401984214782715, + "rewards/margins": 0.6872166395187378, + "rewards/rejected": -5.127415657043457, + "sft_loss": 4.163792133331299, + "step": 4480 + }, + { + "epoch": 2.4004014049172104, + "grad_norm": 0.404322929379254, + "learning_rate": 1.1664429190447095e-07, + "logits/chosen": -0.7948734760284424, + "logits/rejected": -0.7301923036575317, + "logps/chosen": -4.572030544281006, + "logps/rejected": -5.346141338348389, + "loss": 0.0515, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.572030544281006, + "rewards/margins": 0.7741105556488037, + "rewards/rejected": -5.346141338348389, + "sft_loss": 4.311251640319824, + "step": 4485 + }, + { + "epoch": 2.4030774376986117, + "grad_norm": 0.3949020633907638, + "learning_rate": 1.1564631327974122e-07, + "logits/chosen": -0.9128694534301758, + "logits/rejected": -0.6827823519706726, + "logps/chosen": -4.563012599945068, + "logps/rejected": -5.230254173278809, + "loss": 0.052, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.563012599945068, + "rewards/margins": 0.6672413349151611, + "rewards/rejected": -5.230254173278809, + "sft_loss": 4.242611408233643, + "step": 4490 + }, + { + "epoch": 2.4057534704800134, + "grad_norm": 0.553553569804726, + "learning_rate": 1.1465206365616587e-07, + "logits/chosen": -0.9420261383056641, + "logits/rejected": -0.7209168672561646, + "logps/chosen": -4.519663333892822, + "logps/rejected": -5.303266525268555, + "loss": 0.0504, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.519663333892822, + "rewards/margins": 0.7836031913757324, + "rewards/rejected": -5.303266525268555, + "sft_loss": 4.19036340713501, + "step": 4495 + }, + { + "epoch": 2.408429503261415, + "grad_norm": 0.3632794095375088, + "learning_rate": 1.1366155267995887e-07, + "logits/chosen": -0.7734476327896118, + "logits/rejected": -0.7625142931938171, + "logps/chosen": -4.501950263977051, + "logps/rejected": -5.078164100646973, + "loss": 0.0516, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.501950263977051, + "rewards/margins": 0.5762136578559875, + "rewards/rejected": -5.078164100646973, + "sft_loss": 4.153041839599609, + "step": 4500 + }, + { + "epoch": 2.4111055360428164, + "grad_norm": 0.45684321809705974, + "learning_rate": 1.1267478996106228e-07, + "logits/chosen": -0.8146098852157593, + "logits/rejected": -0.6006115078926086, + "logps/chosen": -4.422109127044678, + "logps/rejected": -5.165881156921387, + "loss": 0.0518, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.422109127044678, + "rewards/margins": 0.7437718510627747, + "rewards/rejected": -5.165881156921387, + "sft_loss": 4.101393699645996, + "step": 4505 + }, + { + "epoch": 2.413781568824218, + "grad_norm": 0.4183010395909284, + "learning_rate": 1.116917850730521e-07, + "logits/chosen": -0.8815043568611145, + "logits/rejected": -0.7283543348312378, + "logps/chosen": -4.545543193817139, + "logps/rejected": -5.047765254974365, + "loss": 0.0528, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.545543193817139, + "rewards/margins": 0.5022218823432922, + "rewards/rejected": -5.047765254974365, + "sft_loss": 4.220363616943359, + "step": 4510 + }, + { + "epoch": 2.41645760160562, + "grad_norm": 0.505742763058506, + "learning_rate": 1.1071254755304637e-07, + "logits/chosen": -0.8288267850875854, + "logits/rejected": -0.7838674187660217, + "logps/chosen": -4.297197341918945, + "logps/rejected": -5.042912483215332, + "loss": 0.0494, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.297197341918945, + "rewards/margins": 0.7457149028778076, + "rewards/rejected": -5.042912483215332, + "sft_loss": 3.972548007965088, + "step": 4515 + }, + { + "epoch": 2.419133634387021, + "grad_norm": 0.4752082402543453, + "learning_rate": 1.0973708690161143e-07, + "logits/chosen": -0.8543336987495422, + "logits/rejected": -0.74104243516922, + "logps/chosen": -4.514694690704346, + "logps/rejected": -5.1261162757873535, + "loss": 0.0511, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.514694690704346, + "rewards/margins": 0.6114215850830078, + "rewards/rejected": -5.1261162757873535, + "sft_loss": 4.206650257110596, + "step": 4520 + }, + { + "epoch": 2.421809667168423, + "grad_norm": 0.47586666938227407, + "learning_rate": 1.0876541258267119e-07, + "logits/chosen": -0.8609308004379272, + "logits/rejected": -0.6737977266311646, + "logps/chosen": -4.337949275970459, + "logps/rejected": -5.158361911773682, + "loss": 0.0505, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.337949275970459, + "rewards/margins": 0.8204119801521301, + "rewards/rejected": -5.158361911773682, + "sft_loss": 4.102989196777344, + "step": 4525 + }, + { + "epoch": 2.4244856999498245, + "grad_norm": 0.7712988555674866, + "learning_rate": 1.0779753402341379e-07, + "logits/chosen": -0.9207685589790344, + "logits/rejected": -0.84931480884552, + "logps/chosen": -4.338393688201904, + "logps/rejected": -5.032431602478027, + "loss": 0.0521, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.338393688201904, + "rewards/margins": 0.6940376162528992, + "rewards/rejected": -5.032431602478027, + "sft_loss": 4.081472396850586, + "step": 4530 + }, + { + "epoch": 2.427161732731226, + "grad_norm": 0.5783647376843744, + "learning_rate": 1.0683346061420157e-07, + "logits/chosen": -0.7379239797592163, + "logits/rejected": -0.6506637334823608, + "logps/chosen": -4.515606880187988, + "logps/rejected": -5.266798973083496, + "loss": 0.0526, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.515606880187988, + "rewards/margins": 0.7511924505233765, + "rewards/rejected": -5.266798973083496, + "sft_loss": 4.281412124633789, + "step": 4535 + }, + { + "epoch": 2.4298377655126275, + "grad_norm": 0.44600935802431835, + "learning_rate": 1.0587320170847874e-07, + "logits/chosen": -0.8168829083442688, + "logits/rejected": -0.7206194400787354, + "logps/chosen": -4.545700550079346, + "logps/rejected": -5.104142189025879, + "loss": 0.0533, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.545700550079346, + "rewards/margins": 0.5584414601325989, + "rewards/rejected": -5.104142189025879, + "sft_loss": 4.292700290679932, + "step": 4540 + }, + { + "epoch": 2.4325137982940293, + "grad_norm": 0.36067335080276597, + "learning_rate": 1.0491676662268156e-07, + "logits/chosen": -0.7393335103988647, + "logits/rejected": -0.6619399785995483, + "logps/chosen": -4.518947601318359, + "logps/rejected": -5.257824897766113, + "loss": 0.0506, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.518947601318359, + "rewards/margins": 0.7388771176338196, + "rewards/rejected": -5.257824897766113, + "sft_loss": 4.184815406799316, + "step": 4545 + }, + { + "epoch": 2.4351898310754305, + "grad_norm": 0.5409398505388431, + "learning_rate": 1.0396416463614732e-07, + "logits/chosen": -0.8931136131286621, + "logits/rejected": -0.8121291995048523, + "logps/chosen": -4.453886985778809, + "logps/rejected": -5.1217451095581055, + "loss": 0.052, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.453886985778809, + "rewards/margins": 0.667857825756073, + "rewards/rejected": -5.1217451095581055, + "sft_loss": 4.168035507202148, + "step": 4550 + }, + { + "epoch": 2.4378658638568322, + "grad_norm": 1.0388002306851687, + "learning_rate": 1.0301540499102479e-07, + "logits/chosen": -0.7921608686447144, + "logits/rejected": -0.7310789227485657, + "logps/chosen": -4.489283561706543, + "logps/rejected": -5.021276950836182, + "loss": 0.0532, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.489283561706543, + "rewards/margins": 0.5319927334785461, + "rewards/rejected": -5.021276950836182, + "sft_loss": 4.189752101898193, + "step": 4555 + }, + { + "epoch": 2.440541896638234, + "grad_norm": 0.55059593910096, + "learning_rate": 1.0207049689218405e-07, + "logits/chosen": -0.9317811131477356, + "logits/rejected": -0.6625012755393982, + "logps/chosen": -4.478198051452637, + "logps/rejected": -5.189414978027344, + "loss": 0.0521, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.478198051452637, + "rewards/margins": 0.7112170457839966, + "rewards/rejected": -5.189414978027344, + "sft_loss": 4.156213760375977, + "step": 4560 + }, + { + "epoch": 2.4432179294196352, + "grad_norm": 0.3662390057934976, + "learning_rate": 1.0112944950712782e-07, + "logits/chosen": -0.8565780520439148, + "logits/rejected": -0.7790525555610657, + "logps/chosen": -4.4673357009887695, + "logps/rejected": -5.257838249206543, + "loss": 0.0502, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.4673357009887695, + "rewards/margins": 0.7905027866363525, + "rewards/rejected": -5.257838249206543, + "sft_loss": 4.1379194259643555, + "step": 4565 + }, + { + "epoch": 2.445893962201037, + "grad_norm": 0.59163711890118, + "learning_rate": 1.0019227196590174e-07, + "logits/chosen": -0.8295857310295105, + "logits/rejected": -0.6748948097229004, + "logps/chosen": -4.575309753417969, + "logps/rejected": -5.304391384124756, + "loss": 0.0519, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.575309753417969, + "rewards/margins": 0.7290816903114319, + "rewards/rejected": -5.304391384124756, + "sft_loss": 4.324819087982178, + "step": 4570 + }, + { + "epoch": 2.4485699949824387, + "grad_norm": 0.3863191151206876, + "learning_rate": 9.925897336100664e-08, + "logits/chosen": -0.7855613827705383, + "logits/rejected": -0.7414531111717224, + "logps/chosen": -4.514120101928711, + "logps/rejected": -5.226920127868652, + "loss": 0.0522, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.514120101928711, + "rewards/margins": 0.7128003239631653, + "rewards/rejected": -5.226920127868652, + "sft_loss": 4.242648124694824, + "step": 4575 + }, + { + "epoch": 2.45124602776384, + "grad_norm": 0.5313135323825595, + "learning_rate": 9.832956274730946e-08, + "logits/chosen": -0.7910794019699097, + "logits/rejected": -0.7502156496047974, + "logps/chosen": -4.682957172393799, + "logps/rejected": -5.110443115234375, + "loss": 0.0529, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.682957172393799, + "rewards/margins": 0.42748576402664185, + "rewards/rejected": -5.110443115234375, + "sft_loss": 4.3094916343688965, + "step": 4580 + }, + { + "epoch": 2.4539220605452416, + "grad_norm": 0.6164310535748478, + "learning_rate": 9.740404914195633e-08, + "logits/chosen": -0.8025332689285278, + "logits/rejected": -0.6623189449310303, + "logps/chosen": -4.397765159606934, + "logps/rejected": -5.139482021331787, + "loss": 0.0511, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.397765159606934, + "rewards/margins": 0.7417163252830505, + "rewards/rejected": -5.139482021331787, + "sft_loss": 4.196730613708496, + "step": 4585 + }, + { + "epoch": 2.4565980933266434, + "grad_norm": 0.5277575602538304, + "learning_rate": 9.648244152428392e-08, + "logits/chosen": -0.8694614171981812, + "logits/rejected": -0.7217963933944702, + "logps/chosen": -4.478517055511475, + "logps/rejected": -4.989750862121582, + "loss": 0.0529, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.478517055511475, + "rewards/margins": 0.5112346410751343, + "rewards/rejected": -4.989750862121582, + "sft_loss": 4.229212760925293, + "step": 4590 + }, + { + "epoch": 2.4592741261080446, + "grad_norm": 0.3465526549100429, + "learning_rate": 9.556474883573379e-08, + "logits/chosen": -0.8793339729309082, + "logits/rejected": -0.7501915693283081, + "logps/chosen": -4.311750411987305, + "logps/rejected": -5.096708297729492, + "loss": 0.0523, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.311750411987305, + "rewards/margins": 0.7849579453468323, + "rewards/rejected": -5.096708297729492, + "sft_loss": 4.058889389038086, + "step": 4595 + }, + { + "epoch": 2.4619501588894463, + "grad_norm": 0.6362567491428798, + "learning_rate": 9.465097997976412e-08, + "logits/chosen": -0.8457925915718079, + "logits/rejected": -0.6267693042755127, + "logps/chosen": -4.335329532623291, + "logps/rejected": -5.132666110992432, + "loss": 0.0509, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.335329532623291, + "rewards/margins": 0.7973363399505615, + "rewards/rejected": -5.132666110992432, + "sft_loss": 4.092240333557129, + "step": 4600 + }, + { + "epoch": 2.464626191670848, + "grad_norm": 0.5709648129829065, + "learning_rate": 9.374114382176457e-08, + "logits/chosen": -0.8717010617256165, + "logits/rejected": -0.6866291761398315, + "logps/chosen": -4.533945560455322, + "logps/rejected": -5.346493721008301, + "loss": 0.0503, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.533945560455322, + "rewards/margins": 0.8125476837158203, + "rewards/rejected": -5.346493721008301, + "sft_loss": 4.200273036956787, + "step": 4605 + }, + { + "epoch": 2.46730222445225, + "grad_norm": 0.41256938888295447, + "learning_rate": 9.283524918896945e-08, + "logits/chosen": -0.9007472991943359, + "logits/rejected": -0.7356555461883545, + "logps/chosen": -4.5387139320373535, + "logps/rejected": -5.191751956939697, + "loss": 0.0522, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.5387139320373535, + "rewards/margins": 0.6530376076698303, + "rewards/rejected": -5.191751956939697, + "sft_loss": 4.2472429275512695, + "step": 4610 + }, + { + "epoch": 2.469978257233651, + "grad_norm": 0.46700861734247096, + "learning_rate": 9.193330487037232e-08, + "logits/chosen": -0.8507378697395325, + "logits/rejected": -0.6919487714767456, + "logps/chosen": -4.463172912597656, + "logps/rejected": -5.266392230987549, + "loss": 0.0517, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.463172912597656, + "rewards/margins": 0.8032194972038269, + "rewards/rejected": -5.266392230987549, + "sft_loss": 4.272570610046387, + "step": 4615 + }, + { + "epoch": 2.4726542900150528, + "grad_norm": 0.4668769656142938, + "learning_rate": 9.103531961664118e-08, + "logits/chosen": -0.7989420890808105, + "logits/rejected": -0.6214526295661926, + "logps/chosen": -4.455315113067627, + "logps/rejected": -5.171011447906494, + "loss": 0.0501, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.455315113067627, + "rewards/margins": 0.7156961560249329, + "rewards/rejected": -5.171011447906494, + "sft_loss": 4.132755279541016, + "step": 4620 + }, + { + "epoch": 2.475330322796454, + "grad_norm": 0.47924486796798166, + "learning_rate": 9.014130214003269e-08, + "logits/chosen": -0.8464191555976868, + "logits/rejected": -0.8963597416877747, + "logps/chosen": -4.2732834815979, + "logps/rejected": -4.967657089233398, + "loss": 0.051, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.2732834815979, + "rewards/margins": 0.6943733096122742, + "rewards/rejected": -4.967657089233398, + "sft_loss": 3.9865031242370605, + "step": 4625 + }, + { + "epoch": 2.4780063555778558, + "grad_norm": 0.32552256494364223, + "learning_rate": 8.925126111430848e-08, + "logits/chosen": -0.7193098068237305, + "logits/rejected": -0.675899088382721, + "logps/chosen": -4.4091105461120605, + "logps/rejected": -5.017752647399902, + "loss": 0.051, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.4091105461120605, + "rewards/margins": 0.6086419224739075, + "rewards/rejected": -5.017752647399902, + "sft_loss": 4.017719268798828, + "step": 4630 + }, + { + "epoch": 2.4806823883592575, + "grad_norm": 0.5496220681748125, + "learning_rate": 8.83652051746504e-08, + "logits/chosen": -0.689524233341217, + "logits/rejected": -0.5122482180595398, + "logps/chosen": -4.472263813018799, + "logps/rejected": -5.335396766662598, + "loss": 0.051, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.472263813018799, + "rewards/margins": 0.8631328344345093, + "rewards/rejected": -5.335396766662598, + "sft_loss": 4.2152910232543945, + "step": 4635 + }, + { + "epoch": 2.483358421140659, + "grad_norm": 0.31765867467775627, + "learning_rate": 8.748314291757696e-08, + "logits/chosen": -0.7355210781097412, + "logits/rejected": -0.6390531659126282, + "logps/chosen": -4.5320844650268555, + "logps/rejected": -5.168013572692871, + "loss": 0.0521, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.5320844650268555, + "rewards/margins": 0.635929524898529, + "rewards/rejected": -5.168013572692871, + "sft_loss": 4.182335376739502, + "step": 4640 + }, + { + "epoch": 2.4860344539220605, + "grad_norm": 0.5771252870993228, + "learning_rate": 8.660508290086032e-08, + "logits/chosen": -0.7999386191368103, + "logits/rejected": -0.6518659591674805, + "logps/chosen": -4.393448829650879, + "logps/rejected": -5.0774006843566895, + "loss": 0.0514, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.393448829650879, + "rewards/margins": 0.6839522123336792, + "rewards/rejected": -5.0774006843566895, + "sft_loss": 4.168224811553955, + "step": 4645 + }, + { + "epoch": 2.488710486703462, + "grad_norm": 0.5150308708468381, + "learning_rate": 8.573103364344231e-08, + "logits/chosen": -0.8791343569755554, + "logits/rejected": -0.6491331458091736, + "logps/chosen": -4.423648834228516, + "logps/rejected": -5.2099127769470215, + "loss": 0.0502, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.423648834228516, + "rewards/margins": 0.7862640619277954, + "rewards/rejected": -5.2099127769470215, + "sft_loss": 4.084849834442139, + "step": 4650 + }, + { + "epoch": 2.4913865194848634, + "grad_norm": 0.3661674336980175, + "learning_rate": 8.486100362535292e-08, + "logits/chosen": -0.8177053332328796, + "logits/rejected": -0.6740037202835083, + "logps/chosen": -4.4994401931762695, + "logps/rejected": -5.0591583251953125, + "loss": 0.0517, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.4994401931762695, + "rewards/margins": 0.5597187280654907, + "rewards/rejected": -5.0591583251953125, + "sft_loss": 4.204558372497559, + "step": 4655 + }, + { + "epoch": 2.494062552266265, + "grad_norm": 0.5140280133573151, + "learning_rate": 8.399500128762693e-08, + "logits/chosen": -0.8273393511772156, + "logits/rejected": -0.7141873240470886, + "logps/chosen": -4.562862396240234, + "logps/rejected": -5.185477256774902, + "loss": 0.0509, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.562862396240234, + "rewards/margins": 0.6226149201393127, + "rewards/rejected": -5.185477256774902, + "sft_loss": 4.275196552276611, + "step": 4660 + }, + { + "epoch": 2.496738585047667, + "grad_norm": 0.42529640095647614, + "learning_rate": 8.313303503222313e-08, + "logits/chosen": -0.8213428258895874, + "logits/rejected": -0.7776002883911133, + "logps/chosen": -4.575163841247559, + "logps/rejected": -5.10188102722168, + "loss": 0.0522, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.575163841247559, + "rewards/margins": 0.5267173051834106, + "rewards/rejected": -5.10188102722168, + "sft_loss": 4.26285982131958, + "step": 4665 + }, + { + "epoch": 2.4994146178290686, + "grad_norm": 0.4007308001400934, + "learning_rate": 8.227511322194164e-08, + "logits/chosen": -0.7592180371284485, + "logits/rejected": -0.637819766998291, + "logps/chosen": -4.361072063446045, + "logps/rejected": -4.973891258239746, + "loss": 0.0527, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.361072063446045, + "rewards/margins": 0.6128195524215698, + "rewards/rejected": -4.973891258239746, + "sft_loss": 4.0938920974731445, + "step": 4670 + }, + { + "epoch": 2.50209065061047, + "grad_norm": 0.40321531518970055, + "learning_rate": 8.142124418034385e-08, + "logits/chosen": -0.7266249656677246, + "logits/rejected": -0.5235757827758789, + "logps/chosen": -4.5698018074035645, + "logps/rejected": -5.1615800857543945, + "loss": 0.0522, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.5698018074035645, + "rewards/margins": 0.5917780995368958, + "rewards/rejected": -5.1615800857543945, + "sft_loss": 4.234898567199707, + "step": 4675 + }, + { + "epoch": 2.5047666833918716, + "grad_norm": 0.39867424326365275, + "learning_rate": 8.057143619167073e-08, + "logits/chosen": -0.7164444923400879, + "logits/rejected": -0.5923494696617126, + "logps/chosen": -4.497181415557861, + "logps/rejected": -5.251903533935547, + "loss": 0.0515, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.497181415557861, + "rewards/margins": 0.7547226548194885, + "rewards/rejected": -5.251903533935547, + "sft_loss": 4.232115745544434, + "step": 4680 + }, + { + "epoch": 2.507442716173273, + "grad_norm": 0.5680624236093563, + "learning_rate": 7.97256975007633e-08, + "logits/chosen": -0.7831182479858398, + "logits/rejected": -0.565954327583313, + "logps/chosen": -4.234208106994629, + "logps/rejected": -5.022221565246582, + "loss": 0.0509, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.234208106994629, + "rewards/margins": 0.7880129814147949, + "rewards/rejected": -5.022221565246582, + "sft_loss": 4.004078388214111, + "step": 4685 + }, + { + "epoch": 2.5101187489546746, + "grad_norm": 0.3881217825976462, + "learning_rate": 7.888403631298186e-08, + "logits/chosen": -0.6825178861618042, + "logits/rejected": -0.6623591184616089, + "logps/chosen": -4.475518226623535, + "logps/rejected": -5.0694780349731445, + "loss": 0.0526, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.475518226623535, + "rewards/margins": 0.593960165977478, + "rewards/rejected": -5.0694780349731445, + "sft_loss": 4.17282247543335, + "step": 4690 + }, + { + "epoch": 2.5127947817360763, + "grad_norm": 0.4290121459253616, + "learning_rate": 7.804646079412719e-08, + "logits/chosen": -0.7428678274154663, + "logits/rejected": -0.5591611266136169, + "logps/chosen": -4.560364246368408, + "logps/rejected": -5.201272964477539, + "loss": 0.0516, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.560364246368408, + "rewards/margins": 0.6409088969230652, + "rewards/rejected": -5.201272964477539, + "sft_loss": 4.295758247375488, + "step": 4695 + }, + { + "epoch": 2.515470814517478, + "grad_norm": 0.41012547747397127, + "learning_rate": 7.72129790703604e-08, + "logits/chosen": -0.8270618319511414, + "logits/rejected": -0.6811308264732361, + "logps/chosen": -4.522173881530762, + "logps/rejected": -5.09686803817749, + "loss": 0.0522, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.522173881530762, + "rewards/margins": 0.5746942758560181, + "rewards/rejected": -5.09686803817749, + "sft_loss": 4.222050666809082, + "step": 4700 + }, + { + "epoch": 2.5181468472988793, + "grad_norm": 0.3557932248875748, + "learning_rate": 7.638359922812504e-08, + "logits/chosen": -0.7023937702178955, + "logits/rejected": -0.6598516702651978, + "logps/chosen": -4.509191989898682, + "logps/rejected": -5.237663269042969, + "loss": 0.0502, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.509191989898682, + "rewards/margins": 0.7284715175628662, + "rewards/rejected": -5.237663269042969, + "sft_loss": 4.194537162780762, + "step": 4705 + }, + { + "epoch": 2.520822880080281, + "grad_norm": 0.5817691255500771, + "learning_rate": 7.555832931406774e-08, + "logits/chosen": -0.8006316423416138, + "logits/rejected": -0.5774275064468384, + "logps/chosen": -4.520382881164551, + "logps/rejected": -5.25664758682251, + "loss": 0.0519, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.520382881164551, + "rewards/margins": 0.7362645864486694, + "rewards/rejected": -5.25664758682251, + "sft_loss": 4.245955467224121, + "step": 4710 + }, + { + "epoch": 2.5234989128616827, + "grad_norm": 0.3995963858342414, + "learning_rate": 7.47371773349611e-08, + "logits/chosen": -0.7040305733680725, + "logits/rejected": -0.7341457605361938, + "logps/chosen": -4.4279961585998535, + "logps/rejected": -5.22509241104126, + "loss": 0.0496, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.4279961585998535, + "rewards/margins": 0.7970967888832092, + "rewards/rejected": -5.22509241104126, + "sft_loss": 4.074034690856934, + "step": 4715 + }, + { + "epoch": 2.526174945643084, + "grad_norm": 0.3769854449599952, + "learning_rate": 7.392015125762496e-08, + "logits/chosen": -0.8018277883529663, + "logits/rejected": -0.6637119054794312, + "logps/chosen": -4.422866344451904, + "logps/rejected": -5.083056449890137, + "loss": 0.0506, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.422866344451904, + "rewards/margins": 0.6601905822753906, + "rewards/rejected": -5.083056449890137, + "sft_loss": 4.099396705627441, + "step": 4720 + }, + { + "epoch": 2.5288509784244857, + "grad_norm": 0.4314105030286715, + "learning_rate": 7.310725900885018e-08, + "logits/chosen": -0.8106725811958313, + "logits/rejected": -0.795625627040863, + "logps/chosen": -4.441800594329834, + "logps/rejected": -4.995269298553467, + "loss": 0.0534, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.441800594329834, + "rewards/margins": 0.5534688234329224, + "rewards/rejected": -4.995269298553467, + "sft_loss": 4.204402923583984, + "step": 4725 + }, + { + "epoch": 2.5315270112058874, + "grad_norm": 0.5168731561480471, + "learning_rate": 7.229850847532076e-08, + "logits/chosen": -0.7228676080703735, + "logits/rejected": -0.5544254183769226, + "logps/chosen": -4.531182765960693, + "logps/rejected": -5.202092170715332, + "loss": 0.051, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.531182765960693, + "rewards/margins": 0.6709097623825073, + "rewards/rejected": -5.202092170715332, + "sft_loss": 4.208251953125, + "step": 4730 + }, + { + "epoch": 2.5342030439872887, + "grad_norm": 0.7606664409858177, + "learning_rate": 7.149390750353779e-08, + "logits/chosen": -0.6411561965942383, + "logits/rejected": -0.7863910794258118, + "logps/chosen": -4.404947757720947, + "logps/rejected": -4.9736504554748535, + "loss": 0.0511, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.404947757720947, + "rewards/margins": 0.5687026381492615, + "rewards/rejected": -4.9736504554748535, + "sft_loss": 4.057023048400879, + "step": 4735 + }, + { + "epoch": 2.5368790767686904, + "grad_norm": 0.39633161239377634, + "learning_rate": 7.069346389974374e-08, + "logits/chosen": -0.8106769323348999, + "logits/rejected": -0.6289903521537781, + "logps/chosen": -4.468173027038574, + "logps/rejected": -5.0915846824646, + "loss": 0.0526, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.468173027038574, + "rewards/margins": 0.623412013053894, + "rewards/rejected": -5.0915846824646, + "sft_loss": 4.241403102874756, + "step": 4740 + }, + { + "epoch": 2.539555109550092, + "grad_norm": 0.41054122270576154, + "learning_rate": 6.989718542984563e-08, + "logits/chosen": -0.7652484774589539, + "logits/rejected": -0.7496183514595032, + "logps/chosen": -4.5117692947387695, + "logps/rejected": -5.081242084503174, + "loss": 0.0521, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.5117692947387695, + "rewards/margins": 0.5694732666015625, + "rewards/rejected": -5.081242084503174, + "sft_loss": 4.277536869049072, + "step": 4745 + }, + { + "epoch": 2.5422311423314934, + "grad_norm": 0.6095427658221313, + "learning_rate": 6.9105079819341e-08, + "logits/chosen": -0.708454430103302, + "logits/rejected": -0.4635780453681946, + "logps/chosen": -4.330394744873047, + "logps/rejected": -5.344074249267578, + "loss": 0.0498, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.330394744873047, + "rewards/margins": 1.0136792659759521, + "rewards/rejected": -5.344074249267578, + "sft_loss": 4.1580681800842285, + "step": 4750 + }, + { + "epoch": 2.544907175112895, + "grad_norm": 0.40158170861414555, + "learning_rate": 6.831715475324163e-08, + "logits/chosen": -0.8929897546768188, + "logits/rejected": -0.6932271718978882, + "logps/chosen": -4.430346488952637, + "logps/rejected": -5.198345184326172, + "loss": 0.0516, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.430346488952637, + "rewards/margins": 0.7679981589317322, + "rewards/rejected": -5.198345184326172, + "sft_loss": 4.236446857452393, + "step": 4755 + }, + { + "epoch": 2.547583207894297, + "grad_norm": 0.9430362390645314, + "learning_rate": 6.753341787600026e-08, + "logits/chosen": -0.8296493291854858, + "logits/rejected": -0.733299970626831, + "logps/chosen": -4.424052715301514, + "logps/rejected": -5.159595489501953, + "loss": 0.0519, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.424052715301514, + "rewards/margins": 0.7355419397354126, + "rewards/rejected": -5.159595489501953, + "sft_loss": 4.191681861877441, + "step": 4760 + }, + { + "epoch": 2.5502592406756985, + "grad_norm": 0.5193369946732992, + "learning_rate": 6.67538767914353e-08, + "logits/chosen": -0.7756398916244507, + "logits/rejected": -0.5511414408683777, + "logps/chosen": -4.501200199127197, + "logps/rejected": -5.170102596282959, + "loss": 0.0543, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.501200199127197, + "rewards/margins": 0.668903112411499, + "rewards/rejected": -5.170102596282959, + "sft_loss": 4.344412326812744, + "step": 4765 + }, + { + "epoch": 2.5529352734571, + "grad_norm": 0.625624192281797, + "learning_rate": 6.597853906265793e-08, + "logits/chosen": -0.7071702480316162, + "logits/rejected": -0.5840771794319153, + "logps/chosen": -4.405869483947754, + "logps/rejected": -5.307399749755859, + "loss": 0.0507, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.405869483947754, + "rewards/margins": 0.901530385017395, + "rewards/rejected": -5.307399749755859, + "sft_loss": 4.106754779815674, + "step": 4770 + }, + { + "epoch": 2.5556113062385015, + "grad_norm": 0.44989926116378653, + "learning_rate": 6.5207412211998e-08, + "logits/chosen": -0.6850845217704773, + "logits/rejected": -0.5953121185302734, + "logps/chosen": -4.426508903503418, + "logps/rejected": -5.078802108764648, + "loss": 0.0524, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.426508903503418, + "rewards/margins": 0.6522935628890991, + "rewards/rejected": -5.078802108764648, + "sft_loss": 4.111783027648926, + "step": 4775 + }, + { + "epoch": 2.558287339019903, + "grad_norm": 0.3869486131303834, + "learning_rate": 6.444050372093186e-08, + "logits/chosen": -0.7650526165962219, + "logits/rejected": -0.6805169582366943, + "logps/chosen": -4.382111549377441, + "logps/rejected": -5.045238971710205, + "loss": 0.0513, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.382111549377441, + "rewards/margins": 0.6631268858909607, + "rewards/rejected": -5.045238971710205, + "sft_loss": 4.1749267578125, + "step": 4780 + }, + { + "epoch": 2.5609633718013045, + "grad_norm": 0.5142574862060383, + "learning_rate": 6.367782103000873e-08, + "logits/chosen": -0.7648038268089294, + "logits/rejected": -0.7223659753799438, + "logps/chosen": -4.393990993499756, + "logps/rejected": -4.955014228820801, + "loss": 0.0526, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.393990993499756, + "rewards/margins": 0.5610231161117554, + "rewards/rejected": -4.955014228820801, + "sft_loss": 4.111966133117676, + "step": 4785 + }, + { + "epoch": 2.5636394045827062, + "grad_norm": 0.4258157669046418, + "learning_rate": 6.29193715387798e-08, + "logits/chosen": -0.8260219693183899, + "logits/rejected": -0.7080743312835693, + "logps/chosen": -4.401946544647217, + "logps/rejected": -5.105975151062012, + "loss": 0.0525, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.401946544647217, + "rewards/margins": 0.7040285468101501, + "rewards/rejected": -5.105975151062012, + "sft_loss": 4.163133144378662, + "step": 4790 + }, + { + "epoch": 2.566315437364108, + "grad_norm": 0.6189933185149098, + "learning_rate": 6.216516260572502e-08, + "logits/chosen": -0.7578010559082031, + "logits/rejected": -0.657882809638977, + "logps/chosen": -4.521510124206543, + "logps/rejected": -5.196859359741211, + "loss": 0.0518, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.521510124206543, + "rewards/margins": 0.6753486394882202, + "rewards/rejected": -5.196859359741211, + "sft_loss": 4.304436683654785, + "step": 4795 + }, + { + "epoch": 2.568991470145509, + "grad_norm": 0.567926027513947, + "learning_rate": 6.141520154818297e-08, + "logits/chosen": -0.7929534912109375, + "logits/rejected": -0.6780120134353638, + "logps/chosen": -4.525482654571533, + "logps/rejected": -5.13308048248291, + "loss": 0.0525, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.525482654571533, + "rewards/margins": 0.6075973510742188, + "rewards/rejected": -5.13308048248291, + "sft_loss": 4.307214736938477, + "step": 4800 + }, + { + "epoch": 2.568991470145509, + "eval_logits/chosen": -0.2741285264492035, + "eval_logits/rejected": -0.18904297053813934, + "eval_logps/chosen": -4.448305130004883, + "eval_logps/rejected": -5.142607688903809, + "eval_loss": 0.05042795091867447, + "eval_rewards/accuracies": 0.6832343935966492, + "eval_rewards/chosen": -4.448305130004883, + "eval_rewards/margins": 0.6943021416664124, + "eval_rewards/rejected": -5.142607688903809, + "eval_runtime": 43.661, + "eval_samples_per_second": 30.805, + "eval_sft_loss": 4.069952011108398, + "eval_steps_per_second": 7.719, + "step": 4800 + }, + { + "epoch": 2.571667502926911, + "grad_norm": 0.36636097203702034, + "learning_rate": 6.066949564227897e-08, + "logits/chosen": -0.7946752905845642, + "logits/rejected": -0.716626763343811, + "logps/chosen": -4.3803510665893555, + "logps/rejected": -5.064523696899414, + "loss": 0.0515, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.3803510665893555, + "rewards/margins": 0.6841726303100586, + "rewards/rejected": -5.064523696899414, + "sft_loss": 4.091052055358887, + "step": 4805 + }, + { + "epoch": 2.574343535708312, + "grad_norm": 0.7677640156297736, + "learning_rate": 5.992805212285523e-08, + "logits/chosen": -0.7754315137863159, + "logits/rejected": -0.7091799378395081, + "logps/chosen": -4.335581302642822, + "logps/rejected": -5.124228477478027, + "loss": 0.0511, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.335581302642822, + "rewards/margins": 0.7886467576026917, + "rewards/rejected": -5.124228477478027, + "sft_loss": 4.099038600921631, + "step": 4810 + }, + { + "epoch": 2.577019568489714, + "grad_norm": 0.5748646729931978, + "learning_rate": 5.9190878183399684e-08, + "logits/chosen": -0.7570444345474243, + "logits/rejected": -0.613994300365448, + "logps/chosen": -4.440479755401611, + "logps/rejected": -5.164405822753906, + "loss": 0.052, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.440479755401611, + "rewards/margins": 0.7239259481430054, + "rewards/rejected": -5.164405822753906, + "sft_loss": 4.215497016906738, + "step": 4815 + }, + { + "epoch": 2.5796956012711156, + "grad_norm": 0.4191146750687918, + "learning_rate": 5.845798097597748e-08, + "logits/chosen": -0.7721574306488037, + "logits/rejected": -0.6697301268577576, + "logps/chosen": -4.579713821411133, + "logps/rejected": -5.104306221008301, + "loss": 0.0529, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.579713821411133, + "rewards/margins": 0.5245919227600098, + "rewards/rejected": -5.104306221008301, + "sft_loss": 4.305607795715332, + "step": 4820 + }, + { + "epoch": 2.5823716340525174, + "grad_norm": 0.45022059402292774, + "learning_rate": 5.772936761116026e-08, + "logits/chosen": -0.7701061964035034, + "logits/rejected": -0.6287974715232849, + "logps/chosen": -4.400467395782471, + "logps/rejected": -5.098750114440918, + "loss": 0.051, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.400467395782471, + "rewards/margins": 0.6982828378677368, + "rewards/rejected": -5.098750114440918, + "sft_loss": 4.131747245788574, + "step": 4825 + }, + { + "epoch": 2.5850476668339186, + "grad_norm": 0.4799590961567342, + "learning_rate": 5.700504515795829e-08, + "logits/chosen": -0.8211865425109863, + "logits/rejected": -0.6615251302719116, + "logps/chosen": -4.437475681304932, + "logps/rejected": -5.135467529296875, + "loss": 0.0513, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.437475681304932, + "rewards/margins": 0.6979917287826538, + "rewards/rejected": -5.135467529296875, + "sft_loss": 4.257898330688477, + "step": 4830 + }, + { + "epoch": 2.5877236996153203, + "grad_norm": 0.42527193100918037, + "learning_rate": 5.628502064375101e-08, + "logits/chosen": -0.8720036745071411, + "logits/rejected": -0.6801129579544067, + "logps/chosen": -4.475311279296875, + "logps/rejected": -5.252869129180908, + "loss": 0.0507, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.475311279296875, + "rewards/margins": 0.777558445930481, + "rewards/rejected": -5.252869129180908, + "sft_loss": 4.221250057220459, + "step": 4835 + }, + { + "epoch": 2.5903997323967216, + "grad_norm": 0.5115033248703027, + "learning_rate": 5.55693010542197e-08, + "logits/chosen": -0.7869556546211243, + "logits/rejected": -0.5591322779655457, + "logps/chosen": -4.349687099456787, + "logps/rejected": -5.18341064453125, + "loss": 0.0485, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.349687099456787, + "rewards/margins": 0.8337236642837524, + "rewards/rejected": -5.18341064453125, + "sft_loss": 3.9441981315612793, + "step": 4840 + }, + { + "epoch": 2.5930757651781233, + "grad_norm": 0.7067733700435774, + "learning_rate": 5.485789333327856e-08, + "logits/chosen": -0.7598763108253479, + "logits/rejected": -0.7221094369888306, + "logps/chosen": -4.49146842956543, + "logps/rejected": -5.119671821594238, + "loss": 0.0522, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.49146842956543, + "rewards/margins": 0.628203809261322, + "rewards/rejected": -5.119671821594238, + "sft_loss": 4.2429094314575195, + "step": 4845 + }, + { + "epoch": 2.595751797959525, + "grad_norm": 0.45260597716363027, + "learning_rate": 5.4150804383008675e-08, + "logits/chosen": -0.910417914390564, + "logits/rejected": -0.7389578819274902, + "logps/chosen": -4.464565277099609, + "logps/rejected": -5.181889533996582, + "loss": 0.053, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.464565277099609, + "rewards/margins": 0.7173237204551697, + "rewards/rejected": -5.181889533996582, + "sft_loss": 4.224325656890869, + "step": 4850 + }, + { + "epoch": 2.5984278307409268, + "grad_norm": 0.5973972322326072, + "learning_rate": 5.344804106359002e-08, + "logits/chosen": -0.6693606376647949, + "logits/rejected": -0.5169059634208679, + "logps/chosen": -4.443115711212158, + "logps/rejected": -5.097043037414551, + "loss": 0.0524, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.443115711212158, + "rewards/margins": 0.653927206993103, + "rewards/rejected": -5.097043037414551, + "sft_loss": 4.195808410644531, + "step": 4855 + }, + { + "epoch": 2.601103863522328, + "grad_norm": 0.5755470171232112, + "learning_rate": 5.274961019323559e-08, + "logits/chosen": -0.7551354169845581, + "logits/rejected": -0.6369063854217529, + "logps/chosen": -4.399575710296631, + "logps/rejected": -5.089932918548584, + "loss": 0.0528, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.399575710296631, + "rewards/margins": 0.6903573870658875, + "rewards/rejected": -5.089932918548584, + "sft_loss": 4.221864700317383, + "step": 4860 + }, + { + "epoch": 2.6037798963037297, + "grad_norm": 0.4840213634050733, + "learning_rate": 5.205551854812451e-08, + "logits/chosen": -0.8901827931404114, + "logits/rejected": -0.8028414845466614, + "logps/chosen": -4.3761115074157715, + "logps/rejected": -5.150932312011719, + "loss": 0.0504, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.3761115074157715, + "rewards/margins": 0.7748211026191711, + "rewards/rejected": -5.150932312011719, + "sft_loss": 4.163114070892334, + "step": 4865 + }, + { + "epoch": 2.606455929085131, + "grad_norm": 0.8958131855189024, + "learning_rate": 5.1365772862337177e-08, + "logits/chosen": -0.6926102638244629, + "logits/rejected": -0.6045705080032349, + "logps/chosen": -4.361238956451416, + "logps/rejected": -5.185271263122559, + "loss": 0.0509, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.361238956451416, + "rewards/margins": 0.8240326642990112, + "rewards/rejected": -5.185271263122559, + "sft_loss": 4.081574440002441, + "step": 4870 + }, + { + "epoch": 2.6091319618665327, + "grad_norm": 0.6988608914200577, + "learning_rate": 5.068037982778905e-08, + "logits/chosen": -0.6733156442642212, + "logits/rejected": -0.5771912336349487, + "logps/chosen": -4.478516101837158, + "logps/rejected": -5.19637393951416, + "loss": 0.0516, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.478516101837158, + "rewards/margins": 0.7178576588630676, + "rewards/rejected": -5.19637393951416, + "sft_loss": 4.19520902633667, + "step": 4875 + }, + { + "epoch": 2.6118079946479344, + "grad_norm": 0.5200460676604345, + "learning_rate": 4.999934609416656e-08, + "logits/chosen": -0.7352452278137207, + "logits/rejected": -0.6223322153091431, + "logps/chosen": -4.333428859710693, + "logps/rejected": -5.200177192687988, + "loss": 0.051, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.333428859710693, + "rewards/margins": 0.8667477369308472, + "rewards/rejected": -5.200177192687988, + "sft_loss": 4.1180853843688965, + "step": 4880 + }, + { + "epoch": 2.614484027429336, + "grad_norm": 0.5114132721276469, + "learning_rate": 4.932267826886183e-08, + "logits/chosen": -0.6905040144920349, + "logits/rejected": -0.6525358557701111, + "logps/chosen": -4.4714531898498535, + "logps/rejected": -5.135130882263184, + "loss": 0.0526, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.4714531898498535, + "rewards/margins": 0.6636782884597778, + "rewards/rejected": -5.135130882263184, + "sft_loss": 4.293010711669922, + "step": 4885 + }, + { + "epoch": 2.6171600602107374, + "grad_norm": 0.3541675492702616, + "learning_rate": 4.8650382916909206e-08, + "logits/chosen": -0.8800719976425171, + "logits/rejected": -0.6866661310195923, + "logps/chosen": -4.409409523010254, + "logps/rejected": -5.073835849761963, + "loss": 0.052, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.409409523010254, + "rewards/margins": 0.6644265055656433, + "rewards/rejected": -5.073835849761963, + "sft_loss": 4.160528182983398, + "step": 4890 + }, + { + "epoch": 2.619836092992139, + "grad_norm": 0.5684592362716069, + "learning_rate": 4.7982466560920976e-08, + "logits/chosen": -0.7659167051315308, + "logits/rejected": -0.7323800921440125, + "logps/chosen": -4.4443440437316895, + "logps/rejected": -5.052102088928223, + "loss": 0.052, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.4443440437316895, + "rewards/margins": 0.6077579259872437, + "rewards/rejected": -5.052102088928223, + "sft_loss": 4.219048976898193, + "step": 4895 + }, + { + "epoch": 2.622512125773541, + "grad_norm": 0.6689392592042074, + "learning_rate": 4.7318935681024685e-08, + "logits/chosen": -0.7107598185539246, + "logits/rejected": -0.5510147213935852, + "logps/chosen": -4.424412250518799, + "logps/rejected": -5.242027282714844, + "loss": 0.0511, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.424412250518799, + "rewards/margins": 0.8176156282424927, + "rewards/rejected": -5.242027282714844, + "sft_loss": 4.165490627288818, + "step": 4900 + }, + { + "epoch": 2.625188158554942, + "grad_norm": 0.5722256802272633, + "learning_rate": 4.6659796714799745e-08, + "logits/chosen": -0.7005605697631836, + "logits/rejected": -0.576447606086731, + "logps/chosen": -4.384562969207764, + "logps/rejected": -5.328797817230225, + "loss": 0.0493, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.384562969207764, + "rewards/margins": 0.9442348480224609, + "rewards/rejected": -5.328797817230225, + "sft_loss": 4.149122714996338, + "step": 4905 + }, + { + "epoch": 2.627864191336344, + "grad_norm": 0.5219836508629162, + "learning_rate": 4.60050560572155e-08, + "logits/chosen": -0.7817445993423462, + "logits/rejected": -0.8545292615890503, + "logps/chosen": -4.5263991355896, + "logps/rejected": -5.273439884185791, + "loss": 0.0523, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.5263991355896, + "rewards/margins": 0.7470411658287048, + "rewards/rejected": -5.273439884185791, + "sft_loss": 4.291499137878418, + "step": 4910 + }, + { + "epoch": 2.6305402241177456, + "grad_norm": 0.38646571365518173, + "learning_rate": 4.535472006056834e-08, + "logits/chosen": -0.7773014903068542, + "logits/rejected": -0.585507333278656, + "logps/chosen": -4.419951438903809, + "logps/rejected": -5.040675640106201, + "loss": 0.0516, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.419951438903809, + "rewards/margins": 0.6207249164581299, + "rewards/rejected": -5.040675640106201, + "sft_loss": 4.128361225128174, + "step": 4915 + }, + { + "epoch": 2.6332162568991473, + "grad_norm": 0.4415088424078639, + "learning_rate": 4.470879503442132e-08, + "logits/chosen": -0.7938684225082397, + "logits/rejected": -0.6925408244132996, + "logps/chosen": -4.400835990905762, + "logps/rejected": -5.0848212242126465, + "loss": 0.051, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.400835990905762, + "rewards/margins": 0.6839861869812012, + "rewards/rejected": -5.0848212242126465, + "sft_loss": 4.152151584625244, + "step": 4920 + }, + { + "epoch": 2.6358922896805486, + "grad_norm": 0.5438520820103692, + "learning_rate": 4.406728724554154e-08, + "logits/chosen": -0.9053106307983398, + "logits/rejected": -0.6270970106124878, + "logps/chosen": -4.350544452667236, + "logps/rejected": -5.182711601257324, + "loss": 0.0503, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.350544452667236, + "rewards/margins": 0.8321673274040222, + "rewards/rejected": -5.182711601257324, + "sft_loss": 4.118407249450684, + "step": 4925 + }, + { + "epoch": 2.6385683224619503, + "grad_norm": 0.40830510209473897, + "learning_rate": 4.3430202917840664e-08, + "logits/chosen": -0.7182999849319458, + "logits/rejected": -0.5021204352378845, + "logps/chosen": -4.545349597930908, + "logps/rejected": -5.3235087394714355, + "loss": 0.0518, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.545349597930908, + "rewards/margins": 0.7781594395637512, + "rewards/rejected": -5.3235087394714355, + "sft_loss": 4.261865615844727, + "step": 4930 + }, + { + "epoch": 2.6412443552433515, + "grad_norm": 0.46964279049381513, + "learning_rate": 4.279754823231346e-08, + "logits/chosen": -0.776440441608429, + "logits/rejected": -0.5842413902282715, + "logps/chosen": -4.306726932525635, + "logps/rejected": -5.086239337921143, + "loss": 0.0504, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.306726932525635, + "rewards/margins": 0.7795121073722839, + "rewards/rejected": -5.086239337921143, + "sft_loss": 3.9869303703308105, + "step": 4935 + }, + { + "epoch": 2.6439203880247533, + "grad_norm": 0.49471229779296944, + "learning_rate": 4.216932932697859e-08, + "logits/chosen": -0.832381546497345, + "logits/rejected": -0.7716125249862671, + "logps/chosen": -4.284350395202637, + "logps/rejected": -4.896607398986816, + "loss": 0.0511, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.284350395202637, + "rewards/margins": 0.6122564077377319, + "rewards/rejected": -4.896607398986816, + "sft_loss": 4.027619361877441, + "step": 4940 + }, + { + "epoch": 2.646596420806155, + "grad_norm": 0.5063133009359136, + "learning_rate": 4.154555229681844e-08, + "logits/chosen": -0.7936744093894958, + "logits/rejected": -0.5885189175605774, + "logps/chosen": -4.427206993103027, + "logps/rejected": -5.143578052520752, + "loss": 0.0509, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.427206993103027, + "rewards/margins": 0.7163704633712769, + "rewards/rejected": -5.143578052520752, + "sft_loss": 4.04074764251709, + "step": 4945 + }, + { + "epoch": 2.6492724535875567, + "grad_norm": 0.5395838576200975, + "learning_rate": 4.092622319372069e-08, + "logits/chosen": -0.7596436142921448, + "logits/rejected": -0.5754319429397583, + "logps/chosen": -4.363346099853516, + "logps/rejected": -5.0439653396606445, + "loss": 0.0518, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.363346099853516, + "rewards/margins": 0.6806186437606812, + "rewards/rejected": -5.0439653396606445, + "sft_loss": 4.0527753829956055, + "step": 4950 + }, + { + "epoch": 2.651948486368958, + "grad_norm": 0.4273690281432811, + "learning_rate": 4.031134802641889e-08, + "logits/chosen": -0.754179060459137, + "logits/rejected": -0.7639588713645935, + "logps/chosen": -4.647191524505615, + "logps/rejected": -5.165177345275879, + "loss": 0.0515, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.647191524505615, + "rewards/margins": 0.5179857015609741, + "rewards/rejected": -5.165177345275879, + "sft_loss": 4.326084136962891, + "step": 4955 + }, + { + "epoch": 2.6546245191503597, + "grad_norm": 0.4601641742802231, + "learning_rate": 3.970093276043468e-08, + "logits/chosen": -0.6982657313346863, + "logits/rejected": -0.6274687051773071, + "logps/chosen": -4.353320121765137, + "logps/rejected": -5.106142997741699, + "loss": 0.052, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.353320121765137, + "rewards/margins": 0.7528237700462341, + "rewards/rejected": -5.106142997741699, + "sft_loss": 4.145398139953613, + "step": 4960 + }, + { + "epoch": 2.657300551931761, + "grad_norm": 0.4035041863470644, + "learning_rate": 3.9094983318019584e-08, + "logits/chosen": -0.8743463754653931, + "logits/rejected": -0.7296475172042847, + "logps/chosen": -4.384496212005615, + "logps/rejected": -5.0877203941345215, + "loss": 0.0517, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.384496212005615, + "rewards/margins": 0.7032240629196167, + "rewards/rejected": -5.0877203941345215, + "sft_loss": 4.198531150817871, + "step": 4965 + }, + { + "epoch": 2.6599765847131627, + "grad_norm": 0.4512012847530197, + "learning_rate": 3.849350557809789e-08, + "logits/chosen": -0.6640986204147339, + "logits/rejected": -0.653388500213623, + "logps/chosen": -4.503409385681152, + "logps/rejected": -5.102181434631348, + "loss": 0.0502, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.503409385681152, + "rewards/margins": 0.5987719297409058, + "rewards/rejected": -5.102181434631348, + "sft_loss": 4.060650825500488, + "step": 4970 + }, + { + "epoch": 2.6626526174945644, + "grad_norm": 0.6901863489404572, + "learning_rate": 3.789650537620903e-08, + "logits/chosen": -0.688258945941925, + "logits/rejected": -0.6832031011581421, + "logps/chosen": -4.540500164031982, + "logps/rejected": -5.2547407150268555, + "loss": 0.0513, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.540500164031982, + "rewards/margins": 0.7142406105995178, + "rewards/rejected": -5.2547407150268555, + "sft_loss": 4.2646074295043945, + "step": 4975 + }, + { + "epoch": 2.665328650275966, + "grad_norm": 0.42588906837966245, + "learning_rate": 3.730398850445182e-08, + "logits/chosen": -0.6220318078994751, + "logits/rejected": -0.6261580586433411, + "logps/chosen": -4.496462821960449, + "logps/rejected": -5.189952373504639, + "loss": 0.052, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.496462821960449, + "rewards/margins": 0.6934901475906372, + "rewards/rejected": -5.189952373504639, + "sft_loss": 4.15579080581665, + "step": 4980 + }, + { + "epoch": 2.6680046830573674, + "grad_norm": 0.4940453819815409, + "learning_rate": 3.671596071142735e-08, + "logits/chosen": -0.7255562543869019, + "logits/rejected": -0.5626224875450134, + "logps/chosen": -4.4464030265808105, + "logps/rejected": -5.3239946365356445, + "loss": 0.0508, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.4464030265808105, + "rewards/margins": 0.8775907754898071, + "rewards/rejected": -5.3239946365356445, + "sft_loss": 4.130679130554199, + "step": 4985 + }, + { + "epoch": 2.670680715838769, + "grad_norm": 0.533231803665976, + "learning_rate": 3.6132427702183996e-08, + "logits/chosen": -0.898454487323761, + "logits/rejected": -0.7337983846664429, + "logps/chosen": -4.380088806152344, + "logps/rejected": -5.301820278167725, + "loss": 0.0502, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.380088806152344, + "rewards/margins": 0.9217315912246704, + "rewards/rejected": -5.301820278167725, + "sft_loss": 4.153786659240723, + "step": 4990 + }, + { + "epoch": 2.6733567486201704, + "grad_norm": 0.4560585718830667, + "learning_rate": 3.555339513816147e-08, + "logits/chosen": -0.8258197903633118, + "logits/rejected": -0.8702704310417175, + "logps/chosen": -4.535338401794434, + "logps/rejected": -5.168300151824951, + "loss": 0.0529, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.535338401794434, + "rewards/margins": 0.6329620480537415, + "rewards/rejected": -5.168300151824951, + "sft_loss": 4.283738136291504, + "step": 4995 + }, + { + "epoch": 2.676032781401572, + "grad_norm": 0.4913083081311584, + "learning_rate": 3.497886863713639e-08, + "logits/chosen": -0.7957427501678467, + "logits/rejected": -0.7848941087722778, + "logps/chosen": -4.596715927124023, + "logps/rejected": -5.212932586669922, + "loss": 0.052, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.596715927124023, + "rewards/margins": 0.6162165403366089, + "rewards/rejected": -5.212932586669922, + "sft_loss": 4.317381381988525, + "step": 5000 + }, + { + "epoch": 2.678708814182974, + "grad_norm": 0.5146051345946518, + "learning_rate": 3.440885377316721e-08, + "logits/chosen": -0.7044495344161987, + "logits/rejected": -0.6470869779586792, + "logps/chosen": -4.40933895111084, + "logps/rejected": -4.96150016784668, + "loss": 0.0517, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.40933895111084, + "rewards/margins": 0.5521610975265503, + "rewards/rejected": -4.96150016784668, + "sft_loss": 4.148724555969238, + "step": 5005 + }, + { + "epoch": 2.6813848469643755, + "grad_norm": 0.5127153318494577, + "learning_rate": 3.384335607654082e-08, + "logits/chosen": -0.662913978099823, + "logits/rejected": -0.6405649185180664, + "logps/chosen": -4.492884159088135, + "logps/rejected": -5.25106143951416, + "loss": 0.0509, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.492884159088135, + "rewards/margins": 0.7581772804260254, + "rewards/rejected": -5.25106143951416, + "sft_loss": 4.1412672996521, + "step": 5010 + }, + { + "epoch": 2.684060879745777, + "grad_norm": 0.49440876947556955, + "learning_rate": 3.328238103371811e-08, + "logits/chosen": -0.755716860294342, + "logits/rejected": -0.7262305021286011, + "logps/chosen": -4.506894111633301, + "logps/rejected": -5.315282344818115, + "loss": 0.0503, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.506894111633301, + "rewards/margins": 0.8083890676498413, + "rewards/rejected": -5.315282344818115, + "sft_loss": 4.149899482727051, + "step": 5015 + }, + { + "epoch": 2.6867369125271785, + "grad_norm": 0.5390351293879576, + "learning_rate": 3.272593408728169e-08, + "logits/chosen": -0.8326853513717651, + "logits/rejected": -0.5620445013046265, + "logps/chosen": -4.3972320556640625, + "logps/rejected": -5.159958839416504, + "loss": 0.0506, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.3972320556640625, + "rewards/margins": 0.7627268433570862, + "rewards/rejected": -5.159958839416504, + "sft_loss": 4.1451311111450195, + "step": 5020 + }, + { + "epoch": 2.6894129453085798, + "grad_norm": 0.4278067734510489, + "learning_rate": 3.217402063588204e-08, + "logits/chosen": -0.8494710922241211, + "logits/rejected": -0.6724889874458313, + "logps/chosen": -4.459986686706543, + "logps/rejected": -5.137322902679443, + "loss": 0.0525, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.459986686706543, + "rewards/margins": 0.6773372888565063, + "rewards/rejected": -5.137322902679443, + "sft_loss": 4.2519850730896, + "step": 5025 + }, + { + "epoch": 2.6920889780899815, + "grad_norm": 0.5935683504662128, + "learning_rate": 3.162664603418608e-08, + "logits/chosen": -0.7728666067123413, + "logits/rejected": -0.7169617414474487, + "logps/chosen": -4.379161357879639, + "logps/rejected": -5.187068939208984, + "loss": 0.0511, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.379161357879639, + "rewards/margins": 0.807907223701477, + "rewards/rejected": -5.187068939208984, + "sft_loss": 4.1201324462890625, + "step": 5030 + }, + { + "epoch": 2.694765010871383, + "grad_norm": 0.38628884775927563, + "learning_rate": 3.1083815592824416e-08, + "logits/chosen": -0.752131462097168, + "logits/rejected": -0.6836836934089661, + "logps/chosen": -4.516790866851807, + "logps/rejected": -5.249283790588379, + "loss": 0.0515, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.516790866851807, + "rewards/margins": 0.7324928045272827, + "rewards/rejected": -5.249283790588379, + "sft_loss": 4.276049613952637, + "step": 5035 + }, + { + "epoch": 2.697441043652785, + "grad_norm": 0.5871576493438839, + "learning_rate": 3.054553457834053e-08, + "logits/chosen": -0.6166585087776184, + "logits/rejected": -0.6771060824394226, + "logps/chosen": -4.626035213470459, + "logps/rejected": -5.179619789123535, + "loss": 0.0527, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.626035213470459, + "rewards/margins": 0.5535842180252075, + "rewards/rejected": -5.179619789123535, + "sft_loss": 4.302709102630615, + "step": 5040 + }, + { + "epoch": 2.700117076434186, + "grad_norm": 0.6160064556957795, + "learning_rate": 3.0011808213139036e-08, + "logits/chosen": -0.6177735328674316, + "logits/rejected": -0.6710564494132996, + "logps/chosen": -4.500868320465088, + "logps/rejected": -5.045173168182373, + "loss": 0.0515, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.500868320465088, + "rewards/margins": 0.5443046689033508, + "rewards/rejected": -5.045173168182373, + "sft_loss": 4.213364124298096, + "step": 5045 + }, + { + "epoch": 2.702793109215588, + "grad_norm": 0.6859916514482486, + "learning_rate": 2.948264167543568e-08, + "logits/chosen": -0.7522085905075073, + "logits/rejected": -0.6835848093032837, + "logps/chosen": -4.500675678253174, + "logps/rejected": -5.0596089363098145, + "loss": 0.0518, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.500675678253174, + "rewards/margins": 0.5589330792427063, + "rewards/rejected": -5.0596089363098145, + "sft_loss": 4.171204566955566, + "step": 5050 + }, + { + "epoch": 2.7054691419969896, + "grad_norm": 0.4799906826336578, + "learning_rate": 2.8958040099206216e-08, + "logits/chosen": -0.8550014495849609, + "logits/rejected": -0.7934740781784058, + "logps/chosen": -4.4887800216674805, + "logps/rejected": -5.191119194030762, + "loss": 0.0512, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.4887800216674805, + "rewards/margins": 0.7023388743400574, + "rewards/rejected": -5.191119194030762, + "sft_loss": 4.201326847076416, + "step": 5055 + }, + { + "epoch": 2.708145174778391, + "grad_norm": 0.7859840677597958, + "learning_rate": 2.843800857413775e-08, + "logits/chosen": -0.6860078573226929, + "logits/rejected": -0.654365062713623, + "logps/chosen": -4.445427417755127, + "logps/rejected": -4.997714996337891, + "loss": 0.0529, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.445427417755127, + "rewards/margins": 0.5522874593734741, + "rewards/rejected": -4.997714996337891, + "sft_loss": 4.158383846282959, + "step": 5060 + }, + { + "epoch": 2.7108212075597926, + "grad_norm": 0.4526236800027281, + "learning_rate": 2.7922552145578203e-08, + "logits/chosen": -0.7144232988357544, + "logits/rejected": -0.45682835578918457, + "logps/chosen": -4.429794788360596, + "logps/rejected": -5.128900051116943, + "loss": 0.0511, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.429794788360596, + "rewards/margins": 0.6991047263145447, + "rewards/rejected": -5.128900051116943, + "sft_loss": 4.1685028076171875, + "step": 5065 + }, + { + "epoch": 2.7134972403411943, + "grad_norm": 0.4394735274325847, + "learning_rate": 2.7411675814488277e-08, + "logits/chosen": -0.697012722492218, + "logits/rejected": -0.56499183177948, + "logps/chosen": -4.419262886047363, + "logps/rejected": -5.056243896484375, + "loss": 0.0507, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.419262886047363, + "rewards/margins": 0.6369813680648804, + "rewards/rejected": -5.056243896484375, + "sft_loss": 4.206353664398193, + "step": 5070 + }, + { + "epoch": 2.7161732731225956, + "grad_norm": 0.5891818486629351, + "learning_rate": 2.690538453739216e-08, + "logits/chosen": -0.7441337704658508, + "logits/rejected": -0.706308901309967, + "logps/chosen": -4.351699352264404, + "logps/rejected": -4.927661895751953, + "loss": 0.0531, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.351699352264404, + "rewards/margins": 0.5759629011154175, + "rewards/rejected": -4.927661895751953, + "sft_loss": 4.168587684631348, + "step": 5075 + }, + { + "epoch": 2.7188493059039973, + "grad_norm": 0.43729963617249973, + "learning_rate": 2.6403683226330298e-08, + "logits/chosen": -0.8585704565048218, + "logits/rejected": -0.6886411309242249, + "logps/chosen": -4.4380598068237305, + "logps/rejected": -5.095976829528809, + "loss": 0.0517, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.4380598068237305, + "rewards/margins": 0.6579168438911438, + "rewards/rejected": -5.095976829528809, + "sft_loss": 4.219568729400635, + "step": 5080 + }, + { + "epoch": 2.721525338685399, + "grad_norm": 0.45646069193158206, + "learning_rate": 2.5906576748810804e-08, + "logits/chosen": -0.8393117189407349, + "logits/rejected": -0.7262551784515381, + "logps/chosen": -4.536147117614746, + "logps/rejected": -5.36475944519043, + "loss": 0.051, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.536147117614746, + "rewards/margins": 0.8286125063896179, + "rewards/rejected": -5.36475944519043, + "sft_loss": 4.3527655601501465, + "step": 5085 + }, + { + "epoch": 2.7242013714668003, + "grad_norm": 0.40349095053106443, + "learning_rate": 2.5414069927763016e-08, + "logits/chosen": -0.9187151789665222, + "logits/rejected": -0.7296867370605469, + "logps/chosen": -4.362442970275879, + "logps/rejected": -5.214807033538818, + "loss": 0.0498, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.362442970275879, + "rewards/margins": 0.8523637056350708, + "rewards/rejected": -5.214807033538818, + "sft_loss": 4.102611064910889, + "step": 5090 + }, + { + "epoch": 2.726877404248202, + "grad_norm": 0.31381315759075395, + "learning_rate": 2.4926167541490185e-08, + "logits/chosen": -0.941425621509552, + "logits/rejected": -0.7067974805831909, + "logps/chosen": -4.382145404815674, + "logps/rejected": -5.232102870941162, + "loss": 0.0508, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.382145404815674, + "rewards/margins": 0.849958062171936, + "rewards/rejected": -5.232102870941162, + "sft_loss": 4.168827056884766, + "step": 5095 + }, + { + "epoch": 2.7295534370296037, + "grad_norm": 0.4666743335275937, + "learning_rate": 2.4442874323623574e-08, + "logits/chosen": -0.7078942060470581, + "logits/rejected": -0.5640963912010193, + "logps/chosen": -4.456545829772949, + "logps/rejected": -5.237743377685547, + "loss": 0.0516, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.456545829772949, + "rewards/margins": 0.7811979055404663, + "rewards/rejected": -5.237743377685547, + "sft_loss": 4.171876430511475, + "step": 5100 + }, + { + "epoch": 2.7322294698110055, + "grad_norm": 0.5009611946781228, + "learning_rate": 2.396419496307589e-08, + "logits/chosen": -0.7781413197517395, + "logits/rejected": -0.5991483330726624, + "logps/chosen": -4.574313163757324, + "logps/rejected": -5.172600746154785, + "loss": 0.051, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.574313163757324, + "rewards/margins": 0.5982874631881714, + "rewards/rejected": -5.172600746154785, + "sft_loss": 4.223634243011475, + "step": 5105 + }, + { + "epoch": 2.7349055025924067, + "grad_norm": 0.40995838518576144, + "learning_rate": 2.349013410399653e-08, + "logits/chosen": -0.769730269908905, + "logits/rejected": -0.6913172006607056, + "logps/chosen": -4.531946182250977, + "logps/rejected": -5.277296543121338, + "loss": 0.0505, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.531946182250977, + "rewards/margins": 0.7453504800796509, + "rewards/rejected": -5.277296543121338, + "sft_loss": 4.089993476867676, + "step": 5110 + }, + { + "epoch": 2.7375815353738084, + "grad_norm": 0.4386421790875959, + "learning_rate": 2.3020696345725954e-08, + "logits/chosen": -0.8437735438346863, + "logits/rejected": -0.6403151750564575, + "logps/chosen": -4.284999847412109, + "logps/rejected": -5.3409104347229, + "loss": 0.0495, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.284999847412109, + "rewards/margins": 1.0559107065200806, + "rewards/rejected": -5.3409104347229, + "sft_loss": 4.055171966552734, + "step": 5115 + }, + { + "epoch": 2.7402575681552097, + "grad_norm": 0.6096353214247594, + "learning_rate": 2.2555886242751398e-08, + "logits/chosen": -0.7667422294616699, + "logits/rejected": -0.7132569551467896, + "logps/chosen": -4.329689025878906, + "logps/rejected": -5.106996059417725, + "loss": 0.0513, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.329689025878906, + "rewards/margins": 0.7773071527481079, + "rewards/rejected": -5.106996059417725, + "sft_loss": 4.15250825881958, + "step": 5120 + }, + { + "epoch": 2.7429336009366114, + "grad_norm": 0.739154952389392, + "learning_rate": 2.2095708304662453e-08, + "logits/chosen": -0.8948475122451782, + "logits/rejected": -0.6626867055892944, + "logps/chosen": -4.5080671310424805, + "logps/rejected": -5.133249282836914, + "loss": 0.0513, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.5080671310424805, + "rewards/margins": 0.6251822710037231, + "rewards/rejected": -5.133249282836914, + "sft_loss": 4.2104692459106445, + "step": 5125 + }, + { + "epoch": 2.745609633718013, + "grad_norm": 0.40146489525306356, + "learning_rate": 2.16401669961076e-08, + "logits/chosen": -0.8997477293014526, + "logits/rejected": -0.6744705438613892, + "logps/chosen": -4.385453701019287, + "logps/rejected": -5.082536220550537, + "loss": 0.0517, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.385453701019287, + "rewards/margins": 0.6970824003219604, + "rewards/rejected": -5.082536220550537, + "sft_loss": 4.138113975524902, + "step": 5130 + }, + { + "epoch": 2.748285666499415, + "grad_norm": 0.5550146219227535, + "learning_rate": 2.1189266736750532e-08, + "logits/chosen": -0.6366299390792847, + "logits/rejected": -0.6059743165969849, + "logps/chosen": -4.567104816436768, + "logps/rejected": -5.097182273864746, + "loss": 0.0529, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.567104816436768, + "rewards/margins": 0.5300775766372681, + "rewards/rejected": -5.097182273864746, + "sft_loss": 4.227593898773193, + "step": 5135 + }, + { + "epoch": 2.750961699280816, + "grad_norm": 0.7238548964288543, + "learning_rate": 2.0743011901227623e-08, + "logits/chosen": -0.7072452902793884, + "logits/rejected": -0.5823957920074463, + "logps/chosen": -4.160729885101318, + "logps/rejected": -4.983979225158691, + "loss": 0.0506, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.160729885101318, + "rewards/margins": 0.8232491612434387, + "rewards/rejected": -4.983979225158691, + "sft_loss": 3.919672727584839, + "step": 5140 + }, + { + "epoch": 2.753637732062218, + "grad_norm": 0.6313164538400142, + "learning_rate": 2.030140681910508e-08, + "logits/chosen": -0.7149503827095032, + "logits/rejected": -0.5912491083145142, + "logps/chosen": -4.391037464141846, + "logps/rejected": -5.0824480056762695, + "loss": 0.051, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.391037464141846, + "rewards/margins": 0.6914108991622925, + "rewards/rejected": -5.0824480056762695, + "sft_loss": 4.110098838806152, + "step": 5145 + }, + { + "epoch": 2.756313764843619, + "grad_norm": 0.36572611483915707, + "learning_rate": 1.986445577483753e-08, + "logits/chosen": -0.7998876571655273, + "logits/rejected": -0.6524479985237122, + "logps/chosen": -4.311178207397461, + "logps/rejected": -5.040547847747803, + "loss": 0.0511, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.311178207397461, + "rewards/margins": 0.7293696999549866, + "rewards/rejected": -5.040547847747803, + "sft_loss": 4.074983596801758, + "step": 5150 + }, + { + "epoch": 2.758989797625021, + "grad_norm": 0.40350920608337093, + "learning_rate": 1.9432163007725765e-08, + "logits/chosen": -0.8454571962356567, + "logits/rejected": -0.7627595663070679, + "logps/chosen": -4.490768909454346, + "logps/rejected": -5.0736775398254395, + "loss": 0.0522, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.490768909454346, + "rewards/margins": 0.5829084515571594, + "rewards/rejected": -5.0736775398254395, + "sft_loss": 4.289803504943848, + "step": 5155 + }, + { + "epoch": 2.7616658304064226, + "grad_norm": 0.35743425580419486, + "learning_rate": 1.9004532711876297e-08, + "logits/chosen": -0.7591687440872192, + "logits/rejected": -0.7784875631332397, + "logps/chosen": -4.46417760848999, + "logps/rejected": -5.038629531860352, + "loss": 0.0519, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.46417760848999, + "rewards/margins": 0.5744518637657166, + "rewards/rejected": -5.038629531860352, + "sft_loss": 4.209836483001709, + "step": 5160 + }, + { + "epoch": 2.7643418631878243, + "grad_norm": 0.40974843728329524, + "learning_rate": 1.8581569036159928e-08, + "logits/chosen": -0.7757205367088318, + "logits/rejected": -0.5620787739753723, + "logps/chosen": -4.352284908294678, + "logps/rejected": -5.135004997253418, + "loss": 0.0511, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.352284908294678, + "rewards/margins": 0.7827197909355164, + "rewards/rejected": -5.135004997253418, + "sft_loss": 4.145803928375244, + "step": 5165 + }, + { + "epoch": 2.7670178959692255, + "grad_norm": 0.38651979743253223, + "learning_rate": 1.8163276084172285e-08, + "logits/chosen": -0.7541450262069702, + "logits/rejected": -0.6550502181053162, + "logps/chosen": -4.484699249267578, + "logps/rejected": -5.219845771789551, + "loss": 0.0504, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.484699249267578, + "rewards/margins": 0.7351458072662354, + "rewards/rejected": -5.219845771789551, + "sft_loss": 4.190016746520996, + "step": 5170 + }, + { + "epoch": 2.7696939287506273, + "grad_norm": 0.4746599739406412, + "learning_rate": 1.7749657914193194e-08, + "logits/chosen": -0.7308141589164734, + "logits/rejected": -0.6888738870620728, + "logps/chosen": -4.515814304351807, + "logps/rejected": -5.2973856925964355, + "loss": 0.0499, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.515814304351807, + "rewards/margins": 0.7815715074539185, + "rewards/rejected": -5.2973856925964355, + "sft_loss": 4.202316761016846, + "step": 5175 + }, + { + "epoch": 2.7723699615320285, + "grad_norm": 0.4964614722849225, + "learning_rate": 1.7340718539148203e-08, + "logits/chosen": -0.6771430969238281, + "logits/rejected": -0.6231086850166321, + "logps/chosen": -4.451423168182373, + "logps/rejected": -5.105251789093018, + "loss": 0.0525, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.451423168182373, + "rewards/margins": 0.653827965259552, + "rewards/rejected": -5.105251789093018, + "sft_loss": 4.232422351837158, + "step": 5180 + }, + { + "epoch": 2.7750459943134302, + "grad_norm": 0.38468316076352804, + "learning_rate": 1.6936461926568724e-08, + "logits/chosen": -0.7071312665939331, + "logits/rejected": -0.5591712594032288, + "logps/chosen": -4.469587802886963, + "logps/rejected": -5.192925453186035, + "loss": 0.0518, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.469587802886963, + "rewards/margins": 0.7233376502990723, + "rewards/rejected": -5.192925453186035, + "sft_loss": 4.177231788635254, + "step": 5185 + }, + { + "epoch": 2.777722027094832, + "grad_norm": 0.44091666285478454, + "learning_rate": 1.6536891998554346e-08, + "logits/chosen": -0.8431358337402344, + "logits/rejected": -0.6786155700683594, + "logps/chosen": -4.418313980102539, + "logps/rejected": -5.12216854095459, + "loss": 0.0504, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.418313980102539, + "rewards/margins": 0.7038545608520508, + "rewards/rejected": -5.12216854095459, + "sft_loss": 4.129395961761475, + "step": 5190 + }, + { + "epoch": 2.7803980598762337, + "grad_norm": 0.4200872795921329, + "learning_rate": 1.6142012631734093e-08, + "logits/chosen": -0.6793020963668823, + "logits/rejected": -0.5708988904953003, + "logps/chosen": -4.42996883392334, + "logps/rejected": -5.1513671875, + "loss": 0.0499, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.42996883392334, + "rewards/margins": 0.7213989496231079, + "rewards/rejected": -5.1513671875, + "sft_loss": 4.167457580566406, + "step": 5195 + }, + { + "epoch": 2.783074092657635, + "grad_norm": 0.3583087592611993, + "learning_rate": 1.575182765722949e-08, + "logits/chosen": -0.8591778874397278, + "logits/rejected": -0.6734490990638733, + "logps/chosen": -4.4382429122924805, + "logps/rejected": -5.177542209625244, + "loss": 0.0509, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.4382429122924805, + "rewards/margins": 0.7392994165420532, + "rewards/rejected": -5.177542209625244, + "sft_loss": 4.069396018981934, + "step": 5200 + }, + { + "epoch": 2.783074092657635, + "eval_logits/chosen": -0.2376498281955719, + "eval_logits/rejected": -0.15155045688152313, + "eval_logps/chosen": -4.39319372177124, + "eval_logps/rejected": -5.099259376525879, + "eval_loss": 0.05041056498885155, + "eval_rewards/accuracies": 0.6854599118232727, + "eval_rewards/chosen": -4.39319372177124, + "eval_rewards/margins": 0.70606529712677, + "eval_rewards/rejected": -5.099259376525879, + "eval_runtime": 43.6092, + "eval_samples_per_second": 30.842, + "eval_sft_loss": 4.013496398925781, + "eval_steps_per_second": 7.728, + "step": 5200 + }, + { + "epoch": 2.7857501254390367, + "grad_norm": 0.6691376618275345, + "learning_rate": 1.536634086061672e-08, + "logits/chosen": -0.7162854075431824, + "logits/rejected": -0.7120442986488342, + "logps/chosen": -4.272818088531494, + "logps/rejected": -5.1587724685668945, + "loss": 0.0498, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.272818088531494, + "rewards/margins": 0.8859542012214661, + "rewards/rejected": -5.1587724685668945, + "sft_loss": 4.018543243408203, + "step": 5205 + }, + { + "epoch": 2.788426158220438, + "grad_norm": 0.6331576336219715, + "learning_rate": 1.4985555981890495e-08, + "logits/chosen": -0.770778477191925, + "logits/rejected": -0.6682685017585754, + "logps/chosen": -4.637447357177734, + "logps/rejected": -5.280395030975342, + "loss": 0.0523, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.637447357177734, + "rewards/margins": 0.6429480314254761, + "rewards/rejected": -5.280395030975342, + "sft_loss": 4.285575866699219, + "step": 5210 + }, + { + "epoch": 2.7911021910018396, + "grad_norm": 0.45844722316500985, + "learning_rate": 1.4609476715427226e-08, + "logits/chosen": -0.7665554285049438, + "logits/rejected": -0.7006998658180237, + "logps/chosen": -4.294949531555176, + "logps/rejected": -5.111362457275391, + "loss": 0.05, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.294949531555176, + "rewards/margins": 0.8164127469062805, + "rewards/rejected": -5.111362457275391, + "sft_loss": 4.0890374183654785, + "step": 5215 + }, + { + "epoch": 2.7937782237832414, + "grad_norm": 0.46167473933351355, + "learning_rate": 1.4238106709949792e-08, + "logits/chosen": -0.7603719234466553, + "logits/rejected": -0.704023003578186, + "logps/chosen": -4.407052993774414, + "logps/rejected": -5.269985198974609, + "loss": 0.0501, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.407052993774414, + "rewards/margins": 0.8629329800605774, + "rewards/rejected": -5.269985198974609, + "sft_loss": 4.130857467651367, + "step": 5220 + }, + { + "epoch": 2.796454256564643, + "grad_norm": 0.6465361159968553, + "learning_rate": 1.3871449568491511e-08, + "logits/chosen": -0.6923054456710815, + "logits/rejected": -0.5152607560157776, + "logps/chosen": -4.521544456481934, + "logps/rejected": -5.11342716217041, + "loss": 0.0532, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.521544456481934, + "rewards/margins": 0.5918827056884766, + "rewards/rejected": -5.11342716217041, + "sft_loss": 4.219715118408203, + "step": 5225 + }, + { + "epoch": 2.7991302893460444, + "grad_norm": 0.5335497897965382, + "learning_rate": 1.3509508848361606e-08, + "logits/chosen": -0.8408746719360352, + "logits/rejected": -0.7014719843864441, + "logps/chosen": -4.4506120681762695, + "logps/rejected": -5.232138633728027, + "loss": 0.0495, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.4506120681762695, + "rewards/margins": 0.7815271615982056, + "rewards/rejected": -5.232138633728027, + "sft_loss": 4.041218280792236, + "step": 5230 + }, + { + "epoch": 2.801806322127446, + "grad_norm": 0.4340640550491573, + "learning_rate": 1.3152288061110517e-08, + "logits/chosen": -0.8067764043807983, + "logits/rejected": -0.6543148159980774, + "logps/chosen": -4.338916778564453, + "logps/rejected": -5.174741744995117, + "loss": 0.0499, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -4.338916778564453, + "rewards/margins": 0.8358249664306641, + "rewards/rejected": -5.174741744995117, + "sft_loss": 4.055007457733154, + "step": 5235 + }, + { + "epoch": 2.804482354908848, + "grad_norm": 0.4702838319568216, + "learning_rate": 1.2799790672495814e-08, + "logits/chosen": -0.8347498178482056, + "logits/rejected": -0.6073392629623413, + "logps/chosen": -4.421563148498535, + "logps/rejected": -5.192359447479248, + "loss": 0.0507, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.421563148498535, + "rewards/margins": 0.7707957029342651, + "rewards/rejected": -5.192359447479248, + "sft_loss": 4.199678897857666, + "step": 5240 + }, + { + "epoch": 2.807158387690249, + "grad_norm": 0.40117546884872646, + "learning_rate": 1.2452020102448835e-08, + "logits/chosen": -0.7439224123954773, + "logits/rejected": -0.6960216760635376, + "logps/chosen": -4.4497785568237305, + "logps/rejected": -5.058270454406738, + "loss": 0.0525, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.4497785568237305, + "rewards/margins": 0.6084924340248108, + "rewards/rejected": -5.058270454406738, + "sft_loss": 4.284776210784912, + "step": 5245 + }, + { + "epoch": 2.8098344204716508, + "grad_norm": 0.4585374000452793, + "learning_rate": 1.2108979725041103e-08, + "logits/chosen": -0.8621365427970886, + "logits/rejected": -0.7040331363677979, + "logps/chosen": -4.3691277503967285, + "logps/rejected": -5.159308910369873, + "loss": 0.051, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.3691277503967285, + "rewards/margins": 0.7901814579963684, + "rewards/rejected": -5.159308910369873, + "sft_loss": 4.125119686126709, + "step": 5250 + }, + { + "epoch": 2.8125104532530525, + "grad_norm": 0.5149332418443064, + "learning_rate": 1.1770672868451958e-08, + "logits/chosen": -0.786795973777771, + "logits/rejected": -0.5567874908447266, + "logps/chosen": -4.3395161628723145, + "logps/rejected": -5.155367374420166, + "loss": 0.0511, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.3395161628723145, + "rewards/margins": 0.8158513307571411, + "rewards/rejected": -5.155367374420166, + "sft_loss": 4.167263984680176, + "step": 5255 + }, + { + "epoch": 2.8151864860344538, + "grad_norm": 0.4589040615922559, + "learning_rate": 1.1437102814935872e-08, + "logits/chosen": -0.7088602781295776, + "logits/rejected": -0.6603358387947083, + "logps/chosen": -4.456506252288818, + "logps/rejected": -5.139347076416016, + "loss": 0.0537, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.456506252288818, + "rewards/margins": 0.6828408241271973, + "rewards/rejected": -5.139347076416016, + "sft_loss": 4.205148220062256, + "step": 5260 + }, + { + "epoch": 2.8178625188158555, + "grad_norm": 0.41054547983463774, + "learning_rate": 1.1108272800791018e-08, + "logits/chosen": -0.8841502070426941, + "logits/rejected": -0.6486460566520691, + "logps/chosen": -4.255162239074707, + "logps/rejected": -5.279638290405273, + "loss": 0.0492, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -4.255162239074707, + "rewards/margins": 1.024476408958435, + "rewards/rejected": -5.279638290405273, + "sft_loss": 4.01407527923584, + "step": 5265 + }, + { + "epoch": 2.820538551597257, + "grad_norm": 0.3870327970812567, + "learning_rate": 1.078418601632769e-08, + "logits/chosen": -0.72391676902771, + "logits/rejected": -0.5991867780685425, + "logps/chosen": -4.367644309997559, + "logps/rejected": -5.057244777679443, + "loss": 0.0517, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.367644309997559, + "rewards/margins": 0.689600944519043, + "rewards/rejected": -5.057244777679443, + "sft_loss": 4.147953033447266, + "step": 5270 + }, + { + "epoch": 2.8232145843786585, + "grad_norm": 0.49824908401362694, + "learning_rate": 1.0464845605837159e-08, + "logits/chosen": -0.729150116443634, + "logits/rejected": -0.6012585163116455, + "logps/chosen": -4.400598049163818, + "logps/rejected": -5.120515823364258, + "loss": 0.0515, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.400598049163818, + "rewards/margins": 0.7199177742004395, + "rewards/rejected": -5.120515823364258, + "sft_loss": 4.194445610046387, + "step": 5275 + }, + { + "epoch": 2.82589061716006, + "grad_norm": 0.37218461921657003, + "learning_rate": 1.0150254667561642e-08, + "logits/chosen": -0.7861107587814331, + "logits/rejected": -0.5771303176879883, + "logps/chosen": -4.436859130859375, + "logps/rejected": -5.288681507110596, + "loss": 0.0522, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.436859130859375, + "rewards/margins": 0.851822018623352, + "rewards/rejected": -5.288681507110596, + "sft_loss": 4.093539237976074, + "step": 5280 + }, + { + "epoch": 2.828566649941462, + "grad_norm": 0.37692200660419406, + "learning_rate": 9.840416253663719e-09, + "logits/chosen": -0.8008987307548523, + "logits/rejected": -0.6927303075790405, + "logps/chosen": -4.387301445007324, + "logps/rejected": -5.200824737548828, + "loss": 0.0511, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.387301445007324, + "rewards/margins": 0.8135232925415039, + "rewards/rejected": -5.200824737548828, + "sft_loss": 4.164353847503662, + "step": 5285 + }, + { + "epoch": 2.8312426827228636, + "grad_norm": 0.43110480795076206, + "learning_rate": 9.535333370197074e-09, + "logits/chosen": -0.7683074474334717, + "logits/rejected": -0.6206791996955872, + "logps/chosen": -4.306056976318359, + "logps/rejected": -5.059770107269287, + "loss": 0.0504, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.306056976318359, + "rewards/margins": 0.7537132501602173, + "rewards/rejected": -5.059770107269287, + "sft_loss": 4.051373481750488, + "step": 5290 + }, + { + "epoch": 2.833918715504265, + "grad_norm": 0.39348380949619655, + "learning_rate": 9.23500897707713e-09, + "logits/chosen": -0.8012930750846863, + "logits/rejected": -0.6044802069664001, + "logps/chosen": -4.543567180633545, + "logps/rejected": -5.1686201095581055, + "loss": 0.0516, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.543567180633545, + "rewards/margins": 0.6250527501106262, + "rewards/rejected": -5.1686201095581055, + "sft_loss": 4.221306800842285, + "step": 5295 + }, + { + "epoch": 2.8365947482856666, + "grad_norm": 0.6265290893699348, + "learning_rate": 8.939445988052574e-09, + "logits/chosen": -0.7233133912086487, + "logits/rejected": -0.6620453596115112, + "logps/chosen": -4.482752799987793, + "logps/rejected": -5.183930397033691, + "loss": 0.0505, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.482752799987793, + "rewards/margins": 0.7011777758598328, + "rewards/rejected": -5.183930397033691, + "sft_loss": 4.141602516174316, + "step": 5300 + }, + { + "epoch": 2.839270781067068, + "grad_norm": 0.5143935043837382, + "learning_rate": 8.648647270676656e-09, + "logits/chosen": -0.6966943740844727, + "logits/rejected": -0.601551353931427, + "logps/chosen": -4.2645697593688965, + "logps/rejected": -5.0711989402771, + "loss": 0.0501, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.2645697593688965, + "rewards/margins": 0.8066291809082031, + "rewards/rejected": -5.0711989402771, + "sft_loss": 3.9466934204101562, + "step": 5305 + }, + { + "epoch": 2.8419468138484696, + "grad_norm": 0.5979309433728134, + "learning_rate": 8.362615646279991e-09, + "logits/chosen": -0.918570876121521, + "logits/rejected": -0.6350168585777283, + "logps/chosen": -4.436649322509766, + "logps/rejected": -5.3039045333862305, + "loss": 0.0509, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.436649322509766, + "rewards/margins": 0.8672553300857544, + "rewards/rejected": -5.3039045333862305, + "sft_loss": 4.19779109954834, + "step": 5310 + }, + { + "epoch": 2.8446228466298713, + "grad_norm": 0.5780580944954342, + "learning_rate": 8.081353889942466e-09, + "logits/chosen": -0.6806483268737793, + "logits/rejected": -0.500455915927887, + "logps/chosen": -4.434041500091553, + "logps/rejected": -5.082981586456299, + "loss": 0.0522, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.434041500091553, + "rewards/margins": 0.6489400267601013, + "rewards/rejected": -5.082981586456299, + "sft_loss": 4.191941261291504, + "step": 5315 + }, + { + "epoch": 2.847298879411273, + "grad_norm": 0.3981472388972543, + "learning_rate": 7.804864730467042e-09, + "logits/chosen": -0.6645959615707397, + "logits/rejected": -0.6632333397865295, + "logps/chosen": -4.3376359939575195, + "logps/rejected": -5.054964542388916, + "loss": 0.0506, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.3376359939575195, + "rewards/margins": 0.7173280715942383, + "rewards/rejected": -5.054964542388916, + "sft_loss": 4.088263988494873, + "step": 5320 + }, + { + "epoch": 2.8499749121926743, + "grad_norm": 0.5496295929814721, + "learning_rate": 7.533150850352665e-09, + "logits/chosen": -0.6918113231658936, + "logits/rejected": -0.5065929889678955, + "logps/chosen": -4.538696765899658, + "logps/rejected": -5.3134541511535645, + "loss": 0.0503, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.538696765899658, + "rewards/margins": 0.7747570872306824, + "rewards/rejected": -5.3134541511535645, + "sft_loss": 4.243317604064941, + "step": 5325 + }, + { + "epoch": 2.852650944974076, + "grad_norm": 0.5330143026502275, + "learning_rate": 7.2662148857686175e-09, + "logits/chosen": -0.6605895757675171, + "logits/rejected": -0.5792001485824585, + "logps/chosen": -4.514766693115234, + "logps/rejected": -5.122227668762207, + "loss": 0.0518, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.514766693115234, + "rewards/margins": 0.6074615120887756, + "rewards/rejected": -5.122227668762207, + "sft_loss": 4.139904022216797, + "step": 5330 + }, + { + "epoch": 2.8553269777554773, + "grad_norm": 0.4151414572497506, + "learning_rate": 7.0040594265287635e-09, + "logits/chosen": -0.5867056846618652, + "logits/rejected": -0.68376225233078, + "logps/chosen": -4.496993064880371, + "logps/rejected": -4.990506172180176, + "loss": 0.0528, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.496993064880371, + "rewards/margins": 0.49351271986961365, + "rewards/rejected": -4.990506172180176, + "sft_loss": 4.215310096740723, + "step": 5335 + }, + { + "epoch": 2.858003010536879, + "grad_norm": 0.42968389745954066, + "learning_rate": 6.746687016066566e-09, + "logits/chosen": -0.6900774240493774, + "logits/rejected": -0.6579724550247192, + "logps/chosen": -4.339878559112549, + "logps/rejected": -4.979432106018066, + "loss": 0.052, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.339878559112549, + "rewards/margins": 0.6395532488822937, + "rewards/rejected": -4.979432106018066, + "sft_loss": 4.117095470428467, + "step": 5340 + }, + { + "epoch": 2.8606790433182807, + "grad_norm": 0.5137218352574986, + "learning_rate": 6.494100151410276e-09, + "logits/chosen": -0.897404670715332, + "logits/rejected": -0.6970912218093872, + "logps/chosen": -4.338336944580078, + "logps/rejected": -5.060499668121338, + "loss": 0.0512, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.338336944580078, + "rewards/margins": 0.7221625447273254, + "rewards/rejected": -5.060499668121338, + "sft_loss": 4.119289875030518, + "step": 5345 + }, + { + "epoch": 2.8633550760996824, + "grad_norm": 0.9439486911375388, + "learning_rate": 6.246301283158728e-09, + "logits/chosen": -0.6498326063156128, + "logits/rejected": -0.6877025365829468, + "logps/chosen": -4.418894290924072, + "logps/rejected": -5.056708812713623, + "loss": 0.0525, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.418894290924072, + "rewards/margins": 0.6378144025802612, + "rewards/rejected": -5.056708812713623, + "sft_loss": 4.190070152282715, + "step": 5350 + }, + { + "epoch": 2.8660311088810837, + "grad_norm": 0.5415123457240395, + "learning_rate": 6.0032928154576944e-09, + "logits/chosen": -0.7772369980812073, + "logits/rejected": -0.7227431535720825, + "logps/chosen": -4.4435014724731445, + "logps/rejected": -5.072482585906982, + "loss": 0.0532, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.4435014724731445, + "rewards/margins": 0.6289812326431274, + "rewards/rejected": -5.072482585906982, + "sft_loss": 4.237886905670166, + "step": 5355 + }, + { + "epoch": 2.8687071416624854, + "grad_norm": 0.6985459324311037, + "learning_rate": 5.76507710597629e-09, + "logits/chosen": -0.8166434168815613, + "logits/rejected": -0.6002271175384521, + "logps/chosen": -4.332741737365723, + "logps/rejected": -5.127420425415039, + "loss": 0.0508, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.332741737365723, + "rewards/margins": 0.7946786284446716, + "rewards/rejected": -5.127420425415039, + "sft_loss": 4.032086372375488, + "step": 5360 + }, + { + "epoch": 2.8713831744438867, + "grad_norm": 0.3729066256025123, + "learning_rate": 5.531656465884438e-09, + "logits/chosen": -0.8140283823013306, + "logits/rejected": -0.6739929914474487, + "logps/chosen": -4.36183500289917, + "logps/rejected": -5.198390960693359, + "loss": 0.0496, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.36183500289917, + "rewards/margins": 0.8365559577941895, + "rewards/rejected": -5.198390960693359, + "sft_loss": 4.06744384765625, + "step": 5365 + }, + { + "epoch": 2.8740592072252884, + "grad_norm": 0.7101122740083048, + "learning_rate": 5.303033159830217e-09, + "logits/chosen": -0.6431114673614502, + "logits/rejected": -0.6394237279891968, + "logps/chosen": -4.446755409240723, + "logps/rejected": -4.919232368469238, + "loss": 0.0521, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.446755409240723, + "rewards/margins": 0.4724767804145813, + "rewards/rejected": -4.919232368469238, + "sft_loss": 4.0875115394592285, + "step": 5370 + }, + { + "epoch": 2.87673524000669, + "grad_norm": 0.4761441037980727, + "learning_rate": 5.079209405917939e-09, + "logits/chosen": -0.7186871767044067, + "logits/rejected": -0.6644418835639954, + "logps/chosen": -4.400277614593506, + "logps/rejected": -5.278224945068359, + "loss": 0.0512, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.400277614593506, + "rewards/margins": 0.8779473304748535, + "rewards/rejected": -5.278224945068359, + "sft_loss": 4.172907829284668, + "step": 5375 + }, + { + "epoch": 2.879411272788092, + "grad_norm": 0.44965344183186384, + "learning_rate": 4.860187375686664e-09, + "logits/chosen": -0.840808093547821, + "logits/rejected": -0.5630152821540833, + "logps/chosen": -4.355297088623047, + "logps/rejected": -5.174862384796143, + "loss": 0.0504, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -4.355297088623047, + "rewards/margins": 0.8195658922195435, + "rewards/rejected": -5.174862384796143, + "sft_loss": 4.117367744445801, + "step": 5380 + }, + { + "epoch": 2.882087305569493, + "grad_norm": 0.48662423395902865, + "learning_rate": 4.64596919408905e-09, + "logits/chosen": -0.6709896922111511, + "logits/rejected": -0.6175445914268494, + "logps/chosen": -4.350368976593018, + "logps/rejected": -4.9599738121032715, + "loss": 0.0524, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.350368976593018, + "rewards/margins": 0.6096046566963196, + "rewards/rejected": -4.9599738121032715, + "sft_loss": 4.05443811416626, + "step": 5385 + }, + { + "epoch": 2.884763338350895, + "grad_norm": 0.5097574203945324, + "learning_rate": 4.436556939470814e-09, + "logits/chosen": -0.8031808733940125, + "logits/rejected": -0.6307480335235596, + "logps/chosen": -4.699088096618652, + "logps/rejected": -5.155211448669434, + "loss": 0.0536, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.699088096618652, + "rewards/margins": 0.456122487783432, + "rewards/rejected": -5.155211448669434, + "sft_loss": 4.452386856079102, + "step": 5390 + }, + { + "epoch": 2.887439371132296, + "grad_norm": 0.4438377766535619, + "learning_rate": 4.23195264355064e-09, + "logits/chosen": -0.935401439666748, + "logits/rejected": -0.6709798574447632, + "logps/chosen": -4.372194766998291, + "logps/rejected": -5.0931525230407715, + "loss": 0.0513, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -4.372194766998291, + "rewards/margins": 0.7209590077400208, + "rewards/rejected": -5.0931525230407715, + "sft_loss": 4.168568134307861, + "step": 5395 + }, + { + "epoch": 2.890115403913698, + "grad_norm": 0.3898996829524812, + "learning_rate": 4.032158291400245e-09, + "logits/chosen": -0.7936094403266907, + "logits/rejected": -0.5639082193374634, + "logps/chosen": -4.286824703216553, + "logps/rejected": -5.411582946777344, + "loss": 0.0481, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.286824703216553, + "rewards/margins": 1.1247583627700806, + "rewards/rejected": -5.411582946777344, + "sft_loss": 3.914653778076172, + "step": 5400 + }, + { + "epoch": 2.8927914366950995, + "grad_norm": 0.37144975755492143, + "learning_rate": 3.837175821425398e-09, + "logits/chosen": -0.6788768172264099, + "logits/rejected": -0.6351056098937988, + "logps/chosen": -4.575131416320801, + "logps/rejected": -5.201429843902588, + "loss": 0.0522, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.575131416320801, + "rewards/margins": 0.6262980103492737, + "rewards/rejected": -5.201429843902588, + "sft_loss": 4.266793251037598, + "step": 5405 + }, + { + "epoch": 2.8954674694765012, + "grad_norm": 0.4819502320454787, + "learning_rate": 3.6470071253467683e-09, + "logits/chosen": -0.7794539332389832, + "logits/rejected": -0.6866748332977295, + "logps/chosen": -4.382658958435059, + "logps/rejected": -5.218075752258301, + "loss": 0.0499, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.382658958435059, + "rewards/margins": 0.8354169130325317, + "rewards/rejected": -5.218075752258301, + "sft_loss": 4.06662654876709, + "step": 5410 + }, + { + "epoch": 2.8981435022579025, + "grad_norm": 0.4675477289758297, + "learning_rate": 3.461654048181939e-09, + "logits/chosen": -0.8172122836112976, + "logits/rejected": -0.5970250964164734, + "logps/chosen": -4.448482513427734, + "logps/rejected": -5.097105979919434, + "loss": 0.0525, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.448482513427734, + "rewards/margins": 0.6486231088638306, + "rewards/rejected": -5.097105979919434, + "sft_loss": 4.25862979888916, + "step": 5415 + }, + { + "epoch": 2.9008195350393042, + "grad_norm": 0.481561756284469, + "learning_rate": 3.281118388227255e-09, + "logits/chosen": -0.7290286421775818, + "logits/rejected": -0.6467230319976807, + "logps/chosen": -4.468303680419922, + "logps/rejected": -5.0093994140625, + "loss": 0.0538, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.468303680419922, + "rewards/margins": 0.5410959124565125, + "rewards/rejected": -5.0093994140625, + "sft_loss": 4.1863837242126465, + "step": 5420 + }, + { + "epoch": 2.903495567820706, + "grad_norm": 0.5204386750591277, + "learning_rate": 3.1054018970405048e-09, + "logits/chosen": -0.7600063681602478, + "logits/rejected": -0.6038556694984436, + "logps/chosen": -4.390046119689941, + "logps/rejected": -5.161233901977539, + "loss": 0.05, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -4.390046119689941, + "rewards/margins": 0.7711877822875977, + "rewards/rejected": -5.161233901977539, + "sft_loss": 4.058727741241455, + "step": 5425 + }, + { + "epoch": 2.906171600602107, + "grad_norm": 0.3750239879457203, + "learning_rate": 2.9345062794238207e-09, + "logits/chosen": -0.7968858480453491, + "logits/rejected": -0.5854495763778687, + "logps/chosen": -4.418655872344971, + "logps/rejected": -5.176025390625, + "loss": 0.0513, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.418655872344971, + "rewards/margins": 0.757369875907898, + "rewards/rejected": -5.176025390625, + "sft_loss": 4.210319519042969, + "step": 5430 + }, + { + "epoch": 2.908847633383509, + "grad_norm": 0.42673124272737256, + "learning_rate": 2.7684331934072492e-09, + "logits/chosen": -0.8364018201828003, + "logits/rejected": -0.774222731590271, + "logps/chosen": -4.359699726104736, + "logps/rejected": -5.043485164642334, + "loss": 0.0519, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -4.359699726104736, + "rewards/margins": 0.6837862133979797, + "rewards/rejected": -5.043485164642334, + "sft_loss": 4.126735210418701, + "step": 5435 + }, + { + "epoch": 2.9115236661649107, + "grad_norm": 0.52408152068924, + "learning_rate": 2.6071842502326526e-09, + "logits/chosen": -0.8465268015861511, + "logits/rejected": -0.6868315935134888, + "logps/chosen": -4.515316963195801, + "logps/rejected": -5.097006797790527, + "loss": 0.0523, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.515316963195801, + "rewards/margins": 0.5816894173622131, + "rewards/rejected": -5.097006797790527, + "sft_loss": 4.289347171783447, + "step": 5440 + }, + { + "epoch": 2.9141996989463124, + "grad_norm": 0.5496966834501419, + "learning_rate": 2.450761014337888e-09, + "logits/chosen": -0.5394278168678284, + "logits/rejected": -0.48762258887290955, + "logps/chosen": -4.465736389160156, + "logps/rejected": -5.261734962463379, + "loss": 0.0527, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.465736389160156, + "rewards/margins": 0.7959989905357361, + "rewards/rejected": -5.261734962463379, + "sft_loss": 4.236559867858887, + "step": 5445 + }, + { + "epoch": 2.9168757317277136, + "grad_norm": 0.7378399710882302, + "learning_rate": 2.299165003341985e-09, + "logits/chosen": -0.5353703498840332, + "logits/rejected": -0.4766581058502197, + "logps/chosen": -4.363390922546387, + "logps/rejected": -5.103070259094238, + "loss": 0.0514, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.363390922546387, + "rewards/margins": 0.739679217338562, + "rewards/rejected": -5.103070259094238, + "sft_loss": 4.137202262878418, + "step": 5450 + }, + { + "epoch": 2.9195517645091154, + "grad_norm": 0.5686744201567238, + "learning_rate": 2.1523976880299945e-09, + "logits/chosen": -0.8194225430488586, + "logits/rejected": -0.6335167288780212, + "logps/chosen": -4.4221296310424805, + "logps/rejected": -4.990933895111084, + "loss": 0.0535, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.4221296310424805, + "rewards/margins": 0.5688046813011169, + "rewards/rejected": -4.990933895111084, + "sft_loss": 4.245485782623291, + "step": 5455 + }, + { + "epoch": 2.9222277972905166, + "grad_norm": 0.5578072935373403, + "learning_rate": 2.010460492339161e-09, + "logits/chosen": -0.7293864488601685, + "logits/rejected": -0.5793944597244263, + "logps/chosen": -4.39241886138916, + "logps/rejected": -5.086266994476318, + "loss": 0.0501, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.39241886138916, + "rewards/margins": 0.6938482522964478, + "rewards/rejected": -5.086266994476318, + "sft_loss": 4.0339789390563965, + "step": 5460 + }, + { + "epoch": 2.9249038300719183, + "grad_norm": 0.5421431657374898, + "learning_rate": 1.8733547933446614e-09, + "logits/chosen": -0.823691189289093, + "logits/rejected": -0.5740182995796204, + "logps/chosen": -4.491018295288086, + "logps/rejected": -5.095160007476807, + "loss": 0.0519, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.491018295288086, + "rewards/margins": 0.6041414737701416, + "rewards/rejected": -5.095160007476807, + "sft_loss": 4.210727691650391, + "step": 5465 + }, + { + "epoch": 2.92757986285332, + "grad_norm": 0.5262232764119357, + "learning_rate": 1.7410819212467231e-09, + "logits/chosen": -0.7801578640937805, + "logits/rejected": -0.7355565428733826, + "logps/chosen": -4.562183856964111, + "logps/rejected": -5.0614728927612305, + "loss": 0.0543, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -4.562183856964111, + "rewards/margins": 0.49928945302963257, + "rewards/rejected": -5.0614728927612305, + "sft_loss": 4.305178642272949, + "step": 5470 + }, + { + "epoch": 2.9302558956347218, + "grad_norm": 0.33633377390369845, + "learning_rate": 1.613643159357192e-09, + "logits/chosen": -0.6578032374382019, + "logits/rejected": -0.7792307734489441, + "logps/chosen": -4.518008232116699, + "logps/rejected": -5.044106483459473, + "loss": 0.0525, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.518008232116699, + "rewards/margins": 0.5260982513427734, + "rewards/rejected": -5.044106483459473, + "sft_loss": 4.252413749694824, + "step": 5475 + }, + { + "epoch": 2.932931928416123, + "grad_norm": 0.5280603635941091, + "learning_rate": 1.4910397440875967e-09, + "logits/chosen": -0.7603357434272766, + "logits/rejected": -0.6517842411994934, + "logps/chosen": -4.375611305236816, + "logps/rejected": -5.041906833648682, + "loss": 0.0512, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.375611305236816, + "rewards/margins": 0.6662946939468384, + "rewards/rejected": -5.041906833648682, + "sft_loss": 4.093026161193848, + "step": 5480 + }, + { + "epoch": 2.9356079611975248, + "grad_norm": 0.43302048339241966, + "learning_rate": 1.3732728649368253e-09, + "logits/chosen": -0.6951078176498413, + "logits/rejected": -0.5088014602661133, + "logps/chosen": -4.342538356781006, + "logps/rejected": -5.075597763061523, + "loss": 0.0511, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.342538356781006, + "rewards/margins": 0.7330597639083862, + "rewards/rejected": -5.075597763061523, + "sft_loss": 4.080550193786621, + "step": 5485 + }, + { + "epoch": 2.938283993978926, + "grad_norm": 0.4333874844955521, + "learning_rate": 1.260343664479524e-09, + "logits/chosen": -0.7748027443885803, + "logits/rejected": -0.7607916593551636, + "logps/chosen": -4.558234214782715, + "logps/rejected": -5.103475570678711, + "loss": 0.0514, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.558234214782715, + "rewards/margins": 0.5452412962913513, + "rewards/rejected": -5.103475570678711, + "sft_loss": 4.141097068786621, + "step": 5490 + }, + { + "epoch": 2.9409600267603278, + "grad_norm": 0.5357566836246764, + "learning_rate": 1.1522532383554384e-09, + "logits/chosen": -0.8556427955627441, + "logits/rejected": -0.5980334281921387, + "logps/chosen": -4.472742557525635, + "logps/rejected": -5.171594142913818, + "loss": 0.0519, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.472742557525635, + "rewards/margins": 0.6988515257835388, + "rewards/rejected": -5.171594142913818, + "sft_loss": 4.282252311706543, + "step": 5495 + }, + { + "epoch": 2.9436360595417295, + "grad_norm": 0.298521633055877, + "learning_rate": 1.049002635258256e-09, + "logits/chosen": -0.6451120972633362, + "logits/rejected": -0.5836285352706909, + "logps/chosen": -4.546640872955322, + "logps/rejected": -5.189908027648926, + "loss": 0.0516, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.546640872955322, + "rewards/margins": 0.6432673335075378, + "rewards/rejected": -5.189908027648926, + "sft_loss": 4.219212532043457, + "step": 5500 + }, + { + "epoch": 2.946312092323131, + "grad_norm": 0.47740541750846544, + "learning_rate": 9.505928569258358e-10, + "logits/chosen": -0.6692383289337158, + "logits/rejected": -0.6824567317962646, + "logps/chosen": -4.47029972076416, + "logps/rejected": -5.136569499969482, + "loss": 0.0515, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.47029972076416, + "rewards/margins": 0.6662701368331909, + "rewards/rejected": -5.136569499969482, + "sft_loss": 4.262817859649658, + "step": 5505 + }, + { + "epoch": 2.9489881251045325, + "grad_norm": 0.49326359074807213, + "learning_rate": 8.57024858130273e-10, + "logits/chosen": -0.7504460215568542, + "logits/rejected": -0.5905870199203491, + "logps/chosen": -4.473368167877197, + "logps/rejected": -5.392681121826172, + "loss": 0.0514, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.473368167877197, + "rewards/margins": 0.9193128347396851, + "rewards/rejected": -5.392681121826172, + "sft_loss": 4.297789096832275, + "step": 5510 + }, + { + "epoch": 2.951664157885934, + "grad_norm": 0.39847456506363377, + "learning_rate": 7.682995466686826e-10, + "logits/chosen": -0.8444706201553345, + "logits/rejected": -0.6898127794265747, + "logps/chosen": -4.56125545501709, + "logps/rejected": -5.338624000549316, + "loss": 0.0503, + "rewards/accuracies": 0.65625, + "rewards/chosen": -4.56125545501709, + "rewards/margins": 0.7773683071136475, + "rewards/rejected": -5.338624000549316, + "sft_loss": 4.119109153747559, + "step": 5515 + }, + { + "epoch": 2.9543401906673354, + "grad_norm": 0.4569463801314364, + "learning_rate": 6.844177833543741e-10, + "logits/chosen": -0.752629816532135, + "logits/rejected": -0.7100390791893005, + "logps/chosen": -4.399282932281494, + "logps/rejected": -5.064263343811035, + "loss": 0.0513, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.399282932281494, + "rewards/margins": 0.6649808287620544, + "rewards/rejected": -5.064263343811035, + "sft_loss": 4.0922088623046875, + "step": 5520 + }, + { + "epoch": 2.957016223448737, + "grad_norm": 0.5095330277701998, + "learning_rate": 6.053803820087467e-10, + "logits/chosen": -0.7440576553344727, + "logits/rejected": -0.5561671257019043, + "logps/chosen": -4.5473103523254395, + "logps/rejected": -5.268017768859863, + "loss": 0.0532, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.5473103523254395, + "rewards/margins": 0.7207077145576477, + "rewards/rejected": -5.268017768859863, + "sft_loss": 4.374567985534668, + "step": 5525 + }, + { + "epoch": 2.959692256230139, + "grad_norm": 0.5418342484558206, + "learning_rate": 5.311881094528514e-10, + "logits/chosen": -0.894533634185791, + "logits/rejected": -0.6330394744873047, + "logps/chosen": -4.468113422393799, + "logps/rejected": -5.109396934509277, + "loss": 0.0515, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.468113422393799, + "rewards/margins": 0.6412833333015442, + "rewards/rejected": -5.109396934509277, + "sft_loss": 4.15500545501709, + "step": 5530 + }, + { + "epoch": 2.9623682890115406, + "grad_norm": 0.7560684828912863, + "learning_rate": 4.6184168550050806e-10, + "logits/chosen": -0.7792457342147827, + "logits/rejected": -0.7371629476547241, + "logps/chosen": -4.565438270568848, + "logps/rejected": -5.105252742767334, + "loss": 0.0537, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -4.565438270568848, + "rewards/margins": 0.539814829826355, + "rewards/rejected": -5.105252742767334, + "sft_loss": 4.340537071228027, + "step": 5535 + }, + { + "epoch": 2.965044321792942, + "grad_norm": 0.4911869174076437, + "learning_rate": 3.973417829510328e-10, + "logits/chosen": -0.8998439908027649, + "logits/rejected": -0.7843471765518188, + "logps/chosen": -4.363260746002197, + "logps/rejected": -5.053696632385254, + "loss": 0.0513, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.363260746002197, + "rewards/margins": 0.6904358267784119, + "rewards/rejected": -5.053696632385254, + "sft_loss": 4.130491256713867, + "step": 5540 + }, + { + "epoch": 2.9677203545743436, + "grad_norm": 0.625561234732518, + "learning_rate": 3.3768902758274377e-10, + "logits/chosen": -0.7709188461303711, + "logits/rejected": -0.6807830929756165, + "logps/chosen": -4.335400104522705, + "logps/rejected": -5.092108726501465, + "loss": 0.051, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.335400104522705, + "rewards/margins": 0.7567082643508911, + "rewards/rejected": -5.092108726501465, + "sft_loss": 4.07431173324585, + "step": 5545 + }, + { + "epoch": 2.970396387355745, + "grad_norm": 0.40367028372201813, + "learning_rate": 2.8288399814691e-10, + "logits/chosen": -0.6472212672233582, + "logits/rejected": -0.6156342625617981, + "logps/chosen": -4.514806270599365, + "logps/rejected": -5.145084381103516, + "loss": 0.0518, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -4.514806270599365, + "rewards/margins": 0.6302779912948608, + "rewards/rejected": -5.145084381103516, + "sft_loss": 4.210458278656006, + "step": 5550 + }, + { + "epoch": 2.9730724201371466, + "grad_norm": 0.38644244219702006, + "learning_rate": 2.3292722636220066e-10, + "logits/chosen": -0.7775002717971802, + "logits/rejected": -0.5841083526611328, + "logps/chosen": -4.504853248596191, + "logps/rejected": -5.351606845855713, + "loss": 0.0505, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.504853248596191, + "rewards/margins": 0.8467535972595215, + "rewards/rejected": -5.351606845855713, + "sft_loss": 4.162545680999756, + "step": 5555 + }, + { + "epoch": 2.9757484529185483, + "grad_norm": 0.525091240664735, + "learning_rate": 1.8781919690946668e-10, + "logits/chosen": -0.7250782251358032, + "logits/rejected": -0.7407873868942261, + "logps/chosen": -4.452993392944336, + "logps/rejected": -5.02133846282959, + "loss": 0.0516, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -4.452993392944336, + "rewards/margins": 0.5683449506759644, + "rewards/rejected": -5.02133846282959, + "sft_loss": 4.157462120056152, + "step": 5560 + }, + { + "epoch": 2.97842448569995, + "grad_norm": 0.46121490745190485, + "learning_rate": 1.4756034742696711e-10, + "logits/chosen": -0.8873132467269897, + "logits/rejected": -0.7985233068466187, + "logps/chosen": -4.419460773468018, + "logps/rejected": -5.141184329986572, + "loss": 0.052, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.419460773468018, + "rewards/margins": 0.7217229604721069, + "rewards/rejected": -5.141184329986572, + "sft_loss": 4.184345245361328, + "step": 5565 + }, + { + "epoch": 2.9811005184813513, + "grad_norm": 0.5079173509922432, + "learning_rate": 1.12151068506261e-10, + "logits/chosen": -0.7679609060287476, + "logits/rejected": -0.6635790467262268, + "logps/chosen": -4.502461910247803, + "logps/rejected": -5.3459858894348145, + "loss": 0.0493, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -4.502461910247803, + "rewards/margins": 0.8435236215591431, + "rewards/rejected": -5.3459858894348145, + "sft_loss": 4.08714485168457, + "step": 5570 + }, + { + "epoch": 2.983776551262753, + "grad_norm": 0.6251494874129625, + "learning_rate": 8.159170368826629e-11, + "logits/chosen": -0.778547465801239, + "logits/rejected": -0.6166914701461792, + "logps/chosen": -4.4004130363464355, + "logps/rejected": -5.13525390625, + "loss": 0.0509, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -4.4004130363464355, + "rewards/margins": 0.734841525554657, + "rewards/rejected": -5.13525390625, + "sft_loss": 4.074005603790283, + "step": 5575 + }, + { + "epoch": 2.9864525840441547, + "grad_norm": 0.4323529956911697, + "learning_rate": 5.588254946015114e-11, + "logits/chosen": -0.937720775604248, + "logits/rejected": -0.6476465463638306, + "logps/chosen": -4.396031856536865, + "logps/rejected": -5.199591636657715, + "loss": 0.0505, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -4.396031856536865, + "rewards/margins": 0.8035598993301392, + "rewards/rejected": -5.199591636657715, + "sft_loss": 4.089269161224365, + "step": 5580 + }, + { + "epoch": 2.989128616825556, + "grad_norm": 0.4034195581734263, + "learning_rate": 3.502385525216978e-11, + "logits/chosen": -0.8808103799819946, + "logits/rejected": -0.7039750814437866, + "logps/chosen": -4.423071384429932, + "logps/rejected": -5.164847373962402, + "loss": 0.0515, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -4.423071384429932, + "rewards/margins": 0.7417756915092468, + "rewards/rejected": -5.164847373962402, + "sft_loss": 4.177689552307129, + "step": 5585 + }, + { + "epoch": 2.9918046496069577, + "grad_norm": 0.3570840544619443, + "learning_rate": 1.901582343555308e-11, + "logits/chosen": -0.7723742127418518, + "logits/rejected": -0.709536075592041, + "logps/chosen": -4.62722635269165, + "logps/rejected": -5.315423488616943, + "loss": 0.0525, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -4.62722635269165, + "rewards/margins": 0.688197135925293, + "rewards/rejected": -5.315423488616943, + "sft_loss": 4.275882720947266, + "step": 5590 + }, + { + "epoch": 2.9944806823883594, + "grad_norm": 0.4923520610414012, + "learning_rate": 7.858609320232634e-12, + "logits/chosen": -0.8053900599479675, + "logits/rejected": -0.6184626221656799, + "logps/chosen": -4.50331974029541, + "logps/rejected": -5.1998724937438965, + "loss": 0.0515, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.50331974029541, + "rewards/margins": 0.696553111076355, + "rewards/rejected": -5.1998724937438965, + "sft_loss": 4.270722389221191, + "step": 5595 + }, + { + "epoch": 2.9971567151697607, + "grad_norm": 0.5386939604422419, + "learning_rate": 1.5523211535639624e-12, + "logits/chosen": -0.8037877082824707, + "logits/rejected": -0.6862096190452576, + "logps/chosen": -4.483442306518555, + "logps/rejected": -5.466944694519043, + "loss": 0.0504, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -4.483442306518555, + "rewards/margins": 0.983502209186554, + "rewards/rejected": -5.466944694519043, + "sft_loss": 4.2200493812561035, + "step": 5600 + }, + { + "epoch": 2.9971567151697607, + "eval_logits/chosen": -0.3687087595462799, + "eval_logits/rejected": -0.2954810559749603, + "eval_logps/chosen": -4.423083305358887, + "eval_logps/rejected": -5.141829967498779, + "eval_loss": 0.05035361275076866, + "eval_rewards/accuracies": 0.6862017512321472, + "eval_rewards/chosen": -4.423083305358887, + "eval_rewards/margins": 0.7187467217445374, + "eval_rewards/rejected": -5.141829967498779, + "eval_runtime": 44.0385, + "eval_samples_per_second": 30.541, + "eval_sft_loss": 4.028059959411621, + "eval_steps_per_second": 7.652, + "step": 5600 + }, + { + "epoch": 2.999297541394882, + "step": 5604, + "total_flos": 0.0, + "train_loss": 0.0619825580417259, + "train_runtime": 31817.9407, + "train_samples_per_second": 5.637, + "train_steps_per_second": 0.176 + } + ], + "logging_steps": 5, + "max_steps": 5604, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}