{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.13175905660515472, "eval_steps": 2000, "global_step": 18000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007319947589175262, "grad_norm": 800.0, "learning_rate": 4.879238838741157e-07, "loss": 0.952, "step": 100 }, { "epoch": 0.0014639895178350524, "grad_norm": 768.0, "learning_rate": 9.758477677482314e-07, "loss": 0.8458, "step": 200 }, { "epoch": 0.0021959842767525785, "grad_norm": 1600.0, "learning_rate": 1.4637716516223471e-06, "loss": 0.8841, "step": 300 }, { "epoch": 0.0029279790356701047, "grad_norm": 616.0, "learning_rate": 1.951695535496463e-06, "loss": 0.9277, "step": 400 }, { "epoch": 0.003659973794587631, "grad_norm": 436.0, "learning_rate": 2.4396194193705783e-06, "loss": 0.6913, "step": 500 }, { "epoch": 0.004391968553505157, "grad_norm": 1256.0, "learning_rate": 2.9275433032446943e-06, "loss": 0.6372, "step": 600 }, { "epoch": 0.005123963312422683, "grad_norm": 764.0, "learning_rate": 3.41546718711881e-06, "loss": 1.0322, "step": 700 }, { "epoch": 0.005855958071340209, "grad_norm": 1408.0, "learning_rate": 3.903391070992926e-06, "loss": 0.7853, "step": 800 }, { "epoch": 0.006587952830257735, "grad_norm": 43.5, "learning_rate": 4.391314954867041e-06, "loss": 0.9377, "step": 900 }, { "epoch": 0.007319947589175262, "grad_norm": 684.0, "learning_rate": 4.879238838741157e-06, "loss": 1.4695, "step": 1000 }, { "epoch": 0.008051942348092788, "grad_norm": 524.0, "learning_rate": 5.367162722615272e-06, "loss": 1.4889, "step": 1100 }, { "epoch": 0.008783937107010314, "grad_norm": 33.25, "learning_rate": 5.8550866064893885e-06, "loss": 1.2154, "step": 1200 }, { "epoch": 0.00951593186592784, "grad_norm": 1.171875, "learning_rate": 6.343010490363504e-06, "loss": 1.3154, "step": 1300 }, { "epoch": 0.010247926624845366, "grad_norm": 388.0, "learning_rate": 6.83093437423762e-06, "loss": 1.5739, "step": 1400 }, { "epoch": 0.010979921383762893, "grad_norm": 251.0, "learning_rate": 7.318858258111735e-06, "loss": 0.7903, "step": 1500 }, { "epoch": 0.011711916142680419, "grad_norm": 95.5, "learning_rate": 7.806782141985851e-06, "loss": 0.6962, "step": 1600 }, { "epoch": 0.012443910901597945, "grad_norm": 516.0, "learning_rate": 8.294706025859967e-06, "loss": 1.2352, "step": 1700 }, { "epoch": 0.01317590566051547, "grad_norm": 41.0, "learning_rate": 8.782629909734082e-06, "loss": 0.924, "step": 1800 }, { "epoch": 0.013907900419432996, "grad_norm": 0.91015625, "learning_rate": 9.270553793608198e-06, "loss": 1.467, "step": 1900 }, { "epoch": 0.014639895178350524, "grad_norm": 1.15625, "learning_rate": 9.758477677482313e-06, "loss": 0.9323, "step": 2000 }, { "epoch": 0.014639895178350524, "eval_loss": 1.4799224138259888, "eval_runtime": 27.9405, "eval_samples_per_second": 17.895, "eval_steps_per_second": 17.895, "step": 2000 }, { "epoch": 0.01537188993726805, "grad_norm": 39.5, "learning_rate": 1.0246401561356429e-05, "loss": 1.2229, "step": 2100 }, { "epoch": 0.016103884696185577, "grad_norm": 984.0, "learning_rate": 1.0734325445230544e-05, "loss": 1.7086, "step": 2200 }, { "epoch": 0.016835879455103103, "grad_norm": 684.0, "learning_rate": 1.122224932910466e-05, "loss": 0.9654, "step": 2300 }, { "epoch": 0.01756787421402063, "grad_norm": 840.0, "learning_rate": 1.1710173212978777e-05, "loss": 1.3702, "step": 2400 }, { "epoch": 0.018299868972938154, "grad_norm": 388.0, "learning_rate": 1.2198097096852893e-05, "loss": 1.0888, "step": 2500 }, { "epoch": 0.01903186373185568, "grad_norm": 832.0, "learning_rate": 1.2686020980727008e-05, "loss": 0.9989, "step": 2600 }, { "epoch": 0.019763858490773206, "grad_norm": 0.3984375, "learning_rate": 1.3173944864601122e-05, "loss": 1.3161, "step": 2700 }, { "epoch": 0.02049585324969073, "grad_norm": 644.0, "learning_rate": 1.366186874847524e-05, "loss": 1.1279, "step": 2800 }, { "epoch": 0.021227848008608257, "grad_norm": 414.0, "learning_rate": 1.4149792632349354e-05, "loss": 1.2044, "step": 2900 }, { "epoch": 0.021959842767525786, "grad_norm": 12.1875, "learning_rate": 1.463771651622347e-05, "loss": 1.0637, "step": 3000 }, { "epoch": 0.022691837526443312, "grad_norm": 0.95703125, "learning_rate": 1.5125640400097585e-05, "loss": 1.2628, "step": 3100 }, { "epoch": 0.023423832285360838, "grad_norm": 10.875, "learning_rate": 1.5613564283971703e-05, "loss": 1.3439, "step": 3200 }, { "epoch": 0.024155827044278363, "grad_norm": 256.0, "learning_rate": 1.6101488167845818e-05, "loss": 1.2094, "step": 3300 }, { "epoch": 0.02488782180319589, "grad_norm": 0.267578125, "learning_rate": 1.6589412051719934e-05, "loss": 1.4852, "step": 3400 }, { "epoch": 0.025619816562113415, "grad_norm": 54.25, "learning_rate": 1.707733593559405e-05, "loss": 1.1689, "step": 3500 }, { "epoch": 0.02635181132103094, "grad_norm": 308.0, "learning_rate": 1.7565259819468165e-05, "loss": 1.0845, "step": 3600 }, { "epoch": 0.027083806079948466, "grad_norm": 264.0, "learning_rate": 1.805318370334228e-05, "loss": 1.2785, "step": 3700 }, { "epoch": 0.027815800838865992, "grad_norm": 81.5, "learning_rate": 1.8541107587216396e-05, "loss": 1.0887, "step": 3800 }, { "epoch": 0.02854779559778352, "grad_norm": 696.0, "learning_rate": 1.902903147109051e-05, "loss": 1.2968, "step": 3900 }, { "epoch": 0.029279790356701047, "grad_norm": 186.0, "learning_rate": 1.9516955354964627e-05, "loss": 1.3166, "step": 4000 }, { "epoch": 0.029279790356701047, "eval_loss": 0.8257483839988708, "eval_runtime": 27.9567, "eval_samples_per_second": 17.885, "eval_steps_per_second": 17.885, "step": 4000 }, { "epoch": 0.030011785115618573, "grad_norm": 1.9140625, "learning_rate": 1.9999999997189743e-05, "loss": 1.171, "step": 4100 }, { "epoch": 0.0307437798745361, "grad_norm": 211.0, "learning_rate": 1.9999971332569874e-05, "loss": 1.1772, "step": 4200 }, { "epoch": 0.03147577463345363, "grad_norm": 0.1416015625, "learning_rate": 1.9999886462973602e-05, "loss": 1.0697, "step": 4300 }, { "epoch": 0.032207769392371154, "grad_norm": 0.220703125, "learning_rate": 1.9999745388877933e-05, "loss": 1.2177, "step": 4400 }, { "epoch": 0.03293976415128868, "grad_norm": 0.283203125, "learning_rate": 1.999954811107578e-05, "loss": 1.1959, "step": 4500 }, { "epoch": 0.033671758910206205, "grad_norm": 0.490234375, "learning_rate": 1.9999294630675945e-05, "loss": 1.1617, "step": 4600 }, { "epoch": 0.03440375366912373, "grad_norm": 390.0, "learning_rate": 1.999898494910312e-05, "loss": 1.1348, "step": 4700 }, { "epoch": 0.03513574842804126, "grad_norm": 0.279296875, "learning_rate": 1.999861906809787e-05, "loss": 1.1857, "step": 4800 }, { "epoch": 0.03586774318695878, "grad_norm": 620.0, "learning_rate": 1.9998196989716637e-05, "loss": 1.1041, "step": 4900 }, { "epoch": 0.03659973794587631, "grad_norm": 7.9375, "learning_rate": 1.999771871633172e-05, "loss": 1.2604, "step": 5000 }, { "epoch": 0.037331732704793834, "grad_norm": 0.1328125, "learning_rate": 1.9997184250631257e-05, "loss": 1.1525, "step": 5100 }, { "epoch": 0.03806372746371136, "grad_norm": 988.0, "learning_rate": 1.999659359561922e-05, "loss": 1.1125, "step": 5200 }, { "epoch": 0.038795722222628885, "grad_norm": 528.0, "learning_rate": 1.99959467546154e-05, "loss": 1.0241, "step": 5300 }, { "epoch": 0.03952771698154641, "grad_norm": 0.08203125, "learning_rate": 1.999524373125537e-05, "loss": 1.0007, "step": 5400 }, { "epoch": 0.04025971174046394, "grad_norm": 0.06494140625, "learning_rate": 1.9994484529490483e-05, "loss": 1.7392, "step": 5500 }, { "epoch": 0.04099170649938146, "grad_norm": 155.0, "learning_rate": 1.9993669153587842e-05, "loss": 1.6975, "step": 5600 }, { "epoch": 0.04172370125829899, "grad_norm": 0.1787109375, "learning_rate": 1.9992797608130284e-05, "loss": 1.3126, "step": 5700 }, { "epoch": 0.042455696017216514, "grad_norm": 102.5, "learning_rate": 1.9991869898016337e-05, "loss": 1.0694, "step": 5800 }, { "epoch": 0.04318769077613404, "grad_norm": 282.0, "learning_rate": 1.999088602846021e-05, "loss": 1.1731, "step": 5900 }, { "epoch": 0.04391968553505157, "grad_norm": 756.0, "learning_rate": 1.998984600499175e-05, "loss": 0.9569, "step": 6000 }, { "epoch": 0.04391968553505157, "eval_loss": 1.0243369340896606, "eval_runtime": 27.9367, "eval_samples_per_second": 17.898, "eval_steps_per_second": 17.898, "step": 6000 }, { "epoch": 0.0446516802939691, "grad_norm": 0.08935546875, "learning_rate": 1.9988749833456433e-05, "loss": 0.8217, "step": 6100 }, { "epoch": 0.045383675052886624, "grad_norm": 0.1650390625, "learning_rate": 1.9987597520015302e-05, "loss": 0.9041, "step": 6200 }, { "epoch": 0.04611566981180415, "grad_norm": 70.0, "learning_rate": 1.998638907114495e-05, "loss": 1.0699, "step": 6300 }, { "epoch": 0.046847664570721675, "grad_norm": 178.0, "learning_rate": 1.998512449363748e-05, "loss": 0.9322, "step": 6400 }, { "epoch": 0.0475796593296392, "grad_norm": 0.1533203125, "learning_rate": 1.9983803794600468e-05, "loss": 0.9877, "step": 6500 }, { "epoch": 0.04831165408855673, "grad_norm": 368.0, "learning_rate": 1.998242698145692e-05, "loss": 1.0714, "step": 6600 }, { "epoch": 0.04904364884747425, "grad_norm": 0.279296875, "learning_rate": 1.9980994061945238e-05, "loss": 0.9344, "step": 6700 }, { "epoch": 0.04977564360639178, "grad_norm": 2800.0, "learning_rate": 1.997950504411916e-05, "loss": 1.2076, "step": 6800 }, { "epoch": 0.050507638365309304, "grad_norm": 0.31640625, "learning_rate": 1.9977959936347732e-05, "loss": 1.0685, "step": 6900 }, { "epoch": 0.05123963312422683, "grad_norm": 29.75, "learning_rate": 1.9976358747315254e-05, "loss": 1.1026, "step": 7000 }, { "epoch": 0.051971627883144356, "grad_norm": 2112.0, "learning_rate": 1.9974701486021233e-05, "loss": 1.0783, "step": 7100 }, { "epoch": 0.05270362264206188, "grad_norm": 0.111328125, "learning_rate": 1.997298816178033e-05, "loss": 0.8777, "step": 7200 }, { "epoch": 0.05343561740097941, "grad_norm": 0.07080078125, "learning_rate": 1.9971218784222302e-05, "loss": 0.9701, "step": 7300 }, { "epoch": 0.05416761215989693, "grad_norm": 132.0, "learning_rate": 1.9969393363291963e-05, "loss": 0.9978, "step": 7400 }, { "epoch": 0.05489960691881446, "grad_norm": 2.03125, "learning_rate": 1.9967511909249118e-05, "loss": 1.2451, "step": 7500 }, { "epoch": 0.055631601677731984, "grad_norm": 912.0, "learning_rate": 1.99655744326685e-05, "loss": 0.8866, "step": 7600 }, { "epoch": 0.05636359643664951, "grad_norm": 0.10986328125, "learning_rate": 1.9963580944439732e-05, "loss": 0.9139, "step": 7700 }, { "epoch": 0.05709559119556704, "grad_norm": 0.1796875, "learning_rate": 1.9961531455767233e-05, "loss": 1.0991, "step": 7800 }, { "epoch": 0.05782758595448457, "grad_norm": 0.45703125, "learning_rate": 1.9959425978170187e-05, "loss": 1.0318, "step": 7900 }, { "epoch": 0.058559580713402094, "grad_norm": 161.0, "learning_rate": 1.995726452348246e-05, "loss": 1.0115, "step": 8000 }, { "epoch": 0.058559580713402094, "eval_loss": 1.2017102241516113, "eval_runtime": 27.9424, "eval_samples_per_second": 17.894, "eval_steps_per_second": 17.894, "step": 8000 }, { "epoch": 0.05929157547231962, "grad_norm": 94.5, "learning_rate": 1.9955047103852534e-05, "loss": 1.3752, "step": 8100 }, { "epoch": 0.060023570231237146, "grad_norm": 83.0, "learning_rate": 1.995277373174345e-05, "loss": 1.0333, "step": 8200 }, { "epoch": 0.06075556499015467, "grad_norm": 0.1708984375, "learning_rate": 1.9950444419932723e-05, "loss": 1.0582, "step": 8300 }, { "epoch": 0.0614875597490722, "grad_norm": 8.5, "learning_rate": 1.994805918151229e-05, "loss": 0.9273, "step": 8400 }, { "epoch": 0.06221955450798972, "grad_norm": 0.1376953125, "learning_rate": 1.9945618029888408e-05, "loss": 0.8619, "step": 8500 }, { "epoch": 0.06295154926690726, "grad_norm": 552.0, "learning_rate": 1.994312097878161e-05, "loss": 1.2394, "step": 8600 }, { "epoch": 0.06368354402582478, "grad_norm": 0.62890625, "learning_rate": 1.99405680422266e-05, "loss": 0.8713, "step": 8700 }, { "epoch": 0.06441553878474231, "grad_norm": 152.0, "learning_rate": 1.9937959234572198e-05, "loss": 0.9949, "step": 8800 }, { "epoch": 0.06514753354365983, "grad_norm": 99.0, "learning_rate": 1.993529457048124e-05, "loss": 1.0313, "step": 8900 }, { "epoch": 0.06587952830257736, "grad_norm": 1004.0, "learning_rate": 1.993257406493051e-05, "loss": 1.0299, "step": 9000 }, { "epoch": 0.06661152306149488, "grad_norm": 0.16796875, "learning_rate": 1.9929797733210644e-05, "loss": 0.9293, "step": 9100 }, { "epoch": 0.06734351782041241, "grad_norm": 0.75, "learning_rate": 1.992696559092605e-05, "loss": 1.04, "step": 9200 }, { "epoch": 0.06807551257932994, "grad_norm": 5.15625, "learning_rate": 1.992407765399483e-05, "loss": 1.072, "step": 9300 }, { "epoch": 0.06880750733824746, "grad_norm": 0.12890625, "learning_rate": 1.992113393864867e-05, "loss": 1.102, "step": 9400 }, { "epoch": 0.06953950209716499, "grad_norm": 0.66796875, "learning_rate": 1.9918134461432763e-05, "loss": 1.0206, "step": 9500 }, { "epoch": 0.07027149685608251, "grad_norm": 0.158203125, "learning_rate": 1.991507923920571e-05, "loss": 0.7945, "step": 9600 }, { "epoch": 0.07100349161500004, "grad_norm": 4.75, "learning_rate": 1.991196828913943e-05, "loss": 1.1373, "step": 9700 }, { "epoch": 0.07173548637391756, "grad_norm": 88.0, "learning_rate": 1.9908801628719063e-05, "loss": 1.0789, "step": 9800 }, { "epoch": 0.07246748113283509, "grad_norm": 0.283203125, "learning_rate": 1.9905579275742866e-05, "loss": 0.9591, "step": 9900 }, { "epoch": 0.07319947589175262, "grad_norm": 484.0, "learning_rate": 1.990230124832212e-05, "loss": 1.1461, "step": 10000 }, { "epoch": 0.07319947589175262, "eval_loss": 0.7712569832801819, "eval_runtime": 28.1186, "eval_samples_per_second": 17.782, "eval_steps_per_second": 17.782, "step": 10000 }, { "epoch": 0.07393147065067014, "grad_norm": 696.0, "learning_rate": 1.9898967564881014e-05, "loss": 1.0556, "step": 10100 }, { "epoch": 0.07466346540958767, "grad_norm": 9.4375, "learning_rate": 1.9895578244156576e-05, "loss": 1.1493, "step": 10200 }, { "epoch": 0.0753954601685052, "grad_norm": 1.34375, "learning_rate": 1.989213330519852e-05, "loss": 0.8955, "step": 10300 }, { "epoch": 0.07612745492742272, "grad_norm": 146.0, "learning_rate": 1.988863276736918e-05, "loss": 1.2152, "step": 10400 }, { "epoch": 0.07685944968634024, "grad_norm": 756.0, "learning_rate": 1.9885076650343364e-05, "loss": 1.0884, "step": 10500 }, { "epoch": 0.07759144444525777, "grad_norm": 151.0, "learning_rate": 1.988146497410829e-05, "loss": 1.1883, "step": 10600 }, { "epoch": 0.0783234392041753, "grad_norm": 100.5, "learning_rate": 1.987779775896343e-05, "loss": 0.9924, "step": 10700 }, { "epoch": 0.07905543396309282, "grad_norm": 0.77734375, "learning_rate": 1.9874075025520417e-05, "loss": 0.7545, "step": 10800 }, { "epoch": 0.07978742872201035, "grad_norm": 2.0625, "learning_rate": 1.987029679470292e-05, "loss": 0.7715, "step": 10900 }, { "epoch": 0.08051942348092787, "grad_norm": 0.33203125, "learning_rate": 1.9866463087746544e-05, "loss": 0.7923, "step": 11000 }, { "epoch": 0.0812514182398454, "grad_norm": 0.53125, "learning_rate": 1.986257392619869e-05, "loss": 1.122, "step": 11100 }, { "epoch": 0.08198341299876293, "grad_norm": 0.1845703125, "learning_rate": 1.9858629331918445e-05, "loss": 0.9972, "step": 11200 }, { "epoch": 0.08271540775768045, "grad_norm": 130.0, "learning_rate": 1.9854629327076454e-05, "loss": 1.0698, "step": 11300 }, { "epoch": 0.08344740251659798, "grad_norm": 119.5, "learning_rate": 1.9850573934154798e-05, "loss": 1.163, "step": 11400 }, { "epoch": 0.0841793972755155, "grad_norm": 0.17578125, "learning_rate": 1.9846463175946872e-05, "loss": 0.8634, "step": 11500 }, { "epoch": 0.08491139203443303, "grad_norm": 0.1806640625, "learning_rate": 1.9842297075557243e-05, "loss": 1.0536, "step": 11600 }, { "epoch": 0.08564338679335055, "grad_norm": 0.12353515625, "learning_rate": 1.9838075656401546e-05, "loss": 0.826, "step": 11700 }, { "epoch": 0.08637538155226808, "grad_norm": 3.5625, "learning_rate": 1.9833798942206312e-05, "loss": 0.9368, "step": 11800 }, { "epoch": 0.0871073763111856, "grad_norm": 3712.0, "learning_rate": 1.9829466957008884e-05, "loss": 0.9388, "step": 11900 }, { "epoch": 0.08783937107010314, "grad_norm": 352.0, "learning_rate": 1.9825079725157236e-05, "loss": 1.0504, "step": 12000 }, { "epoch": 0.08783937107010314, "eval_loss": 0.770910382270813, "eval_runtime": 27.9411, "eval_samples_per_second": 17.895, "eval_steps_per_second": 17.895, "step": 12000 }, { "epoch": 0.08857136582902067, "grad_norm": 211.0, "learning_rate": 1.982063727130987e-05, "loss": 0.9014, "step": 12100 }, { "epoch": 0.0893033605879382, "grad_norm": 258.0, "learning_rate": 1.9816139620435657e-05, "loss": 0.9101, "step": 12200 }, { "epoch": 0.09003535534685572, "grad_norm": 0.765625, "learning_rate": 1.9811586797813706e-05, "loss": 1.0403, "step": 12300 }, { "epoch": 0.09076735010577325, "grad_norm": 8.1875, "learning_rate": 1.9806978829033218e-05, "loss": 0.9556, "step": 12400 }, { "epoch": 0.09149934486469077, "grad_norm": 0.232421875, "learning_rate": 1.9802315739993346e-05, "loss": 0.8063, "step": 12500 }, { "epoch": 0.0922313396236083, "grad_norm": 0.396484375, "learning_rate": 1.9797597556903048e-05, "loss": 0.8704, "step": 12600 }, { "epoch": 0.09296333438252583, "grad_norm": 94.5, "learning_rate": 1.9792824306280934e-05, "loss": 1.0443, "step": 12700 }, { "epoch": 0.09369532914144335, "grad_norm": 0.09912109375, "learning_rate": 1.9787996014955126e-05, "loss": 0.9383, "step": 12800 }, { "epoch": 0.09442732390036088, "grad_norm": 0.2216796875, "learning_rate": 1.9783112710063098e-05, "loss": 0.9516, "step": 12900 }, { "epoch": 0.0951593186592784, "grad_norm": 0.1005859375, "learning_rate": 1.9778174419051538e-05, "loss": 0.9241, "step": 13000 }, { "epoch": 0.09589131341819593, "grad_norm": 106.0, "learning_rate": 1.977318116967618e-05, "loss": 0.9661, "step": 13100 }, { "epoch": 0.09662330817711345, "grad_norm": 0.314453125, "learning_rate": 1.976813299000164e-05, "loss": 1.0954, "step": 13200 }, { "epoch": 0.09735530293603098, "grad_norm": 114.5, "learning_rate": 1.9763029908401294e-05, "loss": 0.9344, "step": 13300 }, { "epoch": 0.0980872976949485, "grad_norm": 0.447265625, "learning_rate": 1.9757871953557078e-05, "loss": 1.0499, "step": 13400 }, { "epoch": 0.09881929245386603, "grad_norm": 78.5, "learning_rate": 1.975265915445934e-05, "loss": 0.9215, "step": 13500 }, { "epoch": 0.09955128721278356, "grad_norm": 0.1767578125, "learning_rate": 1.97473915404067e-05, "loss": 1.048, "step": 13600 }, { "epoch": 0.10028328197170108, "grad_norm": 0.490234375, "learning_rate": 1.9742069141005853e-05, "loss": 1.0092, "step": 13700 }, { "epoch": 0.10101527673061861, "grad_norm": 1.4296875, "learning_rate": 1.9736691986171413e-05, "loss": 0.964, "step": 13800 }, { "epoch": 0.10174727148953613, "grad_norm": 0.2421875, "learning_rate": 1.9731260106125757e-05, "loss": 0.8828, "step": 13900 }, { "epoch": 0.10247926624845366, "grad_norm": 88.0, "learning_rate": 1.972577353139884e-05, "loss": 1.0908, "step": 14000 }, { "epoch": 0.10247926624845366, "eval_loss": 0.8814056515693665, "eval_runtime": 27.9852, "eval_samples_per_second": 17.867, "eval_steps_per_second": 17.867, "step": 14000 }, { "epoch": 0.10321126100737119, "grad_norm": 0.18359375, "learning_rate": 1.9720232292828033e-05, "loss": 0.9781, "step": 14100 }, { "epoch": 0.10394325576628871, "grad_norm": 0.31640625, "learning_rate": 1.971463642155794e-05, "loss": 0.9888, "step": 14200 }, { "epoch": 0.10467525052520624, "grad_norm": 0.0966796875, "learning_rate": 1.9708985949040237e-05, "loss": 1.0119, "step": 14300 }, { "epoch": 0.10540724528412376, "grad_norm": 1.203125, "learning_rate": 1.9703280907033475e-05, "loss": 1.0127, "step": 14400 }, { "epoch": 0.10613924004304129, "grad_norm": 0.53125, "learning_rate": 1.9697521327602928e-05, "loss": 1.0275, "step": 14500 }, { "epoch": 0.10687123480195881, "grad_norm": 0.28515625, "learning_rate": 1.9691707243120386e-05, "loss": 0.869, "step": 14600 }, { "epoch": 0.10760322956087634, "grad_norm": 1.3125, "learning_rate": 1.9685838686263998e-05, "loss": 0.8713, "step": 14700 }, { "epoch": 0.10833522431979387, "grad_norm": 244.0, "learning_rate": 1.9679915690018062e-05, "loss": 1.0574, "step": 14800 }, { "epoch": 0.10906721907871139, "grad_norm": 0.298828125, "learning_rate": 1.9673938287672865e-05, "loss": 0.8997, "step": 14900 }, { "epoch": 0.10979921383762892, "grad_norm": 0.76171875, "learning_rate": 1.966790651282447e-05, "loss": 1.2234, "step": 15000 }, { "epoch": 0.11053120859654644, "grad_norm": 0.2060546875, "learning_rate": 1.9661820399374564e-05, "loss": 0.8861, "step": 15100 }, { "epoch": 0.11126320335546397, "grad_norm": 9.0, "learning_rate": 1.9655679981530224e-05, "loss": 0.9659, "step": 15200 }, { "epoch": 0.1119951981143815, "grad_norm": 142.0, "learning_rate": 1.964948529380375e-05, "loss": 1.0234, "step": 15300 }, { "epoch": 0.11272719287329902, "grad_norm": 0.171875, "learning_rate": 1.964323637101247e-05, "loss": 1.011, "step": 15400 }, { "epoch": 0.11345918763221656, "grad_norm": 0.65625, "learning_rate": 1.9636933248278545e-05, "loss": 0.9565, "step": 15500 }, { "epoch": 0.11419118239113409, "grad_norm": 76.0, "learning_rate": 1.9630575961028765e-05, "loss": 0.9768, "step": 15600 }, { "epoch": 0.11492317715005161, "grad_norm": 242.0, "learning_rate": 1.9624164544994343e-05, "loss": 0.7916, "step": 15700 }, { "epoch": 0.11565517190896914, "grad_norm": 160.0, "learning_rate": 1.9617699036210737e-05, "loss": 0.8392, "step": 15800 }, { "epoch": 0.11638716666788666, "grad_norm": 0.5390625, "learning_rate": 1.9611179471017423e-05, "loss": 0.8403, "step": 15900 }, { "epoch": 0.11711916142680419, "grad_norm": 3.453125, "learning_rate": 1.9604605886057712e-05, "loss": 0.7843, "step": 16000 }, { "epoch": 0.11711916142680419, "eval_loss": 0.9412841796875, "eval_runtime": 27.963, "eval_samples_per_second": 17.881, "eval_steps_per_second": 17.881, "step": 16000 }, { "epoch": 0.11785115618572171, "grad_norm": 0.67578125, "learning_rate": 1.9597978318278523e-05, "loss": 1.0179, "step": 16100 }, { "epoch": 0.11858315094463924, "grad_norm": 116.0, "learning_rate": 1.9591296804930198e-05, "loss": 0.9158, "step": 16200 }, { "epoch": 0.11931514570355677, "grad_norm": 9.875, "learning_rate": 1.958456138356627e-05, "loss": 0.9174, "step": 16300 }, { "epoch": 0.12004714046247429, "grad_norm": 1.1953125, "learning_rate": 1.957777209204327e-05, "loss": 0.942, "step": 16400 }, { "epoch": 0.12077913522139182, "grad_norm": 288.0, "learning_rate": 1.95709289685205e-05, "loss": 0.8128, "step": 16500 }, { "epoch": 0.12151112998030934, "grad_norm": 117.0, "learning_rate": 1.956403205145984e-05, "loss": 1.0152, "step": 16600 }, { "epoch": 0.12224312473922687, "grad_norm": 90.0, "learning_rate": 1.9557081379625494e-05, "loss": 0.809, "step": 16700 }, { "epoch": 0.1229751194981444, "grad_norm": 0.050537109375, "learning_rate": 1.9550076992083818e-05, "loss": 0.7162, "step": 16800 }, { "epoch": 0.12370711425706192, "grad_norm": 0.18359375, "learning_rate": 1.9543018928203066e-05, "loss": 0.7201, "step": 16900 }, { "epoch": 0.12443910901597945, "grad_norm": 0.1748046875, "learning_rate": 1.9535907227653182e-05, "loss": 1.279, "step": 17000 }, { "epoch": 0.12517110377489696, "grad_norm": 0.1708984375, "learning_rate": 1.952874193040558e-05, "loss": 0.5654, "step": 17100 }, { "epoch": 0.1259030985338145, "grad_norm": 0.0673828125, "learning_rate": 1.9521523076732903e-05, "loss": 0.7602, "step": 17200 }, { "epoch": 0.12663509329273204, "grad_norm": 3760.0, "learning_rate": 1.951425070720883e-05, "loss": 0.9334, "step": 17300 }, { "epoch": 0.12736708805164956, "grad_norm": 93.0, "learning_rate": 1.9506924862707804e-05, "loss": 1.1316, "step": 17400 }, { "epoch": 0.1280990828105671, "grad_norm": 0.28125, "learning_rate": 1.949954558440484e-05, "loss": 1.0999, "step": 17500 }, { "epoch": 0.12883107756948461, "grad_norm": 0.1806640625, "learning_rate": 1.9492112913775273e-05, "loss": 0.929, "step": 17600 }, { "epoch": 0.12956307232840214, "grad_norm": 3904.0, "learning_rate": 1.9484626892594525e-05, "loss": 0.7699, "step": 17700 }, { "epoch": 0.13029506708731967, "grad_norm": 0.447265625, "learning_rate": 1.9477087562937888e-05, "loss": 1.0148, "step": 17800 }, { "epoch": 0.1310270618462372, "grad_norm": 8.25, "learning_rate": 1.9469494967180262e-05, "loss": 0.8446, "step": 17900 }, { "epoch": 0.13175905660515472, "grad_norm": 148.0, "learning_rate": 1.9461849147995942e-05, "loss": 0.8187, "step": 18000 }, { "epoch": 0.13175905660515472, "eval_loss": 0.8699705600738525, "eval_runtime": 27.9351, "eval_samples_per_second": 17.899, "eval_steps_per_second": 17.899, "step": 18000 } ], "logging_steps": 100, "max_steps": 136613, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.742241467994931e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }