{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.058559580713402094, "eval_steps": 2000, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007319947589175262, "grad_norm": 800.0, "learning_rate": 4.879238838741157e-07, "loss": 0.952, "step": 100 }, { "epoch": 0.0014639895178350524, "grad_norm": 768.0, "learning_rate": 9.758477677482314e-07, "loss": 0.8458, "step": 200 }, { "epoch": 0.0021959842767525785, "grad_norm": 1600.0, "learning_rate": 1.4637716516223471e-06, "loss": 0.8841, "step": 300 }, { "epoch": 0.0029279790356701047, "grad_norm": 616.0, "learning_rate": 1.951695535496463e-06, "loss": 0.9277, "step": 400 }, { "epoch": 0.003659973794587631, "grad_norm": 436.0, "learning_rate": 2.4396194193705783e-06, "loss": 0.6913, "step": 500 }, { "epoch": 0.004391968553505157, "grad_norm": 1256.0, "learning_rate": 2.9275433032446943e-06, "loss": 0.6372, "step": 600 }, { "epoch": 0.005123963312422683, "grad_norm": 764.0, "learning_rate": 3.41546718711881e-06, "loss": 1.0322, "step": 700 }, { "epoch": 0.005855958071340209, "grad_norm": 1408.0, "learning_rate": 3.903391070992926e-06, "loss": 0.7853, "step": 800 }, { "epoch": 0.006587952830257735, "grad_norm": 43.5, "learning_rate": 4.391314954867041e-06, "loss": 0.9377, "step": 900 }, { "epoch": 0.007319947589175262, "grad_norm": 684.0, "learning_rate": 4.879238838741157e-06, "loss": 1.4695, "step": 1000 }, { "epoch": 0.008051942348092788, "grad_norm": 524.0, "learning_rate": 5.367162722615272e-06, "loss": 1.4889, "step": 1100 }, { "epoch": 0.008783937107010314, "grad_norm": 33.25, "learning_rate": 5.8550866064893885e-06, "loss": 1.2154, "step": 1200 }, { "epoch": 0.00951593186592784, "grad_norm": 1.171875, "learning_rate": 6.343010490363504e-06, "loss": 1.3154, "step": 1300 }, { "epoch": 0.010247926624845366, "grad_norm": 388.0, "learning_rate": 6.83093437423762e-06, "loss": 1.5739, "step": 1400 }, { "epoch": 0.010979921383762893, "grad_norm": 251.0, "learning_rate": 7.318858258111735e-06, "loss": 0.7903, "step": 1500 }, { "epoch": 0.011711916142680419, "grad_norm": 95.5, "learning_rate": 7.806782141985851e-06, "loss": 0.6962, "step": 1600 }, { "epoch": 0.012443910901597945, "grad_norm": 516.0, "learning_rate": 8.294706025859967e-06, "loss": 1.2352, "step": 1700 }, { "epoch": 0.01317590566051547, "grad_norm": 41.0, "learning_rate": 8.782629909734082e-06, "loss": 0.924, "step": 1800 }, { "epoch": 0.013907900419432996, "grad_norm": 0.91015625, "learning_rate": 9.270553793608198e-06, "loss": 1.467, "step": 1900 }, { "epoch": 0.014639895178350524, "grad_norm": 1.15625, "learning_rate": 9.758477677482313e-06, "loss": 0.9323, "step": 2000 }, { "epoch": 0.014639895178350524, "eval_loss": 1.4799224138259888, "eval_runtime": 27.9405, "eval_samples_per_second": 17.895, "eval_steps_per_second": 17.895, "step": 2000 }, { "epoch": 0.01537188993726805, "grad_norm": 39.5, "learning_rate": 1.0246401561356429e-05, "loss": 1.2229, "step": 2100 }, { "epoch": 0.016103884696185577, "grad_norm": 984.0, "learning_rate": 1.0734325445230544e-05, "loss": 1.7086, "step": 2200 }, { "epoch": 0.016835879455103103, "grad_norm": 684.0, "learning_rate": 1.122224932910466e-05, "loss": 0.9654, "step": 2300 }, { "epoch": 0.01756787421402063, "grad_norm": 840.0, "learning_rate": 1.1710173212978777e-05, "loss": 1.3702, "step": 2400 }, { "epoch": 0.018299868972938154, "grad_norm": 388.0, "learning_rate": 1.2198097096852893e-05, "loss": 1.0888, "step": 2500 }, { "epoch": 0.01903186373185568, "grad_norm": 832.0, "learning_rate": 1.2686020980727008e-05, "loss": 0.9989, "step": 2600 }, { "epoch": 0.019763858490773206, "grad_norm": 0.3984375, "learning_rate": 1.3173944864601122e-05, "loss": 1.3161, "step": 2700 }, { "epoch": 0.02049585324969073, "grad_norm": 644.0, "learning_rate": 1.366186874847524e-05, "loss": 1.1279, "step": 2800 }, { "epoch": 0.021227848008608257, "grad_norm": 414.0, "learning_rate": 1.4149792632349354e-05, "loss": 1.2044, "step": 2900 }, { "epoch": 0.021959842767525786, "grad_norm": 12.1875, "learning_rate": 1.463771651622347e-05, "loss": 1.0637, "step": 3000 }, { "epoch": 0.022691837526443312, "grad_norm": 0.95703125, "learning_rate": 1.5125640400097585e-05, "loss": 1.2628, "step": 3100 }, { "epoch": 0.023423832285360838, "grad_norm": 10.875, "learning_rate": 1.5613564283971703e-05, "loss": 1.3439, "step": 3200 }, { "epoch": 0.024155827044278363, "grad_norm": 256.0, "learning_rate": 1.6101488167845818e-05, "loss": 1.2094, "step": 3300 }, { "epoch": 0.02488782180319589, "grad_norm": 0.267578125, "learning_rate": 1.6589412051719934e-05, "loss": 1.4852, "step": 3400 }, { "epoch": 0.025619816562113415, "grad_norm": 54.25, "learning_rate": 1.707733593559405e-05, "loss": 1.1689, "step": 3500 }, { "epoch": 0.02635181132103094, "grad_norm": 308.0, "learning_rate": 1.7565259819468165e-05, "loss": 1.0845, "step": 3600 }, { "epoch": 0.027083806079948466, "grad_norm": 264.0, "learning_rate": 1.805318370334228e-05, "loss": 1.2785, "step": 3700 }, { "epoch": 0.027815800838865992, "grad_norm": 81.5, "learning_rate": 1.8541107587216396e-05, "loss": 1.0887, "step": 3800 }, { "epoch": 0.02854779559778352, "grad_norm": 696.0, "learning_rate": 1.902903147109051e-05, "loss": 1.2968, "step": 3900 }, { "epoch": 0.029279790356701047, "grad_norm": 186.0, "learning_rate": 1.9516955354964627e-05, "loss": 1.3166, "step": 4000 }, { "epoch": 0.029279790356701047, "eval_loss": 0.8257483839988708, "eval_runtime": 27.9567, "eval_samples_per_second": 17.885, "eval_steps_per_second": 17.885, "step": 4000 }, { "epoch": 0.030011785115618573, "grad_norm": 1.9140625, "learning_rate": 1.9999999997189743e-05, "loss": 1.171, "step": 4100 }, { "epoch": 0.0307437798745361, "grad_norm": 211.0, "learning_rate": 1.9999971332569874e-05, "loss": 1.1772, "step": 4200 }, { "epoch": 0.03147577463345363, "grad_norm": 0.1416015625, "learning_rate": 1.9999886462973602e-05, "loss": 1.0697, "step": 4300 }, { "epoch": 0.032207769392371154, "grad_norm": 0.220703125, "learning_rate": 1.9999745388877933e-05, "loss": 1.2177, "step": 4400 }, { "epoch": 0.03293976415128868, "grad_norm": 0.283203125, "learning_rate": 1.999954811107578e-05, "loss": 1.1959, "step": 4500 }, { "epoch": 0.033671758910206205, "grad_norm": 0.490234375, "learning_rate": 1.9999294630675945e-05, "loss": 1.1617, "step": 4600 }, { "epoch": 0.03440375366912373, "grad_norm": 390.0, "learning_rate": 1.999898494910312e-05, "loss": 1.1348, "step": 4700 }, { "epoch": 0.03513574842804126, "grad_norm": 0.279296875, "learning_rate": 1.999861906809787e-05, "loss": 1.1857, "step": 4800 }, { "epoch": 0.03586774318695878, "grad_norm": 620.0, "learning_rate": 1.9998196989716637e-05, "loss": 1.1041, "step": 4900 }, { "epoch": 0.03659973794587631, "grad_norm": 7.9375, "learning_rate": 1.999771871633172e-05, "loss": 1.2604, "step": 5000 }, { "epoch": 0.037331732704793834, "grad_norm": 0.1328125, "learning_rate": 1.9997184250631257e-05, "loss": 1.1525, "step": 5100 }, { "epoch": 0.03806372746371136, "grad_norm": 988.0, "learning_rate": 1.999659359561922e-05, "loss": 1.1125, "step": 5200 }, { "epoch": 0.038795722222628885, "grad_norm": 528.0, "learning_rate": 1.99959467546154e-05, "loss": 1.0241, "step": 5300 }, { "epoch": 0.03952771698154641, "grad_norm": 0.08203125, "learning_rate": 1.999524373125537e-05, "loss": 1.0007, "step": 5400 }, { "epoch": 0.04025971174046394, "grad_norm": 0.06494140625, "learning_rate": 1.9994484529490483e-05, "loss": 1.7392, "step": 5500 }, { "epoch": 0.04099170649938146, "grad_norm": 155.0, "learning_rate": 1.9993669153587842e-05, "loss": 1.6975, "step": 5600 }, { "epoch": 0.04172370125829899, "grad_norm": 0.1787109375, "learning_rate": 1.9992797608130284e-05, "loss": 1.3126, "step": 5700 }, { "epoch": 0.042455696017216514, "grad_norm": 102.5, "learning_rate": 1.9991869898016337e-05, "loss": 1.0694, "step": 5800 }, { "epoch": 0.04318769077613404, "grad_norm": 282.0, "learning_rate": 1.999088602846021e-05, "loss": 1.1731, "step": 5900 }, { "epoch": 0.04391968553505157, "grad_norm": 756.0, "learning_rate": 1.998984600499175e-05, "loss": 0.9569, "step": 6000 }, { "epoch": 0.04391968553505157, "eval_loss": 1.0243369340896606, "eval_runtime": 27.9367, "eval_samples_per_second": 17.898, "eval_steps_per_second": 17.898, "step": 6000 }, { "epoch": 0.0446516802939691, "grad_norm": 0.08935546875, "learning_rate": 1.9988749833456433e-05, "loss": 0.8217, "step": 6100 }, { "epoch": 0.045383675052886624, "grad_norm": 0.1650390625, "learning_rate": 1.9987597520015302e-05, "loss": 0.9041, "step": 6200 }, { "epoch": 0.04611566981180415, "grad_norm": 70.0, "learning_rate": 1.998638907114495e-05, "loss": 1.0699, "step": 6300 }, { "epoch": 0.046847664570721675, "grad_norm": 178.0, "learning_rate": 1.998512449363748e-05, "loss": 0.9322, "step": 6400 }, { "epoch": 0.0475796593296392, "grad_norm": 0.1533203125, "learning_rate": 1.9983803794600468e-05, "loss": 0.9877, "step": 6500 }, { "epoch": 0.04831165408855673, "grad_norm": 368.0, "learning_rate": 1.998242698145692e-05, "loss": 1.0714, "step": 6600 }, { "epoch": 0.04904364884747425, "grad_norm": 0.279296875, "learning_rate": 1.9980994061945238e-05, "loss": 0.9344, "step": 6700 }, { "epoch": 0.04977564360639178, "grad_norm": 2800.0, "learning_rate": 1.997950504411916e-05, "loss": 1.2076, "step": 6800 }, { "epoch": 0.050507638365309304, "grad_norm": 0.31640625, "learning_rate": 1.9977959936347732e-05, "loss": 1.0685, "step": 6900 }, { "epoch": 0.05123963312422683, "grad_norm": 29.75, "learning_rate": 1.9976358747315254e-05, "loss": 1.1026, "step": 7000 }, { "epoch": 0.051971627883144356, "grad_norm": 2112.0, "learning_rate": 1.9974701486021233e-05, "loss": 1.0783, "step": 7100 }, { "epoch": 0.05270362264206188, "grad_norm": 0.111328125, "learning_rate": 1.997298816178033e-05, "loss": 0.8777, "step": 7200 }, { "epoch": 0.05343561740097941, "grad_norm": 0.07080078125, "learning_rate": 1.9971218784222302e-05, "loss": 0.9701, "step": 7300 }, { "epoch": 0.05416761215989693, "grad_norm": 132.0, "learning_rate": 1.9969393363291963e-05, "loss": 0.9978, "step": 7400 }, { "epoch": 0.05489960691881446, "grad_norm": 2.03125, "learning_rate": 1.9967511909249118e-05, "loss": 1.2451, "step": 7500 }, { "epoch": 0.055631601677731984, "grad_norm": 912.0, "learning_rate": 1.99655744326685e-05, "loss": 0.8866, "step": 7600 }, { "epoch": 0.05636359643664951, "grad_norm": 0.10986328125, "learning_rate": 1.9963580944439732e-05, "loss": 0.9139, "step": 7700 }, { "epoch": 0.05709559119556704, "grad_norm": 0.1796875, "learning_rate": 1.9961531455767233e-05, "loss": 1.0991, "step": 7800 }, { "epoch": 0.05782758595448457, "grad_norm": 0.45703125, "learning_rate": 1.9959425978170187e-05, "loss": 1.0318, "step": 7900 }, { "epoch": 0.058559580713402094, "grad_norm": 161.0, "learning_rate": 1.995726452348246e-05, "loss": 1.0115, "step": 8000 }, { "epoch": 0.058559580713402094, "eval_loss": 1.2017102241516113, "eval_runtime": 27.9424, "eval_samples_per_second": 17.894, "eval_steps_per_second": 17.894, "step": 8000 } ], "logging_steps": 100, "max_steps": 136613, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.4433598031570944e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }