{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 1956, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07668711656441718, "grad_norm": 6.9312968254089355, "learning_rate": 1.948875255623722e-05, "loss": 2.9126, "step": 50 }, { "epoch": 0.15337423312883436, "grad_norm": 7.094263076782227, "learning_rate": 1.8977505112474438e-05, "loss": 0.9007, "step": 100 }, { "epoch": 0.15337423312883436, "eval_loss": 0.5516239404678345, "eval_runtime": 1.7021, "eval_samples_per_second": 117.505, "eval_steps_per_second": 29.376, "step": 100 }, { "epoch": 0.23006134969325154, "grad_norm": 5.485144138336182, "learning_rate": 1.8466257668711657e-05, "loss": 0.4804, "step": 150 }, { "epoch": 0.3067484662576687, "grad_norm": 0.7752339243888855, "learning_rate": 1.795501022494888e-05, "loss": 0.2893, "step": 200 }, { "epoch": 0.3067484662576687, "eval_loss": 0.285244345664978, "eval_runtime": 1.7614, "eval_samples_per_second": 113.543, "eval_steps_per_second": 28.386, "step": 200 }, { "epoch": 0.3834355828220859, "grad_norm": 0.43274009227752686, "learning_rate": 1.7443762781186097e-05, "loss": 0.204, "step": 250 }, { "epoch": 0.4601226993865031, "grad_norm": 3.576641321182251, "learning_rate": 1.6932515337423315e-05, "loss": 0.2428, "step": 300 }, { "epoch": 0.4601226993865031, "eval_loss": 0.20607389509677887, "eval_runtime": 1.6287, "eval_samples_per_second": 122.798, "eval_steps_per_second": 30.7, "step": 300 }, { "epoch": 0.5368098159509203, "grad_norm": 5.083515167236328, "learning_rate": 1.6421267893660533e-05, "loss": 0.1801, "step": 350 }, { "epoch": 0.6134969325153374, "grad_norm": 8.63024616241455, "learning_rate": 1.591002044989775e-05, "loss": 0.2088, "step": 400 }, { "epoch": 0.6134969325153374, "eval_loss": 0.1618553102016449, "eval_runtime": 1.7332, "eval_samples_per_second": 115.396, "eval_steps_per_second": 28.849, "step": 400 }, { "epoch": 0.6901840490797546, "grad_norm": 4.400120258331299, "learning_rate": 1.539877300613497e-05, "loss": 0.1688, "step": 450 }, { "epoch": 0.7668711656441718, "grad_norm": 1.805452585220337, "learning_rate": 1.488752556237219e-05, "loss": 0.1627, "step": 500 }, { "epoch": 0.7668711656441718, "eval_loss": 0.16040299832820892, "eval_runtime": 1.7462, "eval_samples_per_second": 114.531, "eval_steps_per_second": 28.633, "step": 500 }, { "epoch": 0.843558282208589, "grad_norm": 2.7593345642089844, "learning_rate": 1.4376278118609408e-05, "loss": 0.151, "step": 550 }, { "epoch": 0.9202453987730062, "grad_norm": 4.870810031890869, "learning_rate": 1.3865030674846627e-05, "loss": 0.1662, "step": 600 }, { "epoch": 0.9202453987730062, "eval_loss": 0.16780619323253632, "eval_runtime": 1.5157, "eval_samples_per_second": 131.95, "eval_steps_per_second": 32.987, "step": 600 }, { "epoch": 0.9969325153374233, "grad_norm": 15.548288345336914, "learning_rate": 1.3353783231083845e-05, "loss": 0.1377, "step": 650 }, { "epoch": 1.0736196319018405, "grad_norm": 2.5365755558013916, "learning_rate": 1.2842535787321065e-05, "loss": 0.1602, "step": 700 }, { "epoch": 1.0736196319018405, "eval_loss": 0.15782643854618073, "eval_runtime": 1.7328, "eval_samples_per_second": 115.421, "eval_steps_per_second": 28.855, "step": 700 }, { "epoch": 1.1503067484662577, "grad_norm": 3.997312307357788, "learning_rate": 1.2331288343558283e-05, "loss": 0.138, "step": 750 }, { "epoch": 1.2269938650306749, "grad_norm": 3.9103379249572754, "learning_rate": 1.1820040899795502e-05, "loss": 0.1219, "step": 800 }, { "epoch": 1.2269938650306749, "eval_loss": 0.15942689776420593, "eval_runtime": 1.7668, "eval_samples_per_second": 113.196, "eval_steps_per_second": 28.299, "step": 800 }, { "epoch": 1.303680981595092, "grad_norm": 1.4422495365142822, "learning_rate": 1.130879345603272e-05, "loss": 0.1074, "step": 850 }, { "epoch": 1.3803680981595092, "grad_norm": 0.34805914759635925, "learning_rate": 1.079754601226994e-05, "loss": 0.1135, "step": 900 }, { "epoch": 1.3803680981595092, "eval_loss": 0.1677168309688568, "eval_runtime": 1.6828, "eval_samples_per_second": 118.847, "eval_steps_per_second": 29.712, "step": 900 }, { "epoch": 1.4570552147239264, "grad_norm": 0.277852326631546, "learning_rate": 1.0286298568507158e-05, "loss": 0.0991, "step": 950 }, { "epoch": 1.5337423312883436, "grad_norm": 0.3739337921142578, "learning_rate": 9.775051124744377e-06, "loss": 0.1024, "step": 1000 }, { "epoch": 1.5337423312883436, "eval_loss": 0.16195625066757202, "eval_runtime": 1.5162, "eval_samples_per_second": 131.911, "eval_steps_per_second": 32.978, "step": 1000 }, { "epoch": 1.6104294478527608, "grad_norm": 1.7558486461639404, "learning_rate": 9.263803680981595e-06, "loss": 0.1549, "step": 1050 }, { "epoch": 1.687116564417178, "grad_norm": 3.3036224842071533, "learning_rate": 8.752556237218815e-06, "loss": 0.111, "step": 1100 }, { "epoch": 1.687116564417178, "eval_loss": 0.15855944156646729, "eval_runtime": 1.5416, "eval_samples_per_second": 129.739, "eval_steps_per_second": 32.435, "step": 1100 }, { "epoch": 1.7638036809815951, "grad_norm": 0.5878134369850159, "learning_rate": 8.241308793456033e-06, "loss": 0.0904, "step": 1150 }, { "epoch": 1.8404907975460123, "grad_norm": 1.7855305671691895, "learning_rate": 7.730061349693252e-06, "loss": 0.1053, "step": 1200 }, { "epoch": 1.8404907975460123, "eval_loss": 0.1541932076215744, "eval_runtime": 1.514, "eval_samples_per_second": 132.097, "eval_steps_per_second": 33.024, "step": 1200 }, { "epoch": 1.9171779141104295, "grad_norm": 7.03360652923584, "learning_rate": 7.218813905930471e-06, "loss": 0.1058, "step": 1250 }, { "epoch": 1.9938650306748467, "grad_norm": 0.12814605236053467, "learning_rate": 6.707566462167689e-06, "loss": 0.0922, "step": 1300 }, { "epoch": 1.9938650306748467, "eval_loss": 0.1525595486164093, "eval_runtime": 1.8272, "eval_samples_per_second": 109.455, "eval_steps_per_second": 27.364, "step": 1300 }, { "epoch": 2.0705521472392636, "grad_norm": 1.1744565963745117, "learning_rate": 6.1963190184049085e-06, "loss": 0.0975, "step": 1350 }, { "epoch": 2.147239263803681, "grad_norm": 4.074454307556152, "learning_rate": 5.685071574642127e-06, "loss": 0.087, "step": 1400 }, { "epoch": 2.147239263803681, "eval_loss": 0.1584751158952713, "eval_runtime": 1.558, "eval_samples_per_second": 128.371, "eval_steps_per_second": 32.093, "step": 1400 }, { "epoch": 2.223926380368098, "grad_norm": 1.7600996494293213, "learning_rate": 5.173824130879346e-06, "loss": 0.0913, "step": 1450 }, { "epoch": 2.3006134969325154, "grad_norm": 4.647277355194092, "learning_rate": 4.662576687116564e-06, "loss": 0.1259, "step": 1500 }, { "epoch": 2.3006134969325154, "eval_loss": 0.15520721673965454, "eval_runtime": 1.5428, "eval_samples_per_second": 129.632, "eval_steps_per_second": 32.408, "step": 1500 }, { "epoch": 2.3773006134969323, "grad_norm": 1.1398224830627441, "learning_rate": 4.1513292433537835e-06, "loss": 0.083, "step": 1550 }, { "epoch": 2.4539877300613497, "grad_norm": 0.7803178429603577, "learning_rate": 3.6400817995910027e-06, "loss": 0.1027, "step": 1600 }, { "epoch": 2.4539877300613497, "eval_loss": 0.15836574137210846, "eval_runtime": 1.62, "eval_samples_per_second": 123.459, "eval_steps_per_second": 30.865, "step": 1600 }, { "epoch": 2.530674846625767, "grad_norm": 3.176084280014038, "learning_rate": 3.1288343558282214e-06, "loss": 0.0963, "step": 1650 }, { "epoch": 2.607361963190184, "grad_norm": 4.374606609344482, "learning_rate": 2.61758691206544e-06, "loss": 0.0771, "step": 1700 }, { "epoch": 2.607361963190184, "eval_loss": 0.159062922000885, "eval_runtime": 1.5714, "eval_samples_per_second": 127.277, "eval_steps_per_second": 31.819, "step": 1700 }, { "epoch": 2.684049079754601, "grad_norm": 0.12128473073244095, "learning_rate": 2.1063394683026585e-06, "loss": 0.0841, "step": 1750 }, { "epoch": 2.7607361963190185, "grad_norm": 4.712184906005859, "learning_rate": 1.5950920245398775e-06, "loss": 0.0918, "step": 1800 }, { "epoch": 2.7607361963190185, "eval_loss": 0.1609487533569336, "eval_runtime": 1.5318, "eval_samples_per_second": 130.563, "eval_steps_per_second": 32.641, "step": 1800 }, { "epoch": 2.837423312883436, "grad_norm": 4.953312397003174, "learning_rate": 1.0838445807770962e-06, "loss": 0.1102, "step": 1850 }, { "epoch": 2.914110429447853, "grad_norm": 0.9757829308509827, "learning_rate": 5.72597137014315e-07, "loss": 0.0782, "step": 1900 }, { "epoch": 2.914110429447853, "eval_loss": 0.16041229665279388, "eval_runtime": 1.72, "eval_samples_per_second": 116.276, "eval_steps_per_second": 29.069, "step": 1900 }, { "epoch": 2.9907975460122698, "grad_norm": 0.24268025159835815, "learning_rate": 6.134969325153375e-08, "loss": 0.0817, "step": 1950 } ], "logging_steps": 50, "max_steps": 1956, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": -1956, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9454166582575104.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }