{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 205, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024390243902439025, "grad_norm": 0.2731004059314728, "learning_rate": 2.3809523809523808e-06, "loss": 0.6255, "step": 1 }, { "epoch": 0.12195121951219512, "grad_norm": 0.27167728543281555, "learning_rate": 1.1904761904761905e-05, "loss": 0.6283, "step": 5 }, { "epoch": 0.24390243902439024, "grad_norm": 0.24203768372535706, "learning_rate": 2.380952380952381e-05, "loss": 0.6021, "step": 10 }, { "epoch": 0.36585365853658536, "grad_norm": 0.1512613147497177, "learning_rate": 3.571428571428572e-05, "loss": 0.5736, "step": 15 }, { "epoch": 0.4878048780487805, "grad_norm": 0.13210485875606537, "learning_rate": 4.761904761904762e-05, "loss": 0.5307, "step": 20 }, { "epoch": 0.6097560975609756, "grad_norm": 0.11809483170509338, "learning_rate": 4.994171922976348e-05, "loss": 0.4787, "step": 25 }, { "epoch": 0.7317073170731707, "grad_norm": 0.1285855919122696, "learning_rate": 4.9705419236058825e-05, "loss": 0.4354, "step": 30 }, { "epoch": 0.8536585365853658, "grad_norm": 0.10194303095340729, "learning_rate": 4.9289177234948535e-05, "loss": 0.4031, "step": 35 }, { "epoch": 0.975609756097561, "grad_norm": 0.0670081079006195, "learning_rate": 4.8696024926503396e-05, "loss": 0.3835, "step": 40 }, { "epoch": 1.0, "eval_loss": 0.37915876507759094, "eval_runtime": 7.8362, "eval_samples_per_second": 18.631, "eval_steps_per_second": 0.638, "step": 41 }, { "epoch": 1.0975609756097562, "grad_norm": 0.07868604362010956, "learning_rate": 4.793028253763633e-05, "loss": 0.367, "step": 45 }, { "epoch": 1.2195121951219512, "grad_norm": 0.08379828184843063, "learning_rate": 4.69975273557146e-05, "loss": 0.3689, "step": 50 }, { "epoch": 1.3414634146341464, "grad_norm": 0.089637391269207, "learning_rate": 4.5904553106367774e-05, "loss": 0.3695, "step": 55 }, { "epoch": 1.4634146341463414, "grad_norm": 0.08855098485946655, "learning_rate": 4.4659320471363314e-05, "loss": 0.3615, "step": 60 }, { "epoch": 1.5853658536585367, "grad_norm": 0.09250541776418686, "learning_rate": 4.3270899106953105e-05, "loss": 0.3584, "step": 65 }, { "epoch": 1.7073170731707317, "grad_norm": 0.08777391910552979, "learning_rate": 4.174940158500041e-05, "loss": 0.3443, "step": 70 }, { "epoch": 1.8292682926829267, "grad_norm": 0.08619946986436844, "learning_rate": 4.0105909738027365e-05, "loss": 0.3491, "step": 75 }, { "epoch": 1.951219512195122, "grad_norm": 0.09870754927396774, "learning_rate": 3.835239394464901e-05, "loss": 0.3431, "step": 80 }, { "epoch": 2.0, "eval_loss": 0.3487952947616577, "eval_runtime": 7.7885, "eval_samples_per_second": 18.746, "eval_steps_per_second": 0.642, "step": 82 }, { "epoch": 2.073170731707317, "grad_norm": 0.08834455162286758, "learning_rate": 3.6501625943278805e-05, "loss": 0.3355, "step": 85 }, { "epoch": 2.1951219512195124, "grad_norm": 0.07206975668668747, "learning_rate": 3.456708580912725e-05, "loss": 0.3394, "step": 90 }, { "epoch": 2.317073170731707, "grad_norm": 0.05374612286686897, "learning_rate": 3.25628637720269e-05, "loss": 0.3462, "step": 95 }, { "epoch": 2.4390243902439024, "grad_norm": 0.050639085471630096, "learning_rate": 3.0503557590194143e-05, "loss": 0.3246, "step": 100 }, { "epoch": 2.5609756097560976, "grad_norm": 0.06731884926557541, "learning_rate": 2.840416622740617e-05, "loss": 0.2627, "step": 105 }, { "epoch": 2.682926829268293, "grad_norm": 0.047202687710523605, "learning_rate": 2.6279980607995836e-05, "loss": 0.268, "step": 110 }, { "epoch": 2.8048780487804876, "grad_norm": 0.045556213706731796, "learning_rate": 2.4146472245350805e-05, "loss": 0.2513, "step": 115 }, { "epoch": 2.926829268292683, "grad_norm": 0.043327976018190384, "learning_rate": 2.201918055509173e-05, "loss": 0.2489, "step": 120 }, { "epoch": 3.0, "eval_loss": 0.2567618787288666, "eval_runtime": 13.2824, "eval_samples_per_second": 10.992, "eval_steps_per_second": 0.376, "step": 123 }, { "epoch": 3.048780487804878, "grad_norm": 0.045429013669490814, "learning_rate": 1.991359967368416e-05, "loss": 0.2556, "step": 125 }, { "epoch": 3.1707317073170733, "grad_norm": 0.03991298750042915, "learning_rate": 1.7845065606841472e-05, "loss": 0.2418, "step": 130 }, { "epoch": 3.292682926829268, "grad_norm": 0.04288507252931595, "learning_rate": 1.582864452967359e-05, "loss": 0.2455, "step": 135 }, { "epoch": 3.4146341463414633, "grad_norm": 0.04243966192007065, "learning_rate": 1.3879023052147899e-05, "loss": 0.2494, "step": 140 }, { "epoch": 3.5365853658536586, "grad_norm": 0.041205402463674545, "learning_rate": 1.2010401249114167e-05, "loss": 0.241, "step": 145 }, { "epoch": 3.658536585365854, "grad_norm": 0.04464095085859299, "learning_rate": 1.0236389234009727e-05, "loss": 0.2512, "step": 150 }, { "epoch": 3.7804878048780486, "grad_norm": 0.04085472226142883, "learning_rate": 8.569908029550685e-06, "loss": 0.2461, "step": 155 }, { "epoch": 3.902439024390244, "grad_norm": 0.041141241788864136, "learning_rate": 7.02309545741773e-06, "loss": 0.2468, "step": 160 }, { "epoch": 4.0, "eval_loss": 0.2538050413131714, "eval_runtime": 11.622, "eval_samples_per_second": 12.562, "eval_steps_per_second": 0.43, "step": 164 }, { "epoch": 4.024390243902439, "grad_norm": 0.042266517877578735, "learning_rate": 5.607217732389503e-06, "loss": 0.2496, "step": 165 }, { "epoch": 4.146341463414634, "grad_norm": 0.039677053689956665, "learning_rate": 4.332587404827854e-06, "loss": 0.2429, "step": 170 }, { "epoch": 4.2682926829268295, "grad_norm": 0.0383627712726593, "learning_rate": 3.208488249181216e-06, "loss": 0.2452, "step": 175 }, { "epoch": 4.390243902439025, "grad_norm": 0.039468664675951004, "learning_rate": 2.2431076455809467e-06, "loss": 0.2438, "step": 180 }, { "epoch": 4.512195121951219, "grad_norm": 0.0408608578145504, "learning_rate": 1.44347694702949e-06, "loss": 0.2451, "step": 185 }, { "epoch": 4.634146341463414, "grad_norm": 0.03901852294802666, "learning_rate": 8.154202665162147e-07, "loss": 0.2531, "step": 190 }, { "epoch": 4.7560975609756095, "grad_norm": 0.039755210280418396, "learning_rate": 3.635120570700784e-07, "loss": 0.243, "step": 195 }, { "epoch": 4.878048780487805, "grad_norm": 0.04330005869269371, "learning_rate": 9.104379371500105e-08, "loss": 0.2447, "step": 200 }, { "epoch": 5.0, "grad_norm": 0.04034363478422165, "learning_rate": 0.0, "loss": 0.2454, "step": 205 }, { "epoch": 5.0, "eval_loss": 0.2536194324493408, "eval_runtime": 11.5668, "eval_samples_per_second": 12.622, "eval_steps_per_second": 0.432, "step": 205 }, { "epoch": 5.0, "step": 205, "total_flos": 7.472149011177144e+17, "train_loss": 0.12734018942204917, "train_runtime": 865.1722, "train_samples_per_second": 7.542, "train_steps_per_second": 0.237 } ], "logging_steps": 5, "max_steps": 205, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.472149011177144e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }