|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.06940251375904835, |
|
"eval_steps": 500, |
|
"global_step": 5000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001388050275180967, |
|
"grad_norm": 1.386236310005188, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 2.432, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.002776100550361934, |
|
"grad_norm": 1.2055169343948364, |
|
"learning_rate": 0.00019793814432989693, |
|
"loss": 1.8849, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0041641508255429015, |
|
"grad_norm": 1.1103655099868774, |
|
"learning_rate": 0.00019381443298969073, |
|
"loss": 1.7944, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.005552201100723868, |
|
"grad_norm": 1.1928976774215698, |
|
"learning_rate": 0.00018969072164948454, |
|
"loss": 1.7832, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.006940251375904836, |
|
"grad_norm": 1.2889844179153442, |
|
"learning_rate": 0.00018556701030927837, |
|
"loss": 1.811, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.008328301651085803, |
|
"grad_norm": 1.304612159729004, |
|
"learning_rate": 0.00018144329896907217, |
|
"loss": 1.7718, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.00971635192626677, |
|
"grad_norm": 1.2339240312576294, |
|
"learning_rate": 0.00017731958762886598, |
|
"loss": 1.7757, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.011104402201447736, |
|
"grad_norm": 1.196730136871338, |
|
"learning_rate": 0.0001731958762886598, |
|
"loss": 1.7639, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.012492452476628703, |
|
"grad_norm": 1.1369080543518066, |
|
"learning_rate": 0.00016907216494845361, |
|
"loss": 1.7508, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.013880502751809671, |
|
"grad_norm": 1.1714072227478027, |
|
"learning_rate": 0.00016494845360824742, |
|
"loss": 1.7451, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.015268553026990638, |
|
"grad_norm": 2.0464041233062744, |
|
"learning_rate": 0.00016082474226804125, |
|
"loss": 1.7266, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.016656603302171606, |
|
"grad_norm": 1.2468883991241455, |
|
"learning_rate": 0.00015670103092783506, |
|
"loss": 1.7476, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.018044653577352573, |
|
"grad_norm": 1.3278380632400513, |
|
"learning_rate": 0.00015257731958762886, |
|
"loss": 1.713, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.01943270385253354, |
|
"grad_norm": 1.83975088596344, |
|
"learning_rate": 0.0001484536082474227, |
|
"loss": 1.712, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.020820754127714506, |
|
"grad_norm": 1.1055878400802612, |
|
"learning_rate": 0.0001443298969072165, |
|
"loss": 1.7305, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.022208804402895473, |
|
"grad_norm": 1.2078220844268799, |
|
"learning_rate": 0.0001402061855670103, |
|
"loss": 1.7094, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.02359685467807644, |
|
"grad_norm": 1.9538626670837402, |
|
"learning_rate": 0.00013608247422680414, |
|
"loss": 1.7252, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.024984904953257406, |
|
"grad_norm": 1.149594783782959, |
|
"learning_rate": 0.00013195876288659794, |
|
"loss": 1.7108, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.026372955228438372, |
|
"grad_norm": 1.2434228658676147, |
|
"learning_rate": 0.00012783505154639175, |
|
"loss": 1.7106, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.027761005503619342, |
|
"grad_norm": 1.0074732303619385, |
|
"learning_rate": 0.00012371134020618558, |
|
"loss": 1.7222, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.02914905577880031, |
|
"grad_norm": 1.2611148357391357, |
|
"learning_rate": 0.00011958762886597938, |
|
"loss": 1.6937, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.030537106053981276, |
|
"grad_norm": 1.0606039762496948, |
|
"learning_rate": 0.00011546391752577319, |
|
"loss": 1.6904, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.03192515632916224, |
|
"grad_norm": 1.202054500579834, |
|
"learning_rate": 0.00011134020618556702, |
|
"loss": 1.7305, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.03331320660434321, |
|
"grad_norm": 0.9994720220565796, |
|
"learning_rate": 0.00010721649484536083, |
|
"loss": 1.7045, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.034701256879524175, |
|
"grad_norm": 1.222708821296692, |
|
"learning_rate": 0.00010309278350515463, |
|
"loss": 1.6707, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.036089307154705146, |
|
"grad_norm": 1.170048475265503, |
|
"learning_rate": 9.896907216494846e-05, |
|
"loss": 1.6999, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.03747735742988611, |
|
"grad_norm": 1.3302826881408691, |
|
"learning_rate": 9.484536082474227e-05, |
|
"loss": 1.6899, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.03886540770506708, |
|
"grad_norm": 1.1185508966445923, |
|
"learning_rate": 9.072164948453609e-05, |
|
"loss": 1.6899, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.04025345798024804, |
|
"grad_norm": 1.6227563619613647, |
|
"learning_rate": 8.65979381443299e-05, |
|
"loss": 1.6844, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.04164150825542901, |
|
"grad_norm": 1.4536927938461304, |
|
"learning_rate": 8.247422680412371e-05, |
|
"loss": 1.6873, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.04302955853060998, |
|
"grad_norm": 1.2233431339263916, |
|
"learning_rate": 7.835051546391753e-05, |
|
"loss": 1.656, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.044417608805790945, |
|
"grad_norm": 1.3101099729537964, |
|
"learning_rate": 7.422680412371135e-05, |
|
"loss": 1.6722, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.045805659080971915, |
|
"grad_norm": 1.224885106086731, |
|
"learning_rate": 7.010309278350515e-05, |
|
"loss": 1.6536, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.04719370935615288, |
|
"grad_norm": 1.1796605587005615, |
|
"learning_rate": 6.597938144329897e-05, |
|
"loss": 1.6783, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.04858175963133385, |
|
"grad_norm": 1.2505239248275757, |
|
"learning_rate": 6.185567010309279e-05, |
|
"loss": 1.676, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.04996980990651481, |
|
"grad_norm": 0.9648654460906982, |
|
"learning_rate": 5.7731958762886594e-05, |
|
"loss": 1.6617, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.05135786018169578, |
|
"grad_norm": 1.0220248699188232, |
|
"learning_rate": 5.360824742268041e-05, |
|
"loss": 1.6611, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.052745910456876745, |
|
"grad_norm": 1.0856778621673584, |
|
"learning_rate": 4.948453608247423e-05, |
|
"loss": 1.6608, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.054133960732057715, |
|
"grad_norm": 1.439794659614563, |
|
"learning_rate": 4.536082474226804e-05, |
|
"loss": 1.6693, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.055522011007238685, |
|
"grad_norm": 1.1624224185943604, |
|
"learning_rate": 4.1237113402061855e-05, |
|
"loss": 1.6447, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.05691006128241965, |
|
"grad_norm": 1.0445908308029175, |
|
"learning_rate": 3.7113402061855674e-05, |
|
"loss": 1.6458, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.05829811155760062, |
|
"grad_norm": 1.2009379863739014, |
|
"learning_rate": 3.2989690721649485e-05, |
|
"loss": 1.6362, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.05968616183278158, |
|
"grad_norm": 1.1339406967163086, |
|
"learning_rate": 2.8865979381443297e-05, |
|
"loss": 1.6605, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.06107421210796255, |
|
"grad_norm": 1.1409072875976562, |
|
"learning_rate": 2.4742268041237116e-05, |
|
"loss": 1.6582, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.062462262383143514, |
|
"grad_norm": 1.0138684511184692, |
|
"learning_rate": 2.0618556701030927e-05, |
|
"loss": 1.6539, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.06385031265832448, |
|
"grad_norm": 1.0418405532836914, |
|
"learning_rate": 1.6494845360824743e-05, |
|
"loss": 1.6676, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.06523836293350545, |
|
"grad_norm": 1.0475600957870483, |
|
"learning_rate": 1.2371134020618558e-05, |
|
"loss": 1.6416, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.06662641320868642, |
|
"grad_norm": 1.334047555923462, |
|
"learning_rate": 8.247422680412371e-06, |
|
"loss": 1.6297, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.06801446348386739, |
|
"grad_norm": 1.0327249765396118, |
|
"learning_rate": 4.123711340206186e-06, |
|
"loss": 1.6329, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.06940251375904835, |
|
"grad_norm": 1.007521390914917, |
|
"learning_rate": 0.0, |
|
"loss": 1.6405, |
|
"step": 5000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 5000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1538290661520589e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|