|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 6.802228412256268, |
|
"eval_steps": 30, |
|
"global_step": 300, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.22284122562674094, |
|
"grad_norm": 0.2538502514362335, |
|
"learning_rate": 2.785515320334262e-07, |
|
"loss": 0.0283, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.4456824512534819, |
|
"grad_norm": 0.13635846972465515, |
|
"learning_rate": 5.571030640668524e-07, |
|
"loss": 0.0344, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.6685236768802229, |
|
"grad_norm": 0.25132906436920166, |
|
"learning_rate": 8.356545961002786e-07, |
|
"loss": 0.0305, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.6685236768802229, |
|
"eval_loss": 0.030975496396422386, |
|
"eval_runtime": 6.5316, |
|
"eval_samples_per_second": 229.652, |
|
"eval_steps_per_second": 14.392, |
|
"eval_sts_dev_pearson_cosine": 0.7953296758719961, |
|
"eval_sts_dev_pearson_dot": 0.6855921619048916, |
|
"eval_sts_dev_pearson_euclidean": 0.7647603423822984, |
|
"eval_sts_dev_pearson_manhattan": 0.7662305710281121, |
|
"eval_sts_dev_pearson_max": 0.7953296758719961, |
|
"eval_sts_dev_spearman_cosine": 0.7938998183894888, |
|
"eval_sts_dev_spearman_dot": 0.6701160606364611, |
|
"eval_sts_dev_spearman_euclidean": 0.764275064463694, |
|
"eval_sts_dev_spearman_manhattan": 0.7663956716038323, |
|
"eval_sts_dev_spearman_max": 0.7938998183894888, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.8913649025069638, |
|
"grad_norm": 0.2590219974517822, |
|
"learning_rate": 1.1142061281337048e-06, |
|
"loss": 0.0489, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.1337047353760445, |
|
"grad_norm": 0.2477671355009079, |
|
"learning_rate": 1.392757660167131e-06, |
|
"loss": 0.0382, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.3565459610027855, |
|
"grad_norm": 0.2230578511953354, |
|
"learning_rate": 1.6713091922005572e-06, |
|
"loss": 0.0271, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.3565459610027855, |
|
"eval_loss": 0.02927413582801819, |
|
"eval_runtime": 6.1022, |
|
"eval_samples_per_second": 245.812, |
|
"eval_steps_per_second": 15.404, |
|
"eval_sts_dev_pearson_cosine": 0.8001627825550413, |
|
"eval_sts_dev_pearson_dot": 0.7013280153939746, |
|
"eval_sts_dev_pearson_euclidean": 0.7629781135707555, |
|
"eval_sts_dev_pearson_manhattan": 0.7647370302448242, |
|
"eval_sts_dev_pearson_max": 0.8001627825550413, |
|
"eval_sts_dev_spearman_cosine": 0.7994084764965521, |
|
"eval_sts_dev_spearman_dot": 0.6877298483304968, |
|
"eval_sts_dev_spearman_euclidean": 0.7623008729981257, |
|
"eval_sts_dev_spearman_manhattan": 0.7650295208380897, |
|
"eval_sts_dev_spearman_max": 0.7994084764965521, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.5793871866295266, |
|
"grad_norm": 0.23978064954280853, |
|
"learning_rate": 1.9498607242339835e-06, |
|
"loss": 0.0344, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.8022284122562673, |
|
"grad_norm": 0.2269248366355896, |
|
"learning_rate": 2.2284122562674097e-06, |
|
"loss": 0.0382, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.0445682451253484, |
|
"grad_norm": 0.1311478465795517, |
|
"learning_rate": 2.506963788300836e-06, |
|
"loss": 0.0419, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.0445682451253484, |
|
"eval_loss": 0.0279527697712183, |
|
"eval_runtime": 6.1525, |
|
"eval_samples_per_second": 243.802, |
|
"eval_steps_per_second": 15.278, |
|
"eval_sts_dev_pearson_cosine": 0.8052740525083868, |
|
"eval_sts_dev_pearson_dot": 0.7129779531910554, |
|
"eval_sts_dev_pearson_euclidean": 0.7630256540163647, |
|
"eval_sts_dev_pearson_manhattan": 0.7649555842254796, |
|
"eval_sts_dev_pearson_max": 0.8052740525083868, |
|
"eval_sts_dev_spearman_cosine": 0.805932936440032, |
|
"eval_sts_dev_spearman_dot": 0.7013448783489886, |
|
"eval_sts_dev_spearman_euclidean": 0.762706783236441, |
|
"eval_sts_dev_spearman_manhattan": 0.7655443912587759, |
|
"eval_sts_dev_spearman_max": 0.805932936440032, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.267409470752089, |
|
"grad_norm": 0.15666936337947845, |
|
"learning_rate": 2.785515320334262e-06, |
|
"loss": 0.0244, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.4902506963788302, |
|
"grad_norm": 0.14549851417541504, |
|
"learning_rate": 3.064066852367688e-06, |
|
"loss": 0.0307, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.713091922005571, |
|
"grad_norm": 0.20197178423404694, |
|
"learning_rate": 3.3426183844011143e-06, |
|
"loss": 0.0291, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.713091922005571, |
|
"eval_loss": 0.02694467455148697, |
|
"eval_runtime": 6.528, |
|
"eval_samples_per_second": 229.78, |
|
"eval_steps_per_second": 14.4, |
|
"eval_sts_dev_pearson_cosine": 0.8095317257793349, |
|
"eval_sts_dev_pearson_dot": 0.7228217786137938, |
|
"eval_sts_dev_pearson_euclidean": 0.7635943588878411, |
|
"eval_sts_dev_pearson_manhattan": 0.7656672001584354, |
|
"eval_sts_dev_pearson_max": 0.8095317257793349, |
|
"eval_sts_dev_spearman_cosine": 0.8107539995821735, |
|
"eval_sts_dev_spearman_dot": 0.7126247484390617, |
|
"eval_sts_dev_spearman_euclidean": 0.7634838306489425, |
|
"eval_sts_dev_spearman_manhattan": 0.7664168478564297, |
|
"eval_sts_dev_spearman_max": 0.8107539995821735, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.935933147632312, |
|
"grad_norm": 0.2107369303703308, |
|
"learning_rate": 3.6211699164345405e-06, |
|
"loss": 0.038, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 3.1782729805013927, |
|
"grad_norm": 0.15846215188503265, |
|
"learning_rate": 3.899721448467967e-06, |
|
"loss": 0.0269, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.401114206128134, |
|
"grad_norm": 0.17715278267860413, |
|
"learning_rate": 4.178272980501394e-06, |
|
"loss": 0.0268, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.401114206128134, |
|
"eval_loss": 0.026173867285251617, |
|
"eval_runtime": 6.306, |
|
"eval_samples_per_second": 237.869, |
|
"eval_steps_per_second": 14.906, |
|
"eval_sts_dev_pearson_cosine": 0.8136326182189031, |
|
"eval_sts_dev_pearson_dot": 0.7289342393989602, |
|
"eval_sts_dev_pearson_euclidean": 0.7658102043154281, |
|
"eval_sts_dev_pearson_manhattan": 0.7680399446033591, |
|
"eval_sts_dev_pearson_max": 0.8136326182189031, |
|
"eval_sts_dev_spearman_cosine": 0.8154563967795785, |
|
"eval_sts_dev_spearman_dot": 0.7204276033712009, |
|
"eval_sts_dev_spearman_euclidean": 0.7661516256266799, |
|
"eval_sts_dev_spearman_manhattan": 0.7692973830139536, |
|
"eval_sts_dev_spearman_max": 0.8154563967795785, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.6239554317548746, |
|
"grad_norm": 0.1337411254644394, |
|
"learning_rate": 4.456824512534819e-06, |
|
"loss": 0.0246, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.8467966573816157, |
|
"grad_norm": 0.20471176505088806, |
|
"learning_rate": 4.735376044568246e-06, |
|
"loss": 0.0313, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 4.089136490250697, |
|
"grad_norm": 0.12327426671981812, |
|
"learning_rate": 5.013927576601672e-06, |
|
"loss": 0.0303, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 4.089136490250697, |
|
"eval_loss": 0.02586401253938675, |
|
"eval_runtime": 6.8399, |
|
"eval_samples_per_second": 219.3, |
|
"eval_steps_per_second": 13.743, |
|
"eval_sts_dev_pearson_cosine": 0.8163121986548724, |
|
"eval_sts_dev_pearson_dot": 0.7330841259509188, |
|
"eval_sts_dev_pearson_euclidean": 0.7674859088604027, |
|
"eval_sts_dev_pearson_manhattan": 0.7697974598144367, |
|
"eval_sts_dev_pearson_max": 0.8163121986548724, |
|
"eval_sts_dev_spearman_cosine": 0.8184908732804921, |
|
"eval_sts_dev_spearman_dot": 0.7250521959658871, |
|
"eval_sts_dev_spearman_euclidean": 0.7684563123887144, |
|
"eval_sts_dev_spearman_manhattan": 0.7715573641686395, |
|
"eval_sts_dev_spearman_max": 0.8184908732804921, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 4.311977715877437, |
|
"grad_norm": 0.11181030422449112, |
|
"learning_rate": 5.292479108635098e-06, |
|
"loss": 0.0198, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 4.534818941504178, |
|
"grad_norm": 0.11830934137105942, |
|
"learning_rate": 5.571030640668524e-06, |
|
"loss": 0.0257, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.757660167130919, |
|
"grad_norm": 0.1775977462530136, |
|
"learning_rate": 5.849582172701951e-06, |
|
"loss": 0.0242, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.757660167130919, |
|
"eval_loss": 0.02551957406103611, |
|
"eval_runtime": 6.4245, |
|
"eval_samples_per_second": 233.481, |
|
"eval_steps_per_second": 14.631, |
|
"eval_sts_dev_pearson_cosine": 0.8184173000480589, |
|
"eval_sts_dev_pearson_dot": 0.7369533513611706, |
|
"eval_sts_dev_pearson_euclidean": 0.7687482582532739, |
|
"eval_sts_dev_pearson_manhattan": 0.7712300663924829, |
|
"eval_sts_dev_pearson_max": 0.8184173000480589, |
|
"eval_sts_dev_spearman_cosine": 0.8201930470486518, |
|
"eval_sts_dev_spearman_dot": 0.7292325959243812, |
|
"eval_sts_dev_spearman_euclidean": 0.7696170592602297, |
|
"eval_sts_dev_spearman_manhattan": 0.7729809111066369, |
|
"eval_sts_dev_spearman_max": 0.8201930470486518, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.9805013927576605, |
|
"grad_norm": 0.23354189097881317, |
|
"learning_rate": 6.128133704735376e-06, |
|
"loss": 0.0293, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 5.222841225626741, |
|
"grad_norm": 0.12718431651592255, |
|
"learning_rate": 6.406685236768803e-06, |
|
"loss": 0.0193, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 5.445682451253482, |
|
"grad_norm": 0.1111082211136818, |
|
"learning_rate": 6.685236768802229e-06, |
|
"loss": 0.0222, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 5.445682451253482, |
|
"eval_loss": 0.02539980411529541, |
|
"eval_runtime": 6.3582, |
|
"eval_samples_per_second": 235.915, |
|
"eval_steps_per_second": 14.784, |
|
"eval_sts_dev_pearson_cosine": 0.8203051470878093, |
|
"eval_sts_dev_pearson_dot": 0.7391973842870876, |
|
"eval_sts_dev_pearson_euclidean": 0.7710328054708023, |
|
"eval_sts_dev_pearson_manhattan": 0.7734981812206646, |
|
"eval_sts_dev_pearson_max": 0.8203051470878093, |
|
"eval_sts_dev_spearman_cosine": 0.8222047787628998, |
|
"eval_sts_dev_spearman_dot": 0.7306726496212352, |
|
"eval_sts_dev_spearman_euclidean": 0.7721080064054946, |
|
"eval_sts_dev_spearman_manhattan": 0.7758967012553709, |
|
"eval_sts_dev_spearman_max": 0.8222047787628998, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 5.6685236768802225, |
|
"grad_norm": 0.167997807264328, |
|
"learning_rate": 6.963788300835655e-06, |
|
"loss": 0.0184, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 5.891364902506964, |
|
"grad_norm": 0.18360492587089539, |
|
"learning_rate": 7.242339832869081e-06, |
|
"loss": 0.0243, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 6.133704735376044, |
|
"grad_norm": 0.11399545520544052, |
|
"learning_rate": 7.5208913649025075e-06, |
|
"loss": 0.0204, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 6.133704735376044, |
|
"eval_loss": 0.025426626205444336, |
|
"eval_runtime": 6.3377, |
|
"eval_samples_per_second": 236.678, |
|
"eval_steps_per_second": 14.832, |
|
"eval_sts_dev_pearson_cosine": 0.8215923043460271, |
|
"eval_sts_dev_pearson_dot": 0.7427941063103285, |
|
"eval_sts_dev_pearson_euclidean": 0.7725242056053008, |
|
"eval_sts_dev_pearson_manhattan": 0.7749558209132376, |
|
"eval_sts_dev_pearson_max": 0.8215923043460271, |
|
"eval_sts_dev_spearman_cosine": 0.8234628421089484, |
|
"eval_sts_dev_spearman_dot": 0.7343279809432616, |
|
"eval_sts_dev_spearman_euclidean": 0.7742054612821838, |
|
"eval_sts_dev_spearman_manhattan": 0.777339758218875, |
|
"eval_sts_dev_spearman_max": 0.8234628421089484, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 6.3565459610027855, |
|
"grad_norm": 0.14734485745429993, |
|
"learning_rate": 7.799442896935934e-06, |
|
"loss": 0.0147, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 6.579387186629527, |
|
"grad_norm": 0.14232878386974335, |
|
"learning_rate": 8.07799442896936e-06, |
|
"loss": 0.0196, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 6.802228412256268, |
|
"grad_norm": 0.12475496530532837, |
|
"learning_rate": 8.356545961002787e-06, |
|
"loss": 0.0176, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.802228412256268, |
|
"eval_loss": 0.025328340008854866, |
|
"eval_runtime": 6.1771, |
|
"eval_samples_per_second": 242.832, |
|
"eval_steps_per_second": 15.217, |
|
"eval_sts_dev_pearson_cosine": 0.8219368394963247, |
|
"eval_sts_dev_pearson_dot": 0.7469111462936613, |
|
"eval_sts_dev_pearson_euclidean": 0.7729334760561297, |
|
"eval_sts_dev_pearson_manhattan": 0.7754957053869553, |
|
"eval_sts_dev_pearson_max": 0.8219368394963247, |
|
"eval_sts_dev_spearman_cosine": 0.8227360781964935, |
|
"eval_sts_dev_spearman_dot": 0.7392541828806165, |
|
"eval_sts_dev_spearman_euclidean": 0.7748490630523356, |
|
"eval_sts_dev_spearman_manhattan": 0.7782586536188661, |
|
"eval_sts_dev_spearman_max": 0.8227360781964935, |
|
"step": 300 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 440, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 300, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|