CocoRoF's picture
Training in progress, step 300, checkpoint
b9918f5 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.802228412256268,
"eval_steps": 30,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.22284122562674094,
"grad_norm": 0.2538502514362335,
"learning_rate": 2.785515320334262e-07,
"loss": 0.0283,
"step": 10
},
{
"epoch": 0.4456824512534819,
"grad_norm": 0.13635846972465515,
"learning_rate": 5.571030640668524e-07,
"loss": 0.0344,
"step": 20
},
{
"epoch": 0.6685236768802229,
"grad_norm": 0.25132906436920166,
"learning_rate": 8.356545961002786e-07,
"loss": 0.0305,
"step": 30
},
{
"epoch": 0.6685236768802229,
"eval_loss": 0.030975496396422386,
"eval_runtime": 6.5316,
"eval_samples_per_second": 229.652,
"eval_steps_per_second": 14.392,
"eval_sts_dev_pearson_cosine": 0.7953296758719961,
"eval_sts_dev_pearson_dot": 0.6855921619048916,
"eval_sts_dev_pearson_euclidean": 0.7647603423822984,
"eval_sts_dev_pearson_manhattan": 0.7662305710281121,
"eval_sts_dev_pearson_max": 0.7953296758719961,
"eval_sts_dev_spearman_cosine": 0.7938998183894888,
"eval_sts_dev_spearman_dot": 0.6701160606364611,
"eval_sts_dev_spearman_euclidean": 0.764275064463694,
"eval_sts_dev_spearman_manhattan": 0.7663956716038323,
"eval_sts_dev_spearman_max": 0.7938998183894888,
"step": 30
},
{
"epoch": 0.8913649025069638,
"grad_norm": 0.2590219974517822,
"learning_rate": 1.1142061281337048e-06,
"loss": 0.0489,
"step": 40
},
{
"epoch": 1.1337047353760445,
"grad_norm": 0.2477671355009079,
"learning_rate": 1.392757660167131e-06,
"loss": 0.0382,
"step": 50
},
{
"epoch": 1.3565459610027855,
"grad_norm": 0.2230578511953354,
"learning_rate": 1.6713091922005572e-06,
"loss": 0.0271,
"step": 60
},
{
"epoch": 1.3565459610027855,
"eval_loss": 0.02927413582801819,
"eval_runtime": 6.1022,
"eval_samples_per_second": 245.812,
"eval_steps_per_second": 15.404,
"eval_sts_dev_pearson_cosine": 0.8001627825550413,
"eval_sts_dev_pearson_dot": 0.7013280153939746,
"eval_sts_dev_pearson_euclidean": 0.7629781135707555,
"eval_sts_dev_pearson_manhattan": 0.7647370302448242,
"eval_sts_dev_pearson_max": 0.8001627825550413,
"eval_sts_dev_spearman_cosine": 0.7994084764965521,
"eval_sts_dev_spearman_dot": 0.6877298483304968,
"eval_sts_dev_spearman_euclidean": 0.7623008729981257,
"eval_sts_dev_spearman_manhattan": 0.7650295208380897,
"eval_sts_dev_spearman_max": 0.7994084764965521,
"step": 60
},
{
"epoch": 1.5793871866295266,
"grad_norm": 0.23978064954280853,
"learning_rate": 1.9498607242339835e-06,
"loss": 0.0344,
"step": 70
},
{
"epoch": 1.8022284122562673,
"grad_norm": 0.2269248366355896,
"learning_rate": 2.2284122562674097e-06,
"loss": 0.0382,
"step": 80
},
{
"epoch": 2.0445682451253484,
"grad_norm": 0.1311478465795517,
"learning_rate": 2.506963788300836e-06,
"loss": 0.0419,
"step": 90
},
{
"epoch": 2.0445682451253484,
"eval_loss": 0.0279527697712183,
"eval_runtime": 6.1525,
"eval_samples_per_second": 243.802,
"eval_steps_per_second": 15.278,
"eval_sts_dev_pearson_cosine": 0.8052740525083868,
"eval_sts_dev_pearson_dot": 0.7129779531910554,
"eval_sts_dev_pearson_euclidean": 0.7630256540163647,
"eval_sts_dev_pearson_manhattan": 0.7649555842254796,
"eval_sts_dev_pearson_max": 0.8052740525083868,
"eval_sts_dev_spearman_cosine": 0.805932936440032,
"eval_sts_dev_spearman_dot": 0.7013448783489886,
"eval_sts_dev_spearman_euclidean": 0.762706783236441,
"eval_sts_dev_spearman_manhattan": 0.7655443912587759,
"eval_sts_dev_spearman_max": 0.805932936440032,
"step": 90
},
{
"epoch": 2.267409470752089,
"grad_norm": 0.15666936337947845,
"learning_rate": 2.785515320334262e-06,
"loss": 0.0244,
"step": 100
},
{
"epoch": 2.4902506963788302,
"grad_norm": 0.14549851417541504,
"learning_rate": 3.064066852367688e-06,
"loss": 0.0307,
"step": 110
},
{
"epoch": 2.713091922005571,
"grad_norm": 0.20197178423404694,
"learning_rate": 3.3426183844011143e-06,
"loss": 0.0291,
"step": 120
},
{
"epoch": 2.713091922005571,
"eval_loss": 0.02694467455148697,
"eval_runtime": 6.528,
"eval_samples_per_second": 229.78,
"eval_steps_per_second": 14.4,
"eval_sts_dev_pearson_cosine": 0.8095317257793349,
"eval_sts_dev_pearson_dot": 0.7228217786137938,
"eval_sts_dev_pearson_euclidean": 0.7635943588878411,
"eval_sts_dev_pearson_manhattan": 0.7656672001584354,
"eval_sts_dev_pearson_max": 0.8095317257793349,
"eval_sts_dev_spearman_cosine": 0.8107539995821735,
"eval_sts_dev_spearman_dot": 0.7126247484390617,
"eval_sts_dev_spearman_euclidean": 0.7634838306489425,
"eval_sts_dev_spearman_manhattan": 0.7664168478564297,
"eval_sts_dev_spearman_max": 0.8107539995821735,
"step": 120
},
{
"epoch": 2.935933147632312,
"grad_norm": 0.2107369303703308,
"learning_rate": 3.6211699164345405e-06,
"loss": 0.038,
"step": 130
},
{
"epoch": 3.1782729805013927,
"grad_norm": 0.15846215188503265,
"learning_rate": 3.899721448467967e-06,
"loss": 0.0269,
"step": 140
},
{
"epoch": 3.401114206128134,
"grad_norm": 0.17715278267860413,
"learning_rate": 4.178272980501394e-06,
"loss": 0.0268,
"step": 150
},
{
"epoch": 3.401114206128134,
"eval_loss": 0.026173867285251617,
"eval_runtime": 6.306,
"eval_samples_per_second": 237.869,
"eval_steps_per_second": 14.906,
"eval_sts_dev_pearson_cosine": 0.8136326182189031,
"eval_sts_dev_pearson_dot": 0.7289342393989602,
"eval_sts_dev_pearson_euclidean": 0.7658102043154281,
"eval_sts_dev_pearson_manhattan": 0.7680399446033591,
"eval_sts_dev_pearson_max": 0.8136326182189031,
"eval_sts_dev_spearman_cosine": 0.8154563967795785,
"eval_sts_dev_spearman_dot": 0.7204276033712009,
"eval_sts_dev_spearman_euclidean": 0.7661516256266799,
"eval_sts_dev_spearman_manhattan": 0.7692973830139536,
"eval_sts_dev_spearman_max": 0.8154563967795785,
"step": 150
},
{
"epoch": 3.6239554317548746,
"grad_norm": 0.1337411254644394,
"learning_rate": 4.456824512534819e-06,
"loss": 0.0246,
"step": 160
},
{
"epoch": 3.8467966573816157,
"grad_norm": 0.20471176505088806,
"learning_rate": 4.735376044568246e-06,
"loss": 0.0313,
"step": 170
},
{
"epoch": 4.089136490250697,
"grad_norm": 0.12327426671981812,
"learning_rate": 5.013927576601672e-06,
"loss": 0.0303,
"step": 180
},
{
"epoch": 4.089136490250697,
"eval_loss": 0.02586401253938675,
"eval_runtime": 6.8399,
"eval_samples_per_second": 219.3,
"eval_steps_per_second": 13.743,
"eval_sts_dev_pearson_cosine": 0.8163121986548724,
"eval_sts_dev_pearson_dot": 0.7330841259509188,
"eval_sts_dev_pearson_euclidean": 0.7674859088604027,
"eval_sts_dev_pearson_manhattan": 0.7697974598144367,
"eval_sts_dev_pearson_max": 0.8163121986548724,
"eval_sts_dev_spearman_cosine": 0.8184908732804921,
"eval_sts_dev_spearman_dot": 0.7250521959658871,
"eval_sts_dev_spearman_euclidean": 0.7684563123887144,
"eval_sts_dev_spearman_manhattan": 0.7715573641686395,
"eval_sts_dev_spearman_max": 0.8184908732804921,
"step": 180
},
{
"epoch": 4.311977715877437,
"grad_norm": 0.11181030422449112,
"learning_rate": 5.292479108635098e-06,
"loss": 0.0198,
"step": 190
},
{
"epoch": 4.534818941504178,
"grad_norm": 0.11830934137105942,
"learning_rate": 5.571030640668524e-06,
"loss": 0.0257,
"step": 200
},
{
"epoch": 4.757660167130919,
"grad_norm": 0.1775977462530136,
"learning_rate": 5.849582172701951e-06,
"loss": 0.0242,
"step": 210
},
{
"epoch": 4.757660167130919,
"eval_loss": 0.02551957406103611,
"eval_runtime": 6.4245,
"eval_samples_per_second": 233.481,
"eval_steps_per_second": 14.631,
"eval_sts_dev_pearson_cosine": 0.8184173000480589,
"eval_sts_dev_pearson_dot": 0.7369533513611706,
"eval_sts_dev_pearson_euclidean": 0.7687482582532739,
"eval_sts_dev_pearson_manhattan": 0.7712300663924829,
"eval_sts_dev_pearson_max": 0.8184173000480589,
"eval_sts_dev_spearman_cosine": 0.8201930470486518,
"eval_sts_dev_spearman_dot": 0.7292325959243812,
"eval_sts_dev_spearman_euclidean": 0.7696170592602297,
"eval_sts_dev_spearman_manhattan": 0.7729809111066369,
"eval_sts_dev_spearman_max": 0.8201930470486518,
"step": 210
},
{
"epoch": 4.9805013927576605,
"grad_norm": 0.23354189097881317,
"learning_rate": 6.128133704735376e-06,
"loss": 0.0293,
"step": 220
},
{
"epoch": 5.222841225626741,
"grad_norm": 0.12718431651592255,
"learning_rate": 6.406685236768803e-06,
"loss": 0.0193,
"step": 230
},
{
"epoch": 5.445682451253482,
"grad_norm": 0.1111082211136818,
"learning_rate": 6.685236768802229e-06,
"loss": 0.0222,
"step": 240
},
{
"epoch": 5.445682451253482,
"eval_loss": 0.02539980411529541,
"eval_runtime": 6.3582,
"eval_samples_per_second": 235.915,
"eval_steps_per_second": 14.784,
"eval_sts_dev_pearson_cosine": 0.8203051470878093,
"eval_sts_dev_pearson_dot": 0.7391973842870876,
"eval_sts_dev_pearson_euclidean": 0.7710328054708023,
"eval_sts_dev_pearson_manhattan": 0.7734981812206646,
"eval_sts_dev_pearson_max": 0.8203051470878093,
"eval_sts_dev_spearman_cosine": 0.8222047787628998,
"eval_sts_dev_spearman_dot": 0.7306726496212352,
"eval_sts_dev_spearman_euclidean": 0.7721080064054946,
"eval_sts_dev_spearman_manhattan": 0.7758967012553709,
"eval_sts_dev_spearman_max": 0.8222047787628998,
"step": 240
},
{
"epoch": 5.6685236768802225,
"grad_norm": 0.167997807264328,
"learning_rate": 6.963788300835655e-06,
"loss": 0.0184,
"step": 250
},
{
"epoch": 5.891364902506964,
"grad_norm": 0.18360492587089539,
"learning_rate": 7.242339832869081e-06,
"loss": 0.0243,
"step": 260
},
{
"epoch": 6.133704735376044,
"grad_norm": 0.11399545520544052,
"learning_rate": 7.5208913649025075e-06,
"loss": 0.0204,
"step": 270
},
{
"epoch": 6.133704735376044,
"eval_loss": 0.025426626205444336,
"eval_runtime": 6.3377,
"eval_samples_per_second": 236.678,
"eval_steps_per_second": 14.832,
"eval_sts_dev_pearson_cosine": 0.8215923043460271,
"eval_sts_dev_pearson_dot": 0.7427941063103285,
"eval_sts_dev_pearson_euclidean": 0.7725242056053008,
"eval_sts_dev_pearson_manhattan": 0.7749558209132376,
"eval_sts_dev_pearson_max": 0.8215923043460271,
"eval_sts_dev_spearman_cosine": 0.8234628421089484,
"eval_sts_dev_spearman_dot": 0.7343279809432616,
"eval_sts_dev_spearman_euclidean": 0.7742054612821838,
"eval_sts_dev_spearman_manhattan": 0.777339758218875,
"eval_sts_dev_spearman_max": 0.8234628421089484,
"step": 270
},
{
"epoch": 6.3565459610027855,
"grad_norm": 0.14734485745429993,
"learning_rate": 7.799442896935934e-06,
"loss": 0.0147,
"step": 280
},
{
"epoch": 6.579387186629527,
"grad_norm": 0.14232878386974335,
"learning_rate": 8.07799442896936e-06,
"loss": 0.0196,
"step": 290
},
{
"epoch": 6.802228412256268,
"grad_norm": 0.12475496530532837,
"learning_rate": 8.356545961002787e-06,
"loss": 0.0176,
"step": 300
},
{
"epoch": 6.802228412256268,
"eval_loss": 0.025328340008854866,
"eval_runtime": 6.1771,
"eval_samples_per_second": 242.832,
"eval_steps_per_second": 15.217,
"eval_sts_dev_pearson_cosine": 0.8219368394963247,
"eval_sts_dev_pearson_dot": 0.7469111462936613,
"eval_sts_dev_pearson_euclidean": 0.7729334760561297,
"eval_sts_dev_pearson_manhattan": 0.7754957053869553,
"eval_sts_dev_pearson_max": 0.8219368394963247,
"eval_sts_dev_spearman_cosine": 0.8227360781964935,
"eval_sts_dev_spearman_dot": 0.7392541828806165,
"eval_sts_dev_spearman_euclidean": 0.7748490630523356,
"eval_sts_dev_spearman_manhattan": 0.7782586536188661,
"eval_sts_dev_spearman_max": 0.8227360781964935,
"step": 300
}
],
"logging_steps": 10,
"max_steps": 440,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}