|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.008466603951082, |
|
"eval_steps": 50, |
|
"global_step": 67, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.015051740357478834, |
|
"grad_norm": 0.9344162940979004, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6705, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.015051740357478834, |
|
"eval_loss": 0.6895620226860046, |
|
"eval_runtime": 10.3955, |
|
"eval_samples_per_second": 21.548, |
|
"eval_steps_per_second": 5.387, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.030103480714957668, |
|
"grad_norm": 0.9302381873130798, |
|
"learning_rate": 4e-05, |
|
"loss": 0.6946, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.045155221072436504, |
|
"grad_norm": 0.9359163045883179, |
|
"learning_rate": 6e-05, |
|
"loss": 0.7409, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.060206961429915336, |
|
"grad_norm": 0.9642816781997681, |
|
"learning_rate": 8e-05, |
|
"loss": 0.6928, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.07525870178739416, |
|
"grad_norm": 0.8229894042015076, |
|
"learning_rate": 0.0001, |
|
"loss": 0.577, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.09031044214487301, |
|
"grad_norm": 0.4650634527206421, |
|
"learning_rate": 0.00012, |
|
"loss": 0.5228, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.10536218250235184, |
|
"grad_norm": 0.4150318205356598, |
|
"learning_rate": 0.00014, |
|
"loss": 0.4753, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.12041392285983067, |
|
"grad_norm": 0.5093539357185364, |
|
"learning_rate": 0.00016, |
|
"loss": 0.505, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.1354656632173095, |
|
"grad_norm": 0.4901506006717682, |
|
"learning_rate": 0.00018, |
|
"loss": 0.4358, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.15051740357478832, |
|
"grad_norm": 0.3130745589733124, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4193, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.16556914393226718, |
|
"grad_norm": 0.4450761675834656, |
|
"learning_rate": 0.00019984815164333163, |
|
"loss": 0.3821, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.18062088428974601, |
|
"grad_norm": 0.44579023122787476, |
|
"learning_rate": 0.00019939306773179497, |
|
"loss": 0.3957, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.19567262464722485, |
|
"grad_norm": 0.3342147469520569, |
|
"learning_rate": 0.00019863613034027224, |
|
"loss": 0.3776, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.21072436500470368, |
|
"grad_norm": 0.3169708549976349, |
|
"learning_rate": 0.00019757963826274357, |
|
"loss": 0.4155, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.2257761053621825, |
|
"grad_norm": 0.3119909167289734, |
|
"learning_rate": 0.00019622680003092503, |
|
"loss": 0.404, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.24082784571966134, |
|
"grad_norm": 0.2754630148410797, |
|
"learning_rate": 0.00019458172417006347, |
|
"loss": 0.3648, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.2558795860771402, |
|
"grad_norm": 0.3073170781135559, |
|
"learning_rate": 0.00019264940672148018, |
|
"loss": 0.3565, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.270931326434619, |
|
"grad_norm": 0.2857714295387268, |
|
"learning_rate": 0.00019043571606975777, |
|
"loss": 0.3717, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.28598306679209784, |
|
"grad_norm": 0.25880104303359985, |
|
"learning_rate": 0.0001879473751206489, |
|
"loss": 0.3576, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.30103480714957664, |
|
"grad_norm": 0.2680279314517975, |
|
"learning_rate": 0.00018519194088383273, |
|
"loss": 0.3916, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3160865475070555, |
|
"grad_norm": 0.3286673426628113, |
|
"learning_rate": 0.0001821777815225245, |
|
"loss": 0.4095, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.33113828786453436, |
|
"grad_norm": 0.2846011817455292, |
|
"learning_rate": 0.00017891405093963938, |
|
"loss": 0.3653, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.34619002822201317, |
|
"grad_norm": 0.2800520360469818, |
|
"learning_rate": 0.00017541066097768963, |
|
"loss": 0.3917, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.36124176857949203, |
|
"grad_norm": 0.26333191990852356, |
|
"learning_rate": 0.00017167825131684513, |
|
"loss": 0.3782, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.37629350893697083, |
|
"grad_norm": 0.2338825762271881, |
|
"learning_rate": 0.00016772815716257412, |
|
"loss": 0.3709, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.3913452492944497, |
|
"grad_norm": 0.23630400002002716, |
|
"learning_rate": 0.00016357237482099684, |
|
"loss": 0.3753, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.4063969896519285, |
|
"grad_norm": 0.22607026994228363, |
|
"learning_rate": 0.00015922352526649803, |
|
"loss": 0.356, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.42144873000940736, |
|
"grad_norm": 0.23991024494171143, |
|
"learning_rate": 0.00015469481581224272, |
|
"loss": 0.3531, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.43650047036688616, |
|
"grad_norm": 0.2309068888425827, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.377, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.451552210724365, |
|
"grad_norm": 0.27483415603637695, |
|
"learning_rate": 0.00014515333583108896, |
|
"loss": 0.3769, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.4666039510818438, |
|
"grad_norm": 0.253368616104126, |
|
"learning_rate": 0.00014016954246529696, |
|
"loss": 0.3579, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.4816556914393227, |
|
"grad_norm": 0.2460198998451233, |
|
"learning_rate": 0.00013506375551927547, |
|
"loss": 0.3424, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.4967074317968015, |
|
"grad_norm": 0.24006766080856323, |
|
"learning_rate": 0.00012985148110016947, |
|
"loss": 0.4105, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.5117591721542804, |
|
"grad_norm": 0.242473304271698, |
|
"learning_rate": 0.00012454854871407994, |
|
"loss": 0.3545, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.5268109125117592, |
|
"grad_norm": 0.22222432494163513, |
|
"learning_rate": 0.00011917106319237386, |
|
"loss": 0.3626, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.541862652869238, |
|
"grad_norm": 0.2172698825597763, |
|
"learning_rate": 0.00011373535578184082, |
|
"loss": 0.3121, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.5569143932267169, |
|
"grad_norm": 0.19818319380283356, |
|
"learning_rate": 0.00010825793454723325, |
|
"loss": 0.345, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.5719661335841957, |
|
"grad_norm": 0.23813501000404358, |
|
"learning_rate": 0.00010275543423681621, |
|
"loss": 0.3227, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.5870178739416745, |
|
"grad_norm": 0.22211886942386627, |
|
"learning_rate": 9.724456576318381e-05, |
|
"loss": 0.3428, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.6020696142991533, |
|
"grad_norm": 0.20863519608974457, |
|
"learning_rate": 9.174206545276677e-05, |
|
"loss": 0.3357, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6171213546566322, |
|
"grad_norm": 0.24819327890872955, |
|
"learning_rate": 8.626464421815919e-05, |
|
"loss": 0.3799, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.632173095014111, |
|
"grad_norm": 0.2553349435329437, |
|
"learning_rate": 8.082893680762619e-05, |
|
"loss": 0.3727, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.6472248353715898, |
|
"grad_norm": 0.21132054924964905, |
|
"learning_rate": 7.54514512859201e-05, |
|
"loss": 0.3465, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.6622765757290687, |
|
"grad_norm": 0.24369095265865326, |
|
"learning_rate": 7.014851889983057e-05, |
|
"loss": 0.3558, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.6773283160865475, |
|
"grad_norm": 0.2254449874162674, |
|
"learning_rate": 6.493624448072457e-05, |
|
"loss": 0.3473, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.6923800564440263, |
|
"grad_norm": 0.2430431842803955, |
|
"learning_rate": 5.983045753470308e-05, |
|
"loss": 0.3568, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.7074317968015051, |
|
"grad_norm": 0.22099405527114868, |
|
"learning_rate": 5.484666416891109e-05, |
|
"loss": 0.3572, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.7224835371589841, |
|
"grad_norm": 0.22853314876556396, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 0.3486, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.7375352775164629, |
|
"grad_norm": 0.23912452161312103, |
|
"learning_rate": 4.530518418775733e-05, |
|
"loss": 0.3443, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.7525870178739417, |
|
"grad_norm": 0.2265036553144455, |
|
"learning_rate": 4.077647473350201e-05, |
|
"loss": 0.3062, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.7525870178739417, |
|
"eval_loss": 0.3498118817806244, |
|
"eval_runtime": 10.5271, |
|
"eval_samples_per_second": 21.278, |
|
"eval_steps_per_second": 5.32, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.7676387582314205, |
|
"grad_norm": 0.23014026880264282, |
|
"learning_rate": 3.642762517900322e-05, |
|
"loss": 0.3562, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.7826904985888994, |
|
"grad_norm": 0.2177976369857788, |
|
"learning_rate": 3.227184283742591e-05, |
|
"loss": 0.3473, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.7977422389463782, |
|
"grad_norm": 0.22603270411491394, |
|
"learning_rate": 2.8321748683154893e-05, |
|
"loss": 0.3505, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.812793979303857, |
|
"grad_norm": 0.22148269414901733, |
|
"learning_rate": 2.4589339022310386e-05, |
|
"loss": 0.332, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.8278457196613358, |
|
"grad_norm": 0.2047235518693924, |
|
"learning_rate": 2.1085949060360654e-05, |
|
"loss": 0.3512, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.8428974600188147, |
|
"grad_norm": 0.21254387497901917, |
|
"learning_rate": 1.7822218477475494e-05, |
|
"loss": 0.3424, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.8579492003762935, |
|
"grad_norm": 0.21842923760414124, |
|
"learning_rate": 1.4808059116167305e-05, |
|
"loss": 0.3436, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.8730009407337723, |
|
"grad_norm": 0.22964705526828766, |
|
"learning_rate": 1.2052624879351104e-05, |
|
"loss": 0.3484, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.8880526810912511, |
|
"grad_norm": 0.2156173586845398, |
|
"learning_rate": 9.564283930242257e-06, |
|
"loss": 0.3396, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.90310442144873, |
|
"grad_norm": 0.21700626611709595, |
|
"learning_rate": 7.350593278519824e-06, |
|
"loss": 0.3249, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.9181561618062088, |
|
"grad_norm": 0.2113395631313324, |
|
"learning_rate": 5.418275829936537e-06, |
|
"loss": 0.3659, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.9332079021636877, |
|
"grad_norm": 0.2139684408903122, |
|
"learning_rate": 3.7731999690749585e-06, |
|
"loss": 0.3683, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.9482596425211665, |
|
"grad_norm": 0.22140838205814362, |
|
"learning_rate": 2.420361737256438e-06, |
|
"loss": 0.3689, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.9633113828786454, |
|
"grad_norm": 0.21749098598957062, |
|
"learning_rate": 1.3638696597277679e-06, |
|
"loss": 0.3842, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.9783631232361242, |
|
"grad_norm": 0.2486245036125183, |
|
"learning_rate": 6.069322682050516e-07, |
|
"loss": 0.3889, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.993414863593603, |
|
"grad_norm": 0.22336921095848083, |
|
"learning_rate": 1.518483566683826e-07, |
|
"loss": 0.3572, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.008466603951082, |
|
"grad_norm": 0.3670308291912079, |
|
"learning_rate": 0.0, |
|
"loss": 0.4966, |
|
"step": 67 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 67, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.389133837801882e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|