{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.008466603951082, "eval_steps": 50, "global_step": 67, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015051740357478834, "grad_norm": 0.9344162940979004, "learning_rate": 2e-05, "loss": 0.6705, "step": 1 }, { "epoch": 0.015051740357478834, "eval_loss": 0.6895620226860046, "eval_runtime": 10.3955, "eval_samples_per_second": 21.548, "eval_steps_per_second": 5.387, "step": 1 }, { "epoch": 0.030103480714957668, "grad_norm": 0.9302381873130798, "learning_rate": 4e-05, "loss": 0.6946, "step": 2 }, { "epoch": 0.045155221072436504, "grad_norm": 0.9359163045883179, "learning_rate": 6e-05, "loss": 0.7409, "step": 3 }, { "epoch": 0.060206961429915336, "grad_norm": 0.9642816781997681, "learning_rate": 8e-05, "loss": 0.6928, "step": 4 }, { "epoch": 0.07525870178739416, "grad_norm": 0.8229894042015076, "learning_rate": 0.0001, "loss": 0.577, "step": 5 }, { "epoch": 0.09031044214487301, "grad_norm": 0.4650634527206421, "learning_rate": 0.00012, "loss": 0.5228, "step": 6 }, { "epoch": 0.10536218250235184, "grad_norm": 0.4150318205356598, "learning_rate": 0.00014, "loss": 0.4753, "step": 7 }, { "epoch": 0.12041392285983067, "grad_norm": 0.5093539357185364, "learning_rate": 0.00016, "loss": 0.505, "step": 8 }, { "epoch": 0.1354656632173095, "grad_norm": 0.4901506006717682, "learning_rate": 0.00018, "loss": 0.4358, "step": 9 }, { "epoch": 0.15051740357478832, "grad_norm": 0.3130745589733124, "learning_rate": 0.0002, "loss": 0.4193, "step": 10 }, { "epoch": 0.16556914393226718, "grad_norm": 0.4450761675834656, "learning_rate": 0.00019984815164333163, "loss": 0.3821, "step": 11 }, { "epoch": 0.18062088428974601, "grad_norm": 0.44579023122787476, "learning_rate": 0.00019939306773179497, "loss": 0.3957, "step": 12 }, { "epoch": 0.19567262464722485, "grad_norm": 0.3342147469520569, "learning_rate": 0.00019863613034027224, "loss": 0.3776, "step": 13 }, { "epoch": 0.21072436500470368, "grad_norm": 0.3169708549976349, "learning_rate": 0.00019757963826274357, "loss": 0.4155, "step": 14 }, { "epoch": 0.2257761053621825, "grad_norm": 0.3119909167289734, "learning_rate": 0.00019622680003092503, "loss": 0.404, "step": 15 }, { "epoch": 0.24082784571966134, "grad_norm": 0.2754630148410797, "learning_rate": 0.00019458172417006347, "loss": 0.3648, "step": 16 }, { "epoch": 0.2558795860771402, "grad_norm": 0.3073170781135559, "learning_rate": 0.00019264940672148018, "loss": 0.3565, "step": 17 }, { "epoch": 0.270931326434619, "grad_norm": 0.2857714295387268, "learning_rate": 0.00019043571606975777, "loss": 0.3717, "step": 18 }, { "epoch": 0.28598306679209784, "grad_norm": 0.25880104303359985, "learning_rate": 0.0001879473751206489, "loss": 0.3576, "step": 19 }, { "epoch": 0.30103480714957664, "grad_norm": 0.2680279314517975, "learning_rate": 0.00018519194088383273, "loss": 0.3916, "step": 20 }, { "epoch": 0.3160865475070555, "grad_norm": 0.3286673426628113, "learning_rate": 0.0001821777815225245, "loss": 0.4095, "step": 21 }, { "epoch": 0.33113828786453436, "grad_norm": 0.2846011817455292, "learning_rate": 0.00017891405093963938, "loss": 0.3653, "step": 22 }, { "epoch": 0.34619002822201317, "grad_norm": 0.2800520360469818, "learning_rate": 0.00017541066097768963, "loss": 0.3917, "step": 23 }, { "epoch": 0.36124176857949203, "grad_norm": 0.26333191990852356, "learning_rate": 0.00017167825131684513, "loss": 0.3782, "step": 24 }, { "epoch": 0.37629350893697083, "grad_norm": 0.2338825762271881, "learning_rate": 0.00016772815716257412, "loss": 0.3709, "step": 25 }, { "epoch": 0.3913452492944497, "grad_norm": 0.23630400002002716, "learning_rate": 0.00016357237482099684, "loss": 0.3753, "step": 26 }, { "epoch": 0.4063969896519285, "grad_norm": 0.22607026994228363, "learning_rate": 0.00015922352526649803, "loss": 0.356, "step": 27 }, { "epoch": 0.42144873000940736, "grad_norm": 0.23991024494171143, "learning_rate": 0.00015469481581224272, "loss": 0.3531, "step": 28 }, { "epoch": 0.43650047036688616, "grad_norm": 0.2309068888425827, "learning_rate": 0.00015000000000000001, "loss": 0.377, "step": 29 }, { "epoch": 0.451552210724365, "grad_norm": 0.27483415603637695, "learning_rate": 0.00014515333583108896, "loss": 0.3769, "step": 30 }, { "epoch": 0.4666039510818438, "grad_norm": 0.253368616104126, "learning_rate": 0.00014016954246529696, "loss": 0.3579, "step": 31 }, { "epoch": 0.4816556914393227, "grad_norm": 0.2460198998451233, "learning_rate": 0.00013506375551927547, "loss": 0.3424, "step": 32 }, { "epoch": 0.4967074317968015, "grad_norm": 0.24006766080856323, "learning_rate": 0.00012985148110016947, "loss": 0.4105, "step": 33 }, { "epoch": 0.5117591721542804, "grad_norm": 0.242473304271698, "learning_rate": 0.00012454854871407994, "loss": 0.3545, "step": 34 }, { "epoch": 0.5268109125117592, "grad_norm": 0.22222432494163513, "learning_rate": 0.00011917106319237386, "loss": 0.3626, "step": 35 }, { "epoch": 0.541862652869238, "grad_norm": 0.2172698825597763, "learning_rate": 0.00011373535578184082, "loss": 0.3121, "step": 36 }, { "epoch": 0.5569143932267169, "grad_norm": 0.19818319380283356, "learning_rate": 0.00010825793454723325, "loss": 0.345, "step": 37 }, { "epoch": 0.5719661335841957, "grad_norm": 0.23813501000404358, "learning_rate": 0.00010275543423681621, "loss": 0.3227, "step": 38 }, { "epoch": 0.5870178739416745, "grad_norm": 0.22211886942386627, "learning_rate": 9.724456576318381e-05, "loss": 0.3428, "step": 39 }, { "epoch": 0.6020696142991533, "grad_norm": 0.20863519608974457, "learning_rate": 9.174206545276677e-05, "loss": 0.3357, "step": 40 }, { "epoch": 0.6171213546566322, "grad_norm": 0.24819327890872955, "learning_rate": 8.626464421815919e-05, "loss": 0.3799, "step": 41 }, { "epoch": 0.632173095014111, "grad_norm": 0.2553349435329437, "learning_rate": 8.082893680762619e-05, "loss": 0.3727, "step": 42 }, { "epoch": 0.6472248353715898, "grad_norm": 0.21132054924964905, "learning_rate": 7.54514512859201e-05, "loss": 0.3465, "step": 43 }, { "epoch": 0.6622765757290687, "grad_norm": 0.24369095265865326, "learning_rate": 7.014851889983057e-05, "loss": 0.3558, "step": 44 }, { "epoch": 0.6773283160865475, "grad_norm": 0.2254449874162674, "learning_rate": 6.493624448072457e-05, "loss": 0.3473, "step": 45 }, { "epoch": 0.6923800564440263, "grad_norm": 0.2430431842803955, "learning_rate": 5.983045753470308e-05, "loss": 0.3568, "step": 46 }, { "epoch": 0.7074317968015051, "grad_norm": 0.22099405527114868, "learning_rate": 5.484666416891109e-05, "loss": 0.3572, "step": 47 }, { "epoch": 0.7224835371589841, "grad_norm": 0.22853314876556396, "learning_rate": 5.000000000000002e-05, "loss": 0.3486, "step": 48 }, { "epoch": 0.7375352775164629, "grad_norm": 0.23912452161312103, "learning_rate": 4.530518418775733e-05, "loss": 0.3443, "step": 49 }, { "epoch": 0.7525870178739417, "grad_norm": 0.2265036553144455, "learning_rate": 4.077647473350201e-05, "loss": 0.3062, "step": 50 }, { "epoch": 0.7525870178739417, "eval_loss": 0.3498118817806244, "eval_runtime": 10.5271, "eval_samples_per_second": 21.278, "eval_steps_per_second": 5.32, "step": 50 }, { "epoch": 0.7676387582314205, "grad_norm": 0.23014026880264282, "learning_rate": 3.642762517900322e-05, "loss": 0.3562, "step": 51 }, { "epoch": 0.7826904985888994, "grad_norm": 0.2177976369857788, "learning_rate": 3.227184283742591e-05, "loss": 0.3473, "step": 52 }, { "epoch": 0.7977422389463782, "grad_norm": 0.22603270411491394, "learning_rate": 2.8321748683154893e-05, "loss": 0.3505, "step": 53 }, { "epoch": 0.812793979303857, "grad_norm": 0.22148269414901733, "learning_rate": 2.4589339022310386e-05, "loss": 0.332, "step": 54 }, { "epoch": 0.8278457196613358, "grad_norm": 0.2047235518693924, "learning_rate": 2.1085949060360654e-05, "loss": 0.3512, "step": 55 }, { "epoch": 0.8428974600188147, "grad_norm": 0.21254387497901917, "learning_rate": 1.7822218477475494e-05, "loss": 0.3424, "step": 56 }, { "epoch": 0.8579492003762935, "grad_norm": 0.21842923760414124, "learning_rate": 1.4808059116167305e-05, "loss": 0.3436, "step": 57 }, { "epoch": 0.8730009407337723, "grad_norm": 0.22964705526828766, "learning_rate": 1.2052624879351104e-05, "loss": 0.3484, "step": 58 }, { "epoch": 0.8880526810912511, "grad_norm": 0.2156173586845398, "learning_rate": 9.564283930242257e-06, "loss": 0.3396, "step": 59 }, { "epoch": 0.90310442144873, "grad_norm": 0.21700626611709595, "learning_rate": 7.350593278519824e-06, "loss": 0.3249, "step": 60 }, { "epoch": 0.9181561618062088, "grad_norm": 0.2113395631313324, "learning_rate": 5.418275829936537e-06, "loss": 0.3659, "step": 61 }, { "epoch": 0.9332079021636877, "grad_norm": 0.2139684408903122, "learning_rate": 3.7731999690749585e-06, "loss": 0.3683, "step": 62 }, { "epoch": 0.9482596425211665, "grad_norm": 0.22140838205814362, "learning_rate": 2.420361737256438e-06, "loss": 0.3689, "step": 63 }, { "epoch": 0.9633113828786454, "grad_norm": 0.21749098598957062, "learning_rate": 1.3638696597277679e-06, "loss": 0.3842, "step": 64 }, { "epoch": 0.9783631232361242, "grad_norm": 0.2486245036125183, "learning_rate": 6.069322682050516e-07, "loss": 0.3889, "step": 65 }, { "epoch": 0.993414863593603, "grad_norm": 0.22336921095848083, "learning_rate": 1.518483566683826e-07, "loss": 0.3572, "step": 66 }, { "epoch": 1.008466603951082, "grad_norm": 0.3670308291912079, "learning_rate": 0.0, "loss": 0.4966, "step": 67 } ], "logging_steps": 1, "max_steps": 67, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.389133837801882e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }