{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0033791279119378785, "eval_steps": 50, "global_step": 99, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.4132605171089684e-05, "grad_norm": 0.20423279702663422, "learning_rate": 5e-05, "loss": 3.2544, "step": 1 }, { "epoch": 3.4132605171089684e-05, "eval_loss": 3.076941728591919, "eval_runtime": 203.9843, "eval_samples_per_second": 60.475, "eval_steps_per_second": 30.238, "step": 1 }, { "epoch": 6.826521034217937e-05, "grad_norm": 0.19064632058143616, "learning_rate": 0.0001, "loss": 2.8993, "step": 2 }, { "epoch": 0.00010239781551326905, "grad_norm": 0.19087578356266022, "learning_rate": 0.00015, "loss": 2.8242, "step": 3 }, { "epoch": 0.00013653042068435874, "grad_norm": 0.18373772501945496, "learning_rate": 0.0002, "loss": 2.6837, "step": 4 }, { "epoch": 0.0001706630258554484, "grad_norm": 0.26902732253074646, "learning_rate": 0.00025, "loss": 3.3408, "step": 5 }, { "epoch": 0.0002047956310265381, "grad_norm": 0.24464839696884155, "learning_rate": 0.0003, "loss": 3.0973, "step": 6 }, { "epoch": 0.0002389282361976278, "grad_norm": 0.25614237785339355, "learning_rate": 0.00035, "loss": 2.9122, "step": 7 }, { "epoch": 0.00027306084136871747, "grad_norm": 0.29932963848114014, "learning_rate": 0.0004, "loss": 3.0787, "step": 8 }, { "epoch": 0.0003071934465398072, "grad_norm": 0.2929045855998993, "learning_rate": 0.00045000000000000004, "loss": 2.8395, "step": 9 }, { "epoch": 0.0003413260517108968, "grad_norm": 0.31398922204971313, "learning_rate": 0.0005, "loss": 2.9357, "step": 10 }, { "epoch": 0.00037545865688198653, "grad_norm": 0.3056373596191406, "learning_rate": 0.0004998442655654946, "loss": 2.6586, "step": 11 }, { "epoch": 0.0004095912620530762, "grad_norm": 0.261355996131897, "learning_rate": 0.0004993772562876909, "loss": 2.7995, "step": 12 }, { "epoch": 0.0004437238672241659, "grad_norm": 0.34815752506256104, "learning_rate": 0.0004985995540019955, "loss": 3.5264, "step": 13 }, { "epoch": 0.0004778564723952556, "grad_norm": 0.4225376844406128, "learning_rate": 0.0004975121276286136, "loss": 2.9188, "step": 14 }, { "epoch": 0.0005119890775663453, "grad_norm": 0.40409791469573975, "learning_rate": 0.0004961163319653958, "loss": 2.6648, "step": 15 }, { "epoch": 0.0005461216827374349, "grad_norm": 0.4268514811992645, "learning_rate": 0.0004944139059999286, "loss": 2.3858, "step": 16 }, { "epoch": 0.0005802542879085246, "grad_norm": 0.3788816034793854, "learning_rate": 0.000492406970742972, "loss": 2.5092, "step": 17 }, { "epoch": 0.0006143868930796144, "grad_norm": 0.5883328914642334, "learning_rate": 0.0004900980265859448, "loss": 3.279, "step": 18 }, { "epoch": 0.000648519498250704, "grad_norm": 0.3739059567451477, "learning_rate": 0.0004874899501857477, "loss": 2.6785, "step": 19 }, { "epoch": 0.0006826521034217937, "grad_norm": 0.32422783970832825, "learning_rate": 0.00048458599088080736, "loss": 2.4865, "step": 20 }, { "epoch": 0.0007167847085928833, "grad_norm": 0.28062283992767334, "learning_rate": 0.0004813897666428053, "loss": 3.0097, "step": 21 }, { "epoch": 0.0007509173137639731, "grad_norm": 0.3173668384552002, "learning_rate": 0.00047790525956913543, "loss": 2.8305, "step": 22 }, { "epoch": 0.0007850499189350627, "grad_norm": 0.34149301052093506, "learning_rate": 0.0004741368109217071, "loss": 2.3109, "step": 23 }, { "epoch": 0.0008191825241061524, "grad_norm": 0.3601361811161041, "learning_rate": 0.00047008911571827283, "loss": 2.6649, "step": 24 }, { "epoch": 0.0008533151292772421, "grad_norm": 0.34317097067832947, "learning_rate": 0.00046576721688302105, "loss": 2.5954, "step": 25 }, { "epoch": 0.0008874477344483318, "grad_norm": 0.32293665409088135, "learning_rate": 0.0004611764989637205, "loss": 2.5435, "step": 26 }, { "epoch": 0.0009215803396194214, "grad_norm": 0.35975202918052673, "learning_rate": 0.0004563226814232444, "loss": 2.9584, "step": 27 }, { "epoch": 0.0009557129447905112, "grad_norm": 0.2800445258617401, "learning_rate": 0.0004512118115138315, "loss": 2.4723, "step": 28 }, { "epoch": 0.0009898455499616008, "grad_norm": 0.3043380677700043, "learning_rate": 0.0004458502567429631, "loss": 2.6894, "step": 29 }, { "epoch": 0.0010239781551326906, "grad_norm": 0.31911131739616394, "learning_rate": 0.00044024469694024196, "loss": 2.6563, "step": 30 }, { "epoch": 0.0010581107603037801, "grad_norm": 0.309402734041214, "learning_rate": 0.00043440211593515554, "loss": 2.4612, "step": 31 }, { "epoch": 0.0010922433654748699, "grad_norm": 0.30311980843544006, "learning_rate": 0.0004283297928560951, "loss": 2.7588, "step": 32 }, { "epoch": 0.0011263759706459596, "grad_norm": 0.3477875292301178, "learning_rate": 0.0004220352930614672, "loss": 2.4841, "step": 33 }, { "epoch": 0.0011605085758170492, "grad_norm": 0.37769827246665955, "learning_rate": 0.00041552645871420013, "loss": 2.5664, "step": 34 }, { "epoch": 0.001194641180988139, "grad_norm": 0.32566434144973755, "learning_rate": 0.00040881139901138467, "loss": 2.4234, "step": 35 }, { "epoch": 0.0012287737861592287, "grad_norm": 0.38163110613822937, "learning_rate": 0.00040189848008122475, "loss": 2.5688, "step": 36 }, { "epoch": 0.0012629063913303182, "grad_norm": 0.3524303734302521, "learning_rate": 0.00039479631455988334, "loss": 2.752, "step": 37 }, { "epoch": 0.001297038996501408, "grad_norm": 0.3833262622356415, "learning_rate": 0.0003875137508612103, "loss": 2.635, "step": 38 }, { "epoch": 0.0013311716016724975, "grad_norm": 0.30518823862075806, "learning_rate": 0.00038005986215272055, "loss": 2.5431, "step": 39 }, { "epoch": 0.0013653042068435873, "grad_norm": 0.3856472671031952, "learning_rate": 0.0003724439350515571, "loss": 2.7964, "step": 40 }, { "epoch": 0.001399436812014677, "grad_norm": 0.339182585477829, "learning_rate": 0.0003646754580545226, "loss": 2.4652, "step": 41 }, { "epoch": 0.0014335694171857666, "grad_norm": 0.3530672490596771, "learning_rate": 0.000356764109716594, "loss": 2.8105, "step": 42 }, { "epoch": 0.0014677020223568564, "grad_norm": 0.35563480854034424, "learning_rate": 0.00034871974659264783, "loss": 3.1352, "step": 43 }, { "epoch": 0.0015018346275279461, "grad_norm": 0.3361399471759796, "learning_rate": 0.0003405523909574206, "loss": 2.6759, "step": 44 }, { "epoch": 0.0015359672326990357, "grad_norm": 0.3976024091243744, "learning_rate": 0.0003322722183190025, "loss": 2.9357, "step": 45 }, { "epoch": 0.0015700998378701254, "grad_norm": 0.4400416314601898, "learning_rate": 0.0003238895447414211, "loss": 3.0256, "step": 46 }, { "epoch": 0.0016042324430412152, "grad_norm": 0.2710314989089966, "learning_rate": 0.0003154148139921102, "loss": 2.3832, "step": 47 }, { "epoch": 0.0016383650482123047, "grad_norm": 0.34264132380485535, "learning_rate": 0.00030685858453027663, "loss": 2.8334, "step": 48 }, { "epoch": 0.0016724976533833945, "grad_norm": 0.3671528398990631, "learning_rate": 0.0002982315163523742, "loss": 2.7482, "step": 49 }, { "epoch": 0.0017066302585544842, "grad_norm": 0.2519057095050812, "learning_rate": 0.000289544357711076, "loss": 2.7811, "step": 50 }, { "epoch": 0.0017066302585544842, "eval_loss": 2.6318328380584717, "eval_runtime": 203.8827, "eval_samples_per_second": 60.505, "eval_steps_per_second": 30.253, "step": 50 }, { "epoch": 0.0017407628637255738, "grad_norm": 0.2467103898525238, "learning_rate": 0.0002808079317242896, "loss": 2.2711, "step": 51 }, { "epoch": 0.0017748954688966635, "grad_norm": 0.2773958742618561, "learning_rate": 0.0002720331228909005, "loss": 2.4991, "step": 52 }, { "epoch": 0.0018090280740677533, "grad_norm": 0.22879059612751007, "learning_rate": 0.00026323086353004075, "loss": 2.5932, "step": 53 }, { "epoch": 0.0018431606792388428, "grad_norm": 0.2841690480709076, "learning_rate": 0.0002544121201607822, "loss": 2.7066, "step": 54 }, { "epoch": 0.0018772932844099326, "grad_norm": 0.3463197648525238, "learning_rate": 0.00024558787983921783, "loss": 2.4738, "step": 55 }, { "epoch": 0.0019114258895810224, "grad_norm": 0.30249327421188354, "learning_rate": 0.0002367691364699592, "loss": 2.5275, "step": 56 }, { "epoch": 0.001945558494752112, "grad_norm": 0.3161616921424866, "learning_rate": 0.00022796687710909964, "loss": 2.45, "step": 57 }, { "epoch": 0.0019796910999232017, "grad_norm": 0.3322688937187195, "learning_rate": 0.00021919206827571036, "loss": 2.6401, "step": 58 }, { "epoch": 0.002013823705094291, "grad_norm": 0.32816797494888306, "learning_rate": 0.00021045564228892402, "loss": 2.3709, "step": 59 }, { "epoch": 0.002047956310265381, "grad_norm": 0.3372400999069214, "learning_rate": 0.00020176848364762578, "loss": 2.5829, "step": 60 }, { "epoch": 0.0020820889154364707, "grad_norm": 0.39470982551574707, "learning_rate": 0.00019314141546972343, "loss": 2.631, "step": 61 }, { "epoch": 0.0021162215206075603, "grad_norm": 0.4079892635345459, "learning_rate": 0.00018458518600788986, "loss": 2.5531, "step": 62 }, { "epoch": 0.0021503541257786502, "grad_norm": 0.3176794946193695, "learning_rate": 0.00017611045525857898, "loss": 2.4695, "step": 63 }, { "epoch": 0.0021844867309497398, "grad_norm": 0.29289788007736206, "learning_rate": 0.0001677277816809975, "loss": 2.7621, "step": 64 }, { "epoch": 0.0022186193361208293, "grad_norm": 0.3670552670955658, "learning_rate": 0.00015944760904257942, "loss": 2.4779, "step": 65 }, { "epoch": 0.0022527519412919193, "grad_norm": 0.31173592805862427, "learning_rate": 0.0001512802534073522, "loss": 2.5398, "step": 66 }, { "epoch": 0.002286884546463009, "grad_norm": 0.31395551562309265, "learning_rate": 0.00014323589028340596, "loss": 2.3511, "step": 67 }, { "epoch": 0.0023210171516340984, "grad_norm": 0.30225545167922974, "learning_rate": 0.00013532454194547733, "loss": 2.2839, "step": 68 }, { "epoch": 0.0023551497568051884, "grad_norm": 0.3490327298641205, "learning_rate": 0.00012755606494844294, "loss": 2.1876, "step": 69 }, { "epoch": 0.002389282361976278, "grad_norm": 0.265563040971756, "learning_rate": 0.00011994013784727947, "loss": 2.4273, "step": 70 }, { "epoch": 0.0024234149671473674, "grad_norm": 0.4211519956588745, "learning_rate": 0.00011248624913878966, "loss": 2.4147, "step": 71 }, { "epoch": 0.0024575475723184574, "grad_norm": 0.3521697223186493, "learning_rate": 0.0001052036854401166, "loss": 2.5654, "step": 72 }, { "epoch": 0.002491680177489547, "grad_norm": 0.28024736046791077, "learning_rate": 9.810151991877531e-05, "loss": 2.7851, "step": 73 }, { "epoch": 0.0025258127826606365, "grad_norm": 0.3537457287311554, "learning_rate": 9.118860098861537e-05, "loss": 2.7206, "step": 74 }, { "epoch": 0.0025599453878317265, "grad_norm": 0.38442355394363403, "learning_rate": 8.44735412857999e-05, "loss": 2.9931, "step": 75 }, { "epoch": 0.002594077993002816, "grad_norm": 0.2771991193294525, "learning_rate": 7.79647069385328e-05, "loss": 2.366, "step": 76 }, { "epoch": 0.0026282105981739056, "grad_norm": 0.30113691091537476, "learning_rate": 7.167020714390501e-05, "loss": 2.7356, "step": 77 }, { "epoch": 0.002662343203344995, "grad_norm": 0.3958965241909027, "learning_rate": 6.559788406484446e-05, "loss": 2.3764, "step": 78 }, { "epoch": 0.002696475808516085, "grad_norm": 0.2729441523551941, "learning_rate": 5.975530305975807e-05, "loss": 2.4155, "step": 79 }, { "epoch": 0.0027306084136871746, "grad_norm": 0.4083710312843323, "learning_rate": 5.414974325703686e-05, "loss": 2.7141, "step": 80 }, { "epoch": 0.002764741018858264, "grad_norm": 0.2846662700176239, "learning_rate": 4.8788188486168616e-05, "loss": 2.2121, "step": 81 }, { "epoch": 0.002798873624029354, "grad_norm": 0.28273066878318787, "learning_rate": 4.367731857675569e-05, "loss": 2.5148, "step": 82 }, { "epoch": 0.0028330062292004437, "grad_norm": 0.23538333177566528, "learning_rate": 3.882350103627952e-05, "loss": 2.4308, "step": 83 }, { "epoch": 0.002867138834371533, "grad_norm": 0.3415232002735138, "learning_rate": 3.423278311697897e-05, "loss": 2.8553, "step": 84 }, { "epoch": 0.002901271439542623, "grad_norm": 0.2868449091911316, "learning_rate": 2.9910884281727225e-05, "loss": 2.7417, "step": 85 }, { "epoch": 0.0029354040447137127, "grad_norm": 0.32795730233192444, "learning_rate": 2.586318907829291e-05, "loss": 2.5102, "step": 86 }, { "epoch": 0.0029695366498848023, "grad_norm": 0.2663126289844513, "learning_rate": 2.209474043086457e-05, "loss": 2.5792, "step": 87 }, { "epoch": 0.0030036692550558922, "grad_norm": 0.4900624454021454, "learning_rate": 1.861023335719475e-05, "loss": 3.2786, "step": 88 }, { "epoch": 0.003037801860226982, "grad_norm": 0.3926824927330017, "learning_rate": 1.5414009119192633e-05, "loss": 2.5358, "step": 89 }, { "epoch": 0.0030719344653980713, "grad_norm": 0.32311323285102844, "learning_rate": 1.25100498142523e-05, "loss": 2.7474, "step": 90 }, { "epoch": 0.0031060670705691613, "grad_norm": 0.3624803423881531, "learning_rate": 9.901973414055187e-06, "loss": 2.6785, "step": 91 }, { "epoch": 0.003140199675740251, "grad_norm": 0.31824859976768494, "learning_rate": 7.593029257027956e-06, "loss": 2.175, "step": 92 }, { "epoch": 0.0031743322809113404, "grad_norm": 0.4055838882923126, "learning_rate": 5.5860940000714015e-06, "loss": 2.7394, "step": 93 }, { "epoch": 0.0032084648860824304, "grad_norm": 0.3027328848838806, "learning_rate": 3.8836680346041594e-06, "loss": 2.8705, "step": 94 }, { "epoch": 0.00324259749125352, "grad_norm": 0.2983163595199585, "learning_rate": 2.487872371386424e-06, "loss": 2.5091, "step": 95 }, { "epoch": 0.0032767300964246095, "grad_norm": 0.38754749298095703, "learning_rate": 1.4004459980045125e-06, "loss": 2.9323, "step": 96 }, { "epoch": 0.0033108627015956994, "grad_norm": 0.33583030104637146, "learning_rate": 6.22743712309054e-07, "loss": 2.6292, "step": 97 }, { "epoch": 0.003344995306766789, "grad_norm": 0.37954822182655334, "learning_rate": 1.557344345054501e-07, "loss": 2.7452, "step": 98 }, { "epoch": 0.0033791279119378785, "grad_norm": 0.2241569459438324, "learning_rate": 0.0, "loss": 2.5134, "step": 99 } ], "logging_steps": 1, "max_steps": 99, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4169798900514816.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }