{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 5000, "global_step": 87900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11376564277588168, "grad_norm": 4.130064964294434, "learning_rate": 7.960000000000001e-05, "loss": 5.5183, "step": 1000 }, { "epoch": 0.22753128555176336, "grad_norm": 3.2718617916107178, "learning_rate": 0.0001596, "loss": 2.7046, "step": 2000 }, { "epoch": 0.3412969283276451, "grad_norm": 3.3528096675872803, "learning_rate": 0.0002396, "loss": 1.8501, "step": 3000 }, { "epoch": 0.4550625711035267, "grad_norm": 3.1067423820495605, "learning_rate": 0.0003196, "loss": 1.6051, "step": 4000 }, { "epoch": 0.5688282138794084, "grad_norm": 2.107377052307129, "learning_rate": 0.0003996, "loss": 1.4934, "step": 5000 }, { "epoch": 0.5688282138794084, "eval_accuracy": 0.644404, "eval_loss": 1.4417771100997925, "eval_runtime": 11.7201, "eval_samples_per_second": 21330.792, "eval_steps_per_second": 41.723, "step": 5000 }, { "epoch": 0.6825938566552902, "grad_norm": 2.3725779056549072, "learning_rate": 0.00047960000000000006, "loss": 1.4263, "step": 6000 }, { "epoch": 0.7963594994311718, "grad_norm": 1.8981488943099976, "learning_rate": 0.0005596, "loss": 1.3761, "step": 7000 }, { "epoch": 0.9101251422070534, "grad_norm": 1.904260516166687, "learning_rate": 0.0006396, "loss": 1.339, "step": 8000 }, { "epoch": 1.023890784982935, "grad_norm": 1.4444975852966309, "learning_rate": 0.00071952, "loss": 1.3071, "step": 9000 }, { "epoch": 1.1376564277588168, "grad_norm": 1.3250641822814941, "learning_rate": 0.00079952, "loss": 1.2717, "step": 10000 }, { "epoch": 1.1376564277588168, "eval_accuracy": 0.677084, "eval_loss": 1.288106918334961, "eval_runtime": 11.2915, "eval_samples_per_second": 22140.537, "eval_steps_per_second": 43.307, "step": 10000 }, { "epoch": 1.2514220705346986, "grad_norm": 1.4477468729019165, "learning_rate": 0.0007996786565611985, "loss": 1.2557, "step": 11000 }, { "epoch": 1.36518771331058, "grad_norm": 1.1922030448913574, "learning_rate": 0.0007987086748436788, "loss": 1.2356, "step": 12000 }, { "epoch": 1.4789533560864618, "grad_norm": 1.2596089839935303, "learning_rate": 0.0007970896788508052, "loss": 1.2048, "step": 13000 }, { "epoch": 1.5927189988623436, "grad_norm": 1.0349920988082886, "learning_rate": 0.0007948275336376884, "loss": 1.1905, "step": 14000 }, { "epoch": 1.7064846416382253, "grad_norm": 1.2018927335739136, "learning_rate": 0.0007919213896323948, "loss": 1.1742, "step": 15000 }, { "epoch": 1.7064846416382253, "eval_accuracy": 0.705248, "eval_loss": 1.166063904762268, "eval_runtime": 10.2238, "eval_samples_per_second": 24452.808, "eval_steps_per_second": 47.83, "step": 15000 }, { "epoch": 1.820250284414107, "grad_norm": 1.0107839107513428, "learning_rate": 0.0007883779147866073, "loss": 1.1634, "step": 16000 }, { "epoch": 1.9340159271899886, "grad_norm": 1.0561189651489258, "learning_rate": 0.0007842028713931261, "loss": 1.1485, "step": 17000 }, { "epoch": 2.04778156996587, "grad_norm": 1.2642914056777954, "learning_rate": 0.0007794030487826318, "loss": 1.1177, "step": 18000 }, { "epoch": 2.161547212741752, "grad_norm": 1.041416049003601, "learning_rate": 0.0007739862522830791, "loss": 1.0864, "step": 19000 }, { "epoch": 2.2753128555176336, "grad_norm": 1.0880264043807983, "learning_rate": 0.0007679612905269062, "loss": 1.0846, "step": 20000 }, { "epoch": 2.2753128555176336, "eval_accuracy": 0.717844, "eval_loss": 1.1148858070373535, "eval_runtime": 10.0405, "eval_samples_per_second": 24899.239, "eval_steps_per_second": 48.703, "step": 20000 }, { "epoch": 2.3890784982935154, "grad_norm": 1.2247668504714966, "learning_rate": 0.0007613448798360993, "loss": 1.0832, "step": 21000 }, { "epoch": 2.502844141069397, "grad_norm": 0.939128577709198, "learning_rate": 0.0007541345353494786, "loss": 1.0718, "step": 22000 }, { "epoch": 2.616609783845279, "grad_norm": 0.8472806811332703, "learning_rate": 0.0007463563776182788, "loss": 1.0741, "step": 23000 }, { "epoch": 2.73037542662116, "grad_norm": 0.9360683560371399, "learning_rate": 0.000738007485475254, "loss": 1.066, "step": 24000 }, { "epoch": 2.8441410693970424, "grad_norm": 0.8565033674240112, "learning_rate": 0.0007291456059015493, "loss": 1.0619, "step": 25000 }, { "epoch": 2.8441410693970424, "eval_accuracy": 0.726136, "eval_loss": 1.077797293663025, "eval_runtime": 10.346, "eval_samples_per_second": 24163.966, "eval_steps_per_second": 47.265, "step": 25000 }, { "epoch": 2.9579067121729237, "grad_norm": 1.0906000137329102, "learning_rate": 0.0007197139797510538, "loss": 1.055, "step": 26000 }, { "epoch": 3.0716723549488054, "grad_norm": 0.9203127026557922, "learning_rate": 0.0007097624442901132, "loss": 1.0186, "step": 27000 }, { "epoch": 3.185437997724687, "grad_norm": 0.9751584529876709, "learning_rate": 0.0006993071824080197, "loss": 1.0015, "step": 28000 }, { "epoch": 3.299203640500569, "grad_norm": 0.9114183187484741, "learning_rate": 0.0006883651961389032, "loss": 1.0015, "step": 29000 }, { "epoch": 3.4129692832764507, "grad_norm": 0.8756843209266663, "learning_rate": 0.0006769542790135331, "loss": 1.0029, "step": 30000 }, { "epoch": 3.4129692832764507, "eval_accuracy": 0.732152, "eval_loss": 1.0556296110153198, "eval_runtime": 10.3382, "eval_samples_per_second": 24182.19, "eval_steps_per_second": 47.3, "step": 30000 }, { "epoch": 3.526734926052332, "grad_norm": 0.9515267014503479, "learning_rate": 0.0006650929871240102, "loss": 1.0071, "step": 31000 }, { "epoch": 3.640500568828214, "grad_norm": 0.8560661673545837, "learning_rate": 0.0006528131100577897, "loss": 1.0052, "step": 32000 }, { "epoch": 3.7542662116040955, "grad_norm": 0.7972965836524963, "learning_rate": 0.0006401100359805646, "loss": 0.9941, "step": 33000 }, { "epoch": 3.868031854379977, "grad_norm": 0.8569052219390869, "learning_rate": 0.0006270165021451055, "loss": 0.9958, "step": 34000 }, { "epoch": 3.981797497155859, "grad_norm": 0.8554688096046448, "learning_rate": 0.0006135538008644762, "loss": 0.9936, "step": 35000 }, { "epoch": 3.981797497155859, "eval_accuracy": 0.737548, "eval_loss": 1.0317354202270508, "eval_runtime": 10.3557, "eval_samples_per_second": 24141.212, "eval_steps_per_second": 47.22, "step": 35000 }, { "epoch": 4.09556313993174, "grad_norm": 0.8586387634277344, "learning_rate": 0.0005997438247807972, "loss": 0.9476, "step": 36000 }, { "epoch": 4.2093287827076225, "grad_norm": 0.8404794335365295, "learning_rate": 0.0005856233206956809, "loss": 0.9405, "step": 37000 }, { "epoch": 4.323094425483504, "grad_norm": 0.8823213577270508, "learning_rate": 0.0005711869855083862, "loss": 0.9397, "step": 38000 }, { "epoch": 4.436860068259386, "grad_norm": 0.8017415404319763, "learning_rate": 0.0005565019625838785, "loss": 0.9469, "step": 39000 }, { "epoch": 4.550625711035267, "grad_norm": 0.8456715941429138, "learning_rate": 0.0005415332824531774, "loss": 0.9429, "step": 40000 }, { "epoch": 4.550625711035267, "eval_accuracy": 0.742428, "eval_loss": 1.0149633884429932, "eval_runtime": 10.3228, "eval_samples_per_second": 24218.315, "eval_steps_per_second": 47.371, "step": 40000 }, { "epoch": 4.664391353811149, "grad_norm": 0.920251190662384, "learning_rate": 0.0005263344451321572, "loss": 0.9433, "step": 41000 }, { "epoch": 4.778156996587031, "grad_norm": 0.7446300983428955, "learning_rate": 0.000510930166515435, "loss": 0.9433, "step": 42000 }, { "epoch": 4.891922639362912, "grad_norm": 0.9206159114837646, "learning_rate": 0.0004953454965801175, "loss": 0.9442, "step": 43000 }, { "epoch": 5.005688282138794, "grad_norm": 0.9331917762756348, "learning_rate": 0.00047960577865027823, "loss": 0.9357, "step": 44000 }, { "epoch": 5.1194539249146755, "grad_norm": 0.8155319094657898, "learning_rate": 0.0004637366081844012, "loss": 0.8818, "step": 45000 }, { "epoch": 5.1194539249146755, "eval_accuracy": 0.74508, "eval_loss": 1.0118529796600342, "eval_runtime": 10.3937, "eval_samples_per_second": 24053.141, "eval_steps_per_second": 47.048, "step": 45000 }, { "epoch": 5.233219567690558, "grad_norm": 0.9535221457481384, "learning_rate": 0.0004477637911528123, "loss": 0.884, "step": 46000 }, { "epoch": 5.346985210466439, "grad_norm": 0.9470248222351074, "learning_rate": 0.0004317293826802243, "loss": 0.8892, "step": 47000 }, { "epoch": 5.460750853242321, "grad_norm": 0.87973952293396, "learning_rate": 0.0004156273608793912, "loss": 0.8903, "step": 48000 }, { "epoch": 5.5745164960182025, "grad_norm": 0.8131686449050903, "learning_rate": 0.0003995160577323998, "loss": 0.8871, "step": 49000 }, { "epoch": 5.688282138794084, "grad_norm": 0.8810710906982422, "learning_rate": 0.0003833894224734173, "loss": 0.8868, "step": 50000 }, { "epoch": 5.688282138794084, "eval_accuracy": 0.74856, "eval_loss": 0.9947025179862976, "eval_runtime": 10.546, "eval_samples_per_second": 23705.725, "eval_steps_per_second": 46.368, "step": 50000 }, { "epoch": 5.802047781569966, "grad_norm": 0.9855514764785767, "learning_rate": 0.0003673058762504636, "loss": 0.8876, "step": 51000 }, { "epoch": 5.915813424345847, "grad_norm": 0.9044457674026489, "learning_rate": 0.0003512593787221045, "loss": 0.8879, "step": 52000 }, { "epoch": 6.0295790671217295, "grad_norm": 0.7832645773887634, "learning_rate": 0.0003353239798219901, "loss": 0.8673, "step": 53000 }, { "epoch": 6.143344709897611, "grad_norm": 0.8719345331192017, "learning_rate": 0.0003194617325587946, "loss": 0.8263, "step": 54000 }, { "epoch": 6.257110352673493, "grad_norm": 0.8254925608634949, "learning_rate": 0.0003037304542170158, "loss": 0.8323, "step": 55000 }, { "epoch": 6.257110352673493, "eval_accuracy": 0.74912, "eval_loss": 1.0007187128067017, "eval_runtime": 11.7739, "eval_samples_per_second": 21233.418, "eval_steps_per_second": 41.533, "step": 55000 }, { "epoch": 6.370875995449374, "grad_norm": 0.9038862586021423, "learning_rate": 0.00028815572653093183, "loss": 0.8324, "step": 56000 }, { "epoch": 6.484641638225256, "grad_norm": 0.9034783244132996, "learning_rate": 0.0002727781703007723, "loss": 0.8346, "step": 57000 }, { "epoch": 6.598407281001138, "grad_norm": 0.9226493239402771, "learning_rate": 0.0002576372347370359, "loss": 0.8362, "step": 58000 }, { "epoch": 6.712172923777019, "grad_norm": 0.964350163936615, "learning_rate": 0.0002426819230705446, "loss": 0.8375, "step": 59000 }, { "epoch": 6.825938566552901, "grad_norm": 0.9121440052986145, "learning_rate": 0.00022798243735498786, "loss": 0.838, "step": 60000 }, { "epoch": 6.825938566552901, "eval_accuracy": 0.752208, "eval_loss": 0.985443651676178, "eval_runtime": 11.2968, "eval_samples_per_second": 22130.153, "eval_steps_per_second": 43.287, "step": 60000 }, { "epoch": 6.939704209328783, "grad_norm": 0.9455272555351257, "learning_rate": 0.00021356268145433904, "loss": 0.83, "step": 61000 }, { "epoch": 7.053469852104665, "grad_norm": 0.888077437877655, "learning_rate": 0.0001994461043443134, "loss": 0.8038, "step": 62000 }, { "epoch": 7.167235494880546, "grad_norm": 0.9904555678367615, "learning_rate": 0.00018565566198034617, "loss": 0.7762, "step": 63000 }, { "epoch": 7.281001137656427, "grad_norm": 0.9325763583183289, "learning_rate": 0.00017221377996730371, "loss": 0.7832, "step": 64000 }, { "epoch": 7.39476678043231, "grad_norm": 1.1246719360351562, "learning_rate": 0.00015915519640338763, "loss": 0.7835, "step": 65000 }, { "epoch": 7.39476678043231, "eval_accuracy": 0.752124, "eval_loss": 0.9989385604858398, "eval_runtime": 11.8501, "eval_samples_per_second": 21096.838, "eval_steps_per_second": 41.265, "step": 65000 }, { "epoch": 7.508532423208191, "grad_norm": 0.7936950922012329, "learning_rate": 0.0001464750070030455, "loss": 0.785, "step": 66000 }, { "epoch": 7.622298065984073, "grad_norm": 1.1911970376968384, "learning_rate": 0.0001342191476487465, "loss": 0.7855, "step": 67000 }, { "epoch": 7.736063708759954, "grad_norm": 1.1150842905044556, "learning_rate": 0.00012238301552628276, "loss": 0.7876, "step": 68000 }, { "epoch": 7.849829351535837, "grad_norm": 0.8056913018226624, "learning_rate": 0.00011100948879440256, "loss": 0.7854, "step": 69000 }, { "epoch": 7.963594994311718, "grad_norm": 0.9775083065032959, "learning_rate": 0.00010009429600800158, "loss": 0.7836, "step": 70000 }, { "epoch": 7.963594994311718, "eval_accuracy": 0.753456, "eval_loss": 0.9900269508361816, "eval_runtime": 11.6532, "eval_samples_per_second": 21453.25, "eval_steps_per_second": 41.963, "step": 70000 }, { "epoch": 8.0773606370876, "grad_norm": 0.8832221627235413, "learning_rate": 8.966680090652002e-05, "loss": 0.7513, "step": 71000 }, { "epoch": 8.19112627986348, "grad_norm": 1.0215216875076294, "learning_rate": 7.97439603705802e-05, "loss": 0.7438, "step": 72000 }, { "epoch": 8.304891922639364, "grad_norm": 0.9041171669960022, "learning_rate": 7.035104738078215e-05, "loss": 0.7425, "step": 73000 }, { "epoch": 8.418657565415245, "grad_norm": 0.9626539349555969, "learning_rate": 6.148453433191126e-05, "loss": 0.7463, "step": 74000 }, { "epoch": 8.532423208191126, "grad_norm": 1.0108016729354858, "learning_rate": 5.317654149350526e-05, "loss": 0.7451, "step": 75000 }, { "epoch": 8.532423208191126, "eval_accuracy": 0.752852, "eval_loss": 1.00435471534729, "eval_runtime": 11.8355, "eval_samples_per_second": 21122.845, "eval_steps_per_second": 41.316, "step": 75000 }, { "epoch": 8.646188850967008, "grad_norm": 0.9667897820472717, "learning_rate": 4.5423948713286365e-05, "loss": 0.7421, "step": 76000 }, { "epoch": 8.759954493742889, "grad_norm": 0.8784095644950867, "learning_rate": 3.825484091802838e-05, "loss": 0.7486, "step": 77000 }, { "epoch": 8.873720136518772, "grad_norm": 0.9861534833908081, "learning_rate": 3.166652567546153e-05, "loss": 0.7426, "step": 78000 }, { "epoch": 8.987485779294653, "grad_norm": 0.9912355542182922, "learning_rate": 2.5682870142857394e-05, "loss": 0.7428, "step": 79000 }, { "epoch": 9.101251422070535, "grad_norm": 0.8502938747406006, "learning_rate": 2.0301627096753005e-05, "loss": 0.7207, "step": 80000 }, { "epoch": 9.101251422070535, "eval_accuracy": 0.753112, "eval_loss": 1.0053608417510986, "eval_runtime": 12.237, "eval_samples_per_second": 20429.87, "eval_steps_per_second": 39.961, "step": 80000 }, { "epoch": 9.215017064846416, "grad_norm": 0.8870697617530823, "learning_rate": 1.5537838186957887e-05, "loss": 0.7224, "step": 81000 }, { "epoch": 9.328782707622299, "grad_norm": 0.9085851907730103, "learning_rate": 1.1399250144500695e-05, "loss": 0.7236, "step": 82000 }, { "epoch": 9.44254835039818, "grad_norm": 0.9153009057044983, "learning_rate": 7.895782041054834e-06, "loss": 0.7208, "step": 83000 }, { "epoch": 9.556313993174061, "grad_norm": 0.9525237679481506, "learning_rate": 5.026118113090661e-06, "loss": 0.7208, "step": 84000 }, { "epoch": 9.670079635949943, "grad_norm": 0.9772939682006836, "learning_rate": 2.8006541473553527e-06, "loss": 0.721, "step": 85000 }, { "epoch": 9.670079635949943, "eval_accuracy": 0.75292, "eval_loss": 1.0081429481506348, "eval_runtime": 12.3975, "eval_samples_per_second": 20165.284, "eval_steps_per_second": 39.443, "step": 85000 }, { "epoch": 9.783845278725824, "grad_norm": 0.9219892024993896, "learning_rate": 1.2198128698185597e-06, "loss": 0.7207, "step": 86000 }, { "epoch": 9.897610921501707, "grad_norm": 0.8879119157791138, "learning_rate": 2.8555010796385004e-07, "loss": 0.7195, "step": 87000 }, { "epoch": 10.0, "step": 87900, "total_flos": 9.6637212e+17, "train_loss": 1.0272872150581716, "train_runtime": 4575.9892, "train_samples_per_second": 9833.939, "train_steps_per_second": 19.209 } ], "logging_steps": 1000, "max_steps": 87900, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5000, "total_flos": 9.6637212e+17, "train_batch_size": 512, "trial_name": null, "trial_params": null }