|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 5000, |
|
"global_step": 87900, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11376564277588168, |
|
"grad_norm": 4.130064964294434, |
|
"learning_rate": 7.960000000000001e-05, |
|
"loss": 5.5183, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.22753128555176336, |
|
"grad_norm": 3.2718617916107178, |
|
"learning_rate": 0.0001596, |
|
"loss": 2.7046, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3412969283276451, |
|
"grad_norm": 3.3528096675872803, |
|
"learning_rate": 0.0002396, |
|
"loss": 1.8501, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.4550625711035267, |
|
"grad_norm": 3.1067423820495605, |
|
"learning_rate": 0.0003196, |
|
"loss": 1.6051, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.5688282138794084, |
|
"grad_norm": 2.107377052307129, |
|
"learning_rate": 0.0003996, |
|
"loss": 1.4934, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5688282138794084, |
|
"eval_accuracy": 0.644404, |
|
"eval_loss": 1.4417771100997925, |
|
"eval_runtime": 11.7201, |
|
"eval_samples_per_second": 21330.792, |
|
"eval_steps_per_second": 41.723, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.6825938566552902, |
|
"grad_norm": 2.3725779056549072, |
|
"learning_rate": 0.00047960000000000006, |
|
"loss": 1.4263, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.7963594994311718, |
|
"grad_norm": 1.8981488943099976, |
|
"learning_rate": 0.0005596, |
|
"loss": 1.3761, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.9101251422070534, |
|
"grad_norm": 1.904260516166687, |
|
"learning_rate": 0.0006396, |
|
"loss": 1.339, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.023890784982935, |
|
"grad_norm": 1.4444975852966309, |
|
"learning_rate": 0.00071952, |
|
"loss": 1.3071, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.1376564277588168, |
|
"grad_norm": 1.3250641822814941, |
|
"learning_rate": 0.00079952, |
|
"loss": 1.2717, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.1376564277588168, |
|
"eval_accuracy": 0.677084, |
|
"eval_loss": 1.288106918334961, |
|
"eval_runtime": 11.2915, |
|
"eval_samples_per_second": 22140.537, |
|
"eval_steps_per_second": 43.307, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.2514220705346986, |
|
"grad_norm": 1.4477468729019165, |
|
"learning_rate": 0.0007996786565611985, |
|
"loss": 1.2557, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.36518771331058, |
|
"grad_norm": 1.1922030448913574, |
|
"learning_rate": 0.0007987086748436788, |
|
"loss": 1.2356, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.4789533560864618, |
|
"grad_norm": 1.2596089839935303, |
|
"learning_rate": 0.0007970896788508052, |
|
"loss": 1.2048, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.5927189988623436, |
|
"grad_norm": 1.0349920988082886, |
|
"learning_rate": 0.0007948275336376884, |
|
"loss": 1.1905, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.7064846416382253, |
|
"grad_norm": 1.2018927335739136, |
|
"learning_rate": 0.0007919213896323948, |
|
"loss": 1.1742, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.7064846416382253, |
|
"eval_accuracy": 0.705248, |
|
"eval_loss": 1.166063904762268, |
|
"eval_runtime": 10.2238, |
|
"eval_samples_per_second": 24452.808, |
|
"eval_steps_per_second": 47.83, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.820250284414107, |
|
"grad_norm": 1.0107839107513428, |
|
"learning_rate": 0.0007883779147866073, |
|
"loss": 1.1634, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.9340159271899886, |
|
"grad_norm": 1.0561189651489258, |
|
"learning_rate": 0.0007842028713931261, |
|
"loss": 1.1485, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.04778156996587, |
|
"grad_norm": 1.2642914056777954, |
|
"learning_rate": 0.0007794030487826318, |
|
"loss": 1.1177, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.161547212741752, |
|
"grad_norm": 1.041416049003601, |
|
"learning_rate": 0.0007739862522830791, |
|
"loss": 1.0864, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.2753128555176336, |
|
"grad_norm": 1.0880264043807983, |
|
"learning_rate": 0.0007679612905269062, |
|
"loss": 1.0846, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.2753128555176336, |
|
"eval_accuracy": 0.717844, |
|
"eval_loss": 1.1148858070373535, |
|
"eval_runtime": 10.0405, |
|
"eval_samples_per_second": 24899.239, |
|
"eval_steps_per_second": 48.703, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.3890784982935154, |
|
"grad_norm": 1.2247668504714966, |
|
"learning_rate": 0.0007613448798360993, |
|
"loss": 1.0832, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.502844141069397, |
|
"grad_norm": 0.939128577709198, |
|
"learning_rate": 0.0007541345353494786, |
|
"loss": 1.0718, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.616609783845279, |
|
"grad_norm": 0.8472806811332703, |
|
"learning_rate": 0.0007463563776182788, |
|
"loss": 1.0741, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.73037542662116, |
|
"grad_norm": 0.9360683560371399, |
|
"learning_rate": 0.000738007485475254, |
|
"loss": 1.066, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.8441410693970424, |
|
"grad_norm": 0.8565033674240112, |
|
"learning_rate": 0.0007291456059015493, |
|
"loss": 1.0619, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.8441410693970424, |
|
"eval_accuracy": 0.726136, |
|
"eval_loss": 1.077797293663025, |
|
"eval_runtime": 10.346, |
|
"eval_samples_per_second": 24163.966, |
|
"eval_steps_per_second": 47.265, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.9579067121729237, |
|
"grad_norm": 1.0906000137329102, |
|
"learning_rate": 0.0007197139797510538, |
|
"loss": 1.055, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 3.0716723549488054, |
|
"grad_norm": 0.9203127026557922, |
|
"learning_rate": 0.0007097624442901132, |
|
"loss": 1.0186, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 3.185437997724687, |
|
"grad_norm": 0.9751584529876709, |
|
"learning_rate": 0.0006993071824080197, |
|
"loss": 1.0015, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 3.299203640500569, |
|
"grad_norm": 0.9114183187484741, |
|
"learning_rate": 0.0006883651961389032, |
|
"loss": 1.0015, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 3.4129692832764507, |
|
"grad_norm": 0.8756843209266663, |
|
"learning_rate": 0.0006769542790135331, |
|
"loss": 1.0029, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.4129692832764507, |
|
"eval_accuracy": 0.732152, |
|
"eval_loss": 1.0556296110153198, |
|
"eval_runtime": 10.3382, |
|
"eval_samples_per_second": 24182.19, |
|
"eval_steps_per_second": 47.3, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.526734926052332, |
|
"grad_norm": 0.9515267014503479, |
|
"learning_rate": 0.0006650929871240102, |
|
"loss": 1.0071, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 3.640500568828214, |
|
"grad_norm": 0.8560661673545837, |
|
"learning_rate": 0.0006528131100577897, |
|
"loss": 1.0052, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 3.7542662116040955, |
|
"grad_norm": 0.7972965836524963, |
|
"learning_rate": 0.0006401100359805646, |
|
"loss": 0.9941, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 3.868031854379977, |
|
"grad_norm": 0.8569052219390869, |
|
"learning_rate": 0.0006270165021451055, |
|
"loss": 0.9958, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 3.981797497155859, |
|
"grad_norm": 0.8554688096046448, |
|
"learning_rate": 0.0006135538008644762, |
|
"loss": 0.9936, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 3.981797497155859, |
|
"eval_accuracy": 0.737548, |
|
"eval_loss": 1.0317354202270508, |
|
"eval_runtime": 10.3557, |
|
"eval_samples_per_second": 24141.212, |
|
"eval_steps_per_second": 47.22, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 4.09556313993174, |
|
"grad_norm": 0.8586387634277344, |
|
"learning_rate": 0.0005997438247807972, |
|
"loss": 0.9476, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 4.2093287827076225, |
|
"grad_norm": 0.8404794335365295, |
|
"learning_rate": 0.0005856233206956809, |
|
"loss": 0.9405, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 4.323094425483504, |
|
"grad_norm": 0.8823213577270508, |
|
"learning_rate": 0.0005711869855083862, |
|
"loss": 0.9397, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 4.436860068259386, |
|
"grad_norm": 0.8017415404319763, |
|
"learning_rate": 0.0005565019625838785, |
|
"loss": 0.9469, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 4.550625711035267, |
|
"grad_norm": 0.8456715941429138, |
|
"learning_rate": 0.0005415332824531774, |
|
"loss": 0.9429, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 4.550625711035267, |
|
"eval_accuracy": 0.742428, |
|
"eval_loss": 1.0149633884429932, |
|
"eval_runtime": 10.3228, |
|
"eval_samples_per_second": 24218.315, |
|
"eval_steps_per_second": 47.371, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 4.664391353811149, |
|
"grad_norm": 0.920251190662384, |
|
"learning_rate": 0.0005263344451321572, |
|
"loss": 0.9433, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 4.778156996587031, |
|
"grad_norm": 0.7446300983428955, |
|
"learning_rate": 0.000510930166515435, |
|
"loss": 0.9433, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 4.891922639362912, |
|
"grad_norm": 0.9206159114837646, |
|
"learning_rate": 0.0004953454965801175, |
|
"loss": 0.9442, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 5.005688282138794, |
|
"grad_norm": 0.9331917762756348, |
|
"learning_rate": 0.00047960577865027823, |
|
"loss": 0.9357, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 5.1194539249146755, |
|
"grad_norm": 0.8155319094657898, |
|
"learning_rate": 0.0004637366081844012, |
|
"loss": 0.8818, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 5.1194539249146755, |
|
"eval_accuracy": 0.74508, |
|
"eval_loss": 1.0118529796600342, |
|
"eval_runtime": 10.3937, |
|
"eval_samples_per_second": 24053.141, |
|
"eval_steps_per_second": 47.048, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 5.233219567690558, |
|
"grad_norm": 0.9535221457481384, |
|
"learning_rate": 0.0004477637911528123, |
|
"loss": 0.884, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 5.346985210466439, |
|
"grad_norm": 0.9470248222351074, |
|
"learning_rate": 0.0004317293826802243, |
|
"loss": 0.8892, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 5.460750853242321, |
|
"grad_norm": 0.87973952293396, |
|
"learning_rate": 0.0004156273608793912, |
|
"loss": 0.8903, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 5.5745164960182025, |
|
"grad_norm": 0.8131686449050903, |
|
"learning_rate": 0.0003995160577323998, |
|
"loss": 0.8871, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 5.688282138794084, |
|
"grad_norm": 0.8810710906982422, |
|
"learning_rate": 0.0003833894224734173, |
|
"loss": 0.8868, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 5.688282138794084, |
|
"eval_accuracy": 0.74856, |
|
"eval_loss": 0.9947025179862976, |
|
"eval_runtime": 10.546, |
|
"eval_samples_per_second": 23705.725, |
|
"eval_steps_per_second": 46.368, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 5.802047781569966, |
|
"grad_norm": 0.9855514764785767, |
|
"learning_rate": 0.0003673058762504636, |
|
"loss": 0.8876, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 5.915813424345847, |
|
"grad_norm": 0.9044457674026489, |
|
"learning_rate": 0.0003512593787221045, |
|
"loss": 0.8879, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 6.0295790671217295, |
|
"grad_norm": 0.7832645773887634, |
|
"learning_rate": 0.0003353239798219901, |
|
"loss": 0.8673, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 6.143344709897611, |
|
"grad_norm": 0.8719345331192017, |
|
"learning_rate": 0.0003194617325587946, |
|
"loss": 0.8263, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 6.257110352673493, |
|
"grad_norm": 0.8254925608634949, |
|
"learning_rate": 0.0003037304542170158, |
|
"loss": 0.8323, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 6.257110352673493, |
|
"eval_accuracy": 0.74912, |
|
"eval_loss": 1.0007187128067017, |
|
"eval_runtime": 11.7739, |
|
"eval_samples_per_second": 21233.418, |
|
"eval_steps_per_second": 41.533, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 6.370875995449374, |
|
"grad_norm": 0.9038862586021423, |
|
"learning_rate": 0.00028815572653093183, |
|
"loss": 0.8324, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 6.484641638225256, |
|
"grad_norm": 0.9034783244132996, |
|
"learning_rate": 0.0002727781703007723, |
|
"loss": 0.8346, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 6.598407281001138, |
|
"grad_norm": 0.9226493239402771, |
|
"learning_rate": 0.0002576372347370359, |
|
"loss": 0.8362, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 6.712172923777019, |
|
"grad_norm": 0.964350163936615, |
|
"learning_rate": 0.0002426819230705446, |
|
"loss": 0.8375, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 6.825938566552901, |
|
"grad_norm": 0.9121440052986145, |
|
"learning_rate": 0.00022798243735498786, |
|
"loss": 0.838, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 6.825938566552901, |
|
"eval_accuracy": 0.752208, |
|
"eval_loss": 0.985443651676178, |
|
"eval_runtime": 11.2968, |
|
"eval_samples_per_second": 22130.153, |
|
"eval_steps_per_second": 43.287, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 6.939704209328783, |
|
"grad_norm": 0.9455272555351257, |
|
"learning_rate": 0.00021356268145433904, |
|
"loss": 0.83, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 7.053469852104665, |
|
"grad_norm": 0.888077437877655, |
|
"learning_rate": 0.0001994461043443134, |
|
"loss": 0.8038, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 7.167235494880546, |
|
"grad_norm": 0.9904555678367615, |
|
"learning_rate": 0.00018565566198034617, |
|
"loss": 0.7762, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 7.281001137656427, |
|
"grad_norm": 0.9325763583183289, |
|
"learning_rate": 0.00017221377996730371, |
|
"loss": 0.7832, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 7.39476678043231, |
|
"grad_norm": 1.1246719360351562, |
|
"learning_rate": 0.00015915519640338763, |
|
"loss": 0.7835, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 7.39476678043231, |
|
"eval_accuracy": 0.752124, |
|
"eval_loss": 0.9989385604858398, |
|
"eval_runtime": 11.8501, |
|
"eval_samples_per_second": 21096.838, |
|
"eval_steps_per_second": 41.265, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 7.508532423208191, |
|
"grad_norm": 0.7936950922012329, |
|
"learning_rate": 0.0001464750070030455, |
|
"loss": 0.785, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 7.622298065984073, |
|
"grad_norm": 1.1911970376968384, |
|
"learning_rate": 0.0001342191476487465, |
|
"loss": 0.7855, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 7.736063708759954, |
|
"grad_norm": 1.1150842905044556, |
|
"learning_rate": 0.00012238301552628276, |
|
"loss": 0.7876, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 7.849829351535837, |
|
"grad_norm": 0.8056913018226624, |
|
"learning_rate": 0.00011100948879440256, |
|
"loss": 0.7854, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 7.963594994311718, |
|
"grad_norm": 0.9775083065032959, |
|
"learning_rate": 0.00010009429600800158, |
|
"loss": 0.7836, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 7.963594994311718, |
|
"eval_accuracy": 0.753456, |
|
"eval_loss": 0.9900269508361816, |
|
"eval_runtime": 11.6532, |
|
"eval_samples_per_second": 21453.25, |
|
"eval_steps_per_second": 41.963, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 8.0773606370876, |
|
"grad_norm": 0.8832221627235413, |
|
"learning_rate": 8.966680090652002e-05, |
|
"loss": 0.7513, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 8.19112627986348, |
|
"grad_norm": 1.0215216875076294, |
|
"learning_rate": 7.97439603705802e-05, |
|
"loss": 0.7438, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 8.304891922639364, |
|
"grad_norm": 0.9041171669960022, |
|
"learning_rate": 7.035104738078215e-05, |
|
"loss": 0.7425, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 8.418657565415245, |
|
"grad_norm": 0.9626539349555969, |
|
"learning_rate": 6.148453433191126e-05, |
|
"loss": 0.7463, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 8.532423208191126, |
|
"grad_norm": 1.0108016729354858, |
|
"learning_rate": 5.317654149350526e-05, |
|
"loss": 0.7451, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 8.532423208191126, |
|
"eval_accuracy": 0.752852, |
|
"eval_loss": 1.00435471534729, |
|
"eval_runtime": 11.8355, |
|
"eval_samples_per_second": 21122.845, |
|
"eval_steps_per_second": 41.316, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 8.646188850967008, |
|
"grad_norm": 0.9667897820472717, |
|
"learning_rate": 4.5423948713286365e-05, |
|
"loss": 0.7421, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 8.759954493742889, |
|
"grad_norm": 0.8784095644950867, |
|
"learning_rate": 3.825484091802838e-05, |
|
"loss": 0.7486, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 8.873720136518772, |
|
"grad_norm": 0.9861534833908081, |
|
"learning_rate": 3.166652567546153e-05, |
|
"loss": 0.7426, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 8.987485779294653, |
|
"grad_norm": 0.9912355542182922, |
|
"learning_rate": 2.5682870142857394e-05, |
|
"loss": 0.7428, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 9.101251422070535, |
|
"grad_norm": 0.8502938747406006, |
|
"learning_rate": 2.0301627096753005e-05, |
|
"loss": 0.7207, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 9.101251422070535, |
|
"eval_accuracy": 0.753112, |
|
"eval_loss": 1.0053608417510986, |
|
"eval_runtime": 12.237, |
|
"eval_samples_per_second": 20429.87, |
|
"eval_steps_per_second": 39.961, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 9.215017064846416, |
|
"grad_norm": 0.8870697617530823, |
|
"learning_rate": 1.5537838186957887e-05, |
|
"loss": 0.7224, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 9.328782707622299, |
|
"grad_norm": 0.9085851907730103, |
|
"learning_rate": 1.1399250144500695e-05, |
|
"loss": 0.7236, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 9.44254835039818, |
|
"grad_norm": 0.9153009057044983, |
|
"learning_rate": 7.895782041054834e-06, |
|
"loss": 0.7208, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 9.556313993174061, |
|
"grad_norm": 0.9525237679481506, |
|
"learning_rate": 5.026118113090661e-06, |
|
"loss": 0.7208, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 9.670079635949943, |
|
"grad_norm": 0.9772939682006836, |
|
"learning_rate": 2.8006541473553527e-06, |
|
"loss": 0.721, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 9.670079635949943, |
|
"eval_accuracy": 0.75292, |
|
"eval_loss": 1.0081429481506348, |
|
"eval_runtime": 12.3975, |
|
"eval_samples_per_second": 20165.284, |
|
"eval_steps_per_second": 39.443, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 9.783845278725824, |
|
"grad_norm": 0.9219892024993896, |
|
"learning_rate": 1.2198128698185597e-06, |
|
"loss": 0.7207, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 9.897610921501707, |
|
"grad_norm": 0.8879119157791138, |
|
"learning_rate": 2.8555010796385004e-07, |
|
"loss": 0.7195, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 87900, |
|
"total_flos": 9.6637212e+17, |
|
"train_loss": 1.0272872150581716, |
|
"train_runtime": 4575.9892, |
|
"train_samples_per_second": 9833.939, |
|
"train_steps_per_second": 19.209 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 87900, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 5000, |
|
"total_flos": 9.6637212e+17, |
|
"train_batch_size": 512, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|