diff --git a/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4f28a4b786996566a60498e807fb2c2c023865f --- /dev/null +++ b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bcd5d1be4eef090f25107320bdc3fa45b0b49de4aa932c10101ecbf8e909fc2 +size 10570070620 diff --git a/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fc82956342426c7717ebfde736b686ae065483f --- /dev/null +++ b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f6bf8328543f213865d2e8e3e1bb3fdd8a265a04a1481281f124ec81bfe479a +size 10570070556 diff --git a/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad5d9b191c77f9f73a8d5e1076285bcffcbe0acc --- /dev/null +++ b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f75f4fb108ec829abf2bfaaf794e1fbcc46e4d81dcd73d6a6410e51c3e46788 +size 10570070620 diff --git a/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..944069778b859b7a92a9f3581a22c88d8de37b22 --- /dev/null +++ b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9482ed426a1450f27282964cb0f01100a576d247113b61b3e97f86e1e2d8fcf5 +size 10570070556 diff --git a/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee352704fd61d1b71f737aba0e0a4c2641f9d2c2 --- /dev/null +++ b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e982fd709bf79f0a9cf2e8b31920d55684fca9cd2e51c27190fbde533de19a22 +size 10570070748 diff --git a/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e93e0522571237ac86399405e85e35353403314f --- /dev/null +++ b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2809a7ddac3b3ce9e29dbf5c255fc50f36f674438b50f400158ff1a4b8cf07a2 +size 10570070556 diff --git a/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15705d7e528f54490a23acf251c22909e207f5aa --- /dev/null +++ b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1c23c9d0ec7fb7f407d540ce769597c583952c41efe961406f061919d63c475 +size 10570070556 diff --git a/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8826dd8bf9714b86058cb4a1b000a885e1e42e49 --- /dev/null +++ b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d88b079ca7d9ddf65b649b7d3a9a7c02a4401f76af13f2a1b7543cac1531448 +size 10570070428 diff --git a/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/mp_rank_00_model_states.pt b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ea20eff6252c71a260380f174c2c8a1e9df708f --- /dev/null +++ b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/checkpoint/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e8265ce197916dfa690de7b0cd270c18914ee9baadf3cc000bd992ee88c47f8 +size 15142167290 diff --git a/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/latest b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/latest new file mode 100644 index 0000000000000000000000000000000000000000..cba72880aceabcd7bb2ab41315a70f60d597b12a --- /dev/null +++ b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/latest @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47320987f9a49d5b00119b960f247a956773f57543982b8bfcb6da5bb3afd9ef +size 10 diff --git a/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/zero_to_fp32.py b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..fa3d3fabcb348988a649d21ba1cea6973b029a39 --- /dev/null +++ b/checkpoints/epoch=1-step=38-train_total_step=0.35786.ckpt/zero_to_fp32.py @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92dcd9cc03d6010b4d060b73c8826d71ff9e29a030154e9f192e34ad80457f3a +size 29219 diff --git a/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd66611f0ba960dafd71723a85025418344d9326 --- /dev/null +++ b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b8500b3023771afba20a264cb6ca23e731137a167c45707b7a7202b47fd33e4 +size 10570070620 diff --git a/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc0f5206dba7ed5e66cfc1400f4919f9dee86d35 --- /dev/null +++ b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7760e744ac17ea940dc47bd1664669b391bbb480967b1c135c92ca129e133462 +size 10570070556 diff --git a/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..170cf99832577bb633d9d3dc3f7993c4a17accbf --- /dev/null +++ b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c72ce2b43aef6723e0a87d44d4c2175d3d9cf4356c9dcf48dbe9b736a8b420a5 +size 10570070620 diff --git a/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..707912d43d6e9ced5a3bc74029c7d7ba98a98634 --- /dev/null +++ b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7f2590fee04f9f3349b7fd3dfb76abbe4087e1a724c541f3651fbe5e3ffc45e +size 10570070556 diff --git a/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8de5f3a0f2c59c52c133fd0b0e1a98aa00efba59 --- /dev/null +++ b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:109bd401bd3a5909cdd8a1aaa09621d65f689464789eeb38fced8e42eb43bd6f +size 10570070748 diff --git a/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3229b037555c5912169604b857bd0fcdc00c2b0b --- /dev/null +++ b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae23f5a7c636e9db5b90b20bb65d4c21336079d971642217223d8239d07c30cd +size 10570070556 diff --git a/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb1b2b0afca59e54ce3494e12ce3c2414be159a2 --- /dev/null +++ b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bd01377da6314c1d67534034bcfdcaa39e395353637fe262ec24f534179c1ff +size 10570070556 diff --git a/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..158167813b38352e181c3d91acd62c03439ae90a --- /dev/null +++ b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bc66859f10acba5ae0f2a2d138cd6b9926265842607f7ac0859297e63ee4324 +size 10570070428 diff --git a/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/mp_rank_00_model_states.pt b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02774e6d09d501dac5116f90c85bab7a819ae197 --- /dev/null +++ b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/checkpoint/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df64ca27b1ddf7146e7e1e8c1c7084985e87f1b6da001a0ad95ad33c617b1cb4 +size 15142167290 diff --git a/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/latest b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/latest new file mode 100644 index 0000000000000000000000000000000000000000..cba72880aceabcd7bb2ab41315a70f60d597b12a --- /dev/null +++ b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/latest @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47320987f9a49d5b00119b960f247a956773f57543982b8bfcb6da5bb3afd9ef +size 10 diff --git a/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/zero_to_fp32.py b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..fa3d3fabcb348988a649d21ba1cea6973b029a39 --- /dev/null +++ b/checkpoints/epoch=1-step=43-train_total_step=0.31789.ckpt/zero_to_fp32.py @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92dcd9cc03d6010b4d060b73c8826d71ff9e29a030154e9f192e34ad80457f3a +size 29219 diff --git a/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be5311551b515be4799901b2f668f0d62d871b1b --- /dev/null +++ b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:501577397d945219e1308d30038d42b8de0a677b25ac25e5e2e403b80fe19c7b +size 10570070620 diff --git a/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2605d4fbb6f4f29f0e3ef66529b7be638131216d --- /dev/null +++ b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd629c05478f096223d9fa65b5ca358d230b40e84818645d9231605453d55ec4 +size 10570070556 diff --git a/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f33fdc1f14fa384d97ff55f054d7eba4810d603 --- /dev/null +++ b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66a07adc1c481e072830051041682643afc6d64cfb57f0b7a8e59fdf0f6ee29b +size 10570070620 diff --git a/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..08668089cfd497db4f5210bdb20fef242cb78d9e --- /dev/null +++ b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:829a1e30c609431936aabc3d924132109499b0b8624e15241c0ae54bf9cbd125 +size 10570070556 diff --git a/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97294bf2febf826fb4ddabc416f4e3f86fce4303 --- /dev/null +++ b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f349ebd3da49e26f9cddfb2a10f67e115875e74424d991fc903e65db50ebf955 +size 10570070748 diff --git a/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef619e4a9dd0224cc53eb78e2e4cce191b2a195d --- /dev/null +++ b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e3ce540ab06e73ba7e08adc57a0fe9064114696c273645eadc4cfa900281419 +size 10570070556 diff --git a/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8e753db56bc9cc55a829807ed288c503d2415d0 --- /dev/null +++ b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67710470cb06bdf52eaff528107a0b1fa6a07ce00bd818bb98bb4a558849532a +size 10570070556 diff --git a/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bb5ba8bd88754a9dee6b050ba370d848988bccc --- /dev/null +++ b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7a98d767a177caa7903e2c349d993e9f751312173f8b20c9be8685ad4622b1c +size 10570070428 diff --git a/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/mp_rank_00_model_states.pt b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..481bbac60583b6835e4a12b6a4bee5ed7515d83f --- /dev/null +++ b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/checkpoint/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:846d6636b7e849b04851bd30ce6708acd989c347200746e78780b824dda4e4a4 +size 15142167290 diff --git a/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/latest b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/latest new file mode 100644 index 0000000000000000000000000000000000000000..cba72880aceabcd7bb2ab41315a70f60d597b12a --- /dev/null +++ b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/latest @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47320987f9a49d5b00119b960f247a956773f57543982b8bfcb6da5bb3afd9ef +size 10 diff --git a/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/zero_to_fp32.py b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..fa3d3fabcb348988a649d21ba1cea6973b029a39 --- /dev/null +++ b/checkpoints/epoch=1-step=48-train_total_step=0.19104.ckpt/zero_to_fp32.py @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92dcd9cc03d6010b4d060b73c8826d71ff9e29a030154e9f192e34ad80457f3a +size 29219 diff --git a/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..968028c8bbb12b589dfa34f19c7c4bbd02d0fe96 --- /dev/null +++ b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:046a98a6fce1f8dc7713d8c0cea09ea0fcf17b1cf0a814e78f0c94e1aad9f732 +size 10570070620 diff --git a/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..190f33db8def2ea298b63225e827ebf0cfa487e1 --- /dev/null +++ b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a01d81d560c5d9080abe2bc5106d5f17f6f84f08418859f9899d2671865b177d +size 10570070556 diff --git a/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89ca300e6bbab1100642186fac653a6b6dd068ea --- /dev/null +++ b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb888db1d15c96b30ebe1d6c54613ff409f987b24150ad5e91bd8d1b65f78572 +size 10570070620 diff --git a/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ff925a969c22082587bb6e3f43c6bdd2c4353e3 --- /dev/null +++ b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:562213605019906235631ccf47fdcc10b76237b580854ec3f6b73abbbf9b8d73 +size 10570070556 diff --git a/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b56bbbe5e3fa37c5837a2846161546fd3aa1b62b --- /dev/null +++ b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2d9c62d7c11941aa1d0de0e4e5df96871599ac8247e4221f45f8299ef489ac +size 10570070748 diff --git a/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18dcc38166edb6323924bd18ed73661be3965da8 --- /dev/null +++ b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f87e0d58c2406bbc72e7434ae0735cf687eb1e34fdf5b02d90f7594a809ebb5 +size 10570070556 diff --git a/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa7279a0f1733846c043a1adac073230fc64923c --- /dev/null +++ b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc0c8ba476ce7cb1b14afb7affb789b853827585d433ba216cae1a2ce58e32ff +size 10570070556 diff --git a/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18f7294accb3f474765f17085b56351dc3365b8a --- /dev/null +++ b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48490d0473c39dfcd4e9f0dbefafabf2724691089ea97a2996469fa0e8b2d60d +size 10570070428 diff --git a/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/mp_rank_00_model_states.pt b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7eefc355b84b70535fe754aacfd98040649b3da --- /dev/null +++ b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/checkpoint/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47251f8ae80b3ece6e5959d6b8d6067e23aebcf3c13e4461a5bb886e1be7c599 +size 15142167290 diff --git a/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/latest b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/latest new file mode 100644 index 0000000000000000000000000000000000000000..cba72880aceabcd7bb2ab41315a70f60d597b12a --- /dev/null +++ b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/latest @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47320987f9a49d5b00119b960f247a956773f57543982b8bfcb6da5bb3afd9ef +size 10 diff --git a/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/zero_to_fp32.py b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..fa3d3fabcb348988a649d21ba1cea6973b029a39 --- /dev/null +++ b/checkpoints/epoch=1-step=53-train_total_step=0.12526.ckpt/zero_to_fp32.py @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92dcd9cc03d6010b4d060b73c8826d71ff9e29a030154e9f192e34ad80457f3a +size 29219 diff --git a/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b691edc797f37834c072a0049dcf97682d6323a4 --- /dev/null +++ b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf9eef68dd8f571e506f6b65061898a762158980bcb0957d06b54dfc95b18e65 +size 10570070620 diff --git a/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7eebc244bcb49899791fe31fc50e23b16ae8155 --- /dev/null +++ b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8a26e307e8ffbfaf759e7f2c02a960cac8012dc5431802fa54ca5814394a659 +size 10570070556 diff --git a/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01c7ccc762548a2cc95ca7bee4d532fbf0457e4a --- /dev/null +++ b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63e9b005d6ef3f42160bc89f2b57e8519163679f14d3175be80c3d67652d2f41 +size 10570070620 diff --git a/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b18e824b754f03f3716b2d2f15a79af02a4a528a --- /dev/null +++ b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe3275e4902e1df091d59fc5bb8ef15acf6c77f2e306f31d82953aadf2ae3b64 +size 10570070556 diff --git a/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76b131ac6c94e8b3d7350508811fa2ede415a84a --- /dev/null +++ b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e65e49e1d422c8dbb1c4a3bbb5918750b828907467a5571c7360d483fb2f08d8 +size 10570070748 diff --git a/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9db6dc6d947347fcf2c8645e62a59fe01bbdf82 --- /dev/null +++ b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe77a514da26ceb4317cae3146482181c1ed292148a0ea387c2666a924e727c3 +size 10570070556 diff --git a/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5df147618aed152805d2a8e43d80b8cfebb2b0e4 --- /dev/null +++ b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3da846c6044aa3d69575a4f9c71397c27f744354e130c43c039bb80464d1a75e +size 10570070556 diff --git a/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a48c2f748a48ccece5318d3f78d5baa172a1ef5 --- /dev/null +++ b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f66c30a760013d698a57662417c3bc720f05da32c974002f4dd7995f138681dd +size 10570070428 diff --git a/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/mp_rank_00_model_states.pt b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6660ff58a89a0e7a1644d23fdee3bc999cc1e6fd --- /dev/null +++ b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/checkpoint/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bd50f39338c025139c0952dc0cace9a2ed3f2a3d132319ac7e5d35db88ed723 +size 15142167290 diff --git a/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/latest b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/latest new file mode 100644 index 0000000000000000000000000000000000000000..cba72880aceabcd7bb2ab41315a70f60d597b12a --- /dev/null +++ b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/latest @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47320987f9a49d5b00119b960f247a956773f57543982b8bfcb6da5bb3afd9ef +size 10 diff --git a/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/output_dir/pytorch_model-00001-of-00002.bin b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/output_dir/pytorch_model-00001-of-00002.bin new file mode 100644 index 0000000000000000000000000000000000000000..00a45ecfb8f3619656ca50d549d9deb697285917 --- /dev/null +++ b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/output_dir/pytorch_model-00001-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eebae089a64d6390e86b944d9aa2a4557de6fe6ce6ed80c3931e29455cee762c +size 28186868793 diff --git a/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/output_dir/pytorch_model-00002-of-00002.bin b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/output_dir/pytorch_model-00002-of-00002.bin new file mode 100644 index 0000000000000000000000000000000000000000..e35fb13c3d30ab3eded93915da92955fe67be122 --- /dev/null +++ b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/output_dir/pytorch_model-00002-of-00002.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9319863497d44c8a19ca4e8d35ce2811d2a109f2d6f49abe76e8e413938c42e0 +size 524330983 diff --git a/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/output_dir/pytorch_model.bin.index.json b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/output_dir/pytorch_model.bin.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0ffb3200548b738e00e758be7f61526fbba6d21b --- /dev/null +++ b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/output_dir/pytorch_model.bin.index.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aef7202ee5939fd42df4912f2a9f1b70faf726831d43733e85c295ed3d434e0e +size 22001 diff --git a/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/zero_to_fp32.py b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..fa3d3fabcb348988a649d21ba1cea6973b029a39 --- /dev/null +++ b/checkpoints/epoch=1-step=56-val_total_epoch=0.13236.ckpt/zero_to_fp32.py @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92dcd9cc03d6010b4d060b73c8826d71ff9e29a030154e9f192e34ad80457f3a +size 29219 diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a51af4bfbc054b1ff73f19fc689f49146d009983 --- /dev/null +++ b/config.yaml @@ -0,0 +1,153 @@ +# lightning.pytorch==2.4.0.dev20240728 +seed_everything: 123 +trainer: + accelerator: gpu + strategy: + class_path: lightning.pytorch.strategies.DeepSpeedStrategy + init_args: + accelerator: null + zero_optimization: true + stage: 2 + remote_device: null + offload_optimizer: false + offload_parameters: true + offload_params_device: cpu + nvme_path: /local_nvme + params_buffer_count: 5 + params_buffer_size: 100000000 + max_in_cpu: 1000000000 + offload_optimizer_device: cpu + optimizer_buffer_count: 4 + block_size: 1048576 + queue_depth: 8 + single_submit: false + overlap_events: true + thread_count: 1 + pin_memory: true + sub_group_size: 1000000000000 + contiguous_gradients: true + overlap_comm: true + allgather_partitions: true + reduce_scatter: true + allgather_bucket_size: 200000000 + reduce_bucket_size: 200000000 + zero_allow_untested_optimizer: true + logging_batch_size_per_gpu: auto + config: null + logging_level: 30 + parallel_devices: null + cluster_environment: null + loss_scale: 0.0 + initial_scale_power: 16 + loss_scale_window: 1000 + hysteresis: 2 + min_loss_scale: 1 + partition_activations: false + cpu_checkpointing: false + contiguous_memory_optimization: false + synchronize_checkpoint_boundary: false + load_full_weights: false + precision_plugin: null + process_group_backend: null + devices: 8 + num_nodes: 1 + precision: bf16-true + logger: + class_path: lightning.pytorch.loggers.TensorBoardLogger + init_args: + save_dir: /media/logs + name: main + version: null + log_graph: false + default_hp_metric: true + prefix: '' + sub_dir: null + comment: '' + purge_step: null + max_queue: 10 + flush_secs: 120 + filename_suffix: '' + callbacks: null + fast_dev_run: false + max_epochs: 2 + min_epochs: null + max_steps: -1 + min_steps: null + max_time: null + limit_train_batches: null + limit_val_batches: null + limit_test_batches: null + limit_predict_batches: null + overfit_batches: 0.0 + val_check_interval: null + check_val_every_n_epoch: 1 + num_sanity_val_steps: 0 + log_every_n_steps: 1 + enable_checkpointing: null + enable_progress_bar: null + enable_model_summary: null + accumulate_grad_batches: 8 + gradient_clip_val: null + gradient_clip_algorithm: null + deterministic: null + benchmark: null + inference_mode: true + use_distributed_sampler: true + profiler: null + detect_anomaly: false + barebones: false + plugins: null + sync_batchnorm: false + reload_dataloaders_every_n_epochs: 0 + default_root_dir: null +model: + config: + model_name: Mistral-7B-v0.2 + dtype: bfloat16 + num_thoughts: 2 + thought_length: 8 + lookahead_tokens: 4 + embedding_grad_weights: 100.0 + temperature: 1.0 + do_sample: true + train_max_length: 120 + offload_cache: false + top_k: null + top_p: null + checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2 + weight_decay: 0.001 + warmup_steps: 20 + policy_weight: 1.0 + init_lr: 1.0e-06 + optimizer: + class_path: torch.optim.AdamW + init_args: + lr: 1.0e-06 + betas: + - 0.9 + - 0.999 + eps: 1.0e-08 + weight_decay: 0.001 + amsgrad: false + maximize: false + foreach: null + capturable: false + differentiable: false + fused: null + scheduler: null +ckpt_path: null +data: + class_path: src.dataset.OpenWebMathDataModule + init_args: + data_path: /media/datasets/openwebmath + tokenizer: + class_path: src.dataset.SpecialTokenizer + init_args: + checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2 + batch_size: 1 + max_seq_length: 120 + num_samples: 2048 + ignore_index: -100 + val_split_fraction: 0.125 + seed: 42 + num_workers: 1 diff --git a/events.out.tfevents.1731497713.e265bc7b2e59.58957.0 b/events.out.tfevents.1731497713.e265bc7b2e59.58957.0 new file mode 100644 index 0000000000000000000000000000000000000000..50ab4324bdd08029fdbf6d8ad59194b6a1f6e9bc --- /dev/null +++ b/events.out.tfevents.1731497713.e265bc7b2e59.58957.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ffc842123831b3b0cdbbcce542e0dc7e7000ebeb2e8eb264d814dcdd5e87072 +size 52472 diff --git a/hparams.yaml b/hparams.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/hparams.yaml @@ -0,0 +1 @@ +{}