g4rg committed on
Commit 56e563a · verified · 1 Parent(s): 820b672

Training in progress, step 128, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1d281dead310dc912d96ea9c1ecf041030d9b9eda5a70050289f56893a32f795
+ oid sha256:8da66d7ae6c07456dfdb2566c5efbc9cb757f30489aab971f6c4fa69c36c8240
  size 763470136
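
Each entry in this commit is a Git LFS pointer file (version, oid, size): the diff only swaps the sha256 and records the byte size, while the tensors themselves live in LFS storage. A minimal sketch for checking a downloaded blob against its pointer; the file names are illustrative, not part of this commit:

```python
# Hypothetical helper: verify a downloaded LFS object against its pointer file.
import hashlib

def lfs_pointer_fields(pointer_text: str) -> dict:
    """Parse the 'key value' lines of a Git LFS pointer file."""
    fields = {}
    for line in pointer_text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file so large checkpoints do not need to fit in memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Usage (paths are illustrative):
# fields = lfs_pointer_fields(open("adapter_model.safetensors.pointer").read())
# assert fields["oid"] == "sha256:" + sha256_of("adapter_model.safetensors")
```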
last-checkpoint/global_step128/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ab5c1eca63228d208fef21271623bf1dda90820685fa4ff73d43cc07b9e3a6bf
+ size 385019984
last-checkpoint/global_step128/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3abba731438838d262e081ab6656053f5e7bc06db83dc0bfc411b549a9ed8b66
+ size 385019984
last-checkpoint/global_step128/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:970afd7d80b6a61367bc12f47be0c512e1f32d2234943bd945faef7866167cfc
+ size 385019984
last-checkpoint/global_step128/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:268eab3cb4ccc9209f280462a7d444c81c55f1a38862a93a265b1af32bb5b3a9
+ size 385019984
last-checkpoint/global_step128/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6bc2045f902a068e626ed65bd6cda8fa9d476d4b2a0b9e493e884673880b6dcd
+ size 385019984
last-checkpoint/global_step128/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8289fec92c8b9638eef098d975c47ad2bf6029fbd1de8abc8ba4f347b8937d7f
+ size 385019984
last-checkpoint/global_step128/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1fa8ae1ae93312dd1dd7e33d65813bc05af4ba623885861d5d6c01d66318f2b
+ size 348711830
last-checkpoint/global_step128/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:65af44adee25bbf11d802d36a8cccd8a9f081373b711850daae83bab3c78f086
+ size 348711830
last-checkpoint/global_step128/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cea62ae08ed200f7243b5fd26222c0091c7afaad29120b4c8bc806a969947791
+ size 348711830
last-checkpoint/global_step128/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b1ca8e65609885bb731a9e4a58657586a83392ffa40d95dab7739ae61e242ae
+ size 348711830
last-checkpoint/global_step128/zero_pp_rank_4_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c833e27ba8cdd45f29846b16e57c5f5928b3739f8d066bb48946d85a7d59e821
+ size 348711830
last-checkpoint/global_step128/zero_pp_rank_5_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:59f9990347be616df13fac928c52ab06150bbabb50e8663cb2d2d03ee43991b9
+ size 348711830
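
The global_step128 directory added above holds the DeepSpeed ZeRO shards for this step: one bf16 optimizer-state partition and one model-states file per data-parallel rank (ranks 0-5). If a single consolidated state dict is needed instead of per-rank shards, DeepSpeed's zero_to_fp32 utility can rebuild one; a sketch, assuming the `deepspeed` package is installed and the full checkpoint directory has been downloaded locally (paths are illustrative, not part of this commit):

```python
# Sketch: merge the per-rank ZeRO shards under global_step128/ into one fp32 state dict.
import torch
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

state_dict = get_fp32_state_dict_from_zero_checkpoint(
    "last-checkpoint",      # checkpoint root committed here
    tag="global_step128",   # shard directory added in this commit
)
torch.save(state_dict, "consolidated_fp32.bin")  # output name is illustrative
```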
last-checkpoint/latest CHANGED
@@ -1 +1 @@
- global_step96
+ global_step128
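
`latest` is a one-line tag file that DeepSpeed reads to locate the newest step directory inside the checkpoint folder, so this commit simply moves the tag from global_step96 to global_step128. A small sketch of resolving it, assuming a local clone with the layout above (paths are illustrative):

```python
# Sketch: resolve the step directory that the `latest` tag points at.
from pathlib import Path

ckpt_dir = Path("last-checkpoint")
tag = (ckpt_dir / "latest").read_text().strip()   # "global_step128" after this commit
assert (ckpt_dir / tag).is_dir(), f"missing shard directory for tag {tag}"
print(tag)
```

Resuming would typically go through the Hugging Face Trainer, e.g. `trainer.train(resume_from_checkpoint="last-checkpoint")`, which lets DeepSpeed pick this tag up automatically.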
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:18fb5e6493092cd8f71e5b3842d879509d55ae6cab2bf942dd56c48e7b8cc9fc
+ oid sha256:74fbe9c3428ed3e9c35b612dab93cb88760e9a705b6c000851dabad16e459b72
  size 15472
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9fc4d72ba97fb58bd464a1fddcf7c6f2d733fa949f29b90e179d642b44eb624e
+ oid sha256:09696d37ecf847e753dcf238b8abaa5cd29c004c3225bbd9bb36e502bbd3e1d2
  size 15472
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0c2da6915b4a587863bc60c730f828ac7764e70c41ebff8c368e95f045073b7f
+ oid sha256:8fddcb8b3b958d20b652b04c28b148d9888da159024072edc127f9894cb6961c
  size 15472
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:89e0de0d83a177df7de41df7ea72b48a294a2d0589d8ca1035b9dd419b036e81
+ oid sha256:9f28a47ba026f0903768f763280985efa3436a6168461fd9e14c78c1da328d9c
  size 15472
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0d6218d38ef6fd867f07998ab60de53f729af52921318b25e9685aae0a3d9044
+ oid sha256:183a725b6dcc281ea67c71bdd66ade19c4182db0091e458e1ec13520873d8d61
  size 15472
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:10040054714cbe309f578c161884fad07491098cb4b32684d3d08fcb91914d00
+ oid sha256:ea090671e9e0107c57eb483ad64dfda3f8f03477d846083978fd325481ad13ae
  size 15472
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1c8d252fdd44d3c445fb84b7c408c35fe553a7fa8fd66113443f48d6125c89b0
+ oid sha256:e3b8cc489c9f7d5c810d95c1d7bed07638ecb58d3cdf988fa074bd79e5349fe0
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.6193548387096774,
+ "epoch": 0.8258064516129032,
  "eval_steps": 32,
- "global_step": 96,
+ "global_step": 128,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -711,6 +711,238 @@
  "eval_samples_per_second": 1.599,
  "eval_steps_per_second": 0.064,
  "step": 96
+ },
+ {
+ "epoch": 0.6258064516129033,
+ "grad_norm": 0.36219529568521813,
+ "learning_rate": 2.285588522547411e-05,
+ "loss": 1.2681,
+ "step": 97
+ },
+ {
+ "epoch": 0.632258064516129,
+ "grad_norm": 0.4601161674119361,
+ "learning_rate": 2.234861262612199e-05,
+ "loss": 1.2387,
+ "step": 98
+ },
+ {
+ "epoch": 0.6387096774193548,
+ "grad_norm": 0.6207212832715766,
+ "learning_rate": 2.184408872350019e-05,
+ "loss": 1.2087,
+ "step": 99
+ },
+ {
+ "epoch": 0.6451612903225806,
+ "grad_norm": 0.3655891991096712,
+ "learning_rate": 2.134258272337814e-05,
+ "loss": 1.2769,
+ "step": 100
+ },
+ {
+ "epoch": 0.6516129032258065,
+ "grad_norm": 0.4394265602792923,
+ "learning_rate": 2.084436222122142e-05,
+ "loss": 1.0799,
+ "step": 101
+ },
+ {
+ "epoch": 0.6580645161290323,
+ "grad_norm": 0.5059663574517834,
+ "learning_rate": 2.0349693059407215e-05,
+ "loss": 1.0953,
+ "step": 102
+ },
+ {
+ "epoch": 0.6645161290322581,
+ "grad_norm": 0.34732606007316424,
+ "learning_rate": 1.9858839185375123e-05,
+ "loss": 1.224,
+ "step": 103
+ },
+ {
+ "epoch": 0.6709677419354839,
+ "grad_norm": 0.5464551769086812,
+ "learning_rate": 1.9372062510789063e-05,
+ "loss": 1.2413,
+ "step": 104
+ },
+ {
+ "epoch": 0.6774193548387096,
+ "grad_norm": 0.977742231459624,
+ "learning_rate": 1.888962277178548e-05,
+ "loss": 1.2118,
+ "step": 105
+ },
+ {
+ "epoch": 0.6838709677419355,
+ "grad_norm": 2.537109489591264,
+ "learning_rate": 1.8411777390382367e-05,
+ "loss": 1.2513,
+ "step": 106
+ },
+ {
+ "epoch": 0.6903225806451613,
+ "grad_norm": 0.35948844839880034,
+ "learning_rate": 1.7938781337123016e-05,
+ "loss": 1.1404,
+ "step": 107
+ },
+ {
+ "epoch": 0.6967741935483871,
+ "grad_norm": 0.457105884170092,
+ "learning_rate": 1.747088699502789e-05,
+ "loss": 1.1514,
+ "step": 108
+ },
+ {
+ "epoch": 0.7032258064516129,
+ "grad_norm": 1.1486002566265734,
+ "learning_rate": 1.7008344024927168e-05,
+ "loss": 1.3249,
+ "step": 109
+ },
+ {
+ "epoch": 0.7096774193548387,
+ "grad_norm": 0.36043342663778255,
+ "learning_rate": 1.6551399232245737e-05,
+ "loss": 1.1239,
+ "step": 110
+ },
+ {
+ "epoch": 0.7161290322580646,
+ "grad_norm": 0.46594876338109426,
+ "learning_rate": 1.610029643531182e-05,
+ "loss": 1.2918,
+ "step": 111
+ },
+ {
+ "epoch": 0.7225806451612903,
+ "grad_norm": 0.32990660251070025,
+ "learning_rate": 1.5655276335259493e-05,
+ "loss": 1.2266,
+ "step": 112
+ },
+ {
+ "epoch": 0.7290322580645161,
+ "grad_norm": 0.30010478660077256,
+ "learning_rate": 1.5216576387594481e-05,
+ "loss": 1.2114,
+ "step": 113
+ },
+ {
+ "epoch": 0.7354838709677419,
+ "grad_norm": 0.49532244626831723,
+ "learning_rate": 1.4784430675491685e-05,
+ "loss": 1.2457,
+ "step": 114
+ },
+ {
+ "epoch": 0.7419354838709677,
+ "grad_norm": 0.5191609185311767,
+ "learning_rate": 1.4359069784892282e-05,
+ "loss": 1.2862,
+ "step": 115
+ },
+ {
+ "epoch": 0.7483870967741936,
+ "grad_norm": 0.3826327354484767,
+ "learning_rate": 1.3940720681466734e-05,
+ "loss": 1.1351,
+ "step": 116
+ },
+ {
+ "epoch": 0.7548387096774194,
+ "grad_norm": 0.330074625162551,
+ "learning_rate": 1.3529606589509647e-05,
+ "loss": 1.1871,
+ "step": 117
+ },
+ {
+ "epoch": 0.7612903225806451,
+ "grad_norm": 0.34233269430078184,
+ "learning_rate": 1.3125946872830877e-05,
+ "loss": 1.1411,
+ "step": 118
+ },
+ {
+ "epoch": 0.7677419354838709,
+ "grad_norm": 0.31326296304705775,
+ "learning_rate": 1.2729956917706545e-05,
+ "loss": 1.2387,
+ "step": 119
+ },
+ {
+ "epoch": 0.7741935483870968,
+ "grad_norm": 0.3176809107580838,
+ "learning_rate": 1.2341848017952464e-05,
+ "loss": 1.2451,
+ "step": 120
+ },
+ {
+ "epoch": 0.7806451612903226,
+ "grad_norm": 0.31420402228609556,
+ "learning_rate": 1.1961827262181141e-05,
+ "loss": 1.1766,
+ "step": 121
+ },
+ {
+ "epoch": 0.7870967741935484,
+ "grad_norm": 0.4637761844099348,
+ "learning_rate": 1.1590097423302684e-05,
+ "loss": 1.1542,
+ "step": 122
+ },
+ {
+ "epoch": 0.7935483870967742,
+ "grad_norm": 0.36159367839677437,
+ "learning_rate": 1.1226856850328434e-05,
+ "loss": 1.3127,
+ "step": 123
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 0.5010806704980222,
+ "learning_rate": 1.0872299362535173e-05,
+ "loss": 1.2729,
+ "step": 124
+ },
+ {
+ "epoch": 0.8064516129032258,
+ "grad_norm": 0.3461696613483525,
+ "learning_rate": 1.0526614146046312e-05,
+ "loss": 1.2425,
+ "step": 125
+ },
+ {
+ "epoch": 0.8129032258064516,
+ "grad_norm": 0.35751217338851793,
+ "learning_rate": 1.0189985652885225e-05,
+ "loss": 1.2222,
+ "step": 126
+ },
+ {
+ "epoch": 0.8193548387096774,
+ "grad_norm": 0.43059544412165696,
+ "learning_rate": 9.862593502554648e-06,
+ "loss": 1.1938,
+ "step": 127
+ },
+ {
+ "epoch": 0.8258064516129032,
+ "grad_norm": 0.7260092938036656,
+ "learning_rate": 9.544612386194612e-06,
+ "loss": 1.1063,
+ "step": 128
+ },
+ {
+ "epoch": 0.8258064516129032,
+ "eval_loss": 1.0231536626815796,
+ "eval_runtime": 62.2556,
+ "eval_samples_per_second": 1.606,
+ "eval_steps_per_second": 0.064,
+ "step": 128
  }
  ],
  "logging_steps": 1,
@@ -730,7 +962,7 @@
  "attributes": {}
  }
  },
- "total_flos": 196559178301440.0,
+ "total_flos": 262078904401920.0,
  "train_batch_size": 5,
  "trial_name": null,
  "trial_params": null