fats-fme committed on
Commit 1d97221 · verified · 1 Parent(s): 9eb6f0c

Training in progress, step 138, checkpoint

last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:72d932876dc769bf93f82b0534690ecaf491a27dd3494f4b390be0317f04d933
+ oid sha256:d64006a8240d0814491b2db4b937e17fb7b606088a33ab275a336effb5b52496
  size 335922386
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5c2106bd7c77f119bfbd0e7216f6b8076246c5b56169d9758a5766704f7fd3ac
+ oid sha256:4ac89b50eb49875d4fc6320c442b1f1a2bb0c6ca5dcf4534babea7e4fa581fbf
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d552cf0ac1340b257e68c2f09f146d8a7526b80238d90cb501f4380a5acaac65
+ oid sha256:aa2482d7eb8b9907f50055efed6d979a680e476b4380cec06a223fb30358eb52
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9263945a6777ffd183084f656dbf9a8ade54f242aec4e02deae1e6e4a03b7dfd
+ oid sha256:05243bc9418b5d027b9cd58d0b804f8898dee9480e9cd6d09120cb4b16d4e2f3
  size 1064
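
The four checkpoint tensors above (optimizer.pt, rng_state_0.pth, rng_state_1.pth, scheduler.pt) are tracked with Git LFS, so the diff only touches their pointer files: the sha256 oid changes while the recorded byte size stays the same. A minimal sketch (not part of this commit; paths are hypothetical) of checking a downloaded blob against its pointer:

```python
# Illustrative only: verify a downloaded LFS object against the oid/size
# recorded in its pointer file. File paths below are hypothetical examples.
import hashlib
from pathlib import Path


def read_pointer(pointer_path: str) -> dict:
    """Parse a Git LFS pointer file ('key value' lines) into a dict."""
    fields = {}
    for line in Path(pointer_path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields


def verify_blob(pointer_path: str, blob_path: str) -> bool:
    """Check that the blob's sha256 digest and size match the pointer."""
    fields = read_pointer(pointer_path)
    expected_oid = fields["oid"].split(":", 1)[1]   # strip the 'sha256:' prefix
    expected_size = int(fields["size"])
    data = Path(blob_path).read_bytes()
    return (hashlib.sha256(data).hexdigest() == expected_oid
            and len(data) == expected_size)


# Hypothetical usage:
# verify_blob("last-checkpoint/optimizer.pt.pointer", "last-checkpoint/optimizer.pt")
```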
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.25125170687300863,
+ "epoch": 0.5025034137460173,
  "eval_steps": 69,
- "global_step": 69,
+ "global_step": 138,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -506,6 +506,497 @@
  "eval_samples_per_second": 8.365,
  "eval_steps_per_second": 2.096,
  "step": 69
+ },
+ {
+ "epoch": 0.25489303595812474,
+ "grad_norm": NaN,
+ "learning_rate": 0.00019609173219450998,
+ "loss": 0.0,
+ "step": 70
+ },
+ {
+ "epoch": 0.2585343650432408,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001956940335732209,
+ "loss": 0.0,
+ "step": 71
+ },
+ {
+ "epoch": 0.26217569412835684,
+ "grad_norm": NaN,
+ "learning_rate": 0.00019527751227228963,
+ "loss": 0.0,
+ "step": 72
+ },
+ {
+ "epoch": 0.2658170232134729,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001948422502199903,
+ "loss": 0.0,
+ "step": 73
+ },
+ {
+ "epoch": 0.269458352298589,
+ "grad_norm": NaN,
+ "learning_rate": 0.00019438833303083678,
+ "loss": 0.0,
+ "step": 74
+ },
+ {
+ "epoch": 0.27309968138370505,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001939158499887428,
+ "loss": 0.0,
+ "step": 75
+ },
+ {
+ "epoch": 0.2767410104688211,
+ "grad_norm": NaN,
+ "learning_rate": 0.00019342489402945998,
+ "loss": 0.0,
+ "step": 76
+ },
+ {
+ "epoch": 0.2803823395539372,
+ "grad_norm": NaN,
+ "learning_rate": 0.00019291556172229785,
+ "loss": 0.0,
+ "step": 77
+ },
+ {
+ "epoch": 0.28402366863905326,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001923879532511287,
+ "loss": 0.0,
+ "step": 78
+ },
+ {
+ "epoch": 0.2876649977241693,
+ "grad_norm": NaN,
+ "learning_rate": 0.00019184217239468212,
+ "loss": 0.0,
+ "step": 79
+ },
+ {
+ "epoch": 0.29130632680928537,
+ "grad_norm": NaN,
+ "learning_rate": 0.00019127832650613189,
+ "loss": 0.0,
+ "step": 80
+ },
+ {
+ "epoch": 0.2949476558944015,
+ "grad_norm": NaN,
+ "learning_rate": 0.00019069652649198005,
+ "loss": 0.0,
+ "step": 81
+ },
+ {
+ "epoch": 0.2985889849795175,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001900968867902419,
+ "loss": 0.0,
+ "step": 82
+ },
+ {
+ "epoch": 0.3022303140646336,
+ "grad_norm": NaN,
+ "learning_rate": 0.00018947952534793661,
+ "loss": 0.0,
+ "step": 83
+ },
+ {
+ "epoch": 0.3058716431497497,
+ "grad_norm": NaN,
+ "learning_rate": 0.00018884456359788724,
+ "loss": 0.0,
+ "step": 84
+ },
+ {
+ "epoch": 0.30951297223486574,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001881921264348355,
+ "loss": 0.0,
+ "step": 85
+ },
+ {
+ "epoch": 0.3131543013199818,
+ "grad_norm": NaN,
+ "learning_rate": 0.00018752234219087538,
+ "loss": 0.0,
+ "step": 86
+ },
+ {
+ "epoch": 0.31679563040509784,
+ "grad_norm": NaN,
+ "learning_rate": 0.00018683534261021057,
+ "loss": 0.0,
+ "step": 87
+ },
+ {
+ "epoch": 0.32043695949021395,
+ "grad_norm": NaN,
+ "learning_rate": 0.00018613126282324092,
+ "loss": 0.0,
+ "step": 88
+ },
+ {
+ "epoch": 0.32407828857533,
+ "grad_norm": NaN,
+ "learning_rate": 0.00018541024131998274,
+ "loss": 0.0,
+ "step": 89
+ },
+ {
+ "epoch": 0.32771961766044605,
+ "grad_norm": NaN,
+ "learning_rate": 0.00018467241992282843,
+ "loss": 0.0,
+ "step": 90
+ },
+ {
+ "epoch": 0.33136094674556216,
+ "grad_norm": NaN,
+ "learning_rate": 0.00018391794375865024,
+ "loss": 0.0,
+ "step": 91
+ },
+ {
+ "epoch": 0.3350022758306782,
+ "grad_norm": NaN,
+ "learning_rate": 0.00018314696123025454,
+ "loss": 0.0,
+ "step": 92
+ },
+ {
+ "epoch": 0.33864360491579426,
+ "grad_norm": NaN,
+ "learning_rate": 0.00018235962398719147,
+ "loss": 0.0,
+ "step": 93
+ },
+ {
+ "epoch": 0.3422849340009103,
+ "grad_norm": NaN,
+ "learning_rate": 0.00018155608689592604,
+ "loss": 0.0,
+ "step": 94
+ },
+ {
+ "epoch": 0.3459262630860264,
+ "grad_norm": NaN,
+ "learning_rate": 0.00018073650800937624,
+ "loss": 0.0,
+ "step": 95
+ },
+ {
+ "epoch": 0.34956759217114247,
+ "grad_norm": NaN,
+ "learning_rate": 0.00017990104853582493,
+ "loss": 0.0,
+ "step": 96
+ },
+ {
+ "epoch": 0.3532089212562585,
+ "grad_norm": NaN,
+ "learning_rate": 0.00017904987280721035,
+ "loss": 0.0,
+ "step": 97
+ },
+ {
+ "epoch": 0.3568502503413746,
+ "grad_norm": NaN,
+ "learning_rate": 0.000178183148246803,
+ "loss": 0.0,
+ "step": 98
+ },
+ {
+ "epoch": 0.3604915794264907,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001773010453362737,
+ "loss": 0.0,
+ "step": 99
+ },
+ {
+ "epoch": 0.36413290851160673,
+ "grad_norm": NaN,
+ "learning_rate": 0.00017640373758216077,
+ "loss": 0.0,
+ "step": 100
+ },
+ {
+ "epoch": 0.3677742375967228,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001754914014817416,
+ "loss": 0.0,
+ "step": 101
+ },
+ {
+ "epoch": 0.3714155666818389,
+ "grad_norm": NaN,
+ "learning_rate": 0.00017456421648831655,
+ "loss": 0.0,
+ "step": 102
+ },
+ {
+ "epoch": 0.37505689576695495,
+ "grad_norm": NaN,
+ "learning_rate": 0.00017362236497591094,
+ "loss": 0.0,
+ "step": 103
+ },
+ {
+ "epoch": 0.378698224852071,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001726660322034027,
+ "loss": 0.0,
+ "step": 104
+ },
+ {
+ "epoch": 0.38233955393718705,
+ "grad_norm": NaN,
+ "learning_rate": 0.00017169540627808274,
+ "loss": 0.0,
+ "step": 105
+ },
+ {
+ "epoch": 0.38598088302230316,
+ "grad_norm": NaN,
+ "learning_rate": 0.00017071067811865476,
+ "loss": 0.0,
+ "step": 106
+ },
+ {
+ "epoch": 0.3896222121074192,
+ "grad_norm": NaN,
+ "learning_rate": 0.00016971204141768233,
+ "loss": 0.0,
+ "step": 107
+ },
+ {
+ "epoch": 0.39326354119253526,
+ "grad_norm": NaN,
+ "learning_rate": 0.00016869969260349018,
+ "loss": 0.0,
+ "step": 108
+ },
+ {
+ "epoch": 0.39690487027765137,
+ "grad_norm": NaN,
+ "learning_rate": 0.00016767383080152742,
+ "loss": 0.0,
+ "step": 109
+ },
+ {
+ "epoch": 0.4005461993627674,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001666346577952004,
+ "loss": 0.0,
+ "step": 110
+ },
+ {
+ "epoch": 0.40418752844788347,
+ "grad_norm": NaN,
+ "learning_rate": 0.00016558237798618245,
+ "loss": 0.0,
+ "step": 111
+ },
+ {
+ "epoch": 0.4078288575329995,
+ "grad_norm": NaN,
+ "learning_rate": 0.00016451719835420877,
+ "loss": 0.0,
+ "step": 112
+ },
+ {
+ "epoch": 0.41147018661811563,
+ "grad_norm": NaN,
+ "learning_rate": 0.00016343932841636456,
+ "loss": 0.0,
+ "step": 113
+ },
+ {
+ "epoch": 0.4151115157032317,
+ "grad_norm": NaN,
+ "learning_rate": 0.00016234898018587337,
+ "loss": 0.0,
+ "step": 114
+ },
+ {
+ "epoch": 0.41875284478834773,
+ "grad_norm": NaN,
+ "learning_rate": 0.00016124636813039502,
+ "loss": 0.0,
+ "step": 115
+ },
+ {
+ "epoch": 0.42239417387346384,
+ "grad_norm": NaN,
+ "learning_rate": 0.00016013170912984058,
+ "loss": 0.0,
+ "step": 116
+ },
+ {
+ "epoch": 0.4260355029585799,
+ "grad_norm": NaN,
+ "learning_rate": 0.00015900522243371282,
+ "loss": 0.0,
+ "step": 117
+ },
+ {
+ "epoch": 0.42967683204369594,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001578671296179806,
+ "loss": 0.0,
+ "step": 118
+ },
+ {
+ "epoch": 0.433318161128812,
+ "grad_norm": NaN,
+ "learning_rate": 0.00015671765454149559,
+ "loss": 0.0,
+ "step": 119
+ },
+ {
+ "epoch": 0.4369594902139281,
+ "grad_norm": NaN,
+ "learning_rate": 0.00015555702330196023,
+ "loss": 0.0,
+ "step": 120
+ },
+ {
+ "epoch": 0.44060081929904416,
+ "grad_norm": NaN,
+ "learning_rate": 0.00015438546419145488,
+ "loss": 0.0,
+ "step": 121
+ },
+ {
+ "epoch": 0.4442421483841602,
+ "grad_norm": NaN,
+ "learning_rate": 0.00015320320765153367,
+ "loss": 0.0,
+ "step": 122
+ },
+ {
+ "epoch": 0.44788347746927626,
+ "grad_norm": NaN,
+ "learning_rate": 0.00015201048622789747,
+ "loss": 0.0,
+ "step": 123
+ },
+ {
+ "epoch": 0.45152480655439237,
+ "grad_norm": NaN,
+ "learning_rate": 0.00015080753452465296,
+ "loss": 0.0,
+ "step": 124
+ },
+ {
+ "epoch": 0.4551661356395084,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001495945891581668,
+ "loss": 0.0,
+ "step": 125
+ },
+ {
+ "epoch": 0.45880746472462447,
+ "grad_norm": NaN,
+ "learning_rate": 0.000148371888710524,
+ "loss": 0.0,
+ "step": 126
+ },
+ {
+ "epoch": 0.4624487938097406,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001471396736825998,
+ "loss": 0.0,
+ "step": 127
+ },
+ {
+ "epoch": 0.46609012289485663,
+ "grad_norm": NaN,
+ "learning_rate": 0.00014589818644675378,
+ "loss": 0.0,
+ "step": 128
+ },
+ {
+ "epoch": 0.4697314519799727,
+ "grad_norm": NaN,
+ "learning_rate": 0.00014464767119915629,
+ "loss": 0.0,
+ "step": 129
+ },
+ {
+ "epoch": 0.47337278106508873,
+ "grad_norm": NaN,
+ "learning_rate": 0.00014338837391175582,
+ "loss": 0.0,
+ "step": 130
+ },
+ {
+ "epoch": 0.47701411015020484,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001421205422838971,
+ "loss": 0.0,
+ "step": 131
+ },
+ {
+ "epoch": 0.4806554392353209,
+ "grad_norm": NaN,
+ "learning_rate": 0.00014084442569359964,
+ "loss": 0.0,
+ "step": 132
+ },
+ {
+ "epoch": 0.48429676832043694,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001395602751485059,
+ "loss": 0.0,
+ "step": 133
+ },
+ {
+ "epoch": 0.48793809740555305,
+ "grad_norm": NaN,
+ "learning_rate": 0.000138268343236509,
+ "loss": 0.0,
+ "step": 134
+ },
+ {
+ "epoch": 0.4915794264906691,
+ "grad_norm": NaN,
+ "learning_rate": 0.00013696888407606952,
+ "loss": 0.0,
+ "step": 135
+ },
+ {
+ "epoch": 0.49522075557578515,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001356621532662313,
+ "loss": 0.0,
+ "step": 136
+ },
+ {
+ "epoch": 0.4988620846609012,
+ "grad_norm": NaN,
+ "learning_rate": 0.0001343484078363461,
+ "loss": 0.0,
+ "step": 137
+ },
+ {
+ "epoch": 0.5025034137460173,
+ "grad_norm": NaN,
+ "learning_rate": 0.00013302790619551674,
+ "loss": 0.0,
+ "step": 138
+ },
+ {
+ "epoch": 0.5025034137460173,
+ "eval_loss": NaN,
+ "eval_runtime": 55.2669,
+ "eval_samples_per_second": 8.378,
+ "eval_steps_per_second": 2.099,
+ "step": 138
  }
  ],
  "logging_steps": 1,
@@ -525,7 +1016,7 @@
  "attributes": {}
  }
  },
- "total_flos": 1.940639639006085e+17,
+ "total_flos": 3.88127927801217e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null