Muennighoff committed on
Commit
f61dd83
·
1 Parent(s): 099d855

Cache everything; Add rankings everywhere; Automate num dataset/score computation

Browse files
Files changed (1) hide show
  1. app.py +68 -29
app.py CHANGED
@@ -393,6 +393,7 @@ MODELS_TO_SKIP = {
393
  "anttip/ct2fast-e5-small-v2-hfie",
394
  "newsrx/instructor-large",
395
  "newsrx/instructor-xl",
 
396
  }
397
 
398
 
@@ -471,7 +472,20 @@ def get_dim_seq_size(model):
471
  size = round(size["metadata"]["total_size"] / 1e9, 2)
472
  return dim, seq, size
473
 
474
- def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
 
 
 
 
 
 
 
 
 
 
 
 
 
475
  api = HfApi()
476
  models = api.list_models(filter="mteb")
477
  # Initialize list to models that we cannot fetch metadata from
@@ -532,6 +546,8 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
532
  cols = sorted(list(df.columns))
533
  cols.insert(0, cols.pop(cols.index("Model")))
534
  df = df[cols]
 
 
535
  if fillna:
536
  df.fillna("", inplace=True)
537
  return df
@@ -551,10 +567,8 @@ def get_mteb_average():
551
  langs=["en", "en-en"],
552
  fillna=False,
553
  add_emb_dim=True,
 
554
  )
555
- # Approximation (Missing Bitext Mining & including some nans)
556
- NUM_SCORES = DATA_OVERALL.shape[0] * DATA_OVERALL.shape[1]
557
-
558
  # Debugging:
559
  # DATA_OVERALL.to_csv("overall.csv")
560
 
@@ -572,32 +586,51 @@ def get_mteb_average():
572
 
573
  DATA_OVERALL = DATA_OVERALL.round(2)
574
 
 
 
 
 
 
 
 
 
575
  # Fill NaN after averaging
576
  DATA_OVERALL.fillna("", inplace=True)
577
 
578
- DATA_CLASSIFICATION_EN = DATA_OVERALL[["Model"] + TASK_LIST_CLASSIFICATION]
579
- DATA_CLUSTERING = DATA_OVERALL[["Model"] + TASK_LIST_CLUSTERING]
580
- DATA_PAIR_CLASSIFICATION = DATA_OVERALL[["Model"] + TASK_LIST_PAIR_CLASSIFICATION]
581
- DATA_RERANKING = DATA_OVERALL[["Model"] + TASK_LIST_RERANKING]
582
- DATA_RETRIEVAL = DATA_OVERALL[["Model"] + TASK_LIST_RETRIEVAL]
583
- DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
584
- DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
585
-
586
  DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
587
 
588
  return DATA_OVERALL
589
 
590
  get_mteb_average()
591
- block = gr.Blocks()
592
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
 
 
594
  with block:
595
  gr.Markdown(f"""
596
  Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗 Refer to the [MTEB paper](https://arxiv.org/abs/2210.07316) for details on metrics, tasks and models.
597
 
598
- - **Total Datasets**: 62
599
  - **Total Languages**: 112
600
- - **Total Scores**: >{NUM_SCORES}
601
  - **Total Models**: {len(DATA_OVERALL)}
602
  """)
603
  with gr.Tabs():
@@ -629,7 +662,8 @@ with block:
629
  """)
630
  with gr.Row():
631
  data_bitext_mining = gr.components.Dataframe(
632
- datatype=["markdown"] + ["number"] * 500, # hack when we don't know how many columns
 
633
  type="pandas",
634
  )
635
  with gr.Row():
@@ -652,7 +686,7 @@ with block:
652
  with gr.Row():
653
  data_classification_en = gr.components.Dataframe(
654
  DATA_CLASSIFICATION_EN,
655
- datatype=["markdown"] + ["number"] * len(DATA_CLASSIFICATION_EN.columns),
656
  type="pandas",
657
  )
658
  with gr.Row():
@@ -677,7 +711,8 @@ with block:
677
  """)
678
  with gr.Row():
679
  data_classification = gr.components.Dataframe(
680
- datatype=["markdown"] + ["number"] * 200, # hack when we don't know how many columns
 
681
  type="pandas",
682
  )
683
  with gr.Row():
@@ -700,7 +735,7 @@ with block:
700
  with gr.Row():
701
  data_clustering = gr.components.Dataframe(
702
  DATA_CLUSTERING,
703
- datatype=["markdown"] + ["number"] * len(DATA_CLUSTERING.columns),
704
  type="pandas",
705
  )
706
  with gr.Row():
@@ -724,7 +759,8 @@ with block:
724
  """)
725
  with gr.Row():
726
  data_clustering_de = gr.components.Dataframe(
727
- datatype=["markdown"] + ["number"] * len(TASK_LIST_CLUSTERING_DE),
 
728
  type="pandas",
729
  )
730
  with gr.Row():
@@ -748,7 +784,7 @@ with block:
748
  with gr.Row():
749
  data_pair_classification = gr.components.Dataframe(
750
  DATA_PAIR_CLASSIFICATION,
751
- datatype=["markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION.columns),
752
  type="pandas",
753
  )
754
  with gr.Row():
@@ -771,7 +807,7 @@ with block:
771
  data_retrieval = gr.components.Dataframe(
772
  DATA_RETRIEVAL,
773
  # Add support for more columns than existing as a buffer for CQADupstack & other Retrieval tasks (e.g. MSMARCOv2)
774
- datatype=["markdown"] + ["number"] * len(DATA_RETRIEVAL.columns) * 2,
775
  type="pandas",
776
  )
777
  with gr.Row():
@@ -791,7 +827,7 @@ with block:
791
  with gr.Row():
792
  data_reranking = gr.components.Dataframe(
793
  DATA_RERANKING,
794
- datatype=["markdown"] + ["number"] * len(DATA_RERANKING.columns),
795
  type="pandas",
796
  )
797
  with gr.Row():
@@ -813,7 +849,7 @@ with block:
813
  with gr.Row():
814
  data_sts_en = gr.components.Dataframe(
815
  DATA_STS_EN,
816
- datatype=["markdown"] + ["number"] * len(DATA_STS_EN.columns),
817
  type="pandas",
818
  )
819
  with gr.Row():
@@ -835,7 +871,8 @@ with block:
835
  """)
836
  with gr.Row():
837
  data_sts = gr.components.Dataframe(
838
- datatype=["markdown"] + ["number"] * 100, # hack when we don't know how many columns
 
839
  type="pandas",
840
  )
841
  with gr.Row():
@@ -853,7 +890,7 @@ with block:
853
  with gr.Row():
854
  data_summarization = gr.components.Dataframe(
855
  DATA_SUMMARIZATION,
856
- datatype=["markdown"] + ["number"] * 2,
857
  type="pandas",
858
  )
859
  with gr.Row():
@@ -880,8 +917,9 @@ with block:
880
  }
881
  ```
882
  """)
883
- # Running the function on page load in addition to when the button is clicked
884
- # This is optional - If deactivated the data created loaded at "Build time" is shown like for Overall tab
 
885
  block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
886
  block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
887
  block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
@@ -893,6 +931,7 @@ with block:
893
  block.load(get_mteb_data, inputs=[task_sts_en, lang_sts_en], outputs=data_sts_en)
894
  block.load(get_mteb_data, inputs=[task_sts], outputs=data_sts)
895
  block.load(get_mteb_data, inputs=[task_summarization], outputs=data_summarization)
 
896
 
897
  block.queue(concurrency_count=40, max_size=10)
898
  block.launch()
 
393
  "anttip/ct2fast-e5-small-v2-hfie",
394
  "newsrx/instructor-large",
395
  "newsrx/instructor-xl",
396
+ "dmlls/all-mpnet-base-v2",
397
  }
398
 
399
 
 
472
  size = round(size["metadata"]["total_size"] / 1e9, 2)
473
  return dim, seq, size
474
 
475
+ def add_rank(df):
476
+ cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length"]]
477
+ if len(cols_to_rank) == 1:
478
+ df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
479
+ else:
480
+ df.insert(1, "Average", df[cols_to_rank].mean(axis=1, skipna=False))
481
+ df.sort_values("Average", ascending=False, inplace=True)
482
+ df.insert(0, "Rank", list(range(1, len(df) + 1)))
483
+ df = df.round(2)
484
+ # Fill NaN after averaging
485
+ df.fillna("", inplace=True)
486
+ return df
487
+
488
+ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC, rank=True):
489
  api = HfApi()
490
  models = api.list_models(filter="mteb")
491
  # Initialize list to models that we cannot fetch metadata from
 
546
  cols = sorted(list(df.columns))
547
  cols.insert(0, cols.pop(cols.index("Model")))
548
  df = df[cols]
549
+ if rank:
550
+ df = add_rank(df)
551
  if fillna:
552
  df.fillna("", inplace=True)
553
  return df
 
567
  langs=["en", "en-en"],
568
  fillna=False,
569
  add_emb_dim=True,
570
+ rank=False,
571
  )
 
 
 
572
  # Debugging:
573
  # DATA_OVERALL.to_csv("overall.csv")
574
 
 
586
 
587
  DATA_OVERALL = DATA_OVERALL.round(2)
588
 
589
+ DATA_CLASSIFICATION_EN = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_CLASSIFICATION])
590
+ DATA_CLUSTERING = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_CLUSTERING])
591
+ DATA_PAIR_CLASSIFICATION = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_PAIR_CLASSIFICATION])
592
+ DATA_RERANKING = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_RERANKING])
593
+ DATA_RETRIEVAL = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_RETRIEVAL])
594
+ DATA_STS_EN = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_STS])
595
+ DATA_SUMMARIZATION = add_rank(DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION])
596
+
597
  # Fill NaN after averaging
598
  DATA_OVERALL.fillna("", inplace=True)
599
 
 
 
 
 
 
 
 
 
600
  DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
601
 
602
  return DATA_OVERALL
603
 
604
  get_mteb_average()
605
+ DATA_BITEXT_MINING = get_mteb_data(["BitextMining"])
606
+ DATA_CLASSIFICATION = get_mteb_data(["Classification"])
607
+ DATA_CLUSTERING_GERMAN = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
608
+ DATA_STS = get_mteb_data(["STS"])
609
+
610
+ # Exact, add all non-nan integer values for every dataset
611
+ NUM_SCORES = 0
612
+ DATASETS = []
613
+ # LANGUAGES = []
614
+ for d in [DATA_BITEXT_MINING, DATA_CLASSIFICATION, DATA_CLUSTERING, DATA_CLUSTERING_GERMAN, DATA_PAIR_CLASSIFICATION, DATA_RERANKING, DATA_RETRIEVAL, DATA_STS, DATA_SUMMARIZATION]:
615
+ # NUM_SCORES += d.iloc[:, 1:].apply(lambda x: sum([1 for y in x if isinstance(y, float) and not np.isnan(y)]), axis=1).sum()
616
+ cols_to_ignore = 3 if "Average" in d.columns else 2
617
+ # Count number of scores including only non-nan floats & excluding the rank column
618
+ NUM_SCORES += d.iloc[:, cols_to_ignore:].notna().sum().sum()
619
+ # Exclude rank & model name column (first two); Do not count different language versions as different datasets
620
+ DATASETS += [i.split(" ")[0] for i in d.columns[cols_to_ignore:]]
621
+ # LANGUAGES += [i.split(" ")[-1] for i in d.columns[cols_to_ignore:]]
622
+
623
+ NUM_DATASETS = len(set(DATASETS))
624
+ # NUM_LANGUAGES = len(set(LANGUAGES))
625
 
626
+ block = gr.Blocks()
627
  with block:
628
  gr.Markdown(f"""
629
  Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗 Refer to the [MTEB paper](https://arxiv.org/abs/2210.07316) for details on metrics, tasks and models.
630
 
631
+ - **Total Datasets**: {NUM_DATASETS}
632
  - **Total Languages**: 112
633
+ - **Total Scores**: {NUM_SCORES}
634
  - **Total Models**: {len(DATA_OVERALL)}
635
  """)
636
  with gr.Tabs():
 
662
  """)
663
  with gr.Row():
664
  data_bitext_mining = gr.components.Dataframe(
665
+ DATA_BITEXT_MINING,
666
+ datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING.columns),
667
  type="pandas",
668
  )
669
  with gr.Row():
 
686
  with gr.Row():
687
  data_classification_en = gr.components.Dataframe(
688
  DATA_CLASSIFICATION_EN,
689
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_EN.columns),
690
  type="pandas",
691
  )
692
  with gr.Row():
 
711
  """)
712
  with gr.Row():
713
  data_classification = gr.components.Dataframe(
714
+ DATA_CLASSIFICATION,
715
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION) * 10,
716
  type="pandas",
717
  )
718
  with gr.Row():
 
735
  with gr.Row():
736
  data_clustering = gr.components.Dataframe(
737
  DATA_CLUSTERING,
738
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING.columns),
739
  type="pandas",
740
  )
741
  with gr.Row():
 
759
  """)
760
  with gr.Row():
761
  data_clustering_de = gr.components.Dataframe(
762
+ DATA_CLUSTERING_GERMAN,
763
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLUSTERING_GERMAN.columns) * 2,
764
  type="pandas",
765
  )
766
  with gr.Row():
 
784
  with gr.Row():
785
  data_pair_classification = gr.components.Dataframe(
786
  DATA_PAIR_CLASSIFICATION,
787
+ datatype=["number", "markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION.columns),
788
  type="pandas",
789
  )
790
  with gr.Row():
 
807
  data_retrieval = gr.components.Dataframe(
808
  DATA_RETRIEVAL,
809
  # Add support for more columns than existing as a buffer for CQADupstack & other Retrieval tasks (e.g. MSMARCOv2)
810
+ datatype=["number", "markdown"] + ["number"] * len(DATA_RETRIEVAL.columns) * 2,
811
  type="pandas",
812
  )
813
  with gr.Row():
 
827
  with gr.Row():
828
  data_reranking = gr.components.Dataframe(
829
  DATA_RERANKING,
830
+ datatype=["number", "markdown"] + ["number"] * len(DATA_RERANKING.columns),
831
  type="pandas",
832
  )
833
  with gr.Row():
 
849
  with gr.Row():
850
  data_sts_en = gr.components.Dataframe(
851
  DATA_STS_EN,
852
+ datatype=["number", "markdown"] + ["number"] * len(DATA_STS_EN.columns),
853
  type="pandas",
854
  )
855
  with gr.Row():
 
871
  """)
872
  with gr.Row():
873
  data_sts = gr.components.Dataframe(
874
+ DATA_STS,
875
+ datatype=["number", "markdown"] + ["number"] * len(DATA_STS.columns) * 2,
876
  type="pandas",
877
  )
878
  with gr.Row():
 
890
  with gr.Row():
891
  data_summarization = gr.components.Dataframe(
892
  DATA_SUMMARIZATION,
893
+ datatype=["number", "markdown"] + ["number"] * 2,
894
  type="pandas",
895
  )
896
  with gr.Row():
 
917
  }
918
  ```
919
  """)
920
+ # Running the functions on page load in addition to when the button is clicked
921
+ # This is optional - If deactivated the data loaded at "Build time" is shown like for Overall tab
922
+ """
923
  block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
924
  block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
925
  block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
 
931
  block.load(get_mteb_data, inputs=[task_sts_en, lang_sts_en], outputs=data_sts_en)
932
  block.load(get_mteb_data, inputs=[task_sts], outputs=data_sts)
933
  block.load(get_mteb_data, inputs=[task_summarization], outputs=data_summarization)
934
+ """
935
 
936
  block.queue(concurrency_count=40, max_size=10)
937
  block.launch()