Yilun Jin committed on
Commit
3647aad
·
1 Parent(s): 14d9c3b

update leaderboard to 4

Browse files
__pycache__/gen_table.cpython-38.pyc CHANGED
Binary files a/__pycache__/gen_table.cpython-38.pyc and b/__pycache__/gen_table.cpython-38.pyc differ
 
__pycache__/meta_data.cpython-38.pyc CHANGED
Binary files a/__pycache__/meta_data.cpython-38.pyc and b/__pycache__/meta_data.cpython-38.pyc differ
 
app.py CHANGED
@@ -50,6 +50,8 @@ with gr.Blocks() as demo:
50
  label='Model Type',
51
  interactive=True
52
  )
 
 
53
  data_component = gr.components.DataFrame(
54
  value=table[headers],
55
  type='pandas',
 
50
  label='Model Type',
51
  interactive=True
52
  )
53
+ print(headers)
54
+ print(check_box['essential'])
55
  data_component = gr.components.DataFrame(
56
  value=table[headers],
57
  type='pandas',
gen_table.py CHANGED
@@ -67,7 +67,7 @@ def model_type_flag(line, FIELDS):
67
 
68
  def BUILD_L1_DF(results, fields):
69
  check_box = {}
70
- check_box['essential'] = ['Method', 'Param (B)', 'Language Model', 'Vision Model']
71
  # revise there to set default dataset
72
  check_box['required'] = ['Avg Score', 'Avg Rank'] + DEFAULT_BENCH
73
  check_box['avg'] = ['Avg Score', 'Avg Rank']
@@ -131,7 +131,7 @@ def BUILD_L2_DF(results, dataset):
131
  df = df.iloc[::-1]
132
 
133
  check_box = {}
134
- check_box['essential'] = ['Method', 'Param (B)', 'Language Model', 'Vision Model']
135
  check_box['required'] = required_fields
136
  check_box['all'] = all_fields
137
  type_map = defaultdict(lambda: 'number')
 
67
 
68
  def BUILD_L1_DF(results, fields):
69
  check_box = {}
70
+ check_box['essential'] = ['Method', 'Param (B)']
71
  # revise there to set default dataset
72
  check_box['required'] = ['Avg Score', 'Avg Rank'] + DEFAULT_BENCH
73
  check_box['avg'] = ['Avg Score', 'Avg Rank']
 
131
  df = df.iloc[::-1]
132
 
133
  check_box = {}
134
+ check_box['essential'] = ['Method', 'Param (B)']
135
  check_box['required'] = required_fields
136
  check_box['all'] = all_fields
137
  type_map = defaultdict(lambda: 'number')
meta_data.py CHANGED
@@ -18,17 +18,21 @@ Shopping MMLU Leaderboard only includes open-source LLMs or API models that are
18
  """
19
  # CONSTANTS-FIELDS
20
  META_FIELDS = ['Method', 'Param (B)', 'OpenSource', 'Verified']
 
 
 
 
 
 
 
21
  MAIN_FIELDS = [
22
- 'MMBench_V11', 'MMStar', 'MME',
23
- 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
24
- 'HallusionBench', 'SEEDBench_IMG', 'MMVet',
25
- 'LLaVABench', 'CCBench', 'RealWorldQA', 'POPE', 'ScienceQA_TEST',
26
- 'SEEDBench2_Plus', 'MMT-Bench_VAL', 'BLINK'
27
- ]
28
- DEFAULT_BENCH = [
29
- 'MMBench_V11', 'MMStar', 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
30
- 'HallusionBench', 'MMVet'
31
  ]
 
 
 
 
 
32
  MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']
33
  MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
34
  MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
@@ -48,19 +52,7 @@ LEADERBOARD_MD['MAIN'] = f"""
48
  - Detailed evaluation results for each dataset (included or not included in main) are provided in the subsequent tabs.
49
  """
50
 
51
- for dataset in ['MMBench_DEV_CN', 'MMBench_TEST_CN', 'MMBench_DEV_EN', 'MMBench_TEST_EN', 'CCBench']:
52
- LEADERBOARD_MD[dataset] = f"""
53
- ## {dataset.replace('_', ' ')} Evaluation Results
54
-
55
- - We adopt Circular Eval for benchmarks in MMBench series, you can check https://arxiv.org/pdf/2307.06281.pdf for the detailed definition of Circular Eval.
56
- """
57
 
58
- LEADERBOARD_MD['SEEDBench_IMG'] = """
59
- ## SEEDBench_IMG Scores (ChatGPT Answer Extraction / Official Leaderboard)
60
-
61
- - **Overall**: The overall accuracy across all questions with **ChatGPT answer matching**.
62
- - **Overall (official)**: SEEDBench_IMG acc on the official leaderboard (if applicable).
63
- """
64
 
65
  LEADERBOARD_MD['MMVet'] = """
66
  ## MMVet Evaluation Results
@@ -70,18 +62,6 @@ LEADERBOARD_MD['MMVet'] = """
70
  - We also provide performance on the [**Official Leaderboard**](https://paperswithcode.com/sota/visual-question-answering-on-mm-vet) for models that are applicable. Those results are obtained with the GPT-4-0314 evaluator (which has been deprecated for new users).
71
  """
72
 
73
- LEADERBOARD_MD['MMMU_VAL'] = """
74
- ## MMMU Validation Evaluation Results
75
-
76
- - For MMMU, we support the evaluation of the `dev` (150 samples) and `validation` (900 samples) set. Here we only report the results on the `validation` set.
77
- - **Answer Inference:**
78
- - For models with `interleave_generate` interface (accept interleaved images & texts as inputs), all testing samples can be inferred. **`interleave_generate` is adopted for inference.**
79
- - For models without `interleave_generate` interface, samples with more than one images are skipped (42 out of 1050, directly count as wrong). **`generate` is adopted for inference.**
80
- - **Evaluation**:
81
- - MMMU include two types of questions: **multi-choice questions** & **open-ended QA**.
82
- - For **open-ended QA (62/1050)**, we re-formulate it as multi-choice questions: `{'question': 'QQQ', 'answer': 'AAA'} -> {'question': 'QQQ', 'A': 'AAA', 'B': 'Other Answers', 'answer': 'A'}`, and then adopt the same evaluation paradigm for **multi-choice questions**.
83
- - For **multi-choice questions (988/1050)**, we use **GPT-3.5-Turbo-0613** for matching prediction with options if heuristic matching does not work.
84
- """
85
 
86
  LEADERBOARD_MD['MathVista'] = """
87
  ## MathVista TestMini Evaluation Results
@@ -92,59 +72,6 @@ LEADERBOARD_MD['MathVista'] = """
92
  **Category Definitions:** **FQA:** figure QA, **GPS:** geometry problem solving, **MWP:** math word problem, **TQA:** textbook QA, **VQA:** visual QA, **ALG:** algebraic, **ARI:** arithmetic, **GEO:** geometry, **LOG:** logical , **NUM:** numeric, **SCI:** scientific, **STA:** statistical.
93
  """
94
 
95
- LEADERBOARD_MD['HallusionBench'] = """
96
- [**HallusionBench**](https://github.com/tianyi-lab/HallusionBench) is a benchmark to evaluate hallucination of VLMs. It asks a set of visual questions with one original image and one modified image (the answers for a question can be different, considering the image content).
97
-
98
- **Examples in HallusionBench:**
99
-
100
- | Original Figure | Modified Figure |
101
- | ------------------------------------------------------------ | ------------------------------------------------------------ |
102
- | ![](http://opencompass.openxlab.space/utils/Hallu0.png) | ![](http://opencompass.openxlab.space/utils/Hallu1.png) |
103
- | **Q1.** Is the right orange circle the same size as the left orange circle? **A1. Yes** | **Q1.** Is the right orange circle the same size as the left orange circle? **A1. No** |
104
- | **Q2.** Is the right orange circle larger than the left orange circle? **A2. No** | **Q2.** Is the right orange circle larger than the left orange circle? **A2. Yes** |
105
- | **Q3.** Is the right orange circle smaller than the left orange circle? **A3. No** | **Q3.** Is the right orange circle smaller than the left orange circle? **A3. No** |
106
-
107
- **Metrics**:
108
-
109
- >- aAcc: The overall accuracy of **all** atomic questions.
110
- >
111
- >- qAcc: The mean accuracy of unique **questions**. One question can be asked multiple times with different figures, we consider VLM correctly solved a unique question only if it succeeds in all <question, figure> pairs for this unique question.
112
- >- fAcc: The mean accuracy of all **figures**. One figure is associated with multiple questions, we consider VLM correct on a figure only if it succeeds to solve all questions of this figure.
113
-
114
- **Evaluation Setting**:
115
-
116
- > 1. **No-visual** Questions (questions asked without the associated figure) in HallusionBench are **skipped** during evaluation.
117
- > 2. When we failed to extract Yes / No from the VLM prediction, we adopt **GPT-3.5-Turbo-0613** as the answer extractor.
118
- > 3. We report aAcc, qAcc, and fAcc for all evaluated VLMs.
119
-
120
- ## HallusionBench Evaluation Results
121
- """
122
-
123
- LEADERBOARD_MD['LLaVABench'] = """
124
- ## LLaVABench Evaluation Results
125
-
126
- - In LLaVABench Evaluation, we use GPT-4-Turbo (gpt-4-1106-preview) as the judge LLM to assign scores to the VLM outputs. We only perform the evaluation once due to the limited variance among results of multiple evaluation pass originally reported.
127
- - No specific prompt template adopted for **ALL VLMs**.
128
- - We also include the official results (obtained by gpt-4-0314) for applicable models.
129
- """
130
-
131
- LEADERBOARD_MD['COCO_VAL'] = """
132
- ## COCO Caption Results
133
-
134
- - By default, we evaluate COCO Caption Validation set (5000 samples), and report the following metrics: BLEU-1, BLEU-4, CIDEr, ROUGE-L (default sorted by CIDEr).
135
- - We use the following prompt to evaluate all VLMs: `Please describe this image in general. Directly provide the description, do not include prefix like "This image depicts". `
136
- - **No specific prompt is adopted for all VLMs.**
137
- """
138
-
139
- LEADERBOARD_MD['ScienceQA_VAL'] = """
140
- ## ScienceQA Evaluation Results
141
-
142
- - We benchmark the **image** subset of ScienceQA validation and test set, and report the Top-1 accuracy.
143
- - During evaluation, we use `GPT-3.5-Turbo-0613` as the choice extractor for all VLMs if the choice can not be extracted via heuristic matching. **Zero-shot** inference is adopted.
144
- """
145
-
146
- LEADERBOARD_MD['ScienceQA_TEST'] = LEADERBOARD_MD['ScienceQA_VAL']
147
-
148
  LEADERBOARD_MD['OCRBench'] = """
149
  ## OCRBench Evaluation Results
150
 
@@ -159,66 +86,3 @@ LEADERBOARD_MD['MMStar'] = """
159
  - During the evaluation of MMStar, we find that some API models may reject to answer some of the questions. Currently, we treat such cases as wrong answers when reporting the results.
160
  """
161
 
162
- LEADERBOARD_MD['RealWorldQA'] = """
163
- ## RealWorldQA Evaluation Results
164
-
165
- - RealWorldQA is a benchmark designed to evaluate the real-world spatial understanding capabilities of multimodal AI models, contributed by XAI. It assesses how well these models comprehend physical environments. The benchmark consists of 700+ images, each accompanied by a question and a verifiable answer. These images are drawn from real-world scenarios, including those captured from vehicles. The goal is to advance AI models' understanding of our physical world.
166
- """
167
-
168
- LEADERBOARD_MD['TextVQA_VAL'] = """
169
- ## TextVQA Evaluation Results
170
-
171
- - TextVQA is a dataset to benchmark visual reasoning based on text in images. TextVQA requires models to read and reason about text in images to answer questions about them. Specifically, models need to incorporate a new modality of text present in the images and reason over it to answer TextVQA questions.
172
- - Note that some models may not be able to generate standardized responses based on the prompt. We currently do not have reports for these models.
173
- """
174
-
175
- LEADERBOARD_MD['ChartQA_TEST'] = """
176
- ## ChartQA Evaluation Results
177
-
178
- - ChartQA is a benchmark for question answering about charts with visual and logical reasoning.
179
- - Note that some models may not be able to generate standardized responses based on the prompt. We currently do not have reports for these models.
180
- """
181
-
182
- LEADERBOARD_MD['OCRVQA_TESTCORE'] = """
183
- ## OCRVQA Evaluation Results
184
-
185
- - OCRVQA is a benchmark for visual question answering by reading text in images. It presents a large-scale dataset, OCR-VQA-200K, comprising over 200,000 images of book covers. The study combines techniques from the Optical Character Recognition (OCR) and Visual Question Answering (VQA) domains to address the challenges associated with this new task and dataset.
186
- - Note that some models may not be able to generate standardized responses based on the prompt. We currently do not have reports for these models.
187
- """
188
-
189
- LEADERBOARD_MD['POPE'] = """
190
- ## POPE Evaluation Results
191
-
192
- - POPE is a benchmark for object hallucination evaluation. It includes three tracks of object hallucination: random, popular, and adversarial.
193
- - Note that the official POPE dataset contains approximately 8910 cases. POPE includes three tracks, and there are some overlapping samples among the three tracks. To reduce the data file size, we have kept only a single copy of the overlapping samples (about 5127 examples). However, the final accuracy will be calculated on the ~9k samples.
194
- - Some API models, due to safety policies, refuse to answer certain questions, so their actual capabilities may be higher than the reported scores.
195
- - We report the average F1 score across the three types of data as the overall score. Accuracy, precision, and recall are also shown in the table. F1 score = 2 * (precision * recall) / (precision + recall).
196
- """
197
-
198
- LEADERBOARD_MD['SEEDBench2_Plus'] = """
199
- ## SEEDBench2 Plus Evaluation Results
200
-
201
- - SEEDBench2 Plus comprises 2.3K multiple-choice questions with precise human annotations, spanning three broad categories: Charts, Maps, and Webs, each of which covers a wide spectrum of textrich scenarios in the real world.
202
- """
203
-
204
- LEADERBOARD_MD['MMT-Bench_VAL'] = """
205
- ## MMT-Bench Validation Evaluation Results
206
-
207
- - MMT-Bench comprises 31,325 meticulously curated multi-choice visual questions from various multimodal scenarios such as vehicle driving and embodied navigation, covering 32 core meta-tasks and 162 subtasks in multimodal understanding.
208
- - MMT-Bench_VAL is the validation set of MMT-Bench. MMT-Bench_ALL includes both validation and test sets. The suffix `MI`, such as `MMT-Bench_VAL_MI`, represents the multi-image version of the dataset with several images input.
209
- The defualt version is the single-image version, which concats the multiple images into a single image as input.
210
- """
211
-
212
- LEADERBOARD_MD['SEEDBench2'] = """
213
- ## SEEDBench2 Evaluation Results
214
-
215
- - SEEDBench2 comprises 24K multiple-choice questions with accurate human annotations, which spans 27 dimensions, including the evaluation of both text and image generation.
216
- - Note that we only evaluate and report the part of model's results on the SEEDBench2.
217
- """
218
-
219
- LEADERBOARD_MD['BLINK'] = """
220
- ## BLINK Test Evaluation Results
221
-
222
- - BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans “within a blink”, but pose significant challenges for current multimodal large language models (LLMs).
223
- - We evaluate BLINK on the test set of the benchmark, which contains 1901 visual questions in multi-choice format.
224
- """
 
18
  """
19
  # CONSTANTS-FIELDS
20
  META_FIELDS = ['Method', 'Param (B)', 'OpenSource', 'Verified']
21
+ # MAIN_FIELDS = [
22
+ # 'MMBench_V11', 'MMStar', 'MME',
23
+ # 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
24
+ # 'HallusionBench', 'SEEDBench_IMG', 'MMVet',
25
+ # 'LLaVABench', 'CCBench', 'RealWorldQA', 'POPE', 'ScienceQA_TEST',
26
+ # 'SEEDBench2_Plus', 'MMT-Bench_VAL', 'BLINK'
27
+ # ]
28
  MAIN_FIELDS = [
29
+ 'OCRBench', 'MMStar', 'MMVet','MathVista'
 
 
 
 
 
 
 
 
30
  ]
31
+ # DEFAULT_BENCH = [
32
+ # 'MMBench_V11', 'MMStar', 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
33
+ # 'HallusionBench', 'MMVet'
34
+ # ]
35
+ DEFAULT_BENCH = ['OCRBench', 'MMStar', 'MMVet','MathVista']
36
  MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']
37
  MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
38
  MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
 
52
  - Detailed evaluation results for each dataset (included or not included in main) are provided in the subsequent tabs.
53
  """
54
 
 
 
 
 
 
 
55
 
 
 
 
 
 
 
56
 
57
  LEADERBOARD_MD['MMVet'] = """
58
  ## MMVet Evaluation Results
 
62
  - We also provide performance on the [**Official Leaderboard**](https://paperswithcode.com/sota/visual-question-answering-on-mm-vet) for models that are applicable. Those results are obtained with the GPT-4-0314 evaluator (which has been deprecated for new users).
63
  """
64
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  LEADERBOARD_MD['MathVista'] = """
67
  ## MathVista TestMini Evaluation Results
 
72
  **Category Definitions:** **FQA:** figure QA, **GPS:** geometry problem solving, **MWP:** math word problem, **TQA:** textbook QA, **VQA:** visual QA, **ALG:** algebraic, **ARI:** arithmetic, **GEO:** geometry, **LOG:** logical , **NUM:** numeric, **SCI:** scientific, **STA:** statistical.
73
  """
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  LEADERBOARD_MD['OCRBench'] = """
76
  ## OCRBench Evaluation Results
77
 
 
86
  - During the evaluation of MMStar, we find that some API models may reject to answer some of the questions. Currently, we treat such cases as wrong answers when reporting the results.
87
  """
88