Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
kennymckormick
commited on
Commit
·
3c75092
1
Parent(s):
577e18a
update leaderboard
Browse files- app.py +144 -0
- lb_info.py +233 -0
app.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import abc
|
2 |
+
import gradio as gr
|
3 |
+
from lb_info import *
|
4 |
+
|
5 |
+
# Gradio front-end for the OpenVLM leaderboard: one "main" tab, an "About"
# tab, and one tab per dataset, each with checkbox-driven table filtering.
from types import SimpleNamespace


def _select_rows(table, model_size, model_type):
    """Return the rows of ``table`` matching the selected size and type filters.

    ``table`` itself is not modified; row order is preserved.
    """
    size_ok = np.array(
        [model_size_flag(x, model_size) for x in table['Parameters (B)']],
        dtype=bool)
    df = table.loc[size_ok]
    type_ok = np.array(
        [model_type_flag(row, model_type) for _, row in df.iterrows()],
        dtype=bool)
    return df.loc[type_ok]


def _as_dataframe_component(df, headers, type_map):
    """Wrap the ``headers`` columns of ``df`` in a read-only Gradio DataFrame."""
    return gr.components.DataFrame(
        value=df[headers],
        type="pandas",
        datatype=[type_map[x] for x in headers],
        interactive=False,
        visible=True)


with gr.Blocks() as demo:
    struct = load_results()
    timestamp = struct['time']
    EVAL_TIME = format_timestamp(timestamp)
    results = struct['results']
    N_MODEL = len(results)
    # The dataset list is read from one reference model's record; its 'META'
    # entry is metadata rather than a dataset, hence the "- 1" / remove().
    N_DATA = len(results['LLaVA-v1.5-7B']) - 1
    DATASETS = list(results['LLaVA-v1.5-7B'])
    DATASETS.remove('META')
    print(DATASETS)

    gr.Markdown(LEADERBORAD_INTRODUCTION.format(N_MODEL, N_DATA, EVAL_TIME))
    # One plain attribute container per dataset tab (table, widgets, ...).
    structs = [SimpleNamespace() for _ in range(N_DATA)]

    with gr.Tabs(elem_classes='tab-buttons') as tabs:
        with gr.TabItem('🏅 OpenVLM Main Leaderboard', elem_id='main', id=0):
            gr.Markdown(LEADERBOARD_MD['MAIN'])
            table, check_box = BUILD_L1_DF(results, MAIN_FIELDS)
            type_map = check_box['type_map']
            checkbox_group = gr.CheckboxGroup(
                choices=check_box['all'],
                value=check_box['required'],
                label="Evaluation Dimension",
                interactive=True,
            )
            headers = check_box['essential'] + checkbox_group.value
            with gr.Row():
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label='Model Size',
                    interactive=True
                )
                model_type = gr.CheckboxGroup(
                    choices=MODEL_TYPE,
                    value=MODEL_TYPE,
                    label='Model Type',
                    interactive=True
                )
            data_component = _as_dataframe_component(table, headers, type_map)

            def filter_df(fields, model_size, model_type):
                """Rebuild the main table for the currently ticked checkboxes."""
                headers = check_box['essential'] + fields
                df = _select_rows(table, model_size, model_type)
                return _as_dataframe_component(df, headers, type_map)

            for cbox in [checkbox_group, model_size, model_type]:
                cbox.change(
                    fn=filter_df,
                    inputs=[checkbox_group, model_size, model_type],
                    outputs=data_component)

        with gr.TabItem('🔍 About', elem_id='about', id=1):
            # Close the HTTP response explicitly once the README is read.
            with urlopen(VLMEVALKIT_README) as readme_response:
                readme_text = readme_response.read().decode()
            gr.Markdown(readme_text)

        for i, dataset in enumerate(DATASETS):
            with gr.TabItem(f'📊 {dataset} Leaderboard', elem_id=dataset, id=i + 2):
                if dataset in LEADERBOARD_MD:
                    gr.Markdown(LEADERBOARD_MD[dataset])

                s = structs[i]
                s.table, s.check_box = BUILD_L2_DF(results, dataset)
                s.type_map = s.check_box['type_map']
                s.checkbox_group = gr.CheckboxGroup(
                    choices=s.check_box['all'],
                    value=s.check_box['required'],
                    label=f"{dataset} CheckBoxes",
                    interactive=True,
                )
                s.headers = s.check_box['essential'] + s.checkbox_group.value
                with gr.Row():
                    s.model_size = gr.CheckboxGroup(
                        choices=MODEL_SIZE,
                        value=MODEL_SIZE,
                        label='Model Size',
                        interactive=True
                    )
                    s.model_type = gr.CheckboxGroup(
                        choices=MODEL_TYPE,
                        value=MODEL_TYPE,
                        label='Model Type',
                        interactive=True
                    )
                s.data_component = _as_dataframe_component(
                    s.table, s.headers, s.type_map)
                # Hidden textbox that hands the dataset name to the callback,
                # which uses it to look up this tab's container in `structs`.
                s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)

                def filter_df_l2(dataset_name, fields, model_size, model_type):
                    """Rebuild one dataset table for the ticked checkboxes."""
                    s = structs[DATASETS.index(dataset_name)]
                    headers = s.check_box['essential'] + fields
                    df = _select_rows(s.table, model_size, model_type)
                    return _as_dataframe_component(df, headers, s.type_map)

                for cbox in [s.checkbox_group, s.model_size, s.model_type]:
                    cbox.change(
                        fn=filter_df_l2,
                        inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type],
                        outputs=s.data_component)

    with gr.Row():
        with gr.Accordion("Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id='citation-button')

if __name__ == '__main__':
    demo.launch(server_name='0.0.0.0')
|
lb_info.py
ADDED
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import pandas as pd
|
3 |
+
from collections import defaultdict
|
4 |
+
import gradio as gr
|
5 |
+
import copy as cp
|
6 |
+
import numpy as np
|
7 |
+
from .misc import listinstr
|
8 |
+
|
9 |
+
# CONSTANTS-URL
|
10 |
+
URL = "http://opencompass.openxlab.space/utils/OpenVLM.json"
|
11 |
+
VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'
|
12 |
+
# CONSTANTS-CITATION
|
13 |
+
CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
|
14 |
+
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
|
15 |
+
author={OpenCompass Contributors},
|
16 |
+
howpublished = {\url{https://github.com/open-compass/opencompass}},
|
17 |
+
year={2023}
|
18 |
+
}"""
|
19 |
+
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
20 |
+
# CONSTANTS-TEXT
|
21 |
+
LEADERBORAD_INTRODUCTION = """# OpenVLM Leaderboard
|
22 |
+
### Welcome to the OpenVLM Leaderboard! On this leaderboard we share the evaluation results of VLMs obtained by the OpenSource Framework [**VLMEvalKit**](https://github.com/open-compass/VLMEvalKit) 🏆
|
23 |
+
### Currently, OpenVLM Leaderboard covers {} different VLMs (including GPT-4v, Gemini, QwenVLPlus, LLaVA, etc.) and {} different multi-modal benchmarks.
|
24 |
+
|
25 |
+
This leaderboard was last updated: {}.
|
26 |
+
"""
|
27 |
+
# CONSTANTS-FIELDS
|
28 |
+
META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
|
29 |
+
MAIN_FIELDS = ['MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench', 'MME', 'SEEDBench_IMG', 'MMVet', 'MMMU_VAL', 'MathVista', 'HallusionBench', 'LLaVABench']
|
30 |
+
MMBENCH_FIELDS = ['MMBench_TEST_EN', 'MMBench_DEV_EN', 'MMBench_TEST_CN', 'MMBench_DEV_CN', 'CCBench']
|
31 |
+
MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
|
32 |
+
MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
|
33 |
+
|
34 |
+
# The README file for each benchmark
|
35 |
+
LEADERBOARD_MD = {}
|
36 |
+
|
37 |
+
LEADERBOARD_MD['MAIN'] = """
|
38 |
+
## Main Evaluation Results
|
39 |
+
|
40 |
+
- Avg Score: The average score on all VLM Benchmarks (normalized to 0 - 100, the higher the better).
|
41 |
+
- Avg Rank: The average rank on all VLM Benchmarks (the lower the better).
|
42 |
+
- The overall evaluation results on 10 VLM benchmarks, sorted by the ascending order of Avg Rank.
|
43 |
+
"""
|
44 |
+
|
45 |
+
LEADERBOARD_MD['SEEDBench_IMG'] = """
|
46 |
+
## SEEDBench_IMG Scores (Prefetch / ChatGPT Answer Extraction / Official Leaderboard)
|
47 |
+
|
48 |
+
- **Overall**: The overall accuracy across all questions with **ChatGPT answer matching**.
|
49 |
+
- **Overall (prefetch)**: The accuracy when using exact matching for evaluation.
|
50 |
+
- **Overall (official)**: SEEDBench_IMG acc on the official leaderboard (if applicable).
|
51 |
+
"""
|
52 |
+
|
53 |
+
LEADERBOARD_MD['MMVet'] = """
|
54 |
+
## MMVet Evaluation Results
|
55 |
+
|
56 |
+
- In MMVet Evaluation, we use GPT-4-Turbo (gpt-4-1106-preview) as the judge LLM to assign scores to the VLM outputs. We only perform the evaluation once due to the limited variance among results of multiple evaluation pass originally reported.
|
57 |
+
- No specific prompt template adopted for **ALL VLMs**.
|
58 |
+
- We also provide performance on the [**Official Leaderboard**](https://paperswithcode.com/sota/visual-question-answering-on-mm-vet) for models that are applicable. Those results are obtained with GPT-4-0314 evaluator (which has been deprecated for new users).
|
59 |
+
"""
|
60 |
+
|
61 |
+
LEADERBOARD_MD['MMMU_VAL'] = """
|
62 |
+
## MMMU Validation Evaluation Results
|
63 |
+
|
64 |
+
- For MMMU, we support the evaluation of the `dev` (150 samples) and `validation` (900 samples) set. Here we only report the results on the `validation` set.
|
65 |
+
- **Answer Inference:**
|
66 |
+
- For models with `interleave_generate` interface (accept interleaved images & texts as inputs), all testing samples can be inferred. **`interleave_generate` is adopted for inference.**
|
67 |
+
- For models without `interleave_generate` interface, samples with more than one image are skipped (42 out of 1050, directly counted as wrong). **`generate` is adopted for inference.**
|
68 |
+
- **Evaluation**:
|
69 |
+
- MMMU includes two types of questions: **multi-choice questions** & **open-ended QA**.
|
70 |
+
- For **open-ended QA (62/1050)**, we re-formulate it as multi-choice questions: `{'question': 'QQQ', 'answer': 'AAA'} -> {'question': 'QQQ', 'A': 'AAA', 'B': 'Other Answers', 'answer': 'A'}`, and then adopt the same evaluation paradigm for **multi-choice questions**.
|
71 |
+
- For **multi-choice questions (988/1050)**, we use **GPT-3.5-Turbo-0613** for matching prediction with options if heuristic matching does not work.
|
72 |
+
"""
|
73 |
+
|
74 |
+
LEADERBOARD_MD['MathVista'] = """
|
75 |
+
## MathVista TestMini Evaluation Results
|
76 |
+
|
77 |
+
- We report the evaluation results on MathVista **TestMini**, which include 1000 test samples.
|
78 |
+
- We adopt `GPT-4-Turbo (1106)` as the answer extractor when we failed to extract the answer with heuristic matching.
|
79 |
+
- The performance of **Human (High school)** and **Random Choice** are copied from the official leaderboard.
|
80 |
+
**Category Definitions:** **FQA:** figure QA, **GPS:** geometry problem solving, **MWP:** math word problem, **TQA:** textbook QA, **VQA:** visual QA, **ALG:** algebraic, **ARI:** arithmetic, **GEO:** geometry, **LOG:** logical , **NUM:** numeric, **SCI:** scientific, **STA:** statistical.
|
81 |
+
"""
|
82 |
+
|
83 |
+
LEADERBOARD_MD['HallusionBench'] = """
|
84 |
+
[**HallusionBench**](https://github.com/tianyi-lab/HallusionBench) is a benchmark to evaluate hallucination of VLMs. It asks a set of visual questions with one original image and one modified image (the answers for a question can be different, considering the image content).
|
85 |
+
|
86 |
+
**Examples in HallusionBench:**
|
87 |
+
|
88 |
+
| Original Figure | Modified Figure |
|
89 |
+
| ------------------------------------------------------------ | ------------------------------------------------------------ |
|
90 |
+
| ![](http://opencompass.openxlab.space/utils/Hallu0.png) | ![](http://opencompass.openxlab.space/utils/Hallu1.png) |
|
91 |
+
| **Q1.** Is the right orange circle the same size as the left orange circle? **A1. Yes** | **Q1.** Is the right orange circle the same size as the left orange circle? **A1. No** |
|
92 |
+
| **Q2.** Is the right orange circle larger than the left orange circle? **A2. No** | **Q2.** Is the right orange circle larger than the left orange circle? **A2. Yes** |
|
93 |
+
| **Q3.** Is the right orange circle smaller than the left orange circle? **A3. No** | **Q3.** Is the right orange circle smaller than the left orange circle? **A3. No** |
|
94 |
+
|
95 |
+
**Metrics**:
|
96 |
+
|
97 |
+
>- aAcc: The overall accuracy of **all** atomic questions.
|
98 |
+
>
|
99 |
+
>- qAcc: The mean accuracy of unique **questions**. One question can be asked multiple times with different figures, we consider VLM correctly solved a unique question only if it succeeds in all <question, figure> pairs for this unique question.
|
100 |
+
>- fAcc: The mean accuracy of all **figures**. One figure is associated with multiple questions, we consider VLM correct on a figure only if it succeeds to solve all questions of this figure.
|
101 |
+
|
102 |
+
**Evaluation Setting**:
|
103 |
+
|
104 |
+
> 1. **No-visual** Questions (questions asked without the associated figure) in HallusionBench are **skipped** during evaluation.
|
105 |
+
> 2. When we failed to extract Yes / No from the VLM prediction, we adopt **GPT-3.5-Turbo-0613** as the answer extractor.
|
106 |
+
> 3. We report aAcc, qAcc, and fAcc for all evaluated VLMs.
|
107 |
+
|
108 |
+
## HallusionBench Evaluation Results
|
109 |
+
"""
|
110 |
+
|
111 |
+
LEADERBOARD_MD['LLaVABench'] = """
|
112 |
+
## LLaVABench Evaluation Results
|
113 |
+
|
114 |
+
- In LLaVABench Evaluation, we use GPT-4-Turbo (gpt-4-1106-preview) as the judge LLM to assign scores to the VLM outputs. We only perform the evaluation once due to the limited variance among results of multiple evaluation pass originally reported.
|
115 |
+
- No specific prompt template adopted for **ALL VLMs**.
|
116 |
+
- We also include the official results (obtained by gpt-4-0314) for applicable models.
|
117 |
+
"""
|
118 |
+
|
119 |
+
from urllib.request import urlopen
|
120 |
+
|
121 |
+
def load_results():
    """Download the leaderboard JSON from ``URL`` and return it decoded.

    Returns:
        The object produced by ``json.loads`` on the response body.
    """
    # Use the response as a context manager so the connection is closed
    # deterministically instead of being left for the garbage collector.
    with urlopen(URL) as response:
        return json.loads(response.read())
|
124 |
+
|
125 |
+
def nth_large(val, vals):
    """Return the 1-based rank of ``val`` within ``vals``.

    The rank is one plus the number of entries strictly greater than
    ``val``, so ties share a rank and the largest value ranks 1.
    """
    rank = 1
    for other in vals:
        if other > val:
            rank += 1
    return rank
|
127 |
+
|
128 |
+
def format_timestamp(timestamp):
    """Format a compact timestamp string as ``'AA.BB.CC DD:EE:FF'``.

    The first twelve characters are split into six two-character groups;
    the first three are joined with '.', the last three with ':', and the
    two halves are separated by a single space.
    """
    pairs = [timestamp[start:start + 2] for start in range(0, 12, 2)]
    return '.'.join(pairs[:3]) + ' ' + ':'.join(pairs[3:])
|
130 |
+
|
131 |
+
def model_size_flag(sz, FIELDS):
    """Tell whether a model of size ``sz`` falls in any selected size bucket.

    Args:
        sz: Parameter count as a number, or a missing value (``pd.isna``).
        FIELDS: Selected bucket labels, drawn from '<10B', '10B-20B',
            '20B-40B', '>40B' and 'Unknown'.

    Returns:
        True if ``sz`` belongs to one of the selected buckets. A missing
        size matches only the 'Unknown' bucket.
    """
    if pd.isna(sz):
        return 'Unknown' in FIELDS

    # (label, inclusive lower bound, exclusive upper bound)
    buckets = (
        ('<10B', float('-inf'), 10),
        ('10B-20B', 10, 20),
        ('20B-40B', 20, 40),
        ('>40B', 40, float('inf')),
    )
    return any(
        label in FIELDS and low <= sz < high
        for label, low, high in buckets
    )
|
145 |
+
|
146 |
+
def model_type_flag(line, FIELDS):
    """Tell whether a table row matches any selected model-type category.

    Args:
        line: Row-like mapping with 'OpenSource' and 'Verified' entries
            holding 'Yes' or 'No'.
        FIELDS: Selected categories among 'OpenSource', 'API', 'Proprietary'.

    Returns:
        True when the row is open source and 'OpenSource' is selected, or is
        closed source and its 'Verified' value matches a selected category
        ('Yes' for 'API', 'No' for 'Proprietary'); False otherwise.
    """
    for category in ('OpenSource', 'API', 'Proprietary'):
        if category not in FIELDS:
            continue
        if category == 'OpenSource':
            if line['OpenSource'] == 'Yes':
                return True
            continue
        # Both closed-source categories differ only in the 'Verified' value.
        wanted_verified = 'Yes' if category == 'API' else 'No'
        if line['OpenSource'] == 'No' and line['Verified'] == wanted_verified:
            return True
    return False
|
154 |
+
|
155 |
+
def BUILD_L1_DF(results, fields):
    """Build the main leaderboard table and its checkbox configuration.

    Args:
        results: Mapping of model name to that model's record. Each record
            is read as ``record['META']`` (model metadata) and
            ``record[d]['Overall']`` for every benchmark ``d`` in ``fields``.
        fields: Benchmark names to include as score columns.

    Returns:
        A ``(df, check_box)`` pair. ``df`` has the ``META_FIELDS`` columns,
        one column per benchmark, plus 'Avg Score' and 'Avg Rank', sorted by
        ascending 'Avg Rank'. ``check_box`` is a dict with keys 'essential',
        'required', 'all' (column-name lists) and 'type_map' (column name to
        Gradio datatype, defaulting to 'number').
    """
    res = defaultdict(list)

    # The score column of each benchmark across all models is the same for
    # every row being ranked, so build it once instead of once per model.
    overall_by_field = {
        d: [x[d]['Overall'] for x in results.values()] for d in fields
    }

    for item in results.values():
        meta = item['META']
        for k in META_FIELDS:
            if k == 'Parameters (B)':
                param = meta['Parameters']
                # Sizes arrive as strings such as '7B'; '' means unknown.
                res[k].append(float(param.replace('B', '')) if param != '' else None)
            elif k == 'Method':
                name, url = meta['Method']
                res[k].append(f'<a href="{url}">{name}</a>')
            else:
                res[k].append(meta[k])

        scores, ranks = [], []
        for d in fields:
            overall = item[d]['Overall']
            res[d].append(overall)
            # MME is divided by 28 before averaging — presumably to bring it
            # onto the same 0-100 scale as the other benchmarks (TODO confirm).
            scores.append(overall / 28 if d == 'MME' else overall)
            ranks.append(nth_large(overall, overall_by_field[d]))
        res['Avg Score'].append(round(np.mean(scores), 1))
        res['Avg Rank'].append(round(np.mean(ranks), 2))

    df = pd.DataFrame(res)
    df = df.sort_values('Avg Rank')

    check_box = {}
    check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model']
    check_box['required'] = ['Avg Score', 'Avg Rank']
    check_box['all'] = check_box['required'] + ['OpenSource', 'Verified'] + list(fields)
    type_map = defaultdict(lambda: 'number')
    type_map['Method'] = 'html'
    type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
    check_box['type_map'] = type_map
    return df, check_box
|
192 |
+
|
193 |
+
def BUILD_L2_DF(results, dataset):
    """Build the per-dataset leaderboard table and its checkbox configuration.

    Args:
        results: Mapping of model name to that model's record. Each record
            is read as ``record['META']`` and ``record[dataset][field]``.
        dataset: Name of the dataset whose detailed metrics are tabulated.

    Returns:
        A ``(df, check_box)`` pair. ``df`` has the ``META_FIELDS`` columns
        followed by the dataset's metric columns, sorted by 'Overall' and
        then reversed so the highest score comes first. ``check_box`` is a
        dict with keys 'essential', 'required', 'all' (column-name lists)
        and 'type_map' (column name to Gradio datatype, defaulting to
        'number').
    """
    res = defaultdict(list)
    # The metric names are taken from the first model's record.
    fields = list(list(results.values())[0][dataset].keys())
    non_overall_fields = [x for x in fields if 'Overall' not in x]
    overall_fields = [x for x in fields if 'Overall' in x]
    if dataset == 'MME':
        # For MME the 'Perception' / 'Cognition' metrics are grouped with the
        # overall columns rather than with the detailed ones.
        non_overall_fields = [x for x in non_overall_fields if not listinstr(['Perception', 'Cognition'], x)]
        overall_fields = overall_fields + ['Perception', 'Cognition']

    # Detailed metrics first, then overall ones: this fixes the column order.
    metric_columns = non_overall_fields + overall_fields

    for item in results.values():
        meta = item['META']
        for k in META_FIELDS:
            if k == 'Parameters (B)':
                param = meta['Parameters']
                # Sizes arrive as strings such as '7B'; '' means unknown.
                res[k].append(float(param.replace('B', '')) if param != '' else None)
            elif k == 'Method':
                name, url = meta['Method']
                res[k].append(f'<a href="{url}">{name}</a>')
            else:
                res[k].append(meta[k])

        for d in metric_columns:
            res[d].append(item[dataset][d])

    df = pd.DataFrame(res)
    df = df.sort_values('Overall')
    df = df.iloc[::-1]

    check_box = {}
    check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model']
    check_box['required'] = overall_fields
    check_box['all'] = non_overall_fields + overall_fields
    type_map = defaultdict(lambda: 'number')
    type_map['Method'] = 'html'
    type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
    check_box['type_map'] = type_map
    return df, check_box
|