__all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
import gradio as gr
import pandas as pd
import re
import numpy as np
from collections import defaultdict
from constants import *
import os
from huggingface_hub import Repository
import json
global data_component_aad, data_component_iasd, data_component_ivqd, filter_component
TOKEN = os.environ.get("TOKEN")
repo = Repository(local_dir="./download_from_dataset", clone_from="MM-UPD/results_for_leaderboard", repo_type="dataset", use_auth_token=TOKEN)
current_directory = os.getcwd()
def validate_model_size(s):
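    """Return the size string if it matches '<number>B' (e.g. '7B'); otherwise fall back to '-'."""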
pattern = r'^\d+B$|^-$'
if re.match(pattern, s):
return s
else:
return '-'
def upload_file(files):
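    """Return the local paths of the files uploaded through the Gradio file widget."""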
file_paths = [file.name for file in files]
return file_paths
def create_df(input_file):
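    """Decode the uploaded JSON bytes and load them into a pandas DataFrame."""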
json_string = input_file.decode('utf-8')
data = json.loads(json_string)
df = pd.DataFrame(data)
return df
# Accuracy Report
def report_acc(df, groupd='category', metric_type="dual"):
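    """Aggregate per-sample hits into accuracies, either overall (groupd=None) or per value of `groupd`."""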
    assert 'split' in df
    assert groupd in [None, 'category', 'l2-category']
    # Each metric type is scored from its own per-sample hit column.
    metric_column = {"dual": "hit", "standard": "hit_standard", "upd": "hit_upd"}[metric_type]
    res = defaultdict(list)
    res['split'] = ['test']
    if groupd is None:
        res['overall'] = [np.mean(df[metric_column])]
        return pd.DataFrame(res)
    elif groupd in df:
        abilities = sorted(set(df[groupd]))
        for ab in abilities:
            sub_df = df[df[groupd] == ab]
            res[ab] = [np.mean(sub_df[metric_column])]
        return pd.DataFrame(res)
def eval_result_dual(data_main, metric_type="dual"):
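    """Return the overall accuracy and the per-category accuracies (both in %) for one metric type."""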
overall = report_acc(data_main, None, metric_type)
leaf = report_acc(data_main, 'category', metric_type)
overall = round(overall['overall'].values[0] * 100, 1)
leaf = leaf.iloc[:, 1:].values.flatten().tolist()
leaf = [round(x * 100, 1) for x in leaf]
return overall, leaf
def calculate_score(input_file):
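    """Compute overall dual/standard/UPD accuracies and per-category dual accuracies from the upload."""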
dual_df = create_df(input_file)
overall_dual, leaf_dual = eval_result_dual(dual_df)
overall_standard, leaf_standard = eval_result_dual(dual_df, metric_type="standard")
overall_upd, leaf_upd = eval_result_dual(dual_df, metric_type="upd")
return overall_dual, overall_standard, overall_upd, leaf_dual
# add the new data into the queue
def add_queue(base_df, input_file, model_name):
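    """Append one model's per-sample predictions and hit flags to the shared detail-results dataframe."""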
dual_df = create_df(input_file)
base_df[f"{model_name}_prediction_standard"] = dual_df["prediction_standard"]
base_df[f"{model_name}_hit_standard"] = dual_df["hit_standard"]
base_df[f"{model_name}_prediction_upd"] = dual_df["prediction_upd"]
base_df[f"{model_name}_hit_upd"] = dual_df["hit_upd"]
base_df[f"{model_name}_hit"] = dual_df["hit"]
return base_df
# check whether the input file is correct or not
def validity_check(input_file, UPD_type, question_type):
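    """Check that the uploaded result file is consistent with the selected UPD type and question type.

    The upload must contain one row per benchmark sample with the per-sample columns
    'hit', 'hit_standard', 'hit_upd', 'prediction_standard', and 'prediction_upd',
    plus the 'D_upd', 'category', and 'split' columns used below and in the scoring code.
    """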
input_df = create_df(input_file)
# check for the correct data size
data_num_dict = {"AAD": 820, "IASD": 919, "IVQD": 356}
    print("len(input_df):", len(input_df))
    print("expected data size:", data_num_dict[UPD_type])
    assert len(input_df) == data_num_dict[UPD_type], "Different Data Size"
# check for missing columns
column_list = ["hit_upd", "hit_standard", "hit", "prediction_upd", "prediction_standard"]
assert all(x in input_df.columns for x in column_list), "Column Missing"
# check for missing values
assert not input_df[column_list].isnull().any().any(), "Missing values found in columns"
# check for the presence of the correct values
option_mapping = {"AAD": "None of the above", "IASD": "None of the above", "IVQD": "The image and question are irrelevant."}
instruction_mapping = {"AAD": "F. None of the above", "IASD": "F. None of the above", "IVQD": "F. The image and question are irrelevant."}
input_df["D_upd"] = input_df["D_upd"].fillna("")
if question_type == "Base":
assert not input_df["D_upd"].str.contains(option_mapping[UPD_type]).any(), f"{option_mapping[UPD_type]} found in Base"
assert not input_df["prediction_upd"].str.contains(instruction_mapping[UPD_type]).any(), f"{instruction_mapping[UPD_type]} found in Base"
elif question_type == "Option":
assert input_df["D_upd"].str.contains(option_mapping[UPD_type]).any(), f"{option_mapping[UPD_type]}not found in Option"
assert not input_df["prediction_upd"].str.contains(instruction_mapping[UPD_type]).any(), f"{instruction_mapping[UPD_type]} found in Option"
elif question_type == "Instruction":
assert not input_df["D_upd"].str.contains(option_mapping[UPD_type]).any(), f"{option_mapping[UPD_type]} found in Instruction"
return True
def add_new_eval(
input_file,
model_type: str,
model_name_textbox: str,
revision_name_textbox: str,
model_link: str,
model_size: str,
upd_type: str,
LLM_type: str,
LLM_name_textbox: str,
question_type: str
):
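    """Validate, score, and publish a submission.

    On success, the aggregated scores are appended to the UPD-type-specific result CSV, the
    per-sample predictions are added to the matching queue CSV, and both files are committed
    and pushed to the results dataset repository. On failure, a warning string is returned.
    """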
if input_file is None:
warning_text = "Error! Empty file!"
print(warning_text)
return warning_text
else:
model_size = validate_model_size(model_size)
        if upd_type == 'AAD':
            csv_path = CSV_AAD_RESULT_PATH
        elif upd_type == 'IASD':
            csv_path = CSV_IASD_RESULT_PATH
        elif upd_type == 'IVQD':
            csv_path = CSV_IVQD_RESULT_PATH
        else:
            raise ValueError(f"Unknown UPD type: {upd_type}")
validity_check(input_file, upd_type, question_type)
csv_data = pd.read_csv(csv_path)
overall_dual_acc, overall_standard_acc, overall_upd_acc, leaf_dual = calculate_score(input_file)
if LLM_type == 'Other':
LLM_name = LLM_name_textbox
else:
LLM_name = LLM_type
if revision_name_textbox == '':
col = csv_data.shape[0]
model_name = model_name_textbox
else:
model_name = revision_name_textbox
model_name_list = csv_data['Model']
        # Extract the bare model name from markdown links of the form "[name](url)".
        name_list = [name.split(']')[0][1:] if name.startswith('[') else name for name in model_name_list]
if revision_name_textbox not in name_list:
col = csv_data.shape[0]
else:
col = name_list.index(revision_name_textbox)
        if model_link != '':
            model_name = '[' + model_name + '](' + model_link + ')'  # render the name as a markdown link
# add new data
new_data = [
model_type,
model_name,
LLM_name,
model_size,
question_type,
overall_dual_acc,
overall_standard_acc,
overall_upd_acc,
]
new_data += leaf_dual
# If the same data already exists, return an error.
if new_data in csv_data.values.tolist():
warning_text = "Error! The same data already exists!"
print(warning_text)
return warning_text
        # If an entry with the same model information already exists, return an error.
        elif any(row[:5] == new_data[:5] for row in csv_data.values.tolist()):
            warning_text = "Error! An entry for the same model already exists! Please fill in revision_name."
            print(warning_text)
            return warning_text
csv_data.loc[col] = new_data
        csv_data.to_csv(csv_path, index=False)
absolute_result_path = os.path.abspath(csv_path)
if not os.path.exists(absolute_result_path):
raise FileNotFoundError(f"File {absolute_result_path} not found")
repo.git_pull()
repo.git_add(absolute_result_path)
csv_queue_path = os.path.join(CSV_QUEUE_DIR, f"detail_results_{upd_type.lower()}_{question_type.lower()}.csv")
base_data = pd.read_csv(csv_queue_path)
base_data = add_queue(base_data, input_file, model_name)
base_data.to_csv(csv_queue_path, index=False)
absolute_queue_path = os.path.abspath(csv_queue_path)
if not os.path.exists(absolute_queue_path):
raise FileNotFoundError(f"File {absolute_queue_path} not found")
repo.git_add(absolute_queue_path)
repo.git_commit(f"add {model_name} results in {question_type}")
repo.git_push()
return 0
def get_baseline_aad_df():
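    """Pull the latest results and return the AAD table restricted to the currently selected columns."""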
repo.git_pull()
df = pd.read_csv(CSV_AAD_RESULT_PATH)
df = df.sort_values(by="Overall Dual Acc.", ascending=False)
present_columns = MODEL_INFO + checkbox_aad_group.value
df = df[present_columns]
return df
def get_all_aad_df():
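    """Pull the latest results and return the full AAD table with all columns."""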
repo.git_pull()
df = pd.read_csv(CSV_AAD_RESULT_PATH)
df = df.sort_values(by="Overall Dual Acc.", ascending=False)
return df
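# The IASD and IVQD getters below mirror the AAD versions, reading their respective result CSVs.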
def get_baseline_iasd_df():
repo.git_pull()
df = pd.read_csv(CSV_IASD_RESULT_PATH)
df = df.sort_values(by="Overall Dual Acc.", ascending=False)
present_columns = MODEL_INFO + checkbox_iasd_group.value
df = df[present_columns]
return df
def get_all_iasd_df():
repo.git_pull()
df = pd.read_csv(CSV_IASD_RESULT_PATH)
df = df.sort_values(by="Overall Dual Acc.", ascending=False)
return df
def get_baseline_ivqd_df():
repo.git_pull()
df = pd.read_csv(CSV_IVQD_RESULT_PATH)
df = df.sort_values(by="Overall Dual Acc.", ascending=False)
present_columns = MODEL_INFO + checkbox_ivqd_group.value
df = df[present_columns]
return df
def get_all_ivqd_df():
repo.git_pull()
df = pd.read_csv(CSV_IVQD_RESULT_PATH)
df = df.sort_values(by="Overall Dual Acc.", ascending=False)
return df
block = gr.Blocks()
with block:
gr.Markdown(
LEADERBORAD_INTRODUCTION
)
with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Tab 1: MM-AAD benchmark table
        with gr.TabItem("🏅 MM-AAD Benchmark", elem_id="mmaad-benchmark-tab-table", id=1):
# selection for column part:
checkbox_aad_group = gr.CheckboxGroup(
choices=TASK_AAD_INFO,
value=AVG_INFO,
label="Evaluation Dimension",
interactive=True,
) # user can select the evaluation dimension
with gr.Row():
# selection for model size part:
model_size = gr.CheckboxGroup(
choices=MODEL_SIZE,
value=MODEL_SIZE,
label="Model Size",
interactive=True,
)
                # selection for question type part:
question_type = gr.CheckboxGroup(
choices=QUESTION_TYPE,
value=QUESTION_TYPE,
label="Question Type",
interactive=True,
)
baseline_value = get_baseline_aad_df()
baseline_header = MODEL_INFO + checkbox_aad_group.value
baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_aad_group.value)
data_component_aad = gr.components.Dataframe(
value=baseline_value,
headers=baseline_header,
type="pandas",
datatype=baseline_datatype,
interactive=False,
visible=True,
)
def on_filter_model_size_method_change(selected_model_size, selected_question_type, selected_columns):
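                """Re-filter the AAD table by model size and question type, then keep the selected columns."""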
updated_data = get_all_aad_df()
# model_size & question_type:
def custom_filter(row, model_size_filters, question_type_filters):
model_size = row['Model Size']
question_type = row['Question Type']
model_size = model_size.upper()
if model_size == '-':
size_filter = '-' in model_size_filters
elif 'B' in model_size:
size = float(model_size.replace('B', ''))
size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
else:
size_filter = False
question_type_filter = question_type in question_type_filters
return size_filter and question_type_filter
mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, question_type_filters=selected_question_type)
updated_data = updated_data[mask]
# columns:
selected_columns = [item for item in TASK_AAD_INFO if item in selected_columns]
present_columns = MODEL_INFO + selected_columns
updated_data = updated_data[present_columns]
updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
updated_headers = present_columns
update_datatype = [DATA_AAD_TITILE_TYPE[COLUMN_AAD_NAMES.index(x)] for x in updated_headers]
filter_component = gr.components.Dataframe(
value=updated_data,
headers=updated_headers,
type="pandas",
datatype=update_datatype,
interactive=False,
visible=True,
)
return filter_component
model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
checkbox_aad_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
        with gr.TabItem("🏅 MM-IASD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=2):
checkbox_iasd_group = gr.CheckboxGroup(
choices=TASK_IASD_INFO,
value=AVG_INFO,
label="Evaluation Dimension",
interactive=True,
) # user can select the evaluation dimension
with gr.Row():
# selection for model size part:
model_size = gr.CheckboxGroup(
choices=MODEL_SIZE,
value=MODEL_SIZE,
label="Model Size",
interactive=True,
)
                # selection for question type part:
question_type = gr.CheckboxGroup(
choices=QUESTION_TYPE,
value=QUESTION_TYPE,
label="Question Type",
interactive=True,
)
baseline_value = get_baseline_iasd_df()
baseline_header = MODEL_INFO + checkbox_iasd_group.value
baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_iasd_group.value)
data_component_iasd = gr.components.Dataframe(
value=baseline_value,
headers=baseline_header,
type="pandas",
datatype=baseline_datatype,
interactive=False,
visible=True,
)
def on_filter_model_size_method_change(selected_model_size, selected_question_type, selected_columns):
updated_data = get_all_iasd_df()
def custom_filter(row, model_size_filters, question_type_filters):
model_size = row['Model Size']
question_type = row['Question Type']
model_size = model_size.upper()
if model_size == '-':
size_filter = '-' in model_size_filters
elif 'B' in model_size:
size = float(model_size.replace('B', ''))
size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
else:
size_filter = False
question_type_filter = question_type in question_type_filters
return size_filter and question_type_filter
mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, question_type_filters=selected_question_type)
updated_data = updated_data[mask]
# columns:
selected_columns = [item for item in TASK_IASD_INFO if item in selected_columns]
present_columns = MODEL_INFO + selected_columns
updated_data = updated_data[present_columns]
updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
updated_headers = present_columns
update_datatype = [DATA_IASD_TITILE_TYPE[COLUMN_IASD_NAMES.index(x)] for x in updated_headers]
filter_component = gr.components.Dataframe(
value=updated_data,
headers=updated_headers,
type="pandas",
datatype=update_datatype,
interactive=False,
visible=True,
)
return filter_component
model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
checkbox_iasd_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
        # Tab 3: MM-IVQD benchmark table
        with gr.TabItem("🏅 MM-IVQD Benchmark", elem_id="mmivqd-benchmark-tab-table", id=3):
# selection for column part:
checkbox_ivqd_group = gr.CheckboxGroup(
choices=TASK_IVQD_INFO,
value=AVG_INFO,
label="Evaluation Dimension",
interactive=True,
) # user can select the evaluation dimension
with gr.Row():
# selection for model size part:
model_size = gr.CheckboxGroup(
choices=MODEL_SIZE,
value=MODEL_SIZE,
label="Model Size",
interactive=True,
)
                # selection for question type part:
question_type = gr.CheckboxGroup(
choices=QUESTION_TYPE,
value=QUESTION_TYPE,
label="Question Type",
interactive=True,
)
baseline_value = get_baseline_ivqd_df()
baseline_header = MODEL_INFO + checkbox_ivqd_group.value
baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_ivqd_group.value)
data_component_ivqd = gr.components.Dataframe(
value=baseline_value,
headers=baseline_header,
type="pandas",
datatype=baseline_datatype,
interactive=False,
visible=True,
)
def on_filter_model_size_method_change(selected_model_size, selected_question_type, selected_columns):
updated_data = get_all_ivqd_df()
def custom_filter(row, model_size_filters, question_type_filters):
model_size = row['Model Size']
question_type = row['Question Type']
model_size = model_size.upper()
if model_size == '-':
size_filter = '-' in model_size_filters
elif 'B' in model_size:
size = float(model_size.replace('B', ''))
size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
else:
size_filter = False
question_type_filter = question_type in question_type_filters
return size_filter and question_type_filter
mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, question_type_filters=selected_question_type)
updated_data = updated_data[mask]
selected_columns = [item for item in TASK_IVQD_INFO if item in selected_columns]
present_columns = MODEL_INFO + selected_columns
updated_data = updated_data[present_columns]
updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
updated_headers = present_columns
update_datatype = [DATA_IVQD_TITILE_TYPE[COLUMN_IVQD_NAMES.index(x)] for x in updated_headers]
filter_component = gr.components.Dataframe(
value=updated_data,
headers=updated_headers,
type="pandas",
datatype=update_datatype,
interactive=False,
visible=True,
)
return filter_component
model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
checkbox_ivqd_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
        # Tab 4: About
        with gr.TabItem("📝 About", elem_id="mmupd-benchmark-tab-table", id=4):
gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
        # Tab 5: Submission
        with gr.TabItem("🚀 Submit here! ", elem_id="mmupd-benchmark-tab-table", id=5):
with gr.Row():
gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model evaluation JSON file here!", elem_classes="markdown-text")
with gr.Row():
with gr.Column():
model_type = gr.Dropdown(
choices=["VLM", "LLM"],
label="Model type",
multiselect=False,
value="VLM",
interactive=True,
)
model_name_textbox = gr.Textbox(
label="Model name", placeholder="LLaMA-7B"
)
revision_name_textbox = gr.Textbox(
label="Revision Model Name", placeholder="LLaMA-7B"
)
model_link = gr.Textbox(
label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
)
model_size = gr.Textbox(
label="Model size", placeholder="7B(Input content format must be 'number+B' or '-', default is '-')"
)
with gr.Column():
LLM_type = gr.Dropdown(
choices=["Vicuna-1.5-7B", "Vicuna-1.5-13B", "Flan-T5-XL", "LLaMA-7B", "Llama-13B", "Llama-3-8B", "Llama-3-70B", "Yi-34B", "Mistral-7B", "Other"],
label="LLM type",
multiselect=False,
value="Vicuna-1.5-13B",
interactive=True,
)
LLM_name_textbox = gr.Textbox(
label="LLM model (Required for Other)",
placeholder="GPT-4",
)
upd_type = gr.Dropdown(
choices=[
"AAD",
"IASD",
"IVQD",
],
label="UPD type",
multiselect=False,
value="AAD",
interactive=True,
)
question_type = gr.Dropdown(
choices=QUESTION_TYPE,
label="Question Type",
multiselect=False,
value=QUESTION_TYPE[0],
interactive=True,
)
with gr.Column():
input_file = gr.components.File(label="Click to Upload a JSON File", file_count="single", type='binary')
submit_button = gr.Button("Submit Eval")
submission_result = gr.Markdown()
submit_button.click(
add_new_eval,
inputs = [
input_file,
model_type,
model_name_textbox,
revision_name_textbox,
model_link,
model_size,
upd_type,
LLM_type,
LLM_name_textbox,
question_type
],
)
def refresh_data():
value1 = get_baseline_aad_df()
value2 = get_baseline_iasd_df()
value3 = get_baseline_ivqd_df()
return value1, value2, value3
with gr.Row():
data_run = gr.Button("Refresh")
data_run.click(
refresh_data, outputs=[data_component_aad, data_component_iasd, data_component_ivqd]
)
with gr.Accordion("Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
elem_id="citation-button",
show_copy_button=True,
)
block.launch()