__all__ = ['block']
import json
import os
import re
from collections import defaultdict

import gradio as gr
import numpy as np
import pandas as pd
from huggingface_hub import Repository

from constants import *

TOKEN = os.environ.get("TOKEN")
repo = Repository(local_dir="./download_from_dataset", clone_from="MM-UPD/results_for_leaderboard", repo_type="dataset", use_auth_token=TOKEN)
current_directory = os.getcwd()

def validate_model_size(s):
    # Accept sizes of the form "<digits>B" or "-"; anything else falls back to "-".
    pattern = r'^\d+B$|^-$'
    if re.match(pattern, s):
        return s
    else:
        return '-'
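
# For example (the regex is case-sensitive, so a lowercase "b" is rejected):
#   validate_model_size("7B")  -> "7B"
#   validate_model_size("7b")  -> "-"
#   validate_model_size("big") -> "-"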


def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths


def create_df(input_file):
    # input_file is the raw bytes of an uploaded JSON file.
    json_string = input_file.decode('utf-8')
    data = json.loads(json_string)
    df = pd.DataFrame(data)
    return df
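
# A minimal sketch of the record format this app appears to expect, inferred from
# the columns consumed below (the "category" value shown is purely illustrative):
#   [{"split": "test", "category": "attribute", "D_upd": "",
#     "prediction_standard": "A", "hit_standard": 1,
#     "prediction_upd": "F", "hit_upd": 1, "hit": 1}, ...]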


# Accuracy report: mean accuracy overall or per category for the chosen metric.
def report_acc(df, group='category', metric_type="dual"):
    assert 'split' in df
    assert group in [None, 'category', 'l2-category']

    res = defaultdict(list)
    res['split'] = ['test']

    if group is None:
        if metric_type == "dual":
            res['overall'] = [np.mean(df['hit'])]
        elif metric_type == "standard":
            res['overall'] = [np.mean(df['hit_standard'])]
        elif metric_type == "upd":
            res['overall'] = [np.mean(df['hit_upd'])]
        return pd.DataFrame(res)
    elif group in df:
        abilities = list(set(df[group]))
        abilities.sort()
        for ab in abilities:
            sub_df = df[df[group] == ab]
            if metric_type == "dual":
                res[ab] = [np.mean(sub_df['hit'])]
            elif metric_type == "standard":
                res[ab] = [np.mean(sub_df['hit_standard'])]
            elif metric_type == "upd":
                res[ab] = [np.mean(sub_df['hit_upd'])]
        return pd.DataFrame(res)
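
# For example, report_acc(df, None, "dual") returns a one-row DataFrame such as
#   split  overall
#   test   0.873
# (the value is illustrative), while report_acc(df, 'category', "dual") produces
# one column per category instead of 'overall'.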


def eval_result_dual(data_main, metric_type="dual"):
    overall = report_acc(data_main, None, metric_type)
    leaf = report_acc(data_main, 'category', metric_type)
    # Convert to percentages rounded to one decimal place; drop the 'split'
    # column before flattening the per-category scores.
    overall = round(overall['overall'].values[0] * 100, 1)
    leaf = leaf.iloc[:, 1:].values.flatten().tolist()
    leaf = [round(x * 100, 1) for x in leaf]
    return overall, leaf
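
# A hedged usage sketch (df is a DataFrame built by create_df above):
#   overall, leaf = eval_result_dual(df)               # dual accuracy
#   overall_std, _ = eval_result_dual(df, "standard")  # standard accuracy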


def calculate_score(input_file):
    dual_df = create_df(input_file)
    overall_dual, leaf_dual = eval_result_dual(dual_df)
    overall_standard, leaf_standard = eval_result_dual(dual_df, metric_type="standard")
    overall_upd, leaf_upd = eval_result_dual(dual_df, metric_type="upd")
    return overall_dual, overall_standard, overall_upd, leaf_dual


# Add the new submission's per-question results into the queue file.
def add_queue(base_df, input_file, model_name):
    dual_df = create_df(input_file)
    base_df[f"{model_name}_prediction_standard"] = dual_df["prediction_standard"]
    base_df[f"{model_name}_hit_standard"] = dual_df["hit_standard"]
    base_df[f"{model_name}_prediction_upd"] = dual_df["prediction_upd"]
    base_df[f"{model_name}_hit_upd"] = dual_df["hit_upd"]
    base_df[f"{model_name}_hit"] = dual_df["hit"]
    return base_df
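
# For example, add_queue(base, file_bytes, "MyModel") adds columns such as
# "MyModel_prediction_standard" and "MyModel_hit_upd" to the queue DataFrame
# ("MyModel" is an illustrative model name).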


# Check whether the uploaded file is valid for the chosen UPD type and question type.
def validity_check(input_file, UPD_type, question_type):
    input_df = create_df(input_file)

    # check for the correct data size
    data_num_dict = {"AAD": 820, "IASD": 919, "IVQD": 356}
    print("len(input)", len(input_df))
    print("data_num_dict[UPD_type]", data_num_dict[UPD_type])
    assert len(input_df) == data_num_dict[UPD_type], "Different Data Size"

    # check for missing columns
    column_list = ["hit_upd", "hit_standard", "hit", "prediction_upd", "prediction_standard"]
    assert all(x in input_df.columns for x in column_list), "Column Missing"

    # check for missing values
    assert not input_df[column_list].isnull().any().any(), "Missing values found in columns"

    # check for the presence of the correct values
    option_mapping = {"AAD": "None of the above", "IASD": "None of the above", "IVQD": "The image and question are irrelevant."}
    instruction_mapping = {"AAD": "F. None of the above", "IASD": "F. None of the above", "IVQD": "F. The image and question are irrelevant."}
    input_df["D_upd"] = input_df["D_upd"].fillna("")

    if question_type == "Base":
        assert not input_df["D_upd"].str.contains(option_mapping[UPD_type]).any(), f"{option_mapping[UPD_type]} found in Base"
        assert not input_df["prediction_upd"].str.contains(instruction_mapping[UPD_type]).any(), f"{instruction_mapping[UPD_type]} found in Base"
    elif question_type == "Option":
        assert input_df["D_upd"].str.contains(option_mapping[UPD_type]).any(), f"{option_mapping[UPD_type]} not found in Option"
        assert not input_df["prediction_upd"].str.contains(instruction_mapping[UPD_type]).any(), f"{instruction_mapping[UPD_type]} found in Option"
    elif question_type == "Instruction":
        assert not input_df["D_upd"].str.contains(option_mapping[UPD_type]).any(), f"{option_mapping[UPD_type]} found in Instruction"

    return True
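
# A hedged usage sketch (the filename is illustrative only):
#   with open("results_aad_base.json", "rb") as f:
#       validity_check(f.read(), "AAD", "Base")  # raises AssertionError on bad input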


def add_new_eval(
    input_file,
    model_type: str,
    model_name_textbox: str,
    revision_name_textbox: str,
    model_link: str,
    model_size: str,
    upd_type: str,
    LLM_type: str,
    LLM_name_textbox: str,
    question_type: str
):
    if input_file is None:
        warning_text = "Error! Empty file!"
        print(warning_text)
        return warning_text
    else:
        model_size = validate_model_size(model_size)
        if upd_type == 'AAD':
            csv_path = CSV_AAD_RESULT_PATH
        elif upd_type == 'IASD':
            csv_path = CSV_IASD_RESULT_PATH
        elif upd_type == 'IVQD':
            csv_path = CSV_IVQD_RESULT_PATH
        validity_check(input_file, upd_type, question_type)

        csv_data = pd.read_csv(csv_path)
        overall_dual_acc, overall_standard_acc, overall_upd_acc, leaf_dual = calculate_score(input_file)

        if LLM_type == 'Other':
            LLM_name = LLM_name_textbox
        else:
            LLM_name = LLM_type

        if revision_name_textbox == '':
            col = csv_data.shape[0]
            model_name = model_name_textbox
        else:
            model_name = revision_name_textbox
            model_name_list = csv_data['Model']
            # Strip the markdown link wrapper, e.g. "[name](url)" -> "name".
            name_list = [name.split(']')[0][1:] for name in model_name_list]
            if revision_name_textbox not in name_list:
                col = csv_data.shape[0]
            else:
                col = name_list.index(revision_name_textbox)
        if model_link != '':
            model_name = '[' + model_name + '](' + model_link + ')'

        # add new data
        new_data = [
            model_type,
            model_name,
            LLM_name,
            model_size,
            question_type,
            overall_dual_acc,
            overall_standard_acc,
            overall_upd_acc,
        ]
        new_data += leaf_dual

        # If the same data already exists, return an error.
        if new_data in csv_data.values.tolist():
            warning_text = "Error! The same data already exists!"
            print(warning_text)
            return warning_text
        # If the same model entry (first five fields) already exists, return an error.
        elif new_data[:5] in [row[:5] for row in csv_data.values.tolist()]:
            warning_text = "Error! The same data already exists! Please fill revision_name."
            print(warning_text)
            return warning_text

        csv_data.loc[col] = new_data
        csv_data.to_csv(csv_path, index=False)

        absolute_result_path = os.path.abspath(csv_path)
        if not os.path.exists(absolute_result_path):
            raise FileNotFoundError(f"File {absolute_result_path} not found")
        repo.git_pull()
        repo.git_add(absolute_result_path)

        csv_queue_path = os.path.join(CSV_QUEUE_DIR, f"detail_results_{upd_type.lower()}_{question_type.lower()}.csv")
        base_data = pd.read_csv(csv_queue_path)
        base_data = add_queue(base_data, input_file, model_name)
        base_data.to_csv(csv_queue_path, index=False)

        absolute_queue_path = os.path.abspath(csv_queue_path)
        if not os.path.exists(absolute_queue_path):
            raise FileNotFoundError(f"File {absolute_queue_path} not found")
        repo.git_add(absolute_queue_path)
        repo.git_commit(f"add {model_name} results in {question_type}")
        repo.git_push()
        return "Success! Your results have been added to the leaderboard."


def get_baseline_aad_df():
    repo.git_pull()
    df = pd.read_csv(CSV_AAD_RESULT_PATH)
    df = df.sort_values(by="Overall Dual Acc.", ascending=False)
    present_columns = MODEL_INFO + checkbox_aad_group.value
    df = df[present_columns]
    return df


def get_all_aad_df():
    repo.git_pull()
    df = pd.read_csv(CSV_AAD_RESULT_PATH)
    df = df.sort_values(by="Overall Dual Acc.", ascending=False)
    return df


def get_baseline_iasd_df():
    repo.git_pull()
    df = pd.read_csv(CSV_IASD_RESULT_PATH)
    df = df.sort_values(by="Overall Dual Acc.", ascending=False)
    present_columns = MODEL_INFO + checkbox_iasd_group.value
    df = df[present_columns]
    return df


def get_all_iasd_df():
    repo.git_pull()
    df = pd.read_csv(CSV_IASD_RESULT_PATH)
    df = df.sort_values(by="Overall Dual Acc.", ascending=False)
    return df


def get_baseline_ivqd_df():
    repo.git_pull()
    df = pd.read_csv(CSV_IVQD_RESULT_PATH)
    df = df.sort_values(by="Overall Dual Acc.", ascending=False)
    present_columns = MODEL_INFO + checkbox_ivqd_group.value
    df = df[present_columns]
    return df


def get_all_ivqd_df():
    repo.git_pull()
    df = pd.read_csv(CSV_IVQD_RESULT_PATH)
    df = df.sort_values(by="Overall Dual Acc.", ascending=False)
    return df


block = gr.Blocks()

with block:
    gr.Markdown(
        LEADERBORAD_INTRODUCTION
    )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Table 1: MM-AAD benchmark
        with gr.TabItem("MM-AAD Benchmark", elem_id="mmaad-benchmark-tab-table", id=1):
            # selection for column part:
            checkbox_aad_group = gr.CheckboxGroup(
                choices=TASK_AAD_INFO,
                value=AVG_INFO,
                label="Evaluation Dimension",
                interactive=True,
            )  # user can select the evaluation dimension

            with gr.Row():
                # selection for model size part:
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label="Model Size",
                    interactive=True,
                )
                # selection for question type part:
                question_type = gr.CheckboxGroup(
                    choices=QUESTION_TYPE,
                    value=QUESTION_TYPE,
                    label="Question Type",
                    interactive=True,
                )

            baseline_value = get_baseline_aad_df()
            baseline_header = MODEL_INFO + checkbox_aad_group.value
            baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_aad_group.value)

            data_component_aad = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            def on_filter_model_size_method_change(selected_model_size, selected_question_type, selected_columns):
                updated_data = get_all_aad_df()

                # model_size & question_type:
                def custom_filter(row, model_size_filters, question_type_filters):
                    model_size = row['Model Size']
                    question_type = row['Question Type']
                    model_size = model_size.upper()
                    if model_size == '-':
                        size_filter = '-' in model_size_filters
                    elif 'B' in model_size:
                        size = float(model_size.replace('B', ''))
                        size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
                    else:
                        size_filter = False
                    question_type_filter = question_type in question_type_filters
                    return size_filter and question_type_filter
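
                # For example, a row with Model Size "34B" passes only when ">=10B"
                # is among the selected size filters, "7B" needs "<10B", and "-"
                # (unknown size) needs "-".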
                mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, question_type_filters=selected_question_type)
                updated_data = updated_data[mask]

                # columns:
                selected_columns = [item for item in TASK_AAD_INFO if item in selected_columns]
                present_columns = MODEL_INFO + selected_columns
                updated_data = updated_data[present_columns]
                updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
                updated_headers = present_columns
                update_datatype = [DATA_AAD_TITILE_TYPE[COLUMN_AAD_NAMES.index(x)] for x in updated_headers]

                filter_component = gr.components.Dataframe(
                    value=updated_data,
                    headers=updated_headers,
                    type="pandas",
                    datatype=update_datatype,
                    interactive=False,
                    visible=True,
                )
                return filter_component

            model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
            question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
            checkbox_aad_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)

        # Table 2: MM-IASD benchmark
        with gr.TabItem("MM-IASD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=2):
            # selection for column part:
            checkbox_iasd_group = gr.CheckboxGroup(
                choices=TASK_IASD_INFO,
                value=AVG_INFO,
                label="Evaluation Dimension",
                interactive=True,
            )  # user can select the evaluation dimension

            with gr.Row():
                # selection for model size part:
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label="Model Size",
                    interactive=True,
                )
                # selection for question type part:
                question_type = gr.CheckboxGroup(
                    choices=QUESTION_TYPE,
                    value=QUESTION_TYPE,
                    label="Question Type",
                    interactive=True,
                )

            baseline_value = get_baseline_iasd_df()
            baseline_header = MODEL_INFO + checkbox_iasd_group.value
            baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_iasd_group.value)

            data_component_iasd = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            def on_filter_model_size_method_change(selected_model_size, selected_question_type, selected_columns):
                updated_data = get_all_iasd_df()

                def custom_filter(row, model_size_filters, question_type_filters):
                    model_size = row['Model Size']
                    question_type = row['Question Type']
                    model_size = model_size.upper()
                    if model_size == '-':
                        size_filter = '-' in model_size_filters
                    elif 'B' in model_size:
                        size = float(model_size.replace('B', ''))
                        size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
                    else:
                        size_filter = False
                    question_type_filter = question_type in question_type_filters
                    return size_filter and question_type_filter

                mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, question_type_filters=selected_question_type)
                updated_data = updated_data[mask]

                # columns:
                selected_columns = [item for item in TASK_IASD_INFO if item in selected_columns]
                present_columns = MODEL_INFO + selected_columns
                updated_data = updated_data[present_columns]
                updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
                updated_headers = present_columns
                update_datatype = [DATA_IASD_TITILE_TYPE[COLUMN_IASD_NAMES.index(x)] for x in updated_headers]

                filter_component = gr.components.Dataframe(
                    value=updated_data,
                    headers=updated_headers,
                    type="pandas",
                    datatype=update_datatype,
                    interactive=False,
                    visible=True,
                )
                return filter_component

            model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
            question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
            checkbox_iasd_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)

        # Table 3: MM-IVQD benchmark
        with gr.TabItem("MM-IVQD Benchmark", elem_id="mmivqd-benchmark-tab-table", id=3):
            # selection for column part:
            checkbox_ivqd_group = gr.CheckboxGroup(
                choices=TASK_IVQD_INFO,
                value=AVG_INFO,
                label="Evaluation Dimension",
                interactive=True,
            )  # user can select the evaluation dimension

            with gr.Row():
                # selection for model size part:
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label="Model Size",
                    interactive=True,
                )
                # selection for question type part:
                question_type = gr.CheckboxGroup(
                    choices=QUESTION_TYPE,
                    value=QUESTION_TYPE,
                    label="Question Type",
                    interactive=True,
                )

            baseline_value = get_baseline_ivqd_df()
            baseline_header = MODEL_INFO + checkbox_ivqd_group.value
            baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_ivqd_group.value)

            data_component_ivqd = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            def on_filter_model_size_method_change(selected_model_size, selected_question_type, selected_columns):
                updated_data = get_all_ivqd_df()

                def custom_filter(row, model_size_filters, question_type_filters):
                    model_size = row['Model Size']
                    question_type = row['Question Type']
                    model_size = model_size.upper()
                    if model_size == '-':
                        size_filter = '-' in model_size_filters
                    elif 'B' in model_size:
                        size = float(model_size.replace('B', ''))
                        size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
                    else:
                        size_filter = False
                    question_type_filter = question_type in question_type_filters
                    return size_filter and question_type_filter

                mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, question_type_filters=selected_question_type)
                updated_data = updated_data[mask]

                # columns:
                selected_columns = [item for item in TASK_IVQD_INFO if item in selected_columns]
                present_columns = MODEL_INFO + selected_columns
                updated_data = updated_data[present_columns]
                updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
                updated_headers = present_columns
                update_datatype = [DATA_IVQD_TITILE_TYPE[COLUMN_IVQD_NAMES.index(x)] for x in updated_headers]

                filter_component = gr.components.Dataframe(
                    value=updated_data,
                    headers=updated_headers,
                    type="pandas",
                    datatype=update_datatype,
                    interactive=False,
                    visible=True,
                )
                return filter_component

            model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
            question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
            checkbox_ivqd_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)

        # Table 4: about
        with gr.TabItem("About", elem_id="mmupd-benchmark-tab-table", id=4):
            gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")

        # Table 5: submission
        with gr.TabItem("Submit here!", elem_id="mmupd-benchmark-tab-table", id=5):
            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
            with gr.Row():
                gr.Markdown("# Submit your model evaluation JSON file here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_type = gr.Dropdown(
                        choices=["VLM", "LLM"],
                        label="Model type",
                        multiselect=False,
                        value="VLM",
                        interactive=True,
                    )
                    model_name_textbox = gr.Textbox(
                        label="Model name", placeholder="LLaMA-7B"
                    )
                    revision_name_textbox = gr.Textbox(
                        label="Revision Model Name", placeholder="LLaMA-7B"
                    )
                    model_link = gr.Textbox(
                        label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
                    )
                    model_size = gr.Textbox(
                        label="Model size", placeholder="7B (format: '<number>B' or '-'; default is '-')"
                    )

                with gr.Column():
                    LLM_type = gr.Dropdown(
                        choices=["Vicuna-1.5-7B", "Vicuna-1.5-13B", "Flan-T5-XL", "LLaMA-7B", "Llama-13B", "Llama-3-8B", "Llama-3-70B", "Yi-34B", "Mistral-7B", "Other"],
                        label="LLM type",
                        multiselect=False,
                        value="Vicuna-1.5-13B",
                        interactive=True,
                    )
                    LLM_name_textbox = gr.Textbox(
                        label="LLM model (required if LLM type is Other)",
                        placeholder="GPT-4",
                    )
                    upd_type = gr.Dropdown(
                        choices=[
                            "AAD",
                            "IASD",
                            "IVQD",
                        ],
                        label="UPD type",
                        multiselect=False,
                        value="AAD",
                        interactive=True,
                    )
                    question_type = gr.Dropdown(
                        choices=QUESTION_TYPE,
                        label="Question Type",
                        multiselect=False,
                        value=QUESTION_TYPE[0],
                        interactive=True,
                    )

                with gr.Column():
                    input_file = gr.components.File(label="Click to Upload a JSON File", file_count="single", type='binary')
                    submit_button = gr.Button("Submit Eval")
                    submission_result = gr.Markdown()

            submit_button.click(
                add_new_eval,
                inputs=[
                    input_file,
                    model_type,
                    model_name_textbox,
                    revision_name_textbox,
                    model_link,
                    model_size,
                    upd_type,
                    LLM_type,
                    LLM_name_textbox,
                    question_type
                ],
                outputs=submission_result,
            )

    def refresh_data():
        value1 = get_baseline_aad_df()
        value2 = get_baseline_iasd_df()
        value3 = get_baseline_ivqd_df()
        return value1, value2, value3

    with gr.Row():
        data_run = gr.Button("Refresh")
        data_run.click(
            refresh_data, outputs=[data_component_aad, data_component_iasd, data_component_ivqd]
        )

    with gr.Accordion("Citation", open=False):
        citation_button = gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            elem_id="citation-button",
            show_copy_button=True,
        )

block.launch()