__all__ = ['block']
import json
import os
import re
from collections import defaultdict

import gradio as gr
import numpy as np
import pandas as pd
from huggingface_hub import Repository

from constants import *

TOKEN = os.environ.get("TOKEN")
repo = Repository(local_dir="./download_from_dataset", clone_from="MM-UPD/results_for_leaderboard", repo_type="dataset", use_auth_token=TOKEN)
current_directory = os.getcwd()

def validate_model_size(s):
    # Accept sizes of the form "<digits>B" or "-"; anything else falls back to "-".
    pattern = r'^\d+B$|^-$'
    if re.match(pattern, s):
        return s
    else:
        return '-'
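
# For example (the regex is case-sensitive, so a lowercase "b" is rejected):
#   validate_model_size("7B")  -> "7B"
#   validate_model_size("7b")  -> "-"
#   validate_model_size("big") -> "-"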


def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths


def create_df(input_file):
    # input_file is the raw bytes of an uploaded JSON file.
    json_string = input_file.decode('utf-8')
    data = json.loads(json_string)
    df = pd.DataFrame(data)
    return df
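
# A minimal sketch of the record format this app appears to expect, inferred from
# the columns consumed below (the "category" value shown is purely illustrative):
#   [{"split": "test", "category": "attribute", "D_upd": "",
#     "prediction_standard": "A", "hit_standard": 1,
#     "prediction_upd": "F", "hit_upd": 1, "hit": 1}, ...]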


# Accuracy report: mean accuracy overall or per category for the chosen metric.
def report_acc(df, group='category', metric_type="dual"):
    assert 'split' in df
    assert group in [None, 'category', 'l2-category']

    res = defaultdict(list)
    res['split'] = ['test']

    if group is None:
        if metric_type == "dual":
            res['overall'] = [np.mean(df['hit'])]
        elif metric_type == "standard":
            res['overall'] = [np.mean(df['hit_standard'])]
        elif metric_type == "upd":
            res['overall'] = [np.mean(df['hit_upd'])]
        return pd.DataFrame(res)
    elif group in df:
        abilities = list(set(df[group]))
        abilities.sort()
        for ab in abilities:
            sub_df = df[df[group] == ab]
            if metric_type == "dual":
                res[ab] = [np.mean(sub_df['hit'])]
            elif metric_type == "standard":
                res[ab] = [np.mean(sub_df['hit_standard'])]
            elif metric_type == "upd":
                res[ab] = [np.mean(sub_df['hit_upd'])]
        return pd.DataFrame(res)
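
# For example, report_acc(df, None, "dual") returns a one-row DataFrame such as
#   split  overall
#   test   0.873
# (the value is illustrative), while report_acc(df, 'category', "dual") produces
# one column per category instead of 'overall'.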


def eval_result_dual(data_main, metric_type="dual"):
    overall = report_acc(data_main, None, metric_type)
    leaf = report_acc(data_main, 'category', metric_type)
    # Convert to percentages rounded to one decimal place; drop the 'split'
    # column before flattening the per-category scores.
    overall = round(overall['overall'].values[0] * 100, 1)
    leaf = leaf.iloc[:, 1:].values.flatten().tolist()
    leaf = [round(x * 100, 1) for x in leaf]
    return overall, leaf
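
# A hedged usage sketch (df is a DataFrame built by create_df above):
#   overall, leaf = eval_result_dual(df)               # dual accuracy
#   overall_std, _ = eval_result_dual(df, "standard")  # standard accuracy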


def calculate_score(input_file):
    dual_df = create_df(input_file)
    overall_dual, leaf_dual = eval_result_dual(dual_df)
    overall_standard, leaf_standard = eval_result_dual(dual_df, metric_type="standard")
    overall_upd, leaf_upd = eval_result_dual(dual_df, metric_type="upd")
    return overall_dual, overall_standard, overall_upd, leaf_dual


# Add the new submission's per-question results into the queue file.
def add_queue(base_df, input_file, model_name):
    dual_df = create_df(input_file)
    base_df[f"{model_name}_prediction_standard"] = dual_df["prediction_standard"]
    base_df[f"{model_name}_hit_standard"] = dual_df["hit_standard"]
    base_df[f"{model_name}_prediction_upd"] = dual_df["prediction_upd"]
    base_df[f"{model_name}_hit_upd"] = dual_df["hit_upd"]
    base_df[f"{model_name}_hit"] = dual_df["hit"]
    return base_df
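
# For example, add_queue(base, file_bytes, "MyModel") adds columns such as
# "MyModel_prediction_standard" and "MyModel_hit_upd" to the queue DataFrame
# ("MyModel" is an illustrative model name).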


# Check whether the uploaded file is valid for the chosen UPD type and question type.
def validity_check(input_file, UPD_type, question_type):
    input_df = create_df(input_file)

    # check for the correct data size
    data_num_dict = {"AAD": 820, "IASD": 919, "IVQD": 356}
    print("len(input)", len(input_df))
    print("data_num_dict[UPD_type]", data_num_dict[UPD_type])
    assert len(input_df) == data_num_dict[UPD_type], "Different Data Size"

    # check for missing columns
    column_list = ["hit_upd", "hit_standard", "hit", "prediction_upd", "prediction_standard"]
    assert all(x in input_df.columns for x in column_list), "Column Missing"

    # check for missing values
    assert not input_df[column_list].isnull().any().any(), "Missing values found in columns"

    # check for the presence of the correct values
    option_mapping = {"AAD": "None of the above", "IASD": "None of the above", "IVQD": "The image and question are irrelevant."}
    instruction_mapping = {"AAD": "F. None of the above", "IASD": "F. None of the above", "IVQD": "F. The image and question are irrelevant."}
    input_df["D_upd"] = input_df["D_upd"].fillna("")

    if question_type == "Base":
        assert not input_df["D_upd"].str.contains(option_mapping[UPD_type]).any(), f"{option_mapping[UPD_type]} found in Base"
        assert not input_df["prediction_upd"].str.contains(instruction_mapping[UPD_type]).any(), f"{instruction_mapping[UPD_type]} found in Base"
    elif question_type == "Option":
        assert input_df["D_upd"].str.contains(option_mapping[UPD_type]).any(), f"{option_mapping[UPD_type]} not found in Option"
        assert not input_df["prediction_upd"].str.contains(instruction_mapping[UPD_type]).any(), f"{instruction_mapping[UPD_type]} found in Option"
    elif question_type == "Instruction":
        assert not input_df["D_upd"].str.contains(option_mapping[UPD_type]).any(), f"{option_mapping[UPD_type]} found in Instruction"

    return True
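
# A hedged usage sketch (the filename is illustrative only):
#   with open("results_aad_base.json", "rb") as f:
#       validity_check(f.read(), "AAD", "Base")  # raises AssertionError on bad input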


def add_new_eval(
    input_file,
    model_type: str,
    model_name_textbox: str,
    revision_name_textbox: str,
    model_link: str,
    model_size: str,
    upd_type: str,
    LLM_type: str,
    LLM_name_textbox: str,
    question_type: str
):
    if input_file is None:
        warning_text = "Error! Empty file!"
        print(warning_text)
        return warning_text
    else:
        model_size = validate_model_size(model_size)
        if upd_type == 'AAD':
            csv_path = CSV_AAD_RESULT_PATH
        elif upd_type == 'IASD':
            csv_path = CSV_IASD_RESULT_PATH
        elif upd_type == 'IVQD':
            csv_path = CSV_IVQD_RESULT_PATH
        validity_check(input_file, upd_type, question_type)

        csv_data = pd.read_csv(csv_path)
        overall_dual_acc, overall_standard_acc, overall_upd_acc, leaf_dual = calculate_score(input_file)

        if LLM_type == 'Other':
            LLM_name = LLM_name_textbox
        else:
            LLM_name = LLM_type

        if revision_name_textbox == '':
            col = csv_data.shape[0]
            model_name = model_name_textbox
        else:
            model_name = revision_name_textbox
            model_name_list = csv_data['Model']
            # Strip the markdown link wrapper, e.g. "[name](url)" -> "name".
            name_list = [name.split(']')[0][1:] for name in model_name_list]
            if revision_name_textbox not in name_list:
                col = csv_data.shape[0]
            else:
                col = name_list.index(revision_name_textbox)
        if model_link != '':
            model_name = '[' + model_name + '](' + model_link + ')'

        # add new data
        new_data = [
            model_type,
            model_name,
            LLM_name,
            model_size,
            question_type,
            overall_dual_acc,
            overall_standard_acc,
            overall_upd_acc,
        ]
        new_data += leaf_dual

        # If the same data already exists, return an error.
        if new_data in csv_data.values.tolist():
            warning_text = "Error! The same data already exists!"
            print(warning_text)
            return warning_text
        # If the same model entry (first five fields) already exists, return an error.
        elif new_data[:5] in [row[:5] for row in csv_data.values.tolist()]:
            warning_text = "Error! The same data already exists! Please fill revision_name."
            print(warning_text)
            return warning_text

        csv_data.loc[col] = new_data
        csv_data.to_csv(csv_path, index=False)

        absolute_result_path = os.path.abspath(csv_path)
        if not os.path.exists(absolute_result_path):
            raise FileNotFoundError(f"File {absolute_result_path} not found")
        repo.git_pull()
        repo.git_add(absolute_result_path)

        csv_queue_path = os.path.join(CSV_QUEUE_DIR, f"detail_results_{upd_type.lower()}_{question_type.lower()}.csv")
        base_data = pd.read_csv(csv_queue_path)
        base_data = add_queue(base_data, input_file, model_name)
        base_data.to_csv(csv_queue_path, index=False)

        absolute_queue_path = os.path.abspath(csv_queue_path)
        if not os.path.exists(absolute_queue_path):
            raise FileNotFoundError(f"File {absolute_queue_path} not found")
        repo.git_add(absolute_queue_path)
        repo.git_commit(f"add {model_name} results in {question_type}")
        repo.git_push()
        return "Success! Your results have been added to the leaderboard."


def get_baseline_aad_df():
    repo.git_pull()
    df = pd.read_csv(CSV_AAD_RESULT_PATH)
    df = df.sort_values(by="Overall Dual Acc.", ascending=False)
    present_columns = MODEL_INFO + checkbox_aad_group.value
    df = df[present_columns]
    return df


def get_all_aad_df():
    repo.git_pull()
    df = pd.read_csv(CSV_AAD_RESULT_PATH)
    df = df.sort_values(by="Overall Dual Acc.", ascending=False)
    return df


def get_baseline_iasd_df():
    repo.git_pull()
    df = pd.read_csv(CSV_IASD_RESULT_PATH)
    df = df.sort_values(by="Overall Dual Acc.", ascending=False)
    present_columns = MODEL_INFO + checkbox_iasd_group.value
    df = df[present_columns]
    return df


def get_all_iasd_df():
    repo.git_pull()
    df = pd.read_csv(CSV_IASD_RESULT_PATH)
    df = df.sort_values(by="Overall Dual Acc.", ascending=False)
    return df


def get_baseline_ivqd_df():
    repo.git_pull()
    df = pd.read_csv(CSV_IVQD_RESULT_PATH)
    df = df.sort_values(by="Overall Dual Acc.", ascending=False)
    present_columns = MODEL_INFO + checkbox_ivqd_group.value
    df = df[present_columns]
    return df


def get_all_ivqd_df():
    repo.git_pull()
    df = pd.read_csv(CSV_IVQD_RESULT_PATH)
    df = df.sort_values(by="Overall Dual Acc.", ascending=False)
    return df


block = gr.Blocks()

with block:
    gr.Markdown(
        LEADERBORAD_INTRODUCTION
    )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Table 1: MM-AAD benchmark
        with gr.TabItem("MM-AAD Benchmark", elem_id="mmaad-benchmark-tab-table", id=1):
            # selection for column part:
            checkbox_aad_group = gr.CheckboxGroup(
                choices=TASK_AAD_INFO,
                value=AVG_INFO,
                label="Evaluation Dimension",
                interactive=True,
            )  # user can select the evaluation dimension

            with gr.Row():
                # selection for model size part:
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label="Model Size",
                    interactive=True,
                )
                # selection for question type part:
                question_type = gr.CheckboxGroup(
                    choices=QUESTION_TYPE,
                    value=QUESTION_TYPE,
                    label="Question Type",
                    interactive=True,
                )

            baseline_value = get_baseline_aad_df()
            baseline_header = MODEL_INFO + checkbox_aad_group.value
            baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_aad_group.value)

            data_component_aad = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            def on_filter_model_size_method_change(selected_model_size, selected_question_type, selected_columns):
                updated_data = get_all_aad_df()

                # model_size & question_type:
                def custom_filter(row, model_size_filters, question_type_filters):
                    model_size = row['Model Size']
                    question_type = row['Question Type']
                    model_size = model_size.upper()
                    if model_size == '-':
                        size_filter = '-' in model_size_filters
                    elif 'B' in model_size:
                        size = float(model_size.replace('B', ''))
                        size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
                    else:
                        size_filter = False
                    question_type_filter = question_type in question_type_filters
                    return size_filter and question_type_filter
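
                # For example, a row with Model Size "34B" passes only when ">=10B"
                # is among the selected size filters, "7B" needs "<10B", and "-"
                # (unknown size) needs "-".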
                mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, question_type_filters=selected_question_type)
                updated_data = updated_data[mask]

                # columns:
                selected_columns = [item for item in TASK_AAD_INFO if item in selected_columns]
                present_columns = MODEL_INFO + selected_columns
                updated_data = updated_data[present_columns]
                updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
                updated_headers = present_columns
                update_datatype = [DATA_AAD_TITILE_TYPE[COLUMN_AAD_NAMES.index(x)] for x in updated_headers]

                filter_component = gr.components.Dataframe(
                    value=updated_data,
                    headers=updated_headers,
                    type="pandas",
                    datatype=update_datatype,
                    interactive=False,
                    visible=True,
                )
                return filter_component

            model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
            question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
            checkbox_aad_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)

        # Table 2: MM-IASD benchmark
        with gr.TabItem("MM-IASD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=2):
            # selection for column part:
            checkbox_iasd_group = gr.CheckboxGroup(
                choices=TASK_IASD_INFO,
                value=AVG_INFO,
                label="Evaluation Dimension",
                interactive=True,
            )  # user can select the evaluation dimension

            with gr.Row():
                # selection for model size part:
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label="Model Size",
                    interactive=True,
                )
                # selection for question type part:
                question_type = gr.CheckboxGroup(
                    choices=QUESTION_TYPE,
                    value=QUESTION_TYPE,
                    label="Question Type",
                    interactive=True,
                )

            baseline_value = get_baseline_iasd_df()
            baseline_header = MODEL_INFO + checkbox_iasd_group.value
            baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_iasd_group.value)

            data_component_iasd = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            def on_filter_model_size_method_change(selected_model_size, selected_question_type, selected_columns):
                updated_data = get_all_iasd_df()

                def custom_filter(row, model_size_filters, question_type_filters):
                    model_size = row['Model Size']
                    question_type = row['Question Type']
                    model_size = model_size.upper()
                    if model_size == '-':
                        size_filter = '-' in model_size_filters
                    elif 'B' in model_size:
                        size = float(model_size.replace('B', ''))
                        size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
                    else:
                        size_filter = False
                    question_type_filter = question_type in question_type_filters
                    return size_filter and question_type_filter

                mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, question_type_filters=selected_question_type)
                updated_data = updated_data[mask]

                # columns:
                selected_columns = [item for item in TASK_IASD_INFO if item in selected_columns]
                present_columns = MODEL_INFO + selected_columns
                updated_data = updated_data[present_columns]
                updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
                updated_headers = present_columns
                update_datatype = [DATA_IASD_TITILE_TYPE[COLUMN_IASD_NAMES.index(x)] for x in updated_headers]

                filter_component = gr.components.Dataframe(
                    value=updated_data,
                    headers=updated_headers,
                    type="pandas",
                    datatype=update_datatype,
                    interactive=False,
                    visible=True,
                )
                return filter_component

            model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
            question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
            checkbox_iasd_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)

        # Table 3: MM-IVQD benchmark
        with gr.TabItem("MM-IVQD Benchmark", elem_id="mmivqd-benchmark-tab-table", id=3):
            # selection for column part:
            checkbox_ivqd_group = gr.CheckboxGroup(
                choices=TASK_IVQD_INFO,
                value=AVG_INFO,
                label="Evaluation Dimension",
                interactive=True,
            )  # user can select the evaluation dimension

            with gr.Row():
                # selection for model size part:
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label="Model Size",
                    interactive=True,
                )
                # selection for question type part:
                question_type = gr.CheckboxGroup(
                    choices=QUESTION_TYPE,
                    value=QUESTION_TYPE,
                    label="Question Type",
                    interactive=True,
                )

            baseline_value = get_baseline_ivqd_df()
            baseline_header = MODEL_INFO + checkbox_ivqd_group.value
            baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_ivqd_group.value)

            data_component_ivqd = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            def on_filter_model_size_method_change(selected_model_size, selected_question_type, selected_columns):
                updated_data = get_all_ivqd_df()

                def custom_filter(row, model_size_filters, question_type_filters):
                    model_size = row['Model Size']
                    question_type = row['Question Type']
                    model_size = model_size.upper()
                    if model_size == '-':
                        size_filter = '-' in model_size_filters
                    elif 'B' in model_size:
                        size = float(model_size.replace('B', ''))
                        size_filter = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
                    else:
                        size_filter = False
                    question_type_filter = question_type in question_type_filters
                    return size_filter and question_type_filter

                mask = updated_data.apply(custom_filter, axis=1, model_size_filters=selected_model_size, question_type_filters=selected_question_type)
                updated_data = updated_data[mask]

                # columns:
                selected_columns = [item for item in TASK_IVQD_INFO if item in selected_columns]
                present_columns = MODEL_INFO + selected_columns
                updated_data = updated_data[present_columns]
                updated_data = updated_data.sort_values(by=selected_columns[0], ascending=False)
                updated_headers = present_columns
                update_datatype = [DATA_IVQD_TITILE_TYPE[COLUMN_IVQD_NAMES.index(x)] for x in updated_headers]

                filter_component = gr.components.Dataframe(
                    value=updated_data,
                    headers=updated_headers,
                    type="pandas",
                    datatype=update_datatype,
                    interactive=False,
                    visible=True,
                )
                return filter_component

            model_size.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
            question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
            checkbox_ivqd_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)

        # Table 4: about
        with gr.TabItem("About", elem_id="mmupd-benchmark-tab-table", id=4):
            gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")

        # Table 5: submission
        with gr.TabItem("Submit here!", elem_id="mmupd-benchmark-tab-table", id=5):
            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
            with gr.Row():
                gr.Markdown("# Submit your model evaluation JSON file here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_type = gr.Dropdown(
                        choices=["VLM", "LLM"],
                        label="Model type",
                        multiselect=False,
                        value="VLM",
                        interactive=True,
                    )
                    model_name_textbox = gr.Textbox(
                        label="Model name", placeholder="LLaMA-7B"
                    )
                    revision_name_textbox = gr.Textbox(
                        label="Revision Model Name", placeholder="LLaMA-7B"
                    )
                    model_link = gr.Textbox(
                        label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
                    )
                    model_size = gr.Textbox(
                        label="Model size", placeholder="7B (format: '<number>B' or '-'; default is '-')"
                    )

                with gr.Column():
                    LLM_type = gr.Dropdown(
                        choices=["Vicuna-1.5-7B", "Vicuna-1.5-13B", "Flan-T5-XL", "LLaMA-7B", "Llama-13B", "Llama-3-8B", "Llama-3-70B", "Yi-34B", "Mistral-7B", "Other"],
                        label="LLM type",
                        multiselect=False,
                        value="Vicuna-1.5-13B",
                        interactive=True,
                    )
                    LLM_name_textbox = gr.Textbox(
                        label="LLM model (required if LLM type is Other)",
                        placeholder="GPT-4",
                    )
                    upd_type = gr.Dropdown(
                        choices=[
                            "AAD",
                            "IASD",
                            "IVQD",
                        ],
                        label="UPD type",
                        multiselect=False,
                        value="AAD",
                        interactive=True,
                    )
                    question_type = gr.Dropdown(
                        choices=QUESTION_TYPE,
                        label="Question Type",
                        multiselect=False,
                        value=QUESTION_TYPE[0],
                        interactive=True,
                    )

                with gr.Column():
                    input_file = gr.components.File(label="Click to Upload a JSON File", file_count="single", type='binary')
                    submit_button = gr.Button("Submit Eval")
                    submission_result = gr.Markdown()

            submit_button.click(
                add_new_eval,
                inputs=[
                    input_file,
                    model_type,
                    model_name_textbox,
                    revision_name_textbox,
                    model_link,
                    model_size,
                    upd_type,
                    LLM_type,
                    LLM_name_textbox,
                    question_type
                ],
                outputs=submission_result,
            )

    def refresh_data():
        value1 = get_baseline_aad_df()
        value2 = get_baseline_iasd_df()
        value3 = get_baseline_ivqd_df()
        return value1, value2, value3

    with gr.Row():
        data_run = gr.Button("Refresh")
        data_run.click(
            refresh_data, outputs=[data_component_aad, data_component_iasd, data_component_ivqd]
        )

    with gr.Accordion("Citation", open=False):
        citation_button = gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            elem_id="citation-button",
            show_copy_button=True,
        )

block.launch()