# Page-capture residue (appears to be a hosting status banner: "Spaces: Sleeping"); not part of the program.
# Names exported when this module is star-imported.
# NOTE(review): only `block` is defined in this file; `make_clickable_model`,
# `make_clickable_user` and `get_submissions` are not. Confirm that one of the
# star-imports in this file provides them, otherwise a star-import of this
# module raises AttributeError.
__all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions'] | |
import gradio as gr | |
import pandas as pd | |
import re | |
import os | |
import json | |
import yaml | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import plotnine as p9 | |
import sys | |
sys.path.append('./src') | |
sys.path.append('.') | |
from src.about import * | |
from src.saving_utils import * | |
from src.vis_utils import * | |
from src.bin.PROBE import run_probe | |
def add_new_eval(
    human_file,
    skempi_file,
    model_name_textbox: str,
    revision_name_textbox: str,
    benchmark_type,
    similarity_tasks,
    function_prediction_aspect,
    function_prediction_dataset,
    family_prediction_dataset,
):
    """Run the benchmark on an uploaded representation and save each result.

    The function forwards the form values to ``run_probe`` and then hands
    every entry of the returned mapping to the matching ``save_*_output``
    helper. It returns ``None``.

    Args:
        human_file: Value of the "Human dataset" upload widget, passed to
            ``run_probe`` unchanged.
        skempi_file: Value of the "SKEMPI dataset" upload widget, passed to
            ``run_probe`` unchanged.
        model_name_textbox: Model name; used as the representation name when
            no revision name is given.
        revision_name_textbox: Revision name; when not the empty string it is
            used as the representation name instead of the model name.
        benchmark_type: Selected benchmark type(s), passed to ``run_probe``.
        similarity_tasks: Selected similarity tasks, passed to ``run_probe``.
        function_prediction_aspect: Selected aspect, passed to ``run_probe``.
        function_prediction_dataset: Selected dataset, passed to ``run_probe``.
        family_prediction_dataset: Selected dataset(s), passed to ``run_probe``.
    """
    representation_name = model_name_textbox if revision_name_textbox == '' else revision_name_textbox

    results = run_probe(
        benchmark_type,
        representation_name,
        human_file,
        skempi_file,
        similarity_tasks,
        function_prediction_aspect,
        function_prediction_dataset,
        family_prediction_dataset,
    )

    # Iterate the keys of the result mapping. A dedicated loop name is used so
    # the `benchmark_type` argument is not overwritten.
    for result_kind in results:
        if result_kind == 'similarity':
            save_similarity_output(results['similarity'], representation_name)
        elif result_kind == 'function':
            save_function_output(results['function'], representation_name)
        elif result_kind == 'family':
            save_family_output(results['family'], representation_name)
        elif result_kind == "affinity":
            # Same (result, name) call shape as the other savers. The previous
            # code indexed the mapping with the tuple
            # ('affinity', representation_name), which raises KeyError.
            save_affinity_output(results['affinity'], representation_name)
def update_leaderboard(selected_methods, selected_metrics):
    """Build the leaderboard table for the given method and metric selection.

    Registered as the change handler of both leaderboard checkbox groups; the
    return value is whatever ``get_baseline_df`` produces for the selection.
    """
    filtered_table = get_baseline_df(selected_methods, selected_metrics)
    return filtered_table
# Gradio UI: a leaderboard tab (table + plotting controls), an about tab and a
# submission tab, followed by a refresh button and a citation box.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a source whose indentation was lost; confirm it against the deployed app.
# NOTE(review): several tab/heading labels look mis-encoded (probably emoji);
# they are kept exactly as found. Confirm the file encoding.
block = gr.Blocks()

with block:
    gr.Markdown(LEADERBOARD_INTRODUCTION)

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # ---- Tab 1: leaderboard table and benchmark plots ----
        with gr.TabItem("π PROBE Leaderboard", elem_id="probe-benchmark-tab-table", id=1):
            # Parse the results CSV a single time and derive both lists from it.
            results_df = pd.read_csv(CSV_RESULT_PATH)
            method_names = results_df['method_name'].unique().tolist()
            metric_names = results_df.columns.tolist()
            # Full column list (still containing 'method_name'); used by the
            # 'flexible' visualizer below.
            metrics_with_method = metric_names.copy()
            metric_names.remove('method_name')  # Remove method_name from the metric options

            # Method and metric selectors for the leaderboard table.
            with gr.Row():
                leaderboard_method_selector = gr.CheckboxGroup(
                    choices=method_names,
                    label="Select method_names for Leaderboard",
                    value=method_names,
                    interactive=True,
                )
                leaderboard_metric_selector = gr.CheckboxGroup(
                    choices=metric_names,
                    label="Select Metrics for Leaderboard",
                    value=metric_names,
                    interactive=True,
                )

            # Initial (unfiltered) leaderboard table.
            baseline_value = get_baseline_df(method_names, metric_names)
            baseline_header = ["method_name"] + metric_names
            baseline_datatype = ['markdown'] + ['number'] * len(metric_names)

            data_component = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            # Re-filter the table whenever either selection changes.
            leaderboard_method_selector.change(
                update_leaderboard,
                inputs=[leaderboard_method_selector, leaderboard_metric_selector],
                outputs=data_component,
            )
            leaderboard_metric_selector.change(
                update_leaderboard,
                inputs=[leaderboard_method_selector, leaderboard_metric_selector],
                outputs=data_component,
            )

            # Benchmark type to visualize; 'flexible' is offered in addition
            # to the entries of TASK_INFO.
            benchmark_types = TASK_INFO + ['flexible']
            benchmark_type_selector = gr.Dropdown(
                choices=benchmark_types,
                label="Select Benchmark Type for Visualization",
                value="flexible",
            )

            # Axis selectors start empty and are filled by update_metric_choices.
            x_metric_selector = gr.Dropdown(choices=[], label="Select X-axis Metric")
            y_metric_selector = gr.Dropdown(choices=[], label="Select Y-axis Metric")
            method_selector = gr.CheckboxGroup(
                choices=method_names,
                label="Select methods to visualize",
                interactive=True,
                value=method_names,
            )

            # Button that draws the plot, and the image that shows it.
            plot_button = gr.Button("Plot")
            plot_output = gr.Image(label="Plot")

            def update_metric_choices(benchmark_type):
                """Return updates for the X- and Y-axis metric dropdowns.

                For 'flexible', all CSV columns are offered and the first two
                are preselected. For a type listed in
                ``benchmark_specific_metrics``, its metrics are offered and
                the first one is preselected on the X axis. Any other value
                clears both dropdowns.
                """
                if benchmark_type == 'flexible':
                    # Use the column list computed at startup. The earlier code
                    # read `df.columns`, but no `df` is defined in this file.
                    axis_choices = metrics_with_method
                    return (
                        gr.update(choices=axis_choices, value=axis_choices[0]),
                        gr.update(choices=axis_choices, value=axis_choices[1]),
                    )
                elif benchmark_type in benchmark_specific_metrics:
                    metrics = benchmark_specific_metrics[benchmark_type]
                    return gr.update(choices=metrics, value=metrics[0]), gr.update(choices=metrics)
                return gr.update(choices=[]), gr.update(choices=[])

            benchmark_type_selector.change(
                update_metric_choices,
                inputs=[benchmark_type_selector],
                outputs=[x_metric_selector, y_metric_selector],
            )

            # Draw the plot from the current selections.
            plot_button.click(
                benchmark_plot,
                inputs=[benchmark_type_selector, method_selector, x_metric_selector, y_metric_selector],
                outputs=plot_output,
            )

        # ---- Tab 2: about text ----
        with gr.TabItem("π About", elem_id="probe-benchmark-tab-table", id=2):
            with gr.Row():
                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        # ---- Tab 3: submission form ----
        with gr.TabItem("π Submit here! ", elem_id="probe-benchmark-tab-table", id=3):
            with gr.Row():
                gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown("# βοΈβ¨ Submit your model's representation files here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(
                        label="Model name",
                    )
                    revision_name_textbox = gr.Textbox(
                        label="Revision Model Name",
                    )
                    benchmark_type = gr.CheckboxGroup(
                        choices=TASK_INFO,
                        label="Benchmark Type",
                        interactive=True,
                    )
                    similarity_tasks = gr.CheckboxGroup(
                        choices=similarity_tasks_options,
                        label="Select Similarity Tasks",
                        interactive=True,
                    )
                    function_prediction_aspect = gr.Radio(
                        choices=function_prediction_aspect_options,
                        label="Select Function Prediction Aspect",
                        interactive=True,
                    )
                    function_prediction_dataset = gr.Radio(
                        choices=function_prediction_dataset_options,
                        label="Select Function Prediction Dataset",
                        interactive=True,
                    )
                    family_prediction_dataset = gr.CheckboxGroup(
                        choices=family_prediction_dataset_options,
                        label="Select Family Prediction Dataset",
                        interactive=True,
                    )

                with gr.Column():
                    human_file = gr.components.File(label="Click to Upload the representation file (csv) for Human dataset", file_count="single", type='filepath')
                    skempi_file = gr.components.File(label="Click to Upload the representation file (csv) for SKEMPI dataset", file_count="single", type='filepath')

                    submit_button = gr.Button("Submit Eval")
                    submission_result = gr.Markdown()
                    # No outputs are registered, so nothing is written back
                    # to the page by this handler.
                    submit_button.click(
                        add_new_eval,
                        inputs=[
                            human_file,
                            skempi_file,
                            model_name_textbox,
                            revision_name_textbox,
                            benchmark_type,
                            similarity_tasks,
                            function_prediction_aspect,
                            function_prediction_dataset,
                            family_prediction_dataset,
                        ],
                    )

    def refresh_data():
        """Rebuild the leaderboard table for the method and metric lists read at startup."""
        value = get_baseline_df(method_names, metric_names)
        return value

    with gr.Row():
        data_run = gr.Button("Refresh")
        data_run.click(refresh_data, outputs=[data_component])

    with gr.Accordion("Citation", open=False):
        citation_button = gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            elem_id="citation-button",
            show_copy_button=True,
        )

block.launch()