"""Streamlit app that renders the WebAgent leaderboard.

Results are read from ``results/<agent>/<benchmark>.json``. Each file holds a
list of result records; exactly one record per file must be marked
``"Original"``, any others are ``"Reproduced"`` runs.
"""

import json
import os
import re
from io import StringIO

import pandas as pd
import plotly.graph_objs as go
import requests
import streamlit as st
import streamlit.components.v1 as components
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError

BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB",]

# Directory scanned for one sub-directory per agent.
RESULTS_DIR = "results"

# Placeholder shown in a cell when an agent has no value for it.
MISSING = "-"

# Keys every result record must provide to pass ``check_sanity``.
REQUIRED_KEYS = (
    "agent_name",
    "benchmark",
    "original_or_reproduced",
    "score",
    "std_err",
    "benchmark_specific",
    "benchmark_tuned",
    "followed_evaluation_protocol",
    "reproducible",
    "comments",
    "study_id",
    "date_time",
)

# Column order of the per-benchmark tables. "Reproduced_all" is a helper
# column: it is never rendered as its own column, only inside the
# expandable "Reproduced" cell.
BENCHMARK_COLUMNS = [
    "Agent",
    "Score",
    "Benchmark Specific",
    "Benchmark Tuned",
    "Followed Evaluation Protocol",
    "Reproducible",
    "Comments",
    "Study ID",
    "Date",
    "Reproduced",
    "Reproduced_all",
]

# Maps a table column to the result-record key it is filled from.
_ORIGINAL_FIELDS = {
    "Score": "score",
    "Benchmark Specific": "benchmark_specific",
    "Benchmark Tuned": "benchmark_tuned",
    "Followed Evaluation Protocol": "followed_evaluation_protocol",
    "Reproducible": "reproducible",
    "Comments": "comments",
    "Study ID": "study_id",
    "Date": "date_time",
}

# NOTE(review): the HTML/CSS markup in this file was reconstructed from a copy
# whose tags had been stripped; the element structure follows the surviving
# code, but the exact CSS rules are a best guess -- confirm against the
# deployed version.
_TABLE_STYLE = """<style>
.table-container { padding-bottom: 20px; }
.table-container table { width: 100%; border-collapse: collapse; }
.table-container th, .table-container td { border: 1px solid #ddd; padding: 8px; text-align: center; }
.table-container th { font-weight: bold; }
.table-container details { text-align: left; }
.table-container summary { cursor: pointer; text-align: center; }
</style>
"""


def _result_path(agent, benchmark):
    """Return the path of the JSON result file for *agent* on *benchmark*."""
    return f"{RESULTS_DIR}/{agent}/{benchmark.lower()}.json"


def _sort_frame(df, column, ascending):
    """Return *df* sorted by *column* without failing on mixed-type columns.

    Score columns mix numbers with the ``"-"`` placeholder, which a plain
    ``sort_values`` cannot compare. Columns containing at least one numeric
    value are ordered numerically with non-numeric cells last; all other
    columns are ordered by their string form. A missing column leaves the
    frame unchanged.
    """
    if column not in df.columns:
        return df
    if pd.to_numeric(df[column], errors="coerce").notna().any():
        return df.sort_values(
            by=column,
            ascending=ascending,
            na_position="last",
            key=lambda s: pd.to_numeric(s, errors="coerce"),
        )
    return df.sort_values(by=column, ascending=ascending, key=lambda s: s.astype(str))


def _render_table(headers, body_rows):
    """Build the styled HTML table from header labels and rows of cell HTML.

    The table itself is emitted on a single line so that Markdown rendering
    does not break it up.
    """
    head = "".join(f"<th>{header}</th>" for header in headers)
    body = "".join(
        "<tr>" + "".join(f"<td>{cell}</td>" for cell in cells) + "</tr>"
        for cells in body_rows
    )
    return (
        f'{_TABLE_STYLE}<div class="table-container"><table>'
        f"<thead><tr>{head}</tr></thead><tbody>{body}</tbody>"
        "</table></div>"
    )


def create_html_table_main(df, benchmarks):
    """Render the main leaderboard as HTML with user-selectable sorting.

    Draws a "Sort by" select box and an "Order" radio, sorts a copy of *df*
    accordingly and returns the table markup. Cell values are inserted
    verbatim (the Agent column already contains hyperlink markup).

    Args:
        df: Leaderboard frame, one row per agent.
        benchmarks: Unused; kept for interface compatibility.

    Returns:
        The HTML string for ``st.markdown(..., unsafe_allow_html=True)``.
    """
    col1, col2 = st.columns([2, 6])
    with col1:
        sort_column = st.selectbox("Sort by", df.columns.tolist())
    with col2:
        sort_order = st.radio("Order", ["Ascending", "Descending"], horizontal=True)

    if sort_column is not None:
        df = _sort_frame(df, sort_column, ascending=(sort_order == "Ascending"))

    columns = df.columns.tolist()
    body_rows = [[row[col] for col in columns] for _, row in df.iterrows()]
    return _render_table(columns, body_rows)


def create_html_table_benchmark(df, benchmarks):
    """Render one benchmark's detail table as HTML.

    The "Reproduced" cell shows the reproduced score range; when there are
    reproduced runs it becomes an expandable element listing every entry of
    the row's "Reproduced_all" list. "Reproduced_all" is not rendered as a
    column of its own.

    Args:
        df: Frame with the columns produced by ``get_benchmark_dict``.
        benchmarks: Unused; kept for interface compatibility.

    Returns:
        The HTML string for ``st.markdown(..., unsafe_allow_html=True)``.
    """
    columns = [column for column in df.columns if column != "Reproduced_all"]
    has_details = "Reproduced_all" in df.columns

    body_rows = []
    for _, row in df.iterrows():
        cells = []
        for column in columns:
            value = row[column]
            if column == "Reproduced" and has_details and value != MISSING:
                runs = "<br>".join(map(str, row["Reproduced_all"]))
                cells.append(f"<details><summary>{value}</summary>{runs}</details>")
            else:
                cells.append(value)
        body_rows.append(cells)
    return _render_table(columns, body_rows)


def check_sanity(agent):
    """Validate every result file that exists for *agent*.

    A benchmark without a file is skipped. An existing file is valid when it
    parses as a JSON list of records, every record carries all
    ``REQUIRED_KEYS`` with matching ``agent_name`` and ``benchmark``, and
    exactly one record is marked ``"Original"``.

    Returns:
        True if all existing files are valid, False otherwise (including
        unreadable or malformed JSON).
    """
    for benchmark in BENCHMARKS:
        file_path = _result_path(agent, benchmark)
        if not os.path.exists(file_path):
            continue
        try:
            with open(file_path, encoding="utf-8") as f:
                results = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and decoding errors.
            return False
        if not isinstance(results, list):
            return False

        original_count = 0
        for result in results:
            if not isinstance(result, dict):
                return False
            if not all(key in result for key in REQUIRED_KEYS):
                return False
            if result["agent_name"] != agent:
                return False
            if result["benchmark"] != benchmark:
                return False
            if result["original_or_reproduced"] == "Original":
                original_count += 1
        if original_count != 1:
            return False
    return True


def _load_all_results():
    """Load the validated result records of every agent directory.

    Agents failing ``check_sanity`` are reported with ``st.error`` and
    skipped. Benchmarks without a result file contribute no records.

    Returns:
        Dict mapping agent name to the list of its result records.
    """
    if not os.path.isdir(RESULTS_DIR):
        st.error(f"Results directory '{RESULTS_DIR}' was not found.")
        return {}

    # Only sub-directories are agents; stray files (README, etc.) are ignored.
    agents = sorted(
        name
        for name in os.listdir(RESULTS_DIR)
        if os.path.isdir(os.path.join(RESULTS_DIR, name))
    )

    all_results = {}
    for agent in agents:
        if not check_sanity(agent):
            st.error(f"Results for {agent} are not in the correct format.")
            continue
        agent_results = []
        for benchmark in BENCHMARKS:
            file_path = _result_path(agent, benchmark)
            # check_sanity accepts agents that lack some benchmarks.
            if not os.path.exists(file_path):
                continue
            with open(file_path, encoding="utf-8") as f:
                agent_results.extend(json.load(f))
        all_results[agent] = agent_results
    return all_results


def get_leaderboard_dict(results):
    """Build one leaderboard row per agent.

    Each row maps "Agent" to the agent name and every benchmark to the score
    of its "Original" record, or "-" when the agent has none.
    """
    leaderboard_rows = []
    for agent, entries in results.items():
        row = {"Agent": agent}
        for benchmark in BENCHMARKS:
            row[benchmark] = next(
                (
                    entry["score"]
                    for entry in entries
                    if entry["benchmark"] == benchmark
                    and entry["original_or_reproduced"] == "Original"
                ),
                MISSING,
            )
        leaderboard_rows.append(row)
    return leaderboard_rows


def get_benchmark_dict(results, benchmark):
    """Build one detail row per agent for *benchmark*.

    The "Original" record fills the descriptive columns; without one they
    stay "-". Every "Reproduced" record, regardless of where it appears in
    the list, contributes to the "Reproduced" score range ("min - max") and
    adds a "score, date_time" entry to "Reproduced_all".
    """
    benchmark_rows = []
    for agent, entries in results.items():
        row = {column: MISSING for column in BENCHMARK_COLUMNS}
        row["Agent"] = agent
        row["Reproduced_all"] = []
        reproduced_scores = []

        for entry in entries:
            if entry["benchmark"] != benchmark:
                continue
            run_kind = entry["original_or_reproduced"]
            if run_kind == "Original":
                for column, key in _ORIGINAL_FIELDS.items():
                    row[column] = entry[key]
            elif run_kind == "Reproduced":
                reproduced_scores.append(entry["score"])
                row["Reproduced_all"].append(
                    ", ".join([str(entry["score"]), str(entry["date_time"])])
                )

        if reproduced_scores:
            row["Reproduced"] = (
                str(min(reproduced_scores)) + " - " + str(max(reproduced_scores))
            )
        benchmark_rows.append(row)
    return benchmark_rows


def make_hyperlink(agent_name):
    """Return an HTML link to the agent's README in the leaderboard space."""
    url = f"https://huggingface.co/spaces/meghsn/WebAgent-Leaderboard/blob/main/results/{agent_name}/README.md"
    return f'<a href="{url}" target="_blank">{agent_name}</a>'


def _render_leaderboard_tab(all_results):
    """Draw the main leaderboard tab: search box, table and CSV export."""
    # Explicit columns keep the frame well-formed when there are no agents.
    df = pd.DataFrame(get_leaderboard_dict(all_results), columns=["Agent"] + BENCHMARKS)
    df = _sort_frame(df, "WebArena", ascending=False)

    search_query = st.text_input("Search agents", "", key="search_main")
    if search_query:
        # Literal match: user input must not be interpreted as a regex.
        matches = df["Agent"].str.contains(search_query, case=False, regex=False, na=False)
        df = df[matches]

    # The export keeps plain agent names; only the displayed table gets links.
    export_df = df
    display_df = df.assign(Agent=df["Agent"].map(make_hyperlink))

    html_table = create_html_table_main(display_df, BENCHMARKS)
    st.markdown(html_table, unsafe_allow_html=True)

    if st.button("Export to CSV", key="export_main"):
        csv_data = export_df.to_csv(index=False)
        st.download_button(
            label="Download CSV",
            data=csv_data,
            file_name="leaderboard.csv",
            key="download-csv",
            help="Click to download the CSV file",
        )


def _render_benchmark_tab(all_results, benchmark):
    """Draw the detail table of a single benchmark."""
    df_ = pd.DataFrame(get_benchmark_dict(all_results, benchmark=benchmark), columns=BENCHMARK_COLUMNS)
    html_table = create_html_table_benchmark(df_, BENCHMARKS)
    st.markdown(html_table, unsafe_allow_html=True)


def main():
    """Entry point: configure the page, load results and draw all tabs."""
    st.set_page_config(page_title="WebAgent Leaderboard", layout="wide")

    all_results = _load_all_results()

    st.title("🏆 WebAgent Leaderboard")
    st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")

    # Tab 0 is the overview, tabs 1..len(BENCHMARKS) are per-benchmark, last is About.
    tabs = st.tabs(["🏆 WebAgent Leaderboard",] + BENCHMARKS + ["📝 About"])

    with tabs[0]:
        _render_leaderboard_tab(all_results)

    with tabs[-1]:
        st.markdown('''
### Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.
''')

    for i, benchmark in enumerate(BENCHMARKS, start=1):
        with tabs[i]:
            _render_benchmark_tab(all_results, benchmark)


if __name__ == "__main__":
    main()