import json
import re
import os
import streamlit as st
import requests
import pandas as pd
from io import StringIO
import plotly.graph_objs as go
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
import streamlit.components.v1 as components
# BENCHMARKS = ["WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB", "WebArena"]
BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB",]
def create_html_table_main(df, benchmarks):
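    """Render `df` as an HTML table for the main leaderboard, with Streamlit widgets controlling the sort."""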
col1, col2 = st.columns([2,6])
with col1:
sort_column = st.selectbox("Sort by", df.columns.tolist())
with col2:
sort_order = st.radio("Order", ["Ascending", "Descending"], horizontal=True)
# Sort dataframe
if sort_order == "Ascending":
df = df.sort_values(by=sort_column)
else:
df = df.sort_values(by=sort_column, ascending=False)
# Create HTML table without JavaScript sorting
    html = '<table>'
    # Header row: one cell per DataFrame column.
    html += '<thead>'
    html += '<tr>'
    for column in df.columns:
        html += f'<th>{column}</th>'
    html += '</tr>'
    html += '</thead>'
    # Body: one row per agent.
    html += '<tbody>'
    for _, row in df.iterrows():
        html += '<tr>'
        for col in df.columns:
            html += f'<td>{row[col]}</td>'
        html += '</tr>'
    html += '</tbody>'
    html += '</table>'
    return html
def create_html_table_benchmark(df, benchmarks):
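    """Render one benchmark's results as an HTML table; the Reproduced column carries the reproduced-run details."""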
# Create HTML table without JavaScript sorting
    html = '<table>'
    # Header row: skip the helper column that only feeds the Reproduced tooltip.
    html += '<thead>'
    html += '<tr>'
    for column in df.columns:
        if column != "Reproduced_all":
            html += f'<th>{column}</th>'
    html += '</tr>'
    html += '</thead>'
    html += '<tbody>'
    for _, row in df.iterrows():
        html += '<tr>'
        for column in df.columns:
            if column == "Reproduced":
                if row[column] == "-":
                    html += f'<td>{row[column]}</td>'
                else:
                    # Show the reproduced-score range; keep the individual
                    # (score, date) pairs as a hover tooltip.
                    html += f'<td title="{" ".join(map(str, row["Reproduced_all"]))}">{row[column]}</td>'
            elif column == "Reproduced_all":
                continue
            else:
                html += f'<td>{row[column]}</td>'
        html += '</tr>'
    html += '</tbody>'
    html += '</table>'
    return html
def check_sanity(agent):
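    """Check that an agent's result files are well formed: every entry has the
    expected keys and matches its agent/benchmark, and each benchmark file has
    exactly one "Original" entry."""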
for benchmark in BENCHMARKS:
file_path = f"results/{agent}/{benchmark.lower()}.json"
if not os.path.exists(file_path):
continue
original_count = 0
with open(file_path) as f:
results = json.load(f)
        required_keys = [
            "agent_name", "benchmark", "original_or_reproduced", "score", "std_err",
            "benchmark_specific", "benchmark_tuned", "followed_evaluation_protocol",
            "reproducible", "comments", "study_id", "date_time",
        ]
        for result in results:
            if not all(key in result for key in required_keys):
                return False
if result["agent_name"] != agent:
return False
if result["benchmark"] != benchmark:
return False
if result["original_or_reproduced"] == "Original":
original_count += 1
if original_count != 1:
return False
return True
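
# Illustrative shape of one entry in results/<agent>/<benchmark>.json. The field
# values below are hypothetical; check_sanity only enforces the key set, the
# agent/benchmark names, and a single "Original" record per benchmark file.
# {
#     "agent_name": "ExampleAgent",
#     "benchmark": "WebArena",
#     "original_or_reproduced": "Original",
#     "score": 23.5,
#     "std_err": 1.2,
#     "benchmark_specific": "No",
#     "benchmark_tuned": "No",
#     "followed_evaluation_protocol": "Yes",
#     "reproducible": "Yes",
#     "comments": "-",
#     "study_id": "example-study",
#     "date_time": "2024-06-01 12:00:00"
# }
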
def main():
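    """Entry point: load every agent's results and render the leaderboard tabs."""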
st.set_page_config(page_title="WebAgent Leaderboard", layout="wide")
all_agents = os.listdir("results")
all_results = {}
for agent in all_agents:
if not check_sanity(agent):
st.error(f"Results for {agent} are not in the correct format.")
continue
agent_results = []
for benchmark in BENCHMARKS:
with open(f"results/{agent}/{benchmark.lower()}.json") as f:
agent_results.extend(json.load(f))
all_results[agent] = agent_results
st.title("🏆 WebAgent Leaderboard")
st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
# content = create_yall()
# tab1, tab2, tab3, tab4 = st.tabs(["🏆 WebAgent Leaderboard", "WorkArena++-L2 Leaderboard", "WorkArena++-L3 Leaderboard", "📝 About"])
tabs = st.tabs(["🏆 WebAgent Leaderboard",] + BENCHMARKS + ["📝 About"])
with tabs[0]:
# Leaderboard tab
def get_leaderboard_dict(results):
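            """Build one row per agent: its "Original" score for each benchmark, or "-" if missing."""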
leaderboard_dict = []
for key, values in results.items():
result_dict = {"Agent": key}
for benchmark in BENCHMARKS:
if any(value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original" for value in values):
result_dict[benchmark] = [value["score"] for value in values if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original"][0]
else:
result_dict[benchmark] = "-"
leaderboard_dict.append(result_dict)
return leaderboard_dict
leaderboard_dict = get_leaderboard_dict(all_results)
# print (leaderboard_dict)
full_df = pd.DataFrame.from_dict(leaderboard_dict)
df = pd.DataFrame(columns=full_df.columns)
dfs_to_concat = []
dfs_to_concat.append(full_df)
# Concatenate the DataFrames
if dfs_to_concat:
df = pd.concat(dfs_to_concat, ignore_index=True)
# df['Average'] = sum(df[column] for column in BENCHMARKS)/len(BENCHMARKS)
# df['Average'] = df['Average'].round(2)
# Sort values
df = df.sort_values(by='WebArena', ascending=False)
# Add a search bar
search_query = st.text_input("Search agents", "", key="search_main")
# Filter the DataFrame based on the search query
if search_query:
df = df[df['Agent'].str.contains(search_query, case=False)]
# Display the filtered DataFrame or the entire leaderboard
def make_hyperlink(agent_name):
url = f"https://huggingface.co/spaces/meghsn/WebAgent-Leaderboard/blob/main/results/{agent_name}/README.md"
            return f'<a href="{url}">{agent_name}</a>'
df['Agent'] = df['Agent'].apply(make_hyperlink)
# st.dataframe(
# df[['Agent'] + BENCHMARKS],
# use_container_width=True,
# column_config={benchmark: {'alignment': 'center'} for benchmark in BENCHMARKS},
# hide_index=True,
# # height=int(len(df) * 36.2),
# )
# st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
html_table = create_html_table_main(df, BENCHMARKS)
# print (html_table)
st.markdown(html_table, unsafe_allow_html=True)
# components.html(html_table, height=600, scrolling=True)
if st.button("Export to CSV", key="export_main"):
# Export the DataFrame to CSV
csv_data = df.to_csv(index=False)
# Create a link to download the CSV file
st.download_button(
label="Download CSV",
data=csv_data,
file_name="leaderboard.csv",
key="download-csv",
help="Click to download the CSV file",
)
with tabs[-1]:
st.markdown('''
### Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.
''')
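    # One tab per benchmark; tabs[0] is the overall leaderboard and tabs[-1] is the About tab.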
for i, benchmark in enumerate(BENCHMARKS, start=1):
with tabs[i]:
def get_benchmark_dict(results, benchmark):
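                """Build one row per agent for this benchmark, summarising reproduced runs as a min-max range."""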
benchmark_dict = []
for key, values in results.items():
result_dict = {"Agent": key}
                    flag = 0
                    # Initialise the reproduced-score lists once, so entries seen before
                    # the "Original" record are not overwritten inside the loop below.
                    result_dict["Reproduced"] = []
                    result_dict["Reproduced_all"] = []
for value in values:
if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original":
result_dict["Score"] = value["score"]
result_dict["Benchmark Specific"] = value["benchmark_specific"]
result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
result_dict["Reproducible"] = value["reproducible"]
result_dict["Comments"] = value["comments"]
result_dict["Study ID"] = value["study_id"]
result_dict["Date"] = value["date_time"]
flag = 1
if not flag:
result_dict["Score"] = "-"
result_dict["Benchmark Specific"] = "-"
result_dict["Benchmark Tuned"] = "-"
result_dict["Followed Evaluation Protocol"] = "-"
result_dict["Reproducible"] = "-"
result_dict["Comments"] = "-"
result_dict["Study ID"] = "-"
result_dict["Date"] = "-"
if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Reproduced":
result_dict["Reproduced"].append(value["score"])
result_dict["Reproduced_all"].append(", ".join([str(value["score"]), str(value["date_time"])]))
if result_dict["Reproduced"]:
result_dict["Reproduced"] = str(min(result_dict["Reproduced"])) + " - " + str(max(result_dict["Reproduced"]))
else:
result_dict["Reproduced"] = "-"
benchmark_dict.append(result_dict)
return benchmark_dict
benchmark_dict = get_benchmark_dict(all_results, benchmark=benchmark)
# print (leaderboard_dict)
full_df = pd.DataFrame.from_dict(benchmark_dict)
df_ = pd.DataFrame(columns=full_df.columns)
dfs_to_concat = []
dfs_to_concat.append(full_df)
# Concatenate the DataFrames
if dfs_to_concat:
df_ = pd.concat(dfs_to_concat, ignore_index=True)
        # st.markdown(f"{benchmark}", unsafe_allow_html=True)
# st.dataframe(
# df_,
# use_container_width=True,
# column_config={benchmark: {'alignment': 'center'}},
# hide_index=True,
# )
html_table = create_html_table_benchmark(df_, BENCHMARKS)
st.markdown(html_table, unsafe_allow_html=True)
if __name__ == "__main__":
main()