Spaces:

ServiceNow
/

browsergym-leaderboard

Running

App Files Files Community

test-agent-2

by meghsn - opened Nov 25, 2024

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+497

-951

This view is limited to 50 files because it contains too many changes. See the raw diff here.

Files changed (50) hide show

app.py +28 -82
results/Bgym-GPT-3.5/README.md +1 -0
results/Bgym-GPT-3.5/config.json +4 -0
results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/miniwob.json +4 -4
results/Bgym-GPT-3.5/webarena.json +16 -0
results/Bgym-GPT-3.5/workarena-l1.json +44 -0
results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l2.json +4 -4
results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l3.json +3 -3
results/Bgym-GPT-4o-V/README.md +1 -0
results/Bgym-GPT-4o-V/config.json +4 -0
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/miniwob.json +4 -4
results/Bgym-GPT-4o-V/webarena.json +16 -0
results/{GenericAgent-GPT-4o → Bgym-GPT-4o-V}/workarena-l1.json +4 -4
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l2.json +4 -4
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l3.json +3 -3
results/Bgym-GPT-4o/README.md +1 -0
results/Bgym-GPT-4o/config.json +4 -0
results/Bgym-GPT-4o/miniwob.json +16 -0
results/Bgym-GPT-4o/webarena.json +16 -0
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o}/workarena-l1.json +4 -4
results/{GenericAgent-Claude-3.5-Sonnet → Bgym-GPT-4o}/workarena-l2.json +4 -4
results/{GenericAgent-GPT-o1-mini → Bgym-GPT-4o}/workarena-l3.json +3 -3
results/Bgym-Llama-3-70b/README.md +1 -0
results/Bgym-Llama-3-70b/config.json +4 -0
results/Bgym-Llama-3-70b/miniwob.json +16 -0
results/Bgym-Llama-3-70b/webarena.json +16 -0
results/Bgym-Llama-3-70b/workarena-l1.json +58 -0
results/Bgym-Llama-3-70b/workarena-l2.json +16 -0
results/{GenericAgent-Claude-3.5-Sonnet → Bgym-Llama-3-70b}/workarena-l3.json +4 -4
results/Bgym-Mixtral-8x22b/README.md +1 -0
results/Bgym-Mixtral-8x22b/config.json +4 -0
results/Bgym-Mixtral-8x22b/miniwob.json +16 -0
results/Bgym-Mixtral-8x22b/webarena.json +16 -0
results/Bgym-Mixtral-8x22b/workarena-l1.json +44 -0
results/Bgym-Mixtral-8x22b/workarena-l2.json +16 -0
results/Bgym-Mixtral-8x22b/workarena-l3.json +16 -0
results/GenericAgent-Claude-3.5-Sonnet/README.md +0 -46
results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json +0 -16
results/GenericAgent-Claude-3.5-Sonnet/miniwob.json +0 -16
results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json +0 -16
results/GenericAgent-Claude-3.5-Sonnet/webarena.json +0 -16
results/GenericAgent-Claude-3.5-Sonnet/weblinx.json +0 -16
results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json +0 -16
results/GenericAgent-GPT-4o-mini/README.md +0 -54
results/GenericAgent-GPT-4o-mini/assistantbench.json +0 -16
results/GenericAgent-GPT-4o-mini/visualwebarena.json +0 -16
results/GenericAgent-GPT-4o-mini/webarena.json +0 -16
results/GenericAgent-GPT-4o-mini/weblinx.json +0 -16
results/GenericAgent-GPT-4o/README.md +0 -46
results/GenericAgent-GPT-4o/assistantbench.json +0 -16

app.py CHANGED Viewed

@@ -9,7 +9,6 @@ import plotly.graph_objs as go
 from huggingface_hub import HfApi
 from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
 import streamlit.components.v1 as components
-from datetime import datetime
 from urllib.parse import quote
 from pathlib import Path
@@ -17,7 +16,7 @@ import re
 import html
 from typing import Dict, Any
-BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB", "WebLINX", "VisualWebArena", "AssistantBench"]
 def sanitize_agent_name(agent_name):
     # Only allow alphanumeric chars, hyphen, underscore
@@ -44,34 +43,12 @@ def sanitize_column_name(col: str) -> str:
     return html.escape(str(col))
 def sanitize_cell_value(value: Any) -> str:
     if isinstance(value, (int, float)):
         return str(value)
-    if isinstance(value, str) and '±' in value:
-        score, std_err = value.split('±')
-        return f'{score.strip()} <span style="font-size: smaller; color: var(--lighter-color);">±{std_err.strip()}</span>'
     return html.escape(str(value))
 def create_html_table_main(df):
-    col1, col2 = st.columns([2,6])
-    with col1:
-        sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("WebArena"), key="main_sort_column")
-    with col2:
-        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key="main_sort_order")
-    def get_sort_value(row):
-            if row == "-":
-                return float('-inf')
-            else:
-                try:
-                    return float(row)
-                except ValueError:
-                    return row
-    # Sort dataframe
-    if sort_order == "Ascending":
-        df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
-    else:
-        df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
     html = '''
     <style>
         table {
@@ -110,28 +87,7 @@ def create_html_table_main(df):
     html += '</div>'
     return html
-def create_html_table_benchmark(df, benchmark):
-    col1, col2 = st.columns([2,6])
-    with col1:
-        sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("Score"), key=f"benchmark_sort_column_{benchmark}")
-    with col2:
-        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key=f"benchmark_sort_order_{benchmark}")
-    def get_sort_value(row):
-            if row == "-":
-                return float('-inf')
-            else:
-                try:
-                    return float(row)
-                except ValueError:
-                    return row
-    # Sort dataframe
-    if sort_order == "Ascending":
-        df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
-    else:
-        df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
     html = '''
     <style>
         table {
@@ -155,9 +111,8 @@ def create_html_table_benchmark(df, benchmark):
     html += '<table>'
     html += '<thead><tr>'
     for column in df.columns:
-        if column == "Reproduced_all" or column == "std_err":
-            continue
-        html += f'<th>{sanitize_column_name(column)}</th>'
     html += '</tr></thead>'
     html += '<tbody>'
     for _, row in df.iterrows():
@@ -170,11 +125,8 @@ def create_html_table_benchmark(df, benchmark):
                     summary = sanitize_cell_value(row[column])
                     details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
                     html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
-            elif column == "Reproduced_all" or column == "std_err":
                 continue
-            elif column == "Score":
-                score_with_std_err = f'{row[column]} ± {row["std_err"]}'
-                html += f'<td>{sanitize_cell_value(score_with_std_err)}</td>'
             else:
                 html += f'<td>{sanitize_cell_value(row[column])}</td>'
         html += '</tr>'
@@ -209,19 +161,6 @@ def check_sanity(agent):
 def main():
     st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
-    st.markdown("""
-        <style>
-        :root {
-            --lighter-color: #888; /* Default for light theme */
-        }
-        @media (prefers-color-scheme: dark) {
-            :root {
-                --lighter-color: #ccc; /* Default for dark theme */
-            }
-        }
-        </style>
-    """, unsafe_allow_html=True)
     st.markdown("""
         <head>
             <meta http-equiv="Content-Security-Policy"
@@ -244,10 +183,7 @@ def main():
             continue
         agent_results = []
         for benchmark in BENCHMARKS:
-            file_path = safe_path_join(agent, f"{benchmark.lower()}.json")
-            if not file_path.is_file():
-                continue
-            with open(file_path) as f:
                 agent_results.extend(json.load(f))
         all_results[agent] = agent_results
@@ -281,9 +217,11 @@ def main():
         if dfs_to_concat:
             df = pd.concat(dfs_to_concat, ignore_index=True)
-        for benchmark in BENCHMARKS:
-            df[benchmark] = df[benchmark].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
-            df[benchmark] = df[benchmark].astype(str)
         # Add a search bar
         search_query = st.text_input("Search agents", "", key="search_main")
@@ -302,6 +240,14 @@ def main():
                 return ""
         df['Agent'] = df['Agent'].apply(make_hyperlink)
         html_table = create_html_table_main(df)
         st.markdown(html_table, unsafe_allow_html=True)
@@ -449,21 +395,18 @@ MIT
                     for value in values:
                         if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original":
                             result_dict["Score"] = value["score"]
-                            result_dict["std_err"] = value["std_err"]
                             result_dict["Benchmark Specific"] = value["benchmark_specific"]
                             result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
                             result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
                             result_dict["Reproducible"] = value["reproducible"]
                             result_dict["Comments"] = value["comments"]
                             result_dict["Study ID"] = value["study_id"]
-                            value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
                             result_dict["Date"] = value["date_time"]
                             result_dict["Reproduced"] = []
                             result_dict["Reproduced_all"] = []
                             flag = 1
                         if not flag:
                             result_dict["Score"] = "-"
-                            result_dict["std_err"] = "-"
                             result_dict["Benchmark Specific"] = "-"
                             result_dict["Benchmark Tuned"] = "-"
                             result_dict["Followed Evaluation Protocol"] = "-"
@@ -475,7 +418,6 @@ MIT
                             result_dict["Reproduced_all"] = []
                         if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Reproduced":
                             result_dict["Reproduced"].append(value["score"])
-                            value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
                             result_dict["Reproduced_all"].append(", ".join([str(value["score"]), str(value["date_time"])]))
                     if result_dict["Reproduced"]:
                         result_dict["Reproduced"] = str(min(result_dict["Reproduced"])) + " - " + str(max(result_dict["Reproduced"]))
@@ -493,10 +435,14 @@ MIT
             # Concatenate the DataFrames
             if dfs_to_concat:
                 df_ = pd.concat(dfs_to_concat, ignore_index=True)
-            df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
-            df_['std_err'] = df_['std_err'].apply(lambda x: f"{x:.1f}" if x != "-" else "-")
-            df_['Score'] = df_['Score'].astype(str)
-            html_table = create_html_table_benchmark(df_, benchmark)
             st.markdown(html_table, unsafe_allow_html=True)

 from huggingface_hub import HfApi
 from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
 import streamlit.components.v1 as components
 from urllib.parse import quote
 from pathlib import Path
 import html
 from typing import Dict, Any
+BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB",]
 def sanitize_agent_name(agent_name):
     # Only allow alphanumeric chars, hyphen, underscore
     return html.escape(str(col))
 def sanitize_cell_value(value: Any) -> str:
+    """Sanitize cell values for HTML display"""
     if isinstance(value, (int, float)):
         return str(value)
     return html.escape(str(value))
 def create_html_table_main(df):
     html = '''
     <style>
         table {
     html += '</div>'
     return html
+def create_html_table_benchmark(df):
     html = '''
     <style>
         table {
     html += '<table>'
     html += '<thead><tr>'
     for column in df.columns:
+        if column != "Reproduced_all":
+            html += f'<th>{sanitize_column_name(column)}</th>'
     html += '</tr></thead>'
     html += '<tbody>'
     for _, row in df.iterrows():
                     summary = sanitize_cell_value(row[column])
                     details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
                     html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
+            elif column == "Reproduced_all":
                 continue
             else:
                 html += f'<td>{sanitize_cell_value(row[column])}</td>'
         html += '</tr>'
 def main():
     st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
     st.markdown("""
         <head>
             <meta http-equiv="Content-Security-Policy"
             continue
         agent_results = []
         for benchmark in BENCHMARKS:
+            with open(f"results/{agent}/{benchmark.lower()}.json") as f:
                 agent_results.extend(json.load(f))
         all_results[agent] = agent_results
         if dfs_to_concat:
             df = pd.concat(dfs_to_concat, ignore_index=True)
+        # df['Average'] = sum(df[column] for column in BENCHMARKS)/len(BENCHMARKS)
+        # df['Average'] = df['Average'].round(2)
+        # Sort values
+        df = df.sort_values(by='WebArena', ascending=False)
         # Add a search bar
         search_query = st.text_input("Search agents", "", key="search_main")
                 return ""
         df['Agent'] = df['Agent'].apply(make_hyperlink)
+        # st.dataframe(
+        #     df[['Agent'] + BENCHMARKS],
+        #     use_container_width=True,
+        #     column_config={benchmark: {'alignment': 'center'} for benchmark in BENCHMARKS},
+        #     hide_index=True,
+        #     # height=int(len(df) * 36.2),
+        # )
+        # st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
         html_table = create_html_table_main(df)
         st.markdown(html_table, unsafe_allow_html=True)
                     for value in values:
                         if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original":
                             result_dict["Score"] = value["score"]
                             result_dict["Benchmark Specific"] = value["benchmark_specific"]
                             result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
                             result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
                             result_dict["Reproducible"] = value["reproducible"]
                             result_dict["Comments"] = value["comments"]
                             result_dict["Study ID"] = value["study_id"]
                             result_dict["Date"] = value["date_time"]
                             result_dict["Reproduced"] = []
                             result_dict["Reproduced_all"] = []
                             flag = 1
                         if not flag:
                             result_dict["Score"] = "-"
                             result_dict["Benchmark Specific"] = "-"
                             result_dict["Benchmark Tuned"] = "-"
                             result_dict["Followed Evaluation Protocol"] = "-"
                             result_dict["Reproduced_all"] = []
                         if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Reproduced":
                             result_dict["Reproduced"].append(value["score"])
                             result_dict["Reproduced_all"].append(", ".join([str(value["score"]), str(value["date_time"])]))
                     if result_dict["Reproduced"]:
                         result_dict["Reproduced"] = str(min(result_dict["Reproduced"])) + " - " + str(max(result_dict["Reproduced"]))
             # Concatenate the DataFrames
             if dfs_to_concat:
                 df_ = pd.concat(dfs_to_concat, ignore_index=True)
+            # st.markdown(f"<h2 id='{benchmark.lower()}'>{benchmark}</h2>", unsafe_allow_html=True)
+            # st.dataframe(
+            #     df_,
+            #     use_container_width=True,
+            #     column_config={benchmark: {'alignment': 'center'}},
+            #     hide_index=True,
+            # )
+            html_table = create_html_table_benchmark(df_)
             st.markdown(html_table, unsafe_allow_html=True)

results/Bgym-GPT-3.5/README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ ## GPT-3.5 model

results/Bgym-GPT-3.5/config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "agent_name": "GPT-3.5",
+    "backend_llm": "GPT-3.5"
+}

results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/miniwob.json RENAMED Viewed

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o",
-        "study_id": "2024-10-25_06-08-16",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",
-        "score": 63.8,
-        "std_err": 1.9,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

 [
     {
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",
+        "score": 43.4,
+        "std_err": 0.1,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

results/Bgym-GPT-3.5/webarena.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 6.7,
+        "std_err": 0.2,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-3.5/workarena-l1.json ADDED Viewed

	@@ -0,0 +1,44 @@

+[
+    {
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L1",
+        "score": 6.1,
+        "std_err": 0.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    },
+    {
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 5.7,
+        "std_err": 0.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-04 12:06:00"
+    },
+    {
+        "benchmark": "WorkArena-L1",
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "score": 5.1,
+        "std_err": 0.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-04 12:06:00"
+    }
+]

results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l2.json RENAMED Viewed

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o",
-        "study_id": "2024-10-23_17-10-46",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",
-        "score": 8.5,
-        "std_err": 1.8,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

 [
     {
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",
+        "score": 0.0,
+        "std_err": 0.0,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l3.json RENAMED Viewed

@@ -1,8 +1,8 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o",
-        "study_id": "-",
-        "date_time": "2024-10-24 23:03:30",
         "benchmark": "WorkArena-L3",
         "score": 0.0,
         "std_err": 0.0,

 [
     {
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",
         "score": 0.0,
         "std_err": 0.0,

results/Bgym-GPT-4o-V/README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ ## GPT-4o-V model

results/Bgym-GPT-4o-V/config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "agent_name": "GPT-4o-V",
+    "backend_llm": "GPT-4o-V"
+}

results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/miniwob.json RENAMED Viewed

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "2024-10-25_06-08-16",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",
-        "score": 56.6,
-        "std_err": 2.0,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

 [
     {
+        "agent_name": "Bgym-GPT-4o-V",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",
+        "score": 72.5,
+        "std_err": 0.5,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

results/Bgym-GPT-4o-V/webarena.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o-V",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 24.0,
+        "std_err": 0.4,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/{GenericAgent-GPT-4o → Bgym-GPT-4o-V}/workarena-l1.json RENAMED Viewed

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o",
-        "study_id": "2024-10-23_14-17-40",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L1",
-        "score": 45.5,
-        "std_err": 2.7,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

 [
     {
+        "agent_name": "Bgym-GPT-4o-V",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L1",
+        "score": 41.8,
+        "std_err": 0.4,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l2.json RENAMED Viewed

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "2024-10-23_17-10-46",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",
-        "score": 1.3,
-        "std_err": 0.7,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

 [
     {
+        "agent_name": "Bgym-GPT-4o-V",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",
+        "score": 3.8,
+        "std_err": 0.6,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l3.json RENAMED Viewed

@@ -1,8 +1,8 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "-",
-        "date_time": "2024-10-24 23:03:30",
         "benchmark": "WorkArena-L3",
         "score": 0.0,
         "std_err": 0.0,

 [
     {
+        "agent_name": "Bgym-GPT-4o-V",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",
         "score": 0.0,
         "std_err": 0.0,

results/Bgym-GPT-4o/README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ ## GPT-4o model

results/Bgym-GPT-4o/config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "agent_name": "GPT-4o",
+    "backend_llm": "GPT-4o"
+}

results/Bgym-GPT-4o/miniwob.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "MiniWoB",
+        "score": 71.3,
+        "std_err": 0.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-4o/webarena.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 23.5,
+        "std_err": 0.4,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o}/workarena-l1.json RENAMED Viewed

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "2024-10-23_14-17-40",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L1",
-        "score": 27,
-        "std_err": 2.4,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

 [
     {
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L1",
+        "score": 42.7,
+        "std_err": 0.4,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

results/{GenericAgent-Claude-3.5-Sonnet → Bgym-GPT-4o}/workarena-l2.json RENAMED Viewed

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "2024-10-23_17-10-46",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",
-        "score": 39.1,
-        "std_err": 3.2,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

 [
     {
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",
+        "score": 3.0,
+        "std_err": 0.6,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

results/{GenericAgent-GPT-o1-mini → Bgym-GPT-4o}/workarena-l3.json RENAMED Viewed

@@ -1,8 +1,8 @@
 [
     {
-        "agent_name": "GenericAgent-GPT-o1-mini",
-        "study_id": "-",
-        "date_time": "2024-10-24 23:03:30",
         "benchmark": "WorkArena-L3",
         "score": 0.0,
         "std_err": 0.0,

 [
     {
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",
         "score": 0.0,
         "std_err": 0.0,

results/Bgym-Llama-3-70b/README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ ## Llama-3-70B

results/Bgym-Llama-3-70b/config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "agent_name": "Llama-3-70B",
+    "backend_llm": "Llama-3-70B"
+}

results/Bgym-Llama-3-70b/miniwob.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "MiniWoB",
+        "score": 68.2,
+        "std_err": 0.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Llama-3-70b/webarena.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 11.0,
+        "std_err": 0.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Llama-3-70b/workarena-l1.json ADDED Viewed

	@@ -0,0 +1,58 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 17.9,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    },
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 15.9,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-04 12:06:00"
+    },
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 19.9,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-05 02:07:00"
+    },
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 17.9,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-12 12:00:00"
+    }
+]

results/Bgym-Llama-3-70b/workarena-l2.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L2",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/{GenericAgent-Claude-3.5-Sonnet → Bgym-Llama-3-70b}/workarena-l3.json RENAMED Viewed

@@ -1,11 +1,11 @@
 [
     {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "2024-10-24_18-06-57",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",
-        "score": 0.4,
-        "std_err": 0.4,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

 [
     {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",
+        "score": 0.0,
+        "std_err": 0.0,
         "benchmark_specific": "No",
         "benchmark_tuned": "No",
         "followed_evaluation_protocol": "Yes",

results/Bgym-Mixtral-8x22b/README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ ## Mixtral 8x22B

results/Bgym-Mixtral-8x22b/config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "agent_name": "Mixtral-8x22B",
+    "backend_llm": "Mixtral-8x22B"
+}

results/Bgym-Mixtral-8x22b/miniwob.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "MiniWoB",
+        "score": 62.4,
+        "std_err": 0.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Mixtral-8x22b/webarena.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 12.6,
+        "std_err": 0.9,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Mixtral-8x22b/workarena-l1.json ADDED Viewed

	@@ -0,0 +1,44 @@

+[
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 12.4,
+        "std_err": 0.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-04 12:06:00"
+    },
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 11.4,
+        "std_err": 0.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-04 12:06:00"
+    },
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 13.4,
+        "std_err": 0.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-04 12:06:00"
+    }
+]

results/Bgym-Mixtral-8x22b/workarena-l2.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L2",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Mixtral-8x22b/workarena-l3.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/GenericAgent-Claude-3.5-Sonnet/README.md DELETED Viewed

@@ -1,46 +0,0 @@
-### GenericAgent-Claude-3.5-Sonnet
-This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
-It uses Claude-3.5-sonnet as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
-```python
-BASE_FLAGS = GenericPromptFlags(
-    obs=dp.ObsFlags(
-        use_html=False,
-        use_ax_tree=True,
-        use_focused_element=True,
-        use_error_logs=True,
-        use_history=True,
-        use_past_error_logs=False,
-        use_action_history=True,
-        use_think_history=True,  # gpt-4o config except for this line
-        use_diff=False,
-        html_type="pruned_html",
-        use_screenshot=False,
-        use_som=False,
-        extract_visible_tag=True,
-        extract_clickable_tag=True,
-        extract_coords="False",
-        filter_visible_elements_only=False,
-    ),
-    action=dp.ActionFlags(
-        multi_actions=False,
-        action_set="bid",
-        long_description=False,
-        individual_examples=False,
-    ),
-    use_plan=False,
-    use_criticise=False,
-    use_thinking=True,
-    use_memory=False,
-    use_concrete_example=True,
-    use_abstract_example=True,
-    use_hints=True,
-    enable_chat=False,
-    max_prompt_tokens=40_000,
-    be_cautious=True,
-    extra_instructions=None,
-)
-```
-Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).

results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
-        "benchmark": "AssistantBench",
-        "score": 5.2,
-        "std_err": 1.5,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "Intersection of finished tasks across agents.",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-11-28 19:34:58"
-    }
-]

results/GenericAgent-Claude-3.5-Sonnet/miniwob.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "2024-10-25_06-08-16",
-        "benchmark": "MiniWoB",
-        "score": 69.8,
-        "std_err": 1.8,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2021-01-01 12:00:00"
-    }
-]

results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "22f0611d-aeea-4ee9-a533-b45442b5e080",
-        "benchmark": "VisualWebArena",
-        "score": 21.0,
-        "std_err": 1.3,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-12-02 09:11:35"
-    }
-]

results/GenericAgent-Claude-3.5-Sonnet/webarena.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae",
-        "benchmark": "WebArena",
-        "score": 36.2,
-        "std_err": 1.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-11-29 22:37:46"
-    }
-]

results/GenericAgent-Claude-3.5-Sonnet/weblinx.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
-        "benchmark": "WebLINX",
-        "score": 13.7,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2024-11-07 21:42:30"
-    }
-]

results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
-        "study_id": "2024-10-23_14-17-40",
-        "benchmark": "WorkArena-L1",
-        "score": 56.4,
-        "std_err": 2.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2021-01-01 12:00:00"
-    }
-]

results/GenericAgent-GPT-4o-mini/README.md DELETED Viewed

@@ -1,54 +0,0 @@
-### GenericAgent-GPT-4o-mini
-This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
-It uses GPT-4o-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
-```python
-BASE_FLAGS = GenericPromptFlags(
-    obs=dp.ObsFlags(
-        use_html=False,
-        use_ax_tree=True,
-        use_focused_element=True,
-        use_error_logs=True,
-        use_history=True,
-        use_past_error_logs=False,
-        use_action_history=True,
-        use_think_history=True,  # gpt-4o config except for this line
-        use_diff=False,
-        html_type="pruned_html",
-        use_screenshot=False,
-        use_som=False,
-        extract_visible_tag=True,
-        extract_clickable_tag=True,
-        extract_coords="False",
-        filter_visible_elements_only=False,
-    ),
-    action=dp.ActionFlags(
-        multi_actions=False,
-        action_set="bid",
-        long_description=False,
-        individual_examples=False,
-    ),
-    use_plan=False,
-    use_criticise=False,
-    use_thinking=True,
-    use_memory=False,
-    use_concrete_example=True,
-    use_abstract_example=True,
-    use_hints=True,
-    enable_chat=False,
-    max_prompt_tokens=40_000,
-    be_cautious=True,
-    extra_instructions=None,
-)
-```
-© Hugging Face
-TOS
-Privacy
-About
-Jobs
-Models
-Datasets
-Spaces
-Pricing
-Docs

results/GenericAgent-GPT-4o-mini/assistantbench.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
-        "date_time": "2024-11-28 19:34:58",
-        "benchmark": "AssistantBench",
-        "score": 2.1,
-        "std_err": 1.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "Intersection of finished tasks across agents.",
-        "original_or_reproduced": "Original"
-    }
-]

results/GenericAgent-GPT-4o-mini/visualwebarena.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "8d8642d3-757a-4346-ba45-01398f85b1f4",
-        "date_time": "2024-12-02 02:54:33",
-        "benchmark": "VisualWebArena",
-        "score": 16.9,
-        "std_err": 1.2,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/GenericAgent-GPT-4o-mini/webarena.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "c6bdeb87-9879-4c06-aa70-00d895001156",
-        "date_time": "2024-11-29 19:25:49",
-        "benchmark": "WebArena",
-        "score": 17.4,
-        "std_err": 1.3,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/GenericAgent-GPT-4o-mini/weblinx.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-4o-mini",
-        "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
-        "date_time": "2024-11-07 21:42:30",
-        "benchmark": "WebLINX",
-        "score": 11.6,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/GenericAgent-GPT-4o/README.md DELETED Viewed

@@ -1,46 +0,0 @@
-### GenericAgent-GPT-4o
-This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
-It uses GPT-4o as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
-```python
-BASE_FLAGS = GenericPromptFlags(
-    obs=dp.ObsFlags(
-        use_html=False,
-        use_ax_tree=True,
-        use_focused_element=True,
-        use_error_logs=True,
-        use_history=True,
-        use_past_error_logs=False,
-        use_action_history=True,
-        use_think_history=True,  # gpt-4o config except for this line
-        use_diff=False,
-        html_type="pruned_html",
-        use_screenshot=False,
-        use_som=False,
-        extract_visible_tag=True,
-        extract_clickable_tag=True,
-        extract_coords="False",
-        filter_visible_elements_only=False,
-    ),
-    action=dp.ActionFlags(
-        multi_actions=False,
-        action_set="bid",
-        long_description=False,
-        individual_examples=False,
-    ),
-    use_plan=False,
-    use_criticise=False,
-    use_thinking=True,
-    use_memory=False,
-    use_concrete_example=True,
-    use_abstract_example=True,
-    use_hints=True,
-    enable_chat=False,
-    max_prompt_tokens=40_000,
-    be_cautious=True,
-    extra_instructions=None,
-)
-```
-Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).

results/GenericAgent-GPT-4o/assistantbench.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "GenericAgent-GPT-4o",
-        "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
-        "date_time": "2024-11-28 19:34:58",
-        "benchmark": "AssistantBench",
-        "score": 4.8,
-        "std_err": 2.4,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "Intersection of finished tasks across agents.",
-        "original_or_reproduced": "Original"
-    }
-]