test-agent-2
#2
by
meghsn
- opened
This view is limited to 50 files because it contains too many changes.
See the raw diff here.
- app.py +28 -82
- results/Bgym-GPT-3.5/README.md +1 -0
- results/Bgym-GPT-3.5/config.json +4 -0
- results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/miniwob.json +4 -4
- results/Bgym-GPT-3.5/webarena.json +16 -0
- results/Bgym-GPT-3.5/workarena-l1.json +44 -0
- results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l2.json +4 -4
- results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l3.json +3 -3
- results/Bgym-GPT-4o-V/README.md +1 -0
- results/Bgym-GPT-4o-V/config.json +4 -0
- results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/miniwob.json +4 -4
- results/Bgym-GPT-4o-V/webarena.json +16 -0
- results/{GenericAgent-GPT-4o → Bgym-GPT-4o-V}/workarena-l1.json +4 -4
- results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l2.json +4 -4
- results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l3.json +3 -3
- results/Bgym-GPT-4o/README.md +1 -0
- results/Bgym-GPT-4o/config.json +4 -0
- results/Bgym-GPT-4o/miniwob.json +16 -0
- results/Bgym-GPT-4o/webarena.json +16 -0
- results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o}/workarena-l1.json +4 -4
- results/{GenericAgent-Claude-3.5-Sonnet → Bgym-GPT-4o}/workarena-l2.json +4 -4
- results/{GenericAgent-GPT-o1-mini → Bgym-GPT-4o}/workarena-l3.json +3 -3
- results/Bgym-Llama-3-70b/README.md +1 -0
- results/Bgym-Llama-3-70b/config.json +4 -0
- results/Bgym-Llama-3-70b/miniwob.json +16 -0
- results/Bgym-Llama-3-70b/webarena.json +16 -0
- results/Bgym-Llama-3-70b/workarena-l1.json +58 -0
- results/Bgym-Llama-3-70b/workarena-l2.json +16 -0
- results/{GenericAgent-Claude-3.5-Sonnet → Bgym-Llama-3-70b}/workarena-l3.json +4 -4
- results/Bgym-Mixtral-8x22b/README.md +1 -0
- results/Bgym-Mixtral-8x22b/config.json +4 -0
- results/Bgym-Mixtral-8x22b/miniwob.json +16 -0
- results/Bgym-Mixtral-8x22b/webarena.json +16 -0
- results/Bgym-Mixtral-8x22b/workarena-l1.json +44 -0
- results/Bgym-Mixtral-8x22b/workarena-l2.json +16 -0
- results/Bgym-Mixtral-8x22b/workarena-l3.json +16 -0
- results/GenericAgent-Claude-3.5-Sonnet/README.md +0 -46
- results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json +0 -16
- results/GenericAgent-Claude-3.5-Sonnet/miniwob.json +0 -16
- results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json +0 -16
- results/GenericAgent-Claude-3.5-Sonnet/webarena.json +0 -16
- results/GenericAgent-Claude-3.5-Sonnet/weblinx.json +0 -16
- results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json +0 -16
- results/GenericAgent-GPT-4o-mini/README.md +0 -54
- results/GenericAgent-GPT-4o-mini/assistantbench.json +0 -16
- results/GenericAgent-GPT-4o-mini/visualwebarena.json +0 -16
- results/GenericAgent-GPT-4o-mini/webarena.json +0 -16
- results/GenericAgent-GPT-4o-mini/weblinx.json +0 -16
- results/GenericAgent-GPT-4o/README.md +0 -46
- results/GenericAgent-GPT-4o/assistantbench.json +0 -16
app.py
CHANGED
@@ -9,7 +9,6 @@ import plotly.graph_objs as go
|
|
9 |
from huggingface_hub import HfApi
|
10 |
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
|
11 |
import streamlit.components.v1 as components
|
12 |
-
from datetime import datetime
|
13 |
|
14 |
from urllib.parse import quote
|
15 |
from pathlib import Path
|
@@ -17,7 +16,7 @@ import re
|
|
17 |
import html
|
18 |
from typing import Dict, Any
|
19 |
|
20 |
-
BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB",
|
21 |
|
22 |
def sanitize_agent_name(agent_name):
|
23 |
# Only allow alphanumeric chars, hyphen, underscore
|
@@ -44,34 +43,12 @@ def sanitize_column_name(col: str) -> str:
|
|
44 |
return html.escape(str(col))
|
45 |
|
46 |
def sanitize_cell_value(value: Any) -> str:
|
|
|
47 |
if isinstance(value, (int, float)):
|
48 |
return str(value)
|
49 |
-
if isinstance(value, str) and 'Β±' in value:
|
50 |
-
score, std_err = value.split('Β±')
|
51 |
-
return f'{score.strip()} <span style="font-size: smaller; color: var(--lighter-color);">Β±{std_err.strip()}</span>'
|
52 |
return html.escape(str(value))
|
53 |
|
54 |
def create_html_table_main(df):
|
55 |
-
col1, col2 = st.columns([2,6])
|
56 |
-
with col1:
|
57 |
-
sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("WebArena"), key="main_sort_column")
|
58 |
-
with col2:
|
59 |
-
sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key="main_sort_order")
|
60 |
-
|
61 |
-
def get_sort_value(row):
|
62 |
-
if row == "-":
|
63 |
-
return float('-inf')
|
64 |
-
else:
|
65 |
-
try:
|
66 |
-
return float(row)
|
67 |
-
except ValueError:
|
68 |
-
return row
|
69 |
-
|
70 |
-
# Sort dataframe
|
71 |
-
if sort_order == "Ascending":
|
72 |
-
df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
|
73 |
-
else:
|
74 |
-
df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
|
75 |
html = '''
|
76 |
<style>
|
77 |
table {
|
@@ -110,28 +87,7 @@ def create_html_table_main(df):
|
|
110 |
html += '</div>'
|
111 |
return html
|
112 |
|
113 |
-
def create_html_table_benchmark(df
|
114 |
-
col1, col2 = st.columns([2,6])
|
115 |
-
with col1:
|
116 |
-
sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("Score"), key=f"benchmark_sort_column_{benchmark}")
|
117 |
-
with col2:
|
118 |
-
sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key=f"benchmark_sort_order_{benchmark}")
|
119 |
-
|
120 |
-
def get_sort_value(row):
|
121 |
-
if row == "-":
|
122 |
-
return float('-inf')
|
123 |
-
else:
|
124 |
-
try:
|
125 |
-
return float(row)
|
126 |
-
except ValueError:
|
127 |
-
return row
|
128 |
-
|
129 |
-
# Sort dataframe
|
130 |
-
if sort_order == "Ascending":
|
131 |
-
df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
|
132 |
-
else:
|
133 |
-
df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
|
134 |
-
|
135 |
html = '''
|
136 |
<style>
|
137 |
table {
|
@@ -155,9 +111,8 @@ def create_html_table_benchmark(df, benchmark):
|
|
155 |
html += '<table>'
|
156 |
html += '<thead><tr>'
|
157 |
for column in df.columns:
|
158 |
-
if column
|
159 |
-
|
160 |
-
html += f'<th>{sanitize_column_name(column)}</th>'
|
161 |
html += '</tr></thead>'
|
162 |
html += '<tbody>'
|
163 |
for _, row in df.iterrows():
|
@@ -170,11 +125,8 @@ def create_html_table_benchmark(df, benchmark):
|
|
170 |
summary = sanitize_cell_value(row[column])
|
171 |
details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
|
172 |
html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
|
173 |
-
elif column == "Reproduced_all"
|
174 |
continue
|
175 |
-
elif column == "Score":
|
176 |
-
score_with_std_err = f'{row[column]} Β± {row["std_err"]}'
|
177 |
-
html += f'<td>{sanitize_cell_value(score_with_std_err)}</td>'
|
178 |
else:
|
179 |
html += f'<td>{sanitize_cell_value(row[column])}</td>'
|
180 |
html += '</tr>'
|
@@ -209,19 +161,6 @@ def check_sanity(agent):
|
|
209 |
|
210 |
def main():
|
211 |
st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
|
212 |
-
st.markdown("""
|
213 |
-
<style>
|
214 |
-
:root {
|
215 |
-
--lighter-color: #888; /* Default for light theme */
|
216 |
-
}
|
217 |
-
@media (prefers-color-scheme: dark) {
|
218 |
-
:root {
|
219 |
-
--lighter-color: #ccc; /* Default for dark theme */
|
220 |
-
}
|
221 |
-
}
|
222 |
-
</style>
|
223 |
-
""", unsafe_allow_html=True)
|
224 |
-
|
225 |
st.markdown("""
|
226 |
<head>
|
227 |
<meta http-equiv="Content-Security-Policy"
|
@@ -244,10 +183,7 @@ def main():
|
|
244 |
continue
|
245 |
agent_results = []
|
246 |
for benchmark in BENCHMARKS:
|
247 |
-
|
248 |
-
if not file_path.is_file():
|
249 |
-
continue
|
250 |
-
with open(file_path) as f:
|
251 |
agent_results.extend(json.load(f))
|
252 |
all_results[agent] = agent_results
|
253 |
|
@@ -281,9 +217,11 @@ def main():
|
|
281 |
if dfs_to_concat:
|
282 |
df = pd.concat(dfs_to_concat, ignore_index=True)
|
283 |
|
284 |
-
for
|
285 |
-
|
286 |
-
|
|
|
|
|
287 |
# Add a search bar
|
288 |
search_query = st.text_input("Search agents", "", key="search_main")
|
289 |
|
@@ -302,6 +240,14 @@ def main():
|
|
302 |
return ""
|
303 |
|
304 |
df['Agent'] = df['Agent'].apply(make_hyperlink)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
html_table = create_html_table_main(df)
|
306 |
st.markdown(html_table, unsafe_allow_html=True)
|
307 |
|
@@ -449,21 +395,18 @@ MIT
|
|
449 |
for value in values:
|
450 |
if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original":
|
451 |
result_dict["Score"] = value["score"]
|
452 |
-
result_dict["std_err"] = value["std_err"]
|
453 |
result_dict["Benchmark Specific"] = value["benchmark_specific"]
|
454 |
result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
|
455 |
result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
|
456 |
result_dict["Reproducible"] = value["reproducible"]
|
457 |
result_dict["Comments"] = value["comments"]
|
458 |
result_dict["Study ID"] = value["study_id"]
|
459 |
-
value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
|
460 |
result_dict["Date"] = value["date_time"]
|
461 |
result_dict["Reproduced"] = []
|
462 |
result_dict["Reproduced_all"] = []
|
463 |
flag = 1
|
464 |
if not flag:
|
465 |
result_dict["Score"] = "-"
|
466 |
-
result_dict["std_err"] = "-"
|
467 |
result_dict["Benchmark Specific"] = "-"
|
468 |
result_dict["Benchmark Tuned"] = "-"
|
469 |
result_dict["Followed Evaluation Protocol"] = "-"
|
@@ -475,7 +418,6 @@ MIT
|
|
475 |
result_dict["Reproduced_all"] = []
|
476 |
if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Reproduced":
|
477 |
result_dict["Reproduced"].append(value["score"])
|
478 |
-
value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
|
479 |
result_dict["Reproduced_all"].append(", ".join([str(value["score"]), str(value["date_time"])]))
|
480 |
if result_dict["Reproduced"]:
|
481 |
result_dict["Reproduced"] = str(min(result_dict["Reproduced"])) + " - " + str(max(result_dict["Reproduced"]))
|
@@ -493,10 +435,14 @@ MIT
|
|
493 |
# Concatenate the DataFrames
|
494 |
if dfs_to_concat:
|
495 |
df_ = pd.concat(dfs_to_concat, ignore_index=True)
|
496 |
-
|
497 |
-
|
498 |
-
df_
|
499 |
-
|
|
|
|
|
|
|
|
|
500 |
st.markdown(html_table, unsafe_allow_html=True)
|
501 |
|
502 |
|
|
|
9 |
from huggingface_hub import HfApi
|
10 |
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
|
11 |
import streamlit.components.v1 as components
|
|
|
12 |
|
13 |
from urllib.parse import quote
|
14 |
from pathlib import Path
|
|
|
16 |
import html
|
17 |
from typing import Dict, Any
|
18 |
|
19 |
+
BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB",]
|
20 |
|
21 |
def sanitize_agent_name(agent_name):
|
22 |
# Only allow alphanumeric chars, hyphen, underscore
|
|
|
43 |
return html.escape(str(col))
|
44 |
|
45 |
def sanitize_cell_value(value: Any) -> str:
|
46 |
+
"""Sanitize cell values for HTML display"""
|
47 |
if isinstance(value, (int, float)):
|
48 |
return str(value)
|
|
|
|
|
|
|
49 |
return html.escape(str(value))
|
50 |
|
51 |
def create_html_table_main(df):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
html = '''
|
53 |
<style>
|
54 |
table {
|
|
|
87 |
html += '</div>'
|
88 |
return html
|
89 |
|
90 |
+
def create_html_table_benchmark(df):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
html = '''
|
92 |
<style>
|
93 |
table {
|
|
|
111 |
html += '<table>'
|
112 |
html += '<thead><tr>'
|
113 |
for column in df.columns:
|
114 |
+
if column != "Reproduced_all":
|
115 |
+
html += f'<th>{sanitize_column_name(column)}</th>'
|
|
|
116 |
html += '</tr></thead>'
|
117 |
html += '<tbody>'
|
118 |
for _, row in df.iterrows():
|
|
|
125 |
summary = sanitize_cell_value(row[column])
|
126 |
details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
|
127 |
html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
|
128 |
+
elif column == "Reproduced_all":
|
129 |
continue
|
|
|
|
|
|
|
130 |
else:
|
131 |
html += f'<td>{sanitize_cell_value(row[column])}</td>'
|
132 |
html += '</tr>'
|
|
|
161 |
|
162 |
def main():
|
163 |
st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
st.markdown("""
|
165 |
<head>
|
166 |
<meta http-equiv="Content-Security-Policy"
|
|
|
183 |
continue
|
184 |
agent_results = []
|
185 |
for benchmark in BENCHMARKS:
|
186 |
+
with open(f"results/{agent}/{benchmark.lower()}.json") as f:
|
|
|
|
|
|
|
187 |
agent_results.extend(json.load(f))
|
188 |
all_results[agent] = agent_results
|
189 |
|
|
|
217 |
if dfs_to_concat:
|
218 |
df = pd.concat(dfs_to_concat, ignore_index=True)
|
219 |
|
220 |
+
# df['Average'] = sum(df[column] for column in BENCHMARKS)/len(BENCHMARKS)
|
221 |
+
# df['Average'] = df['Average'].round(2)
|
222 |
+
# Sort values
|
223 |
+
df = df.sort_values(by='WebArena', ascending=False)
|
224 |
+
|
225 |
# Add a search bar
|
226 |
search_query = st.text_input("Search agents", "", key="search_main")
|
227 |
|
|
|
240 |
return ""
|
241 |
|
242 |
df['Agent'] = df['Agent'].apply(make_hyperlink)
|
243 |
+
# st.dataframe(
|
244 |
+
# df[['Agent'] + BENCHMARKS],
|
245 |
+
# use_container_width=True,
|
246 |
+
# column_config={benchmark: {'alignment': 'center'} for benchmark in BENCHMARKS},
|
247 |
+
# hide_index=True,
|
248 |
+
# # height=int(len(df) * 36.2),
|
249 |
+
# )
|
250 |
+
# st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
|
251 |
html_table = create_html_table_main(df)
|
252 |
st.markdown(html_table, unsafe_allow_html=True)
|
253 |
|
|
|
395 |
for value in values:
|
396 |
if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original":
|
397 |
result_dict["Score"] = value["score"]
|
|
|
398 |
result_dict["Benchmark Specific"] = value["benchmark_specific"]
|
399 |
result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
|
400 |
result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
|
401 |
result_dict["Reproducible"] = value["reproducible"]
|
402 |
result_dict["Comments"] = value["comments"]
|
403 |
result_dict["Study ID"] = value["study_id"]
|
|
|
404 |
result_dict["Date"] = value["date_time"]
|
405 |
result_dict["Reproduced"] = []
|
406 |
result_dict["Reproduced_all"] = []
|
407 |
flag = 1
|
408 |
if not flag:
|
409 |
result_dict["Score"] = "-"
|
|
|
410 |
result_dict["Benchmark Specific"] = "-"
|
411 |
result_dict["Benchmark Tuned"] = "-"
|
412 |
result_dict["Followed Evaluation Protocol"] = "-"
|
|
|
418 |
result_dict["Reproduced_all"] = []
|
419 |
if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Reproduced":
|
420 |
result_dict["Reproduced"].append(value["score"])
|
|
|
421 |
result_dict["Reproduced_all"].append(", ".join([str(value["score"]), str(value["date_time"])]))
|
422 |
if result_dict["Reproduced"]:
|
423 |
result_dict["Reproduced"] = str(min(result_dict["Reproduced"])) + " - " + str(max(result_dict["Reproduced"]))
|
|
|
435 |
# Concatenate the DataFrames
|
436 |
if dfs_to_concat:
|
437 |
df_ = pd.concat(dfs_to_concat, ignore_index=True)
|
438 |
+
# st.markdown(f"<h2 id='{benchmark.lower()}'>{benchmark}</h2>", unsafe_allow_html=True)
|
439 |
+
# st.dataframe(
|
440 |
+
# df_,
|
441 |
+
# use_container_width=True,
|
442 |
+
# column_config={benchmark: {'alignment': 'center'}},
|
443 |
+
# hide_index=True,
|
444 |
+
# )
|
445 |
+
html_table = create_html_table_benchmark(df_)
|
446 |
st.markdown(html_table, unsafe_allow_html=True)
|
447 |
|
448 |
|
results/Bgym-GPT-3.5/README.md
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
## GPT-3.5 model
|
results/Bgym-GPT-3.5/config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"agent_name": "GPT-3.5",
|
3 |
+
"backend_llm": "GPT-3.5"
|
4 |
+
}
|
results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/miniwob.json
RENAMED
@@ -1,11 +1,11 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
7 |
-
"score":
|
8 |
-
"std_err": 1
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "Bgym-GPT-3.5",
|
4 |
+
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
7 |
+
"score": 43.4,
|
8 |
+
"std_err": 0.1,
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
results/Bgym-GPT-3.5/webarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-3.5",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WebArena",
|
7 |
+
"score": 6.7,
|
8 |
+
"std_err": 0.2,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-GPT-3.5/workarena-l1.json
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-3.5",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WorkArena-L1",
|
7 |
+
"score": 6.1,
|
8 |
+
"std_err": 0.3,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
},
|
16 |
+
{
|
17 |
+
"agent_name": "Bgym-GPT-3.5",
|
18 |
+
"study_id": "study_id",
|
19 |
+
"benchmark": "WorkArena-L1",
|
20 |
+
"score": 5.7,
|
21 |
+
"std_err": 0.3,
|
22 |
+
"benchmark_specific": "No",
|
23 |
+
"benchmark_tuned": "No",
|
24 |
+
"followed_evaluation_protocol": "Yes",
|
25 |
+
"reproducible": "Yes",
|
26 |
+
"comments": "NA",
|
27 |
+
"original_or_reproduced": "Reproduced",
|
28 |
+
"date_time": "2021-01-04 12:06:00"
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"benchmark": "WorkArena-L1",
|
32 |
+
"agent_name": "Bgym-GPT-3.5",
|
33 |
+
"study_id": "study_id",
|
34 |
+
"score": 5.1,
|
35 |
+
"std_err": 0.3,
|
36 |
+
"benchmark_specific": "No",
|
37 |
+
"benchmark_tuned": "No",
|
38 |
+
"followed_evaluation_protocol": "Yes",
|
39 |
+
"reproducible": "Yes",
|
40 |
+
"comments": "NA",
|
41 |
+
"original_or_reproduced": "Reproduced",
|
42 |
+
"date_time": "2021-01-04 12:06:00"
|
43 |
+
}
|
44 |
+
]
|
results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l2.json
RENAMED
@@ -1,11 +1,11 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
-
"score":
|
8 |
-
"std_err":
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "Bgym-GPT-3.5",
|
4 |
+
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
+
"score": 0.0,
|
8 |
+
"std_err": 0.0,
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l3.json
RENAMED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
"score": 0.0,
|
8 |
"std_err": 0.0,
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "Bgym-GPT-3.5",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
"score": 0.0,
|
8 |
"std_err": 0.0,
|
results/Bgym-GPT-4o-V/README.md
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
## GPT-4o-V model
|
results/Bgym-GPT-4o-V/config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"agent_name": "GPT-4o-V",
|
3 |
+
"backend_llm": "GPT-4o-V"
|
4 |
+
}
|
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/miniwob.json
RENAMED
@@ -1,11 +1,11 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
7 |
-
"score":
|
8 |
-
"std_err":
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "Bgym-GPT-4o-V",
|
4 |
+
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
7 |
+
"score": 72.5,
|
8 |
+
"std_err": 0.5,
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
results/Bgym-GPT-4o-V/webarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-4o-V",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WebArena",
|
7 |
+
"score": 24.0,
|
8 |
+
"std_err": 0.4,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/{GenericAgent-GPT-4o → Bgym-GPT-4o-V}/workarena-l1.json
RENAMED
@@ -1,11 +1,11 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L1",
|
7 |
-
"score":
|
8 |
-
"std_err":
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "Bgym-GPT-4o-V",
|
4 |
+
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L1",
|
7 |
+
"score": 41.8,
|
8 |
+
"std_err": 0.4,
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l2.json
RENAMED
@@ -1,11 +1,11 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
-
"score":
|
8 |
-
"std_err": 0.
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "Bgym-GPT-4o-V",
|
4 |
+
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
+
"score": 3.8,
|
8 |
+
"std_err": 0.6,
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l3.json
RENAMED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
"score": 0.0,
|
8 |
"std_err": 0.0,
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "Bgym-GPT-4o-V",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
"score": 0.0,
|
8 |
"std_err": 0.0,
|
results/Bgym-GPT-4o/README.md
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
## GPT-4o model
|
results/Bgym-GPT-4o/config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"agent_name": "GPT-4o",
|
3 |
+
"backend_llm": "GPT-4o"
|
4 |
+
}
|
results/Bgym-GPT-4o/miniwob.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-4o",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "MiniWoB",
|
7 |
+
"score": 71.3,
|
8 |
+
"std_err": 0.5,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-GPT-4o/webarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-4o",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WebArena",
|
7 |
+
"score": 23.5,
|
8 |
+
"std_err": 0.4,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o}/workarena-l1.json
RENAMED
@@ -1,11 +1,11 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L1",
|
7 |
-
"score":
|
8 |
-
"std_err":
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "Bgym-GPT-4o",
|
4 |
+
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L1",
|
7 |
+
"score": 42.7,
|
8 |
+
"std_err": 0.4,
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
results/{GenericAgent-Claude-3.5-Sonnet → Bgym-GPT-4o}/workarena-l2.json
RENAMED
@@ -1,11 +1,11 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
-
"score":
|
8 |
-
"std_err":
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "Bgym-GPT-4o",
|
4 |
+
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
7 |
+
"score": 3.0,
|
8 |
+
"std_err": 0.6,
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
results/{GenericAgent-GPT-o1-mini → Bgym-GPT-4o}/workarena-l3.json
RENAMED
@@ -1,8 +1,8 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
-
"study_id": "
|
5 |
-
"date_time": "
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
"score": 0.0,
|
8 |
"std_err": 0.0,
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "Bgym-GPT-4o",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
"score": 0.0,
|
8 |
"std_err": 0.0,
|
results/Bgym-Llama-3-70b/README.md
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
### Llama-3-70B
|
results/Bgym-Llama-3-70b/config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"agent_name": "Llama-3-70B",
|
3 |
+
"backend_llm": "Llama-3-70B"
|
4 |
+
}
|
results/Bgym-Llama-3-70b/miniwob.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3-70b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "MiniWoB",
|
7 |
+
"score": 68.2,
|
8 |
+
"std_err": 0.7,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Llama-3-70b/webarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3-70b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WebArena",
|
7 |
+
"score": 11.0,
|
8 |
+
"std_err": 0.3,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Llama-3-70b/workarena-l1.json
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3-70b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "WorkArena-L1",
|
6 |
+
"score": 17.9,
|
7 |
+
"std_err": 0.6,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
},
|
16 |
+
{
|
17 |
+
"agent_name": "Bgym-Llama-3-70b",
|
18 |
+
"study_id": "study_id",
|
19 |
+
"benchmark": "WorkArena-L1",
|
20 |
+
"score": 15.9,
|
21 |
+
"std_err": 0.6,
|
22 |
+
"benchmark_specific": "No",
|
23 |
+
"benchmark_tuned": "No",
|
24 |
+
"followed_evaluation_protocol": "Yes",
|
25 |
+
"reproducible": "Yes",
|
26 |
+
"comments": "NA",
|
27 |
+
"original_or_reproduced": "Reproduced",
|
28 |
+
"date_time": "2021-01-04 12:06:00"
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"agent_name": "Bgym-Llama-3-70b",
|
32 |
+
"study_id": "study_id",
|
33 |
+
"benchmark": "WorkArena-L1",
|
34 |
+
"score": 19.9,
|
35 |
+
"std_err": 0.6,
|
36 |
+
"benchmark_specific": "No",
|
37 |
+
"benchmark_tuned": "No",
|
38 |
+
"followed_evaluation_protocol": "Yes",
|
39 |
+
"reproducible": "Yes",
|
40 |
+
"comments": "NA",
|
41 |
+
"original_or_reproduced": "Reproduced",
|
42 |
+
"date_time": "2021-01-05 2:07:00"
|
43 |
+
},
|
44 |
+
{
|
45 |
+
"agent_name": "Bgym-Llama-3-70b",
|
46 |
+
"study_id": "study_id",
|
47 |
+
"benchmark": "WorkArena-L1",
|
48 |
+
"score": 17.9,
|
49 |
+
"std_err": 0.6,
|
50 |
+
"benchmark_specific": "No",
|
51 |
+
"benchmark_tuned": "No",
|
52 |
+
"followed_evaluation_protocol": "Yes",
|
53 |
+
"reproducible": "Yes",
|
54 |
+
"comments": "NA",
|
55 |
+
"original_or_reproduced": "Reproduced",
|
56 |
+
"date_time": "2021-01-12 12:00:00"
|
57 |
+
}
|
58 |
+
]
|
results/Bgym-Llama-3-70b/workarena-l2.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3-70b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WorkArena-L2",
|
7 |
+
"score": 0.0,
|
8 |
+
"std_err": 0.0,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/{GenericAgent-Claude-3.5-Sonnet → Bgym-Llama-3-70b}/workarena-l3.json
RENAMED
@@ -1,11 +1,11 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
-
"study_id": "
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
-
"score": 0.
|
8 |
-
"std_err": 0.
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "Bgym-Llama-3-70b",
|
4 |
+
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L3",
|
7 |
+
"score": 0.0,
|
8 |
+
"std_err": 0.0,
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
results/Bgym-Mixtral-8x22b/README.md
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
## Mixtral 8x22B
|
results/Bgym-Mixtral-8x22b/config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"agent_name": "Mixtral-8x22B",
|
3 |
+
"backend_llm": "Mixtral-8x22B"
|
4 |
+
}
|
results/Bgym-Mixtral-8x22b/miniwob.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Mixtral-8x22b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "MiniWoB",
|
7 |
+
"score": 62.4,
|
8 |
+
"std_err": 0.5,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Mixtral-8x22b/webarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Mixtral-8x22b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WebArena",
|
7 |
+
"score": 12.6,
|
8 |
+
"std_err": 0.9,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Mixtral-8x22b/workarena-l1.json
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Mixtral-8x22b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "WorkArena-L1",
|
6 |
+
"score": 12.4,
|
7 |
+
"std_err": 0.7,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-04 12:06:00"
|
15 |
+
},
|
16 |
+
{
|
17 |
+
"agent_name": "Bgym-Mixtral-8x22b",
|
18 |
+
"study_id": "study_id",
|
19 |
+
"benchmark": "WorkArena-L1",
|
20 |
+
"score": 11.4,
|
21 |
+
"std_err": 0.7,
|
22 |
+
"benchmark_specific": "No",
|
23 |
+
"benchmark_tuned": "No",
|
24 |
+
"followed_evaluation_protocol": "Yes",
|
25 |
+
"reproducible": "Yes",
|
26 |
+
"comments": "NA",
|
27 |
+
"original_or_reproduced": "Reproduced",
|
28 |
+
"date_time": "2021-01-04 12:06:00"
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"agent_name": "Bgym-Mixtral-8x22b",
|
32 |
+
"study_id": "study_id",
|
33 |
+
"benchmark": "WorkArena-L1",
|
34 |
+
"score": 13.4,
|
35 |
+
"std_err": 0.7,
|
36 |
+
"benchmark_specific": "No",
|
37 |
+
"benchmark_tuned": "No",
|
38 |
+
"followed_evaluation_protocol": "Yes",
|
39 |
+
"reproducible": "Yes",
|
40 |
+
"comments": "NA",
|
41 |
+
"original_or_reproduced": "Reproduced",
|
42 |
+
"date_time": "2021-01-04 12:06:00"
|
43 |
+
}
|
44 |
+
]
|
results/Bgym-Mixtral-8x22b/workarena-l2.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Mixtral-8x22b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WorkArena-L2",
|
7 |
+
"score": 0.0,
|
8 |
+
"std_err": 0.0,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Mixtral-8x22b/workarena-l3.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Mixtral-8x22b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WorkArena-L3",
|
7 |
+
"score": 0.0,
|
8 |
+
"std_err": 0.0,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/GenericAgent-Claude-3.5-Sonnet/README.md
DELETED
@@ -1,46 +0,0 @@
|
|
1 |
-
### GenericAgent-Claude-3.5-Sonnet
|
2 |
-
|
3 |
-
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
4 |
-
|
5 |
-
It uses Claude-3.5-sonnet as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
6 |
-
```python
|
7 |
-
BASE_FLAGS = GenericPromptFlags(
|
8 |
-
obs=dp.ObsFlags(
|
9 |
-
use_html=False,
|
10 |
-
use_ax_tree=True,
|
11 |
-
use_focused_element=True,
|
12 |
-
use_error_logs=True,
|
13 |
-
use_history=True,
|
14 |
-
use_past_error_logs=False,
|
15 |
-
use_action_history=True,
|
16 |
-
use_think_history=True, # gpt-4o config except for this line
|
17 |
-
use_diff=False,
|
18 |
-
html_type="pruned_html",
|
19 |
-
use_screenshot=False,
|
20 |
-
use_som=False,
|
21 |
-
extract_visible_tag=True,
|
22 |
-
extract_clickable_tag=True,
|
23 |
-
extract_coords="False",
|
24 |
-
filter_visible_elements_only=False,
|
25 |
-
),
|
26 |
-
action=dp.ActionFlags(
|
27 |
-
multi_actions=False,
|
28 |
-
action_set="bid",
|
29 |
-
long_description=False,
|
30 |
-
individual_examples=False,
|
31 |
-
),
|
32 |
-
use_plan=False,
|
33 |
-
use_criticise=False,
|
34 |
-
use_thinking=True,
|
35 |
-
use_memory=False,
|
36 |
-
use_concrete_example=True,
|
37 |
-
use_abstract_example=True,
|
38 |
-
use_hints=True,
|
39 |
-
enable_chat=False,
|
40 |
-
max_prompt_tokens=40_000,
|
41 |
-
be_cautious=True,
|
42 |
-
extra_instructions=None,
|
43 |
-
)
|
44 |
-
```
|
45 |
-
|
46 |
-
Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM suports it).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
-
"study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
|
5 |
-
"benchmark": "AssistantBench",
|
6 |
-
"score": 5.2,
|
7 |
-
"std_err": 1.5,
|
8 |
-
"benchmark_specific": "No",
|
9 |
-
"benchmark_tuned": "No",
|
10 |
-
"followed_evaluation_protocol": "Yes",
|
11 |
-
"reproducible": "Yes",
|
12 |
-
"comments": "Intersection of finished tasks across agents.",
|
13 |
-
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "2024-11-28 19:34:58"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-Claude-3.5-Sonnet/miniwob.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
-
"study_id": "2024-10-25_06-08-16",
|
5 |
-
"benchmark": "MiniWoB",
|
6 |
-
"score": 69.8,
|
7 |
-
"std_err": 1.8,
|
8 |
-
"benchmark_specific": "No",
|
9 |
-
"benchmark_tuned": "No",
|
10 |
-
"followed_evaluation_protocol": "Yes",
|
11 |
-
"reproducible": "Yes",
|
12 |
-
"comments": "NA",
|
13 |
-
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "2021-01-01 12:00:00"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
-
"study_id": "22f0611d-aeea-4ee9-a533-b45442b5e080",
|
5 |
-
"benchmark": "VisualWebArena",
|
6 |
-
"score": 21.0,
|
7 |
-
"std_err": 1.3,
|
8 |
-
"benchmark_specific": "No",
|
9 |
-
"benchmark_tuned": "No",
|
10 |
-
"followed_evaluation_protocol": "Yes",
|
11 |
-
"reproducible": "Yes",
|
12 |
-
"comments": "NA",
|
13 |
-
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "2024-12-02 09:11:35"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-Claude-3.5-Sonnet/webarena.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
-
"study_id": "b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae",
|
5 |
-
"benchmark": "WebArena",
|
6 |
-
"score": 36.2,
|
7 |
-
"std_err": 1.7,
|
8 |
-
"benchmark_specific": "No",
|
9 |
-
"benchmark_tuned": "No",
|
10 |
-
"followed_evaluation_protocol": "Yes",
|
11 |
-
"reproducible": "Yes",
|
12 |
-
"comments": "NA",
|
13 |
-
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "2024-11-29 22:37:46"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-Claude-3.5-Sonnet/weblinx.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
-
"study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
|
5 |
-
"benchmark": "WebLINX",
|
6 |
-
"score": 13.7,
|
7 |
-
"std_err": 0.6,
|
8 |
-
"benchmark_specific": "No",
|
9 |
-
"benchmark_tuned": "No",
|
10 |
-
"followed_evaluation_protocol": "Yes",
|
11 |
-
"reproducible": "Yes",
|
12 |
-
"comments": "NA",
|
13 |
-
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "2024-11-07 21:42:30"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
-
"study_id": "2024-10-23_14-17-40",
|
5 |
-
"benchmark": "WorkArena-L1",
|
6 |
-
"score": 56.4,
|
7 |
-
"std_err": 2.7,
|
8 |
-
"benchmark_specific": "No",
|
9 |
-
"benchmark_tuned": "No",
|
10 |
-
"followed_evaluation_protocol": "Yes",
|
11 |
-
"reproducible": "Yes",
|
12 |
-
"comments": "NA",
|
13 |
-
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "2021-01-01 12:00:00"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-GPT-4o-mini/README.md
DELETED
@@ -1,54 +0,0 @@
|
|
1 |
-
### GenericAgent-GPT-4o-mini
|
2 |
-
|
3 |
-
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
4 |
-
|
5 |
-
It uses GPT-4o-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
6 |
-
```python
|
7 |
-
BASE_FLAGS = GenericPromptFlags(
|
8 |
-
obs=dp.ObsFlags(
|
9 |
-
use_html=False,
|
10 |
-
use_ax_tree=True,
|
11 |
-
use_focused_element=True,
|
12 |
-
use_error_logs=True,
|
13 |
-
use_history=True,
|
14 |
-
use_past_error_logs=False,
|
15 |
-
use_action_history=True,
|
16 |
-
use_think_history=True, # gpt-4o config except for this line
|
17 |
-
use_diff=False,
|
18 |
-
html_type="pruned_html",
|
19 |
-
use_screenshot=False,
|
20 |
-
use_som=False,
|
21 |
-
extract_visible_tag=True,
|
22 |
-
extract_clickable_tag=True,
|
23 |
-
extract_coords="False",
|
24 |
-
filter_visible_elements_only=False,
|
25 |
-
),
|
26 |
-
action=dp.ActionFlags(
|
27 |
-
multi_actions=False,
|
28 |
-
action_set="bid",
|
29 |
-
long_description=False,
|
30 |
-
individual_examples=False,
|
31 |
-
),
|
32 |
-
use_plan=False,
|
33 |
-
use_criticise=False,
|
34 |
-
use_thinking=True,
|
35 |
-
use_memory=False,
|
36 |
-
use_concrete_example=True,
|
37 |
-
use_abstract_example=True,
|
38 |
-
use_hints=True,
|
39 |
-
enable_chat=False,
|
40 |
-
max_prompt_tokens=40_000,
|
41 |
-
be_cautious=True,
|
42 |
-
extra_instructions=None,
|
43 |
-
)
|
44 |
-
```
|
45 |
-
© Hugging Face
|
46 |
-
TOS
|
47 |
-
Privacy
|
48 |
-
About
|
49 |
-
Jobs
|
50 |
-
Models
|
51 |
-
Datasets
|
52 |
-
Spaces
|
53 |
-
Pricing
|
54 |
-
Docs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-GPT-4o-mini/assistantbench.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
-
"study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
|
5 |
-
"date_time": "2024-11-28 19:34:58",
|
6 |
-
"benchmark": "AssistantBench",
|
7 |
-
"score": 2.1,
|
8 |
-
"std_err": 1.0,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "Intersection of finished tasks across agents.",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-GPT-4o-mini/visualwebarena.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
-
"study_id": "8d8642d3-757a-4346-ba45-01398f85b1f4",
|
5 |
-
"date_time": "2024-12-02 02:54:33",
|
6 |
-
"benchmark": "VisualWebArena",
|
7 |
-
"score": 16.9,
|
8 |
-
"std_err": 1.2,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-GPT-4o-mini/webarena.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
-
"study_id": "c6bdeb87-9879-4c06-aa70-00d895001156",
|
5 |
-
"date_time": "2024-11-29 19:25:49",
|
6 |
-
"benchmark": "WebArena",
|
7 |
-
"score": 17.4,
|
8 |
-
"std_err": 1.3,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-GPT-4o-mini/weblinx.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
-
"study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
|
5 |
-
"date_time": "2024-11-07 21:42:30",
|
6 |
-
"benchmark": "WebLINX",
|
7 |
-
"score": 11.6,
|
8 |
-
"std_err": 0.6,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-GPT-4o/README.md
DELETED
@@ -1,46 +0,0 @@
|
|
1 |
-
### GenericAgent-GPT-4o
|
2 |
-
|
3 |
-
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
4 |
-
|
5 |
-
It uses GPT-4o as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
6 |
-
```python
|
7 |
-
BASE_FLAGS = GenericPromptFlags(
|
8 |
-
obs=dp.ObsFlags(
|
9 |
-
use_html=False,
|
10 |
-
use_ax_tree=True,
|
11 |
-
use_focused_element=True,
|
12 |
-
use_error_logs=True,
|
13 |
-
use_history=True,
|
14 |
-
use_past_error_logs=False,
|
15 |
-
use_action_history=True,
|
16 |
-
use_think_history=True, # gpt-4o config except for this line
|
17 |
-
use_diff=False,
|
18 |
-
html_type="pruned_html",
|
19 |
-
use_screenshot=False,
|
20 |
-
use_som=False,
|
21 |
-
extract_visible_tag=True,
|
22 |
-
extract_clickable_tag=True,
|
23 |
-
extract_coords="False",
|
24 |
-
filter_visible_elements_only=False,
|
25 |
-
),
|
26 |
-
action=dp.ActionFlags(
|
27 |
-
multi_actions=False,
|
28 |
-
action_set="bid",
|
29 |
-
long_description=False,
|
30 |
-
individual_examples=False,
|
31 |
-
),
|
32 |
-
use_plan=False,
|
33 |
-
use_criticise=False,
|
34 |
-
use_thinking=True,
|
35 |
-
use_memory=False,
|
36 |
-
use_concrete_example=True,
|
37 |
-
use_abstract_example=True,
|
38 |
-
use_hints=True,
|
39 |
-
enable_chat=False,
|
40 |
-
max_prompt_tokens=40_000,
|
41 |
-
be_cautious=True,
|
42 |
-
extra_instructions=None,
|
43 |
-
)
|
44 |
-
```
|
45 |
-
|
46 |
-
Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM suports it).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-GPT-4o/assistantbench.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "GenericAgent-GPT-4o",
|
4 |
-
"study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
|
5 |
-
"date_time": "2024-11-28 19:34:58",
|
6 |
-
"benchmark": "AssistantBench",
|
7 |
-
"score": 4.8,
|
8 |
-
"std_err": 2.4,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "Intersection of finished tasks across agents.",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|