This view is limited to 50 files because it contains too many changes. See the raw diff here.
Files changed (50) hide show
  1. app.py +28 -82
  2. results/Bgym-GPT-3.5/README.md +1 -0
  3. results/Bgym-GPT-3.5/config.json +4 -0
  4. results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/miniwob.json +4 -4
  5. results/Bgym-GPT-3.5/webarena.json +16 -0
  6. results/Bgym-GPT-3.5/workarena-l1.json +44 -0
  7. results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l2.json +4 -4
  8. results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l3.json +3 -3
  9. results/Bgym-GPT-4o-V/README.md +1 -0
  10. results/Bgym-GPT-4o-V/config.json +4 -0
  11. results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/miniwob.json +4 -4
  12. results/Bgym-GPT-4o-V/webarena.json +16 -0
  13. results/{GenericAgent-GPT-4o → Bgym-GPT-4o-V}/workarena-l1.json +4 -4
  14. results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l2.json +4 -4
  15. results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l3.json +3 -3
  16. results/Bgym-GPT-4o/README.md +1 -0
  17. results/Bgym-GPT-4o/config.json +4 -0
  18. results/Bgym-GPT-4o/miniwob.json +16 -0
  19. results/Bgym-GPT-4o/webarena.json +16 -0
  20. results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o}/workarena-l1.json +4 -4
  21. results/{GenericAgent-Claude-3.5-Sonnet → Bgym-GPT-4o}/workarena-l2.json +4 -4
  22. results/{GenericAgent-GPT-o1-mini → Bgym-GPT-4o}/workarena-l3.json +3 -3
  23. results/Bgym-Llama-3-70b/README.md +1 -0
  24. results/Bgym-Llama-3-70b/config.json +4 -0
  25. results/Bgym-Llama-3-70b/miniwob.json +16 -0
  26. results/Bgym-Llama-3-70b/webarena.json +16 -0
  27. results/Bgym-Llama-3-70b/workarena-l1.json +58 -0
  28. results/Bgym-Llama-3-70b/workarena-l2.json +16 -0
  29. results/{GenericAgent-Claude-3.5-Sonnet → Bgym-Llama-3-70b}/workarena-l3.json +4 -4
  30. results/Bgym-Mixtral-8x22b/README.md +1 -0
  31. results/Bgym-Mixtral-8x22b/config.json +4 -0
  32. results/Bgym-Mixtral-8x22b/miniwob.json +16 -0
  33. results/Bgym-Mixtral-8x22b/webarena.json +16 -0
  34. results/Bgym-Mixtral-8x22b/workarena-l1.json +44 -0
  35. results/Bgym-Mixtral-8x22b/workarena-l2.json +16 -0
  36. results/Bgym-Mixtral-8x22b/workarena-l3.json +16 -0
  37. results/GenericAgent-Claude-3.5-Sonnet/README.md +0 -46
  38. results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json +0 -16
  39. results/GenericAgent-Claude-3.5-Sonnet/miniwob.json +0 -16
  40. results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json +0 -16
  41. results/GenericAgent-Claude-3.5-Sonnet/webarena.json +0 -16
  42. results/GenericAgent-Claude-3.5-Sonnet/weblinx.json +0 -16
  43. results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json +0 -16
  44. results/GenericAgent-GPT-4o-mini/README.md +0 -54
  45. results/GenericAgent-GPT-4o-mini/assistantbench.json +0 -16
  46. results/GenericAgent-GPT-4o-mini/visualwebarena.json +0 -16
  47. results/GenericAgent-GPT-4o-mini/webarena.json +0 -16
  48. results/GenericAgent-GPT-4o-mini/weblinx.json +0 -16
  49. results/GenericAgent-GPT-4o/README.md +0 -46
  50. results/GenericAgent-GPT-4o/assistantbench.json +0 -16
app.py CHANGED
@@ -9,7 +9,6 @@ import plotly.graph_objs as go
9
  from huggingface_hub import HfApi
10
  from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
11
  import streamlit.components.v1 as components
12
- from datetime import datetime
13
 
14
  from urllib.parse import quote
15
  from pathlib import Path
@@ -17,7 +16,7 @@ import re
17
  import html
18
  from typing import Dict, Any
19
 
20
- BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB", "WebLINX", "VisualWebArena", "AssistantBench"]
21
 
22
  def sanitize_agent_name(agent_name):
23
  # Only allow alphanumeric chars, hyphen, underscore
@@ -44,34 +43,12 @@ def sanitize_column_name(col: str) -> str:
44
  return html.escape(str(col))
45
 
46
  def sanitize_cell_value(value: Any) -> str:
 
47
  if isinstance(value, (int, float)):
48
  return str(value)
49
- if isinstance(value, str) and 'Β±' in value:
50
- score, std_err = value.split('Β±')
51
- return f'{score.strip()} <span style="font-size: smaller; color: var(--lighter-color);">Β±{std_err.strip()}</span>'
52
  return html.escape(str(value))
53
 
54
  def create_html_table_main(df):
55
- col1, col2 = st.columns([2,6])
56
- with col1:
57
- sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("WebArena"), key="main_sort_column")
58
- with col2:
59
- sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key="main_sort_order")
60
-
61
- def get_sort_value(row):
62
- if row == "-":
63
- return float('-inf')
64
- else:
65
- try:
66
- return float(row)
67
- except ValueError:
68
- return row
69
-
70
- # Sort dataframe
71
- if sort_order == "Ascending":
72
- df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
73
- else:
74
- df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
75
  html = '''
76
  <style>
77
  table {
@@ -110,28 +87,7 @@ def create_html_table_main(df):
110
  html += '</div>'
111
  return html
112
 
113
- def create_html_table_benchmark(df, benchmark):
114
- col1, col2 = st.columns([2,6])
115
- with col1:
116
- sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("Score"), key=f"benchmark_sort_column_{benchmark}")
117
- with col2:
118
- sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key=f"benchmark_sort_order_{benchmark}")
119
-
120
- def get_sort_value(row):
121
- if row == "-":
122
- return float('-inf')
123
- else:
124
- try:
125
- return float(row)
126
- except ValueError:
127
- return row
128
-
129
- # Sort dataframe
130
- if sort_order == "Ascending":
131
- df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
132
- else:
133
- df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
134
-
135
  html = '''
136
  <style>
137
  table {
@@ -155,9 +111,8 @@ def create_html_table_benchmark(df, benchmark):
155
  html += '<table>'
156
  html += '<thead><tr>'
157
  for column in df.columns:
158
- if column == "Reproduced_all" or column == "std_err":
159
- continue
160
- html += f'<th>{sanitize_column_name(column)}</th>'
161
  html += '</tr></thead>'
162
  html += '<tbody>'
163
  for _, row in df.iterrows():
@@ -170,11 +125,8 @@ def create_html_table_benchmark(df, benchmark):
170
  summary = sanitize_cell_value(row[column])
171
  details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
172
  html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
173
- elif column == "Reproduced_all" or column == "std_err":
174
  continue
175
- elif column == "Score":
176
- score_with_std_err = f'{row[column]} Β± {row["std_err"]}'
177
- html += f'<td>{sanitize_cell_value(score_with_std_err)}</td>'
178
  else:
179
  html += f'<td>{sanitize_cell_value(row[column])}</td>'
180
  html += '</tr>'
@@ -209,19 +161,6 @@ def check_sanity(agent):
209
 
210
  def main():
211
  st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
212
- st.markdown("""
213
- <style>
214
- :root {
215
- --lighter-color: #888; /* Default for light theme */
216
- }
217
- @media (prefers-color-scheme: dark) {
218
- :root {
219
- --lighter-color: #ccc; /* Default for dark theme */
220
- }
221
- }
222
- </style>
223
- """, unsafe_allow_html=True)
224
-
225
  st.markdown("""
226
  <head>
227
  <meta http-equiv="Content-Security-Policy"
@@ -244,10 +183,7 @@ def main():
244
  continue
245
  agent_results = []
246
  for benchmark in BENCHMARKS:
247
- file_path = safe_path_join(agent, f"{benchmark.lower()}.json")
248
- if not file_path.is_file():
249
- continue
250
- with open(file_path) as f:
251
  agent_results.extend(json.load(f))
252
  all_results[agent] = agent_results
253
 
@@ -281,9 +217,11 @@ def main():
281
  if dfs_to_concat:
282
  df = pd.concat(dfs_to_concat, ignore_index=True)
283
 
284
- for benchmark in BENCHMARKS:
285
- df[benchmark] = df[benchmark].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
286
- df[benchmark] = df[benchmark].astype(str)
 
 
287
  # Add a search bar
288
  search_query = st.text_input("Search agents", "", key="search_main")
289
 
@@ -302,6 +240,14 @@ def main():
302
  return ""
303
 
304
  df['Agent'] = df['Agent'].apply(make_hyperlink)
 
 
 
 
 
 
 
 
305
  html_table = create_html_table_main(df)
306
  st.markdown(html_table, unsafe_allow_html=True)
307
 
@@ -449,21 +395,18 @@ MIT
449
  for value in values:
450
  if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original":
451
  result_dict["Score"] = value["score"]
452
- result_dict["std_err"] = value["std_err"]
453
  result_dict["Benchmark Specific"] = value["benchmark_specific"]
454
  result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
455
  result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
456
  result_dict["Reproducible"] = value["reproducible"]
457
  result_dict["Comments"] = value["comments"]
458
  result_dict["Study ID"] = value["study_id"]
459
- value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
460
  result_dict["Date"] = value["date_time"]
461
  result_dict["Reproduced"] = []
462
  result_dict["Reproduced_all"] = []
463
  flag = 1
464
  if not flag:
465
  result_dict["Score"] = "-"
466
- result_dict["std_err"] = "-"
467
  result_dict["Benchmark Specific"] = "-"
468
  result_dict["Benchmark Tuned"] = "-"
469
  result_dict["Followed Evaluation Protocol"] = "-"
@@ -475,7 +418,6 @@ MIT
475
  result_dict["Reproduced_all"] = []
476
  if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Reproduced":
477
  result_dict["Reproduced"].append(value["score"])
478
- value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
479
  result_dict["Reproduced_all"].append(", ".join([str(value["score"]), str(value["date_time"])]))
480
  if result_dict["Reproduced"]:
481
  result_dict["Reproduced"] = str(min(result_dict["Reproduced"])) + " - " + str(max(result_dict["Reproduced"]))
@@ -493,10 +435,14 @@ MIT
493
  # Concatenate the DataFrames
494
  if dfs_to_concat:
495
  df_ = pd.concat(dfs_to_concat, ignore_index=True)
496
- df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
497
- df_['std_err'] = df_['std_err'].apply(lambda x: f"{x:.1f}" if x != "-" else "-")
498
- df_['Score'] = df_['Score'].astype(str)
499
- html_table = create_html_table_benchmark(df_, benchmark)
 
 
 
 
500
  st.markdown(html_table, unsafe_allow_html=True)
501
 
502
 
 
9
  from huggingface_hub import HfApi
10
  from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
11
  import streamlit.components.v1 as components
 
12
 
13
  from urllib.parse import quote
14
  from pathlib import Path
 
16
  import html
17
  from typing import Dict, Any
18
 
19
+ BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB",]
20
 
21
  def sanitize_agent_name(agent_name):
22
  # Only allow alphanumeric chars, hyphen, underscore
 
43
  return html.escape(str(col))
44
 
45
  def sanitize_cell_value(value: Any) -> str:
46
+ """Sanitize cell values for HTML display"""
47
  if isinstance(value, (int, float)):
48
  return str(value)
 
 
 
49
  return html.escape(str(value))
50
 
51
  def create_html_table_main(df):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  html = '''
53
  <style>
54
  table {
 
87
  html += '</div>'
88
  return html
89
 
90
+ def create_html_table_benchmark(df):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  html = '''
92
  <style>
93
  table {
 
111
  html += '<table>'
112
  html += '<thead><tr>'
113
  for column in df.columns:
114
+ if column != "Reproduced_all":
115
+ html += f'<th>{sanitize_column_name(column)}</th>'
 
116
  html += '</tr></thead>'
117
  html += '<tbody>'
118
  for _, row in df.iterrows():
 
125
  summary = sanitize_cell_value(row[column])
126
  details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
127
  html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
128
+ elif column == "Reproduced_all":
129
  continue
 
 
 
130
  else:
131
  html += f'<td>{sanitize_cell_value(row[column])}</td>'
132
  html += '</tr>'
 
161
 
162
  def main():
163
  st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  st.markdown("""
165
  <head>
166
  <meta http-equiv="Content-Security-Policy"
 
183
  continue
184
  agent_results = []
185
  for benchmark in BENCHMARKS:
186
+ with open(f"results/{agent}/{benchmark.lower()}.json") as f:
 
 
 
187
  agent_results.extend(json.load(f))
188
  all_results[agent] = agent_results
189
 
 
217
  if dfs_to_concat:
218
  df = pd.concat(dfs_to_concat, ignore_index=True)
219
 
220
+ # df['Average'] = sum(df[column] for column in BENCHMARKS)/len(BENCHMARKS)
221
+ # df['Average'] = df['Average'].round(2)
222
+ # Sort values
223
+ df = df.sort_values(by='WebArena', ascending=False)
224
+
225
  # Add a search bar
226
  search_query = st.text_input("Search agents", "", key="search_main")
227
 
 
240
  return ""
241
 
242
  df['Agent'] = df['Agent'].apply(make_hyperlink)
243
+ # st.dataframe(
244
+ # df[['Agent'] + BENCHMARKS],
245
+ # use_container_width=True,
246
+ # column_config={benchmark: {'alignment': 'center'} for benchmark in BENCHMARKS},
247
+ # hide_index=True,
248
+ # # height=int(len(df) * 36.2),
249
+ # )
250
+ # st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
251
  html_table = create_html_table_main(df)
252
  st.markdown(html_table, unsafe_allow_html=True)
253
 
 
395
  for value in values:
396
  if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original":
397
  result_dict["Score"] = value["score"]
 
398
  result_dict["Benchmark Specific"] = value["benchmark_specific"]
399
  result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
400
  result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
401
  result_dict["Reproducible"] = value["reproducible"]
402
  result_dict["Comments"] = value["comments"]
403
  result_dict["Study ID"] = value["study_id"]
 
404
  result_dict["Date"] = value["date_time"]
405
  result_dict["Reproduced"] = []
406
  result_dict["Reproduced_all"] = []
407
  flag = 1
408
  if not flag:
409
  result_dict["Score"] = "-"
 
410
  result_dict["Benchmark Specific"] = "-"
411
  result_dict["Benchmark Tuned"] = "-"
412
  result_dict["Followed Evaluation Protocol"] = "-"
 
418
  result_dict["Reproduced_all"] = []
419
  if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Reproduced":
420
  result_dict["Reproduced"].append(value["score"])
 
421
  result_dict["Reproduced_all"].append(", ".join([str(value["score"]), str(value["date_time"])]))
422
  if result_dict["Reproduced"]:
423
  result_dict["Reproduced"] = str(min(result_dict["Reproduced"])) + " - " + str(max(result_dict["Reproduced"]))
 
435
  # Concatenate the DataFrames
436
  if dfs_to_concat:
437
  df_ = pd.concat(dfs_to_concat, ignore_index=True)
438
+ # st.markdown(f"<h2 id='{benchmark.lower()}'>{benchmark}</h2>", unsafe_allow_html=True)
439
+ # st.dataframe(
440
+ # df_,
441
+ # use_container_width=True,
442
+ # column_config={benchmark: {'alignment': 'center'}},
443
+ # hide_index=True,
444
+ # )
445
+ html_table = create_html_table_benchmark(df_)
446
  st.markdown(html_table, unsafe_allow_html=True)
447
 
448
 
results/Bgym-GPT-3.5/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ ## GPT-3.5 model
results/Bgym-GPT-3.5/config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "agent_name": "GPT-3.5",
3
+ "backend_llm": "GPT-3.5"
4
+ }
results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/miniwob.json RENAMED
@@ -1,11 +1,11 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-GPT-4o",
4
- "study_id": "2024-10-25_06-08-16",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
- "score": 63.8,
8
- "std_err": 1.9,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
1
  [
2
  {
3
+ "agent_name": "Bgym-GPT-3.5",
4
+ "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
+ "score": 43.4,
8
+ "std_err": 0.1,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-3.5/webarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-3.5",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WebArena",
7
+ "score": 6.7,
8
+ "std_err": 0.2,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-3.5/workarena-l1.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-3.5",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena-L1",
7
+ "score": 6.1,
8
+ "std_err": 0.3,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ },
16
+ {
17
+ "agent_name": "Bgym-GPT-3.5",
18
+ "study_id": "study_id",
19
+ "benchmark": "WorkArena-L1",
20
+ "score": 5.7,
21
+ "std_err": 0.3,
22
+ "benchmark_specific": "No",
23
+ "benchmark_tuned": "No",
24
+ "followed_evaluation_protocol": "Yes",
25
+ "reproducible": "Yes",
26
+ "comments": "NA",
27
+ "original_or_reproduced": "Reproduced",
28
+ "date_time": "2021-01-04 12:06:00"
29
+ },
30
+ {
31
+ "benchmark": "WorkArena-L1",
32
+ "agent_name": "Bgym-GPT-3.5",
33
+ "study_id": "study_id",
34
+ "score": 5.1,
35
+ "std_err": 0.3,
36
+ "benchmark_specific": "No",
37
+ "benchmark_tuned": "No",
38
+ "followed_evaluation_protocol": "Yes",
39
+ "reproducible": "Yes",
40
+ "comments": "NA",
41
+ "original_or_reproduced": "Reproduced",
42
+ "date_time": "2021-01-04 12:06:00"
43
+ }
44
+ ]
results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l2.json RENAMED
@@ -1,11 +1,11 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-GPT-4o",
4
- "study_id": "2024-10-23_17-10-46",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
- "score": 8.5,
8
- "std_err": 1.8,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
1
  [
2
  {
3
+ "agent_name": "Bgym-GPT-3.5",
4
+ "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
+ "score": 0.0,
8
+ "std_err": 0.0,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l3.json RENAMED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-GPT-4o",
4
- "study_id": "-",
5
- "date_time": "2024-10-24 23:03:30",
6
  "benchmark": "WorkArena-L3",
7
  "score": 0.0,
8
  "std_err": 0.0,
 
1
  [
2
  {
3
+ "agent_name": "Bgym-GPT-3.5",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
7
  "score": 0.0,
8
  "std_err": 0.0,
results/Bgym-GPT-4o-V/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ ## GPT-4o-V model
results/Bgym-GPT-4o-V/config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "agent_name": "GPT-4o-V",
3
+ "backend_llm": "GPT-4o-V"
4
+ }
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/miniwob.json RENAMED
@@ -1,11 +1,11 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "2024-10-25_06-08-16",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
- "score": 56.6,
8
- "std_err": 2.0,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
1
  [
2
  {
3
+ "agent_name": "Bgym-GPT-4o-V",
4
+ "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
+ "score": 72.5,
8
+ "std_err": 0.5,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-4o-V/webarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o-V",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WebArena",
7
+ "score": 24.0,
8
+ "std_err": 0.4,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/{GenericAgent-GPT-4o → Bgym-GPT-4o-V}/workarena-l1.json RENAMED
@@ -1,11 +1,11 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-GPT-4o",
4
- "study_id": "2024-10-23_14-17-40",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
7
- "score": 45.5,
8
- "std_err": 2.7,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
1
  [
2
  {
3
+ "agent_name": "Bgym-GPT-4o-V",
4
+ "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
7
+ "score": 41.8,
8
+ "std_err": 0.4,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l2.json RENAMED
@@ -1,11 +1,11 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "2024-10-23_17-10-46",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
- "score": 1.3,
8
- "std_err": 0.7,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
1
  [
2
  {
3
+ "agent_name": "Bgym-GPT-4o-V",
4
+ "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
+ "score": 3.8,
8
+ "std_err": 0.6,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l3.json RENAMED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "-",
5
- "date_time": "2024-10-24 23:03:30",
6
  "benchmark": "WorkArena-L3",
7
  "score": 0.0,
8
  "std_err": 0.0,
 
1
  [
2
  {
3
+ "agent_name": "Bgym-GPT-4o-V",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
7
  "score": 0.0,
8
  "std_err": 0.0,
results/Bgym-GPT-4o/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ ## GPT-4o model
results/Bgym-GPT-4o/config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "agent_name": "GPT-4o",
3
+ "backend_llm": "GPT-4o"
4
+ }
results/Bgym-GPT-4o/miniwob.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "MiniWoB",
7
+ "score": 71.3,
8
+ "std_err": 0.5,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-4o/webarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WebArena",
7
+ "score": 23.5,
8
+ "std_err": 0.4,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o}/workarena-l1.json RENAMED
@@ -1,11 +1,11 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "2024-10-23_14-17-40",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
7
- "score": 27,
8
- "std_err": 2.4,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
1
  [
2
  {
3
+ "agent_name": "Bgym-GPT-4o",
4
+ "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
7
+ "score": 42.7,
8
+ "std_err": 0.4,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/{GenericAgent-Claude-3.5-Sonnet → Bgym-GPT-4o}/workarena-l2.json RENAMED
@@ -1,11 +1,11 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "2024-10-23_17-10-46",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
- "score": 39.1,
8
- "std_err": 3.2,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
1
  [
2
  {
3
+ "agent_name": "Bgym-GPT-4o",
4
+ "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
7
+ "score": 3.0,
8
+ "std_err": 0.6,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/{GenericAgent-GPT-o1-mini → Bgym-GPT-4o}/workarena-l3.json RENAMED
@@ -1,8 +1,8 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-GPT-o1-mini",
4
- "study_id": "-",
5
- "date_time": "2024-10-24 23:03:30",
6
  "benchmark": "WorkArena-L3",
7
  "score": 0.0,
8
  "std_err": 0.0,
 
1
  [
2
  {
3
+ "agent_name": "Bgym-GPT-4o",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
7
  "score": 0.0,
8
  "std_err": 0.0,
results/Bgym-Llama-3-70b/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ ### Llama-3-70B
results/Bgym-Llama-3-70b/config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "agent_name": "Llama-3-70B",
3
+ "backend_llm": "Llama-3-70B"
4
+ }
results/Bgym-Llama-3-70b/miniwob.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3-70b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "MiniWoB",
7
+ "score": 68.2,
8
+ "std_err": 0.7,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Llama-3-70b/webarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3-70b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WebArena",
7
+ "score": 11.0,
8
+ "std_err": 0.3,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Llama-3-70b/workarena-l1.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3-70b",
4
+ "study_id": "study_id",
5
+ "benchmark": "WorkArena-L1",
6
+ "score": 17.9,
7
+ "std_err": 0.6,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ },
16
+ {
17
+ "agent_name": "Bgym-Llama-3-70b",
18
+ "study_id": "study_id",
19
+ "benchmark": "WorkArena-L1",
20
+ "score": 15.9,
21
+ "std_err": 0.6,
22
+ "benchmark_specific": "No",
23
+ "benchmark_tuned": "No",
24
+ "followed_evaluation_protocol": "Yes",
25
+ "reproducible": "Yes",
26
+ "comments": "NA",
27
+ "original_or_reproduced": "Reproduced",
28
+ "date_time": "2021-01-04 12:06:00"
29
+ },
30
+ {
31
+ "agent_name": "Bgym-Llama-3-70b",
32
+ "study_id": "study_id",
33
+ "benchmark": "WorkArena-L1",
34
+ "score": 19.9,
35
+ "std_err": 0.6,
36
+ "benchmark_specific": "No",
37
+ "benchmark_tuned": "No",
38
+ "followed_evaluation_protocol": "Yes",
39
+ "reproducible": "Yes",
40
+ "comments": "NA",
41
+ "original_or_reproduced": "Reproduced",
42
+ "date_time": "2021-01-05 2:07:00"
43
+ },
44
+ {
45
+ "agent_name": "Bgym-Llama-3-70b",
46
+ "study_id": "study_id",
47
+ "benchmark": "WorkArena-L1",
48
+ "score": 17.9,
49
+ "std_err": 0.6,
50
+ "benchmark_specific": "No",
51
+ "benchmark_tuned": "No",
52
+ "followed_evaluation_protocol": "Yes",
53
+ "reproducible": "Yes",
54
+ "comments": "NA",
55
+ "original_or_reproduced": "Reproduced",
56
+ "date_time": "2021-01-12 12:00:00"
57
+ }
58
+ ]
results/Bgym-Llama-3-70b/workarena-l2.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3-70b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena-L2",
7
+ "score": 0.0,
8
+ "std_err": 0.0,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/{GenericAgent-Claude-3.5-Sonnet → Bgym-Llama-3-70b}/workarena-l3.json RENAMED
@@ -1,11 +1,11 @@
1
  [
2
  {
3
- "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "2024-10-24_18-06-57",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
7
- "score": 0.4,
8
- "std_err": 0.4,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
1
  [
2
  {
3
+ "agent_name": "Bgym-Llama-3-70b",
4
+ "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
7
+ "score": 0.0,
8
+ "std_err": 0.0,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/Bgym-Mixtral-8x22b/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ ## Mixtral 8x22B
results/Bgym-Mixtral-8x22b/config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "agent_name": "Mixtral-8x22B",
3
+ "backend_llm": "Mixtral-8x22B"
4
+ }
results/Bgym-Mixtral-8x22b/miniwob.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Mixtral-8x22b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "MiniWoB",
7
+ "score": 62.4,
8
+ "std_err": 0.5,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Mixtral-8x22b/webarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Mixtral-8x22b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WebArena",
7
+ "score": 12.6,
8
+ "std_err": 0.9,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Mixtral-8x22b/workarena-l1.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Mixtral-8x22b",
4
+ "study_id": "study_id",
5
+ "benchmark": "WorkArena-L1",
6
+ "score": 12.4,
7
+ "std_err": 0.7,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-04 12:06:00"
15
+ },
16
+ {
17
+ "agent_name": "Bgym-Mixtral-8x22b",
18
+ "study_id": "study_id",
19
+ "benchmark": "WorkArena-L1",
20
+ "score": 11.4,
21
+ "std_err": 0.7,
22
+ "benchmark_specific": "No",
23
+ "benchmark_tuned": "No",
24
+ "followed_evaluation_protocol": "Yes",
25
+ "reproducible": "Yes",
26
+ "comments": "NA",
27
+ "original_or_reproduced": "Reproduced",
28
+ "date_time": "2021-01-04 12:06:00"
29
+ },
30
+ {
31
+ "agent_name": "Bgym-Mixtral-8x22b",
32
+ "study_id": "study_id",
33
+ "benchmark": "WorkArena-L1",
34
+ "score": 13.4,
35
+ "std_err": 0.7,
36
+ "benchmark_specific": "No",
37
+ "benchmark_tuned": "No",
38
+ "followed_evaluation_protocol": "Yes",
39
+ "reproducible": "Yes",
40
+ "comments": "NA",
41
+ "original_or_reproduced": "Reproduced",
42
+ "date_time": "2021-01-04 12:06:00"
43
+ }
44
+ ]
results/Bgym-Mixtral-8x22b/workarena-l2.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Mixtral-8x22b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena-L2",
7
+ "score": 0.0,
8
+ "std_err": 0.0,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Mixtral-8x22b/workarena-l3.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Mixtral-8x22b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena-L3",
7
+ "score": 0.0,
8
+ "std_err": 0.0,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/GenericAgent-Claude-3.5-Sonnet/README.md DELETED
@@ -1,46 +0,0 @@
1
- ### GenericAgent-Claude-3.5-Sonnet
2
-
3
- This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
4
-
5
- It uses Claude-3.5-sonnet as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
6
- ```python
7
- BASE_FLAGS = GenericPromptFlags(
8
- obs=dp.ObsFlags(
9
- use_html=False,
10
- use_ax_tree=True,
11
- use_focused_element=True,
12
- use_error_logs=True,
13
- use_history=True,
14
- use_past_error_logs=False,
15
- use_action_history=True,
16
- use_think_history=True, # gpt-4o config except for this line
17
- use_diff=False,
18
- html_type="pruned_html",
19
- use_screenshot=False,
20
- use_som=False,
21
- extract_visible_tag=True,
22
- extract_clickable_tag=True,
23
- extract_coords="False",
24
- filter_visible_elements_only=False,
25
- ),
26
- action=dp.ActionFlags(
27
- multi_actions=False,
28
- action_set="bid",
29
- long_description=False,
30
- individual_examples=False,
31
- ),
32
- use_plan=False,
33
- use_criticise=False,
34
- use_thinking=True,
35
- use_memory=False,
36
- use_concrete_example=True,
37
- use_abstract_example=True,
38
- use_hints=True,
39
- enable_chat=False,
40
- max_prompt_tokens=40_000,
41
- be_cautious=True,
42
- extra_instructions=None,
43
- )
44
- ```
45
-
46
- Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
5
- "benchmark": "AssistantBench",
6
- "score": 5.2,
7
- "std_err": 1.5,
8
- "benchmark_specific": "No",
9
- "benchmark_tuned": "No",
10
- "followed_evaluation_protocol": "Yes",
11
- "reproducible": "Yes",
12
- "comments": "Intersection of finished tasks across agents.",
13
- "original_or_reproduced": "Original",
14
- "date_time": "2024-11-28 19:34:58"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/GenericAgent-Claude-3.5-Sonnet/miniwob.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "2024-10-25_06-08-16",
5
- "benchmark": "MiniWoB",
6
- "score": 69.8,
7
- "std_err": 1.8,
8
- "benchmark_specific": "No",
9
- "benchmark_tuned": "No",
10
- "followed_evaluation_protocol": "Yes",
11
- "reproducible": "Yes",
12
- "comments": "NA",
13
- "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "22f0611d-aeea-4ee9-a533-b45442b5e080",
5
- "benchmark": "VisualWebArena",
6
- "score": 21.0,
7
- "std_err": 1.3,
8
- "benchmark_specific": "No",
9
- "benchmark_tuned": "No",
10
- "followed_evaluation_protocol": "Yes",
11
- "reproducible": "Yes",
12
- "comments": "NA",
13
- "original_or_reproduced": "Original",
14
- "date_time": "2024-12-02 09:11:35"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/GenericAgent-Claude-3.5-Sonnet/webarena.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae",
5
- "benchmark": "WebArena",
6
- "score": 36.2,
7
- "std_err": 1.7,
8
- "benchmark_specific": "No",
9
- "benchmark_tuned": "No",
10
- "followed_evaluation_protocol": "Yes",
11
- "reproducible": "Yes",
12
- "comments": "NA",
13
- "original_or_reproduced": "Original",
14
- "date_time": "2024-11-29 22:37:46"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/GenericAgent-Claude-3.5-Sonnet/weblinx.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
5
- "benchmark": "WebLINX",
6
- "score": 13.7,
7
- "std_err": 0.6,
8
- "benchmark_specific": "No",
9
- "benchmark_tuned": "No",
10
- "followed_evaluation_protocol": "Yes",
11
- "reproducible": "Yes",
12
- "comments": "NA",
13
- "original_or_reproduced": "Original",
14
- "date_time": "2024-11-07 21:42:30"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
- "study_id": "2024-10-23_14-17-40",
5
- "benchmark": "WorkArena-L1",
6
- "score": 56.4,
7
- "std_err": 2.7,
8
- "benchmark_specific": "No",
9
- "benchmark_tuned": "No",
10
- "followed_evaluation_protocol": "Yes",
11
- "reproducible": "Yes",
12
- "comments": "NA",
13
- "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/GenericAgent-GPT-4o-mini/README.md DELETED
@@ -1,54 +0,0 @@
1
- ### GenericAgent-GPT-4o-mini
2
-
3
- This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
4
-
5
- It uses GPT-4o-mini as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
6
- ```python
7
- BASE_FLAGS = GenericPromptFlags(
8
- obs=dp.ObsFlags(
9
- use_html=False,
10
- use_ax_tree=True,
11
- use_focused_element=True,
12
- use_error_logs=True,
13
- use_history=True,
14
- use_past_error_logs=False,
15
- use_action_history=True,
16
- use_think_history=True, # gpt-4o config except for this line
17
- use_diff=False,
18
- html_type="pruned_html",
19
- use_screenshot=False,
20
- use_som=False,
21
- extract_visible_tag=True,
22
- extract_clickable_tag=True,
23
- extract_coords="False",
24
- filter_visible_elements_only=False,
25
- ),
26
- action=dp.ActionFlags(
27
- multi_actions=False,
28
- action_set="bid",
29
- long_description=False,
30
- individual_examples=False,
31
- ),
32
- use_plan=False,
33
- use_criticise=False,
34
- use_thinking=True,
35
- use_memory=False,
36
- use_concrete_example=True,
37
- use_abstract_example=True,
38
- use_hints=True,
39
- enable_chat=False,
40
- max_prompt_tokens=40_000,
41
- be_cautious=True,
42
- extra_instructions=None,
43
- )
44
- ```
45
- © Hugging Face
46
- TOS
47
- Privacy
48
- About
49
- Jobs
50
- Models
51
- Datasets
52
- Spaces
53
- Pricing
54
- Docs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/GenericAgent-GPT-4o-mini/assistantbench.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
5
- "date_time": "2024-11-28 19:34:58",
6
- "benchmark": "AssistantBench",
7
- "score": 2.1,
8
- "std_err": 1.0,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "Intersection of finished tasks across agents.",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/GenericAgent-GPT-4o-mini/visualwebarena.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "8d8642d3-757a-4346-ba45-01398f85b1f4",
5
- "date_time": "2024-12-02 02:54:33",
6
- "benchmark": "VisualWebArena",
7
- "score": 16.9,
8
- "std_err": 1.2,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/GenericAgent-GPT-4o-mini/webarena.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "c6bdeb87-9879-4c06-aa70-00d895001156",
5
- "date_time": "2024-11-29 19:25:49",
6
- "benchmark": "WebArena",
7
- "score": 17.4,
8
- "std_err": 1.3,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/GenericAgent-GPT-4o-mini/weblinx.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "GenericAgent-GPT-4o-mini",
4
- "study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
5
- "date_time": "2024-11-07 21:42:30",
6
- "benchmark": "WebLINX",
7
- "score": 11.6,
8
- "std_err": 0.6,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/GenericAgent-GPT-4o/README.md DELETED
@@ -1,46 +0,0 @@
1
- ### GenericAgent-GPT-4o
2
-
3
- This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
4
-
5
- It uses GPT-4o as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
6
- ```python
7
- BASE_FLAGS = GenericPromptFlags(
8
- obs=dp.ObsFlags(
9
- use_html=False,
10
- use_ax_tree=True,
11
- use_focused_element=True,
12
- use_error_logs=True,
13
- use_history=True,
14
- use_past_error_logs=False,
15
- use_action_history=True,
16
- use_think_history=True, # gpt-4o config except for this line
17
- use_diff=False,
18
- html_type="pruned_html",
19
- use_screenshot=False,
20
- use_som=False,
21
- extract_visible_tag=True,
22
- extract_clickable_tag=True,
23
- extract_coords="False",
24
- filter_visible_elements_only=False,
25
- ),
26
- action=dp.ActionFlags(
27
- multi_actions=False,
28
- action_set="bid",
29
- long_description=False,
30
- individual_examples=False,
31
- ),
32
- use_plan=False,
33
- use_criticise=False,
34
- use_thinking=True,
35
- use_memory=False,
36
- use_concrete_example=True,
37
- use_abstract_example=True,
38
- use_hints=True,
39
- enable_chat=False,
40
- max_prompt_tokens=40_000,
41
- be_cautious=True,
42
- extra_instructions=None,
43
- )
44
- ```
45
-
46
- Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/GenericAgent-GPT-4o/assistantbench.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "GenericAgent-GPT-4o",
4
- "study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
5
- "date_time": "2024-11-28 19:34:58",
6
- "benchmark": "AssistantBench",
7
- "score": 4.8,
8
- "std_err": 2.4,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "Intersection of finished tasks across agents.",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]