meghsn committed on
Commit
d5581cc
·
1 Parent(s): 97d7e59

Result updates

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. app.py +4 -3
  2. results/Bgym-GPT-3.5/README.md +0 -1
  3. results/Bgym-GPT-3.5/config.json +0 -4
  4. results/Bgym-GPT-3.5/miniwob.json +0 -16
  5. results/Bgym-GPT-3.5/webarena.json +0 -16
  6. results/Bgym-GPT-3.5/workarena-l1.json +0 -16
  7. results/Bgym-GPT-3.5/workarena-l2.json +0 -16
  8. results/Bgym-GPT-4o-V/README.md +0 -1
  9. results/Bgym-GPT-4o-V/miniwob.json +0 -16
  10. results/Bgym-GPT-4o-V/webarena.json +0 -16
  11. results/Bgym-GPT-4o-V/workarena-l1.json +0 -16
  12. results/Bgym-GPT-4o-V/workarena-l2.json +0 -16
  13. results/Bgym-GPT-4o-V/workarena-l3.json +0 -16
  14. results/Bgym-GPT-o1-mini/workarena-l3.json +0 -16
  15. results/Bgym-Llama-3-70b/README.md +0 -1
  16. results/Bgym-Llama-3-70b/miniwob.json +0 -16
  17. results/Bgym-Llama-3-70b/webarena.json +0 -16
  18. results/Bgym-Llama-3-70b/workarena-l1.json +0 -16
  19. results/Bgym-Llama-3-70b/workarena-l2.json +0 -16
  20. results/Bgym-Llama-3-70b/workarena-l3.json +0 -16
  21. results/Bgym-Mixtral-8x22b/README.md +0 -1
  22. results/Bgym-Mixtral-8x22b/miniwob.json +0 -16
  23. results/Bgym-Mixtral-8x22b/webarena.json +0 -16
  24. results/Bgym-Mixtral-8x22b/workarena-l1.json +0 -16
  25. results/Bgym-Mixtral-8x22b/workarena-l2.json +0 -16
  26. results/Bgym-Mixtral-8x22b/workarena-l3.json +0 -16
  27. results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/README.md +0 -0
  28. results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/assistantbench.json +1 -1
  29. results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/miniwob.json +1 -1
  30. results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/webarena.json +1 -1
  31. results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/weblinx.json +1 -1
  32. results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l1.json +1 -1
  33. results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l2.json +1 -1
  34. results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l3.json +1 -1
  35. results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/README.md +0 -0
  36. results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/assistantbench.json +1 -1
  37. results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/miniwob.json +1 -1
  38. results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/webarena.json +1 -1
  39. results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/weblinx.json +1 -1
  40. results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l1.json +1 -1
  41. results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l2.json +1 -1
  42. results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l3.json +1 -1
  43. results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/README.md +0 -0
  44. results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/assistantbench.json +1 -1
  45. results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/miniwob.json +1 -1
  46. results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/webarena.json +1 -1
  47. results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/weblinx.json +1 -1
  48. results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l1.json +1 -1
  49. results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l2.json +1 -1
  50. results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l3.json +1 -1
app.py CHANGED
@@ -155,8 +155,9 @@ def create_html_table_benchmark(df, benchmark):
155
  html += '<table>'
156
  html += '<thead><tr>'
157
  for column in df.columns:
158
- if column != "Reproduced_all":
159
- html += f'<th>{sanitize_column_name(column)}</th>'
 
160
  html += '</tr></thead>'
161
  html += '<tbody>'
162
  for _, row in df.iterrows():
@@ -169,7 +170,7 @@ def create_html_table_benchmark(df, benchmark):
169
  summary = sanitize_cell_value(row[column])
170
  details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
171
  html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
172
- elif column == "Reproduced_all":
173
  continue
174
  elif column == "Score":
175
  score_with_std_err = f'{row[column]} ± {row["std_err"]}'
 
155
  html += '<table>'
156
  html += '<thead><tr>'
157
  for column in df.columns:
158
+ if column == "Reproduced_all" or column == "std_err":
159
+ continue
160
+ html += f'<th>{sanitize_column_name(column)}</th>'
161
  html += '</tr></thead>'
162
  html += '<tbody>'
163
  for _, row in df.iterrows():
 
170
  summary = sanitize_cell_value(row[column])
171
  details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
172
  html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
173
+ elif column == "Reproduced_all" or column == "std_err":
174
  continue
175
  elif column == "Score":
176
  score_with_std_err = f'{row[column]} ± {row["std_err"]}'
results/Bgym-GPT-3.5/README.md DELETED
@@ -1 +0,0 @@
1
- ## GPT-3.5 model
 
 
results/Bgym-GPT-3.5/config.json DELETED
@@ -1,4 +0,0 @@
1
- {
2
- "agent_name": "GPT-3.5",
3
- "backend_llm": "GPT-3.5"
4
- }
 
 
 
 
 
results/Bgym-GPT-3.5/miniwob.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-GPT-3.5",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "MiniWoB",
7
- "score": 43.4,
8
- "std_err": 0.1,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-GPT-3.5/webarena.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-GPT-3.5",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "WebArena",
7
- "score": 6.7,
8
- "std_err": 0.2,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-GPT-3.5/workarena-l1.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-GPT-3.5",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "WorkArena-L1",
7
- "score": 6.1,
8
- "std_err": 0.3,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-GPT-3.5/workarena-l2.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-GPT-3.5",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "WorkArena-L2",
7
- "score": 0.0,
8
- "std_err": 0.0,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-GPT-4o-V/README.md DELETED
@@ -1 +0,0 @@
1
- ## GPT-4o-V model
 
 
results/Bgym-GPT-4o-V/miniwob.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-GPT-4o-V",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "MiniWoB",
7
- "score": 72.5,
8
- "std_err": 0.5,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-GPT-4o-V/webarena.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-GPT-4o-V",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "WebArena",
7
- "score": 24.0,
8
- "std_err": 0.4,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-GPT-4o-V/workarena-l1.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-GPT-4o-V",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "WorkArena-L1",
7
- "score": 41.8,
8
- "std_err": 0.4,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-GPT-4o-V/workarena-l2.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-GPT-4o-V",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "WorkArena-L2",
7
- "score": 3.8,
8
- "std_err": 0.6,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-GPT-4o-V/workarena-l3.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-GPT-4o-V",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "WorkArena-L3",
7
- "score": 0.0,
8
- "std_err": 0.0,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-GPT-o1-mini/workarena-l3.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-GPT-o1-mini",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "WorkArena-L3",
7
- "score": 0.0,
8
- "std_err": 0.0,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-Llama-3-70b/README.md DELETED
@@ -1 +0,0 @@
1
- ### Llama-3-70B
 
 
results/Bgym-Llama-3-70b/miniwob.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-Llama-3-70b",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "MiniWoB",
7
- "score": 68.2,
8
- "std_err": 0.7,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-Llama-3-70b/webarena.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-Llama-3-70b",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "WebArena",
7
- "score": 11.0,
8
- "std_err": 0.3,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-Llama-3-70b/workarena-l1.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-Llama-3-70b",
4
- "study_id": "study_id",
5
- "benchmark": "WorkArena-L1",
6
- "score": 17.9,
7
- "std_err": 0.6,
8
- "benchmark_specific": "No",
9
- "benchmark_tuned": "No",
10
- "followed_evaluation_protocol": "Yes",
11
- "reproducible": "Yes",
12
- "comments": "NA",
13
- "original_or_reproduced": "Original",
14
- "date_time": "2021-01-01 12:00:00"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-Llama-3-70b/workarena-l2.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-Llama-3-70b",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "WorkArena-L2",
7
- "score": 0.0,
8
- "std_err": 0.0,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-Llama-3-70b/workarena-l3.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-Llama-3-70b",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "WorkArena-L3",
7
- "score": 0.0,
8
- "std_err": 0.0,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-Mixtral-8x22b/README.md DELETED
@@ -1 +0,0 @@
1
- ## Mixtral 8x22B
 
 
results/Bgym-Mixtral-8x22b/miniwob.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-Mixtral-8x22b",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "MiniWoB",
7
- "score": 62.4,
8
- "std_err": 0.5,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-Mixtral-8x22b/webarena.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-Mixtral-8x22b",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "WebArena",
7
- "score": 12.6,
8
- "std_err": 0.9,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-Mixtral-8x22b/workarena-l1.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-Mixtral-8x22b",
4
- "study_id": "study_id",
5
- "benchmark": "WorkArena-L1",
6
- "score": 12.4,
7
- "std_err": 0.7,
8
- "benchmark_specific": "No",
9
- "benchmark_tuned": "No",
10
- "followed_evaluation_protocol": "Yes",
11
- "reproducible": "Yes",
12
- "comments": "NA",
13
- "original_or_reproduced": "Original",
14
- "date_time": "2021-01-04 12:06:00"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-Mixtral-8x22b/workarena-l2.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-Mixtral-8x22b",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "WorkArena-L2",
7
- "score": 0.0,
8
- "std_err": 0.0,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/Bgym-Mixtral-8x22b/workarena-l3.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "agent_name": "Bgym-Mixtral-8x22b",
4
- "study_id": "study_id",
5
- "date_time": "2021-01-01 12:00:00",
6
- "benchmark": "WorkArena-L3",
7
- "score": 0.0,
8
- "std_err": 0.0,
9
- "benchmark_specific": "No",
10
- "benchmark_tuned": "No",
11
- "followed_evaluation_protocol": "Yes",
12
- "reproducible": "Yes",
13
- "comments": "NA",
14
- "original_or_reproduced": "Original"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/README.md RENAMED
File without changes
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/assistantbench.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-Claude-3.5-Sonnet",
4
  "study_id": "study_id",
5
  "benchmark": "AssistantBench",
6
  "score": 5.2,
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
  "study_id": "study_id",
5
  "benchmark": "AssistantBench",
6
  "score": 5.2,
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/miniwob.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-Claude-3.5-Sonnet",
4
  "study_id": "study_id",
5
  "benchmark": "MiniWoB",
6
  "score": 69.8,
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
  "study_id": "study_id",
5
  "benchmark": "MiniWoB",
6
  "score": 69.8,
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/webarena.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-Claude-3.5-Sonnet",
4
  "study_id": "study_id",
5
  "benchmark": "WebArena",
6
  "score": 36.2,
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
  "study_id": "study_id",
5
  "benchmark": "WebArena",
6
  "score": 36.2,
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/weblinx.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-Claude-3.5-Sonnet",
4
  "study_id": "study_id",
5
  "benchmark": "WebLINX",
6
  "score": 13.7,
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
  "study_id": "study_id",
5
  "benchmark": "WebLINX",
6
  "score": 13.7,
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l1.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-Claude-3.5-Sonnet",
4
  "study_id": "study_id",
5
  "benchmark": "WorkArena-L1",
6
  "score": 56.4,
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
  "study_id": "study_id",
5
  "benchmark": "WorkArena-L1",
6
  "score": 56.4,
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l2.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-Claude-3.5-Sonnet",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l3.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-Claude-3.5-Sonnet",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-Claude-3.5-Sonnet",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/README.md RENAMED
File without changes
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/assistantbench.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "AssistantBench",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "AssistantBench",
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/miniwob.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/webarena.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebArena",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebArena",
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/weblinx.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebLINX",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebLINX",
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l1.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l2.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l3.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-4o-mini",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/README.md RENAMED
File without changes
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/assistantbench.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-GPT-4o",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "AssistantBench",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-4o",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "AssistantBench",
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/miniwob.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-GPT-4o",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-4o",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/webarena.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-GPT-4o",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebArena",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-4o",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebArena",
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/weblinx.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-GPT-4o",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebLINX",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-4o",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebLINX",
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l1.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-GPT-4o",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-4o",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L1",
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l2.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-GPT-4o",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-4o",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L2",
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l3.json RENAMED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "agent_name": "Bgym-GPT-4o",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",
 
1
  [
2
  {
3
+ "agent_name": "GenericAgent-GPT-4o",
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WorkArena-L3",