Result updates
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +4 -3
- results/Bgym-GPT-3.5/README.md +0 -1
- results/Bgym-GPT-3.5/config.json +0 -4
- results/Bgym-GPT-3.5/miniwob.json +0 -16
- results/Bgym-GPT-3.5/webarena.json +0 -16
- results/Bgym-GPT-3.5/workarena-l1.json +0 -16
- results/Bgym-GPT-3.5/workarena-l2.json +0 -16
- results/Bgym-GPT-4o-V/README.md +0 -1
- results/Bgym-GPT-4o-V/miniwob.json +0 -16
- results/Bgym-GPT-4o-V/webarena.json +0 -16
- results/Bgym-GPT-4o-V/workarena-l1.json +0 -16
- results/Bgym-GPT-4o-V/workarena-l2.json +0 -16
- results/Bgym-GPT-4o-V/workarena-l3.json +0 -16
- results/Bgym-GPT-o1-mini/workarena-l3.json +0 -16
- results/Bgym-Llama-3-70b/README.md +0 -1
- results/Bgym-Llama-3-70b/miniwob.json +0 -16
- results/Bgym-Llama-3-70b/webarena.json +0 -16
- results/Bgym-Llama-3-70b/workarena-l1.json +0 -16
- results/Bgym-Llama-3-70b/workarena-l2.json +0 -16
- results/Bgym-Llama-3-70b/workarena-l3.json +0 -16
- results/Bgym-Mixtral-8x22b/README.md +0 -1
- results/Bgym-Mixtral-8x22b/miniwob.json +0 -16
- results/Bgym-Mixtral-8x22b/webarena.json +0 -16
- results/Bgym-Mixtral-8x22b/workarena-l1.json +0 -16
- results/Bgym-Mixtral-8x22b/workarena-l2.json +0 -16
- results/Bgym-Mixtral-8x22b/workarena-l3.json +0 -16
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/README.md +0 -0
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/assistantbench.json +1 -1
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/miniwob.json +1 -1
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/webarena.json +1 -1
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/weblinx.json +1 -1
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l1.json +1 -1
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l2.json +1 -1
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l3.json +1 -1
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/README.md +0 -0
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/assistantbench.json +1 -1
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/miniwob.json +1 -1
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/webarena.json +1 -1
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/weblinx.json +1 -1
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l1.json +1 -1
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l2.json +1 -1
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l3.json +1 -1
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/README.md +0 -0
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/assistantbench.json +1 -1
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/miniwob.json +1 -1
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/webarena.json +1 -1
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/weblinx.json +1 -1
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l1.json +1 -1
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l2.json +1 -1
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l3.json +1 -1
app.py
CHANGED
@@ -155,8 +155,9 @@ def create_html_table_benchmark(df, benchmark):
|
|
155 |
html += '<table>'
|
156 |
html += '<thead><tr>'
|
157 |
for column in df.columns:
|
158 |
-
if column
|
159 |
-
|
|
|
160 |
html += '</tr></thead>'
|
161 |
html += '<tbody>'
|
162 |
for _, row in df.iterrows():
|
@@ -169,7 +170,7 @@ def create_html_table_benchmark(df, benchmark):
|
|
169 |
summary = sanitize_cell_value(row[column])
|
170 |
details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
|
171 |
html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
|
172 |
-
elif column == "Reproduced_all":
|
173 |
continue
|
174 |
elif column == "Score":
|
175 |
score_with_std_err = f'{row[column]} ± {row["std_err"]}'
|
|
|
155 |
html += '<table>'
|
156 |
html += '<thead><tr>'
|
157 |
for column in df.columns:
|
158 |
+
if column == "Reproduced_all" or column == "std_err":
|
159 |
+
continue
|
160 |
+
html += f'<th>{sanitize_column_name(column)}</th>'
|
161 |
html += '</tr></thead>'
|
162 |
html += '<tbody>'
|
163 |
for _, row in df.iterrows():
|
|
|
170 |
summary = sanitize_cell_value(row[column])
|
171 |
details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
|
172 |
html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
|
173 |
+
elif column == "Reproduced_all" or column == "std_err":
|
174 |
continue
|
175 |
elif column == "Score":
|
176 |
score_with_std_err = f'{row[column]} ± {row["std_err"]}'
|
results/Bgym-GPT-3.5/README.md
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
## GPT-3.5 model
|
|
|
|
results/Bgym-GPT-3.5/config.json
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"agent_name": "GPT-3.5",
|
3 |
-
"backend_llm": "GPT-3.5"
|
4 |
-
}
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-3.5/miniwob.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-GPT-3.5",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "MiniWoB",
|
7 |
-
"score": 43.4,
|
8 |
-
"std_err": 0.1,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-3.5/webarena.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-GPT-3.5",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "WebArena",
|
7 |
-
"score": 6.7,
|
8 |
-
"std_err": 0.2,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-3.5/workarena-l1.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-GPT-3.5",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "WorkArena-L1",
|
7 |
-
"score": 6.1,
|
8 |
-
"std_err": 0.3,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-3.5/workarena-l2.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-GPT-3.5",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "WorkArena-L2",
|
7 |
-
"score": 0.0,
|
8 |
-
"std_err": 0.0,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-4o-V/README.md
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
## GPT-4o-V model
|
|
|
|
results/Bgym-GPT-4o-V/miniwob.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-GPT-4o-V",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "MiniWoB",
|
7 |
-
"score": 72.5,
|
8 |
-
"std_err": 0.5,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-4o-V/webarena.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-GPT-4o-V",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "WebArena",
|
7 |
-
"score": 24.0,
|
8 |
-
"std_err": 0.4,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-4o-V/workarena-l1.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-GPT-4o-V",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "WorkArena-L1",
|
7 |
-
"score": 41.8,
|
8 |
-
"std_err": 0.4,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-4o-V/workarena-l2.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-GPT-4o-V",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "WorkArena-L2",
|
7 |
-
"score": 3.8,
|
8 |
-
"std_err": 0.6,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-4o-V/workarena-l3.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-GPT-4o-V",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "WorkArena-L3",
|
7 |
-
"score": 0.0,
|
8 |
-
"std_err": 0.0,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-o1-mini/workarena-l3.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-GPT-o1-mini",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "WorkArena-L3",
|
7 |
-
"score": 0.0,
|
8 |
-
"std_err": 0.0,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Llama-3-70b/README.md
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
### Llama-3-70B
|
|
|
|
results/Bgym-Llama-3-70b/miniwob.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-Llama-3-70b",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "MiniWoB",
|
7 |
-
"score": 68.2,
|
8 |
-
"std_err": 0.7,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Llama-3-70b/webarena.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-Llama-3-70b",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "WebArena",
|
7 |
-
"score": 11.0,
|
8 |
-
"std_err": 0.3,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Llama-3-70b/workarena-l1.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-Llama-3-70b",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"benchmark": "WorkArena-L1",
|
6 |
-
"score": 17.9,
|
7 |
-
"std_err": 0.6,
|
8 |
-
"benchmark_specific": "No",
|
9 |
-
"benchmark_tuned": "No",
|
10 |
-
"followed_evaluation_protocol": "Yes",
|
11 |
-
"reproducible": "Yes",
|
12 |
-
"comments": "NA",
|
13 |
-
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "2021-01-01 12:00:00"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Llama-3-70b/workarena-l2.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-Llama-3-70b",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "WorkArena-L2",
|
7 |
-
"score": 0.0,
|
8 |
-
"std_err": 0.0,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Llama-3-70b/workarena-l3.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-Llama-3-70b",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "WorkArena-L3",
|
7 |
-
"score": 0.0,
|
8 |
-
"std_err": 0.0,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Mixtral-8x22b/README.md
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
## Mixtral 8x22B
|
|
|
|
results/Bgym-Mixtral-8x22b/miniwob.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-Mixtral-8x22b",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "MiniWoB",
|
7 |
-
"score": 62.4,
|
8 |
-
"std_err": 0.5,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Mixtral-8x22b/webarena.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-Mixtral-8x22b",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "WebArena",
|
7 |
-
"score": 12.6,
|
8 |
-
"std_err": 0.9,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Mixtral-8x22b/workarena-l1.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-Mixtral-8x22b",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"benchmark": "WorkArena-L1",
|
6 |
-
"score": 12.4,
|
7 |
-
"std_err": 0.7,
|
8 |
-
"benchmark_specific": "No",
|
9 |
-
"benchmark_tuned": "No",
|
10 |
-
"followed_evaluation_protocol": "Yes",
|
11 |
-
"reproducible": "Yes",
|
12 |
-
"comments": "NA",
|
13 |
-
"original_or_reproduced": "Original",
|
14 |
-
"date_time": "2021-01-04 12:06:00"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Mixtral-8x22b/workarena-l2.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-Mixtral-8x22b",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "WorkArena-L2",
|
7 |
-
"score": 0.0,
|
8 |
-
"std_err": 0.0,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Mixtral-8x22b/workarena-l3.json
DELETED
@@ -1,16 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"agent_name": "Bgym-Mixtral-8x22b",
|
4 |
-
"study_id": "study_id",
|
5 |
-
"date_time": "2021-01-01 12:00:00",
|
6 |
-
"benchmark": "WorkArena-L3",
|
7 |
-
"score": 0.0,
|
8 |
-
"std_err": 0.0,
|
9 |
-
"benchmark_specific": "No",
|
10 |
-
"benchmark_tuned": "No",
|
11 |
-
"followed_evaluation_protocol": "Yes",
|
12 |
-
"reproducible": "Yes",
|
13 |
-
"comments": "NA",
|
14 |
-
"original_or_reproduced": "Original"
|
15 |
-
}
|
16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/README.md
RENAMED
File without changes
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/assistantbench.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"benchmark": "AssistantBench",
|
6 |
"score": 5.2,
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
"study_id": "study_id",
|
5 |
"benchmark": "AssistantBench",
|
6 |
"score": 5.2,
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/miniwob.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 69.8,
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
"study_id": "study_id",
|
5 |
"benchmark": "MiniWoB",
|
6 |
"score": 69.8,
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/webarena.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"benchmark": "WebArena",
|
6 |
"score": 36.2,
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
"study_id": "study_id",
|
5 |
"benchmark": "WebArena",
|
6 |
"score": 36.2,
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/weblinx.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"benchmark": "WebLINX",
|
6 |
"score": 13.7,
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
"study_id": "study_id",
|
5 |
"benchmark": "WebLINX",
|
6 |
"score": 13.7,
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l1.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"benchmark": "WorkArena-L1",
|
6 |
"score": 56.4,
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
"study_id": "study_id",
|
5 |
"benchmark": "WorkArena-L1",
|
6 |
"score": 56.4,
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l2.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l3.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L3",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L3",
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/README.md
RENAMED
File without changes
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/assistantbench.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "AssistantBench",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "AssistantBench",
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/miniwob.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/webarena.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WebArena",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WebArena",
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/weblinx.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WebLINX",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WebLINX",
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l1.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L1",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L1",
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l2.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l3.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L3",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L3",
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/README.md
RENAMED
File without changes
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/assistantbench.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "AssistantBench",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "AssistantBench",
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/miniwob.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/webarena.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WebArena",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WebArena",
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/weblinx.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WebLINX",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WebLINX",
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l1.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L1",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L1",
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l2.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L2",
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l3.json
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
[
|
2 |
{
|
3 |
-
"agent_name": "
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L3",
|
|
|
1 |
[
|
2 |
{
|
3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WorkArena-L3",
|