saattrupdan
committed on
Commit
·
76e4363
1
Parent(s):
3e57038
feat: Change layout, fix task order, fix colours for models, fix range
Browse files
- .gitignore +1 -0
- app.py +23 -17
.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
.venv
|
|
|
|
1 |
.venv
|
2 |
+
__pycache__
|
app.py
CHANGED
@@ -159,14 +159,15 @@ def main() -> None:
|
|
159 |
"of different language models on different tasks. It is based on the "
|
160 |
"generative results from the [ScandEval benchmark](https://scandeval.com)."
|
161 |
)
|
162 |
-
with gr.Row():
|
163 |
-
with gr.Column():
|
164 |
language_names_dropdown = gr.Dropdown(
|
165 |
choices=all_languages,
|
166 |
multiselect=True,
|
167 |
label="Languages",
|
168 |
value=["Danish"],
|
169 |
interactive=True,
|
|
|
170 |
)
|
171 |
model_ids_dropdown = gr.Dropdown(
|
172 |
choices=danish_models,
|
@@ -174,17 +175,15 @@ def main() -> None:
|
|
174 |
label="Models",
|
175 |
value=["gpt-4-0613", "mistralai/Mistral-7B-v0.1"],
|
176 |
interactive=True,
|
|
|
177 |
)
|
178 |
use_win_ratio_checkbox = gr.Checkbox(
|
179 |
label="Compare models with win ratios (as opposed to raw scores)",
|
180 |
value=True,
|
181 |
interactive=True,
|
|
|
182 |
)
|
183 |
-
gr.Markdown(
|
184 |
-
"<center>Made with ❤️ by the <a href=\"https://alexandra.dk\">"
|
185 |
-
"Alexandra Institute</a>.</center>"
|
186 |
-
)
|
187 |
-
with gr.Column():
|
188 |
plot = gr.Plot(
|
189 |
value=produce_radial_plot(
|
190 |
model_ids_dropdown.value,
|
@@ -193,6 +192,11 @@ def main() -> None:
|
|
193 |
results_dfs=results_dfs,
|
194 |
),
|
195 |
)
|
|
|
|
|
|
|
|
|
|
|
196 |
|
197 |
language_names_dropdown.change(
|
198 |
fn=partial(update_model_ids_dropdown, results_dfs=results_dfs),
|
@@ -371,7 +375,7 @@ def produce_radial_plot(
|
|
371 |
if model_id not in results_dfs_filtered[language].index:
|
372 |
continue
|
373 |
score = results_dfs_filtered[language].loc[model_id][task]
|
374 |
-
win_ratio = np.mean([
|
375 |
score >= other_score
|
376 |
for other_score in results_dfs_filtered[language][task].dropna()
|
377 |
])
|
@@ -383,22 +387,21 @@ def produce_radial_plot(
|
|
383 |
result_list.append(np.mean(scores))
|
384 |
results.append(result_list)
|
385 |
|
386 |
-
# Sort the results to avoid misleading radial plots
|
387 |
-
model_idx_with_highest_variance = np.argmax(
|
388 |
-
[np.std(result_list) for result_list in results]
|
389 |
-
)
|
390 |
-
sorted_idxs = np.argsort(results[model_idx_with_highest_variance])
|
391 |
-
results = [np.asarray(result_list)[sorted_idxs] for result_list in results]
|
392 |
-
tasks = np.asarray(tasks)[sorted_idxs]
|
393 |
-
|
394 |
# Add the results to a plotly figure
|
395 |
fig = go.Figure()
|
396 |
for model_id, result_list in zip(model_ids, results):
|
|
|
|
|
|
|
|
|
|
|
|
|
397 |
fig.add_trace(go.Scatterpolar(
|
398 |
r=result_list,
|
399 |
theta=[task.name for task in tasks],
|
400 |
fill='toself',
|
401 |
name=model_id,
|
|
|
402 |
))
|
403 |
|
404 |
languages_str = ""
|
@@ -414,7 +417,10 @@ def produce_radial_plot(
|
|
414 |
|
415 |
# Builds the radial plot from the results
|
416 |
fig.update_layout(
|
417 |
-
polar=dict(radialaxis=dict(visible=True
|
|
|
|
|
|
|
418 |
)
|
419 |
|
420 |
logger.info("Successfully produced radial plot.")
|
|
|
159 |
"of different language models on different tasks. It is based on the "
|
160 |
"generative results from the [ScandEval benchmark](https://scandeval.com)."
|
161 |
)
|
162 |
+
with gr.Column():
|
163 |
+
with gr.Row():
|
164 |
language_names_dropdown = gr.Dropdown(
|
165 |
choices=all_languages,
|
166 |
multiselect=True,
|
167 |
label="Languages",
|
168 |
value=["Danish"],
|
169 |
interactive=True,
|
170 |
+
scale=2,
|
171 |
)
|
172 |
model_ids_dropdown = gr.Dropdown(
|
173 |
choices=danish_models,
|
|
|
175 |
label="Models",
|
176 |
value=["gpt-4-0613", "mistralai/Mistral-7B-v0.1"],
|
177 |
interactive=True,
|
178 |
+
scale=2,
|
179 |
)
|
180 |
use_win_ratio_checkbox = gr.Checkbox(
|
181 |
label="Compare models with win ratios (as opposed to raw scores)",
|
182 |
value=True,
|
183 |
interactive=True,
|
184 |
+
scale=1,
|
185 |
)
|
186 |
+
with gr.Row():
|
|
|
|
|
|
|
|
|
187 |
plot = gr.Plot(
|
188 |
value=produce_radial_plot(
|
189 |
model_ids_dropdown.value,
|
|
|
192 |
results_dfs=results_dfs,
|
193 |
),
|
194 |
)
|
195 |
+
with gr.Row():
|
196 |
+
gr.Markdown(
|
197 |
+
"<center>Made with ❤️ by the <a href=\"https://alexandra.dk\">"
|
198 |
+
"Alexandra Institute</a>.</center>"
|
199 |
+
)
|
200 |
|
201 |
language_names_dropdown.change(
|
202 |
fn=partial(update_model_ids_dropdown, results_dfs=results_dfs),
|
|
|
375 |
if model_id not in results_dfs_filtered[language].index:
|
376 |
continue
|
377 |
score = results_dfs_filtered[language].loc[model_id][task]
|
378 |
+
win_ratio = 100 * np.mean([
|
379 |
score >= other_score
|
380 |
for other_score in results_dfs_filtered[language][task].dropna()
|
381 |
])
|
|
|
387 |
result_list.append(np.mean(scores))
|
388 |
results.append(result_list)
|
389 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
390 |
# Add the results to a plotly figure
|
391 |
fig = go.Figure()
|
392 |
for model_id, result_list in zip(model_ids, results):
|
393 |
+
|
394 |
+
# Generate colour for model, as an RGB triplet. The same model will always
|
395 |
+
# have the same colour
|
396 |
+
random.seed(model_id)
|
397 |
+
r, g, b = tuple(random.randint(0, 255) for _ in range(3))
|
398 |
+
|
399 |
fig.add_trace(go.Scatterpolar(
|
400 |
r=result_list,
|
401 |
theta=[task.name for task in tasks],
|
402 |
fill='toself',
|
403 |
name=model_id,
|
404 |
+
line=dict(color=f'rgb({r}, {g}, {b})'),
|
405 |
))
|
406 |
|
407 |
languages_str = ""
|
|
|
417 |
|
418 |
# Builds the radial plot from the results
|
419 |
fig.update_layout(
|
420 |
+
polar=dict(radialaxis=dict(visible=True, range=[0, 100])),
|
421 |
+
showlegend=True,
|
422 |
+
title=title,
|
423 |
+
width=800,
|
424 |
)
|
425 |
|
426 |
logger.info("Successfully produced radial plot.")
|