saattrupdan committed on
Commit
76e4363
·
1 Parent(s): 3e57038

feat: Change layout, fix task order, fix colours for models, fix range

Browse files
Files changed (2) hide show
  1. .gitignore +1 -0
  2. app.py +23 -17
.gitignore CHANGED
@@ -1 +1,2 @@
1
  .venv
 
 
1
  .venv
2
+ __pycache__
app.py CHANGED
@@ -159,14 +159,15 @@ def main() -> None:
159
  "of different language models on different tasks. It is based on the "
160
  "generative results from the [ScandEval benchmark](https://scandeval.com)."
161
  )
162
- with gr.Row():
163
- with gr.Column():
164
  language_names_dropdown = gr.Dropdown(
165
  choices=all_languages,
166
  multiselect=True,
167
  label="Languages",
168
  value=["Danish"],
169
  interactive=True,
 
170
  )
171
  model_ids_dropdown = gr.Dropdown(
172
  choices=danish_models,
@@ -174,17 +175,15 @@ def main() -> None:
174
  label="Models",
175
  value=["gpt-4-0613", "mistralai/Mistral-7B-v0.1"],
176
  interactive=True,
 
177
  )
178
  use_win_ratio_checkbox = gr.Checkbox(
179
  label="Compare models with win ratios (as opposed to raw scores)",
180
  value=True,
181
  interactive=True,
 
182
  )
183
- gr.Markdown(
184
- "<center>Made with ❤️ by the <a href=\"https://alexandra.dk\">"
185
- "Alexandra Institute</a>.</center>"
186
- )
187
- with gr.Column():
188
  plot = gr.Plot(
189
  value=produce_radial_plot(
190
  model_ids_dropdown.value,
@@ -193,6 +192,11 @@ def main() -> None:
193
  results_dfs=results_dfs,
194
  ),
195
  )
 
 
 
 
 
196
 
197
  language_names_dropdown.change(
198
  fn=partial(update_model_ids_dropdown, results_dfs=results_dfs),
@@ -371,7 +375,7 @@ def produce_radial_plot(
371
  if model_id not in results_dfs_filtered[language].index:
372
  continue
373
  score = results_dfs_filtered[language].loc[model_id][task]
374
- win_ratio = np.mean([
375
  score >= other_score
376
  for other_score in results_dfs_filtered[language][task].dropna()
377
  ])
@@ -383,22 +387,21 @@ def produce_radial_plot(
383
  result_list.append(np.mean(scores))
384
  results.append(result_list)
385
 
386
- # Sort the results to avoid misleading radial plots
387
- model_idx_with_highest_variance = np.argmax(
388
- [np.std(result_list) for result_list in results]
389
- )
390
- sorted_idxs = np.argsort(results[model_idx_with_highest_variance])
391
- results = [np.asarray(result_list)[sorted_idxs] for result_list in results]
392
- tasks = np.asarray(tasks)[sorted_idxs]
393
-
394
  # Add the results to a plotly figure
395
  fig = go.Figure()
396
  for model_id, result_list in zip(model_ids, results):
 
 
 
 
 
 
397
  fig.add_trace(go.Scatterpolar(
398
  r=result_list,
399
  theta=[task.name for task in tasks],
400
  fill='toself',
401
  name=model_id,
 
402
  ))
403
 
404
  languages_str = ""
@@ -414,7 +417,10 @@ def produce_radial_plot(
414
 
415
  # Builds the radial plot from the results
416
  fig.update_layout(
417
- polar=dict(radialaxis=dict(visible=True)), showlegend=True, title=title
 
 
 
418
  )
419
 
420
  logger.info("Successfully produced radial plot.")
 
159
  "of different language models on different tasks. It is based on the "
160
  "generative results from the [ScandEval benchmark](https://scandeval.com)."
161
  )
162
+ with gr.Column():
163
+ with gr.Row():
164
  language_names_dropdown = gr.Dropdown(
165
  choices=all_languages,
166
  multiselect=True,
167
  label="Languages",
168
  value=["Danish"],
169
  interactive=True,
170
+ scale=2,
171
  )
172
  model_ids_dropdown = gr.Dropdown(
173
  choices=danish_models,
 
175
  label="Models",
176
  value=["gpt-4-0613", "mistralai/Mistral-7B-v0.1"],
177
  interactive=True,
178
+ scale=2,
179
  )
180
  use_win_ratio_checkbox = gr.Checkbox(
181
  label="Compare models with win ratios (as opposed to raw scores)",
182
  value=True,
183
  interactive=True,
184
+ scale=1,
185
  )
186
+ with gr.Row():
 
 
 
 
187
  plot = gr.Plot(
188
  value=produce_radial_plot(
189
  model_ids_dropdown.value,
 
192
  results_dfs=results_dfs,
193
  ),
194
  )
195
+ with gr.Row():
196
+ gr.Markdown(
197
+ "<center>Made with ❤️ by the <a href=\"https://alexandra.dk\">"
198
+ "Alexandra Institute</a>.</center>"
199
+ )
200
 
201
  language_names_dropdown.change(
202
  fn=partial(update_model_ids_dropdown, results_dfs=results_dfs),
 
375
  if model_id not in results_dfs_filtered[language].index:
376
  continue
377
  score = results_dfs_filtered[language].loc[model_id][task]
378
+ win_ratio = 100 * np.mean([
379
  score >= other_score
380
  for other_score in results_dfs_filtered[language][task].dropna()
381
  ])
 
387
  result_list.append(np.mean(scores))
388
  results.append(result_list)
389
 
 
 
 
 
 
 
 
 
390
  # Add the results to a plotly figure
391
  fig = go.Figure()
392
  for model_id, result_list in zip(model_ids, results):
393
+
394
+ # Generate colour for model, as an RGB triplet. The same model will always
395
+ # have the same colour
396
+ random.seed(model_id)
397
+ r, g, b = tuple(random.randint(0, 255) for _ in range(3))
398
+
399
  fig.add_trace(go.Scatterpolar(
400
  r=result_list,
401
  theta=[task.name for task in tasks],
402
  fill='toself',
403
  name=model_id,
404
+ line=dict(color=f'rgb({r}, {g}, {b})'),
405
  ))
406
 
407
  languages_str = ""
 
417
 
418
  # Builds the radial plot from the results
419
  fig.update_layout(
420
+ polar=dict(radialaxis=dict(visible=True, range=[0, 100])),
421
+ showlegend=True,
422
+ title=title,
423
+ width=800,
424
  )
425
 
426
  logger.info("Successfully produced radial plot.")