pminervini committed on
Commit
855cd65
·
1 Parent(s): 4476a5b
Files changed (1) hide show
  1. app.py +88 -82
app.py CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
4
  import pandas as pd
5
 
6
  from apscheduler.schedulers.background import BackgroundScheduler
 
7
  from huggingface_hub import snapshot_download
8
 
9
  from src.display.about import (
@@ -14,8 +15,9 @@ from src.display.about import (
14
  LLM_BENCHMARKS_TEXT,
15
  LLM_BENCHMARKS_DETAILS,
16
  FAQ_TEXT,
17
- TITLE,
18
  )
 
19
  from src.display.css_html_js import custom_css
20
 
21
  from src.display.utils import (
@@ -38,33 +40,44 @@ from src.submission.submit import add_new_eval
38
  from src.utils import get_dataset_summary_table
39
 
40
 
41
- def restart_space():
42
- API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
43
-
44
-
45
  def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
46
  try:
47
  print(local_dir)
48
  snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type=repo_type, tqdm_class=tqdm_class, etag_timeout=etag_timeout)
49
- except Exception:
50
  restart_space()
51
 
52
- dataset_df = get_dataset_summary_table(file_path='blog/Hallucination-Leaderboard-Summary.csv')
53
 
54
- ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
55
- ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
56
 
57
 
58
- raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
59
- leaderboard_df = original_df.copy()
 
 
 
 
 
60
 
61
- finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
 
 
 
 
 
62
 
63
 
64
  # Searching and filtering
65
- def update_table(hidden_df: pd.DataFrame, columns: list, type_query: list, precision_query: list, size_query: list, query: str):
66
- show_deleted = True
67
- filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
 
 
 
 
68
  filtered_df = filter_queries(query, filtered_df)
69
  df = select_columns(filtered_df, columns)
70
  return df
@@ -75,13 +88,15 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
75
 
76
 
77
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
78
- always_here_cols = [
79
- AutoEvalColumn.model_type_symbol.name,
80
- AutoEvalColumn.model.name,
81
- ]
 
82
  # We use COLS to maintain sorting
83
  filtered_df = df[
84
- always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
 
85
  ]
86
  return filtered_df
87
 
@@ -98,19 +113,17 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):
98
  final_df.append(temp_filtered_df)
99
  if len(final_df) > 0:
100
  filtered_df = pd.concat(final_df)
101
- filtered_df = filtered_df.drop_duplicates(
102
- subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
103
- )
104
-
105
  return filtered_df
106
 
107
 
108
- def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool) -> pd.DataFrame:
 
 
 
109
  # Show all models
110
- if show_deleted:
111
- filtered_df = df
112
- else: # Show only still on the hub models
113
- filtered_df = df[df[AutoEvalColumn.still_on_hub.name] is True]
114
 
115
  type_emoji = [t[0] for t in type_query]
116
  filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
@@ -124,17 +137,27 @@ def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precisio
124
  return filtered_df
125
 
126
 
 
 
 
 
 
 
127
  demo = gr.Blocks(css=custom_css)
128
  with demo:
129
  gr.HTML(TITLE)
130
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
131
 
132
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
133
- with gr.TabItem("Hallucination Benchmark", elem_id="llm-benchmark-tab-table", id=0):
 
 
134
  with gr.Row():
135
  with gr.Column():
136
  with gr.Row():
137
- search_bar = gr.Textbox(placeholder=" 🔍 Model search (separate multiple queries with `;`)", show_label=False, elem_id="search-bar",)
 
 
138
  with gr.Row():
139
  shown_columns = gr.CheckboxGroup(
140
  choices=[
@@ -149,8 +172,7 @@ with demo:
149
  ],
150
  label="Select columns to show",
151
  elem_id="column-select",
152
- interactive=True,
153
- )
154
 
155
  with gr.Column(min_width=320):
156
  filter_columns_type = gr.CheckboxGroup(
@@ -158,44 +180,39 @@ with demo:
158
  choices=[t.to_str() for t in ModelType],
159
  value=[t.to_str() for t in ModelType],
160
  interactive=True,
161
- elem_id="filter-columns-type",
162
- )
163
  filter_columns_precision = gr.CheckboxGroup(
164
  label="Precision",
165
  choices=[i.value.name for i in Precision],
166
  value=[i.value.name for i in Precision],
167
  interactive=True,
168
- elem_id="filter-columns-precision",
169
- )
170
  filter_columns_size = gr.CheckboxGroup(
171
  label="Model sizes (in billions of parameters)",
172
  choices=list(NUMERIC_INTERVALS.keys()),
173
  value=list(NUMERIC_INTERVALS.keys()),
174
  interactive=True,
175
- elem_id="filter-columns-size",
176
- )
177
 
178
  leaderboard_table = gr.components.Dataframe(
179
  value=leaderboard_df[
180
- [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
181
- + shown_columns.value
182
- + [AutoEvalColumn.dummy.name]
183
  ] if leaderboard_df.empty is False else leaderboard_df,
184
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
185
  datatype=TYPES,
186
  elem_id="leaderboard-table",
187
  interactive=False,
188
- visible=True,
189
- column_widths=["2%", "20%"]
190
- )
191
 
192
  # Dummy leaderboard for handling the case when the user uses backspace key
193
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
194
  value=original_df[COLS] if original_df.empty is False else original_df,
195
  headers=COLS,
196
  datatype=TYPES,
197
- visible=False
198
- )
199
  search_bar.submit(
200
  update_table,
201
  [
@@ -206,8 +223,11 @@ with demo:
206
  filter_columns_size,
207
  search_bar,
208
  ],
209
- leaderboard_table,
210
- )
 
 
 
211
  for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
212
  selector.change(
213
  update_table,
@@ -220,8 +240,7 @@ with demo:
220
  search_bar,
221
  ],
222
  leaderboard_table,
223
- queue=True,
224
- )
225
 
226
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
227
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -238,48 +257,38 @@ with demo:
238
  gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
239
  gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
240
 
241
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
242
  with gr.Column():
243
  with gr.Row():
244
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
245
 
246
  with gr.Column():
247
- with gr.Accordion(
248
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
249
- open=False,
250
- ):
251
  with gr.Row():
252
  finished_eval_table = gr.components.Dataframe(
253
  value=finished_eval_queue_df,
254
  headers=EVAL_COLS,
255
  datatype=EVAL_TYPES,
256
- row_count=5
257
- )
258
- with gr.Accordion(
259
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
260
- open=False,
261
- ):
262
  with gr.Row():
263
  running_eval_table = gr.components.Dataframe(
264
  value=running_eval_queue_df,
265
  headers=EVAL_COLS,
266
  datatype=EVAL_TYPES,
267
- row_count=5
268
- )
269
 
270
- with gr.Accordion(
271
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
272
- open=False,
273
- ):
274
  with gr.Row():
275
  pending_eval_table = gr.components.Dataframe(
276
  value=pending_eval_queue_df,
277
  headers=EVAL_COLS,
278
  datatype=EVAL_TYPES,
279
- row_count=5
280
- )
281
  with gr.Row():
282
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
283
 
284
  with gr.Row():
285
  with gr.Column():
@@ -291,8 +300,7 @@ with demo:
291
  label="Model type",
292
  multiselect=False,
293
  value=None,
294
- interactive=True,
295
- )
296
 
297
  with gr.Column():
298
  precision = gr.Dropdown(
@@ -300,15 +308,15 @@ with demo:
300
  label="Precision",
301
  multiselect=False,
302
  value="float32",
303
- interactive=True,
304
- )
305
  weight_type = gr.Dropdown(
306
  choices=[i.value.name for i in WeightType],
307
  label="Weights type",
308
  multiselect=False,
309
  value="Original",
310
- interactive=True,
311
- )
312
  base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
313
 
314
  submit_button = gr.Button("Submit Eval")
@@ -324,18 +332,16 @@ with demo:
324
  weight_type,
325
  model_type,
326
  ],
327
- submission_result,
328
- )
329
 
330
  with gr.Row():
331
- with gr.Accordion("📙 Citation", open=False):
332
  citation_button = gr.Textbox(
333
  value=CITATION_BUTTON_TEXT,
334
  label=CITATION_BUTTON_LABEL,
335
  lines=20,
336
  elem_id="citation-button",
337
- show_copy_button=True,
338
- )
339
 
340
  scheduler = BackgroundScheduler()
341
 
 
4
  import pandas as pd
5
 
6
  from apscheduler.schedulers.background import BackgroundScheduler
7
+
8
  from huggingface_hub import snapshot_download
9
 
10
  from src.display.about import (
 
15
  LLM_BENCHMARKS_TEXT,
16
  LLM_BENCHMARKS_DETAILS,
17
  FAQ_TEXT,
18
+ TITLE
19
  )
20
+
21
  from src.display.css_html_js import custom_css
22
 
23
  from src.display.utils import (
 
40
  from src.utils import get_dataset_summary_table
41
 
42
 
 
 
 
 
43
  def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
44
  try:
45
  print(local_dir)
46
  snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type=repo_type, tqdm_class=tqdm_class, etag_timeout=etag_timeout)
47
+ except Exception as e:
48
  restart_space()
49
 
 
50
 
51
+ def restart_space():
52
+ API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
53
 
54
 
55
+ def init_space():
56
+ dataset_df = get_dataset_summary_table(file_path='blog/Hallucination-Leaderboard-Summary.csv')
57
+
58
+ import socket
59
+ if socket.gethostname() not in {'neuromancer'}:
60
+ ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
61
+ ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
62
 
63
+ raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
64
+
65
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
66
+ return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
67
+
68
+
69
+ dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
70
+ leaderboard_df = original_df.copy()
71
 
72
 
73
  # Searching and filtering
74
+ def update_table(hidden_df: pd.DataFrame,
75
+ columns: list,
76
+ type_query: list,
77
+ precision_query: list,
78
+ size_query: list,
79
+ query: str):
80
+ filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
81
  filtered_df = filter_queries(query, filtered_df)
82
  df = select_columns(filtered_df, columns)
83
  return df
 
88
 
89
 
90
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
91
+ # always_here_cols = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
92
+
93
+ always_here_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
94
+ dummy_col = [AutoEvalColumn.dummy.name]
95
+
96
  # We use COLS to maintain sorting
97
  filtered_df = df[
98
+ # always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
99
+ always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col
100
  ]
101
  return filtered_df
102
 
 
113
  final_df.append(temp_filtered_df)
114
  if len(final_df) > 0:
115
  filtered_df = pd.concat(final_df)
116
+ subset = [AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
117
+ filtered_df = filtered_df.drop_duplicates(subset=subset)
 
 
118
  return filtered_df
119
 
120
 
121
+ def filter_models(df: pd.DataFrame,
122
+ type_query: list,
123
+ size_query: list,
124
+ precision_query: list) -> pd.DataFrame:
125
  # Show all models
126
+ filtered_df = df
 
 
 
127
 
128
  type_emoji = [t[0] for t in type_query]
129
  filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
 
137
  return filtered_df
138
 
139
 
140
+ # triggered only once at startup => read query parameter if it exists
141
+ def load_query(request: gr.Request):
142
+ query = request.query_params.get("query") or ""
143
+ return query
144
+
145
+
146
  demo = gr.Blocks(css=custom_css)
147
  with demo:
148
  gr.HTML(TITLE)
149
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
150
 
151
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
152
+ with gr.TabItem("Hallucinations Benchmark",
153
+ elem_id="llm-benchmark-tab-table",
154
+ id=0):
155
  with gr.Row():
156
  with gr.Column():
157
  with gr.Row():
158
+ search_bar = gr.Textbox(placeholder=" 🔍 Model search (separate multiple queries with `;`)",
159
+ show_label=False,
160
+ elem_id="search-bar")
161
  with gr.Row():
162
  shown_columns = gr.CheckboxGroup(
163
  choices=[
 
172
  ],
173
  label="Select columns to show",
174
  elem_id="column-select",
175
+ interactive=True)
 
176
 
177
  with gr.Column(min_width=320):
178
  filter_columns_type = gr.CheckboxGroup(
 
180
  choices=[t.to_str() for t in ModelType],
181
  value=[t.to_str() for t in ModelType],
182
  interactive=True,
183
+ elem_id="filter-columns-type")
184
+
185
  filter_columns_precision = gr.CheckboxGroup(
186
  label="Precision",
187
  choices=[i.value.name for i in Precision],
188
  value=[i.value.name for i in Precision],
189
  interactive=True,
190
+ elem_id="filter-columns-precision")
191
+
192
  filter_columns_size = gr.CheckboxGroup(
193
  label="Model sizes (in billions of parameters)",
194
  choices=list(NUMERIC_INTERVALS.keys()),
195
  value=list(NUMERIC_INTERVALS.keys()),
196
  interactive=True,
197
+ elem_id="filter-columns-size")
 
198
 
199
  leaderboard_table = gr.components.Dataframe(
200
  value=leaderboard_df[
201
+ [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value + [AutoEvalColumn.dummy.name]
 
 
202
  ] if leaderboard_df.empty is False else leaderboard_df,
203
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
204
  datatype=TYPES,
205
  elem_id="leaderboard-table",
206
  interactive=False,
207
+ visible=True)
 
 
208
 
209
  # Dummy leaderboard for handling the case when the user uses backspace key
210
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
211
  value=original_df[COLS] if original_df.empty is False else original_df,
212
  headers=COLS,
213
  datatype=TYPES,
214
+ visible=False)
215
+
216
  search_bar.submit(
217
  update_table,
218
  [
 
223
  filter_columns_size,
224
  search_bar,
225
  ],
226
+ leaderboard_table)
227
+
228
+ # Check query parameter once at startup and update search bar
229
+ demo.load(load_query, inputs=[], outputs=[search_bar])
230
+
231
  for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
232
  selector.change(
233
  update_table,
 
240
  search_bar,
241
  ],
242
  leaderboard_table,
243
+ queue=True)
 
244
 
245
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
246
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
257
  gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
258
  gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
259
 
260
+ with gr.TabItem("Submit a model ", elem_id="llm-benchmark-tab-table", id=3):
261
  with gr.Column():
262
  with gr.Row():
263
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
264
 
265
  with gr.Column():
266
+ with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
 
 
 
267
  with gr.Row():
268
  finished_eval_table = gr.components.Dataframe(
269
  value=finished_eval_queue_df,
270
  headers=EVAL_COLS,
271
  datatype=EVAL_TYPES,
272
+ row_count=5)
273
+
274
+ with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
 
 
 
275
  with gr.Row():
276
  running_eval_table = gr.components.Dataframe(
277
  value=running_eval_queue_df,
278
  headers=EVAL_COLS,
279
  datatype=EVAL_TYPES,
280
+ row_count=5)
 
281
 
282
+ with gr.Accordion(f"⏳ Scheduled Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
 
 
 
283
  with gr.Row():
284
  pending_eval_table = gr.components.Dataframe(
285
  value=pending_eval_queue_df,
286
  headers=EVAL_COLS,
287
  datatype=EVAL_TYPES,
288
+ row_count=5)
289
+
290
  with gr.Row():
291
+ gr.Markdown("# Submit your model here", elem_classes="markdown-text")
292
 
293
  with gr.Row():
294
  with gr.Column():
 
300
  label="Model type",
301
  multiselect=False,
302
  value=None,
303
+ interactive=True)
 
304
 
305
  with gr.Column():
306
  precision = gr.Dropdown(
 
308
  label="Precision",
309
  multiselect=False,
310
  value="float32",
311
+ interactive=True)
312
+
313
  weight_type = gr.Dropdown(
314
  choices=[i.value.name for i in WeightType],
315
  label="Weights type",
316
  multiselect=False,
317
  value="Original",
318
+ interactive=True)
319
+
320
  base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
321
 
322
  submit_button = gr.Button("Submit Eval")
 
332
  weight_type,
333
  model_type,
334
  ],
335
+ submission_result)
 
336
 
337
  with gr.Row():
338
+ with gr.Accordion("Citing this leaderboard", open=False):
339
  citation_button = gr.Textbox(
340
  value=CITATION_BUTTON_TEXT,
341
  label=CITATION_BUTTON_LABEL,
342
  lines=20,
343
  elem_id="citation-button",
344
+ show_copy_button=True)
 
345
 
346
  scheduler = BackgroundScheduler()
347