hynky HF staff committed on
Commit
40e38d3
·
1 Parent(s): e23c5c4

UI overhaul + separation of concerns

Browse files
app.py CHANGED
@@ -1,618 +1,4 @@
1
- import heapq
2
- import json
3
- import os
4
- import re
5
- import tempfile
6
- from collections import defaultdict
7
- from concurrent.futures import ThreadPoolExecutor
8
- from functools import partial
9
- from pathlib import Path
10
- from typing import Literal
11
 
12
- import gradio as gr
13
- import numpy as np
14
- import plotly.express as px
15
- import plotly.graph_objects as go
16
- import tenacity
17
- from datatrove.io import get_datafolder
18
- from datatrove.utils.stats import MetricStatsDict
19
-
20
- PARTITION_OPTIONS = Literal["Top", "Bottom", "Most frequent (n_docs)"]
21
- METRICS_LOCATION_DEFAULT = os.getenv("METRICS_LOCATION_DEFAULT", "hf://datasets/HuggingFaceFW-Dev/summary-stats-files")
22
-
23
-
24
- def find_folders(base_folder, path):
25
- base_folder = get_datafolder(base_folder)
26
- if not base_folder.exists(path):
27
- return []
28
- return sorted(
29
- [
30
- folder["name"]
31
- for folder in base_folder.ls(path, detail=True)
32
- if folder["type"] == "directory" and not folder["name"].rstrip("/") == path
33
- ]
34
- )
35
-
36
-
37
- def find_metrics_folders(base_folder: str):
38
- base_data_folder = get_datafolder(base_folder)
39
- # First find all metric.json using globing for metric.json
40
- metrics_merged = base_data_folder.glob("**/metric.json")
41
-
42
- # Then for each of metrics.merged take the all but last two parts of the path (grouping/metric_name)
43
- metrics_folders = [str(Path(x).parent.parent.parent) for x in metrics_merged]
44
- # Finally get the unique paths
45
- return sorted(list(set(metrics_folders)))
46
-
47
-
48
- def fetch_datasets(base_folder: str):
49
- datasets = sorted(find_metrics_folders(base_folder))
50
- return datasets, gr.update(choices=datasets, value=None), fetch_groups(base_folder, datasets, None, "union")
51
-
52
-
53
- def export_data(exported_data: MetricStatsDict, metric_name: str):
54
- if not exported_data:
55
- return None
56
- # Assuming exported_data is a dictionary where the key is the dataset name and the value is the data to be exported
57
- with tempfile.NamedTemporaryFile(mode="w", delete=False, prefix=metric_name, suffix=".json") as temp:
58
- json.dump({
59
- name: sorted([{"value": key, **value} for key, value in dt.to_dict().items()], key=lambda x: x["value"])
60
- for name, dt in exported_data.items()
61
- }, temp, indent=2)
62
- temp_path = temp.name
63
- return gr.update(visible=True, value=temp_path)
64
-
65
-
66
- def fetch_groups(base_folder, datasets, old_groups, type="intersection"):
67
- if not datasets:
68
- return gr.update(choices=[], value=None)
69
-
70
- with ThreadPoolExecutor() as executor:
71
- GROUPS = list(executor.map(lambda run: [Path(x).name for x in find_folders(base_folder, run)], datasets))
72
- if len(GROUPS) == 0:
73
- return gr.update(choices=[], value=None)
74
-
75
- if type == "intersection":
76
- new_choices = set.intersection(*(set(g) for g in GROUPS))
77
- else:
78
- new_choices = set.union(*(set(g) for g in GROUPS))
79
- value = None
80
- if old_groups:
81
- value = list(set.intersection(new_choices, {old_groups}))
82
- value = value[0] if value else None
83
-
84
- if not value and len(new_choices) == 1:
85
- value = list(new_choices)[0]
86
-
87
- # now take the intersection of all grups
88
- return gr.update(choices=sorted(list(new_choices)), value=value)
89
-
90
-
91
- def fetch_metrics(base_folder, datasets, group, old_metrics, type="intersection"):
92
- if not group:
93
- return gr.update(choices=[], value=None)
94
-
95
- with ThreadPoolExecutor() as executor:
96
- metrics = list(
97
- executor.map(lambda run: [Path(x).name for x in find_folders(base_folder, f"{run}/{group}")], datasets))
98
- if len(metrics) == 0:
99
- return gr.update(choices=[], value=None)
100
-
101
- if type == "intersection":
102
- new_possibles_choices = set.intersection(*(set(s) for s in metrics))
103
- else:
104
- new_possibles_choices = set.union(*(set(s) for s in metrics))
105
- value = None
106
- if old_metrics:
107
- value = list(set.intersection(new_possibles_choices, {old_metrics}))
108
- value = value[0] if value else None
109
-
110
- if not value and len(new_possibles_choices) == 1:
111
- value = list(new_possibles_choices)[0]
112
-
113
- return gr.update(choices=sorted(list(new_possibles_choices)), value=value)
114
-
115
-
116
- def reverse_search(base_folder, possible_datasets, grouping, metric_name):
117
- with ThreadPoolExecutor() as executor:
118
- found_datasets = list(executor.map(
119
- lambda dataset: dataset if metric_exists(base_folder, dataset, metric_name, grouping) else None,
120
- possible_datasets))
121
- found_datasets = [dataset for dataset in found_datasets if dataset is not None]
122
- return "\n".join(found_datasets)
123
-
124
-
125
- def reverse_search_add(datasets, reverse_search_results):
126
- datasets = datasets or []
127
- return sorted(list(set(datasets + reverse_search_results.strip().split("\n"))))
128
-
129
-
130
- def metric_exists(base_folder, path, metric_name, group_by):
131
- base_folder = get_datafolder(base_folder)
132
- return base_folder.exists(f"{path}/{group_by}/{metric_name}/metric.json")
133
-
134
-
135
- @tenacity.retry(stop=tenacity.stop_after_attempt(5))
136
- def load_metrics(base_folder, path, metric_name, group_by):
137
- base_folder = get_datafolder(base_folder)
138
- with base_folder.open(
139
- f"{path}/{group_by}/{metric_name}/metric.json",
140
- ) as f:
141
- json_metric = json.load(f)
142
- # No idea why this is necessary, but it is, otheriwse the Metric StatsDict is malformed
143
- return MetricStatsDict.from_dict(json_metric)
144
-
145
-
146
- def prepare_for_non_grouped_plotting(metric, normalization, rounding):
147
- metrics_rounded = defaultdict(lambda: 0)
148
- for key, value in metric.items():
149
- metrics_rounded[round(float(key), rounding)] += value.total
150
- if normalization:
151
- normalizer = sum(metrics_rounded.values())
152
- metrics_rounded = {k: v / normalizer for k, v in metrics_rounded.items()}
153
- # check that the sum of the values is 1
154
- summed = sum(metrics_rounded.values())
155
- assert abs(summed - 1) < 0.01, summed
156
- return metrics_rounded
157
-
158
-
159
- def load_data(dataset_path, base_folder, grouping, metric_name):
160
- metrics = load_metrics(base_folder, dataset_path, metric_name, grouping)
161
- return metrics
162
-
163
-
164
- def prepare_for_group_plotting(metric, top_k, direction: PARTITION_OPTIONS, regex: str | None, rounding: int):
165
- regex_compiled = re.compile(regex) if regex else None
166
- metric = {key: value for key, value in metric.items() if not regex or regex_compiled.match(key)}
167
- means = {key: round(float(value.mean), rounding) for key, value in metric.items()}
168
- # Use heap to get top_k keys
169
- if direction == "Top":
170
- keys = heapq.nlargest(top_k, means, key=means.get)
171
- elif direction == "Most frequent (n_docs)":
172
- totals = {key: int(value.n) for key, value in metric.items()}
173
- keys = heapq.nlargest(top_k, totals, key=totals.get)
174
- else:
175
- keys = heapq.nsmallest(top_k, means, key=means.get)
176
-
177
- means = [means[key] for key in keys]
178
- stds = [metric[key].standard_deviation for key in keys]
179
- return keys, means, stds
180
-
181
-
182
- def set_alpha(color, alpha):
183
- """
184
- Takes a hex color and returns
185
- rgba(r, g, b, a)
186
- """
187
- if color.startswith('#'):
188
- r, g, b = int(color[1:3], 16), int(color[3:5], 16), int(color[5:7], 16)
189
- else:
190
- r, g, b = 0, 0, 0 # Fallback to black if the color format is not recognized
191
- return f"rgba({r}, {g}, {b}, {alpha})"
192
-
193
-
194
- def plot_scatter(
195
- data: dict[str, dict[float, float]],
196
- metric_name: str,
197
- log_scale_x: bool,
198
- log_scale_y: bool,
199
- normalization: bool,
200
- rounding: int,
201
- cumsum: bool,
202
- perc: bool,
203
- progress: gr.Progress,
204
- ):
205
- fig = go.Figure()
206
-
207
- # First sort the histograms, by their name
208
- data = {name: histogram for name, histogram in sorted(data.items())}
209
- for i, (name, histogram) in enumerate(progress.tqdm(data.items(), total=len(data), desc="Plotting...")):
210
- histogram_prepared = prepare_for_non_grouped_plotting(histogram, normalization, rounding)
211
- x = sorted(histogram_prepared.keys())
212
- y = [histogram_prepared[k] for k in x]
213
- if cumsum:
214
- y = np.cumsum(y).tolist()
215
- if perc:
216
- y = (np.array(y) * 100).tolist()
217
-
218
- fig.add_trace(
219
- go.Scatter(
220
- x=x,
221
- y=y,
222
- mode="lines",
223
- name=name,
224
- marker=dict(color=set_alpha(px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)], 0.5)),
225
- )
226
- )
227
-
228
- yaxis_title = "Frequency" if normalization else "Total"
229
-
230
- fig.update_layout(
231
- title=f"Line Plots for {metric_name}",
232
- xaxis_title=metric_name,
233
- yaxis_title=yaxis_title,
234
- xaxis_type="log" if log_scale_x and len(x) > 1 else None,
235
- yaxis_type="log" if log_scale_y and len(y) > 1 else None,
236
- width=1200,
237
- height=600,
238
- showlegend=True,
239
- )
240
-
241
- return fig
242
-
243
-
244
- def plot_bars(
245
- data: dict[str, list[dict[str, float]]],
246
- metric_name: str,
247
- top_k: int,
248
- direction: PARTITION_OPTIONS,
249
- regex: str | None,
250
- rounding: int,
251
- log_scale_x: bool,
252
- log_scale_y: bool,
253
- progress: gr.Progress,
254
- ):
255
- fig = go.Figure()
256
- x = []
257
- y = []
258
-
259
- for i, (name, histogram) in enumerate(progress.tqdm(data.items(), total=len(data), desc="Plotting...")):
260
- x, y, stds = prepare_for_group_plotting(histogram, top_k, direction, regex, rounding)
261
-
262
- fig.add_trace(go.Bar(
263
- x=x,
264
- y=y,
265
- name=f"{name} Mean",
266
- marker=dict(color=set_alpha(px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)], 0.5)),
267
- error_y=dict(type='data', array=stds, visible=True)
268
- ))
269
-
270
- fig.update_layout(
271
- title=f"Bar Plots for {metric_name}",
272
- xaxis_title=metric_name,
273
- yaxis_title="Avg. value",
274
- xaxis_type="log" if log_scale_x and len(x) > 1 else None,
275
- yaxis_type="log" if log_scale_y and len(y) > 1 else None,
276
- autosize=True,
277
- width=1200,
278
- height=600,
279
- showlegend=True,
280
- )
281
-
282
- return fig
283
-
284
-
285
- def get_desc(data):
286
- res = {name: list(dt.to_dict().keys()) for name, dt in data.items()}
287
- return "\n".join([
288
- f"**{name}**: min={min(dt)}, max={max(dt)}" for name, dt in res.items()
289
- ])
290
-
291
-
292
- def update_graph(
293
- base_folder,
294
- datasets,
295
- metric_name,
296
- grouping,
297
- log_scale_x,
298
- log_scale_y,
299
- rounding,
300
- normalization,
301
- top_k,
302
- direction,
303
- regex,
304
- cumsum,
305
- perc,
306
- progress=gr.Progress(),
307
- ):
308
- if len(datasets) <= 0 or not metric_name or not grouping:
309
- return None
310
- # Placeholder for logic to rerender the graph based on the inputs
311
-
312
- with ThreadPoolExecutor() as pool:
313
- data = list(
314
- progress.tqdm(
315
- pool.map(
316
- partial(load_data, base_folder=base_folder, metric_name=metric_name, grouping=grouping),
317
- datasets,
318
- ),
319
- total=len(datasets),
320
- desc="Loading data...",
321
- )
322
- )
323
-
324
- data = {path: result for path, result in zip(datasets, data)}
325
- return plot_data(data, metric_name, normalization, rounding, grouping, top_k, direction, regex, log_scale_x,
326
- log_scale_y, cumsum, perc, progress), data, export_data(data, metric_name), get_desc(data)
327
-
328
-
329
- def plot_data(data, metric_name, normalization, rounding, grouping, top_k, direction, regex, log_scale_x, log_scale_y,
330
- cumsum, perc,
331
- progress=gr.Progress()):
332
- if rounding is None or top_k is None:
333
- return None
334
- graph_fc = (
335
- partial(plot_scatter, normalization=normalization, rounding=rounding, cumsum=cumsum, perc=perc)
336
- if grouping == "histogram"
337
- else partial(plot_bars, top_k=top_k, direction=direction, regex=regex, rounding=rounding)
338
- )
339
- return graph_fc(data=data, metric_name=metric_name, progress=progress, log_scale_x=log_scale_x,
340
- log_scale_y=log_scale_y)
341
-
342
-
343
- # Create the Gradio interface
344
- with gr.Blocks() as demo:
345
- datasets = gr.State([])
346
- exported_data = gr.State([])
347
- metrics_headline = gr.Markdown(value="# Metrics Exploration")
348
- with gr.Row():
349
- with gr.Column(scale=2):
350
- with gr.Row():
351
- with gr.Column(scale=1):
352
- base_folder = gr.Textbox(
353
- label="Metrics Location",
354
- value=METRICS_LOCATION_DEFAULT,
355
- )
356
- datasets_refetch = gr.Button("Fetch Datasets")
357
-
358
- with gr.Column(scale=1):
359
- regex_select = gr.Text(label="Regex filter", value=".*")
360
- regex_button = gr.Button("Search")
361
- with gr.Row():
362
- datasets_selected = gr.Dropdown(
363
- choices=[],
364
- label="Datasets",
365
- multiselect=True,
366
- )
367
-
368
- # add a readme description
369
- readme_description = gr.Markdown(
370
- label="Readme",
371
- value="""
372
- ## How to use:
373
- 1) Specify Metrics location (Stats block `output_folder` without the last path segment) and click "Fetch Datasets"
374
- 2) Select datasets you are interested in using the dropdown or regex filter
375
- 3) Specify Grouping (global average/value/fqdn/suffix) and Metric name
376
- 4) Click "Update Graph"
377
-
378
-
379
- ## Groupings:
380
- - **histogram**: Creates a line plot of values with their frequencies. If normalization is on, the frequencies sum to 1.
381
- * normalize:
382
- - **(fqdn/suffix)**: Creates a bar plot of the avg. values of the metric for full qualifed domain name/suffix of domain.
383
- * k: the number of groups to show
384
- * Top/Bottom/Most frequent (n_docs): Groups with the top/bottom k values/most prevalant docs are shown
385
- - **none**: Shows the average value of given metric
386
-
387
- ## Reverse search:
388
- To search for datasets containing a grouping and certain metric, use the Reverse search section.
389
- Specify the search parameters and click "Search". This will show you found datasets in the "Found datasets" textbox. You can modify the selection after search by removing unwanted lines and clicking "Add to selection".
390
-
391
- ## Note:
392
- The data might not be 100% representative, due to the sampling and optimistic merging of the metrics (fqdn/suffix).
393
- """,
394
- )
395
- with gr.Column(scale=1):
396
- # Define the dropdown for grouping
397
- grouping_dropdown = gr.Dropdown(
398
- choices=[],
399
- label="Grouping",
400
- multiselect=False,
401
- )
402
- # Define the dropdown for metric_name
403
- metric_name_dropdown = gr.Dropdown(
404
- choices=[],
405
- label="Metric name",
406
- multiselect=False,
407
- )
408
-
409
- update_button = gr.Button("Update Graph", variant="primary")
410
-
411
- with gr.Row():
412
- with gr.Column(scale=1):
413
- cumsum_checkbox = gr.Checkbox(
414
- label="Cumsum",
415
- value=False,
416
- )
417
- perc_checkbox = gr.Checkbox(
418
- label="%",
419
- value=False,
420
- )
421
- log_scale_x_checkbox = gr.Checkbox(
422
- label="Log scale x",
423
- value=False,
424
- )
425
- log_scale_y_checkbox = gr.Checkbox(
426
- label="Log scale y",
427
- value=False,
428
- )
429
- rounding = gr.Number(
430
- label="Rounding",
431
- value=2,
432
- )
433
- normalization_checkbox = gr.Checkbox(
434
- label="Normalize",
435
- value=True, # Default value
436
- visible=False
437
- )
438
- with gr.Row():
439
- # export_data_button = gr.Button("Export data", visible=True, link=export_data_json)
440
- export_data_json = gr.File(visible=False)
441
- with gr.Column(visible=False) as min_max_hist:
442
- min_max_hist_data = gr.Markdown()
443
- with gr.Column(scale=4):
444
- with gr.Row(visible=False) as group_choices:
445
- with gr.Column(scale=2):
446
- group_regex = gr.Text(
447
- label="Group Regex",
448
- value=None,
449
- )
450
- with gr.Row():
451
- top_select = gr.Number(
452
- label="N Groups",
453
- value=100,
454
- interactive=True,
455
- )
456
-
457
- direction_checkbox = gr.Radio(
458
- label="Partition",
459
- choices=[
460
- "Top",
461
- "Bottom",
462
- "Most frequent (n_docs)",
463
- ],
464
- value="Most frequent (n_docs)",
465
- )
466
- # Define the graph output
467
- with gr.Row():
468
- graph_output = gr.Plot(label="Graph")
469
-
470
- with gr.Row():
471
- reverse_search_headline = gr.Markdown(value="# Reverse metrics search")
472
-
473
- with gr.Row():
474
- with gr.Column(scale=1):
475
- # Define the dropdown for grouping
476
- reverse_grouping_dropdown = gr.Dropdown(
477
- choices=[],
478
- label="Grouping",
479
- multiselect=False,
480
- )
481
- # Define the dropdown for metric_name
482
- reverse_metric_name_dropdown = gr.Dropdown(
483
- choices=[],
484
- label="Stat name",
485
- multiselect=False,
486
- )
487
-
488
- with gr.Column(scale=1):
489
- reverse_search_button = gr.Button("Search")
490
- reverse_search_add_button = gr.Button("Add to selection")
491
-
492
- with gr.Column(scale=2):
493
- reverse_search_results = gr.Textbox(
494
- label="Found datasets",
495
- lines=10,
496
- placeholder="Found datasets containing the group/metric name. You can modify the selection after search by removing unwanted lines and clicking Add to selection"
497
- )
498
-
499
- update_button.click(
500
- fn=update_graph,
501
- inputs=[
502
- base_folder,
503
- datasets_selected,
504
- metric_name_dropdown,
505
- grouping_dropdown,
506
- log_scale_x_checkbox,
507
- log_scale_y_checkbox,
508
- rounding,
509
- normalization_checkbox,
510
- top_select,
511
- direction_checkbox,
512
- group_regex,
513
- cumsum_checkbox,
514
- perc_checkbox
515
- ],
516
- outputs=[graph_output, exported_data, export_data_json, min_max_hist_data],
517
- )
518
-
519
- gr.on(
520
- triggers=[normalization_checkbox.change, rounding.change, group_regex.change, direction_checkbox.change,
521
- top_select.change, log_scale_x_checkbox.change,
522
- log_scale_y_checkbox.change, cumsum_checkbox.change, perc_checkbox.change],
523
- fn=plot_data,
524
- inputs=[
525
- exported_data,
526
- metric_name_dropdown,
527
- normalization_checkbox,
528
- rounding,
529
- grouping_dropdown,
530
- top_select,
531
- direction_checkbox,
532
- group_regex,
533
- log_scale_x_checkbox,
534
- log_scale_y_checkbox,
535
- cumsum_checkbox,
536
- perc_checkbox
537
- ],
538
- outputs=[graph_output],
539
- )
540
-
541
- datasets_selected.change(
542
- fn=fetch_groups,
543
- inputs=[base_folder, datasets_selected, grouping_dropdown],
544
- outputs=grouping_dropdown,
545
- )
546
-
547
- grouping_dropdown.change(
548
- fn=fetch_metrics,
549
- inputs=[base_folder, datasets_selected, grouping_dropdown, metric_name_dropdown],
550
- outputs=metric_name_dropdown,
551
- )
552
-
553
- reverse_grouping_dropdown.select(
554
- fn=partial(fetch_metrics, type="union"),
555
- inputs=[base_folder, datasets, reverse_grouping_dropdown, reverse_metric_name_dropdown],
556
- outputs=reverse_metric_name_dropdown,
557
- )
558
-
559
- reverse_search_button.click(
560
- fn=reverse_search,
561
- inputs=[base_folder, datasets, reverse_grouping_dropdown, reverse_metric_name_dropdown],
562
- outputs=reverse_search_results,
563
- )
564
-
565
- reverse_search_add_button.click(
566
- fn=reverse_search_add,
567
- inputs=[datasets_selected, reverse_search_results],
568
- outputs=datasets_selected,
569
- )
570
-
571
- datasets_refetch.click(
572
- fn=fetch_datasets,
573
- inputs=[base_folder],
574
- outputs=[datasets, datasets_selected, reverse_grouping_dropdown],
575
- )
576
-
577
-
578
- def update_datasets_with_regex(regex, selected_runs, all_runs):
579
- if not regex:
580
- return
581
- new_dsts = {run for run in all_runs if re.search(regex, run)}
582
- if not new_dsts:
583
- return gr.update(value=list(selected_runs))
584
- dst_union = new_dsts.union(selected_runs or [])
585
- return gr.update(value=sorted(list(dst_union)))
586
-
587
-
588
- regex_button.click(
589
- fn=update_datasets_with_regex,
590
- inputs=[regex_select, datasets_selected, datasets],
591
- outputs=datasets_selected,
592
- )
593
-
594
-
595
- def update_grouping_options(grouping):
596
- if grouping == "histogram":
597
- return {
598
- normalization_checkbox: gr.Column(visible=True),
599
- group_choices: gr.Column(visible=False),
600
- min_max_hist: gr.Column(visible=True),
601
- }
602
- else:
603
- return {
604
- normalization_checkbox: gr.Column(visible=False),
605
- group_choices: gr.Column(visible=True),
606
- min_max_hist: gr.Column(visible=False),
607
- }
608
-
609
-
610
- grouping_dropdown.change(
611
- fn=update_grouping_options,
612
- inputs=[grouping_dropdown],
613
- outputs=[normalization_checkbox, group_choices, min_max_hist],
614
- )
615
-
616
- # Launch the application
617
- if __name__ == "__main__":
618
- demo.launch()
 
1
+ from src.view.view import create_interface
 
 
 
 
 
 
 
 
 
2
 
3
+ demo = create_interface()
4
+ demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/logic/__pycache__/data_fetching.cpython-312.pyc ADDED
Binary file (8.43 kB). View file
 
src/logic/__pycache__/data_processing.cpython-312.pyc ADDED
Binary file (4.59 kB). View file
 
src/logic/__pycache__/plotting.cpython-312.pyc ADDED
Binary file (5.12 kB). View file
 
src/logic/__pycache__/utils.cpython-312.pyc ADDED
Binary file (1.36 kB). View file
 
src/logic/data_fetching.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tempfile
4
+ from pathlib import Path
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ from typing import List, Dict
7
+ from datatrove.io import get_datafolder
8
+ from datatrove.utils.stats import MetricStatsDict
9
+ import gradio as gr
10
+ import tenacity
11
+
12
+ def find_folders(base_folder: str, path: str) -> List[str]:
13
+ base_folder = get_datafolder(base_folder)
14
+ if not base_folder.exists(path):
15
+ return []
16
+ return sorted(
17
+ [
18
+ folder["name"]
19
+ for folder in base_folder.ls(path, detail=True)
20
+ if folder["type"] == "directory" and not folder["name"].rstrip("/") == path
21
+ ]
22
+ )
23
+
24
+ def find_metrics_folders(base_folder: str) -> List[str]:
25
+ base_data_df = get_datafolder(base_folder)
26
+ dirs = sorted(
27
+ folder
28
+ for folder, info in base_data_df.find("", detail=True, maxdepth=1, withdirs=True).items()
29
+ if info["type"] == "directory"
30
+ )
31
+ return sorted(list(set(dirs)))
32
+
33
+ def fetch_datasets(base_folder: str):
34
+ datasets = sorted(find_metrics_folders(base_folder))
35
+ return datasets, gr.update(choices=datasets, value=None), fetch_groups(base_folder, datasets, None, "union")
36
+
37
+ def fetch_groups(base_folder: str, datasets: List[str], old_groups: str, type: str = "intersection"):
38
+ if not datasets:
39
+ return gr.update(choices=[], value=None)
40
+
41
+ with ThreadPoolExecutor() as executor:
42
+ GROUPS = list(executor.map(lambda run: [Path(x).name for x in find_folders(base_folder, run)], datasets))
43
+ if len(GROUPS) == 0:
44
+ return gr.update(choices=[], value=None)
45
+
46
+ if type == "intersection":
47
+ new_choices = set.intersection(*(set(g) for g in GROUPS))
48
+ else:
49
+ new_choices = set.union(*(set(g) for g in GROUPS))
50
+ value = None
51
+ if old_groups:
52
+ value = list(set.intersection(new_choices, {old_groups}))
53
+ value = value[0] if value else None
54
+
55
+ if not value and len(new_choices) == 1:
56
+ value = list(new_choices)[0]
57
+
58
+ return gr.update(choices=sorted(list(new_choices)), value=value)
59
+
60
+ def fetch_metrics(base_folder: str, datasets: List[str], group: str, old_metrics: str, type: str = "intersection"):
61
+ if not group:
62
+ return gr.update(choices=[], value=None)
63
+
64
+ with ThreadPoolExecutor() as executor:
65
+ metrics = list(
66
+ executor.map(lambda run: [Path(x).name for x in find_folders(base_folder, f"{run}/{group}")], datasets))
67
+ if len(metrics) == 0:
68
+ return gr.update(choices=[], value=None)
69
+
70
+ if type == "intersection":
71
+ new_possibles_choices = set.intersection(*(set(s) for s in metrics))
72
+ else:
73
+ new_possibles_choices = set.union(*(set(s) for s in metrics))
74
+ value = None
75
+ if old_metrics:
76
+ value = list(set.intersection(new_possibles_choices, {old_metrics}))
77
+ value = value[0] if value else None
78
+
79
+ if not value and len(new_possibles_choices) == 1:
80
+ value = list(new_possibles_choices)[0]
81
+
82
+ return gr.update(choices=sorted(list(new_possibles_choices)), value=value)
83
+
84
+ def reverse_search(base_folder: str, possible_datasets: List[str], grouping: str, metric_name: str) -> str:
85
+ with ThreadPoolExecutor() as executor:
86
+ found_datasets = list(executor.map(
87
+ lambda dataset: dataset if metric_exists(base_folder, dataset, metric_name, grouping) else None,
88
+ possible_datasets))
89
+ found_datasets = [dataset for dataset in found_datasets if dataset is not None]
90
+ return "\n".join(found_datasets)
91
+
92
+ def reverse_search_add(datasets: List[str], reverse_search_results: str) -> List[str]:
93
+ datasets = datasets or []
94
+ return sorted(list(set(datasets + reverse_search_results.strip().split("\n"))))
95
+
96
+ def metric_exists(base_folder: str, path: str, metric_name: str, group_by: str) -> bool:
97
+ base_folder = get_datafolder(base_folder)
98
+ return base_folder.exists(f"{path}/{group_by}/{metric_name}/metric.json")
99
+
100
+ @tenacity.retry(stop=tenacity.stop_after_attempt(5))
101
+ def load_metrics(base_folder: str, path: str, metric_name: str, group_by: str) -> MetricStatsDict:
102
+ base_folder = get_datafolder(base_folder)
103
+ with base_folder.open(f"{path}/{group_by}/{metric_name}/metric.json") as f:
104
+ json_metric = json.load(f)
105
+ return MetricStatsDict.from_dict(json_metric)
106
+
107
+ def load_data(dataset_path: str, base_folder: str, grouping: str, metric_name: str) -> MetricStatsDict:
108
+ return load_metrics(base_folder, dataset_path, metric_name, grouping)
src/logic/data_processing.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ import heapq
4
+ from collections import defaultdict
5
+ import tempfile
6
+ from typing import Dict, Tuple, List, Literal
7
+ import gradio as gr
8
+ from datatrove.utils.stats import MetricStatsDict
9
+
10
+ PARTITION_OPTIONS = Literal["Top", "Bottom", "Most frequent (n_docs)"]
11
+
12
+ def prepare_for_non_grouped_plotting(metric: Dict[str, MetricStatsDict], normalization: bool, rounding: int) -> Dict[float, float]:
13
+ metrics_rounded = defaultdict(lambda: 0)
14
+ for key, value in metric.items():
15
+ metrics_rounded[round(float(key), rounding)] += value.total
16
+ if normalization:
17
+ normalizer = sum(metrics_rounded.values())
18
+ metrics_rounded = {k: v / normalizer for k, v in metrics_rounded.items()}
19
+ assert abs(sum(metrics_rounded.values()) - 1) < 0.01
20
+ return metrics_rounded
21
+
22
+ def prepare_for_group_plotting(metric: Dict[str, MetricStatsDict], top_k: int, direction: PARTITION_OPTIONS, regex: str | None, rounding: int) -> Tuple[List[str], List[float], List[float]]:
23
+ regex_compiled = re.compile(regex) if regex else None
24
+ metric = {key: value for key, value in metric.items() if not regex or regex_compiled.match(key)}
25
+ means = {key: round(float(value.mean), rounding) for key, value in metric.items()}
26
+ if direction == "Top":
27
+ keys = heapq.nlargest(top_k, means, key=means.get)
28
+ elif direction == "Most frequent (n_docs)":
29
+ totals = {key: int(value.n) for key, value in metric.items()}
30
+ keys = heapq.nlargest(top_k, totals, key=totals.get)
31
+ else:
32
+ keys = heapq.nsmallest(top_k, means, key=means.get)
33
+
34
+ means = [means[key] for key in keys]
35
+ stds = [metric[key].standard_deviation for key in keys]
36
+ return keys, means, stds
37
+
38
+ def export_data(exported_data: Dict[str, MetricStatsDict], metric_name: str):
39
+ if not exported_data:
40
+ return None
41
+ with tempfile.NamedTemporaryFile(mode="w", delete=False, prefix=metric_name, suffix=".json") as temp:
42
+ json.dump({
43
+ name: sorted([{"value": key, **value} for key, value in dt.to_dict().items()], key=lambda x: x["value"])
44
+ for name, dt in exported_data.items()
45
+ }, temp, indent=2)
46
+ temp_path = temp.name
47
+ return gr.update(visible=True, value=temp_path)
src/logic/plotting.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import partial
2
+ import plotly.express as px
3
+ import plotly.graph_objects as go
4
+ import numpy as np
5
+ import gradio as gr
6
+ from typing import Dict, List
7
+ from .data_processing import prepare_for_non_grouped_plotting, prepare_for_group_plotting
8
+ from .utils import set_alpha
9
+
10
+ def plot_scatter(
11
+ data: Dict[str, Dict[float, float]],
12
+ metric_name: str,
13
+ log_scale_x: bool,
14
+ log_scale_y: bool,
15
+ normalization: bool,
16
+ rounding: int,
17
+ cumsum: bool,
18
+ perc: bool,
19
+ progress: gr.Progress,
20
+ ):
21
+ fig = go.Figure()
22
+ data = {name: histogram for name, histogram in sorted(data.items())}
23
+ for i, (name, histogram) in enumerate(progress.tqdm(data.items(), total=len(data), desc="Plotting...")):
24
+ histogram_prepared = prepare_for_non_grouped_plotting(histogram, normalization, rounding)
25
+ x = sorted(histogram_prepared.keys())
26
+ y = [histogram_prepared[k] for k in x]
27
+ if cumsum:
28
+ y = np.cumsum(y).tolist()
29
+ if perc:
30
+ y = (np.array(y) * 100).tolist()
31
+
32
+ fig.add_trace(
33
+ go.Scatter(
34
+ x=x,
35
+ y=y,
36
+ mode="lines",
37
+ name=name,
38
+ marker=dict(color=set_alpha(px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)], 0.5)),
39
+ )
40
+ )
41
+
42
+ yaxis_title = "Frequency" if normalization else "Total"
43
+
44
+ fig.update_layout(
45
+ title=f"Line Plots for {metric_name}",
46
+ xaxis_title=metric_name,
47
+ yaxis_title=yaxis_title,
48
+ xaxis_type="log" if log_scale_x and len(x) > 1 else None,
49
+ yaxis_type="log" if log_scale_y and len(y) > 1 else None,
50
+ width=1200,
51
+ height=600,
52
+ showlegend=True,
53
+ )
54
+
55
+ return fig
56
+
57
+ def plot_bars(
58
+ data: Dict[str, List[Dict[str, float]]],
59
+ metric_name: str,
60
+ top_k: int,
61
+ direction: str,
62
+ regex: str | None,
63
+ rounding: int,
64
+ log_scale_x: bool,
65
+ log_scale_y: bool,
66
+ progress: gr.Progress,
67
+ ):
68
+ fig = go.Figure()
69
+ x = []
70
+ y = []
71
+
72
+ for i, (name, histogram) in enumerate(progress.tqdm(data.items(), total=len(data), desc="Plotting...")):
73
+ x, y, stds = prepare_for_group_plotting(histogram, top_k, direction, regex, rounding)
74
+
75
+ fig.add_trace(go.Bar(
76
+ x=x,
77
+ y=y,
78
+ name=f"{name} Mean",
79
+ marker=dict(color=set_alpha(px.colors.qualitative.Plotly[i % len(px.colors.qualitative.Plotly)], 0.5)),
80
+ error_y=dict(type='data', array=stds, visible=True)
81
+ ))
82
+
83
+ fig.update_layout(
84
+ title=f"Bar Plots for {metric_name}",
85
+ xaxis_title=metric_name,
86
+ yaxis_title="Avg. value",
87
+ xaxis_type="log" if log_scale_x and len(x) > 1 else None,
88
+ yaxis_type="log" if log_scale_y and len(y) > 1 else None,
89
+ autosize=True,
90
+ width=1200,
91
+ height=600,
92
+ showlegend=True,
93
+ )
94
+
95
+ return fig
96
+
97
+ def plot_data(data, metric_name, normalization, rounding, grouping, top_k, direction, regex, log_scale_x, log_scale_y,
98
+ cumsum, perc, progress=gr.Progress()):
99
+ if rounding is None or top_k is None:
100
+ return None
101
+ graph_fc = (
102
+ partial(plot_scatter, normalization=normalization, rounding=rounding, cumsum=cumsum, perc=perc)
103
+ if grouping == "histogram"
104
+ else partial(plot_bars, top_k=top_k, direction=direction, regex=regex, rounding=rounding)
105
+ )
106
+ return graph_fc(data=data, metric_name=metric_name, progress=progress, log_scale_x=log_scale_x,
107
+ log_scale_y=log_scale_y)
src/logic/utils.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def set_alpha(color: str, alpha: float) -> str:
    """Return ``color`` as a CSS ``rgba(r, g, b, alpha)`` string.

    Supported inputs are ``#rrggbb`` hex, shorthand ``#rgb`` hex, and
    ``rgb(...)`` / ``rgba(...)`` strings (an alpha already present in the
    input is replaced by ``alpha``). Any other notation cannot be parsed and
    falls back to black.

    Args:
        color: Color string to convert.
        alpha: Opacity written verbatim into the result.

    Returns:
        The ``rgba(...)`` string.

    Raises:
        ValueError: If a ``#``-prefixed color contains non-hex digits or is
            too short to hold three channels.
    """
    r, g, b = 0, 0, 0  # fallback for notations we cannot parse
    if color.startswith('#'):
        digits = color[1:]
        if len(digits) == 3:
            # Expand shorthand "#abc" to "aabbcc".
            digits = "".join(ch * 2 for ch in digits)
        r, g, b = int(digits[0:2], 16), int(digits[2:4], 16), int(digits[4:6], 16)
    elif color.startswith(('rgb(', 'rgba(')) and color.endswith(')'):
        channels = color[color.index('(') + 1:-1].split(',')
        if len(channels) >= 3:
            try:
                r, g, b = [int(float(channel)) for channel in channels[:3]]
            except ValueError:
                # Non-numeric channels (e.g. percentages): keep the fallback.
                r, g, b = 0, 0, 0
    return f"rgba({r}, {g}, {b}, {alpha})"
7
+
8
def get_desc(data):
    """Build a Markdown summary of the smallest and largest key per dataset.

    Args:
        data: Mapping of dataset name to an object whose ``to_dict()`` returns
            a mapping; only the keys of that mapping are inspected.

    Returns:
        One ``**name**: min=..., max=...`` line per dataset, joined with
        newlines. Datasets without any keys are reported as ``no data``.
    """
    def _bounds(keys):
        # Keys may be numbers serialized as strings (TODO confirm against the
        # stats format); comparing those as text would rank "10" below "9",
        # so order numerically whenever every key parses as a number.
        try:
            return min(keys, key=float), max(keys, key=float)
        except (TypeError, ValueError):
            return min(keys), max(keys)

    lines = []
    for name, dt in data.items():
        keys = list(dt.to_dict().keys())
        if not keys:
            # min()/max() raise on an empty sequence.
            lines.append(f"**{name}**: no data")
            continue
        lowest, highest = _bounds(keys)
        lines.append(f"**{name}**: min={lowest}, max={highest}")
    return "\n".join(lines)
src/view/__pycache__/view.cpython-312.pyc ADDED
Binary file (13.4 kB). View file
 
src/view/view.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from src.logic.data_fetching import fetch_datasets, fetch_groups, fetch_metrics, load_data, reverse_search, reverse_search_add
3
+ from src.logic.data_processing import export_data
4
+ from src.logic.plotting import plot_data
5
+ from src.logic.utils import get_desc
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from functools import partial
8
+ import os
9
+ import re
10
+
11
+
12
+
13
+ METRICS_LOCATION_DEFAULT = os.getenv("METRICS_LOCATION_DEFAULT", "hf://datasets/HuggingFaceFW-Dev/summary-stats-files")
14
+
15
def update_graph(
    base_folder,
    datasets,
    metric_name,
    grouping,
    log_scale_x,
    log_scale_y,
    rounding,
    normalization,
    top_k,
    direction,
    regex,
    cumsum,
    perc,
    progress=gr.Progress(),
):
    """Load the selected metric for every dataset and build all render outputs.

    Args:
        base_folder: Metrics location passed to ``load_data``.
        datasets: Selected dataset paths; each one is loaded in a worker thread.
        metric_name: Metric to load and plot.
        grouping: Grouping to load; also selects the plot type in ``plot_data``.
        log_scale_x, log_scale_y, rounding, normalization, top_k, direction,
        regex, cumsum, perc: Plot settings forwarded to ``plot_data``.
        progress: Gradio progress tracker for loading and plotting.

    Returns:
        A 4-tuple ``(figure, data, export, description)`` where ``data`` maps
        each dataset path to its loaded result, ``export`` is the value
        produced by ``export_data`` and ``description`` comes from
        ``get_desc``.
    """
    if not datasets or not metric_name or not grouping:
        # The render handler is wired to four outputs, so return one value
        # per output (a bare None is a single value); this clears them all.
        return None, {}, None, ""

    loader = partial(load_data, base_folder=base_folder, metric_name=metric_name, grouping=grouping)
    with ThreadPoolExecutor() as pool:
        results = list(
            progress.tqdm(
                pool.map(loader, datasets),
                total=len(datasets),
                desc="Loading data...",
            )
        )

    # pool.map preserves input order, so results line up with `datasets`.
    data = dict(zip(datasets, results))
    figure = plot_data(data, metric_name, normalization, rounding, grouping, top_k, direction, regex, log_scale_x,
                       log_scale_y, cumsum, perc, progress)
    return figure, data, export_data(data, metric_name), get_desc(data)
49
+
50
def create_interface():
    """Assemble the Gradio Blocks app: layout, state and event wiring.

    The app has three tabs (help text, the metric view with its plot
    settings, and the reverse metrics search).

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks() as demo:
        # All datasets found under the metrics location.
        datasets = gr.State([])
        # Loaded data of the last render. `update_graph` stores a dict here and
        # the plotters iterate it with `.items()`, so start from an empty dict.
        exported_data = gr.State({})
        gr.Markdown(value="# Metrics Exploration")

        with gr.Tabs():
            with gr.TabItem("Help"):
                gr.Markdown(
                    label="Readme",
                    value="""
## How to use:
1) Specify Metrics location (Stats block `output_folder` without the last path segment) and click "Fetch Datasets"
2) Select datasets you are interested in using the dropdown or regex filter
3) Specify Grouping (global average/value/fqdn/suffix) and Metric name
4) Click "Render Metric"


## Groupings:
- **histogram**: Creates a line plot of values with their frequencies. If normalization is on, the frequencies sum to 1.
    * normalize:
- **(fqdn/suffix)**: Creates a bar plot of the avg. values of the metric for full qualifed domain name/suffix of domain.
    * k: the number of groups to show
    * Top/Bottom/Most frequent (n_docs): Groups with the top/bottom k values/most prevalant docs are shown
- **none**: Shows the average value of given metric

## Reverse search:
To search for datasets containing a grouping and certain metric, use the Reverse search section.
Specify the search parameters and click "Search". This will show you found datasets in the "Found datasets" textbox. You can modify the selection after search by removing unwanted lines and clicking "Add to selection".

## Note:
The data might not be 100% representative, due to the sampling and optimistic merging of the metrics (fqdn/suffix).
""",
                )

            with gr.TabItem("Metric View"):
                with gr.Row():
                    with gr.Column(scale=2):
                        with gr.Row():
                            with gr.Column(scale=1):
                                base_folder = gr.Textbox(
                                    label="Metrics Location",
                                    value=METRICS_LOCATION_DEFAULT,
                                )
                                datasets_refetch = gr.Button("Fetch Datasets")

                            with gr.Column(scale=1):
                                regex_select = gr.Text(label="Regex filter", value=".*")
                                regex_button = gr.Button("Search")
                        with gr.Row():
                            datasets_selected = gr.Dropdown(
                                choices=[],
                                label="Datasets",
                                multiselect=True,
                            )

                    with gr.Column(scale=1):
                        grouping_dropdown = gr.Dropdown(
                            choices=[],
                            label="Grouping",
                            multiselect=False,
                        )
                        metric_name_dropdown = gr.Dropdown(
                            choices=[],
                            label="Metric name",
                            multiselect=False,
                        )

                render_button = gr.Button("Render Metric", variant="primary")

                with gr.Tabs():
                    with gr.TabItem("Graph Settings"):
                        log_scale_x_checkbox = gr.Checkbox(
                            label="Log scale x",
                            value=False,
                        )
                        log_scale_y_checkbox = gr.Checkbox(
                            label="Log scale y",
                            value=False,
                        )
                        rounding = gr.Number(
                            label="Rounding",
                            value=2,
                        )
                        normalization_checkbox = gr.Checkbox(
                            label="Normalize",
                            value=True,
                            visible=False
                        )
                        with gr.Row():
                            export_data_json = gr.File(visible=False)

                    with gr.TabItem("Grouping Settings"):
                        with gr.Row(visible=False) as group_choices:
                            with gr.Column(scale=2):
                                group_regex = gr.Text(
                                    label="Group Regex",
                                    value=None,
                                )
                                with gr.Row():
                                    top_select = gr.Number(
                                        label="N Groups",
                                        value=100,
                                        interactive=True,
                                    )

                                    direction_checkbox = gr.Radio(
                                        label="Partition",
                                        choices=[
                                            "Top",
                                            "Bottom",
                                            "Most frequent (n_docs)",
                                        ],
                                        value="Most frequent (n_docs)",
                                    )

                    with gr.TabItem("Histogram Settings") as histogram_settings:
                        cdf_checkbox = gr.Checkbox(
                            label="CDF",
                            value=False,
                        )
                        perc_checkbox = gr.Checkbox(
                            label="%",
                            value=False,
                        )
                        with gr.Column(visible=False) as min_max_hist:
                            min_max_hist_data = gr.Markdown()

                with gr.Row():
                    graph_output = gr.Plot(label="Graph")

            with gr.TabItem("Reverse Metrics Search"):
                gr.Markdown(value="# Reverse Metrics Search")

                with gr.Row():
                    with gr.Column(scale=1):
                        reverse_grouping_dropdown = gr.Dropdown(
                            choices=[],
                            label="Grouping",
                            multiselect=False,
                        )
                        reverse_metric_name_dropdown = gr.Dropdown(
                            choices=[],
                            label="Metric Name",
                            multiselect=False,
                        )
                        reverse_search_button = gr.Button("Search")
                        reverse_search_add_button = gr.Button("Add to selection")

                    with gr.Column(scale=2):
                        reverse_search_results = gr.Textbox(
                            label="Found datasets",
                            lines=10,
                            placeholder="Found datasets containing the group/metric name. You can modify the selection after search by removing unwanted lines and clicking Add to selection"
                        )

        # Full render: (re)load the data, then plot it.
        render_button.click(
            fn=update_graph,
            inputs=[
                base_folder,
                datasets_selected,
                metric_name_dropdown,
                grouping_dropdown,
                log_scale_x_checkbox,
                log_scale_y_checkbox,
                rounding,
                normalization_checkbox,
                top_select,
                direction_checkbox,
                group_regex,
                cdf_checkbox,
                perc_checkbox
            ],
            outputs=[graph_output, exported_data, export_data_json, min_max_hist_data],
        )

        # Settings tweaks re-plot the already loaded data without refetching.
        gr.on(
            triggers=[normalization_checkbox.change, rounding.change, group_regex.change, direction_checkbox.change,
                      top_select.change, log_scale_x_checkbox.change,
                      log_scale_y_checkbox.change, cdf_checkbox.change, perc_checkbox.change],
            fn=plot_data,
            inputs=[
                exported_data,
                metric_name_dropdown,
                normalization_checkbox,
                rounding,
                grouping_dropdown,
                top_select,
                direction_checkbox,
                group_regex,
                log_scale_x_checkbox,
                log_scale_y_checkbox,
                cdf_checkbox,
                perc_checkbox
            ],
            outputs=[graph_output],
        )

        datasets_selected.change(
            fn=fetch_groups,
            inputs=[base_folder, datasets_selected, grouping_dropdown],
            outputs=grouping_dropdown,
        )

        grouping_dropdown.change(
            fn=fetch_metrics,
            inputs=[base_folder, datasets_selected, grouping_dropdown, metric_name_dropdown],
            outputs=metric_name_dropdown,
        )

        reverse_grouping_dropdown.select(
            fn=partial(fetch_metrics, type="union"),
            inputs=[base_folder, datasets, reverse_grouping_dropdown, reverse_metric_name_dropdown],
            outputs=reverse_metric_name_dropdown,
        )

        reverse_search_button.click(
            fn=reverse_search,
            inputs=[base_folder, datasets, reverse_grouping_dropdown, reverse_metric_name_dropdown],
            outputs=reverse_search_results,
        )

        reverse_search_add_button.click(
            fn=reverse_search_add,
            inputs=[datasets_selected, reverse_search_results],
            outputs=datasets_selected,
        )

        datasets_refetch.click(
            fn=fetch_datasets,
            inputs=[base_folder],
            outputs=[datasets, datasets_selected, reverse_grouping_dropdown],
        )

        def update_datasets_with_regex(regex, selected_runs, all_runs):
            """Add every dataset whose path matches ``regex`` to the selection."""
            if not regex:
                # Leave the selection untouched; returning None would clear it.
                return gr.update()
            try:
                pattern = re.compile(regex)
            except re.error as err:
                raise gr.Error(f"Invalid regex: {err}") from err

            matches = {run for run in all_runs if pattern.search(run)}
            if not matches:
                return gr.update()
            # The dropdown value may be None when nothing is selected yet.
            return gr.update(value=sorted(matches.union(selected_runs or [])))

        regex_button.click(
            fn=update_datasets_with_regex,
            inputs=[regex_select, datasets_selected, datasets],
            outputs=datasets_selected,
        )

        def update_grouping_options(grouping):
            """Show histogram-only or group-only controls for ``grouping``."""
            is_histogram = grouping == "histogram"
            # gr.update only touches visibility and is agnostic to the target
            # component type (Checkbox, Row, Column, TabItem).
            return {
                normalization_checkbox: gr.update(visible=is_histogram),
                group_choices: gr.update(visible=not is_histogram),
                min_max_hist: gr.update(visible=is_histogram),
                histogram_settings: gr.update(visible=is_histogram),
            }

        grouping_dropdown.change(
            fn=update_grouping_options,
            inputs=[grouping_dropdown],
            outputs=[normalization_checkbox, group_choices, min_max_hist, histogram_settings],
        )

    return demo