"""Gradio UI for exploring dataset metrics and reverse-searching datasets by metric."""

import os
import re
from concurrent.futures import ThreadPoolExecutor
from functools import partial

import gradio as gr

from src.logic.data_fetching import (
    fetch_datasets,
    fetch_groups,
    fetch_metrics,
    load_data,
    reverse_search,
    reverse_search_add,
)
from src.logic.data_processing import export_data
from src.logic.plotting import plot_data
from src.logic.utils import get_desc

# Default value of the "Metrics Location" textbox; overridable via the environment.
METRICS_LOCATION_DEFAULT = os.getenv(
    "METRICS_LOCATION_DEFAULT",
    "hf://datasets/HuggingFaceFW-Dev/summary-stats-files",
)

# Text shown in the "Help" tab.
HELP_MARKDOWN = """
## How to use:
1) Specify Metrics location (Stats block `output_folder` without the last path segment) and click "Fetch Datasets"
2) Select datasets you are interested in using the dropdown or regex filter
3) Specify Grouping (global average/value/fqdn/suffix) and Metric name
4) Click "Render Metric"

## Groupings:
- **histogram**: Creates a line plot of values with their frequencies. If normalization is on, the frequencies sum to 1.
    * normalize:
- **(fqdn/suffix)**: Creates a bar plot of the avg. values of the metric for full qualified domain name/suffix of domain.
    * k: the number of groups to show
    * Top/Bottom/Most frequent (n_docs): Groups with the top/bottom k values/most prevalent docs are shown
- **none**: Shows the average value of given metric

## Reverse search:
To search for datasets containing a grouping and certain metric, use the Reverse search section.
Specify the search parameters and click "Search". This will show you found datasets in the "Found datasets" textbox. You can modify the selection after search by removing unwanted lines and clicking "Add to selection".

## Note:
The data might not be 100% representative, due to the sampling and optimistic merging of the metrics (fqdn/suffix).
"""


def update_graph(
    base_folder,
    datasets,
    metric_name,
    grouping,
    log_scale_x,
    log_scale_y,
    rounding,
    normalization,
    top_k,
    direction,
    regex,
    cumsum,
    perc,
    progress=gr.Progress(),
):
    """Load the selected datasets and build everything the "Render Metric" click outputs.

    Each dataset is loaded with ``load_data`` on a thread pool, with progress
    reported through ``progress``.

    Returns:
        A 4-tuple matching the outputs wired to the render button:
        ``(plot, data, exported_file, description)``, where ``data`` maps each
        selected dataset to its ``load_data`` result. When no dataset, metric or
        grouping is selected, four no-op updates are returned instead so the
        current outputs are left untouched.
    """
    if not datasets or not metric_name or not grouping:
        # The click handler has four outputs; a bare ``None`` makes Gradio raise
        # "didn't return enough output values". One no-op update per output
        # leaves the UI as it is.
        return tuple(gr.update() for _ in range(4))

    load_one = partial(
        load_data,
        base_folder=base_folder,
        metric_name=metric_name,
        grouping=grouping,
    )
    with ThreadPoolExecutor() as pool:
        # pool.map preserves input order, so results line up with ``datasets``.
        results = list(
            progress.tqdm(
                pool.map(load_one, datasets),
                total=len(datasets),
                desc="Loading data...",
            )
        )

    data = dict(zip(datasets, results))

    plot = plot_data(
        data,
        metric_name,
        normalization,
        rounding,
        grouping,
        top_k,
        direction,
        regex,
        log_scale_x,
        log_scale_y,
        cumsum,
        perc,
        progress,
    )
    return plot, data, export_data(data, metric_name), get_desc(data)


def create_interface():
    """Build the Gradio Blocks app and wire up all of its event handlers.

    Returns:
        The ``gr.Blocks`` instance; launching it is left to the caller.

    NOTE(review): the layout nesting below was reconstructed from the order of
    the context managers — confirm against the rendered UI.
    """
    with gr.Blocks() as demo:
        # Full list of datasets known at the current metrics location.
        datasets = gr.State([])
        # Data loaded by the last successful render; reused by the re-plot triggers.
        exported_data = gr.State([])
        gr.Markdown(value="# Metrics Exploration")

        with gr.Tabs():
            with gr.TabItem("Help"):
                gr.Markdown(
                    label="Readme",
                    value=HELP_MARKDOWN,
                )

            with gr.TabItem("Metric View"):
                with gr.Row():
                    with gr.Column(scale=2):
                        with gr.Row():
                            with gr.Column(scale=1):
                                base_folder = gr.Textbox(
                                    label="Metrics Location",
                                    value=METRICS_LOCATION_DEFAULT,
                                )
                                datasets_refetch = gr.Button("Fetch Datasets")

                            with gr.Column(scale=1):
                                regex_select = gr.Text(label="Regex filter", value=".*")
                                regex_button = gr.Button("Search")
                        with gr.Row():
                            datasets_selected = gr.Dropdown(
                                choices=[],
                                label="Datasets",
                                multiselect=True,
                            )

                    with gr.Column(scale=1):
                        grouping_dropdown = gr.Dropdown(
                            choices=[],
                            label="Grouping",
                            multiselect=False,
                        )
                        metric_name_dropdown = gr.Dropdown(
                            choices=[],
                            label="Metric name",
                            multiselect=False,
                        )

                render_button = gr.Button("Render Metric", variant="primary")

                with gr.Tabs():
                    with gr.TabItem("Graph Settings"):
                        log_scale_x_checkbox = gr.Checkbox(
                            label="Log scale x",
                            value=False,
                        )
                        log_scale_y_checkbox = gr.Checkbox(
                            label="Log scale y",
                            value=False,
                        )
                        rounding = gr.Number(
                            label="Rounding",
                            value=2,
                        )
                        normalization_checkbox = gr.Checkbox(
                            label="Normalize",
                            value=True,
                            visible=False,
                        )
                        with gr.Row():
                            export_data_json = gr.File(visible=False)

                    with gr.TabItem("Grouping Settings"):
                        with gr.Row(visible=False) as group_choices:
                            with gr.Column(scale=2):
                                group_regex = gr.Text(
                                    label="Group Regex",
                                    value=None,
                                )
                                with gr.Row():
                                    top_select = gr.Number(
                                        label="N Groups",
                                        value=100,
                                        interactive=True,
                                    )
                                    direction_checkbox = gr.Radio(
                                        label="Partition",
                                        choices=[
                                            "Top",
                                            "Bottom",
                                            "Most frequent (n_docs)",
                                        ],
                                        value="Most frequent (n_docs)",
                                    )

                    with gr.TabItem("Histogram Settings") as histogram_settings:
                        cdf_checkbox = gr.Checkbox(
                            label="CDF",
                            value=False,
                        )
                        perc_checkbox = gr.Checkbox(
                            label="%",
                            value=False,
                        )

                with gr.Column(visible=False) as min_max_hist:
                    min_max_hist_data = gr.Markdown()

                with gr.Row():
                    graph_output = gr.Plot(label="Graph")

            with gr.TabItem("Reverse Metrics Search"):
                gr.Markdown(value="# Reverse Metrics Search")
                with gr.Row():
                    with gr.Column(scale=1):
                        reverse_grouping_dropdown = gr.Dropdown(
                            choices=[],
                            label="Grouping",
                            multiselect=False,
                        )
                        reverse_metric_name_dropdown = gr.Dropdown(
                            choices=[],
                            label="Metric Name",
                            multiselect=False,
                        )
                        reverse_search_button = gr.Button("Search")
                        reverse_search_add_button = gr.Button("Add to selection")

                    with gr.Column(scale=2):
                        reverse_search_results = gr.Textbox(
                            label="Found datasets",
                            lines=10,
                            placeholder="Found datasets containing the group/metric name. You can modify the selection after search by removing unwanted lines and clicking Add to selection",
                        )

        # Full render: (re)load the data, then plot it.
        render_button.click(
            fn=update_graph,
            inputs=[
                base_folder,
                datasets_selected,
                metric_name_dropdown,
                grouping_dropdown,
                log_scale_x_checkbox,
                log_scale_y_checkbox,
                rounding,
                normalization_checkbox,
                top_select,
                direction_checkbox,
                group_regex,
                cdf_checkbox,
                perc_checkbox,
            ],
            outputs=[graph_output, exported_data, export_data_json, min_max_hist_data],
        )

        # Changing a display setting re-plots from the already loaded data
        # (``exported_data``) without calling ``load_data`` again.
        gr.on(
            triggers=[
                normalization_checkbox.change,
                rounding.change,
                group_regex.change,
                direction_checkbox.change,
                top_select.change,
                log_scale_x_checkbox.change,
                log_scale_y_checkbox.change,
                cdf_checkbox.change,
                perc_checkbox.change,
            ],
            fn=plot_data,
            inputs=[
                exported_data,
                metric_name_dropdown,
                normalization_checkbox,
                rounding,
                grouping_dropdown,
                top_select,
                direction_checkbox,
                group_regex,
                log_scale_x_checkbox,
                log_scale_y_checkbox,
                cdf_checkbox,
                perc_checkbox,
            ],
            outputs=[graph_output],
        )

        datasets_selected.change(
            fn=fetch_groups,
            inputs=[base_folder, datasets_selected, grouping_dropdown],
            outputs=grouping_dropdown,
        )

        grouping_dropdown.change(
            fn=fetch_metrics,
            inputs=[base_folder, datasets_selected, grouping_dropdown, metric_name_dropdown],
            outputs=metric_name_dropdown,
        )

        reverse_grouping_dropdown.select(
            fn=partial(fetch_metrics, type="union"),
            inputs=[base_folder, datasets, reverse_grouping_dropdown, reverse_metric_name_dropdown],
            outputs=reverse_metric_name_dropdown,
        )

        reverse_search_button.click(
            fn=reverse_search,
            inputs=[base_folder, datasets, reverse_grouping_dropdown, reverse_metric_name_dropdown],
            outputs=reverse_search_results,
        )

        reverse_search_add_button.click(
            fn=reverse_search_add,
            inputs=[datasets_selected, reverse_search_results],
            outputs=datasets_selected,
        )

        datasets_refetch.click(
            fn=fetch_datasets,
            inputs=[base_folder],
            outputs=[datasets, datasets_selected, reverse_grouping_dropdown],
        )

        def update_datasets_with_regex(regex, selected_runs, all_runs):
            """Add every dataset whose name matches ``regex`` to the current selection.

            The existing selection is always kept: an empty pattern or a pattern
            without matches returns it unchanged; otherwise the sorted union of
            the selection and the matches is returned.

            Raises:
                gr.Error: if ``regex`` is not a valid regular expression.
            """
            # The dropdown value may be None when nothing is selected.
            selected = list(selected_runs or [])
            if not regex:
                # Returning None here would wipe the user's selection.
                return gr.update(value=selected)

            try:
                pattern = re.compile(regex)
            except re.error as err:
                # The pattern is user input; show a readable message in the UI.
                raise gr.Error(f"Invalid regex: {err}") from err

            matched = {run for run in (all_runs or []) if pattern.search(run)}
            if not matched:
                return gr.update(value=selected)
            return gr.update(value=sorted(matched.union(selected)))

        regex_button.click(
            fn=update_datasets_with_regex,
            inputs=[regex_select, datasets_selected, datasets],
            outputs=datasets_selected,
        )

        def update_grouping_options(grouping):
            """Toggle histogram-only vs. group-only controls for the chosen grouping."""
            is_histogram = grouping == "histogram"
            # gr.update is component-agnostic, so the same call works for the
            # Checkbox, Row, Column and TabItem targets.
            return {
                normalization_checkbox: gr.update(visible=is_histogram),
                group_choices: gr.update(visible=not is_histogram),
                min_max_hist: gr.update(visible=is_histogram),
                histogram_settings: gr.update(visible=is_histogram),
            }

        grouping_dropdown.change(
            fn=update_grouping_options,
            inputs=[grouping_dropdown],
            outputs=[normalization_checkbox, group_choices, min_max_hist, histogram_settings],
        )

    return demo