import os
import time
from pathlib import Path

import pandas as pd
import streamlit as st
import yaml
from datasets import get_dataset_config_names
from dotenv import load_dotenv
from huggingface_hub import list_datasets

from evaluation import filter_evaluated_models
from utils import (
    AUTOTRAIN_TASK_TO_HUB_TASK,
    commit_evaluation_log,
    create_autotrain_project_name,
    format_col_mapping,
    get_compatible_models,
    get_config_metadata,
    get_dataset_card_url,
    get_key,
    get_metadata,
    http_get,
    http_post,
)

# Load settings from a local .env file when one sits next to the app
if Path(".env").is_file():
    load_dotenv(".env")

# Credentials and service endpoints come from the environment (None when unset)
HF_TOKEN = os.environ.get("HF_TOKEN")
AUTOTRAIN_USERNAME = os.environ.get("AUTOTRAIN_USERNAME")
AUTOTRAIN_BACKEND_API = os.environ.get("AUTOTRAIN_BACKEND_API")
DATASETS_PREVIEW_API = os.environ.get("DATASETS_PREVIEW_API")

# Task name -> numeric task id. Insertion order is the order tasks are listed in,
# so the image tasks are put on top.
TASK_TO_ID = dict(
    image_binary_classification=17,
    image_multi_class_classification=18,
    binary_classification=1,
    multi_class_classification=2,
    natural_language_inference=22,
    entity_extraction=4,
    extractive_question_answering=5,
    translation=6,
    summarization=8,
    text_zero_shot_classification=23,
)

# Metric sets shared by several tasks; every task gets its own copy of the list
_BINARY_METRICS = ("f1", "precision", "recall", "auc", "accuracy")
_MULTI_CLASS_METRICS = ("f1", "precision", "recall", "accuracy")

# Task name -> metrics computed by default for that task
TASK_TO_DEFAULT_METRICS = {
    "binary_classification": list(_BINARY_METRICS),
    "multi_class_classification": list(_MULTI_CLASS_METRICS),
    "natural_language_inference": list(_BINARY_METRICS),
    "entity_extraction": ["precision", "recall", "f1", "accuracy"],
    "extractive_question_answering": ["f1", "exact_match"],
    "translation": ["sacrebleu"],
    "summarization": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
    "image_binary_classification": list(_BINARY_METRICS),
    "image_multi_class_classification": list(_MULTI_CLASS_METRICS),
    "text_zero_shot_classification": ["accuracy", "loss"],
}

# Task name -> language code override
AUTOTRAIN_TASK_TO_LANG = {
    "translation": "en2de",
    **dict.fromkeys(
        ("image_binary_classification", "image_multi_class_classification"),
        "unk",
    ),
}

# Task name -> machine type override
AUTOTRAIN_MACHINE = {"text_zero_shot_classification": "r5.16x"}
SUPPORTED_TASKS = list(TASK_TO_ID.keys()) # Extracted from utils.get_supported_metrics # Hardcoded for now due to speed / caching constraints SUPPORTED_METRICS = [ "accuracy", "bertscore", "bleu", "cer", "chrf", "code_eval", "comet", "competition_math", "coval", "cuad", "exact_match", "f1", "frugalscore", "google_bleu", "mae", "mahalanobis", "matthews_correlation", "mean_iou", "meteor", "mse", "pearsonr", "perplexity", "precision", "recall", "roc_auc", "rouge", "sacrebleu", "sari", "seqeval", "spearmanr", "squad", "squad_v2", "ter", "trec_eval", "wer", "wiki_split", "xnli", "angelina-wang/directional_bias_amplification", "jordyvl/ece", "lvwerra/ai4code", "lvwerra/amex", ] ####### # APP # ####### st.title("Evaluation on the Hub") st.markdown( """ Welcome to Hugging Face's automatic model evaluator 👋! This application allows you to evaluate 🤗 Transformers [models](https://huggingface.co/models?library=transformers&sort=downloads) across a wide variety of [datasets](https://huggingface.co/datasets) on the Hub. Please select the dataset and configuration below. The results of your evaluation will be displayed on the [public leaderboards](https://huggingface.co/spaces/autoevaluate/leaderboards). For more details, check out out our [blog post](https://huggingface.co/blog/eval-on-the-hub). """ ) all_datasets = [d.id for d in list_datasets()] query_params = st.experimental_get_query_params() if "first_query_params" not in st.session_state: st.session_state.first_query_params = query_params first_query_params = st.session_state.first_query_params default_dataset = all_datasets[0] if "dataset" in first_query_params: if len(first_query_params["dataset"]) > 0 and first_query_params["dataset"][0] in all_datasets: default_dataset = first_query_params["dataset"][0] selected_dataset = st.selectbox( "Select a dataset", all_datasets, index=all_datasets.index(default_dataset), help="""Datasets with metadata can be evaluated with 1-click. 
Configure an evaluation job to add \ new metadata to a dataset card.""", ) st.experimental_set_query_params(**{"dataset": [selected_dataset]}) # Check if selected dataset can be streamed is_valid_dataset = http_get( path="/is-valid", domain=DATASETS_PREVIEW_API, params={"dataset": selected_dataset}, ).json() if is_valid_dataset["valid"] is False: st.error( """The dataset you selected is not currently supported. Open a \ [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) for support.""" ) metadata = get_metadata(selected_dataset, token=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) print(f"INFO -- Dataset metadata: {metadata}") if metadata is None: st.warning("No evaluation metadata found. Please configure the evaluation job below.") with st.expander("Advanced configuration"): # Select task selected_task = st.selectbox( "Select a task", SUPPORTED_TASKS, index=SUPPORTED_TASKS.index(metadata[0]["task_id"]) if metadata is not None else 0, help="""Don't see your favourite task here? Open a \ [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) to request it!""", ) # Select config configs = get_dataset_config_names(selected_dataset) selected_config = st.selectbox( "Select a config", configs, help="""Some datasets contain several sub-datasets, known as _configurations_. \ Select one to evaluate your models on. \ See the [docs](https://huggingface.co/docs/datasets/master/en/load_hub#configurations) for more details. 
""", ) # Some datasets have multiple metadata (one per config), so we grab the one associated with the selected config config_metadata = get_config_metadata(selected_config, metadata) print(f"INFO -- Config metadata: {config_metadata}") # Select splits splits_resp = http_get( path="/splits", domain=DATASETS_PREVIEW_API, params={"dataset": selected_dataset}, ) if splits_resp.status_code == 200: split_names = [] all_splits = splits_resp.json() for split in all_splits["splits"]: if split["config"] == selected_config: split_names.append(split["split"]) if config_metadata is not None: eval_split = config_metadata["splits"].get("eval_split", None) else: eval_split = None selected_split = st.selectbox( "Select a split", split_names, index=split_names.index(eval_split) if eval_split is not None else 0, help="Be wary when evaluating models on the `train` split.", ) # Select columns rows_resp = http_get( path="/first-rows", domain=DATASETS_PREVIEW_API, params={ "dataset": selected_dataset, "config": selected_config, "split": selected_split, }, ).json() col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns) st.markdown("**Map your dataset columns**") st.markdown( """The model evaluator uses a standardised set of column names for the input examples and labels. 
\ Please define the mapping between your dataset columns (right) and the standardised column names (left).""" ) col1, col2 = st.columns(2) # TODO: find a better way to layout these items # TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata col_mapping = {} if selected_task in ["binary_classification", "multi_class_classification"]: with col1: st.markdown("`text` column") st.text("") st.text("") st.text("") st.text("") st.markdown("`target` column") with col2: text_col = st.selectbox( "This column should contain the text to be classified", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "text")) if config_metadata is not None else 0, ) target_col = st.selectbox( "This column should contain the labels associated with the text", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "target")) if config_metadata is not None else 0, ) col_mapping[text_col] = "text" col_mapping[target_col] = "target" elif selected_task == "text_zero_shot_classification": with col1: st.markdown("`text` column") st.text("") st.text("") st.text("") st.text("") st.markdown("`classes` column") st.text("") st.text("") st.text("") st.text("") st.markdown("`target` column") with col2: text_col = st.selectbox( "This column should contain the text to be classified", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "text")) if config_metadata is not None else 0, ) classes_col = st.selectbox( "This column should contain the classes associated with the text", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "classes")) if config_metadata is not None else 0, ) target_col = st.selectbox( "This column should contain the index of the correct class", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "target")) if config_metadata is not None else 0, ) col_mapping[text_col] = "text" col_mapping[classes_col] = "classes" col_mapping[target_col] = "target" if 
selected_task in ["natural_language_inference"]: config_metadata = get_config_metadata(selected_config, metadata) with col1: st.markdown("`text1` column") st.text("") st.text("") st.text("") st.text("") st.text("") st.markdown("`text2` column") st.text("") st.text("") st.text("") st.text("") st.text("") st.markdown("`target` column") with col2: text1_col = st.selectbox( "This column should contain the first text passage to be classified", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "text1")) if config_metadata is not None else 0, ) text2_col = st.selectbox( "This column should contain the second text passage to be classified", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "text2")) if config_metadata is not None else 0, ) target_col = st.selectbox( "This column should contain the labels associated with the text", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "target")) if config_metadata is not None else 0, ) col_mapping[text1_col] = "text1" col_mapping[text2_col] = "text2" col_mapping[target_col] = "target" elif selected_task == "entity_extraction": with col1: st.markdown("`tokens` column") st.text("") st.text("") st.text("") st.text("") st.markdown("`tags` column") with col2: tokens_col = st.selectbox( "This column should contain the array of tokens to be classified", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "tokens")) if config_metadata is not None else 0, ) tags_col = st.selectbox( "This column should contain the labels associated with each part of the text", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "tags")) if config_metadata is not None else 0, ) col_mapping[tokens_col] = "tokens" col_mapping[tags_col] = "tags" elif selected_task == "translation": with col1: st.markdown("`source` column") st.text("") st.text("") st.text("") st.text("") st.markdown("`target` column") with col2: text_col = st.selectbox( "This column 
should contain the text to be translated", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "source")) if config_metadata is not None else 0, ) target_col = st.selectbox( "This column should contain the target translation", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "target")) if config_metadata is not None else 0, ) col_mapping[text_col] = "source" col_mapping[target_col] = "target" elif selected_task == "summarization": with col1: st.markdown("`text` column") st.text("") st.text("") st.text("") st.text("") st.markdown("`target` column") with col2: text_col = st.selectbox( "This column should contain the text to be summarized", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "text")) if config_metadata is not None else 0, ) target_col = st.selectbox( "This column should contain the target summary", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "target")) if config_metadata is not None else 0, ) col_mapping[text_col] = "text" col_mapping[target_col] = "target" elif selected_task == "extractive_question_answering": if config_metadata is not None: col_mapping = config_metadata["col_mapping"] # Hub YAML parser converts periods to hyphens, so we remap them here col_mapping = format_col_mapping(col_mapping) with col1: st.markdown("`context` column") st.text("") st.text("") st.text("") st.text("") st.markdown("`question` column") st.text("") st.text("") st.text("") st.text("") st.markdown("`answers.text` column") st.text("") st.text("") st.text("") st.text("") st.markdown("`answers.answer_start` column") with col2: context_col = st.selectbox( "This column should contain the question's context", col_names, index=col_names.index(get_key(col_mapping, "context")) if config_metadata is not None else 0, ) question_col = st.selectbox( "This column should contain the question to be answered, given the context", col_names, index=col_names.index(get_key(col_mapping, 
"question")) if config_metadata is not None else 0, ) answers_text_col = st.selectbox( "This column should contain example answers to the question, extracted from the context", col_names, index=col_names.index(get_key(col_mapping, "answers.text")) if config_metadata is not None else 0, ) answers_start_col = st.selectbox( "This column should contain the indices in the context of the first character of each `answers.text`", col_names, index=col_names.index(get_key(col_mapping, "answers.answer_start")) if config_metadata is not None else 0, ) col_mapping[context_col] = "context" col_mapping[question_col] = "question" col_mapping[answers_text_col] = "answers.text" col_mapping[answers_start_col] = "answers.answer_start" elif selected_task in ["image_binary_classification", "image_multi_class_classification"]: with col1: st.markdown("`image` column") st.text("") st.text("") st.text("") st.text("") st.markdown("`target` column") with col2: image_col = st.selectbox( "This column should contain the images to be classified", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "image")) if config_metadata is not None else 0, ) target_col = st.selectbox( "This column should contain the labels associated with the images", col_names, index=col_names.index(get_key(config_metadata["col_mapping"], "target")) if config_metadata is not None else 0, ) col_mapping[image_col] = "image" col_mapping[target_col] = "target" # Select metrics st.markdown("**Select metrics**") st.markdown("The following metrics will be computed") html_string = " ".join( [ '