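# Streamlit app for the XTREME-S leaderboard: pulls the reference transcriptions
# and user submissions from the Hugging Face Hub, scores new submissions against
# the references, and renders the ranked results table.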
import streamlit as st
import pandas as pd
import os
from evaluate import load
from huggingface_hub import Repository
import zipfile
# first define URLs for the reference and submission datasets on the Hub
REFERENCE_NAME = "references"
SUBMISSION_NAME = "submissions"
REFERENCE_URL = os.path.join(
"https://huggingface.co/datasets/xtreme-s", REFERENCE_NAME
)
SUBMISSION_URL = os.path.join(
"https://huggingface.co/datasets/xtreme-s", SUBMISSION_NAME
)
# grab these repos using the token provided
HF_TOKEN = os.environ.get("HF_TOKEN")
reference_repo = Repository(
local_dir="references", clone_from=REFERENCE_URL, use_auth_token=HF_TOKEN
)
submission_repo = Repository(
local_dir="submissions", clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN
)
submission_repo.git_pull()
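# list the systems evaluated so far: one folder per submission in the submissions repo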
all_submissions = [
folder
for folder in os.listdir(SUBMISSION_NAME)
if os.path.isdir(os.path.join(SUBMISSION_NAME, folder)) and folder != ".git"
]
# define the XTREME-S test sets
TEST_SETS = [
"fleurs",
"mls",
"vp",
"covost-2",
"f-lid",
"m-14",
]
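# fleurs, mls and vp (VoxPopuli) are ASR sets scored with WER; covost-2 is speech
# translation scored with BLEU; f-lid is FLEURS language identification (accuracy)
# and m-14 is MInDS-14 intent classification (F1) - see METRIC_MAP below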
EXPECTED_TEST_FILES = [f + ".txt" for f in TEST_SETS]
# define the optional test sets - ignore for now
OPTIONAL_TEST_SETS = [] #["f-r5"]
OPTIONAL_TEST_FILES = [f + ".txt" for f in OPTIONAL_TEST_SETS]
# load all metrics
wer_metric = load("wer")
bleu_metric = load("bleu")
acc_metric = load("accuracy")
f1_metric = load("f1")
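# each `load(...)` call returns an `evaluate.EvaluationModule`; the score is obtained
# via `metric.compute(predictions=..., references=...)` in `compute_score` below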
# map test set to metric
METRIC_MAP = {
"fleurs": wer_metric,
"mls": wer_metric,
"vp": wer_metric,
"covost-2": bleu_metric,
"f-lid": acc_metric,
"m-14": f1_metric,
}
def compute_score(pred_file, ref_file, metric):
    """Score a predictions file against the reference file with the given metric."""
    with open(pred_file, "r", encoding="utf-8") as pred, open(
        ref_file, "r", encoding="utf-8"
    ) as ref:
        # TODO: any post-processing required? (e.g. classification labels may need
        # mapping to integer ids for accuracy/F1)
        pred_lines = [line.strip() for line in pred.readlines()]
        ref_lines = [line.strip() for line in ref.readlines()]
    score = metric.compute(predictions=pred_lines, references=ref_lines)
    # `compute` returns a bare float for some metrics (e.g. WER) and a dict keyed by
    # the metric name for others (e.g. accuracy, f1, bleu) - normalise to a float
    if isinstance(score, dict):
        score = next(iter(score.values()))
    return score
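# example usage (hypothetical paths), scoring a FLEURS submission with WER:
#   wer = compute_score("submissions/mSLAM/fleurs.txt", "references/fleurs.txt", wer_metric)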
# load up the results file
CSV_RESULTS_FILE = os.path.join(SUBMISSION_NAME, "results.csv")
all_results = pd.read_csv(CSV_RESULTS_FILE)
# build the leaderboard table from the results CSV
table = all_results.copy()
# make sure the column ordering is correct (name, average-score, fleurs, mls, ...)
average_column = table.pop("average-score")
name_column = table.pop("name")
table.insert(0, "average-score", average_column)
table = table.select_dtypes(exclude=["object", "string"])
table.insert(0, "name", name_column)
table = table.sort_values(by=["average-score"], ascending=False, ignore_index=True)
table = table.round(2)
table.index = table.index + 1
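# the table now reads: rank (index) | name | average-score | per-dataset scores,
# sorted with the best-scoring system first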
# Streamlit
st.markdown("# XTREME-S: Evaluating Cross-lingual Speech Representations")
st.markdown(
f"""
This is the leaderboard of the XTREME-S benchmark.
Submitted systems are ranked by the **XTREME-S score**, a weighted average over
all mandatory datasets: {", ".join(TEST_SETS)}. The optional dataset f-r5 does not contribute to the average score."""
)
# st.table(table)
st.dataframe(table.style.format(subset=["average-score", *TEST_SETS, *OPTIONAL_TEST_SETS], formatter="{:.1f}"))
st.markdown(
"""
XTREME-S was proposed in *XTREME-S: Evaluating Cross-lingual Speech Representations* by Conneau et al.
\n
The abstract of the paper is as follows:
\n
*We introduce XTREME-S, a new benchmark to evaluate universal cross-lingual speech representations in many languages. XTREME-S covers four task families: speech recognition, classification, speech-to-text translation and retrieval. Covering 102 languages from 10+ language families, 3 different domains and 4 task families, XTREME-S aims to simplify multilingual speech representation evaluation, as well as catalyze research in "universal" speech representation learning. This paper describes the new benchmark and establishes the first speech-only and speech-text baselines using XLS-R and mSLAM on all downstream tasks. We motivate the design choices and detail how to use the benchmark.*
\n
For more information, refer to the paper on [arXiv](https://arxiv.org/abs/2203.10752).
"""
)
st.markdown(
"""
## Submitting to XTREME-S
\n
To submit to XTREME-S, download the audio data for the mandatory XTREME-S test sets from [xtreme-s/datasets](https://huggingface.co/datasets/xtreme-s/datasets). The test sets contain audio data only. Evaluate your system on the six test sets by generating predictions for the unlabelled audio samples. For each test set, save the predictions to a .txt file in the order that the audio samples are provided, with one prediction per line. Name the .txt file according to the XTREME-S test set names shown in the table (e.g. the predictions for FLEURS should be named fleurs.txt).
\n
Once you have evaluated your system on all six mandatory test sets, move the predictions into one folder and zip it. The name you assign to the zipped folder will be the name shown on the leaderboard (e.g. mSLAM.zip will be displayed as mSLAM). Upload your zipped submission for scoring and placement on the leaderboard.
\n
Should you experience any issues, open a [new discussion](https://huggingface.co/spaces/xtreme-s/leaderboard/discussions/new) and tag `@sanchit-gandhi`.
"""
)
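# an example submission layout (names are illustrative): a folder `mSLAM/` containing
# fleurs.txt, mls.txt, vp.txt, covost-2.txt, f-lid.txt and m-14.txt, zipped as mSLAM.zip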
# submission form: upload a zipped folder of predictions
with st.form(key="my_form"):
uploaded_file = st.file_uploader("Choose a zip file")
submit_button = st.form_submit_button(label="Submit")
if submit_button:
if uploaded_file is None:
raise ValueError("Please make sure to have uploaded a zip file.")
submission = uploaded_file.name.split(".zip")[0]
with st.spinner(f"Uploading {submission}..."):
with zipfile.ZipFile(uploaded_file, 'r') as zip_ref:
zip_ref.extractall(submission_repo.local_dir)
submission_repo.push_to_hub()
with st.spinner(f"Computing XTREME-S Score for {submission}..."):
results = {"name": submission}
all_submitted_files = os.listdir(os.path.join(SUBMISSION_NAME, submission))
submitted_files = [f for f in all_submitted_files if f in EXPECTED_TEST_FILES]
submitted_optional_files = [f for f in all_submitted_files if f in OPTIONAL_TEST_FILES]
if sorted(EXPECTED_TEST_FILES) != sorted(submitted_files):
raise ValueError(
f"{', '.join(submitted_files)} were submitted, but expected {', '.join(EXPECTED_TEST_FILES)}"
)
for file in submitted_files:
ref_file = os.path.join(REFERENCE_NAME, file)
pred_file = os.path.join(SUBMISSION_NAME, submission, file)
test_set = file.split(".")[0]
metric = METRIC_MAP[test_set]
score = compute_score(pred_file, ref_file, metric)
results[test_set] = round(100 * score, 2)
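                # store each score as a percentage: the metric outputs are fractional,
                # so scale by 100 before rounding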
# TODO: assessment of 'optional' test sets
# XTREME-S score is computed over the mandatory test sets only
            # 40% ASR (mean WER, inverted so that higher is better), 40% speech
            # translation (BLEU), 20% classification (mean of accuracy and F1)
            average_score = (
                0.4 * (100 - (results["fleurs"] + results["mls"] + results["vp"]) / 3)
                + 0.4 * results["covost-2"]
                + 0.2 * (results["f-lid"] + results["m-14"]) / 2
            )
results["average-score"] = round(average_score, 2)
            all_results = pd.concat([all_results, pd.DataFrame([results])], ignore_index=True)
# save and upload new evaluated results
all_results.to_csv(CSV_RESULTS_FILE, index=False)
commit_url = submission_repo.push_to_hub()
st.success('Please refresh this space (CTRL+R) to see your result')