Spaces:

mms-meta
/

mms-zeroshot

Running

File size: 4,778 Bytes

7f1afcd
daad6da
 
 
 
 
 
7f1afcd
a4107b1
297e244
 
 
 
a4107b1
297e244
 
 
 
a4107b1
297e244
a4107b1
297e244
a4107b1
 
 
 
 
 
0f191f9
 
 
 
 
 
 
 
 
 
 
 
daad6da
0f191f9
 
 
 
 
 
 
 
 
 
 
 
a4107b1
daad6da
 
 
 
 
 
0f191f9
 
 
 
 
 
 
 
 
 
 
 
daad6da
 
 
0f191f9
 
 
 
 
daad6da
297e244
 
daad6da
 
 
a4107b1
 
 
 
 
 
 
 
 
 
 
daad6da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a4107b1
7f1afcd
a4107b1

import gradio as gr
from zeroshot import (
    process,
    WORD_SCORE_DEFAULT_IF_LM,
    WORD_SCORE_DEFAULT_IF_NOLM,
    LM_SCORE_DEFAULT,
)

# Top-level layout of the demo; `demo` is the Blocks app launched at the end
# of the file. Styling comes from the local "style.css".
with gr.Blocks(css="style.css") as demo:
    # Page title line.
    # NOTE(review): the "paper" link targets the arXiv home page rather than a
    # specific paper -- confirm the intended URL.
    gr.Markdown(
        "<p align='center' style='font-size: 20px;'>MMS Zero-shot ASR Demo. See our arXiv <a href='https://arxiv.org/'>paper</a> for model details.</p>"
    )
    # Short usage instructions shown under the title.
    gr.HTML(
        """<center>The demo works on input audio in any language, as long as you provide a list of words or sentences for that language and an optional n-gram language model (even a simple 1-gram model will work!) to help with accuracy.<br>We recommend having a minimum of 5000 distinct words in the text file to achieve a good performance.</center>"""
    )
    with gr.Row():
        with gr.Column():
            # Inputs: the audio clip, the word/sentence list, and an optional
            # language-model file.
            audio = gr.Audio(label="Audio Input\n(use microphone or upload a file)")

            with gr.Row():
                words_file = gr.File(label="Text Data")
                lm_file = gr.File(label="Language Model\n(optional)")

            with gr.Accordion("Advanced Settings", open=False):
                gr.Markdown(
                    "The following parameters are used for beam-search decoding. Use the default values if you are not sure."
                )
                # Both score sliders share the same range and step, and start
                # non-interactive, matching the "use default" checkboxes that
                # start ticked.
                _score_slider_opts = dict(
                    minimum=-10.0,
                    maximum=10.0,
                    step=0.1,
                    interactive=False,
                )
                with gr.Row():
                    with gr.Column():
                        wscore_usedefault = gr.Checkbox(
                            value=True, label="Use Default Word Insertion Score"
                        )
                        wscore = gr.Slider(
                            label="Word Insertion Score",
                            value=WORD_SCORE_DEFAULT_IF_NOLM,
                            **_score_slider_opts,
                        )

                    with gr.Column():
                        lmscore_usedefault = gr.Checkbox(
                            value=True, label="Use Default Language Model Score"
                        )
                        lmscore = gr.Slider(
                            label="Language Model Score",
                            value=0,
                            **_score_slider_opts,
                        )
            # Button that starts the transcription (wired up further below).
            btn = gr.Button("Submit", elem_id="submit")

            # No explicit triggers are given; per the Gradio docs, `gr.on`
            # then listens for changes to any of the listed inputs.
            @gr.on(
                inputs=[wscore_usedefault, lmscore_usedefault, lm_file],
                outputs=[wscore, lmscore],
            )
            def update_slider(ws, ls, lm):
                """Rebuild both score sliders from the checkboxes and LM file.

                Each slider is reset to the default for the current setup
                (with or without a language model) and is editable only
                while its "use default" checkbox is unticked.

                Args:
                    ws: State of the "Use Default Word Insertion Score" checkbox.
                    ls: State of the "Use Default Language Model Score" checkbox.
                    lm: Value of the language-model file input; ``None`` is
                        treated as "no language model provided".

                Returns:
                    A ``(word_insertion_slider, lm_score_slider)`` tuple, in the
                    same order as the ``outputs`` list of the decorator.
                """
                has_lm = lm is not None

                # The word insertion default depends on whether an LM is used.
                # (Previously this slider was given the LM-score default and
                # vice versa, so the displayed defaults were swapped.)
                ws_slider = gr.Slider(
                    minimum=-10.0,
                    maximum=10.0,
                    value=(
                        WORD_SCORE_DEFAULT_IF_LM
                        if has_lm
                        else WORD_SCORE_DEFAULT_IF_NOLM
                    ),
                    step=0.1,
                    interactive=not ws,
                    label="Word Insertion Score",
                )
                # Without a language model the LM score falls back to 0, the
                # same value the slider is created with.
                ls_slider = gr.Slider(
                    minimum=-10.0,
                    maximum=10.0,
                    value=LM_SCORE_DEFAULT if has_lm else 0,
                    step=0.1,
                    interactive=not ls,
                    label="Language Model Score",
                )
                return ws_slider, ls_slider

        # Second column of the row: the two output components that the submit
        # handler writes to.
        with gr.Column():
            text = gr.Textbox(label="Transcript")
            # Logs live in an accordion that starts collapsed (open=False).
            with gr.Accordion("Logs", open=False):
                logs = gr.Textbox(show_label=False)

    # Wire the submit button to `process`. The component values are handed to
    # `process` in the order listed here, and its results fill the transcript
    # and log boxes.
    _process_inputs = [
        audio,
        words_file,
        lm_file,
        wscore,
        lmscore,
        wscore_usedefault,
        lmscore_usedefault,
    ]
    btn.click(process, inputs=_process_inputs, outputs=[text, logs])

    # Examples
    # One example table per language; every row pairs an audio clip with a
    # word/sentence list and targets the audio and text-data inputs.
    _example_groups = (
        (
            "English",
            [
                # ["upload/english/english.mp3", "upload/english/c4_25k_sentences.txt"],
                ["upload/english/english.mp3", "upload/english/c4_10k_sentences.txt"],
                ["upload/english/english.mp3", "upload/english/c4_5k_sentences.txt"],
                ["upload/english/english.mp3", "upload/english/cv8_top10k_words.txt"],
            ],
        ),
        (
            "Ligurian",
            [
                ["upload/ligurian/ligurian_1.mp3", "upload/ligurian/zenamt_10k_sentences.txt"],
                ["upload/ligurian/ligurian_2.mp3", "upload/ligurian/zenamt_10k_sentences.txt"],
                ["upload/ligurian/ligurian_3.mp3", "upload/ligurian/zenamt_5k_sentences.txt"],
            ],
        ),
    )
    for _group_label, _group_rows in _example_groups:
        gr.Examples(
            examples=_group_rows,
            inputs=[audio, words_file],
            label=_group_label,
        )

# Start the Gradio app with the default launch settings. This runs at module
# level, i.e. whenever this file is executed.
demo.launch()