Spaces:
Running
Running
File size: 4,778 Bytes
7f1afcd daad6da 7f1afcd a4107b1 297e244 a4107b1 297e244 a4107b1 297e244 a4107b1 297e244 a4107b1 0f191f9 daad6da 0f191f9 a4107b1 daad6da 0f191f9 daad6da 0f191f9 daad6da 297e244 daad6da a4107b1 daad6da a4107b1 7f1afcd a4107b1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
import gradio as gr
from zeroshot import (
process,
WORD_SCORE_DEFAULT_IF_LM,
WORD_SCORE_DEFAULT_IF_NOLM,
LM_SCORE_DEFAULT,
)
# Build the Gradio UI for the MMS zero-shot ASR demo; styling comes from style.css.
# NOTE(review): indentation is missing in this copy of the file, so the exact
# nesting of the `with` containers below cannot be confirmed from here.
with gr.Blocks(css="style.css") as demo:
# Centered page header with a link to the paper.
# NOTE(review): "arXiV" is misspelled and the href points at the bare
# arxiv.org homepage — confirm the intended text and paper URL.
gr.Markdown(
"<p align='center' style='font-size: 20px;'>MMS Zero-shot ASR Demo. See our arXiV <a href='https://arxiv.org/'>paper</a> for model details.</p>"
)
# Short usage guidance shown to the user.
# NOTE(review): "acheive" in the string below is a typo for "achieve".
gr.HTML(
"""<center>The demo works on input audio in any language, as long as you provide a list of words or sentences for that language and an optional n-gram language model (even a simple 1-gram model will work!) to help with accuracy.<br>We recommend having a minimum of 5000 distinct words in the textfile to acheive a good performance.</center>"""
)
with gr.Row():
with gr.Column():
# Audio to transcribe.
audio = gr.Audio(label="Audio Input\n(use microphone or upload a file)")
with gr.Row():
# Text data file for the target language, plus an optional language-model file.
words_file = gr.File(label="Text Data")
lm_file = gr.File(label="Language Model\n(optional)")
# Decoding parameters, collapsed by default (open=False).
with gr.Accordion("Advanced Settings", open=False):
gr.Markdown(
"The following parameters are used for beam-search decoding. Use the default values if you are not sure."
)
with gr.Row():
with gr.Column():
# Checkbox starts ticked, and the slider below starts non-interactive
# with the no-LM word score default as its value.
wscore_usedefault = gr.Checkbox(
label="Use Default Word Insertion Score", value=True
)
wscore = gr.Slider(
minimum=-10.0,
maximum=10.0,
value=WORD_SCORE_DEFAULT_IF_NOLM,
step=0.1,
interactive=False,
label="Word Insertion Score",
)
with gr.Column():
# Same pattern for the language-model score; its initial value is 0.
lmscore_usedefault = gr.Checkbox(
label="Use Default Language Model Score", value=True
)
lmscore = gr.Slider(
minimum=-10.0,
maximum=10.0,
value=0,
step=0.1,
interactive=False,
label="Language Model Score",
)
# Submit button; elem_id="submit" presumably matches a rule in style.css — verify.
btn = gr.Button("Submit", elem_id="submit")
@gr.on(
    inputs=[wscore_usedefault, lmscore_usedefault, lm_file],
    outputs=[wscore, lmscore],
)
def update_slider(ws, ls, lm):
    """Rebuild both decoding-score sliders from the checkboxes and LM upload.

    Registered through ``gr.on`` without an explicit ``triggers`` list.
    NOTE(review): per the Gradio docs this listens for changes to any of
    the inputs — confirm against the Gradio version pinned for this app.

    Args:
        ws: Value of the "Use Default Word Insertion Score" checkbox.
        ls: Value of the "Use Default Language Model Score" checkbox.
        lm: Value of the language-model file input; ``None`` when no
            language model has been provided.

    Returns:
        A ``(word_score_slider, lm_score_slider)`` tuple applied to the
        ``wscore`` and ``lmscore`` components, in that order. Each slider
        is editable only while its "use default" checkbox is unticked.
    """
    has_lm = lm is not None

    # Each slider must receive the default that matches its own label:
    # the word insertion score depends on whether an LM is present, and
    # the LM score is only non-zero when an LM is present. These two
    # value expressions must not be swapped between the sliders.
    ws_slider = gr.Slider(
        minimum=-10.0,
        maximum=10.0,
        value=WORD_SCORE_DEFAULT_IF_LM if has_lm else WORD_SCORE_DEFAULT_IF_NOLM,
        step=0.1,
        interactive=not ws,
        label="Word Insertion Score",
    )
    ls_slider = gr.Slider(
        minimum=-10.0,
        maximum=10.0,
        value=LM_SCORE_DEFAULT if has_lm else 0,
        step=0.1,
        interactive=not ls,
        label="Language Model Score",
    )
    return ws_slider, ls_slider
# Output area: the transcript, plus a collapsed "Logs" accordion.
with gr.Column():
text = gr.Textbox(label="Transcript")
with gr.Accordion("Logs", open=False):
logs = gr.Textbox(show_label=False)
# Clicking Submit calls `process` (imported from zeroshot) with the seven
# input values in the order listed; its outputs fill the transcript and logs boxes.
btn.click(
process,
inputs=[
audio,
words_file,
lm_file,
wscore,
lmscore,
wscore_usedefault,
lmscore_usedefault,
],
outputs=[text, logs],
)
# Examples
# Each example row supplies values for the audio and text-data inputs
# (inputs=[audio, words_file]); paths are relative to an `upload/` directory.
gr.Examples(
examples=[
# ["upload/english/english.mp3", "upload/english/c4_25k_sentences.txt"],
["upload/english/english.mp3", "upload/english/c4_10k_sentences.txt"],
["upload/english/english.mp3", "upload/english/c4_5k_sentences.txt"],
["upload/english/english.mp3", "upload/english/cv8_top10k_words.txt"],
],
inputs=[audio, words_file],
label="English",
)
gr.Examples(
examples=[
# NOTE(review): the disabled entry below is an English example inside the
# Ligurian table — it looks like a leftover from the block above.
# ["upload/english/english.mp3", "upload/english/c4_25k_sentences.txt"],
["upload/ligurian/ligurian_1.mp3", "upload/ligurian/zenamt_10k_sentences.txt"],
["upload/ligurian/ligurian_2.mp3", "upload/ligurian/zenamt_10k_sentences.txt"],
["upload/ligurian/ligurian_3.mp3", "upload/ligurian/zenamt_5k_sentences.txt"],
],
inputs=[audio, words_file],
label="Ligurian",
)
# Start the Gradio app when this file is executed.
demo.launch()
|