import os
from typing import Optional

import gradio as gr
import soundfile as sf
import torchaudio

from s2st_inference import s2st_inference
from utils import download_model

DESCRIPTION = r"""
**Speech-to-Speech Translation from Spanish to English**

- Paper: Direct Speech-to-Speech Translation With Discrete Units
- Dataset: CVSS-C
- Toolkit: [ESPnet](https://github.com/espnet/espnet)
- Pretrained speech-to-unit translation model: https://huggingface.co/espnet/jiyang_tang_cvss-c_es-en_discrete_unit
- Pretrained WaveGAN vocoder: https://huggingface.co/espnet/cvss-c_en_wavegan_hubert_vocoder

Part of a CMU MIIS capstone project with [@realzza](https://github.com/realzza) and [@sophia1488](https://github.com/sophia1488)
"""

SAMPLE_RATE = 16000
MAX_INPUT_LENGTH = 60  # seconds

S2UT_TAG = 'espnet/jiyang_tang_cvss-c_es-en_discrete_unit'
S2UT_DIR = 'model'
VOCODER_TAG = 'espnet/cvss-c_en_wavegan_hubert_vocoder'
VOCODER_DIR = 'vocoder'

NGPU = 0
BEAM_SIZE = 1


class App:
    def __init__(self):
        # Download the pretrained S2UT model and vocoder from Hugging Face
        os.makedirs(S2UT_DIR, exist_ok=True)
        os.makedirs(VOCODER_DIR, exist_ok=True)
        self.s2ut_path = download_model(S2UT_TAG, S2UT_DIR)
        self.vocoder_path = download_model(VOCODER_TAG, VOCODER_DIR)

    def s2st(self, input_audio: Optional[str]) -> str:
        # Load the input and resample it to the model's expected rate
        orig_wav, orig_sr = torchaudio.load(input_audio)
        wav = torchaudio.functional.resample(orig_wav, orig_freq=orig_sr, new_freq=SAMPLE_RATE)

        # Truncate overly long inputs to keep inference time bounded
        max_length = int(MAX_INPUT_LENGTH * SAMPLE_RATE)
        if wav.shape[1] > max_length:
            wav = wav[:, :max_length]
            gr.Warning(f"Input audio is too long. Truncated to {MAX_INPUT_LENGTH} seconds.")

        wav = wav[0]  # keep the first channel (mono)

        # Temporarily change cwd to the model dir so that relative paths in the
        # training config resolve correctly; restore it even if inference fails.
        cwd = os.getcwd()
        os.chdir(self.s2ut_path)
        try:
            # Translate speech to discrete units, then vocode back to a waveform
            out_wav = s2st_inference(
                wav,
                train_config=os.path.join(
                    self.s2ut_path,
                    'exp',
                    's2st_train_s2st_discrete_unit_raw_fbank_es_en',
                    'config.yaml',
                ),
                model_file=os.path.join(
                    self.s2ut_path,
                    'exp',
                    's2st_train_s2st_discrete_unit_raw_fbank_es_en',
                    '500epoch.pth',
                ),
                vocoder_file=os.path.join(self.vocoder_path, 'checkpoint-450000steps.pkl'),
                vocoder_config=os.path.join(self.vocoder_path, 'config.yml'),
                ngpu=NGPU,
                beam_size=BEAM_SIZE,
            )
        finally:
            # Restore the working directory
            os.chdir(cwd)

        # Save the result as 16-bit PCM WAV
        output_path = 'output.wav'
        sf.write(output_path, out_wav, SAMPLE_RATE, 'PCM_16')
        return output_path


def main():
    app = App()

    with gr.Blocks() as demo:
        gr.Markdown(DESCRIPTION)

        with gr.Group():
            input_audio = gr.Audio(
                label="Input speech",
                type="filepath",
                sources=["upload", "microphone"],
                format='wav',
                streaming=False,
                visible=True,
            )
            btn = gr.Button("Translate")
            output_audio = gr.Audio(
                label="Translated speech",
                autoplay=False,
                streaming=False,
                type="filepath",  # s2st() returns a path to the output WAV
            )

        # Hidden placeholders so that the example section can show transcripts
        source_text = gr.Text(label='Source Text', visible=False)
        target_text = gr.Text(label='Target Text', visible=False)

        # Examples
        with gr.Row():
            gr.Examples(
                examples=[
                    [
                        "examples/example1.wav",
                        "fue enterrada en el cementerio forest lawn memorial park de hollywood hills",
                        "she was buried at the forest lawn memorial park of hollywood hills",
                    ],
                    [
                        "examples/example2.wav",
                        "diversos otros músicos han interpretado esta canción en conciertos en vivo",
                        "many other musicians have played this song in live concerts",
                    ],
                    [
                        "examples/example3.wav",
                        "es gómez-moreno el primero en situar su origen en guadalajara, hoy ampliamente aceptado",
                        "gomez moreno was the first person to place its origin in guadalajara which is now broadly accepted",
                    ],
                ],
                inputs=[input_audio, source_text, target_text],
                outputs=[output_audio],
            )

        btn.click(
            fn=app.s2st,
            inputs=[input_audio],
            outputs=[output_audio],
            api_name="run",
        )

    demo.queue(max_size=50).launch()


if __name__ == '__main__':
    main()
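# ---------------------------------------------------------------------------
# Note on `download_model` (imported from the local `utils` module above): it
# is expected to fetch a Hugging Face repo into the given directory and return
# the local path. A minimal sketch of such a helper, assuming it simply wraps
# `huggingface_hub.snapshot_download` (an assumption; the project's actual
# utils.py may differ):
#
#     from huggingface_hub import snapshot_download
#
#     def download_model(tag: str, local_dir: str) -> str:
#         # Download every file of repo `tag` into `local_dir`; return the path
#         return snapshot_download(repo_id=tag, local_dir=local_dir)
# ---------------------------------------------------------------------------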