# Hugging Face Space: on-device speech-to-text and speech-to-speech translation demo.
import gradio as gr
import torch
import torchaudio
# Target languages supported by the demo: display name -> ISO 639-3 code
# expected by the translation models (keys feed the dropdowns below).
languages = {
    "English": "eng",
    "Hindi": "hin",
    "Portuguese": "por",
    "Russian": "rus",
    "Spanish": "spa",
}
def speech_to_text(audio_data, tgt_lang):
    """Transcribe/translate an audio file to text in the chosen language.

    Args:
        audio_data: Path to the audio file supplied by the Gradio widget.
        tgt_lang: Display name of the target language (a key of ``languages``).

    Returns:
        The text produced by the on-device S2T TorchScript model.
    """
    audio_input, _ = torchaudio.load(audio_data)
    # NOTE(review): the TorchScript model is re-loaded from disk on every
    # call; hoisting this to module level would avoid repeated I/O.
    s2t_model = torch.jit.load("unity_on_device_s2t.ptl")
    with torch.no_grad():
        text = s2t_model(audio_input, tgt_lang=languages[tgt_lang])
    return text
def speech_to_speech_translation(audio_data, tgt_lang):
    """Translate speech in an audio file into speech in the chosen language.

    Args:
        audio_data: Path to the audio file supplied by the Gradio widget.
        tgt_lang: Display name of the target language (a key of ``languages``).

    Returns:
        A ``(text, output_file)`` tuple: the translated text and the path of
        the synthesized 16 kHz WAV file written to ``/tmp``.
    """
    audio_input, _ = torchaudio.load(audio_data)
    # NOTE(review): model re-loaded per call; could be cached at module level.
    s2st_model = torch.jit.load("unity_on_device.ptl")
    with torch.no_grad():
        text, units, waveform = s2st_model(audio_input, tgt_lang=languages[tgt_lang])
    output_file = "/tmp/result.wav"
    # unsqueeze(0) adds the channel dimension torchaudio.save expects.
    torchaudio.save(output_file, waveform.unsqueeze(0), sample_rate=16000)
    return text, output_file
# Gradio interface: speech -> text.
iface_s2t = gr.Interface(
    fn=speech_to_text,
    inputs=[
        gr.Audio(label="Upload or Record Audio for Speech to Text"),
        gr.Dropdown(list(languages.keys()), label="Select Target Language"),
    ],
    outputs="text",
    title="Speech to Text",
)
# Gradio interface: speech -> translated speech (plus the translated text).
iface_s2st = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=[
        gr.Audio(label="Upload or Record Audio for Speech to Speech Translation"),
        gr.Dropdown(list(languages.keys()), label="Select Target Language"),
    ],
    outputs=["text", "audio"],
    title="Speech to Speech Translation",
)
# Combine the two demos into one app. gr.Accordion is a layout container for
# use inside gr.Blocks: it does not accept Interface objects, has no `labels`
# keyword, and has no .launch() — the original code crashed at startup.
# gr.TabbedInterface is the API designed for grouping multiple Interfaces.
demo = gr.TabbedInterface(
    [iface_s2t, iface_s2st],
    tab_names=["Speech to Text", "Speech to Speech Translation"],
)

# Launch the application.
demo.launch()