Spaces:
Running
Running
File size: 5,072 Bytes
e667211 f6806df e667211 49f03db e667211 e58b4bb e667211 e58b4bb e667211 e58b4bb e667211 e58b4bb e667211 b293439 3cb13e6 b293439 e667211 e58b4bb 78e56be b293439 78e56be 3cb13e6 b293439 e667211 3cb13e6 e667211 24fdc8a e667211 49f03db e667211 49f03db e667211 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import gradio as gr
import torchaudio
import torch
import os
import time
import soundfile as sf
# Maps the language names shown in the UI dropdown to the language codes
# passed to the model as ``tgt_lang``.
languages = dict(
    English="eng",
    Hindi="hin",
    Portuguese="por",
    Russian="rus",
    Spanish="spa",
)
# Markdown shown at the top of the interface (rendered through gr.Markdown).
# The text previously ended the "Duplicate Space" badge with an unmatched
# ``</h3>`` tag and had a stray double quote after the last link; both were
# rendering artifacts and have been removed.
welcome_message = """
# Welcome to Tonic's Unity On Device!
Tonic's Unity On Device uses [facebook/seamless-m4t-unity-small](https://huggingface.co/facebook/seamless-m4t-unity-small) for audio translation & accessibility.
Tonic's Unity On Device!🚀 on your own data & in your own way by cloning this space. Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/TeamTonic/SeamlessOnDevice?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a>
### Join us :
TeamTonic is always making cool demos! Join our active builder's community on Discord: [Discord](https://discord.gg/GWpVpekp) On Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On Github: [Polytonic](https://github.com/tonic-ai) & contribute to [PolyGPT](https://github.com/tonic-ai/polygpt-alpha)
"""
def save_and_resample_audio(input_audio_path, output_audio_path, resample_rate=16000):
    """Resample an audio file and write the result to a new file.

    Args:
        input_audio_path: Path of the audio file to read.
        output_audio_path: Path the resampled audio is written to.
        resample_rate: Sample rate of the written file (default 16000).
    """
    source_wave, source_rate = torchaudio.load(input_audio_path)
    # The resampler is built with the waveform's own dtype.
    to_target_rate = torchaudio.transforms.Resample(
        source_rate,
        resample_rate,
        dtype=source_wave.dtype,
    )
    torchaudio.save(output_audio_path, to_target_rate(source_wave), resample_rate)
def save_audio(audio_input, output_dir="saved_audio", resample_rate=16000):
    """Write an audio payload to disk and return the path of a resampled copy.

    The raw samples are first written as a WAV file at their original sample
    rate, then ``save_and_resample_audio`` produces a second file at
    ``resample_rate``. Both files are left in ``output_dir``.

    Args:
        audio_input: A ``(sample_rate, audio_data)`` pair.
        output_dir: Directory the files are written to; created if missing.
        resample_rate: Sample rate of the returned file (default 16000).

    Returns:
        Path of the resampled WAV file.

    Raises:
        ValueError: If ``audio_input`` is ``None`` (no audio was supplied).
    """
    import uuid  # local import: only needed for collision-free file names

    # Validate before touching the filesystem so a missing clip produces a
    # readable error instead of an opaque tuple-unpacking TypeError.
    if audio_input is None:
        raise ValueError("No audio was provided; upload or record a clip first.")
    sample_rate, audio_data = audio_input

    # exist_ok avoids the check-then-create race between concurrent requests.
    os.makedirs(output_dir, exist_ok=True)

    # A whole-second timestamp alone collides when two requests arrive within
    # the same second; the random suffix keeps every request's files distinct.
    file_name = f"audio_{int(time.time())}_{uuid.uuid4().hex}.wav"
    file_path = os.path.join(output_dir, file_name)
    sf.write(file_path, audio_data, sample_rate)

    resampled_file_path = os.path.join(output_dir, f"resampled_{file_name}")
    save_and_resample_audio(file_path, resampled_file_path, resample_rate)
    return resampled_file_path
def speech_to_text(audio_data, tgt_lang):
    """Run the on-device model on an audio clip and return its text output.

    Args:
        audio_data: Audio payload forwarded unchanged to ``save_audio``.
        tgt_lang: Key of ``languages`` naming the target language.

    Returns:
        The first element of the model output, or ``""`` when the model
        output is falsy.
    """
    resampled_path = save_audio(audio_data)
    clip, _ = torchaudio.load(resampled_path)
    model = torch.jit.load("unity_on_device.ptl", map_location=torch.device('cpu'))

    # Inference only: no gradients are needed.
    with torch.no_grad():
        result = model(clip, tgt_lang=languages[tgt_lang])

    if result:
        text = result[0]
    else:
        text = ""
    print("Speech to Text Model Output:", text)
    return text
def speech_to_speech_translation(audio_data, tgt_lang):
    """Translate an audio clip and return the translated text and audio file.

    Args:
        audio_data: Audio payload forwarded unchanged to ``save_audio``.
        tgt_lang: Key of ``languages`` naming the target language.

    Returns:
        A ``(translated_text, output_file)`` tuple, where ``output_file`` is
        the path of a WAV file holding the generated waveform.
    """
    import tempfile  # local import: only needed for the per-request output file

    file_path = save_audio(audio_data)
    audio_input, _ = torchaudio.load(file_path)
    s2st_model = torch.jit.load("unity_on_device.ptl", map_location=torch.device('cpu'))

    # Inference only: no gradients are needed.
    with torch.no_grad():
        translated_text, units, waveform = s2st_model(audio_input, tgt_lang=languages[tgt_lang])

    # Every request gets its own output file. A single fixed path (previously
    # "/tmp/result.wav") is shared by all concurrent requests, so one user's
    # result could be overwritten by, or served to, another user.
    # NOTE: these files are not deleted here because the caller still needs
    # the path after this function returns.
    fd, output_file = tempfile.mkstemp(prefix="result_", suffix=".wav")
    os.close(fd)  # torchaudio reopens the file by path
    torchaudio.save(output_file, waveform.unsqueeze(0), sample_rate=16000)

    print("Translated Text:", translated_text)
    print("Units:", units)
    print("Waveform Shape:", waveform.shape)
    return translated_text, output_file
def create_interface():
    """Build the Gradio Blocks UI and return it without launching it.

    The page has the welcome text, a shared target-language dropdown, and two
    collapsed accordions: one wired to ``speech_to_text`` and one wired to
    ``speech_to_speech_translation``.
    """
    with gr.Blocks(theme='ParityError/Anime') as interface:
        gr.Markdown(welcome_message)
        language_names = list(languages.keys())
        input_language = gr.Dropdown(language_names, label="Select Target Language", value="English")

        # Speech-to-text panel.
        with gr.Accordion("Speech to Text", open=False):
            stt_audio = gr.Audio(label="Upload or Record Audio")
            stt_text = gr.Text(label="Transcribed Text")
            gr.Button("Transcribe").click(
                speech_to_text,
                inputs=[stt_audio, input_language],
                outputs=stt_text,
            )
            gr.Examples([["audio1.wav"]], inputs=[stt_audio], outputs=[stt_text])

        # Speech-to-speech panel: returns both text and an audio file path.
        with gr.Accordion("Speech to Speech Translation", open=False):
            s2st_audio = gr.Audio(label="Upload or Record Audio")
            s2st_text = gr.Text(label="Translated Text")
            s2st_result_audio = gr.Audio(label="Translated Audio", type="filepath")
            gr.Button("Translate").click(
                speech_to_speech_translation,
                inputs=[s2st_audio, input_language],
                outputs=[s2st_text, s2st_result_audio],
            )
            gr.Examples([["audio1.wav"]], inputs=[s2st_audio], outputs=[s2st_text, s2st_result_audio])

    return interface
# Build the interface and start serving it. The trailing "|" that followed
# the launch call was not Python (a syntax error) and has been removed.
app = create_interface()
app.launch(show_error=True, debug=True)