|
import os |
|
|
|
import torch |
|
import gradio as gr |
|
|
|
|
|
|
|
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _load_for_inference(repo, entry, **hub_kwargs):
    """Load a torch.hub entry point, move it to ``device`` and call ``eval()``.

    Args:
        repo: Value passed to ``torch.hub.load`` as ``repo_or_dir``.
        entry: Value passed to ``torch.hub.load`` as ``model``.
        **hub_kwargs: Extra keyword arguments forwarded to the hub entry point.

    Returns:
        The object returned by ``torch.hub.load``.
    """
    net = torch.hub.load(repo_or_dir=repo, model=entry, **hub_kwargs)
    net.to(device)
    net.eval()
    return net


# Loaded first because both objects are handed to the voicebox entry point.
vocoder = _load_for_inference('ex3ndr/supervoice-vocoder', 'bigvsan')
gpt = _load_for_inference('ex3ndr/supervoice-gpt', 'phonemizer')

# The voicebox hub entry receives the gpt and vocoder as keyword arguments.
model = _load_for_inference(
    'ex3ndr/supervoice-voicebox',
    'phonemizer',
    gpt=gpt,
    vocoder=vocoder,
)
|
|
|
|
|
|
|
|
|
# Text passed to gr.Interface as its `description`.
# A plain string literal is used: there is nothing to interpolate, so an
# f-string prefix would be misleading.
description = '''
Voicebox demo
'''
|
|
|
def synthesise(text, voice):
    """Synthesise speech for ``text`` using the selected ``voice``.

    Args:
        text: Text to speak; forwarded unchanged to ``model.synthesize``.
        voice: Voice identifier forwarded as the ``voice`` keyword.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is fixed at
        24000 and ``samples`` is the ``'wav'`` entry of the model output
        converted to a NumPy array.
    """
    # Inference only: no need to record an autograd graph while sampling.
    with torch.no_grad():
        output = model.synthesize(text, voice=voice, steps=8, alpha=0.1)
    waveform = output['wav']
    # Tensor.numpy() raises for tensors that live on a CUDA device or that
    # require grad; detach and copy to host memory first. Both calls are
    # no-ops when the tensor is already a plain CPU tensor.
    return (24000, waveform.detach().cpu().numpy())
|
|
|
if __name__ == "__main__":
    # Input widgets, in the order synthesise() receives them: text, voice.
    text_box = gr.Text(label='Text:', lines=5, max_lines=10)
    voice_picker = gr.Dropdown(
        label="voice",
        choices=("voice_1", "voice_2"),
        value="voice_1",
    )

    # Single output widget: a non-streaming audio player of type "numpy".
    audio_player = gr.Audio(
        label="Audio:",
        autoplay=False,
        streaming=False,
        type="numpy",
    )

    demo = gr.Interface(
        fn=synthesise,
        title='Something',
        description=description,
        inputs=[text_box, voice_picker],
        outputs=[audio_player],
        examples=[],
        cache_examples=True,
        allow_flagging='never',
    )

    # Enable the request queue with the configured size and concurrency.
    demo.queue(max_size=20, default_concurrency_limit=4)
    # Serve on "0.0.0.0" without requesting a public share link.
    demo.launch(share=False, server_name="0.0.0.0")
|
|