|
import functools

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
|
|
|
|
# Hugging Face checkpoints for the SpeechT5 TTS pipeline.
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

checkpoint = "microsoft/speecht5_tts"

# Load the three pipeline stages once at import time:
# text tokenizer/feature extractor, acoustic model, and HiFi-GAN vocoder.
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)
|
|
|
@functools.lru_cache(maxsize=1)
def _speaker_embedding():
    """Load the x-vector speaker embedding once and cache it.

    The original code re-read the .npy file from disk on every request;
    the embedding never changes, so load it a single time.

    Returns:
        torch.Tensor of shape (1, embedding_dim) — presumably a 512-dim
        x-vector; exact shape depends on the file contents.
    """
    embedding = np.load("cmu_us_bdl_arctic-wav-arctic_a0009.npy")
    return torch.tensor(embedding).unsqueeze(0)


def predict(text):
    """Synthesize speech from *text* with SpeechT5.

    Args:
        text: Input sentence to vocalize.

    Returns:
        Tuple of (sample_rate, waveform) as expected by ``gr.Audio``:
        sample rate is fixed at 16 kHz, waveform is int16 PCM. Empty or
        whitespace-only input yields an empty waveform instead of an error.
    """
    if not text.strip():
        return (16000, np.zeros(0, dtype=np.int16))

    inputs = processor(text=text, return_tensors="pt")

    # SpeechT5's encoder has a hard positional limit; truncate rather
    # than let the model raise on long inputs.
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    speech = model.generate_speech(input_ids, _speaker_embedding(), vocoder=vocoder)

    # Float waveform is nominally in [-1, 1]; clip before scaling so
    # stray out-of-range samples cannot overflow/wrap in int16.
    audio = np.clip(speech.numpy(), -1.0, 1.0)
    return (16000, (audio * 32767).astype(np.int16))
|
|
|
# Minimal Gradio UI: a text box in, synthesized audio out.
# gr.Audio(type="numpy") accepts predict's (sample_rate, int16 array) tuple.
demo = gr.Interface(
    fn=predict,
    inputs="text",
    outputs=gr.Audio(type="numpy"),
)

demo.launch()