# Spaces: Running
import gradio as gr | |
from transformers import AutoProcessor, BarkModel | |
import torch | |
import numpy as np | |
import nltk | |
from scipy.io.wavfile import write | |
from IPython.display import Audio | |
# Download the NLTK sentence-tokenizer data used by nltk.sent_tokenize().
# Newer NLTK releases load the "punkt_tab" resource instead of the pickled
# "punkt" one, so fetch both to keep sentence splitting working regardless
# of the installed NLTK version.
nltk.download("punkt")
nltk.download("punkt_tab")

# Restrict PyTorch to a single CPU thread for intra-op parallelism.
torch.set_num_threads(1)
# Eagerly load every selectable Bark checkpoint once at start-up, keyed by
# its Hub identifier, so requests only need a dictionary lookup.
models = {
    checkpoint: BarkModel.from_pretrained(checkpoint)
    for checkpoint in ("suno/bark", "suno/bark-small")
}
# Combined voice presets.
# Each supported language exposes ten numbered speakers; generating the
# names keeps the list consistent and makes adding a language a one-token
# change. The resulting order is language by language (in the order listed
# below), speakers 0-9 within each language.
_VOICE_LANGUAGES = ("en", "tr", "de", "fr", "it", "zh")
_SPEAKERS_PER_LANGUAGE = 10

all_voice_presets = [
    f"v2/{language}_speaker_{speaker}"
    for language in _VOICE_LANGUAGES
    for speaker in range(_SPEAKERS_PER_LANGUAGE)
]
# Bark produces audio at 24 kHz; labelling its output with any other rate
# makes playback too slow/fast and shifts the pitch.
SAMPLE_RATE = 24000
# Length, in seconds, of the pause inserted between sentences.
silence_duration = 0.25
# Processors are cached per checkpoint so each one is loaded once instead of
# on every request.
_processor_cache = {}


def _get_processor(model_name):
    """Return the processor for ``model_name``, loading it on first use only."""
    if model_name not in _processor_cache:
        _processor_cache[model_name] = AutoProcessor.from_pretrained(model_name)
    return _processor_cache[model_name]


def _synthesize(model, processor, text, voice_preset):
    """Generate audio for one piece of text and return it as a squeezed NumPy array."""
    inputs = processor(text, voice_preset=voice_preset)
    return model.generate(**inputs).cpu().numpy().squeeze()


# Function to generate speech
def generate_speech(text, model_name, voice_preset):
    """Synthesize ``text`` with the chosen Bark model and voice preset.

    The text is split into sentences with NLTK. A single sentence is
    synthesized in one pass; several sentences are synthesized one by one and
    joined with ``silence_duration`` seconds of silence between them (no
    silence is added after the last sentence).

    Args:
        text: The text to speak.
        model_name: Key into the module-level ``models`` dictionary; also the
            identifier the matching processor is loaded from.
        voice_preset: Voice preset name forwarded to the processor.

    Returns:
        A ``(sample_rate, audio)`` tuple. The sample rate is always taken from
        the model's generation config so that single- and multi-sentence
        results are labelled with the same, correct rate.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
        KeyError: If ``model_name`` is not a key of ``models``.
    """
    # Without this guard an empty input reaches np.concatenate([]) and fails
    # with an opaque ValueError; report a clear message in the UI instead.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    model = models[model_name]
    processor = _get_processor(model_name)
    sample_rate = model.generation_config.sample_rate

    sentences = nltk.sent_tokenize(text)  # Tokenize text into sentences
    if len(sentences) <= 1:
        # Nothing to stitch together: synthesize the text as given.
        return (sample_rate, _synthesize(model, processor, text, voice_preset))

    # For multiple sentences, generate each one and concatenate the results.
    gap_samples = int(silence_duration * sample_rate)
    audio_pieces = []
    for index, sentence in enumerate(sentences):
        waveform = _synthesize(model, processor, sentence, voice_preset)
        if index:
            # Silence goes only *between* sentences. It uses the waveform's
            # dtype so np.concatenate does not upcast the generated audio.
            audio_pieces.append(np.zeros(gap_samples, dtype=waveform.dtype))
        audio_pieces.append(waveform)

    return (sample_rate, np.concatenate(audio_pieces))
# Gradio app setup: a text box, model and voice selectors, and a button that
# sends the three inputs to generate_speech and plays back the result.
with gr.Blocks() as app:
    gr.Markdown("# Multilingual Text-to-Speech with Bark")

    # Text to be synthesized
    text_input = gr.Textbox(
        label="Enter Text",
        placeholder="Type something to synthesize...",
    )

    # Which Bark checkpoint to use (the small one is preselected)
    model_preset_input = gr.Dropdown(
        choices=["suno/bark", "suno/bark-small"],
        label="Select Model",
        value="suno/bark-small",
    )

    # One dropdown holding the voice presets of every language
    voice_preset_input = gr.Dropdown(
        choices=all_voice_presets,
        label="Select Voice Preset",
    )

    # Trigger and result widgets
    generate_button = gr.Button("Generate Voice")
    audio_output = gr.Audio(label="Generated Voice", type="numpy")

    # Run the synthesis when the button is clicked
    generate_button.click(
        fn=generate_speech,
        inputs=[text_input, model_preset_input, voice_preset_input],
        outputs=audio_output,
    )

app.launch()