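"""Multilingual text-to-speech demo built on Suno's Bark models.

Long inputs are split into sentences with NLTK and synthesized one sentence
at a time, then rejoined with short silences, since Bark is designed for
short utterances. A Gradio UI exposes model and voice-preset selection.
"""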
import gradio as gr
from transformers import AutoProcessor, BarkModel
import torch
import numpy as np
import nltk
# Download NLTK sentence tokenizer data (recent NLTK versions need punkt_tab)
nltk.download("punkt")
nltk.download("punkt_tab")
torch.set_num_threads(1)  # Limit PyTorch to a single CPU thread
# Load both Bark checkpoints and their processors up front so switching
# models in the UI needs no extra initialization
models = {
    "suno/bark": BarkModel.from_pretrained("suno/bark"),
    "suno/bark-small": BarkModel.from_pretrained("suno/bark-small"),
}
processors = {
    "suno/bark": AutoProcessor.from_pretrained("suno/bark"),
    "suno/bark-small": AutoProcessor.from_pretrained("suno/bark-small"),
}
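# Optional: if a GPU is available, inference is much faster. A minimal sketch
# (the processor outputs would also need moving to the same device before
# generate()):
# device = "cuda" if torch.cuda.is_available() else "cpu"
# for m in models.values():
#     m.to(device)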
# Combined voice presets (v2 presets, 10 speakers per language)
all_voice_presets = [
    "v2/en_speaker_0", "v2/en_speaker_1", "v2/en_speaker_2", "v2/en_speaker_3", "v2/en_speaker_4",
    "v2/en_speaker_5", "v2/en_speaker_6", "v2/en_speaker_7", "v2/en_speaker_8", "v2/en_speaker_9",
    "v2/tr_speaker_0", "v2/tr_speaker_1", "v2/tr_speaker_2", "v2/tr_speaker_3", "v2/tr_speaker_4",
    "v2/tr_speaker_5", "v2/tr_speaker_6", "v2/tr_speaker_7", "v2/tr_speaker_8", "v2/tr_speaker_9",
    "v2/de_speaker_0", "v2/de_speaker_1", "v2/de_speaker_2", "v2/de_speaker_3", "v2/de_speaker_4",
    "v2/de_speaker_5", "v2/de_speaker_6", "v2/de_speaker_7", "v2/de_speaker_8", "v2/de_speaker_9",
    "v2/fr_speaker_0", "v2/fr_speaker_1", "v2/fr_speaker_2", "v2/fr_speaker_3", "v2/fr_speaker_4",
    "v2/fr_speaker_5", "v2/fr_speaker_6", "v2/fr_speaker_7", "v2/fr_speaker_8", "v2/fr_speaker_9",
    "v2/it_speaker_0", "v2/it_speaker_1", "v2/it_speaker_2", "v2/it_speaker_3", "v2/it_speaker_4",
    "v2/it_speaker_5", "v2/it_speaker_6", "v2/it_speaker_7", "v2/it_speaker_8", "v2/it_speaker_9",
    "v2/zh_speaker_0", "v2/zh_speaker_1", "v2/zh_speaker_2", "v2/zh_speaker_3", "v2/zh_speaker_4",
    "v2/zh_speaker_5", "v2/zh_speaker_6", "v2/zh_speaker_7", "v2/zh_speaker_8", "v2/zh_speaker_9",
]
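# The list above can equivalently be built programmatically (a sketch; same
# v2 presets, languages en/tr/de/fr/it/zh with speaker indices 0-9):
# all_voice_presets = [
#     f"v2/{lang}_speaker_{i}"
#     for lang in ("en", "tr", "de", "fr", "it", "zh")
#     for i in range(10)
# ]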
silence_duration = 0.25  # seconds of silence inserted between sentences
# Generate speech for the given text, model, and voice preset
def generate_speech(text, model_name, voice_preset):
    model = models[model_name]
    processor = processors[model_name]
    sample_rate = model.generation_config.sample_rate  # Bark outputs 24 kHz audio

    # Split the input into sentences; Bark handles short utterances best
    sentences = nltk.sent_tokenize(text)

    if not sentences:  # Empty input: return a short stretch of silence
        return (sample_rate, np.zeros(int(silence_duration * sample_rate), dtype=np.float32))

    if len(sentences) == 1:  # Single sentence: generate directly
        inputs = processor(text, voice_preset=voice_preset)
        audio_array = model.generate(**inputs).cpu().numpy().squeeze()
        return (sample_rate, audio_array)

    # Multiple sentences: generate each one, then join with short silences
    silence = np.zeros(int(silence_duration * sample_rate), dtype=np.float32)
    audio_pieces = []
    for sentence in sentences:
        inputs = processor(sentence, voice_preset=voice_preset)
        audio_array = model.generate(**inputs).cpu().numpy().squeeze()
        audio_pieces.append(audio_array)
        audio_pieces.append(silence.copy())  # Pause between sentences
    full_audio = np.concatenate(audio_pieces[:-1])  # Drop the trailing silence
    return (sample_rate, full_audio)
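# Example of calling generate_speech directly, without the UI (a sketch; the
# file name and preset here are arbitrary, and scipy is an extra dependency):
# from scipy.io.wavfile import write
# rate, audio = generate_speech(
#     "Hello there. How are you today?", "suno/bark-small", "v2/en_speaker_6"
# )
# write("output.wav", rate, audio)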
# Gradio app setup
with gr.Blocks() as app:
    gr.Markdown("# Multilingual Text-to-Speech with Bark")

    # Textbox for user input
    text_input = gr.Textbox(label="Enter Text", placeholder="Type something to synthesize...")

    # Model selection
    model_preset_input = gr.Dropdown(
        ["suno/bark", "suno/bark-small"], label="Select Model", value="suno/bark-small"
    )

    # Combined voice presets dropdown
    voice_preset_input = gr.Dropdown(choices=all_voice_presets, label="Select Voice Preset")

    # Button to generate voice
    generate_button = gr.Button("Generate Voice")

    # Output audio player; expects a (sample_rate, ndarray) tuple
    audio_output = gr.Audio(label="Generated Voice", type="numpy")

    # Generate voice on button click
    generate_button.click(
        generate_speech,
        inputs=[text_input, model_preset_input, voice_preset_input],
        outputs=audio_output,
    )

app.launch()
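# Note: app.launch(share=True) creates a temporary public link, and
# app.launch(server_name="0.0.0.0") binds all interfaces for LAN access.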