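"""Gradio demo: multilingual text-to-speech with Suno's Bark.

Long inputs are split into sentences with NLTK and synthesized one
sentence at a time, then stitched together with short silences, since
Bark works best on short chunks of text.
"""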
import gradio as gr
from transformers import AutoProcessor, BarkModel
import torch
import numpy as np
import nltk

# Download the NLTK "punkt" model used for sentence tokenization
nltk.download("punkt")

# Limit PyTorch to a single CPU thread
torch.set_num_threads(1)
# Load each Bark checkpoint and its processor once at startup, so that
# nothing is re-instantiated on every button click
MODEL_NAMES = ["suno/bark", "suno/bark-small"]
models = {name: BarkModel.from_pretrained(name) for name in MODEL_NAMES}
processors = {name: AutoProcessor.from_pretrained(name) for name in MODEL_NAMES}
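# Optional, and not something this app does by default: if a GPU with enough
# memory is available, a model can be moved over for much faster generation,
# e.g. models["suno/bark-small"].to("cuda"); the tensors returned by the
# processor must then be moved to the same device before model.generate().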
# Combined voice presets (English, Turkish, German, French, Italian, Chinese)
all_voice_presets = [
    "v2/en_speaker_0", "v2/en_speaker_1", "v2/en_speaker_2", "v2/en_speaker_3",
    "v2/en_speaker_4", "v2/en_speaker_5", "v2/en_speaker_6",
    "v2/en_speaker_7", "v2/en_speaker_8", "v2/en_speaker_9",
    "v2/tr_speaker_0", "v2/tr_speaker_1", "v2/tr_speaker_2", "v2/tr_speaker_3",
    "v2/tr_speaker_4", "v2/tr_speaker_5", "v2/tr_speaker_6",
    "v2/tr_speaker_7", "v2/tr_speaker_8", "v2/tr_speaker_9",
    "v2/de_speaker_0", "v2/de_speaker_1", "v2/de_speaker_2", "v2/de_speaker_3",
    "v2/de_speaker_4", "v2/de_speaker_5", "v2/de_speaker_6",
    "v2/de_speaker_7", "v2/de_speaker_8", "v2/de_speaker_9",
    "v2/fr_speaker_0", "v2/fr_speaker_1", "v2/fr_speaker_2", "v2/fr_speaker_3",
    "v2/fr_speaker_4", "v2/fr_speaker_5", "v2/fr_speaker_6",
    "v2/fr_speaker_7", "v2/fr_speaker_8", "v2/fr_speaker_9",
    "v2/it_speaker_0", "v2/it_speaker_1", "v2/it_speaker_2", "v2/it_speaker_3",
    "v2/it_speaker_4", "v2/it_speaker_5", "v2/it_speaker_6",
    "v2/it_speaker_7", "v2/it_speaker_8", "v2/it_speaker_9",
    "v2/zh_speaker_0", "v2/zh_speaker_1", "v2/zh_speaker_2", "v2/zh_speaker_3",
    "v2/zh_speaker_4", "v2/zh_speaker_5", "v2/zh_speaker_6",
    "v2/zh_speaker_7", "v2/zh_speaker_8", "v2/zh_speaker_9"
]
# Bark's output sample rate is read from the model's generation config
# inside generate_speech (a hardcoded 22050 Hz would not match it)
SILENCE_DURATION = 0.25  # seconds of silence inserted between sentences
# Generate speech, splitting long texts into sentences and stitching the
# per-sentence audio back together
def generate_speech(text, model_name, voice_preset):
    model = models[model_name]
    processor = processors[model_name]
    sample_rate = model.generation_config.sample_rate

    sentences = nltk.sent_tokenize(text)  # Tokenize text into sentences

    if len(sentences) == 1:  # Single sentence: generate directly
        inputs = processor(text, voice_preset=voice_preset)
        audio_array = model.generate(**inputs)
        audio_array = audio_array.cpu().numpy().squeeze()
        return (sample_rate, audio_array)

    # Multiple sentences: generate each one and concatenate,
    # separated by a short silence
    silence = np.zeros(int(SILENCE_DURATION * sample_rate))
    audio_pieces = []
    for sentence in sentences:
        inputs = processor(sentence, voice_preset=voice_preset)
        audio_array = model.generate(**inputs).cpu().numpy().squeeze()
        audio_pieces.append(audio_array)
        audio_pieces.append(silence.copy())  # Silence between sentences
    full_audio = np.concatenate(audio_pieces[:-1])  # Drop the trailing silence
    return (sample_rate, full_audio)
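# A quick way to exercise generate_speech outside the UI (hypothetical
# snippet; the text, preset, and file name are arbitrary choices):
#     sr, audio = generate_speech("Hello there. How are you today?",
#                                 "suno/bark-small", "v2/en_speaker_6")
#     import scipy.io.wavfile
#     scipy.io.wavfile.write("out.wav", sr, audio.astype(np.float32))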
# Gradio app setup
with gr.Blocks() as app:
    gr.Markdown("# Multilingual Text-to-Speech with Bark")

    # Textbox for user input
    text_input = gr.Textbox(label="Enter Text", placeholder="Type something to synthesize...")

    # Model selection
    model_preset_input = gr.Dropdown(MODEL_NAMES, label="Select Model", value="suno/bark-small")

    # Combined voice presets dropdown (defaults to the first English speaker)
    voice_preset_input = gr.Dropdown(
        choices=all_voice_presets,
        label="Select Voice Preset",
        value="v2/en_speaker_0",
    )

    # Button to generate voice
    generate_button = gr.Button("Generate Voice")

    # Output audio
    audio_output = gr.Audio(label="Generated Voice", type="numpy")

    # Generate voice on button click
    generate_button.click(
        generate_speech,
        inputs=[text_input, model_preset_input, voice_preset_input],
        outputs=audio_output,
    )

app.launch()
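# Passing share=True to app.launch() also creates a temporary public URL,
# which can help when testing outside Hugging Face Spaces.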