File size: 3,840 Bytes
a6bbc99
 
907a50a
7cbdddc
7ab479a
 
 
 
 
 
de4cadf
a6bbc99
907a50a
 
d4af80c
7cbdddc
 
 
 
 
d4af80c
 
 
 
 
7ab479a
 
 
d4af80c
 
 
7ab479a
 
 
 
 
 
 
 
 
d4af80c
a6bbc99
7ab479a
 
 
a6bbc99
7cbdddc
 
7ab479a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a6bbc99
 
 
7cbdddc
a6bbc99
7cbdddc
 
 
 
2fc3324
7cbdddc
0d782ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import gradio as gr
from transformers import AutoProcessor, BarkModel
import torch
import numpy as np
import nltk
from scipy.io.wavfile import write
from IPython.display import Audio

# NLTK sentence tokenizer data: "punkt" is the model, "punkt_tab" is the
# tabular format newer NLTK versions require for sent_tokenize.
nltk.download("punkt")
nltk.download("punkt_tab")

# Limit intra-op parallelism; keeps CPU inference predictable on shared hosts.
torch.set_num_threads(1)

# Eagerly load both Bark checkpoints at startup so switching models in the UI
# is instant. NOTE(review): this holds both models in memory simultaneously —
# consider lazy loading if memory is tight.
models = {
    "suno/bark": BarkModel.from_pretrained("suno/bark"),
    "suno/bark-small": BarkModel.from_pretrained("suno/bark-small")
}

# Bark v2 voice presets: ten numbered speakers for each supported language,
# generated programmatically instead of listing all sixty strings by hand.
# Order matters for the dropdown: languages first, then speaker index 0-9.
all_voice_presets = [
    f"v2/{lang}_speaker_{speaker}"
    for lang in ("en", "tr", "de", "fr", "it", "zh")
    for speaker in range(10)
]

SAMPLE_RATE = 22050  # Set a standard sample rate for Bark output
silence_duration = 0.25  # quarter-second silence duration between sentences

# Cache processors per model so repeated generations don't rebuild them
# (AutoProcessor.from_pretrained was previously called on every request).
_processor_cache = {}


def _get_processor(model_name):
    """Return a cached AutoProcessor for *model_name*, creating it on first use."""
    if model_name not in _processor_cache:
        _processor_cache[model_name] = AutoProcessor.from_pretrained(model_name)
    return _processor_cache[model_name]


def generate_speech(text, model_name, voice_preset):
    """Synthesize *text* with the selected Bark model and voice preset.

    Multi-sentence input is tokenized with NLTK and generated sentence by
    sentence with a short silence between pieces (Bark handles short prompts
    more reliably than long ones).

    Returns a ``(sample_rate, waveform)`` tuple as expected by
    ``gr.Audio(type="numpy")``.
    """
    model = models[model_name]
    processor = _get_processor(model_name)
    # BUG FIX: the multi-sentence path previously returned the hard-coded
    # SAMPLE_RATE (22050) while Bark generates at generation_config.sample_rate
    # (24 kHz), so concatenated audio played back pitched-down/slow. Use the
    # model's true rate everywhere, for both the silence gaps and the return.
    sample_rate = model.generation_config.sample_rate

    sentences = nltk.sent_tokenize(text)  # Split input into sentences

    if not sentences:
        # Empty/whitespace-only input: return a short silent clip instead of
        # letting np.concatenate([]) raise.
        return (sample_rate, np.zeros(int(silence_duration * sample_rate)))

    if len(sentences) == 1:  # Single sentence: one generate call, no stitching
        inputs = processor(text, voice_preset=voice_preset)
        audio_array = model.generate(**inputs).cpu().numpy().squeeze()
        return (sample_rate, audio_array)

    # Multiple sentences: generate each piece and join with silence BETWEEN
    # sentences only (previously a trailing silence was appended at the end).
    silence = np.zeros(int(silence_duration * sample_rate))
    audio_pieces = []
    for i, sentence in enumerate(sentences):
        if i:
            audio_pieces.append(silence)
        inputs = processor(sentence, voice_preset=voice_preset)
        audio_pieces.append(model.generate(**inputs).cpu().numpy().squeeze())

    return (sample_rate, np.concatenate(audio_pieces))

# ---- Gradio UI -------------------------------------------------------------
# Component creation order defines the page layout, so it mirrors the flow:
# text in -> model/voice choices -> button -> audio out.
with gr.Blocks() as app:
    gr.Markdown("# Multilingual Text-to-Speech with Bark")

    # Text the user wants spoken.
    prompt_box = gr.Textbox(label="Enter Text", placeholder="Type something to synthesize...")

    # Bark checkpoint to run.
    model_choice = gr.Dropdown(["suno/bark", "suno/bark-small"], label="Select Model", value="suno/bark-small")

    # Speaker preset (language + speaker index).
    voice_choice = gr.Dropdown(
        choices=all_voice_presets,
        label="Select Voice Preset"
    )

    # Kick off synthesis.
    synth_button = gr.Button("Generate Voice")

    # (sample_rate, ndarray) output from generate_speech.
    result_audio = gr.Audio(label="Generated Voice", type="numpy")

    synth_button.click(
        generate_speech,
        inputs=[prompt_box, model_choice, voice_choice],
        outputs=result_audio
    )

app.launch()