Spaces:
Running
Running
eaysu
committed on
Commit
·
7ab479a
1
Parent(s):
188479b
developed an algorithm for long texts.
Browse files
app.py
CHANGED
@@ -2,6 +2,12 @@ import gradio as gr
|
|
2 |
from transformers import AutoProcessor, BarkModel
|
3 |
import torch
|
4 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
torch.set_num_threads(1)
|
7 |
|
@@ -16,25 +22,50 @@ all_voice_presets = [
|
|
16 |
"v2/en_speaker_0", "v2/en_speaker_1", "v2/en_speaker_2", "v2/en_speaker_3",
|
17 |
"v2/en_speaker_4", "v2/en_speaker_5", "v2/en_speaker_6",
|
18 |
"v2/en_speaker_7", "v2/en_speaker_8", "v2/en_speaker_9",
|
19 |
-
"v2/
|
20 |
-
"v2/
|
21 |
-
"v2/
|
22 |
"v2/de_speaker_0", "v2/de_speaker_1", "v2/de_speaker_2", "v2/de_speaker_3",
|
23 |
"v2/de_speaker_4", "v2/de_speaker_5", "v2/de_speaker_6",
|
24 |
"v2/de_speaker_7", "v2/de_speaker_8", "v2/de_speaker_9",
|
25 |
-
"v2/
|
26 |
-
"v2/
|
27 |
-
"v2/
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
]
|
29 |
|
|
|
|
|
|
|
30 |
# Function to generate speech
|
31 |
def generate_speech(text, model_name, voice_preset):
|
32 |
model = models[model_name]
|
33 |
-
processor = AutoProcessor.from_pretrained(model_name)
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
# Gradio app setup
|
40 |
with gr.Blocks() as app:
|
|
|
2 |
from transformers import AutoProcessor, BarkModel
|
3 |
import torch
|
4 |
import numpy as np
|
5 |
+
import nltk
|
6 |
+
from scipy.io.wavfile import write
|
7 |
+
from IPython.display import Audio
|
8 |
+
|
9 |
+
# Download nltk punkt for sentence tokenization
|
10 |
+
nltk.download("punkt")
|
11 |
|
12 |
torch.set_num_threads(1)
|
13 |
|
|
|
22 |
"v2/en_speaker_0", "v2/en_speaker_1", "v2/en_speaker_2", "v2/en_speaker_3",
|
23 |
"v2/en_speaker_4", "v2/en_speaker_5", "v2/en_speaker_6",
|
24 |
"v2/en_speaker_7", "v2/en_speaker_8", "v2/en_speaker_9",
|
25 |
+
"v2/tr_speaker_0", "v2/tr_speaker_1", "v2/tr_speaker_2", "v2/tr_speaker_3",
|
26 |
+
"v2/tr_speaker_4", "v2/tr_speaker_5", "v2/tr_speaker_6",
|
27 |
+
"v2/tr_speaker_7", "v2/tr_speaker_8", "v2/tr_speaker_9",
|
28 |
"v2/de_speaker_0", "v2/de_speaker_1", "v2/de_speaker_2", "v2/de_speaker_3",
|
29 |
"v2/de_speaker_4", "v2/de_speaker_5", "v2/de_speaker_6",
|
30 |
"v2/de_speaker_7", "v2/de_speaker_8", "v2/de_speaker_9",
|
31 |
+
"v2/fr_speaker_0", "v2/fr_speaker_1", "v2/fr_speaker_2", "v2/fr_speaker_3",
|
32 |
+
"v2/fr_speaker_4", "v2/fr_speaker_5", "v2/fr_speaker_6",
|
33 |
+
"v2/fr_speaker_7", "v2/fr_speaker_8", "v2/fr_speaker_9",
|
34 |
+
"v2/it_speaker_0", "v2/it_speaker_1", "v2/it_speaker_2", "v2/it_speaker_3",
|
35 |
+
"v2/it_speaker_4", "v2/it_speaker_5", "v2/it_speaker_6",
|
36 |
+
"v2/it_speaker_7", "v2/it_speaker_8", "v2/it_speaker_9",
|
37 |
+
"v2/zh_speaker_0", "v2/zh_speaker_1", "v2/zh_speaker_2", "v2/zh_speaker_3",
|
38 |
+
"v2/zh_speaker_4", "v2/zh_speaker_5", "v2/zh_speaker_6",
|
39 |
+
"v2/zh_speaker_7", "v2/zh_speaker_8", "v2/zh_speaker_9"
|
40 |
]
|
41 |
|
42 |
+
SAMPLE_RATE = 22050 # Set a standard sample rate for Bark output
|
43 |
+
silence_duration = 0.25 # quarter-second silence duration between sentences
|
44 |
+
|
45 |
# Function to generate speech
def generate_speech(text, model_name, voice_preset):
    """Synthesize speech for *text* with the selected Bark model and voice preset.

    Long inputs are split into sentences (nltk punkt), synthesized one at a
    time, and joined with a short pause, since Bark handles short prompts best.

    Args:
        text: Input text to speak.
        model_name: Key into the module-level ``models`` dict (also used to
            load the matching ``AutoProcessor``).
        voice_preset: Bark voice preset id, e.g. ``"v2/en_speaker_0"``.

    Returns:
        tuple: ``(sample_rate, audio)`` where ``audio`` is a 1-D numpy array,
        suitable for a Gradio audio component.
    """
    model = models[model_name]
    # NOTE(review): re-loading the processor on every request is slow;
    # consider caching it alongside the model. Kept inline here so the
    # function's external interface stays unchanged.
    processor = AutoProcessor.from_pretrained(model_name)

    # BUG FIX: the multi-sentence path previously returned the hard-coded
    # SAMPLE_RATE (22050) even though Bark generates audio at
    # model.generation_config.sample_rate (24000 Hz), so concatenated
    # long-text audio played back slowed down and pitched low, and the
    # silence gap was sized from the wrong rate. Use the model's real
    # sample rate consistently for both paths and for the silence.
    sample_rate = model.generation_config.sample_rate

    sentences = nltk.sent_tokenize(text)  # Tokenize text into sentences

    # <= 1 (not == 1) so empty input takes the single-pass path instead of
    # reaching np.concatenate([]) below, which would raise ValueError.
    if len(sentences) <= 1:
        inputs = processor(text, voice_preset=voice_preset)
        audio_array = model.generate(**inputs)
        return (sample_rate, audio_array.cpu().numpy().squeeze())

    # For multiple sentences, generate each one and concatenate with a
    # quarter-second pause between sentences (no trailing pause at the end).
    silence = np.zeros(int(silence_duration * sample_rate))
    audio_pieces = []
    for sentence in sentences:
        inputs = processor(sentence, voice_preset=voice_preset)
        audio_pieces.append(model.generate(**inputs).cpu().numpy().squeeze())
        audio_pieces.append(silence.copy())
    audio_pieces.pop()  # drop the silence appended after the final sentence

    return (sample_rate, np.concatenate(audio_pieces))
69 |
|
70 |
# Gradio app setup
|
71 |
with gr.Blocks() as app:
|