eaysu committed on
Commit
7ab479a
·
1 Parent(s): 188479b

Developed a sentence-splitting algorithm for synthesizing long texts.

Browse files
Files changed (1) hide show
  1. app.py +42 -11
app.py CHANGED
@@ -2,6 +2,12 @@ import gradio as gr
2
  from transformers import AutoProcessor, BarkModel
3
  import torch
4
  import numpy as np
 
 
 
 
 
 
5
 
6
  torch.set_num_threads(1)
7
 
@@ -16,25 +22,50 @@ all_voice_presets = [
16
  "v2/en_speaker_0", "v2/en_speaker_1", "v2/en_speaker_2", "v2/en_speaker_3",
17
  "v2/en_speaker_4", "v2/en_speaker_5", "v2/en_speaker_6",
18
  "v2/en_speaker_7", "v2/en_speaker_8", "v2/en_speaker_9",
19
- "v2/fr_speaker_0", "v2/fr_speaker_1", "v2/fr_speaker_2", "v2/fr_speaker_3",
20
- "v2/fr_speaker_4", "v2/fr_speaker_5", "v2/fr_speaker_6",
21
- "v2/fr_speaker_7", "v2/fr_speaker_8", "v2/fr_speaker_9",
22
  "v2/de_speaker_0", "v2/de_speaker_1", "v2/de_speaker_2", "v2/de_speaker_3",
23
  "v2/de_speaker_4", "v2/de_speaker_5", "v2/de_speaker_6",
24
  "v2/de_speaker_7", "v2/de_speaker_8", "v2/de_speaker_9",
25
- "v2/tr_speaker_0", "v2/tr_speaker_1", "v2/tr_speaker_2", "v2/tr_speaker_3",
26
- "v2/tr_speaker_4", "v2/tr_speaker_5", "v2/tr_speaker_6",
27
- "v2/tr_speaker_7", "v2/tr_speaker_8", "v2/tr_speaker_9"
 
 
 
 
 
 
28
  ]
29
 
 
 
 
30
  # Function to generate speech
31
  def generate_speech(text, model_name, voice_preset):
32
  model = models[model_name]
33
- processor = AutoProcessor.from_pretrained(model_name) # Load processor for the selected model
34
- inputs = processor(text, voice_preset=voice_preset)
35
- audio_array = model.generate(**inputs)
36
- audio_array = audio_array.cpu().numpy().squeeze()
37
- return (model.generation_config.sample_rate, audio_array)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  # Gradio app setup
40
  with gr.Blocks() as app:
 
2
  from transformers import AutoProcessor, BarkModel
3
  import torch
4
  import numpy as np
5
+ import nltk
6
+ from scipy.io.wavfile import write
7
+ from IPython.display import Audio
8
+
9
+ # Download nltk punkt for sentence tokenization
10
+ nltk.download("punkt")
11
 
12
  torch.set_num_threads(1)
13
 
 
22
  "v2/en_speaker_0", "v2/en_speaker_1", "v2/en_speaker_2", "v2/en_speaker_3",
23
  "v2/en_speaker_4", "v2/en_speaker_5", "v2/en_speaker_6",
24
  "v2/en_speaker_7", "v2/en_speaker_8", "v2/en_speaker_9",
25
+ "v2/tr_speaker_0", "v2/tr_speaker_1", "v2/tr_speaker_2", "v2/tr_speaker_3",
26
+ "v2/tr_speaker_4", "v2/tr_speaker_5", "v2/tr_speaker_6",
27
+ "v2/tr_speaker_7", "v2/tr_speaker_8", "v2/tr_speaker_9",
28
  "v2/de_speaker_0", "v2/de_speaker_1", "v2/de_speaker_2", "v2/de_speaker_3",
29
  "v2/de_speaker_4", "v2/de_speaker_5", "v2/de_speaker_6",
30
  "v2/de_speaker_7", "v2/de_speaker_8", "v2/de_speaker_9",
31
+ "v2/fr_speaker_0", "v2/fr_speaker_1", "v2/fr_speaker_2", "v2/fr_speaker_3",
32
+ "v2/fr_speaker_4", "v2/fr_speaker_5", "v2/fr_speaker_6",
33
+ "v2/fr_speaker_7", "v2/fr_speaker_8", "v2/fr_speaker_9",
34
+ "v2/it_speaker_0", "v2/it_speaker_1", "v2/it_speaker_2", "v2/it_speaker_3",
35
+ "v2/it_speaker_4", "v2/it_speaker_5", "v2/it_speaker_6",
36
+ "v2/it_speaker_7", "v2/it_speaker_8", "v2/it_speaker_9",
37
+ "v2/zh_speaker_0", "v2/zh_speaker_1", "v2/zh_speaker_2", "v2/zh_speaker_3",
38
+ "v2/zh_speaker_4", "v2/zh_speaker_5", "v2/zh_speaker_6",
39
+ "v2/zh_speaker_7", "v2/zh_speaker_8", "v2/zh_speaker_9"
40
  ]
41
 
42
+ SAMPLE_RATE = 22050 # Set a standard sample rate for Bark output
43
+ silence_duration = 0.25 # quarter-second silence duration between sentences
44
+
45
# Function to generate speech
def generate_speech(text, model_name, voice_preset):
    """Generate speech audio for *text* with the selected Bark model.

    Long texts are split into sentences; each sentence is synthesized
    separately and the pieces are joined with short silences, since Bark
    handles short prompts far better than long ones.

    Args:
        text: Input text to synthesize.
        model_name: Key into the module-level ``models`` dict (also used
            to load the matching processor checkpoint).
        voice_preset: Bark voice preset id, e.g. ``"v2/en_speaker_0"``.

    Returns:
        A ``(sample_rate, audio_array)`` tuple suitable for ``gr.Audio``.
    """
    model = models[model_name]
    processor = AutoProcessor.from_pretrained(model_name)  # processor must match the selected checkpoint

    # Use the model's own sample rate everywhere. The previous code mixed a
    # hard-coded 22050 Hz (multi-sentence path) with the model's true rate
    # (single-sentence path); Bark emits 24 kHz audio, so returning 22050
    # shifted pitch/duration and miscomputed the silence gap length.
    sample_rate = model.generation_config.sample_rate

    sentences = nltk.sent_tokenize(text)  # split long text into sentences

    # Edge case: no sentences at all (empty/whitespace text) — return an
    # empty clip instead of letting np.concatenate([]) raise.
    if not sentences:
        return (sample_rate, np.zeros(0, dtype=np.float32))

    # Single sentence: generate directly, no concatenation needed.
    if len(sentences) == 1:
        inputs = processor(text, voice_preset=voice_preset)
        audio_array = model.generate(**inputs)
        return (sample_rate, audio_array.cpu().numpy().squeeze())

    # Multiple sentences: synthesize each one and join with short pauses.
    silence = np.zeros(int(silence_duration * sample_rate))
    audio_pieces = []
    for sentence in sentences:
        inputs = processor(sentence, voice_preset=voice_preset)
        audio_array = model.generate(**inputs).cpu().numpy().squeeze()
        audio_pieces.append(audio_array)
        audio_pieces.append(silence)  # pause between sentences (also trails the last one, as before)

    return (sample_rate, np.concatenate(audio_pieces))
69
 
70
  # Gradio app setup
71
  with gr.Blocks() as app: