gabrielchua committed on
Commit
f136260
·
1 Parent(s): 9f31d5a

remove Parler-TTS Mini

Browse files
Files changed (2) hide show
  1. app.py +9 -25
  2. utils.py +16 -46
app.py CHANGED
@@ -25,7 +25,7 @@ from utils import generate_script, generate_audio, parse_url
25
  class DialogueItem(BaseModel):
26
  """A single dialogue item."""
27
 
28
- speaker: Literal["Host (Jenna)", "Guest"]
29
  text: str
30
 
31
 
@@ -41,7 +41,6 @@ def generate_podcast(
41
  files: List[str],
42
  url: Optional[str],
43
  tone: Optional[str],
44
- voice: Optional[str],
45
  length: Optional[str],
46
  language: str
47
  ) -> Tuple[str, str]:
@@ -58,12 +57,6 @@ def generate_podcast(
58
  "Korean": "KR",
59
  }
60
 
61
- # Change voice to the appropriate code
62
- voice_mapping = {
63
- "Male": "Gary",
64
- "Female": "Laura",
65
- }
66
-
67
  # Check if at least one input is provided
68
  if not files and not url:
69
  raise gr.Error("Please provide at least one PDF file or a URL.")
@@ -116,16 +109,16 @@ def generate_podcast(
116
  total_characters = 0
117
 
118
  for line in llm_output.dialogue:
119
- logger.info(f"Generating audio for {line.speaker}, {language} and {voice}: {line.text}")
120
- if line.speaker == "Host (Jenna)":
121
- speaker = f"**Jenna**: {line.text}"
122
  else:
123
  speaker = f"**{llm_output.name_of_guest}**: {line.text}"
124
  transcript += speaker + "\n\n"
125
  total_characters += len(line.text)
126
 
127
  # Get audio file path
128
- audio_file_path = generate_audio(line.text, line.speaker, language_mapping[language], voice_mapping[voice])
129
  # Read the audio file into an AudioSegment
130
  audio_segment = AudioSegment.from_file(audio_file_path)
131
  audio_segments.append(audio_segment)
@@ -173,20 +166,15 @@ demo = gr.Interface(
173
  label="3. 🎭 Choose the tone",
174
  value="Fun"
175
  ),
176
- gr.Radio(
177
- choices=["Male", "Female"],
178
- label="4. 🎭 Choose the guest's voice",
179
- value="Female"
180
- ),
181
  gr.Radio(
182
  choices=["Short (1-2 min)", "Medium (3-5 min)"],
183
- label="5. ⏱️ Choose the length",
184
  value="Medium (3-5 min)"
185
  ),
186
  gr.Dropdown(
187
  choices=["English", "Spanish", "French", "Chinese", "Japanese", "Korean"],
188
  value="English",
189
- label="6. 🌐 Choose the language (Highly experimental, English is recommended)",
190
  ),
191
  ],
192
  outputs=[
@@ -202,15 +190,13 @@ demo = gr.Interface(
202
  [str(Path("examples/1310.4546v1.pdf"))],
203
  "",
204
  "Fun",
205
- "Male",
206
- "Medium (3-5 min)",
207
  "English"
208
  ],
209
  [
210
  [],
211
  "https://en.wikipedia.org/wiki/Hugging_Face",
212
  "Fun",
213
- "Male",
214
  "Short (1-2 min)",
215
  "English"
216
  ],
@@ -218,14 +204,12 @@ demo = gr.Interface(
218
  [],
219
  "https://simple.wikipedia.org/wiki/Taylor_Swift",
220
  "Fun",
221
- "Female",
222
  "Short (1-2 min)",
223
  "English"
224
  ],
225
  ],
226
  cache_examples=True,
227
- examples_cache_dir="examples_cached"
228
  )
229
 
230
  if __name__ == "__main__":
231
- demo.launch(show_api=True)
 
25
  class DialogueItem(BaseModel):
26
  """A single dialogue item."""
27
 
28
+ speaker: Literal["Host (Jane)", "Guest"]
29
  text: str
30
 
31
 
 
41
  files: List[str],
42
  url: Optional[str],
43
  tone: Optional[str],
 
44
  length: Optional[str],
45
  language: str
46
  ) -> Tuple[str, str]:
 
57
  "Korean": "KR",
58
  }
59
 
 
 
 
 
 
 
60
  # Check if at least one input is provided
61
  if not files and not url:
62
  raise gr.Error("Please provide at least one PDF file or a URL.")
 
109
  total_characters = 0
110
 
111
  for line in llm_output.dialogue:
112
+ logger.info(f"Generating audio for {line.speaker}: {line.text}")
113
+ if line.speaker == "Host (Jane)":
114
+ speaker = f"**Jane**: {line.text}"
115
  else:
116
  speaker = f"**{llm_output.name_of_guest}**: {line.text}"
117
  transcript += speaker + "\n\n"
118
  total_characters += len(line.text)
119
 
120
  # Get audio file path
121
+ audio_file_path = generate_audio(line.text, line.speaker, language_mapping[language])
122
  # Read the audio file into an AudioSegment
123
  audio_segment = AudioSegment.from_file(audio_file_path)
124
  audio_segments.append(audio_segment)
 
166
  label="3. 🎭 Choose the tone",
167
  value="Fun"
168
  ),
 
 
 
 
 
169
  gr.Radio(
170
  choices=["Short (1-2 min)", "Medium (3-5 min)"],
171
+ label="4. ⏱️ Choose the length",
172
  value="Medium (3-5 min)"
173
  ),
174
  gr.Dropdown(
175
  choices=["English", "Spanish", "French", "Chinese", "Japanese", "Korean"],
176
  value="English",
177
+ label="5. 🌐 Choose the language (Highly experimental, English is recommended)",
178
  ),
179
  ],
180
  outputs=[
 
190
  [str(Path("examples/1310.4546v1.pdf"))],
191
  "",
192
  "Fun",
193
+ "Short (1-2 min)",
 
194
  "English"
195
  ],
196
  [
197
  [],
198
  "https://en.wikipedia.org/wiki/Hugging_Face",
199
  "Fun",
 
200
  "Short (1-2 min)",
201
  "English"
202
  ],
 
204
  [],
205
  "https://simple.wikipedia.org/wiki/Taylor_Swift",
206
  "Fun",
 
207
  "Short (1-2 min)",
208
  "English"
209
  ],
210
  ],
211
  cache_examples=True,
 
212
  )
213
 
214
  if __name__ == "__main__":
215
+ demo.launch(show_api=True)
utils.py CHANGED
@@ -7,20 +7,12 @@ Functions:
7
  - get_audio: Get the audio from the TTS model from HF Spaces.
8
  """
9
 
10
- import os
11
  import requests
12
- import tempfile
13
 
14
-
15
- import soundfile as sf
16
- import spaces
17
- import torch
18
  from gradio_client import Client
19
  from openai import OpenAI
20
- from parler_tts import ParlerTTSForConditionalGeneration
21
  from pydantic import ValidationError
22
- from transformers import AutoTokenizer
23
-
24
 
25
  MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct"
26
  JINA_URL = "https://r.jina.ai/"
@@ -32,10 +24,6 @@ client = OpenAI(
32
 
33
  hf_client = Client("mrfakename/MeloTTS")
34
 
35
- # Initialize the model and tokenizer (do this outside the function for efficiency)
36
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
37
- model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1").to(device)
38
- tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
39
 
40
  def generate_script(system_prompt: str, input_text: str, output_model):
41
  """Get the dialogue from the LLM."""
@@ -79,38 +67,20 @@ def parse_url(url: str) -> str:
79
  response = requests.get(full_url, timeout=60)
80
  return response.text
81
 
82
- def generate_audio(text: str, speaker: str, language: str, voice: str) -> str:
83
- """Generate audio using the local Parler TTS model or HuggingFace client."""
84
-
85
- if language == "EN":
86
- # Adjust the description based on speaker and language
87
- if speaker == "Guest":
88
- description = f"{voice} has a slightly expressive and animated speech, speaking at a moderate speed with natural pitch variations. The voice is clear and close-up, as if recorded in a professional studio."
89
- else: # host
90
- description = f"{voice} has a professional and engaging tone, speaking at a moderate to slightly faster pace. The voice is clear, warm, and sounds like a seasoned podcast host."
91
-
92
- # Prepare inputs
93
- input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
94
- prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
95
 
96
- # Generate audio
97
- generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
98
- audio_arr = generation.cpu().numpy().squeeze()
99
-
100
- # Save to temporary file
101
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
102
- sf.write(temp_file.name, audio_arr, model.config.sampling_rate, format='mp3')
103
-
104
- return temp_file.name
 
105
 
106
- else:
107
- accent = language
108
- if speaker == "Guest":
109
- speed = 0.9
110
- else: # host
111
- speed = 1.1
112
- # Generate audio
113
- result = hf_client.predict(
114
- text=text, language=language, speaker=accent, speed=speed, api_name="/synthesize"
115
- )
116
- return result
 
7
  - get_audio: Get the audio from the TTS model from HF Spaces.
8
  """
9
 
10
+ import os
11
  import requests
 
12
 
 
 
 
 
13
  from gradio_client import Client
14
  from openai import OpenAI
 
15
  from pydantic import ValidationError
 
 
16
 
17
  MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct"
18
  JINA_URL = "https://r.jina.ai/"
 
24
 
25
  hf_client = Client("mrfakename/MeloTTS")
26
 
 
 
 
 
27
 
28
  def generate_script(system_prompt: str, input_text: str, output_model):
29
  """Get the dialogue from the LLM."""
 
67
  response = requests.get(full_url, timeout=60)
68
  return response.text
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
+ def generate_audio(text: str, speaker: str, language: str) -> bytes:
72
+ """Get the audio from the TTS model from HF Spaces and adjust pitch if necessary."""
73
+ if speaker == "Guest":
74
+ accent = "EN-US" if language == "EN" else language
75
+ speed = 0.9
76
+ else: # host
77
+ accent = "EN-Default" if language == "EN" else language
78
+ speed = 1
79
+ if language != "EN" and speaker != "Guest":
80
+ speed = 1.1
81
 
82
+ # Generate audio
83
+ result = hf_client.predict(
84
+ text=text, language=language, speaker=accent, speed=speed, api_name="/synthesize"
85
+ )
86
+ return result