Andrei Kulchyk committed on
Commit
0925810
·
unverified ·
1 Parent(s): 38f34b6

Convert TTS async-ly (#3)

Browse files

Co-authored-by: Andrei Kulchyk <[email protected]>

Files changed (2) hide show
  1. app.py +37 -25
  2. src/tts.py +33 -0
app.py CHANGED
@@ -1,14 +1,20 @@
1
  import json
2
  import os
3
  import re
 
 
4
 
5
  import librosa
6
  import requests
7
  import gradio as gr
8
  import pandas as pd
9
  from dotenv import load_dotenv
10
- from openai import OpenAI
11
  from langchain_community.document_loaders import PyPDFLoader
 
 
 
 
12
 
13
  load_dotenv()
14
 
@@ -77,7 +83,8 @@ class AudiobookBuilder:
77
  self._aiml_base_url = aiml_base_url
78
  self._aiml_client = OpenAI(api_key=api_key, base_url=self._aiml_base_url)
79
  self._default_narrator_voice = "ALY2WaJPY0oBJlqpQbfW"
80
- self._eleven_api_key = eleven_api_key or os.environ["ELEVEN_API_KEY"]
 
81
 
82
  def annotate_text(self, text: str) -> str:
83
  response = self._send_request_to_llm(messages=[
@@ -108,29 +115,34 @@ class AudiobookBuilder:
108
  )
109
  return json.loads(response["choices"][0]["message"]["content"])
110
 
111
- def generate_audio(
112
  self,
113
  annotated_text: str,
114
  character_to_voice: dict[str, str],
115
- *,
116
- chunk_size: int = 1024,
117
- ) -> None:
118
  current_character = "narrator"
119
- with open("audiobook.mp3", "wb") as ab:
120
- for line in annotated_text.splitlines():
121
- cleaned_line = line.strip().lower()
122
- if not cleaned_line:
123
- continue
124
- try:
125
- current_character = re.findall(r"\[[\w\s]+\]", cleaned_line)[0][1:-1]
126
- except:
127
- pass
128
- voice_id = character_to_voice[current_character]
129
- character_text = cleaned_line[cleaned_line.rfind("]")+1:].lstrip()
130
- fragment = self._send_request_to_tts(voice_id=voice_id, text=character_text)
131
- for chunk in fragment.iter_content(chunk_size=chunk_size):
 
 
 
 
 
132
  if chunk:
133
- ab.write(chunk)
 
134
 
135
  @staticmethod
136
  def get_unique_characters(annotated_text: str) -> list[str]:
@@ -207,7 +219,7 @@ def parse_pdf(file_path):
207
  return "\n".join([doc.page_content for doc in documents])
208
 
209
 
210
- def respond(text, uploaded_file):
211
  # Check if a file is uploaded
212
  if uploaded_file is not None:
213
  # Save the uploaded file temporarily to check its size
@@ -236,10 +248,10 @@ def respond(text, uploaded_file):
236
  unique_characters = builder.get_unique_characters(annotated_text)
237
  character_to_gender = builder.classify_characters(text, unique_characters)
238
  character_to_voice = builder.map_characters_to_voices(character_to_gender)
239
- builder.generate_audio(annotated_text, character_to_voice)
240
-
241
- audio, sr = librosa.load("audiobook.mp3", sr=None)
242
- return (sr, audio), None # Return audio and None for error message
243
 
244
 
245
  def refresh():
 
1
  import json
2
  import os
3
  import re
4
+ from pathlib import Path
5
+ from uuid import uuid4
6
 
7
  import librosa
8
  import requests
9
  import gradio as gr
10
  import pandas as pd
11
  from dotenv import load_dotenv
12
+ from elevenlabs import AsyncElevenLabs
13
  from langchain_community.document_loaders import PyPDFLoader
14
+ from openai import OpenAI
15
+
16
+ from src.tts import tts_astream
17
+
18
 
19
  load_dotenv()
20
 
 
83
  self._aiml_base_url = aiml_base_url
84
  self._aiml_client = OpenAI(api_key=api_key, base_url=self._aiml_base_url)
85
  self._default_narrator_voice = "ALY2WaJPY0oBJlqpQbfW"
86
+ self._eleven_api_key = eleven_api_key or os.environ["11LABS_API_KEY"]
87
+ self._eleven_client = AsyncElevenLabs(api_key=self._eleven_api_key)
88
 
89
  def annotate_text(self, text: str) -> str:
90
  response = self._send_request_to_llm(messages=[
 
115
  )
116
  return json.loads(response["choices"][0]["message"]["content"])
117
 
118
async def generate_audio(
    self,
    annotated_text: str,
    character_to_voice: dict[str, str],
) -> Path:
    """Synthesize speech for each annotated line and save it as one file.

    Every non-empty line of ``annotated_text`` is stripped and lower-cased.
    The first ``[character]`` tag found in a line switches the active
    speaker; a line without a tag keeps the previous speaker (initially
    ``"narrator"``). The text after the last ``]`` of the line is passed to
    ``tts_astream`` together with the speaker's voice id, and the returned
    audio chunks are appended to the output file in line order.

    Args:
        annotated_text: Text whose lines may carry ``[character]`` tags.
        character_to_voice: Maps lower-cased character names to voice ids.

    Returns:
        Path of the written file, ``data/books/<uuid4>.wav``.

    Raises:
        KeyError: If the active character is missing from
            ``character_to_voice``.
    """
    speaker_tag = re.compile(r"\[[\w\s]+\]")
    streams = []
    current_character = "narrator"
    for line in annotated_text.splitlines():
        cleaned_line = line.strip().lower()
        if not cleaned_line:
            continue
        # Explicit match check instead of a bare ``except``: a line without
        # a tag simply continues with the previous speaker.
        tag = speaker_tag.search(cleaned_line)
        if tag is not None:
            current_character = tag.group(0)[1:-1]
        voice_id = character_to_voice[current_character]
        character_text = cleaned_line[cleaned_line.rfind("]") + 1:].lstrip()
        if not character_text:
            # A line holding only a tag has nothing to speak; the speaker
            # switch above is still kept for the lines that follow.
            continue
        # tts_astream is an async generator, so nothing is requested here;
        # the work starts only when the stream is iterated below.
        streams.append(tts_astream(voice_id=voice_id, text=character_text))

    save_dir = Path("data") / "books"
    # parents=True: "data" itself may not exist yet on a fresh checkout.
    save_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): the file is named ".wav", but the TTS call is given no
    # output format — confirm the returned bytes really are WAV.
    save_path = save_dir / f"{uuid4()}.wav"
    with open(save_path, "wb") as ab:
        # Streams are drained one after another, which keeps the audio in
        # line order; the lines are therefore synthesized sequentially.
        for stream in streams:
            async for chunk in stream:
                if chunk:
                    ab.write(chunk)
    return save_path
146
 
147
  @staticmethod
148
  def get_unique_characters(annotated_text: str) -> list[str]:
 
219
  return "\n".join([doc.page_content for doc in documents])
220
 
221
 
222
+ async def respond(text, uploaded_file):
223
  # Check if a file is uploaded
224
  if uploaded_file is not None:
225
  # Save the uploaded file temporarily to check its size
 
248
  unique_characters = builder.get_unique_characters(annotated_text)
249
  character_to_gender = builder.classify_characters(text, unique_characters)
250
  character_to_voice = builder.map_characters_to_voices(character_to_gender)
251
+ save_path = await builder.generate_audio(annotated_text, character_to_voice)
252
+
253
+ audio, sr = librosa.load(str(save_path), sr=None)
254
+ return (sr, audio)
255
 
256
 
257
  def refresh():
src/tts.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import typing as t
3
+
4
+ from dotenv import load_dotenv
5
+ from elevenlabs.client import AsyncElevenLabs, ElevenLabs
6
+
7
+
8
+ load_dotenv()
9
+
10
+
11
# Both clients authenticate with the same key, read once from the
# "11LABS_API_KEY" environment variable. os.getenv returns None when the
# variable is unset, so a missing key is not reported here.
_ELEVEN_API_KEY = os.getenv("11LABS_API_KEY")

# Blocking client, used by tts_stream / tts.
ELEVEN_CLIENT = ElevenLabs(api_key=_ELEVEN_API_KEY)

# Asyncio client, used by tts_astream.
ELEVEN_CLIENT_ASYNC = AsyncElevenLabs(api_key=_ELEVEN_API_KEY)
14
+
15
+
16
def tts_stream(voice_id: str, text: str) -> t.Iterator[bytes]:
    """Yield the non-empty audio chunks for ``text`` from the blocking client.

    Args:
        voice_id: Voice identifier passed to ``text_to_speech.convert``.
        text: Text to synthesize.

    Yields:
        Each truthy chunk produced by ``ELEVEN_CLIENT``, in order.
    """
    audio_chunks = ELEVEN_CLIENT.text_to_speech.convert(voice_id=voice_id, text=text)
    # filter(None, ...) drops falsy (empty) chunks and passes the rest on.
    yield from filter(None, audio_chunks)
21
+
22
+
23
def tts(voice_id: str, text: str):
    """Return the complete synthesized audio for ``text`` as one bytes object.

    Drains ``tts_stream`` and concatenates every chunk it yields.
    """
    return b"".join(tts_stream(voice_id=voice_id, text=text))
27
+
28
+
29
async def tts_astream(voice_id: str, text: str) -> t.AsyncIterator[bytes]:
    """Asynchronously yield the non-empty audio chunks for ``text``.

    Args:
        voice_id: Voice identifier passed to ``text_to_speech.convert``.
        text: Text to synthesize.

    Yields:
        Each truthy chunk produced by ``ELEVEN_CLIENT_ASYNC``, in order.
    """
    audio_chunks = ELEVEN_CLIENT_ASYNC.text_to_speech.convert(voice_id=voice_id, text=text)
    async for piece in audio_chunks:
        if not piece:
            # Skip empty chunks so consumers only ever see real data.
            continue
        yield piece