Spaces:
Sleeping
Sleeping
Andrei Kulchyk
Andrei Kulchyk
committed on
Convert TTS async-ly (#3)
Browse files
Co-authored-by: Andrei Kulchyk <[email protected]>
- app.py +37 -25
- src/tts.py +33 -0
app.py
CHANGED
@@ -1,14 +1,20 @@
|
|
1 |
import json
|
2 |
import os
|
3 |
import re
|
|
|
|
|
4 |
|
5 |
import librosa
|
6 |
import requests
|
7 |
import gradio as gr
|
8 |
import pandas as pd
|
9 |
from dotenv import load_dotenv
|
10 |
-
from
|
11 |
from langchain_community.document_loaders import PyPDFLoader
|
|
|
|
|
|
|
|
|
12 |
|
13 |
load_dotenv()
|
14 |
|
@@ -77,7 +83,8 @@ class AudiobookBuilder:
|
|
77 |
self._aiml_base_url = aiml_base_url
|
78 |
self._aiml_client = OpenAI(api_key=api_key, base_url=self._aiml_base_url)
|
79 |
self._default_narrator_voice = "ALY2WaJPY0oBJlqpQbfW"
|
80 |
-
self._eleven_api_key = eleven_api_key or os.environ["
|
|
|
81 |
|
82 |
def annotate_text(self, text: str) -> str:
|
83 |
response = self._send_request_to_llm(messages=[
|
@@ -108,29 +115,34 @@ class AudiobookBuilder:
|
|
108 |
)
|
109 |
return json.loads(response["choices"][0]["message"]["content"])
|
110 |
|
111 |
-
def generate_audio(
|
112 |
self,
|
113 |
annotated_text: str,
|
114 |
character_to_voice: dict[str, str],
|
115 |
-
|
116 |
-
|
117 |
-
) -> None:
|
118 |
current_character = "narrator"
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
|
|
|
|
|
|
|
|
|
|
132 |
if chunk:
|
133 |
-
ab.write(chunk)
|
|
|
134 |
|
135 |
@staticmethod
|
136 |
def get_unique_characters(annotated_text: str) -> list[str]:
|
@@ -207,7 +219,7 @@ def parse_pdf(file_path):
|
|
207 |
return "\n".join([doc.page_content for doc in documents])
|
208 |
|
209 |
|
210 |
-
def respond(text, uploaded_file):
|
211 |
# Check if a file is uploaded
|
212 |
if uploaded_file is not None:
|
213 |
# Save the uploaded file temporarily to check its size
|
@@ -236,10 +248,10 @@ def respond(text, uploaded_file):
|
|
236 |
unique_characters = builder.get_unique_characters(annotated_text)
|
237 |
character_to_gender = builder.classify_characters(text, unique_characters)
|
238 |
character_to_voice = builder.map_characters_to_voices(character_to_gender)
|
239 |
-
builder.generate_audio(annotated_text, character_to_voice)
|
240 |
-
|
241 |
-
audio, sr = librosa.load(
|
242 |
-
return (sr, audio)
|
243 |
|
244 |
|
245 |
def refresh():
|
|
|
1 |
import json
|
2 |
import os
|
3 |
import re
|
4 |
+
from pathlib import Path
|
5 |
+
from uuid import uuid4
|
6 |
|
7 |
import librosa
|
8 |
import requests
|
9 |
import gradio as gr
|
10 |
import pandas as pd
|
11 |
from dotenv import load_dotenv
|
12 |
+
from elevenlabs import AsyncElevenLabs
|
13 |
from langchain_community.document_loaders import PyPDFLoader
|
14 |
+
from openai import OpenAI
|
15 |
+
|
16 |
+
from src.tts import tts_astream
|
17 |
+
|
18 |
|
19 |
load_dotenv()
|
20 |
|
|
|
83 |
self._aiml_base_url = aiml_base_url
|
84 |
self._aiml_client = OpenAI(api_key=api_key, base_url=self._aiml_base_url)
|
85 |
self._default_narrator_voice = "ALY2WaJPY0oBJlqpQbfW"
|
86 |
+
self._eleven_api_key = eleven_api_key or os.environ["11LABS_API_KEY"]
|
87 |
+
self._eleven_client = AsyncElevenLabs(api_key=self._eleven_api_key)
|
88 |
|
89 |
def annotate_text(self, text: str) -> str:
|
90 |
response = self._send_request_to_llm(messages=[
|
|
|
115 |
)
|
116 |
return json.loads(response["choices"][0]["message"]["content"])
|
117 |
|
118 |
+
async def generate_audio(
    self,
    annotated_text: str,
    character_to_voice: dict[str, str],
) -> Path:
    """Convert annotated text to speech and write the result to a WAV file.

    Lines may carry a leading "[character]" tag; the most recent tag selects
    the active voice until another tag appears (the initial speaker is
    "narrator").

    Args:
        annotated_text: Text with optional "[character]" speaker tags.
        character_to_voice: Maps character names to ElevenLabs voice ids.

    Returns:
        Path to the generated audio file under data/books/.

    Raises:
        KeyError: If a tagged character is missing from character_to_voice.
    """
    tts_streams = []
    current_character = "narrator"
    for line in annotated_text.splitlines():
        cleaned_line = line.strip().lower()
        if not cleaned_line:
            continue
        try:
            # A "[character]" tag on the line switches the active speaker.
            current_character = re.findall(r"\[[\w\s]+\]", cleaned_line)[0][1:-1]
        except IndexError:
            # No tag on this line: keep the previous speaker.
            # (Was a bare `except: pass`, which also swallowed KeyboardInterrupt.)
            pass
        voice_id = character_to_voice[current_character]
        character_text = cleaned_line[cleaned_line.rfind("]") + 1:].lstrip()
        tts_streams.append(tts_astream(voice_id=voice_id, text=character_text))

    save_dir = Path("data") / "books"
    # parents=True: "data/" may not exist yet on a fresh checkout;
    # mkdir(exist_ok=True) alone raises FileNotFoundError in that case.
    save_dir.mkdir(parents=True, exist_ok=True)
    save_path = save_dir / f"{uuid4()}.wav"
    with open(save_path, "wb") as audio_file:
        for stream in tts_streams:
            async for chunk in stream:
                if chunk:
                    audio_file.write(chunk)
    return save_path
|
146 |
|
147 |
@staticmethod
|
148 |
def get_unique_characters(annotated_text: str) -> list[str]:
|
|
|
219 |
return "\n".join([doc.page_content for doc in documents])
|
220 |
|
221 |
|
222 |
+
async def respond(text, uploaded_file):
|
223 |
# Check if a file is uploaded
|
224 |
if uploaded_file is not None:
|
225 |
# Save the uploaded file temporarily to check its size
|
|
|
248 |
unique_characters = builder.get_unique_characters(annotated_text)
|
249 |
character_to_gender = builder.classify_characters(text, unique_characters)
|
250 |
character_to_voice = builder.map_characters_to_voices(character_to_gender)
|
251 |
+
save_path = await builder.generate_audio(annotated_text, character_to_voice)
|
252 |
+
|
253 |
+
audio, sr = librosa.load(str(save_path), sr=None)
|
254 |
+
return (sr, audio)
|
255 |
|
256 |
|
257 |
def refresh():
|
src/tts.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import typing as t
|
3 |
+
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
from elevenlabs.client import AsyncElevenLabs, ElevenLabs
|
6 |
+
|
7 |
+
|
8 |
+
load_dotenv()
|
9 |
+
|
10 |
+
|
11 |
+
ELEVEN_CLIENT = ElevenLabs(api_key=os.getenv("11LABS_API_KEY"))
|
12 |
+
|
13 |
+
ELEVEN_CLIENT_ASYNC = AsyncElevenLabs(api_key=os.getenv("11LABS_API_KEY"))
|
14 |
+
|
15 |
+
|
16 |
+
def tts_stream(voice_id: str, text: str) -> t.Iterator[bytes]:
    """Yield non-empty audio chunks synthesized for *text* with *voice_id*."""
    for chunk in ELEVEN_CLIENT.text_to_speech.convert(voice_id=voice_id, text=text):
        if chunk:
            yield chunk
|
21 |
+
|
22 |
+
|
23 |
+
def tts(voice_id: str, text: str) -> bytes:
    """Synchronously synthesize *text* and return the complete audio.

    Convenience wrapper around tts_stream that concatenates every streamed
    chunk into one bytes object. (Return annotation added for consistency
    with the annotated sibling functions in this module.)
    """
    return b"".join(tts_stream(voice_id=voice_id, text=text))
|
27 |
+
|
28 |
+
|
29 |
+
async def tts_astream(voice_id: str, text: str) -> t.AsyncIterator[bytes]:
    """Asynchronously yield non-empty audio chunks for *text* with *voice_id*."""
    response = ELEVEN_CLIENT_ASYNC.text_to_speech.convert(voice_id=voice_id, text=text)
    async for audio_chunk in response:
        if audio_chunk:
            yield audio_chunk
|