Spaces:
Sleeping
Sleeping
navalnica
commited on
Commit
Β·
9281119
1
Parent(s):
95849c2
async sound effects generation; add text samples; improve UI; add input len check
Browse files- README.md +0 -8
- app.py +43 -15
- data/samples_to_split.py +3 -3
- pg.ipynb +320 -132
- src/audio_generators.py +68 -42
- src/config.py +17 -0
- src/tts.py +17 -2
README.md
CHANGED
@@ -20,23 +20,15 @@ python_version: 3.11
|
|
20 |
- add context
|
21 |
- filter, apply only for long phrases
|
22 |
- only for narrator?
|
23 |
-
- checkbox! make effects great again (no) optional
|
24 |
- stability
|
25 |
- add limit on input text size (5000 chars)
|
26 |
- improve UI
|
27 |
- add error box
|
28 |
-
- add samples
|
29 |
- show character parts
|
30 |
-
- remove file upload pane
|
31 |
-
- labels on how long to wait
|
32 |
-
- labels describing components
|
33 |
-
- header and description
|
34 |
- prepare slides / story
|
35 |
- testing
|
36 |
- eval current execution time
|
37 |
-
- test on different text inputs
|
38 |
- optimizations
|
39 |
-
- generate audio effects asynchronously
|
40 |
- combine sequential phrases of same character in single phrase
|
41 |
- support large texts. use batching. problem: how to ensure same characters?
|
42 |
- can detect characters in first prompt, then split text in each batch into character phrases
|
|
|
20 |
- add context
|
21 |
- filter, apply only for long phrases
|
22 |
- only for narrator?
|
|
|
23 |
- stability
|
24 |
- add limit on input text size (5000 chars)
|
25 |
- improve UI
|
26 |
- add error box
|
|
|
27 |
- show character parts
|
|
|
|
|
|
|
|
|
28 |
- prepare slides / story
|
29 |
- testing
|
30 |
- eval current execution time
|
|
|
31 |
- optimizations
|
|
|
32 |
- combine sequential phrases of same character in single phrase
|
33 |
- support large texts. use batching. problem: how to ensure same characters?
|
34 |
- can detect characters in first prompt, then split text in each batch into character phrases
|
app.py
CHANGED
@@ -9,7 +9,8 @@ from langchain_community.document_loaders import PyPDFLoader
|
|
9 |
load_dotenv()
|
10 |
|
11 |
from src.builder import AudiobookBuilder
|
12 |
-
from src.config import logger, FILE_SIZE_MAX
|
|
|
13 |
|
14 |
|
15 |
def get_auth_params():
|
@@ -57,6 +58,9 @@ async def respond(
|
|
57 |
logger.exception(e)
|
58 |
return (None, str(e))
|
59 |
|
|
|
|
|
|
|
60 |
builder = AudiobookBuilder()
|
61 |
audio_fp = await builder.run(text=text, generate_effects=generate_effects)
|
62 |
return audio_fp, ""
|
@@ -67,34 +71,58 @@ def refresh():
|
|
67 |
|
68 |
|
69 |
with gr.Blocks(title="Audiobooks Generation") as ui:
|
70 |
-
gr.Markdown(
|
71 |
|
72 |
with gr.Row(variant="panel"):
|
73 |
-
text_input = gr.Textbox(label="Enter the book text", lines=20)
|
74 |
-
# Add a file upload field for .txt and .pdf files
|
75 |
file_input = gr.File(
|
76 |
-
label="Upload a text file or PDF",
|
|
|
|
|
77 |
)
|
78 |
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
-
effects_generation_checkbox = gr.Checkbox(label="
|
|
|
|
|
|
|
|
|
86 |
|
87 |
-
submit_button = gr.Button("Submit")
|
88 |
submit_button.click(
|
89 |
fn=respond,
|
90 |
-
inputs=[
|
|
|
|
|
|
|
|
|
91 |
outputs=[
|
92 |
audio_output,
|
93 |
error_output,
|
94 |
], # Include the audio output and error message output
|
95 |
)
|
96 |
-
|
97 |
-
refresh_button = gr.Button("Refresh")
|
98 |
refresh_button.click(
|
99 |
fn=refresh,
|
100 |
inputs=[],
|
|
|
9 |
load_dotenv()
|
10 |
|
11 |
from src.builder import AudiobookBuilder
|
12 |
+
from src.config import logger, FILE_SIZE_MAX, MAX_TEXT_LEN, DESCRIPTION
|
13 |
+
from data import samples_to_split as samples
|
14 |
|
15 |
|
16 |
def get_auth_params():
|
|
|
58 |
logger.exception(e)
|
59 |
return (None, str(e))
|
60 |
|
61 |
+
if len(text) > MAX_TEXT_LEN:
|
62 |
+
raise ValueError(len(text)) # TODO
|
63 |
+
|
64 |
builder = AudiobookBuilder()
|
65 |
audio_fp = await builder.run(text=text, generate_effects=generate_effects)
|
66 |
return audio_fp, ""
|
|
|
71 |
|
72 |
|
73 |
with gr.Blocks(title="Audiobooks Generation") as ui:
|
74 |
+
gr.Markdown(DESCRIPTION)
|
75 |
|
76 |
with gr.Row(variant="panel"):
|
77 |
+
text_input = gr.Textbox(label="Enter the book text here", lines=20)
|
|
|
78 |
file_input = gr.File(
|
79 |
+
label="Upload a text file or PDF",
|
80 |
+
file_types=[".txt", ".pdf"],
|
81 |
+
visible=False,
|
82 |
)
|
83 |
|
84 |
+
examples = gr.Examples(
|
85 |
+
examples=[
|
86 |
+
[samples.GATSBY_1],
|
87 |
+
[samples.GATSBY_2],
|
88 |
+
[samples.WONDERFUL_CHRISTMAS_1],
|
89 |
+
[samples.WONDERFUL_CHRISTMAS_2],
|
90 |
+
],
|
91 |
+
inputs=text_input,
|
92 |
+
label="Sample Inputs",
|
93 |
+
example_labels=[
|
94 |
+
"Gatsby 1",
|
95 |
+
"Gatsby 2",
|
96 |
+
"Wonderful Christmas 1",
|
97 |
+
"Wonderful Christmas 2",
|
98 |
+
],
|
99 |
+
)
|
100 |
+
|
101 |
+
audio_output = gr.Audio(
|
102 |
+
label='Generated audio. Please wait for the waveform to appear, before hitting "Play"',
|
103 |
+
type="filepath",
|
104 |
+
)
|
105 |
+
# error output is hidden initially
|
106 |
+
error_output = gr.Textbox(label="Error Message", interactive=False, visible=False)
|
107 |
|
108 |
+
effects_generation_checkbox = gr.Checkbox(label="Add background effects")
|
109 |
+
|
110 |
+
with gr.Row(variant="panel"):
|
111 |
+
submit_button = gr.Button("Generate the audiobook", variant="primary")
|
112 |
+
refresh_button = gr.Button("Refresh", variant="secondary")
|
113 |
|
|
|
114 |
submit_button.click(
|
115 |
fn=respond,
|
116 |
+
inputs=[
|
117 |
+
text_input,
|
118 |
+
file_input,
|
119 |
+
effects_generation_checkbox,
|
120 |
+
], # Include the uploaded file as an input
|
121 |
outputs=[
|
122 |
audio_output,
|
123 |
error_output,
|
124 |
], # Include the audio output and error message output
|
125 |
)
|
|
|
|
|
126 |
refresh_button.click(
|
127 |
fn=refresh,
|
128 |
inputs=[],
|
data/samples_to_split.py
CHANGED
@@ -157,17 +157,17 @@ Frank! why in the world donβt you come to dinner? There is a gentleman
|
|
157 |
at table who came to see papa on business, and I ran away after the
|
158 |
soupβI couldnβt eat my dinner one bit, without you.β
|
159 |
|
160 |
-
βYouβll
|
161 |
who has to hoe corn all day, canβt stop to eat.β
|
162 |
|
163 |
βO, Frank Hallock! _for shame!_β cried Kate, putting down her indignant
|
164 |
foot without being able to make noise enough about it to disturb an
|
165 |
earthworm.
|
166 |
|
167 |
-
βItβs
|
168 |
might.
|
169 |
|
170 |
-
βIt is
|
171 |
the circus pass by makes you say such things, I am glad you canβt go to
|
172 |
see it.β\
|
173 |
"""
|
|
|
157 |
at table who came to see papa on business, and I ran away after the
|
158 |
soupβI couldnβt eat my dinner one bit, without you.β
|
159 |
|
160 |
+
βYouβll HAVE to, I reckon,β returned Frank; βa poor fellow, like me,
|
161 |
who has to hoe corn all day, canβt stop to eat.β
|
162 |
|
163 |
βO, Frank Hallock! _for shame!_β cried Kate, putting down her indignant
|
164 |
foot without being able to make noise enough about it to disturb an
|
165 |
earthworm.
|
166 |
|
167 |
+
βItβs TRUE,β responded Frank, pitching into the next hill with all his
|
168 |
might.
|
169 |
|
170 |
+
βIt is NOT true,β cried Kate; βand if just running off to LOOK at
|
171 |
the circus pass by makes you say such things, I am glad you canβt go to
|
172 |
see it.β\
|
173 |
"""
|
pg.ipynb
CHANGED
@@ -80,13 +80,13 @@
|
|
80 |
"name": "stdout",
|
81 |
"output_type": "stream",
|
82 |
"text": [
|
83 |
-
"(
|
84 |
]
|
85 |
}
|
86 |
],
|
87 |
"source": [
|
88 |
"# df = pd.read_csv('data/11labs_tts_voices.csv')\n",
|
89 |
-
"df = pd.read_csv('data/11labs_available_tts_voices.csv')\n",
|
90 |
"df[\"age\"] = df[\"age\"].str.replace(\" \", \"_\").str.replace(\"-\", \"_\")\n",
|
91 |
"print(df.shape)"
|
92 |
]
|
@@ -99,9 +99,9 @@
|
|
99 |
{
|
100 |
"data": {
|
101 |
"text/plain": [
|
102 |
-
"Index(['voice_id', 'name', 'preview_url', '
|
103 |
-
" '
|
104 |
-
" 'category', 'language', 'descriptive'],\n",
|
105 |
" dtype='object')"
|
106 |
]
|
107 |
},
|
@@ -123,9 +123,8 @@
|
|
123 |
"data": {
|
124 |
"text/plain": [
|
125 |
"language\n",
|
126 |
-
"NaN
|
127 |
-
"en
|
128 |
-
"romanian 1\n",
|
129 |
"Name: count, dtype: int64"
|
130 |
]
|
131 |
},
|
@@ -147,10 +146,8 @@
|
|
147 |
"data": {
|
148 |
"text/plain": [
|
149 |
"gender\n",
|
150 |
-
"female
|
151 |
-
"male
|
152 |
-
"neutral 6\n",
|
153 |
-
"non-binary 1\n",
|
154 |
"Name: count, dtype: int64"
|
155 |
]
|
156 |
},
|
@@ -172,10 +169,9 @@
|
|
172 |
"data": {
|
173 |
"text/plain": [
|
174 |
"age\n",
|
175 |
-
"middle_aged
|
176 |
-
"young
|
177 |
-
"old
|
178 |
-
"NaN 2\n",
|
179 |
"Name: count, dtype: int64"
|
180 |
]
|
181 |
},
|
@@ -190,7 +186,7 @@
|
|
190 |
},
|
191 |
{
|
192 |
"cell_type": "code",
|
193 |
-
"execution_count":
|
194 |
"metadata": {},
|
195 |
"outputs": [
|
196 |
{
|
@@ -216,43 +212,133 @@
|
|
216 |
" <th>gender</th>\n",
|
217 |
" <th>female</th>\n",
|
218 |
" <th>male</th>\n",
|
219 |
-
" <th>neutral</th>\n",
|
220 |
-
" <th>non-binary</th>\n",
|
221 |
" </tr>\n",
|
222 |
" <tr>\n",
|
223 |
" <th>age</th>\n",
|
224 |
" <th></th>\n",
|
225 |
" <th></th>\n",
|
226 |
-
" <th></th>\n",
|
227 |
-
" <th></th>\n",
|
228 |
" </tr>\n",
|
229 |
" </thead>\n",
|
230 |
" <tbody>\n",
|
231 |
" <tr>\n",
|
232 |
" <th>middle_aged</th>\n",
|
233 |
-
" <td>48</td>\n",
|
234 |
-
" <td>130</td>\n",
|
235 |
" <td>4</td>\n",
|
236 |
-
" <td>
|
237 |
" </tr>\n",
|
238 |
" <tr>\n",
|
239 |
" <th>old</th>\n",
|
240 |
-
" <td>
|
241 |
-
" <td>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
242 |
" <td>1</td>\n",
|
|
|
|
|
|
|
243 |
" <td>0</td>\n",
|
|
|
|
|
244 |
" </tr>\n",
|
245 |
" <tr>\n",
|
246 |
-
" <th>
|
247 |
-
" <
|
248 |
-
" <td>59</td>\n",
|
249 |
" <td>1</td>\n",
|
250 |
" <td>0</td>\n",
|
|
|
251 |
" </tr>\n",
|
252 |
" <tr>\n",
|
253 |
-
" <th>
|
|
|
|
|
254 |
" <td>0</td>\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
255 |
" <td>2</td>\n",
|
|
|
|
|
|
|
|
|
|
|
256 |
" <td>0</td>\n",
|
257 |
" <td>0</td>\n",
|
258 |
" </tr>\n",
|
@@ -261,26 +347,31 @@
|
|
261 |
"</div>"
|
262 |
],
|
263 |
"text/plain": [
|
264 |
-
"
|
265 |
-
"
|
266 |
-
"
|
267 |
-
"
|
268 |
-
"
|
269 |
-
"
|
|
|
|
|
|
|
270 |
]
|
271 |
},
|
272 |
-
"execution_count":
|
273 |
"metadata": {},
|
274 |
"output_type": "execute_result"
|
275 |
}
|
276 |
],
|
277 |
"source": [
|
278 |
-
"df.groupby([
|
|
|
|
|
279 |
]
|
280 |
},
|
281 |
{
|
282 |
"cell_type": "code",
|
283 |
-
"execution_count":
|
284 |
"metadata": {},
|
285 |
"outputs": [
|
286 |
{
|
@@ -303,17 +394,13 @@
|
|
303 |
"<table border=\"1\" class=\"dataframe\">\n",
|
304 |
" <thead>\n",
|
305 |
" <tr style=\"text-align: right;\">\n",
|
306 |
-
" <th
|
307 |
-
" <th>
|
308 |
-
" <th>
|
309 |
-
" <th>
|
310 |
-
" <th>neutral</th>\n",
|
311 |
-
" <th>non-binary</th>\n",
|
312 |
" </tr>\n",
|
313 |
" <tr>\n",
|
314 |
-
" <th>
|
315 |
-
" <th>age</th>\n",
|
316 |
-
" <th></th>\n",
|
317 |
" <th></th>\n",
|
318 |
" <th></th>\n",
|
319 |
" <th></th>\n",
|
@@ -321,82 +408,197 @@
|
|
321 |
" </thead>\n",
|
322 |
" <tbody>\n",
|
323 |
" <tr>\n",
|
324 |
-
" <th
|
325 |
-
" <th>middle_aged</th>\n",
|
326 |
-
" <td>30</td>\n",
|
327 |
-
" <td>91</td>\n",
|
328 |
" <td>2</td>\n",
|
329 |
-
" <td>
|
|
|
330 |
" </tr>\n",
|
331 |
" <tr>\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
332 |
" <th>old</th>\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
333 |
" <td>3</td>\n",
|
334 |
" <td>3</td>\n",
|
335 |
-
" <td>
|
336 |
-
" <td>0</td>\n",
|
337 |
" </tr>\n",
|
338 |
" <tr>\n",
|
339 |
-
" <th>
|
340 |
-
" <td>
|
341 |
-
" <td>
|
342 |
-
" <td>
|
343 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
344 |
" </tr>\n",
|
345 |
" <tr>\n",
|
346 |
-
" <th>
|
347 |
-
" <
|
348 |
-
" <
|
349 |
-
" <
|
350 |
-
" <td>0</td>\n",
|
351 |
" </tr>\n",
|
|
|
|
|
352 |
" <tr>\n",
|
353 |
-
" <th>
|
354 |
-
" <th>
|
355 |
" <td>1</td>\n",
|
356 |
-
" <td>
|
357 |
-
" <td>0</td>\n",
|
358 |
-
" <td>0</td>\n",
|
359 |
" </tr>\n",
|
360 |
" <tr>\n",
|
361 |
-
" <th
|
362 |
-
" <th>middle_aged</th>\n",
|
363 |
-
" <td>18</td>\n",
|
364 |
-
" <td>39</td>\n",
|
365 |
" <td>2</td>\n",
|
366 |
" <td>1</td>\n",
|
367 |
" </tr>\n",
|
368 |
" <tr>\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
369 |
" <th>old</th>\n",
|
370 |
-
" <td>
|
371 |
-
" <td>
|
372 |
-
" <td>1</td>\n",
|
373 |
-
" <td>0</td>\n",
|
374 |
" </tr>\n",
|
375 |
" <tr>\n",
|
376 |
" <th>young</th>\n",
|
377 |
-
" <td>
|
378 |
-
" <td>
|
379 |
-
" <td>1</td>\n",
|
380 |
-
" <td>0</td>\n",
|
381 |
" </tr>\n",
|
382 |
" </tbody>\n",
|
383 |
"</table>\n",
|
384 |
"</div>"
|
385 |
],
|
386 |
"text/plain": [
|
387 |
-
"gender female male
|
388 |
-
"language age
|
389 |
-
"en middle_aged
|
390 |
-
"
|
391 |
-
"
|
392 |
-
"
|
393 |
-
"
|
394 |
-
"NaN middle_aged 18 39 2 1\n",
|
395 |
-
" old 96 36 1 0\n",
|
396 |
-
" young 49 21 1 0"
|
397 |
]
|
398 |
},
|
399 |
-
"execution_count":
|
400 |
"metadata": {},
|
401 |
"output_type": "execute_result"
|
402 |
}
|
@@ -407,54 +609,33 @@
|
|
407 |
},
|
408 |
{
|
409 |
"cell_type": "code",
|
410 |
-
"execution_count":
|
411 |
"metadata": {},
|
412 |
"outputs": [
|
413 |
{
|
414 |
"data": {
|
415 |
"text/plain": [
|
416 |
"descriptive\n",
|
417 |
-
"
|
418 |
-
"
|
419 |
-
"
|
420 |
-
"
|
421 |
-
"
|
422 |
-
"
|
423 |
-
"
|
424 |
-
"
|
425 |
-
"
|
426 |
-
"
|
427 |
-
"
|
428 |
-
"
|
429 |
-
"
|
430 |
-
"
|
431 |
-
"excited
|
432 |
-
"
|
433 |
-
"mature 8\n",
|
434 |
-
"classy 8\n",
|
435 |
-
"chill 7\n",
|
436 |
-
"neutral 7\n",
|
437 |
-
"crisp 6\n",
|
438 |
-
"gentle 6\n",
|
439 |
-
"childish 6\n",
|
440 |
-
"hyped 6\n",
|
441 |
-
"cute 5\n",
|
442 |
-
"sassy 4\n",
|
443 |
-
"soft 4\n",
|
444 |
-
"rough 3\n",
|
445 |
-
"grumpy 3\n",
|
446 |
-
"whispery 3\n",
|
447 |
-
"robotic 3\n",
|
448 |
-
"relaxed 3\n",
|
449 |
-
"raspy 2\n",
|
450 |
-
"cheeky 1\n",
|
451 |
-
"sad 1\n",
|
452 |
-
"anxious 1\n",
|
453 |
-
"motivational 1\n",
|
454 |
"Name: count, dtype: int64"
|
455 |
]
|
456 |
},
|
457 |
-
"execution_count":
|
458 |
"metadata": {},
|
459 |
"output_type": "execute_result"
|
460 |
}
|
@@ -463,6 +644,13 @@
|
|
463 |
"df['descriptive'].value_counts(dropna=False)"
|
464 |
]
|
465 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
466 |
{
|
467 |
"cell_type": "code",
|
468 |
"execution_count": 17,
|
|
|
80 |
"name": "stdout",
|
81 |
"output_type": "stream",
|
82 |
"text": [
|
83 |
+
"(34, 15)\n"
|
84 |
]
|
85 |
}
|
86 |
],
|
87 |
"source": [
|
88 |
"# df = pd.read_csv('data/11labs_tts_voices.csv')\n",
|
89 |
+
"df = pd.read_csv('data/11labs_available_tts_voices.reviewed.csv')\n",
|
90 |
"df[\"age\"] = df[\"age\"].str.replace(\" \", \"_\").str.replace(\"-\", \"_\")\n",
|
91 |
"print(df.shape)"
|
92 |
]
|
|
|
99 |
{
|
100 |
"data": {
|
101 |
"text/plain": [
|
102 |
+
"Index(['voice_id', 'name', 'preview_url', 'manual_quality_review', 'owner_id',\n",
|
103 |
+
" 'permission_on_resource', 'is_legacy', 'is_mixed', 'accent',\n",
|
104 |
+
" 'description', 'age', 'gender', 'category', 'language', 'descriptive'],\n",
|
105 |
" dtype='object')"
|
106 |
]
|
107 |
},
|
|
|
123 |
"data": {
|
124 |
"text/plain": [
|
125 |
"language\n",
|
126 |
+
"NaN 25\n",
|
127 |
+
"en 9\n",
|
|
|
128 |
"Name: count, dtype: int64"
|
129 |
]
|
130 |
},
|
|
|
146 |
"data": {
|
147 |
"text/plain": [
|
148 |
"gender\n",
|
149 |
+
"female 17\n",
|
150 |
+
"male 17\n",
|
|
|
|
|
151 |
"Name: count, dtype: int64"
|
152 |
]
|
153 |
},
|
|
|
169 |
"data": {
|
170 |
"text/plain": [
|
171 |
"age\n",
|
172 |
+
"middle_aged 13\n",
|
173 |
+
"young 11\n",
|
174 |
+
"old 10\n",
|
|
|
175 |
"Name: count, dtype: int64"
|
176 |
]
|
177 |
},
|
|
|
186 |
},
|
187 |
{
|
188 |
"cell_type": "code",
|
189 |
+
"execution_count": 9,
|
190 |
"metadata": {},
|
191 |
"outputs": [
|
192 |
{
|
|
|
212 |
" <th>gender</th>\n",
|
213 |
" <th>female</th>\n",
|
214 |
" <th>male</th>\n",
|
|
|
|
|
215 |
" </tr>\n",
|
216 |
" <tr>\n",
|
217 |
" <th>age</th>\n",
|
218 |
" <th></th>\n",
|
219 |
" <th></th>\n",
|
|
|
|
|
220 |
" </tr>\n",
|
221 |
" </thead>\n",
|
222 |
" <tbody>\n",
|
223 |
" <tr>\n",
|
224 |
" <th>middle_aged</th>\n",
|
|
|
|
|
225 |
" <td>4</td>\n",
|
226 |
+
" <td>9</td>\n",
|
227 |
" </tr>\n",
|
228 |
" <tr>\n",
|
229 |
" <th>old</th>\n",
|
230 |
+
" <td>5</td>\n",
|
231 |
+
" <td>5</td>\n",
|
232 |
+
" </tr>\n",
|
233 |
+
" <tr>\n",
|
234 |
+
" <th>young</th>\n",
|
235 |
+
" <td>8</td>\n",
|
236 |
+
" <td>3</td>\n",
|
237 |
+
" </tr>\n",
|
238 |
+
" </tbody>\n",
|
239 |
+
"</table>\n",
|
240 |
+
"</div>"
|
241 |
+
],
|
242 |
+
"text/plain": [
|
243 |
+
"gender female male\n",
|
244 |
+
"age \n",
|
245 |
+
"middle_aged 4 9\n",
|
246 |
+
"old 5 5\n",
|
247 |
+
"young 8 3"
|
248 |
+
]
|
249 |
+
},
|
250 |
+
"execution_count": 9,
|
251 |
+
"metadata": {},
|
252 |
+
"output_type": "execute_result"
|
253 |
+
}
|
254 |
+
],
|
255 |
+
"source": [
|
256 |
+
"df.groupby(['age', 'gender'], dropna=False)['voice_id'].count().unstack(fill_value=0)"
|
257 |
+
]
|
258 |
+
},
|
259 |
+
{
|
260 |
+
"cell_type": "code",
|
261 |
+
"execution_count": 13,
|
262 |
+
"metadata": {},
|
263 |
+
"outputs": [
|
264 |
+
{
|
265 |
+
"data": {
|
266 |
+
"text/html": [
|
267 |
+
"<div>\n",
|
268 |
+
"<style scoped>\n",
|
269 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
270 |
+
" vertical-align: middle;\n",
|
271 |
+
" }\n",
|
272 |
+
"\n",
|
273 |
+
" .dataframe tbody tr th {\n",
|
274 |
+
" vertical-align: top;\n",
|
275 |
+
" }\n",
|
276 |
+
"\n",
|
277 |
+
" .dataframe thead th {\n",
|
278 |
+
" text-align: right;\n",
|
279 |
+
" }\n",
|
280 |
+
"</style>\n",
|
281 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
282 |
+
" <thead>\n",
|
283 |
+
" <tr style=\"text-align: right;\">\n",
|
284 |
+
" <th></th>\n",
|
285 |
+
" <th>age</th>\n",
|
286 |
+
" <th>middle_aged</th>\n",
|
287 |
+
" <th>old</th>\n",
|
288 |
+
" <th>young</th>\n",
|
289 |
+
" </tr>\n",
|
290 |
+
" <tr>\n",
|
291 |
+
" <th>manual_quality_review</th>\n",
|
292 |
+
" <th>gender</th>\n",
|
293 |
+
" <th></th>\n",
|
294 |
+
" <th></th>\n",
|
295 |
+
" <th></th>\n",
|
296 |
+
" </tr>\n",
|
297 |
+
" </thead>\n",
|
298 |
+
" <tbody>\n",
|
299 |
+
" <tr>\n",
|
300 |
+
" <th rowspan=\"2\" valign=\"top\">bad</th>\n",
|
301 |
+
" <th>female</th>\n",
|
302 |
+
" <td>1</td>\n",
|
303 |
+
" <td>2</td>\n",
|
304 |
" <td>1</td>\n",
|
305 |
+
" </tr>\n",
|
306 |
+
" <tr>\n",
|
307 |
+
" <th>male</th>\n",
|
308 |
" <td>0</td>\n",
|
309 |
+
" <td>3</td>\n",
|
310 |
+
" <td>1</td>\n",
|
311 |
" </tr>\n",
|
312 |
" <tr>\n",
|
313 |
+
" <th rowspan=\"2\" valign=\"top\">medium</th>\n",
|
314 |
+
" <th>female</th>\n",
|
|
|
315 |
" <td>1</td>\n",
|
316 |
" <td>0</td>\n",
|
317 |
+
" <td>0</td>\n",
|
318 |
" </tr>\n",
|
319 |
" <tr>\n",
|
320 |
+
" <th>male</th>\n",
|
321 |
+
" <td>2</td>\n",
|
322 |
+
" <td>1</td>\n",
|
323 |
" <td>0</td>\n",
|
324 |
+
" </tr>\n",
|
325 |
+
" <tr>\n",
|
326 |
+
" <th rowspan=\"2\" valign=\"top\">ok</th>\n",
|
327 |
+
" <th>female</th>\n",
|
328 |
+
" <td>2</td>\n",
|
329 |
+
" <td>3</td>\n",
|
330 |
+
" <td>7</td>\n",
|
331 |
+
" </tr>\n",
|
332 |
+
" <tr>\n",
|
333 |
+
" <th>male</th>\n",
|
334 |
+
" <td>6</td>\n",
|
335 |
+
" <td>1</td>\n",
|
336 |
" <td>2</td>\n",
|
337 |
+
" </tr>\n",
|
338 |
+
" <tr>\n",
|
339 |
+
" <th>very bad</th>\n",
|
340 |
+
" <th>male</th>\n",
|
341 |
+
" <td>1</td>\n",
|
342 |
" <td>0</td>\n",
|
343 |
" <td>0</td>\n",
|
344 |
" </tr>\n",
|
|
|
347 |
"</div>"
|
348 |
],
|
349 |
"text/plain": [
|
350 |
+
"age middle_aged old young\n",
|
351 |
+
"manual_quality_review gender \n",
|
352 |
+
"bad female 1 2 1\n",
|
353 |
+
" male 0 3 1\n",
|
354 |
+
"medium female 1 0 0\n",
|
355 |
+
" male 2 1 0\n",
|
356 |
+
"ok female 2 3 7\n",
|
357 |
+
" male 6 1 2\n",
|
358 |
+
"very bad male 1 0 0"
|
359 |
]
|
360 |
},
|
361 |
+
"execution_count": 13,
|
362 |
"metadata": {},
|
363 |
"output_type": "execute_result"
|
364 |
}
|
365 |
],
|
366 |
"source": [
|
367 |
+
"df.groupby([\"manual_quality_review\", 'gender', \"age\"], dropna=False)[\n",
|
368 |
+
" \"voice_id\"\n",
|
369 |
+
"].count().unstack(fill_value=0)"
|
370 |
]
|
371 |
},
|
372 |
{
|
373 |
"cell_type": "code",
|
374 |
+
"execution_count": 16,
|
375 |
"metadata": {},
|
376 |
"outputs": [
|
377 |
{
|
|
|
394 |
"<table border=\"1\" class=\"dataframe\">\n",
|
395 |
" <thead>\n",
|
396 |
" <tr style=\"text-align: right;\">\n",
|
397 |
+
" <th>age</th>\n",
|
398 |
+
" <th>middle_aged</th>\n",
|
399 |
+
" <th>old</th>\n",
|
400 |
+
" <th>young</th>\n",
|
|
|
|
|
401 |
" </tr>\n",
|
402 |
" <tr>\n",
|
403 |
+
" <th>gender</th>\n",
|
|
|
|
|
404 |
" <th></th>\n",
|
405 |
" <th></th>\n",
|
406 |
" <th></th>\n",
|
|
|
408 |
" </thead>\n",
|
409 |
" <tbody>\n",
|
410 |
" <tr>\n",
|
411 |
+
" <th>female</th>\n",
|
|
|
|
|
|
|
412 |
" <td>2</td>\n",
|
413 |
+
" <td>3</td>\n",
|
414 |
+
" <td>7</td>\n",
|
415 |
" </tr>\n",
|
416 |
" <tr>\n",
|
417 |
+
" <th>male</th>\n",
|
418 |
+
" <td>6</td>\n",
|
419 |
+
" <td>1</td>\n",
|
420 |
+
" <td>2</td>\n",
|
421 |
+
" </tr>\n",
|
422 |
+
" </tbody>\n",
|
423 |
+
"</table>\n",
|
424 |
+
"</div>"
|
425 |
+
],
|
426 |
+
"text/plain": [
|
427 |
+
"age middle_aged old young\n",
|
428 |
+
"gender \n",
|
429 |
+
"female 2 3 7\n",
|
430 |
+
"male 6 1 2"
|
431 |
+
]
|
432 |
+
},
|
433 |
+
"execution_count": 16,
|
434 |
+
"metadata": {},
|
435 |
+
"output_type": "execute_result"
|
436 |
+
}
|
437 |
+
],
|
438 |
+
"source": [
|
439 |
+
"df[df['manual_quality_review'].isin(['ok'])].groupby(['gender', \"age\"], dropna=False)[\n",
|
440 |
+
" \"voice_id\"\n",
|
441 |
+
"].count().unstack(fill_value=0)"
|
442 |
+
]
|
443 |
+
},
|
444 |
+
{
|
445 |
+
"cell_type": "code",
|
446 |
+
"execution_count": null,
|
447 |
+
"metadata": {},
|
448 |
+
"outputs": [
|
449 |
+
{
|
450 |
+
"data": {
|
451 |
+
"text/html": [
|
452 |
+
"<div>\n",
|
453 |
+
"<style scoped>\n",
|
454 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
455 |
+
" vertical-align: middle;\n",
|
456 |
+
" }\n",
|
457 |
+
"\n",
|
458 |
+
" .dataframe tbody tr th {\n",
|
459 |
+
" vertical-align: top;\n",
|
460 |
+
" }\n",
|
461 |
+
"\n",
|
462 |
+
" .dataframe thead th {\n",
|
463 |
+
" text-align: right;\n",
|
464 |
+
" }\n",
|
465 |
+
"</style>\n",
|
466 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
467 |
+
" <thead>\n",
|
468 |
+
" <tr style=\"text-align: right;\">\n",
|
469 |
+
" <th>age</th>\n",
|
470 |
+
" <th>middle_aged</th>\n",
|
471 |
" <th>old</th>\n",
|
472 |
+
" <th>young</th>\n",
|
473 |
+
" </tr>\n",
|
474 |
+
" <tr>\n",
|
475 |
+
" <th>gender</th>\n",
|
476 |
+
" <th></th>\n",
|
477 |
+
" <th></th>\n",
|
478 |
+
" <th></th>\n",
|
479 |
+
" </tr>\n",
|
480 |
+
" </thead>\n",
|
481 |
+
" <tbody>\n",
|
482 |
+
" <tr>\n",
|
483 |
+
" <th>female</th>\n",
|
484 |
" <td>3</td>\n",
|
485 |
" <td>3</td>\n",
|
486 |
+
" <td>7</td>\n",
|
|
|
487 |
" </tr>\n",
|
488 |
" <tr>\n",
|
489 |
+
" <th>male</th>\n",
|
490 |
+
" <td>8</td>\n",
|
491 |
+
" <td>2</td>\n",
|
492 |
+
" <td>2</td>\n",
|
493 |
+
" </tr>\n",
|
494 |
+
" </tbody>\n",
|
495 |
+
"</table>\n",
|
496 |
+
"</div>"
|
497 |
+
],
|
498 |
+
"text/plain": [
|
499 |
+
"age middle_aged old young\n",
|
500 |
+
"gender \n",
|
501 |
+
"female 3 3 7\n",
|
502 |
+
"male 8 2 2"
|
503 |
+
]
|
504 |
+
},
|
505 |
+
"metadata": {},
|
506 |
+
"output_type": "display_data"
|
507 |
+
}
|
508 |
+
],
|
509 |
+
"source": [
|
510 |
+
"df[df['manual_quality_review'].isin(['ok', 'medium'])].groupby(['gender', \"age\"], dropna=False)[\n",
|
511 |
+
" \"voice_id\"\n",
|
512 |
+
"].count().unstack(fill_value=0)"
|
513 |
+
]
|
514 |
+
},
|
515 |
+
{
|
516 |
+
"cell_type": "code",
|
517 |
+
"execution_count": null,
|
518 |
+
"metadata": {},
|
519 |
+
"outputs": [],
|
520 |
+
"source": []
|
521 |
+
},
|
522 |
+
{
|
523 |
+
"cell_type": "code",
|
524 |
+
"execution_count": 10,
|
525 |
+
"metadata": {},
|
526 |
+
"outputs": [
|
527 |
+
{
|
528 |
+
"data": {
|
529 |
+
"text/html": [
|
530 |
+
"<div>\n",
|
531 |
+
"<style scoped>\n",
|
532 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
533 |
+
" vertical-align: middle;\n",
|
534 |
+
" }\n",
|
535 |
+
"\n",
|
536 |
+
" .dataframe tbody tr th {\n",
|
537 |
+
" vertical-align: top;\n",
|
538 |
+
" }\n",
|
539 |
+
"\n",
|
540 |
+
" .dataframe thead th {\n",
|
541 |
+
" text-align: right;\n",
|
542 |
+
" }\n",
|
543 |
+
"</style>\n",
|
544 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
545 |
+
" <thead>\n",
|
546 |
+
" <tr style=\"text-align: right;\">\n",
|
547 |
+
" <th></th>\n",
|
548 |
+
" <th>gender</th>\n",
|
549 |
+
" <th>female</th>\n",
|
550 |
+
" <th>male</th>\n",
|
551 |
" </tr>\n",
|
552 |
" <tr>\n",
|
553 |
+
" <th>language</th>\n",
|
554 |
+
" <th>age</th>\n",
|
555 |
+
" <th></th>\n",
|
556 |
+
" <th></th>\n",
|
|
|
557 |
" </tr>\n",
|
558 |
+
" </thead>\n",
|
559 |
+
" <tbody>\n",
|
560 |
" <tr>\n",
|
561 |
+
" <th rowspan=\"2\" valign=\"top\">en</th>\n",
|
562 |
+
" <th>middle_aged</th>\n",
|
563 |
" <td>1</td>\n",
|
564 |
+
" <td>5</td>\n",
|
|
|
|
|
565 |
" </tr>\n",
|
566 |
" <tr>\n",
|
567 |
+
" <th>young</th>\n",
|
|
|
|
|
|
|
568 |
" <td>2</td>\n",
|
569 |
" <td>1</td>\n",
|
570 |
" </tr>\n",
|
571 |
" <tr>\n",
|
572 |
+
" <th rowspan=\"3\" valign=\"top\">NaN</th>\n",
|
573 |
+
" <th>middle_aged</th>\n",
|
574 |
+
" <td>3</td>\n",
|
575 |
+
" <td>4</td>\n",
|
576 |
+
" </tr>\n",
|
577 |
+
" <tr>\n",
|
578 |
" <th>old</th>\n",
|
579 |
+
" <td>5</td>\n",
|
580 |
+
" <td>5</td>\n",
|
|
|
|
|
581 |
" </tr>\n",
|
582 |
" <tr>\n",
|
583 |
" <th>young</th>\n",
|
584 |
+
" <td>6</td>\n",
|
585 |
+
" <td>2</td>\n",
|
|
|
|
|
586 |
" </tr>\n",
|
587 |
" </tbody>\n",
|
588 |
"</table>\n",
|
589 |
"</div>"
|
590 |
],
|
591 |
"text/plain": [
|
592 |
+
"gender female male\n",
|
593 |
+
"language age \n",
|
594 |
+
"en middle_aged 1 5\n",
|
595 |
+
" young 2 1\n",
|
596 |
+
"NaN middle_aged 3 4\n",
|
597 |
+
" old 5 5\n",
|
598 |
+
" young 6 2"
|
|
|
|
|
|
|
599 |
]
|
600 |
},
|
601 |
+
"execution_count": 10,
|
602 |
"metadata": {},
|
603 |
"output_type": "execute_result"
|
604 |
}
|
|
|
609 |
},
|
610 |
{
|
611 |
"cell_type": "code",
|
612 |
+
"execution_count": 11,
|
613 |
"metadata": {},
|
614 |
"outputs": [
|
615 |
{
|
616 |
"data": {
|
617 |
"text/plain": [
|
618 |
"descriptive\n",
|
619 |
+
"pleasant 6\n",
|
620 |
+
"casual 5\n",
|
621 |
+
"confident 3\n",
|
622 |
+
"calm 3\n",
|
623 |
+
"NaN 3\n",
|
624 |
+
"intense 3\n",
|
625 |
+
"chill 2\n",
|
626 |
+
"formal 1\n",
|
627 |
+
"serious 1\n",
|
628 |
+
"mature 1\n",
|
629 |
+
"cute 1\n",
|
630 |
+
"crisp 1\n",
|
631 |
+
"upbeat 1\n",
|
632 |
+
"professional 1\n",
|
633 |
+
"excited 1\n",
|
634 |
+
"wise 1\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
635 |
"Name: count, dtype: int64"
|
636 |
]
|
637 |
},
|
638 |
+
"execution_count": 11,
|
639 |
"metadata": {},
|
640 |
"output_type": "execute_result"
|
641 |
}
|
|
|
644 |
"df['descriptive'].value_counts(dropna=False)"
|
645 |
]
|
646 |
},
|
647 |
+
{
|
648 |
+
"cell_type": "code",
|
649 |
+
"execution_count": null,
|
650 |
+
"metadata": {},
|
651 |
+
"outputs": [],
|
652 |
+
"source": []
|
653 |
+
},
|
654 |
{
|
655 |
"cell_type": "code",
|
656 |
"execution_count": 17,
|
src/audio_generators.py
CHANGED
@@ -9,8 +9,8 @@ from langchain_community.callbacks import get_openai_callback
|
|
9 |
from pydub import AudioSegment
|
10 |
|
11 |
from src.lc_callbacks import LCMessageLoggerAsync
|
12 |
-
from src.tts import tts_astream_consumed,
|
13 |
-
from src.utils import
|
14 |
from src.emotions.generation import (
|
15 |
EffectGeneratorAsync,
|
16 |
TextPreparationForTTSTaskOutput,
|
@@ -76,37 +76,38 @@ class AudioGeneratorWithEffects:
|
|
76 |
"""Main method to generate the audiobook with TTS, emotion, and sound effects."""
|
77 |
num_lines = len(text_split.phrases)
|
78 |
lines_for_sound_effect = self._select_lines_for_sound_effect(
|
79 |
-
num_lines,
|
|
|
80 |
)
|
81 |
logger.info(f"{generate_effects = }, {lines_for_sound_effect = }")
|
82 |
|
83 |
-
|
84 |
text_split, lines_for_sound_effect
|
85 |
)
|
86 |
|
87 |
tts_results, self.temp_files = await self._generate_tts_audio(
|
88 |
-
text_split,
|
89 |
)
|
90 |
|
91 |
-
# Step 3: Add sound effects to selected lines
|
92 |
audio_chunks = await self._add_sound_effects(
|
93 |
-
tts_results, lines_for_sound_effect,
|
94 |
)
|
95 |
|
96 |
-
# Step 4: Merge audio files
|
97 |
normalized_audio_chunks = self._normalize_audio_chunks(
|
98 |
audio_chunks, self.temp_files
|
99 |
)
|
|
|
100 |
final_output = self._merge_audio_files(
|
101 |
normalized_audio_chunks, save_path=out_path
|
102 |
)
|
103 |
|
104 |
-
# Clean up temporary files
|
105 |
self._cleanup_temp_files(self.temp_files)
|
106 |
|
107 |
return final_output
|
108 |
|
109 |
-
def _select_lines_for_sound_effect(
|
|
|
|
|
110 |
"""Select % of the lines randomly for sound effect generation."""
|
111 |
return random.sample(range(num_lines), k=int(fraction * num_lines))
|
112 |
|
@@ -159,7 +160,7 @@ class AudioGeneratorWithEffects:
|
|
159 |
async def _generate_tts_audio(
|
160 |
self,
|
161 |
text_split: SplitTextOutput,
|
162 |
-
|
163 |
character_to_voice: dict[str, str],
|
164 |
) -> tuple[list[str], list[str]]:
|
165 |
"""Generate TTS audio for modified text."""
|
@@ -174,20 +175,18 @@ class AudioGeneratorWithEffects:
|
|
174 |
# bytes_ = await consume_aiter(iter_)
|
175 |
return bytes_
|
176 |
|
177 |
-
for idx, (
|
178 |
-
zip(
|
179 |
):
|
180 |
voice_id = character_to_voice[character_phrase.character]
|
181 |
|
182 |
-
# Use the semaphore-protected TTS function
|
183 |
task = tts_astream_with_semaphore(
|
184 |
voice_id=voice_id,
|
185 |
-
text=
|
186 |
-
params=
|
187 |
)
|
188 |
tasks_for_tts.append(task)
|
189 |
|
190 |
-
# Gather all TTS results
|
191 |
tts_results = await asyncio.gather(*tasks_for_tts)
|
192 |
|
193 |
# Save the results to temporary files
|
@@ -206,38 +205,65 @@ class AudioGeneratorWithEffects:
|
|
206 |
self,
|
207 |
tts_audio_files: list[str],
|
208 |
lines_for_sound_effect: list[int],
|
209 |
-
|
210 |
temp_files: list[str],
|
211 |
) -> list[str]:
|
212 |
"""Add sound effects to the selected lines."""
|
213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
for idx, tts_filename in enumerate(tts_audio_files):
|
215 |
-
|
216 |
-
if idx in lines_for_sound_effect:
|
217 |
-
# Get next sound effect data
|
218 |
-
sound_effect_data = sound_emotion_results.pop(0)
|
219 |
-
sound_effect_filename = f"sound_effect_{idx}.wav"
|
220 |
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
# Add sound effect overlay
|
230 |
-
output_filename = add_overlay_for_audio(
|
231 |
-
main_audio_filename=tts_filename,
|
232 |
-
sound_effect_filename=sound_effect_filename,
|
233 |
-
cycling_effect=True,
|
234 |
-
decrease_effect_volume=5,
|
235 |
)
|
236 |
-
audio_chunks.append(output_filename)
|
237 |
-
temp_files.append(sound_effect_filename) # Track temp files
|
238 |
-
temp_files.append(output_filename)
|
239 |
else:
|
240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
|
242 |
return audio_chunks
|
243 |
|
|
|
9 |
from pydub import AudioSegment
|
10 |
|
11 |
from src.lc_callbacks import LCMessageLoggerAsync
|
12 |
+
from src.tts import tts_astream_consumed, sound_generation_consumed
|
13 |
+
from src.utils import consume_aiter
|
14 |
from src.emotions.generation import (
|
15 |
EffectGeneratorAsync,
|
16 |
TextPreparationForTTSTaskOutput,
|
|
|
76 |
"""Main method to generate the audiobook with TTS, emotion, and sound effects."""
|
77 |
num_lines = len(text_split.phrases)
|
78 |
lines_for_sound_effect = self._select_lines_for_sound_effect(
|
79 |
+
num_lines,
|
80 |
+
fraction=float(0.2 * generate_effects),
|
81 |
)
|
82 |
logger.info(f"{generate_effects = }, {lines_for_sound_effect = }")
|
83 |
|
84 |
+
data_for_tts, data_for_sound_effects = await self._prepare_text_for_tts(
|
85 |
text_split, lines_for_sound_effect
|
86 |
)
|
87 |
|
88 |
tts_results, self.temp_files = await self._generate_tts_audio(
|
89 |
+
text_split, data_for_tts, character_to_voice
|
90 |
)
|
91 |
|
|
|
92 |
audio_chunks = await self._add_sound_effects(
|
93 |
+
tts_results, lines_for_sound_effect, data_for_sound_effects, self.temp_files
|
94 |
)
|
95 |
|
|
|
96 |
normalized_audio_chunks = self._normalize_audio_chunks(
|
97 |
audio_chunks, self.temp_files
|
98 |
)
|
99 |
+
|
100 |
final_output = self._merge_audio_files(
|
101 |
normalized_audio_chunks, save_path=out_path
|
102 |
)
|
103 |
|
|
|
104 |
self._cleanup_temp_files(self.temp_files)
|
105 |
|
106 |
return final_output
|
107 |
|
108 |
+
def _select_lines_for_sound_effect(
|
109 |
+
self, num_lines: int, fraction: float
|
110 |
+
) -> list[int]:
|
111 |
"""Select % of the lines randomly for sound effect generation."""
|
112 |
return random.sample(range(num_lines), k=int(fraction * num_lines))
|
113 |
|
|
|
160 |
async def _generate_tts_audio(
|
161 |
self,
|
162 |
text_split: SplitTextOutput,
|
163 |
+
data_for_tts: list[dict],
|
164 |
character_to_voice: dict[str, str],
|
165 |
) -> tuple[list[str], list[str]]:
|
166 |
"""Generate TTS audio for modified text."""
|
|
|
175 |
# bytes_ = await consume_aiter(iter_)
|
176 |
return bytes_
|
177 |
|
178 |
+
for idx, (data_item, character_phrase) in enumerate(
|
179 |
+
zip(data_for_tts, text_split.phrases)
|
180 |
):
|
181 |
voice_id = character_to_voice[character_phrase.character]
|
182 |
|
|
|
183 |
task = tts_astream_with_semaphore(
|
184 |
voice_id=voice_id,
|
185 |
+
text=data_item["modified_text"],
|
186 |
+
params=data_item["params"],
|
187 |
)
|
188 |
tasks_for_tts.append(task)
|
189 |
|
|
|
190 |
tts_results = await asyncio.gather(*tasks_for_tts)
|
191 |
|
192 |
# Save the results to temporary files
|
|
|
205 |
self,
|
206 |
tts_audio_files: list[str],
|
207 |
lines_for_sound_effect: list[int],
|
208 |
+
data_for_sound_effects: list[dict],
|
209 |
temp_files: list[str],
|
210 |
) -> list[str]:
|
211 |
"""Add sound effects to the selected lines."""
|
212 |
+
|
213 |
+
semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)
|
214 |
+
|
215 |
+
async def _process_single_phrase(
|
216 |
+
tts_filename: str,
|
217 |
+
sound_effect_data: dict | None,
|
218 |
+
sound_effect_filename: str,
|
219 |
+
):
|
220 |
+
if sound_effect_data is None:
|
221 |
+
return (tts_filename, [])
|
222 |
+
|
223 |
+
async with semaphore:
|
224 |
+
sound_result = await sound_generation_consumed(sound_effect_data)
|
225 |
+
|
226 |
+
# save to file
|
227 |
+
with open(sound_effect_filename, "wb") as ab:
|
228 |
+
for chunk in sound_result:
|
229 |
+
ab.write(chunk)
|
230 |
+
|
231 |
+
# overlay sound effect on TTS audio
|
232 |
+
tts_with_effects_filename = add_overlay_for_audio(
|
233 |
+
main_audio_filename=tts_filename,
|
234 |
+
sound_effect_filename=sound_effect_filename,
|
235 |
+
cycling_effect=True,
|
236 |
+
decrease_effect_volume=5,
|
237 |
+
)
|
238 |
+
tmp_files = [sound_effect_filename, tts_with_effects_filename]
|
239 |
+
return (tts_with_effects_filename, tmp_files)
|
240 |
+
|
241 |
+
tasks = []
|
242 |
for idx, tts_filename in enumerate(tts_audio_files):
|
243 |
+
sound_effect_filename = f"sound_effect_{idx}.wav"
|
|
|
|
|
|
|
|
|
244 |
|
245 |
+
if idx not in lines_for_sound_effect:
|
246 |
+
tasks.append(
|
247 |
+
_process_single_phrase(
|
248 |
+
tts_filename=tts_filename,
|
249 |
+
sound_effect_data=None,
|
250 |
+
sound_effect_filename=sound_effect_filename,
|
251 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
252 |
)
|
|
|
|
|
|
|
253 |
else:
|
254 |
+
sound_effect_data = data_for_sound_effects.pop(0)
|
255 |
+
tasks.append(
|
256 |
+
_process_single_phrase(
|
257 |
+
tts_filename=tts_filename,
|
258 |
+
sound_effect_data=sound_effect_data,
|
259 |
+
sound_effect_filename=sound_effect_filename,
|
260 |
+
)
|
261 |
+
)
|
262 |
+
|
263 |
+
outputs = await asyncio.gather(*tasks)
|
264 |
+
audio_chunks = [x[0] for x in outputs]
|
265 |
+
tmp_files_to_add = [item for x in outputs for item in x[1]]
|
266 |
+
temp_files.extend(tmp_files_to_add)
|
267 |
|
268 |
return audio_chunks
|
269 |
|
src/config.py
CHANGED
@@ -17,3 +17,20 @@ ELEVENLABS_MAX_PARALLEL = 15 # current limitation of available subscription
|
|
17 |
|
18 |
# VOICES_CSV_FP = "data/11labs_available_tts_voices.csv"
|
19 |
VOICES_CSV_FP = "data/11labs_available_tts_voices.reviewed.csv"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
# VOICES_CSV_FP = "data/11labs_available_tts_voices.csv"
|
19 |
VOICES_CSV_FP = "data/11labs_available_tts_voices.reviewed.csv"
|
20 |
+
|
21 |
+
# Upper bound on accepted input text length, in characters.
# NOTE(review): presumably enforced by the app's input-length check — confirm against app.py.
MAX_TEXT_LEN = 5000

# Markdown blurb describing the app.
# NOTE(review): looks like it is rendered in the UI header — verify against the Gradio layout.
DESCRIPTION = """\
# AI Audiobooks Generator

Create an audiobook from the input text automatically, using Gen-AI!

All you need to do - is to input the book text or select it from the provided Sample Inputs.

AI will do the rest:
- split text into characters
- assign each character a voice
- preprocess text to better convey emotions during Text-to-Speech
- (optionally) add sound effects to create immersive atmosphere
- generate audiobook using Text-to-Speech model
"""
|
src/tts.py
CHANGED
@@ -39,7 +39,10 @@ async def tts_astream(
|
|
39 |
style=params.get("style"),
|
40 |
)
|
41 |
|
42 |
-
logger.info(
|
|
|
|
|
|
|
43 |
async_iter = ELEVEN_CLIENT_ASYNC.text_to_speech.convert(**params_all)
|
44 |
async for chunk in async_iter:
|
45 |
if chunk:
|
@@ -57,11 +60,23 @@ async def tts_astream_consumed(
|
|
57 |
async def sound_generation_astream(
|
58 |
sound_generation_data: dict,
|
59 |
) -> t.AsyncIterator[bytes]:
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
async_iter = ELEVEN_CLIENT_ASYNC.text_to_sound_effects.convert(
|
61 |
-
text=
|
62 |
duration_seconds=sound_generation_data["duration_seconds"],
|
63 |
prompt_influence=sound_generation_data["prompt_influence"],
|
64 |
)
|
65 |
async for chunk in async_iter:
|
66 |
if chunk:
|
67 |
yield chunk
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
style=params.get("style"),
|
40 |
)
|
41 |
|
42 |
+
logger.info(
|
43 |
+
f"request to 11labs TTS endpoint with params {params_all} "
|
44 |
+
f'for the following text: "{text}"'
|
45 |
+
)
|
46 |
async_iter = ELEVEN_CLIENT_ASYNC.text_to_speech.convert(**params_all)
|
47 |
async for chunk in async_iter:
|
48 |
if chunk:
|
|
|
60 |
async def sound_generation_astream(
    sound_generation_data: dict,
) -> t.AsyncIterator[bytes]:
    """Stream sound-effect audio bytes from the 11labs sound-generation endpoint.

    Args:
        sound_generation_data: dict with keys ``"text"`` (effect prompt),
            ``"duration_seconds"`` and ``"prompt_influence"``.

    Yields:
        Non-empty audio chunks as they arrive from the API.
    """
    # Read, don't pop(): callers wrap this in @auto_retry
    # (see sound_generation_consumed), and popping would mutate the caller's
    # dict so a retry re-enters with "text" missing and raises KeyError.
    text = sound_generation_data["text"]
    # Log everything except the prompt text, which is logged separately.
    params = {k: v for k, v in sound_generation_data.items() if k != "text"}
    logger.info(
        f"request to 11labs sound effect generation with params {params} "
        f'for the following text: "{text}"'
    )

    async_iter = ELEVEN_CLIENT_ASYNC.text_to_sound_effects.convert(
        text=text,
        duration_seconds=sound_generation_data["duration_seconds"],
        prompt_influence=sound_generation_data["prompt_influence"],
    )
    async for chunk in async_iter:
        if chunk:
            yield chunk
|
77 |
+
|
78 |
+
|
79 |
+
@auto_retry
|
80 |
+
async def sound_generation_consumed(sound_generation_data: dict):
|
81 |
+
aiterator = sound_generation_astream(sound_generation_data=sound_generation_data)
|
82 |
+
return [x async for x in aiterator]
|