navalnica commited on
Commit
9281119
·
1 Parent(s): 95849c2

async sound effects generation; add text samples; improve UI; add input len check

Browse files
Files changed (7) hide show
  1. README.md +0 -8
  2. app.py +43 -15
  3. data/samples_to_split.py +3 -3
  4. pg.ipynb +320 -132
  5. src/audio_generators.py +68 -42
  6. src/config.py +17 -0
  7. src/tts.py +17 -2
README.md CHANGED
@@ -20,23 +20,15 @@ python_version: 3.11
20
  - add context
21
  - filter, apply only for long phrases
22
  - only for narrator?
23
- - checkbox! make effects great again (no) optional
24
  - stability
25
  - add limit on input text size (5000 chars)
26
  - improve UI
27
  - add error box
28
- - add samples
29
  - show character parts
30
- - remove file upload pane
31
- - labels on how long to wait
32
- - labels describing components
33
- - header and description
34
  - prepare slides / story
35
  - testing
36
  - eval current execution time
37
- - test on different text inputs
38
  - optimizations
39
- - generate audio effects asynchronously
40
  - combine sequential phrases of same character in single phrase
41
  - support large texts. use batching. problem: how to ensure same characters?
42
  - can detect characters in first prompt, then split text in each batch into character phrases
 
20
  - add context
21
  - filter, apply only for long phrases
22
  - only for narrator?
 
23
  - stability
24
  - add limit on input text size (5000 chars)
25
  - improve UI
26
  - add error box
 
27
  - show character parts
 
 
 
 
28
  - prepare slides / story
29
  - testing
30
  - eval current execution time
 
31
  - optimizations
 
32
  - combine sequential phrases of same character in single phrase
33
  - support large texts. use batching. problem: how to ensure same characters?
34
  - can detect characters in first prompt, then split text in each batch into character phrases
app.py CHANGED
@@ -9,7 +9,8 @@ from langchain_community.document_loaders import PyPDFLoader
9
  load_dotenv()
10
 
11
  from src.builder import AudiobookBuilder
12
- from src.config import logger, FILE_SIZE_MAX
 
13
 
14
 
15
  def get_auth_params():
@@ -57,6 +58,9 @@ async def respond(
57
  logger.exception(e)
58
  return (None, str(e))
59
 
 
 
 
60
  builder = AudiobookBuilder()
61
  audio_fp = await builder.run(text=text, generate_effects=generate_effects)
62
  return audio_fp, ""
@@ -67,34 +71,58 @@ def refresh():
67
 
68
 
69
  with gr.Blocks(title="Audiobooks Generation") as ui:
70
- gr.Markdown("# Audiobooks Generation")
71
 
72
  with gr.Row(variant="panel"):
73
- text_input = gr.Textbox(label="Enter the book text", lines=20)
74
- # Add a file upload field for .txt and .pdf files
75
  file_input = gr.File(
76
- label="Upload a text file or PDF", file_types=[".txt", ".pdf"]
 
 
77
  )
78
 
79
- with gr.Row(variant="panel"):
80
- audio_output = gr.Audio(label="Generated audio", type="filepath")
81
- error_output = gr.Textbox(
82
- label="Error Messages", interactive=False, visible=False
83
- ) # Initially hidden
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
- effects_generation_checkbox = gr.Checkbox(label="Generate background effects")
 
 
 
 
86
 
87
- submit_button = gr.Button("Submit")
88
  submit_button.click(
89
  fn=respond,
90
- inputs=[text_input, file_input, effects_generation_checkbox], # Include the uploaded file as an input
 
 
 
 
91
  outputs=[
92
  audio_output,
93
  error_output,
94
  ], # Include the audio output and error message output
95
  )
96
-
97
- refresh_button = gr.Button("Refresh")
98
  refresh_button.click(
99
  fn=refresh,
100
  inputs=[],
 
9
  load_dotenv()
10
 
11
  from src.builder import AudiobookBuilder
12
+ from src.config import logger, FILE_SIZE_MAX, MAX_TEXT_LEN, DESCRIPTION
13
+ from data import samples_to_split as samples
14
 
15
 
16
  def get_auth_params():
 
58
  logger.exception(e)
59
  return (None, str(e))
60
 
61
+ if len(text) > MAX_TEXT_LEN:
62
+ raise ValueError(len(text)) # TODO
63
+
64
  builder = AudiobookBuilder()
65
  audio_fp = await builder.run(text=text, generate_effects=generate_effects)
66
  return audio_fp, ""
 
71
 
72
 
73
  with gr.Blocks(title="Audiobooks Generation") as ui:
74
+ gr.Markdown(DESCRIPTION)
75
 
76
  with gr.Row(variant="panel"):
77
+ text_input = gr.Textbox(label="Enter the book text here", lines=20)
 
78
  file_input = gr.File(
79
+ label="Upload a text file or PDF",
80
+ file_types=[".txt", ".pdf"],
81
+ visible=False,
82
  )
83
 
84
+ examples = gr.Examples(
85
+ examples=[
86
+ [samples.GATSBY_1],
87
+ [samples.GATSBY_2],
88
+ [samples.WONDERFUL_CHRISTMAS_1],
89
+ [samples.WONDERFUL_CHRISTMAS_2],
90
+ ],
91
+ inputs=text_input,
92
+ label="Sample Inputs",
93
+ example_labels=[
94
+ "Gatsby 1",
95
+ "Gatsby 2",
96
+ "Wonderful Christmas 1",
97
+ "Wonderful Christmas 2",
98
+ ],
99
+ )
100
+
101
+ audio_output = gr.Audio(
102
+ label='Generated audio. Please wait for the waveform to appear, before hitting "Play"',
103
+ type="filepath",
104
+ )
105
+ # error output is hidden initially
106
+ error_output = gr.Textbox(label="Error Message", interactive=False, visible=False)
107
 
108
+ effects_generation_checkbox = gr.Checkbox(label="Add background effects")
109
+
110
+ with gr.Row(variant="panel"):
111
+ submit_button = gr.Button("Generate the audiobook", variant="primary")
112
+ refresh_button = gr.Button("Refresh", variant="secondary")
113
 
 
114
  submit_button.click(
115
  fn=respond,
116
+ inputs=[
117
+ text_input,
118
+ file_input,
119
+ effects_generation_checkbox,
120
+ ], # Include the uploaded file as an input
121
  outputs=[
122
  audio_output,
123
  error_output,
124
  ], # Include the audio output and error message output
125
  )
 
 
126
  refresh_button.click(
127
  fn=refresh,
128
  inputs=[],
data/samples_to_split.py CHANGED
@@ -157,17 +157,17 @@ Frank! why in the world don’t you come to dinner? There is a gentleman
157
  at table who came to see papa on business, and I ran away after the
158
  soup—I couldn’t eat my dinner one bit, without you.”
159
 
160
- “You’ll _have_ to, I reckon,” returned Frank; “a poor fellow, like me,
161
  who has to hoe corn all day, can’t stop to eat.”
162
 
163
  “O, Frank Hallock! _for shame!_” cried Kate, putting down her indignant
164
  foot without being able to make noise enough about it to disturb an
165
  earthworm.
166
 
167
- “It’s _true_,” responded Frank, pitching into the next hill with all his
168
  might.
169
 
170
- “It is _not_ true,” cried Kate; “and if just running off to _look_ at
171
  the circus pass by makes you say such things, I am glad you can’t go to
172
  see it.”\
173
  """
 
157
  at table who came to see papa on business, and I ran away after the
158
  soup—I couldn’t eat my dinner one bit, without you.”
159
 
160
+ “You’ll HAVE to, I reckon,” returned Frank; “a poor fellow, like me,
161
  who has to hoe corn all day, can’t stop to eat.”
162
 
163
  “O, Frank Hallock! _for shame!_” cried Kate, putting down her indignant
164
  foot without being able to make noise enough about it to disturb an
165
  earthworm.
166
 
167
+ “It’s TRUE,” responded Frank, pitching into the next hill with all his
168
  might.
169
 
170
+ “It is NOT true,” cried Kate; “and if just running off to LOOK at
171
  the circus pass by makes you say such things, I am glad you can’t go to
172
  see it.”\
173
  """
pg.ipynb CHANGED
@@ -80,13 +80,13 @@
80
  "name": "stdout",
81
  "output_type": "stream",
82
  "text": [
83
- "(468, 14)\n"
84
  ]
85
  }
86
  ],
87
  "source": [
88
  "# df = pd.read_csv('data/11labs_tts_voices.csv')\n",
89
- "df = pd.read_csv('data/11labs_available_tts_voices.csv')\n",
90
  "df[\"age\"] = df[\"age\"].str.replace(\" \", \"_\").str.replace(\"-\", \"_\")\n",
91
  "print(df.shape)"
92
  ]
@@ -99,9 +99,9 @@
99
  {
100
  "data": {
101
  "text/plain": [
102
- "Index(['voice_id', 'name', 'preview_url', 'owner_id', 'permission_on_resource',\n",
103
- " 'is_legacy', 'is_mixed', 'accent', 'description', 'age', 'gender',\n",
104
- " 'category', 'language', 'descriptive'],\n",
105
  " dtype='object')"
106
  ]
107
  },
@@ -123,9 +123,8 @@
123
  "data": {
124
  "text/plain": [
125
  "language\n",
126
- "NaN 264\n",
127
- "en 203\n",
128
- "romanian 1\n",
129
  "Name: count, dtype: int64"
130
  ]
131
  },
@@ -147,10 +146,8 @@
147
  "data": {
148
  "text/plain": [
149
  "gender\n",
150
- "female 231\n",
151
- "male 230\n",
152
- "neutral 6\n",
153
- "non-binary 1\n",
154
  "Name: count, dtype: int64"
155
  ]
156
  },
@@ -172,10 +169,9 @@
172
  "data": {
173
  "text/plain": [
174
  "age\n",
175
- "middle_aged 183\n",
176
- "young 143\n",
177
- "old 140\n",
178
- "NaN 2\n",
179
  "Name: count, dtype: int64"
180
  ]
181
  },
@@ -190,7 +186,7 @@
190
  },
191
  {
192
  "cell_type": "code",
193
- "execution_count": 14,
194
  "metadata": {},
195
  "outputs": [
196
  {
@@ -216,43 +212,133 @@
216
  " <th>gender</th>\n",
217
  " <th>female</th>\n",
218
  " <th>male</th>\n",
219
- " <th>neutral</th>\n",
220
- " <th>non-binary</th>\n",
221
  " </tr>\n",
222
  " <tr>\n",
223
  " <th>age</th>\n",
224
  " <th></th>\n",
225
  " <th></th>\n",
226
- " <th></th>\n",
227
- " <th></th>\n",
228
  " </tr>\n",
229
  " </thead>\n",
230
  " <tbody>\n",
231
  " <tr>\n",
232
  " <th>middle_aged</th>\n",
233
- " <td>48</td>\n",
234
- " <td>130</td>\n",
235
  " <td>4</td>\n",
236
- " <td>1</td>\n",
237
  " </tr>\n",
238
  " <tr>\n",
239
  " <th>old</th>\n",
240
- " <td>100</td>\n",
241
- " <td>39</td>\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  " <td>1</td>\n",
 
 
 
243
  " <td>0</td>\n",
 
 
244
  " </tr>\n",
245
  " <tr>\n",
246
- " <th>young</th>\n",
247
- " <td>83</td>\n",
248
- " <td>59</td>\n",
249
  " <td>1</td>\n",
250
  " <td>0</td>\n",
 
251
  " </tr>\n",
252
  " <tr>\n",
253
- " <th>NaN</th>\n",
 
 
254
  " <td>0</td>\n",
 
 
 
 
 
 
 
 
 
 
 
 
255
  " <td>2</td>\n",
 
 
 
 
 
256
  " <td>0</td>\n",
257
  " <td>0</td>\n",
258
  " </tr>\n",
@@ -261,26 +347,31 @@
261
  "</div>"
262
  ],
263
  "text/plain": [
264
- "gender female male neutral non-binary\n",
265
- "age \n",
266
- "middle_aged 48 130 4 1\n",
267
- "old 100 39 1 0\n",
268
- "young 83 59 1 0\n",
269
- "NaN 0 2 0 0"
 
 
 
270
  ]
271
  },
272
- "execution_count": 14,
273
  "metadata": {},
274
  "output_type": "execute_result"
275
  }
276
  ],
277
  "source": [
278
- "df.groupby(['age', 'gender'], dropna=False)['voice_id'].count().unstack(fill_value=0)"
 
 
279
  ]
280
  },
281
  {
282
  "cell_type": "code",
283
- "execution_count": 13,
284
  "metadata": {},
285
  "outputs": [
286
  {
@@ -303,17 +394,13 @@
303
  "<table border=\"1\" class=\"dataframe\">\n",
304
  " <thead>\n",
305
  " <tr style=\"text-align: right;\">\n",
306
- " <th></th>\n",
307
- " <th>gender</th>\n",
308
- " <th>female</th>\n",
309
- " <th>male</th>\n",
310
- " <th>neutral</th>\n",
311
- " <th>non-binary</th>\n",
312
  " </tr>\n",
313
  " <tr>\n",
314
- " <th>language</th>\n",
315
- " <th>age</th>\n",
316
- " <th></th>\n",
317
  " <th></th>\n",
318
  " <th></th>\n",
319
  " <th></th>\n",
@@ -321,82 +408,197 @@
321
  " </thead>\n",
322
  " <tbody>\n",
323
  " <tr>\n",
324
- " <th rowspan=\"4\" valign=\"top\">en</th>\n",
325
- " <th>middle_aged</th>\n",
326
- " <td>30</td>\n",
327
- " <td>91</td>\n",
328
  " <td>2</td>\n",
329
- " <td>0</td>\n",
 
330
  " </tr>\n",
331
  " <tr>\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  " <th>old</th>\n",
 
 
 
 
 
 
 
 
 
 
 
 
333
  " <td>3</td>\n",
334
  " <td>3</td>\n",
335
- " <td>0</td>\n",
336
- " <td>0</td>\n",
337
  " </tr>\n",
338
  " <tr>\n",
339
- " <th>young</th>\n",
340
- " <td>34</td>\n",
341
- " <td>38</td>\n",
342
- " <td>0</td>\n",
343
- " <td>0</td>\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
  " </tr>\n",
345
  " <tr>\n",
346
- " <th>NaN</th>\n",
347
- " <td>0</td>\n",
348
- " <td>2</td>\n",
349
- " <td>0</td>\n",
350
- " <td>0</td>\n",
351
  " </tr>\n",
 
 
352
  " <tr>\n",
353
- " <th>romanian</th>\n",
354
- " <th>old</th>\n",
355
  " <td>1</td>\n",
356
- " <td>0</td>\n",
357
- " <td>0</td>\n",
358
- " <td>0</td>\n",
359
  " </tr>\n",
360
  " <tr>\n",
361
- " <th rowspan=\"3\" valign=\"top\">NaN</th>\n",
362
- " <th>middle_aged</th>\n",
363
- " <td>18</td>\n",
364
- " <td>39</td>\n",
365
  " <td>2</td>\n",
366
  " <td>1</td>\n",
367
  " </tr>\n",
368
  " <tr>\n",
 
 
 
 
 
 
369
  " <th>old</th>\n",
370
- " <td>96</td>\n",
371
- " <td>36</td>\n",
372
- " <td>1</td>\n",
373
- " <td>0</td>\n",
374
  " </tr>\n",
375
  " <tr>\n",
376
  " <th>young</th>\n",
377
- " <td>49</td>\n",
378
- " <td>21</td>\n",
379
- " <td>1</td>\n",
380
- " <td>0</td>\n",
381
  " </tr>\n",
382
  " </tbody>\n",
383
  "</table>\n",
384
  "</div>"
385
  ],
386
  "text/plain": [
387
- "gender female male neutral non-binary\n",
388
- "language age \n",
389
- "en middle_aged 30 91 2 0\n",
390
- " old 3 3 0 0\n",
391
- " young 34 38 0 0\n",
392
- " NaN 0 2 0 0\n",
393
- "romanian old 1 0 0 0\n",
394
- "NaN middle_aged 18 39 2 1\n",
395
- " old 96 36 1 0\n",
396
- " young 49 21 1 0"
397
  ]
398
  },
399
- "execution_count": 13,
400
  "metadata": {},
401
  "output_type": "execute_result"
402
  }
@@ -407,54 +609,33 @@
407
  },
408
  {
409
  "cell_type": "code",
410
- "execution_count": 15,
411
  "metadata": {},
412
  "outputs": [
413
  {
414
  "data": {
415
  "text/plain": [
416
  "descriptive\n",
417
- "confident 64\n",
418
- "calm 44\n",
419
- "casual 34\n",
420
- "pleasant 31\n",
421
- "deep 28\n",
422
- "NaN 26\n",
423
- "professional 26\n",
424
- "upbeat 22\n",
425
- "wise 20\n",
426
- "formal 17\n",
427
- "intense 13\n",
428
- "serious 13\n",
429
- "meditative 11\n",
430
- "modulated 11\n",
431
- "excited 10\n",
432
- "husky 10\n",
433
- "mature 8\n",
434
- "classy 8\n",
435
- "chill 7\n",
436
- "neutral 7\n",
437
- "crisp 6\n",
438
- "gentle 6\n",
439
- "childish 6\n",
440
- "hyped 6\n",
441
- "cute 5\n",
442
- "sassy 4\n",
443
- "soft 4\n",
444
- "rough 3\n",
445
- "grumpy 3\n",
446
- "whispery 3\n",
447
- "robotic 3\n",
448
- "relaxed 3\n",
449
- "raspy 2\n",
450
- "cheeky 1\n",
451
- "sad 1\n",
452
- "anxious 1\n",
453
- "motivational 1\n",
454
  "Name: count, dtype: int64"
455
  ]
456
  },
457
- "execution_count": 15,
458
  "metadata": {},
459
  "output_type": "execute_result"
460
  }
@@ -463,6 +644,13 @@
463
  "df['descriptive'].value_counts(dropna=False)"
464
  ]
465
  },
 
 
 
 
 
 
 
466
  {
467
  "cell_type": "code",
468
  "execution_count": 17,
 
80
  "name": "stdout",
81
  "output_type": "stream",
82
  "text": [
83
+ "(34, 15)\n"
84
  ]
85
  }
86
  ],
87
  "source": [
88
  "# df = pd.read_csv('data/11labs_tts_voices.csv')\n",
89
+ "df = pd.read_csv('data/11labs_available_tts_voices.reviewed.csv')\n",
90
  "df[\"age\"] = df[\"age\"].str.replace(\" \", \"_\").str.replace(\"-\", \"_\")\n",
91
  "print(df.shape)"
92
  ]
 
99
  {
100
  "data": {
101
  "text/plain": [
102
+ "Index(['voice_id', 'name', 'preview_url', 'manual_quality_review', 'owner_id',\n",
103
+ " 'permission_on_resource', 'is_legacy', 'is_mixed', 'accent',\n",
104
+ " 'description', 'age', 'gender', 'category', 'language', 'descriptive'],\n",
105
  " dtype='object')"
106
  ]
107
  },
 
123
  "data": {
124
  "text/plain": [
125
  "language\n",
126
+ "NaN 25\n",
127
+ "en 9\n",
 
128
  "Name: count, dtype: int64"
129
  ]
130
  },
 
146
  "data": {
147
  "text/plain": [
148
  "gender\n",
149
+ "female 17\n",
150
+ "male 17\n",
 
 
151
  "Name: count, dtype: int64"
152
  ]
153
  },
 
169
  "data": {
170
  "text/plain": [
171
  "age\n",
172
+ "middle_aged 13\n",
173
+ "young 11\n",
174
+ "old 10\n",
 
175
  "Name: count, dtype: int64"
176
  ]
177
  },
 
186
  },
187
  {
188
  "cell_type": "code",
189
+ "execution_count": 9,
190
  "metadata": {},
191
  "outputs": [
192
  {
 
212
  " <th>gender</th>\n",
213
  " <th>female</th>\n",
214
  " <th>male</th>\n",
 
 
215
  " </tr>\n",
216
  " <tr>\n",
217
  " <th>age</th>\n",
218
  " <th></th>\n",
219
  " <th></th>\n",
 
 
220
  " </tr>\n",
221
  " </thead>\n",
222
  " <tbody>\n",
223
  " <tr>\n",
224
  " <th>middle_aged</th>\n",
 
 
225
  " <td>4</td>\n",
226
+ " <td>9</td>\n",
227
  " </tr>\n",
228
  " <tr>\n",
229
  " <th>old</th>\n",
230
+ " <td>5</td>\n",
231
+ " <td>5</td>\n",
232
+ " </tr>\n",
233
+ " <tr>\n",
234
+ " <th>young</th>\n",
235
+ " <td>8</td>\n",
236
+ " <td>3</td>\n",
237
+ " </tr>\n",
238
+ " </tbody>\n",
239
+ "</table>\n",
240
+ "</div>"
241
+ ],
242
+ "text/plain": [
243
+ "gender female male\n",
244
+ "age \n",
245
+ "middle_aged 4 9\n",
246
+ "old 5 5\n",
247
+ "young 8 3"
248
+ ]
249
+ },
250
+ "execution_count": 9,
251
+ "metadata": {},
252
+ "output_type": "execute_result"
253
+ }
254
+ ],
255
+ "source": [
256
+ "df.groupby(['age', 'gender'], dropna=False)['voice_id'].count().unstack(fill_value=0)"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "execution_count": 13,
262
+ "metadata": {},
263
+ "outputs": [
264
+ {
265
+ "data": {
266
+ "text/html": [
267
+ "<div>\n",
268
+ "<style scoped>\n",
269
+ " .dataframe tbody tr th:only-of-type {\n",
270
+ " vertical-align: middle;\n",
271
+ " }\n",
272
+ "\n",
273
+ " .dataframe tbody tr th {\n",
274
+ " vertical-align: top;\n",
275
+ " }\n",
276
+ "\n",
277
+ " .dataframe thead th {\n",
278
+ " text-align: right;\n",
279
+ " }\n",
280
+ "</style>\n",
281
+ "<table border=\"1\" class=\"dataframe\">\n",
282
+ " <thead>\n",
283
+ " <tr style=\"text-align: right;\">\n",
284
+ " <th></th>\n",
285
+ " <th>age</th>\n",
286
+ " <th>middle_aged</th>\n",
287
+ " <th>old</th>\n",
288
+ " <th>young</th>\n",
289
+ " </tr>\n",
290
+ " <tr>\n",
291
+ " <th>manual_quality_review</th>\n",
292
+ " <th>gender</th>\n",
293
+ " <th></th>\n",
294
+ " <th></th>\n",
295
+ " <th></th>\n",
296
+ " </tr>\n",
297
+ " </thead>\n",
298
+ " <tbody>\n",
299
+ " <tr>\n",
300
+ " <th rowspan=\"2\" valign=\"top\">bad</th>\n",
301
+ " <th>female</th>\n",
302
+ " <td>1</td>\n",
303
+ " <td>2</td>\n",
304
  " <td>1</td>\n",
305
+ " </tr>\n",
306
+ " <tr>\n",
307
+ " <th>male</th>\n",
308
  " <td>0</td>\n",
309
+ " <td>3</td>\n",
310
+ " <td>1</td>\n",
311
  " </tr>\n",
312
  " <tr>\n",
313
+ " <th rowspan=\"2\" valign=\"top\">medium</th>\n",
314
+ " <th>female</th>\n",
 
315
  " <td>1</td>\n",
316
  " <td>0</td>\n",
317
+ " <td>0</td>\n",
318
  " </tr>\n",
319
  " <tr>\n",
320
+ " <th>male</th>\n",
321
+ " <td>2</td>\n",
322
+ " <td>1</td>\n",
323
  " <td>0</td>\n",
324
+ " </tr>\n",
325
+ " <tr>\n",
326
+ " <th rowspan=\"2\" valign=\"top\">ok</th>\n",
327
+ " <th>female</th>\n",
328
+ " <td>2</td>\n",
329
+ " <td>3</td>\n",
330
+ " <td>7</td>\n",
331
+ " </tr>\n",
332
+ " <tr>\n",
333
+ " <th>male</th>\n",
334
+ " <td>6</td>\n",
335
+ " <td>1</td>\n",
336
  " <td>2</td>\n",
337
+ " </tr>\n",
338
+ " <tr>\n",
339
+ " <th>very bad</th>\n",
340
+ " <th>male</th>\n",
341
+ " <td>1</td>\n",
342
  " <td>0</td>\n",
343
  " <td>0</td>\n",
344
  " </tr>\n",
 
347
  "</div>"
348
  ],
349
  "text/plain": [
350
+ "age middle_aged old young\n",
351
+ "manual_quality_review gender \n",
352
+ "bad female 1 2 1\n",
353
+ " male 0 3 1\n",
354
+ "medium female 1 0 0\n",
355
+ " male 2 1 0\n",
356
+ "ok female 2 3 7\n",
357
+ " male 6 1 2\n",
358
+ "very bad male 1 0 0"
359
  ]
360
  },
361
+ "execution_count": 13,
362
  "metadata": {},
363
  "output_type": "execute_result"
364
  }
365
  ],
366
  "source": [
367
+ "df.groupby([\"manual_quality_review\", 'gender', \"age\"], dropna=False)[\n",
368
+ " \"voice_id\"\n",
369
+ "].count().unstack(fill_value=0)"
370
  ]
371
  },
372
  {
373
  "cell_type": "code",
374
+ "execution_count": 16,
375
  "metadata": {},
376
  "outputs": [
377
  {
 
394
  "<table border=\"1\" class=\"dataframe\">\n",
395
  " <thead>\n",
396
  " <tr style=\"text-align: right;\">\n",
397
+ " <th>age</th>\n",
398
+ " <th>middle_aged</th>\n",
399
+ " <th>old</th>\n",
400
+ " <th>young</th>\n",
 
 
401
  " </tr>\n",
402
  " <tr>\n",
403
+ " <th>gender</th>\n",
 
 
404
  " <th></th>\n",
405
  " <th></th>\n",
406
  " <th></th>\n",
 
408
  " </thead>\n",
409
  " <tbody>\n",
410
  " <tr>\n",
411
+ " <th>female</th>\n",
 
 
 
412
  " <td>2</td>\n",
413
+ " <td>3</td>\n",
414
+ " <td>7</td>\n",
415
  " </tr>\n",
416
  " <tr>\n",
417
+ " <th>male</th>\n",
418
+ " <td>6</td>\n",
419
+ " <td>1</td>\n",
420
+ " <td>2</td>\n",
421
+ " </tr>\n",
422
+ " </tbody>\n",
423
+ "</table>\n",
424
+ "</div>"
425
+ ],
426
+ "text/plain": [
427
+ "age middle_aged old young\n",
428
+ "gender \n",
429
+ "female 2 3 7\n",
430
+ "male 6 1 2"
431
+ ]
432
+ },
433
+ "execution_count": 16,
434
+ "metadata": {},
435
+ "output_type": "execute_result"
436
+ }
437
+ ],
438
+ "source": [
439
+ "df[df['manual_quality_review'].isin(['ok'])].groupby(['gender', \"age\"], dropna=False)[\n",
440
+ " \"voice_id\"\n",
441
+ "].count().unstack(fill_value=0)"
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "code",
446
+ "execution_count": null,
447
+ "metadata": {},
448
+ "outputs": [
449
+ {
450
+ "data": {
451
+ "text/html": [
452
+ "<div>\n",
453
+ "<style scoped>\n",
454
+ " .dataframe tbody tr th:only-of-type {\n",
455
+ " vertical-align: middle;\n",
456
+ " }\n",
457
+ "\n",
458
+ " .dataframe tbody tr th {\n",
459
+ " vertical-align: top;\n",
460
+ " }\n",
461
+ "\n",
462
+ " .dataframe thead th {\n",
463
+ " text-align: right;\n",
464
+ " }\n",
465
+ "</style>\n",
466
+ "<table border=\"1\" class=\"dataframe\">\n",
467
+ " <thead>\n",
468
+ " <tr style=\"text-align: right;\">\n",
469
+ " <th>age</th>\n",
470
+ " <th>middle_aged</th>\n",
471
  " <th>old</th>\n",
472
+ " <th>young</th>\n",
473
+ " </tr>\n",
474
+ " <tr>\n",
475
+ " <th>gender</th>\n",
476
+ " <th></th>\n",
477
+ " <th></th>\n",
478
+ " <th></th>\n",
479
+ " </tr>\n",
480
+ " </thead>\n",
481
+ " <tbody>\n",
482
+ " <tr>\n",
483
+ " <th>female</th>\n",
484
  " <td>3</td>\n",
485
  " <td>3</td>\n",
486
+ " <td>7</td>\n",
 
487
  " </tr>\n",
488
  " <tr>\n",
489
+ " <th>male</th>\n",
490
+ " <td>8</td>\n",
491
+ " <td>2</td>\n",
492
+ " <td>2</td>\n",
493
+ " </tr>\n",
494
+ " </tbody>\n",
495
+ "</table>\n",
496
+ "</div>"
497
+ ],
498
+ "text/plain": [
499
+ "age middle_aged old young\n",
500
+ "gender \n",
501
+ "female 3 3 7\n",
502
+ "male 8 2 2"
503
+ ]
504
+ },
505
+ "metadata": {},
506
+ "output_type": "display_data"
507
+ }
508
+ ],
509
+ "source": [
510
+ "df[df['manual_quality_review'].isin(['ok', 'medium'])].groupby(['gender', \"age\"], dropna=False)[\n",
511
+ " \"voice_id\"\n",
512
+ "].count().unstack(fill_value=0)"
513
+ ]
514
+ },
515
+ {
516
+ "cell_type": "code",
517
+ "execution_count": null,
518
+ "metadata": {},
519
+ "outputs": [],
520
+ "source": []
521
+ },
522
+ {
523
+ "cell_type": "code",
524
+ "execution_count": 10,
525
+ "metadata": {},
526
+ "outputs": [
527
+ {
528
+ "data": {
529
+ "text/html": [
530
+ "<div>\n",
531
+ "<style scoped>\n",
532
+ " .dataframe tbody tr th:only-of-type {\n",
533
+ " vertical-align: middle;\n",
534
+ " }\n",
535
+ "\n",
536
+ " .dataframe tbody tr th {\n",
537
+ " vertical-align: top;\n",
538
+ " }\n",
539
+ "\n",
540
+ " .dataframe thead th {\n",
541
+ " text-align: right;\n",
542
+ " }\n",
543
+ "</style>\n",
544
+ "<table border=\"1\" class=\"dataframe\">\n",
545
+ " <thead>\n",
546
+ " <tr style=\"text-align: right;\">\n",
547
+ " <th></th>\n",
548
+ " <th>gender</th>\n",
549
+ " <th>female</th>\n",
550
+ " <th>male</th>\n",
551
  " </tr>\n",
552
  " <tr>\n",
553
+ " <th>language</th>\n",
554
+ " <th>age</th>\n",
555
+ " <th></th>\n",
556
+ " <th></th>\n",
 
557
  " </tr>\n",
558
+ " </thead>\n",
559
+ " <tbody>\n",
560
  " <tr>\n",
561
+ " <th rowspan=\"2\" valign=\"top\">en</th>\n",
562
+ " <th>middle_aged</th>\n",
563
  " <td>1</td>\n",
564
+ " <td>5</td>\n",
 
 
565
  " </tr>\n",
566
  " <tr>\n",
567
+ " <th>young</th>\n",
 
 
 
568
  " <td>2</td>\n",
569
  " <td>1</td>\n",
570
  " </tr>\n",
571
  " <tr>\n",
572
+ " <th rowspan=\"3\" valign=\"top\">NaN</th>\n",
573
+ " <th>middle_aged</th>\n",
574
+ " <td>3</td>\n",
575
+ " <td>4</td>\n",
576
+ " </tr>\n",
577
+ " <tr>\n",
578
  " <th>old</th>\n",
579
+ " <td>5</td>\n",
580
+ " <td>5</td>\n",
 
 
581
  " </tr>\n",
582
  " <tr>\n",
583
  " <th>young</th>\n",
584
+ " <td>6</td>\n",
585
+ " <td>2</td>\n",
 
 
586
  " </tr>\n",
587
  " </tbody>\n",
588
  "</table>\n",
589
  "</div>"
590
  ],
591
  "text/plain": [
592
+ "gender female male\n",
593
+ "language age \n",
594
+ "en middle_aged 1 5\n",
595
+ " young 2 1\n",
596
+ "NaN middle_aged 3 4\n",
597
+ " old 5 5\n",
598
+ " young 6 2"
 
 
 
599
  ]
600
  },
601
+ "execution_count": 10,
602
  "metadata": {},
603
  "output_type": "execute_result"
604
  }
 
609
  },
610
  {
611
  "cell_type": "code",
612
+ "execution_count": 11,
613
  "metadata": {},
614
  "outputs": [
615
  {
616
  "data": {
617
  "text/plain": [
618
  "descriptive\n",
619
+ "pleasant 6\n",
620
+ "casual 5\n",
621
+ "confident 3\n",
622
+ "calm 3\n",
623
+ "NaN 3\n",
624
+ "intense 3\n",
625
+ "chill 2\n",
626
+ "formal 1\n",
627
+ "serious 1\n",
628
+ "mature 1\n",
629
+ "cute 1\n",
630
+ "crisp 1\n",
631
+ "upbeat 1\n",
632
+ "professional 1\n",
633
+ "excited 1\n",
634
+ "wise 1\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635
  "Name: count, dtype: int64"
636
  ]
637
  },
638
+ "execution_count": 11,
639
  "metadata": {},
640
  "output_type": "execute_result"
641
  }
 
644
  "df['descriptive'].value_counts(dropna=False)"
645
  ]
646
  },
647
+ {
648
+ "cell_type": "code",
649
+ "execution_count": null,
650
+ "metadata": {},
651
+ "outputs": [],
652
+ "source": []
653
+ },
654
  {
655
  "cell_type": "code",
656
  "execution_count": 17,
src/audio_generators.py CHANGED
@@ -9,8 +9,8 @@ from langchain_community.callbacks import get_openai_callback
9
  from pydub import AudioSegment
10
 
11
  from src.lc_callbacks import LCMessageLoggerAsync
12
- from src.tts import tts_astream_consumed, sound_generation_astream
13
- from src.utils import auto_retry, consume_aiter
14
  from src.emotions.generation import (
15
  EffectGeneratorAsync,
16
  TextPreparationForTTSTaskOutput,
@@ -76,37 +76,38 @@ class AudioGeneratorWithEffects:
76
  """Main method to generate the audiobook with TTS, emotion, and sound effects."""
77
  num_lines = len(text_split.phrases)
78
  lines_for_sound_effect = self._select_lines_for_sound_effect(
79
- num_lines, fraction=float(0.2 * generate_effects),
 
80
  )
81
  logger.info(f"{generate_effects = }, {lines_for_sound_effect = }")
82
 
83
- modified_texts, sound_emotion_results = await self._prepare_text_for_tts(
84
  text_split, lines_for_sound_effect
85
  )
86
 
87
  tts_results, self.temp_files = await self._generate_tts_audio(
88
- text_split, modified_texts, character_to_voice
89
  )
90
 
91
- # Step 3: Add sound effects to selected lines
92
  audio_chunks = await self._add_sound_effects(
93
- tts_results, lines_for_sound_effect, sound_emotion_results, self.temp_files
94
  )
95
 
96
- # Step 4: Merge audio files
97
  normalized_audio_chunks = self._normalize_audio_chunks(
98
  audio_chunks, self.temp_files
99
  )
 
100
  final_output = self._merge_audio_files(
101
  normalized_audio_chunks, save_path=out_path
102
  )
103
 
104
- # Clean up temporary files
105
  self._cleanup_temp_files(self.temp_files)
106
 
107
  return final_output
108
 
109
- def _select_lines_for_sound_effect(self, num_lines: int, fraction: float) -> list[int]:
 
 
110
  """Select % of the lines randomly for sound effect generation."""
111
  return random.sample(range(num_lines), k=int(fraction * num_lines))
112
 
@@ -159,7 +160,7 @@ class AudioGeneratorWithEffects:
159
  async def _generate_tts_audio(
160
  self,
161
  text_split: SplitTextOutput,
162
- modified_texts: list[dict],
163
  character_to_voice: dict[str, str],
164
  ) -> tuple[list[str], list[str]]:
165
  """Generate TTS audio for modified text."""
@@ -174,20 +175,18 @@ class AudioGeneratorWithEffects:
174
  # bytes_ = await consume_aiter(iter_)
175
  return bytes_
176
 
177
- for idx, (modified_text, character_phrase) in enumerate(
178
- zip(modified_texts, text_split.phrases)
179
  ):
180
  voice_id = character_to_voice[character_phrase.character]
181
 
182
- # Use the semaphore-protected TTS function
183
  task = tts_astream_with_semaphore(
184
  voice_id=voice_id,
185
- text=modified_text["modified_text"],
186
- params=modified_text["params"],
187
  )
188
  tasks_for_tts.append(task)
189
 
190
- # Gather all TTS results
191
  tts_results = await asyncio.gather(*tasks_for_tts)
192
 
193
  # Save the results to temporary files
@@ -206,38 +205,65 @@ class AudioGeneratorWithEffects:
206
  self,
207
  tts_audio_files: list[str],
208
  lines_for_sound_effect: list[int],
209
- sound_emotion_results: list[dict],
210
  temp_files: list[str],
211
  ) -> list[str]:
212
  """Add sound effects to the selected lines."""
213
- audio_chunks = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  for idx, tts_filename in enumerate(tts_audio_files):
215
- # If the line has sound emotion data, generate sound effect and overlay
216
- if idx in lines_for_sound_effect:
217
- # Get next sound effect data
218
- sound_effect_data = sound_emotion_results.pop(0)
219
- sound_effect_filename = f"sound_effect_{idx}.wav"
220
 
221
- # Generate sound effect asynchronously
222
- sound_result = await consume_aiter(
223
- sound_generation_astream(sound_effect_data)
224
- )
225
- with open(sound_effect_filename, "wb") as ab:
226
- for chunk in sound_result:
227
- ab.write(chunk)
228
-
229
- # Add sound effect overlay
230
- output_filename = add_overlay_for_audio(
231
- main_audio_filename=tts_filename,
232
- sound_effect_filename=sound_effect_filename,
233
- cycling_effect=True,
234
- decrease_effect_volume=5,
235
  )
236
- audio_chunks.append(output_filename)
237
- temp_files.append(sound_effect_filename) # Track temp files
238
- temp_files.append(output_filename)
239
  else:
240
- audio_chunks.append(tts_filename)
 
 
 
 
 
 
 
 
 
 
 
 
241
 
242
  return audio_chunks
243
 
 
9
  from pydub import AudioSegment
10
 
11
  from src.lc_callbacks import LCMessageLoggerAsync
12
+ from src.tts import tts_astream_consumed, sound_generation_consumed
13
+ from src.utils import consume_aiter
14
  from src.emotions.generation import (
15
  EffectGeneratorAsync,
16
  TextPreparationForTTSTaskOutput,
 
76
  """Main method to generate the audiobook with TTS, emotion, and sound effects."""
77
  num_lines = len(text_split.phrases)
78
  lines_for_sound_effect = self._select_lines_for_sound_effect(
79
+ num_lines,
80
+ fraction=float(0.2 * generate_effects),
81
  )
82
  logger.info(f"{generate_effects = }, {lines_for_sound_effect = }")
83
 
84
+ data_for_tts, data_for_sound_effects = await self._prepare_text_for_tts(
85
  text_split, lines_for_sound_effect
86
  )
87
 
88
  tts_results, self.temp_files = await self._generate_tts_audio(
89
+ text_split, data_for_tts, character_to_voice
90
  )
91
 
 
92
  audio_chunks = await self._add_sound_effects(
93
+ tts_results, lines_for_sound_effect, data_for_sound_effects, self.temp_files
94
  )
95
 
 
96
  normalized_audio_chunks = self._normalize_audio_chunks(
97
  audio_chunks, self.temp_files
98
  )
99
+
100
  final_output = self._merge_audio_files(
101
  normalized_audio_chunks, save_path=out_path
102
  )
103
 
 
104
  self._cleanup_temp_files(self.temp_files)
105
 
106
  return final_output
107
 
108
+ def _select_lines_for_sound_effect(
109
+ self, num_lines: int, fraction: float
110
+ ) -> list[int]:
111
  """Select % of the lines randomly for sound effect generation."""
112
  return random.sample(range(num_lines), k=int(fraction * num_lines))
113
 
 
160
  async def _generate_tts_audio(
161
  self,
162
  text_split: SplitTextOutput,
163
+ data_for_tts: list[dict],
164
  character_to_voice: dict[str, str],
165
  ) -> tuple[list[str], list[str]]:
166
  """Generate TTS audio for modified text."""
 
175
  # bytes_ = await consume_aiter(iter_)
176
  return bytes_
177
 
178
+ for idx, (data_item, character_phrase) in enumerate(
179
+ zip(data_for_tts, text_split.phrases)
180
  ):
181
  voice_id = character_to_voice[character_phrase.character]
182
 
 
183
  task = tts_astream_with_semaphore(
184
  voice_id=voice_id,
185
+ text=data_item["modified_text"],
186
+ params=data_item["params"],
187
  )
188
  tasks_for_tts.append(task)
189
 
 
190
  tts_results = await asyncio.gather(*tasks_for_tts)
191
 
192
  # Save the results to temporary files
 
205
  self,
206
  tts_audio_files: list[str],
207
  lines_for_sound_effect: list[int],
208
+ data_for_sound_effects: list[dict],
209
  temp_files: list[str],
210
  ) -> list[str]:
211
  """Add sound effects to the selected lines."""
212
+
213
+ semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)
214
+
215
+ async def _process_single_phrase(
216
+ tts_filename: str,
217
+ sound_effect_data: dict | None,
218
+ sound_effect_filename: str,
219
+ ):
220
+ if sound_effect_data is None:
221
+ return (tts_filename, [])
222
+
223
+ async with semaphore:
224
+ sound_result = await sound_generation_consumed(sound_effect_data)
225
+
226
+ # save to file
227
+ with open(sound_effect_filename, "wb") as ab:
228
+ for chunk in sound_result:
229
+ ab.write(chunk)
230
+
231
+ # overlay sound effect on TTS audio
232
+ tts_with_effects_filename = add_overlay_for_audio(
233
+ main_audio_filename=tts_filename,
234
+ sound_effect_filename=sound_effect_filename,
235
+ cycling_effect=True,
236
+ decrease_effect_volume=5,
237
+ )
238
+ tmp_files = [sound_effect_filename, tts_with_effects_filename]
239
+ return (tts_with_effects_filename, tmp_files)
240
+
241
+ tasks = []
242
  for idx, tts_filename in enumerate(tts_audio_files):
243
+ sound_effect_filename = f"sound_effect_{idx}.wav"
 
 
 
 
244
 
245
+ if idx not in lines_for_sound_effect:
246
+ tasks.append(
247
+ _process_single_phrase(
248
+ tts_filename=tts_filename,
249
+ sound_effect_data=None,
250
+ sound_effect_filename=sound_effect_filename,
251
+ )
 
 
 
 
 
 
 
252
  )
 
 
 
253
  else:
254
+ sound_effect_data = data_for_sound_effects.pop(0)
255
+ tasks.append(
256
+ _process_single_phrase(
257
+ tts_filename=tts_filename,
258
+ sound_effect_data=sound_effect_data,
259
+ sound_effect_filename=sound_effect_filename,
260
+ )
261
+ )
262
+
263
+ outputs = await asyncio.gather(*tasks)
264
+ audio_chunks = [x[0] for x in outputs]
265
+ tmp_files_to_add = [item for x in outputs for item in x[1]]
266
+ temp_files.extend(tmp_files_to_add)
267
 
268
  return audio_chunks
269
 
src/config.py CHANGED
@@ -17,3 +17,20 @@ ELEVENLABS_MAX_PARALLEL = 15 # current limitation of available subscription
17
 
18
  # VOICES_CSV_FP = "data/11labs_available_tts_voices.csv"
19
  VOICES_CSV_FP = "data/11labs_available_tts_voices.reviewed.csv"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  # VOICES_CSV_FP = "data/11labs_available_tts_voices.csv"
19
  VOICES_CSV_FP = "data/11labs_available_tts_voices.reviewed.csv"
20
+
21
+ MAX_TEXT_LEN = 5000
22
+
23
+ DESCRIPTION = """\
24
+ # AI Audiobooks Generator
25
+
26
+ Create an audiobook from the input text automatically, using Gen-AI!
27
+
28
+ All you need to do is input the book text or select it from the provided Sample Inputs.
29
+
30
+ AI will do the rest:
31
+ - split text into characters
32
+ - assign each character a voice
33
+ - preprocess text to better convey emotions during Text-to-Speech
34
+ - (optionally) add sound effects to create an immersive atmosphere
35
+ - generate audiobook using Text-to-Speech model
36
+ """
src/tts.py CHANGED
@@ -39,7 +39,10 @@ async def tts_astream(
39
  style=params.get("style"),
40
  )
41
 
42
- logger.info(f"call to 11labs TTS endpoint with params: {params_all}")
 
 
 
43
  async_iter = ELEVEN_CLIENT_ASYNC.text_to_speech.convert(**params_all)
44
  async for chunk in async_iter:
45
  if chunk:
@@ -57,11 +60,23 @@ async def tts_astream_consumed(
57
  async def sound_generation_astream(
58
  sound_generation_data: dict,
59
  ) -> t.AsyncIterator[bytes]:
 
 
 
 
 
 
60
  async_iter = ELEVEN_CLIENT_ASYNC.text_to_sound_effects.convert(
61
- text=sound_generation_data["text"],
62
  duration_seconds=sound_generation_data["duration_seconds"],
63
  prompt_influence=sound_generation_data["prompt_influence"],
64
  )
65
  async for chunk in async_iter:
66
  if chunk:
67
  yield chunk
 
 
 
 
 
 
 
39
  style=params.get("style"),
40
  )
41
 
42
+ logger.info(
43
+ f"request to 11labs TTS endpoint with params {params_all} "
44
+ f'for the following text: "{text}"'
45
+ )
46
  async_iter = ELEVEN_CLIENT_ASYNC.text_to_speech.convert(**params_all)
47
  async for chunk in async_iter:
48
  if chunk:
 
60
  async def sound_generation_astream(
61
  sound_generation_data: dict,
62
  ) -> t.AsyncIterator[bytes]:
63
+ text = sound_generation_data.pop("text")
64
+ logger.info(
65
+ f"request to 11labs sound effect generation with params {sound_generation_data} "
66
+ f'for the following text: "{text}"'
67
+ )
68
+
69
  async_iter = ELEVEN_CLIENT_ASYNC.text_to_sound_effects.convert(
70
+ text=text,
71
  duration_seconds=sound_generation_data["duration_seconds"],
72
  prompt_influence=sound_generation_data["prompt_influence"],
73
  )
74
  async for chunk in async_iter:
75
  if chunk:
76
  yield chunk
77
+
78
+
79
+ @auto_retry
80
+ async def sound_generation_consumed(sound_generation_data: dict):
81
+ aiterator = sound_generation_astream(sound_generation_data=sound_generation_data)
82
+ return [x async for x in aiterator]