navalnica commited on
Commit
9281119
·
1 Parent(s): 95849c2

async sound effects generation; add text samples; improve UI; add input len check

Browse files
Files changed (7) hide show
  1. README.md +0 -8
  2. app.py +43 -15
  3. data/samples_to_split.py +3 -3
  4. pg.ipynb +320 -132
  5. src/audio_generators.py +68 -42
  6. src/config.py +17 -0
  7. src/tts.py +17 -2
README.md CHANGED
@@ -20,23 +20,15 @@ python_version: 3.11
20
  - add context
21
  - filter, apply only for long phrases
22
  - only for narrator?
23
- - checkbox! make effects great again (no) optional
24
  - stability
25
  - add limit on input text size (5000 chars)
26
  - improve UI
27
  - add error box
28
- - add samples
29
  - show character parts
30
- - remove file upload pane
31
- - labels on how long to wait
32
- - labels describing components
33
- - header and description
34
  - prepare slides / story
35
  - testing
36
  - eval current execution time
37
- - test on different text inputs
38
  - optimizations
39
- - generate audio effects asynchronously
40
  - combine sequential phrases of same character in single phrase
41
  - support large texts. use batching. problem: how to ensure same characters?
42
  - can detect characters in first prompt, then split text in each batch into character phrases
 
20
  - add context
21
  - filter, apply only for long phrases
22
  - only for narrator?
 
23
  - stability
24
  - add limit on input text size (5000 chars)
25
  - improve UI
26
  - add error box
 
27
  - show character parts
 
 
 
 
28
  - prepare slides / story
29
  - testing
30
  - eval current execution time
 
31
  - optimizations
 
32
  - combine sequential phrases of same character in single phrase
33
  - support large texts. use batching. problem: how to ensure same characters?
34
  - can detect characters in first prompt, then split text in each batch into character phrases
app.py CHANGED
@@ -9,7 +9,8 @@ from langchain_community.document_loaders import PyPDFLoader
9
  load_dotenv()
10
 
11
  from src.builder import AudiobookBuilder
12
- from src.config import logger, FILE_SIZE_MAX
 
13
 
14
 
15
  def get_auth_params():
@@ -57,6 +58,9 @@ async def respond(
57
  logger.exception(e)
58
  return (None, str(e))
59
 
 
 
 
60
  builder = AudiobookBuilder()
61
  audio_fp = await builder.run(text=text, generate_effects=generate_effects)
62
  return audio_fp, ""
@@ -67,34 +71,58 @@ def refresh():
67
 
68
 
69
  with gr.Blocks(title="Audiobooks Generation") as ui:
70
- gr.Markdown("# Audiobooks Generation")
71
 
72
  with gr.Row(variant="panel"):
73
- text_input = gr.Textbox(label="Enter the book text", lines=20)
74
- # Add a file upload field for .txt and .pdf files
75
  file_input = gr.File(
76
- label="Upload a text file or PDF", file_types=[".txt", ".pdf"]
 
 
77
  )
78
 
79
- with gr.Row(variant="panel"):
80
- audio_output = gr.Audio(label="Generated audio", type="filepath")
81
- error_output = gr.Textbox(
82
- label="Error Messages", interactive=False, visible=False
83
- ) # Initially hidden
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
- effects_generation_checkbox = gr.Checkbox(label="Generate background effects")
 
 
 
 
86
 
87
- submit_button = gr.Button("Submit")
88
  submit_button.click(
89
  fn=respond,
90
- inputs=[text_input, file_input, effects_generation_checkbox], # Include the uploaded file as an input
 
 
 
 
91
  outputs=[
92
  audio_output,
93
  error_output,
94
  ], # Include the audio output and error message output
95
  )
96
-
97
- refresh_button = gr.Button("Refresh")
98
  refresh_button.click(
99
  fn=refresh,
100
  inputs=[],
 
9
  load_dotenv()
10
 
11
  from src.builder import AudiobookBuilder
12
+ from src.config import logger, FILE_SIZE_MAX, MAX_TEXT_LEN, DESCRIPTION
13
+ from data import samples_to_split as samples
14
 
15
 
16
  def get_auth_params():
 
58
  logger.exception(e)
59
  return (None, str(e))
60
 
61
+ if len(text) > MAX_TEXT_LEN:
62
+ raise ValueError(len(text)) # TODO
63
+
64
  builder = AudiobookBuilder()
65
  audio_fp = await builder.run(text=text, generate_effects=generate_effects)
66
  return audio_fp, ""
 
71
 
72
 
73
  with gr.Blocks(title="Audiobooks Generation") as ui:
74
+ gr.Markdown(DESCRIPTION)
75
 
76
  with gr.Row(variant="panel"):
77
+ text_input = gr.Textbox(label="Enter the book text here", lines=20)
 
78
  file_input = gr.File(
79
+ label="Upload a text file or PDF",
80
+ file_types=[".txt", ".pdf"],
81
+ visible=False,
82
  )
83
 
84
+ examples = gr.Examples(
85
+ examples=[
86
+ [samples.GATSBY_1],
87
+ [samples.GATSBY_2],
88
+ [samples.WONDERFUL_CHRISTMAS_1],
89
+ [samples.WONDERFUL_CHRISTMAS_2],
90
+ ],
91
+ inputs=text_input,
92
+ label="Sample Inputs",
93
+ example_labels=[
94
+ "Gatsby 1",
95
+ "Gatsby 2",
96
+ "Wonderful Christmas 1",
97
+ "Wonderful Christmas 2",
98
+ ],
99
+ )
100
+
101
+ audio_output = gr.Audio(
102
+ label='Generated audio. Please wait for the waveform to appear, before hitting "Play"',
103
+ type="filepath",
104
+ )
105
+ # error output is hidden initially
106
+ error_output = gr.Textbox(label="Error Message", interactive=False, visible=False)
107
 
108
+ effects_generation_checkbox = gr.Checkbox(label="Add background effects")
109
+
110
+ with gr.Row(variant="panel"):
111
+ submit_button = gr.Button("Generate the audiobook", variant="primary")
112
+ refresh_button = gr.Button("Refresh", variant="secondary")
113
 
 
114
  submit_button.click(
115
  fn=respond,
116
+ inputs=[
117
+ text_input,
118
+ file_input,
119
+ effects_generation_checkbox,
120
+ ], # Include the uploaded file as an input
121
  outputs=[
122
  audio_output,
123
  error_output,
124
  ], # Include the audio output and error message output
125
  )
 
 
126
  refresh_button.click(
127
  fn=refresh,
128
  inputs=[],
data/samples_to_split.py CHANGED
@@ -157,17 +157,17 @@ Frank! why in the world don’t you come to dinner? There is a gentleman
157
  at table who came to see papa on business, and I ran away after the
158
  soup—I couldn’t eat my dinner one bit, without you.”
159
 
160
- “You’ll _have_ to, I reckon,” returned Frank; “a poor fellow, like me,
161
  who has to hoe corn all day, can’t stop to eat.”
162
 
163
  “O, Frank Hallock! _for shame!_” cried Kate, putting down her indignant
164
  foot without being able to make noise enough about it to disturb an
165
  earthworm.
166
 
167
- “It’s _true_,” responded Frank, pitching into the next hill with all his
168
  might.
169
 
170
- “It is _not_ true,” cried Kate; “and if just running off to _look_ at
171
  the circus pass by makes you say such things, I am glad you can’t go to
172
  see it.”\
173
  """
 
157
  at table who came to see papa on business, and I ran away after the
158
  soup—I couldn’t eat my dinner one bit, without you.”
159
 
160
+ “You’ll HAVE to, I reckon,” returned Frank; “a poor fellow, like me,
161
  who has to hoe corn all day, can’t stop to eat.”
162
 
163
  “O, Frank Hallock! _for shame!_” cried Kate, putting down her indignant
164
  foot without being able to make noise enough about it to disturb an
165
  earthworm.
166
 
167
+ “It’s TRUE,” responded Frank, pitching into the next hill with all his
168
  might.
169
 
170
+ “It is NOT true,” cried Kate; “and if just running off to LOOK at
171
  the circus pass by makes you say such things, I am glad you can’t go to
172
  see it.”\
173
  """
pg.ipynb CHANGED
@@ -80,13 +80,13 @@
80
  "name": "stdout",
81
  "output_type": "stream",
82
  "text": [
83
- "(468, 14)\n"
84
  ]
85
  }
86
  ],
87
  "source": [
88
  "# df = pd.read_csv('data/11labs_tts_voices.csv')\n",
89
- "df = pd.read_csv('data/11labs_available_tts_voices.csv')\n",
90
  "df[\"age\"] = df[\"age\"].str.replace(\" \", \"_\").str.replace(\"-\", \"_\")\n",
91
  "print(df.shape)"
92
  ]
@@ -99,9 +99,9 @@
99
  {
100
  "data": {
101
  "text/plain": [
102
- "Index(['voice_id', 'name', 'preview_url', 'owner_id', 'permission_on_resource',\n",
103
- " 'is_legacy', 'is_mixed', 'accent', 'description', 'age', 'gender',\n",
104
- " 'category', 'language', 'descriptive'],\n",
105
  " dtype='object')"
106
  ]
107
  },
@@ -123,9 +123,8 @@
123
  "data": {
124
  "text/plain": [
125
  "language\n",
126
- "NaN 264\n",
127
- "en 203\n",
128
- "romanian 1\n",
129
  "Name: count, dtype: int64"
130
  ]
131
  },
@@ -147,10 +146,8 @@
147
  "data": {
148
  "text/plain": [
149
  "gender\n",
150
- "female 231\n",
151
- "male 230\n",
152
- "neutral 6\n",
153
- "non-binary 1\n",
154
  "Name: count, dtype: int64"
155
  ]
156
  },
@@ -172,10 +169,9 @@
172
  "data": {
173
  "text/plain": [
174
  "age\n",
175
- "middle_aged 183\n",
176
- "young 143\n",
177
- "old 140\n",
178
- "NaN 2\n",
179
  "Name: count, dtype: int64"
180
  ]
181
  },
@@ -190,7 +186,7 @@
190
  },
191
  {
192
  "cell_type": "code",
193
- "execution_count": 14,
194
  "metadata": {},
195
  "outputs": [
196
  {
@@ -216,43 +212,133 @@
216
  " <th>gender</th>\n",
217
  " <th>female</th>\n",
218
  " <th>male</th>\n",
219
- " <th>neutral</th>\n",
220
- " <th>non-binary</th>\n",
221
  " </tr>\n",
222
  " <tr>\n",
223
  " <th>age</th>\n",
224
  " <th></th>\n",
225
  " <th></th>\n",
226
- " <th></th>\n",
227
- " <th></th>\n",
228
  " </tr>\n",
229
  " </thead>\n",
230
  " <tbody>\n",
231
  " <tr>\n",
232
  " <th>middle_aged</th>\n",
233
- " <td>48</td>\n",
234
- " <td>130</td>\n",
235
  " <td>4</td>\n",
236
- " <td>1</td>\n",
237
  " </tr>\n",
238
  " <tr>\n",
239
  " <th>old</th>\n",
240
- " <td>100</td>\n",
241
- " <td>39</td>\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  " <td>1</td>\n",
 
 
 
243
  " <td>0</td>\n",
 
 
244
  " </tr>\n",
245
  " <tr>\n",
246
- " <th>young</th>\n",
247
- " <td>83</td>\n",
248
- " <td>59</td>\n",
249
  " <td>1</td>\n",
250
  " <td>0</td>\n",
 
251
  " </tr>\n",
252
  " <tr>\n",
253
- " <th>NaN</th>\n",
 
 
254
  " <td>0</td>\n",
 
 
 
 
 
 
 
 
 
 
 
 
255
  " <td>2</td>\n",
 
 
 
 
 
256
  " <td>0</td>\n",
257
  " <td>0</td>\n",
258
  " </tr>\n",
@@ -261,26 +347,31 @@
261
  "</div>"
262
  ],
263
  "text/plain": [
264
- "gender female male neutral non-binary\n",
265
- "age \n",
266
- "middle_aged 48 130 4 1\n",
267
- "old 100 39 1 0\n",
268
- "young 83 59 1 0\n",
269
- "NaN 0 2 0 0"
 
 
 
270
  ]
271
  },
272
- "execution_count": 14,
273
  "metadata": {},
274
  "output_type": "execute_result"
275
  }
276
  ],
277
  "source": [
278
- "df.groupby(['age', 'gender'], dropna=False)['voice_id'].count().unstack(fill_value=0)"
 
 
279
  ]
280
  },
281
  {
282
  "cell_type": "code",
283
- "execution_count": 13,
284
  "metadata": {},
285
  "outputs": [
286
  {
@@ -303,17 +394,13 @@
303
  "<table border=\"1\" class=\"dataframe\">\n",
304
  " <thead>\n",
305
  " <tr style=\"text-align: right;\">\n",
306
- " <th></th>\n",
307
- " <th>gender</th>\n",
308
- " <th>female</th>\n",
309
- " <th>male</th>\n",
310
- " <th>neutral</th>\n",
311
- " <th>non-binary</th>\n",
312
  " </tr>\n",
313
  " <tr>\n",
314
- " <th>language</th>\n",
315
- " <th>age</th>\n",
316
- " <th></th>\n",
317
  " <th></th>\n",
318
  " <th></th>\n",
319
  " <th></th>\n",
@@ -321,82 +408,197 @@
321
  " </thead>\n",
322
  " <tbody>\n",
323
  " <tr>\n",
324
- " <th rowspan=\"4\" valign=\"top\">en</th>\n",
325
- " <th>middle_aged</th>\n",
326
- " <td>30</td>\n",
327
- " <td>91</td>\n",
328
  " <td>2</td>\n",
329
- " <td>0</td>\n",
 
330
  " </tr>\n",
331
  " <tr>\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  " <th>old</th>\n",
 
 
 
 
 
 
 
 
 
 
 
 
333
  " <td>3</td>\n",
334
  " <td>3</td>\n",
335
- " <td>0</td>\n",
336
- " <td>0</td>\n",
337
  " </tr>\n",
338
  " <tr>\n",
339
- " <th>young</th>\n",
340
- " <td>34</td>\n",
341
- " <td>38</td>\n",
342
- " <td>0</td>\n",
343
- " <td>0</td>\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
  " </tr>\n",
345
  " <tr>\n",
346
- " <th>NaN</th>\n",
347
- " <td>0</td>\n",
348
- " <td>2</td>\n",
349
- " <td>0</td>\n",
350
- " <td>0</td>\n",
351
  " </tr>\n",
 
 
352
  " <tr>\n",
353
- " <th>romanian</th>\n",
354
- " <th>old</th>\n",
355
  " <td>1</td>\n",
356
- " <td>0</td>\n",
357
- " <td>0</td>\n",
358
- " <td>0</td>\n",
359
  " </tr>\n",
360
  " <tr>\n",
361
- " <th rowspan=\"3\" valign=\"top\">NaN</th>\n",
362
- " <th>middle_aged</th>\n",
363
- " <td>18</td>\n",
364
- " <td>39</td>\n",
365
  " <td>2</td>\n",
366
  " <td>1</td>\n",
367
  " </tr>\n",
368
  " <tr>\n",
 
 
 
 
 
 
369
  " <th>old</th>\n",
370
- " <td>96</td>\n",
371
- " <td>36</td>\n",
372
- " <td>1</td>\n",
373
- " <td>0</td>\n",
374
  " </tr>\n",
375
  " <tr>\n",
376
  " <th>young</th>\n",
377
- " <td>49</td>\n",
378
- " <td>21</td>\n",
379
- " <td>1</td>\n",
380
- " <td>0</td>\n",
381
  " </tr>\n",
382
  " </tbody>\n",
383
  "</table>\n",
384
  "</div>"
385
  ],
386
  "text/plain": [
387
- "gender female male neutral non-binary\n",
388
- "language age \n",
389
- "en middle_aged 30 91 2 0\n",
390
- " old 3 3 0 0\n",
391
- " young 34 38 0 0\n",
392
- " NaN 0 2 0 0\n",
393
- "romanian old 1 0 0 0\n",
394
- "NaN middle_aged 18 39 2 1\n",
395
- " old 96 36 1 0\n",
396
- " young 49 21 1 0"
397
  ]
398
  },
399
- "execution_count": 13,
400
  "metadata": {},
401
  "output_type": "execute_result"
402
  }
@@ -407,54 +609,33 @@
407
  },
408
  {
409
  "cell_type": "code",
410
- "execution_count": 15,
411
  "metadata": {},
412
  "outputs": [
413
  {
414
  "data": {
415
  "text/plain": [
416
  "descriptive\n",
417
- "confident 64\n",
418
- "calm 44\n",
419
- "casual 34\n",
420
- "pleasant 31\n",
421
- "deep 28\n",
422
- "NaN 26\n",
423
- "professional 26\n",
424
- "upbeat 22\n",
425
- "wise 20\n",
426
- "formal 17\n",
427
- "intense 13\n",
428
- "serious 13\n",
429
- "meditative 11\n",
430
- "modulated 11\n",
431
- "excited 10\n",
432
- "husky 10\n",
433
- "mature 8\n",
434
- "classy 8\n",
435
- "chill 7\n",
436
- "neutral 7\n",
437
- "crisp 6\n",
438
- "gentle 6\n",
439
- "childish 6\n",
440
- "hyped 6\n",
441
- "cute 5\n",
442
- "sassy 4\n",
443
- "soft 4\n",
444
- "rough 3\n",
445
- "grumpy 3\n",
446
- "whispery 3\n",
447
- "robotic 3\n",
448
- "relaxed 3\n",
449
- "raspy 2\n",
450
- "cheeky 1\n",
451
- "sad 1\n",
452
- "anxious 1\n",
453
- "motivational 1\n",
454
  "Name: count, dtype: int64"
455
  ]
456
  },
457
- "execution_count": 15,
458
  "metadata": {},
459
  "output_type": "execute_result"
460
  }
@@ -463,6 +644,13 @@
463
  "df['descriptive'].value_counts(dropna=False)"
464
  ]
465
  },
 
 
 
 
 
 
 
466
  {
467
  "cell_type": "code",
468
  "execution_count": 17,
 
80
  "name": "stdout",
81
  "output_type": "stream",
82
  "text": [
83
+ "(34, 15)\n"
84
  ]
85
  }
86
  ],
87
  "source": [
88
  "# df = pd.read_csv('data/11labs_tts_voices.csv')\n",
89
+ "df = pd.read_csv('data/11labs_available_tts_voices.reviewed.csv')\n",
90
  "df[\"age\"] = df[\"age\"].str.replace(\" \", \"_\").str.replace(\"-\", \"_\")\n",
91
  "print(df.shape)"
92
  ]
 
99
  {
100
  "data": {
101
  "text/plain": [
102
+ "Index(['voice_id', 'name', 'preview_url', 'manual_quality_review', 'owner_id',\n",
103
+ " 'permission_on_resource', 'is_legacy', 'is_mixed', 'accent',\n",
104
+ " 'description', 'age', 'gender', 'category', 'language', 'descriptive'],\n",
105
  " dtype='object')"
106
  ]
107
  },
 
123
  "data": {
124
  "text/plain": [
125
  "language\n",
126
+ "NaN 25\n",
127
+ "en 9\n",
 
128
  "Name: count, dtype: int64"
129
  ]
130
  },
 
146
  "data": {
147
  "text/plain": [
148
  "gender\n",
149
+ "female 17\n",
150
+ "male 17\n",
 
 
151
  "Name: count, dtype: int64"
152
  ]
153
  },
 
169
  "data": {
170
  "text/plain": [
171
  "age\n",
172
+ "middle_aged 13\n",
173
+ "young 11\n",
174
+ "old 10\n",
 
175
  "Name: count, dtype: int64"
176
  ]
177
  },
 
186
  },
187
  {
188
  "cell_type": "code",
189
+ "execution_count": 9,
190
  "metadata": {},
191
  "outputs": [
192
  {
 
212
  " <th>gender</th>\n",
213
  " <th>female</th>\n",
214
  " <th>male</th>\n",
 
 
215
  " </tr>\n",
216
  " <tr>\n",
217
  " <th>age</th>\n",
218
  " <th></th>\n",
219
  " <th></th>\n",
 
 
220
  " </tr>\n",
221
  " </thead>\n",
222
  " <tbody>\n",
223
  " <tr>\n",
224
  " <th>middle_aged</th>\n",
 
 
225
  " <td>4</td>\n",
226
+ " <td>9</td>\n",
227
  " </tr>\n",
228
  " <tr>\n",
229
  " <th>old</th>\n",
230
+ " <td>5</td>\n",
231
+ " <td>5</td>\n",
232
+ " </tr>\n",
233
+ " <tr>\n",
234
+ " <th>young</th>\n",
235
+ " <td>8</td>\n",
236
+ " <td>3</td>\n",
237
+ " </tr>\n",
238
+ " </tbody>\n",
239
+ "</table>\n",
240
+ "</div>"
241
+ ],
242
+ "text/plain": [
243
+ "gender female male\n",
244
+ "age \n",
245
+ "middle_aged 4 9\n",
246
+ "old 5 5\n",
247
+ "young 8 3"
248
+ ]
249
+ },
250
+ "execution_count": 9,
251
+ "metadata": {},
252
+ "output_type": "execute_result"
253
+ }
254
+ ],
255
+ "source": [
256
+ "df.groupby(['age', 'gender'], dropna=False)['voice_id'].count().unstack(fill_value=0)"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "execution_count": 13,
262
+ "metadata": {},
263
+ "outputs": [
264
+ {
265
+ "data": {
266
+ "text/html": [
267
+ "<div>\n",
268
+ "<style scoped>\n",
269
+ " .dataframe tbody tr th:only-of-type {\n",
270
+ " vertical-align: middle;\n",
271
+ " }\n",
272
+ "\n",
273
+ " .dataframe tbody tr th {\n",
274
+ " vertical-align: top;\n",
275
+ " }\n",
276
+ "\n",
277
+ " .dataframe thead th {\n",
278
+ " text-align: right;\n",
279
+ " }\n",
280
+ "</style>\n",
281
+ "<table border=\"1\" class=\"dataframe\">\n",
282
+ " <thead>\n",
283
+ " <tr style=\"text-align: right;\">\n",
284
+ " <th></th>\n",
285
+ " <th>age</th>\n",
286
+ " <th>middle_aged</th>\n",
287
+ " <th>old</th>\n",
288
+ " <th>young</th>\n",
289
+ " </tr>\n",
290
+ " <tr>\n",
291
+ " <th>manual_quality_review</th>\n",
292
+ " <th>gender</th>\n",
293
+ " <th></th>\n",
294
+ " <th></th>\n",
295
+ " <th></th>\n",
296
+ " </tr>\n",
297
+ " </thead>\n",
298
+ " <tbody>\n",
299
+ " <tr>\n",
300
+ " <th rowspan=\"2\" valign=\"top\">bad</th>\n",
301
+ " <th>female</th>\n",
302
+ " <td>1</td>\n",
303
+ " <td>2</td>\n",
304
  " <td>1</td>\n",
305
+ " </tr>\n",
306
+ " <tr>\n",
307
+ " <th>male</th>\n",
308
  " <td>0</td>\n",
309
+ " <td>3</td>\n",
310
+ " <td>1</td>\n",
311
  " </tr>\n",
312
  " <tr>\n",
313
+ " <th rowspan=\"2\" valign=\"top\">medium</th>\n",
314
+ " <th>female</th>\n",
 
315
  " <td>1</td>\n",
316
  " <td>0</td>\n",
317
+ " <td>0</td>\n",
318
  " </tr>\n",
319
  " <tr>\n",
320
+ " <th>male</th>\n",
321
+ " <td>2</td>\n",
322
+ " <td>1</td>\n",
323
  " <td>0</td>\n",
324
+ " </tr>\n",
325
+ " <tr>\n",
326
+ " <th rowspan=\"2\" valign=\"top\">ok</th>\n",
327
+ " <th>female</th>\n",
328
+ " <td>2</td>\n",
329
+ " <td>3</td>\n",
330
+ " <td>7</td>\n",
331
+ " </tr>\n",
332
+ " <tr>\n",
333
+ " <th>male</th>\n",
334
+ " <td>6</td>\n",
335
+ " <td>1</td>\n",
336
  " <td>2</td>\n",
337
+ " </tr>\n",
338
+ " <tr>\n",
339
+ " <th>very bad</th>\n",
340
+ " <th>male</th>\n",
341
+ " <td>1</td>\n",
342
  " <td>0</td>\n",
343
  " <td>0</td>\n",
344
  " </tr>\n",
 
347
  "</div>"
348
  ],
349
  "text/plain": [
350
+ "age middle_aged old young\n",
351
+ "manual_quality_review gender \n",
352
+ "bad female 1 2 1\n",
353
+ " male 0 3 1\n",
354
+ "medium female 1 0 0\n",
355
+ " male 2 1 0\n",
356
+ "ok female 2 3 7\n",
357
+ " male 6 1 2\n",
358
+ "very bad male 1 0 0"
359
  ]
360
  },
361
+ "execution_count": 13,
362
  "metadata": {},
363
  "output_type": "execute_result"
364
  }
365
  ],
366
  "source": [
367
+ "df.groupby([\"manual_quality_review\", 'gender', \"age\"], dropna=False)[\n",
368
+ " \"voice_id\"\n",
369
+ "].count().unstack(fill_value=0)"
370
  ]
371
  },
372
  {
373
  "cell_type": "code",
374
+ "execution_count": 16,
375
  "metadata": {},
376
  "outputs": [
377
  {
 
394
  "<table border=\"1\" class=\"dataframe\">\n",
395
  " <thead>\n",
396
  " <tr style=\"text-align: right;\">\n",
397
+ " <th>age</th>\n",
398
+ " <th>middle_aged</th>\n",
399
+ " <th>old</th>\n",
400
+ " <th>young</th>\n",
 
 
401
  " </tr>\n",
402
  " <tr>\n",
403
+ " <th>gender</th>\n",
 
 
404
  " <th></th>\n",
405
  " <th></th>\n",
406
  " <th></th>\n",
 
408
  " </thead>\n",
409
  " <tbody>\n",
410
  " <tr>\n",
411
+ " <th>female</th>\n",
 
 
 
412
  " <td>2</td>\n",
413
+ " <td>3</td>\n",
414
+ " <td>7</td>\n",
415
  " </tr>\n",
416
  " <tr>\n",
417
+ " <th>male</th>\n",
418
+ " <td>6</td>\n",
419
+ " <td>1</td>\n",
420
+ " <td>2</td>\n",
421
+ " </tr>\n",
422
+ " </tbody>\n",
423
+ "</table>\n",
424
+ "</div>"
425
+ ],
426
+ "text/plain": [
427
+ "age middle_aged old young\n",
428
+ "gender \n",
429
+ "female 2 3 7\n",
430
+ "male 6 1 2"
431
+ ]
432
+ },
433
+ "execution_count": 16,
434
+ "metadata": {},
435
+ "output_type": "execute_result"
436
+ }
437
+ ],
438
+ "source": [
439
+ "df[df['manual_quality_review'].isin(['ok'])].groupby(['gender', \"age\"], dropna=False)[\n",
440
+ " \"voice_id\"\n",
441
+ "].count().unstack(fill_value=0)"
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "code",
446
+ "execution_count": null,
447
+ "metadata": {},
448
+ "outputs": [
449
+ {
450
+ "data": {
451
+ "text/html": [
452
+ "<div>\n",
453
+ "<style scoped>\n",
454
+ " .dataframe tbody tr th:only-of-type {\n",
455
+ " vertical-align: middle;\n",
456
+ " }\n",
457
+ "\n",
458
+ " .dataframe tbody tr th {\n",
459
+ " vertical-align: top;\n",
460
+ " }\n",
461
+ "\n",
462
+ " .dataframe thead th {\n",
463
+ " text-align: right;\n",
464
+ " }\n",
465
+ "</style>\n",
466
+ "<table border=\"1\" class=\"dataframe\">\n",
467
+ " <thead>\n",
468
+ " <tr style=\"text-align: right;\">\n",
469
+ " <th>age</th>\n",
470
+ " <th>middle_aged</th>\n",
471
  " <th>old</th>\n",
472
+ " <th>young</th>\n",
473
+ " </tr>\n",
474
+ " <tr>\n",
475
+ " <th>gender</th>\n",
476
+ " <th></th>\n",
477
+ " <th></th>\n",
478
+ " <th></th>\n",
479
+ " </tr>\n",
480
+ " </thead>\n",
481
+ " <tbody>\n",
482
+ " <tr>\n",
483
+ " <th>female</th>\n",
484
  " <td>3</td>\n",
485
  " <td>3</td>\n",
486
+ " <td>7</td>\n",
 
487
  " </tr>\n",
488
  " <tr>\n",
489
+ " <th>male</th>\n",
490
+ " <td>8</td>\n",
491
+ " <td>2</td>\n",
492
+ " <td>2</td>\n",
493
+ " </tr>\n",
494
+ " </tbody>\n",
495
+ "</table>\n",
496
+ "</div>"
497
+ ],
498
+ "text/plain": [
499
+ "age middle_aged old young\n",
500
+ "gender \n",
501
+ "female 3 3 7\n",
502
+ "male 8 2 2"
503
+ ]
504
+ },
505
+ "metadata": {},
506
+ "output_type": "display_data"
507
+ }
508
+ ],
509
+ "source": [
510
+ "df[df['manual_quality_review'].isin(['ok', 'medium'])].groupby(['gender', \"age\"], dropna=False)[\n",
511
+ " \"voice_id\"\n",
512
+ "].count().unstack(fill_value=0)"
513
+ ]
514
+ },
515
+ {
516
+ "cell_type": "code",
517
+ "execution_count": null,
518
+ "metadata": {},
519
+ "outputs": [],
520
+ "source": []
521
+ },
522
+ {
523
+ "cell_type": "code",
524
+ "execution_count": 10,
525
+ "metadata": {},
526
+ "outputs": [
527
+ {
528
+ "data": {
529
+ "text/html": [
530
+ "<div>\n",
531
+ "<style scoped>\n",
532
+ " .dataframe tbody tr th:only-of-type {\n",
533
+ " vertical-align: middle;\n",
534
+ " }\n",
535
+ "\n",
536
+ " .dataframe tbody tr th {\n",
537
+ " vertical-align: top;\n",
538
+ " }\n",
539
+ "\n",
540
+ " .dataframe thead th {\n",
541
+ " text-align: right;\n",
542
+ " }\n",
543
+ "</style>\n",
544
+ "<table border=\"1\" class=\"dataframe\">\n",
545
+ " <thead>\n",
546
+ " <tr style=\"text-align: right;\">\n",
547
+ " <th></th>\n",
548
+ " <th>gender</th>\n",
549
+ " <th>female</th>\n",
550
+ " <th>male</th>\n",
551
  " </tr>\n",
552
  " <tr>\n",
553
+ " <th>language</th>\n",
554
+ " <th>age</th>\n",
555
+ " <th></th>\n",
556
+ " <th></th>\n",
 
557
  " </tr>\n",
558
+ " </thead>\n",
559
+ " <tbody>\n",
560
  " <tr>\n",
561
+ " <th rowspan=\"2\" valign=\"top\">en</th>\n",
562
+ " <th>middle_aged</th>\n",
563
  " <td>1</td>\n",
564
+ " <td>5</td>\n",
 
 
565
  " </tr>\n",
566
  " <tr>\n",
567
+ " <th>young</th>\n",
 
 
 
568
  " <td>2</td>\n",
569
  " <td>1</td>\n",
570
  " </tr>\n",
571
  " <tr>\n",
572
+ " <th rowspan=\"3\" valign=\"top\">NaN</th>\n",
573
+ " <th>middle_aged</th>\n",
574
+ " <td>3</td>\n",
575
+ " <td>4</td>\n",
576
+ " </tr>\n",
577
+ " <tr>\n",
578
  " <th>old</th>\n",
579
+ " <td>5</td>\n",
580
+ " <td>5</td>\n",
 
 
581
  " </tr>\n",
582
  " <tr>\n",
583
  " <th>young</th>\n",
584
+ " <td>6</td>\n",
585
+ " <td>2</td>\n",
 
 
586
  " </tr>\n",
587
  " </tbody>\n",
588
  "</table>\n",
589
  "</div>"
590
  ],
591
  "text/plain": [
592
+ "gender female male\n",
593
+ "language age \n",
594
+ "en middle_aged 1 5\n",
595
+ " young 2 1\n",
596
+ "NaN middle_aged 3 4\n",
597
+ " old 5 5\n",
598
+ " young 6 2"
 
 
 
599
  ]
600
  },
601
+ "execution_count": 10,
602
  "metadata": {},
603
  "output_type": "execute_result"
604
  }
 
609
  },
610
  {
611
  "cell_type": "code",
612
+ "execution_count": 11,
613
  "metadata": {},
614
  "outputs": [
615
  {
616
  "data": {
617
  "text/plain": [
618
  "descriptive\n",
619
+ "pleasant 6\n",
620
+ "casual 5\n",
621
+ "confident 3\n",
622
+ "calm 3\n",
623
+ "NaN 3\n",
624
+ "intense 3\n",
625
+ "chill 2\n",
626
+ "formal 1\n",
627
+ "serious 1\n",
628
+ "mature 1\n",
629
+ "cute 1\n",
630
+ "crisp 1\n",
631
+ "upbeat 1\n",
632
+ "professional 1\n",
633
+ "excited 1\n",
634
+ "wise 1\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635
  "Name: count, dtype: int64"
636
  ]
637
  },
638
+ "execution_count": 11,
639
  "metadata": {},
640
  "output_type": "execute_result"
641
  }
 
644
  "df['descriptive'].value_counts(dropna=False)"
645
  ]
646
  },
647
+ {
648
+ "cell_type": "code",
649
+ "execution_count": null,
650
+ "metadata": {},
651
+ "outputs": [],
652
+ "source": []
653
+ },
654
  {
655
  "cell_type": "code",
656
  "execution_count": 17,
src/audio_generators.py CHANGED
@@ -9,8 +9,8 @@ from langchain_community.callbacks import get_openai_callback
9
  from pydub import AudioSegment
10
 
11
  from src.lc_callbacks import LCMessageLoggerAsync
12
- from src.tts import tts_astream_consumed, sound_generation_astream
13
- from src.utils import auto_retry, consume_aiter
14
  from src.emotions.generation import (
15
  EffectGeneratorAsync,
16
  TextPreparationForTTSTaskOutput,
@@ -76,37 +76,38 @@ class AudioGeneratorWithEffects:
76
  """Main method to generate the audiobook with TTS, emotion, and sound effects."""
77
  num_lines = len(text_split.phrases)
78
  lines_for_sound_effect = self._select_lines_for_sound_effect(
79
- num_lines, fraction=float(0.2 * generate_effects),
 
80
  )
81
  logger.info(f"{generate_effects = }, {lines_for_sound_effect = }")
82
 
83
- modified_texts, sound_emotion_results = await self._prepare_text_for_tts(
84
  text_split, lines_for_sound_effect
85
  )
86
 
87
  tts_results, self.temp_files = await self._generate_tts_audio(
88
- text_split, modified_texts, character_to_voice
89
  )
90
 
91
- # Step 3: Add sound effects to selected lines
92
  audio_chunks = await self._add_sound_effects(
93
- tts_results, lines_for_sound_effect, sound_emotion_results, self.temp_files
94
  )
95
 
96
- # Step 4: Merge audio files
97
  normalized_audio_chunks = self._normalize_audio_chunks(
98
  audio_chunks, self.temp_files
99
  )
 
100
  final_output = self._merge_audio_files(
101
  normalized_audio_chunks, save_path=out_path
102
  )
103
 
104
- # Clean up temporary files
105
  self._cleanup_temp_files(self.temp_files)
106
 
107
  return final_output
108
 
109
- def _select_lines_for_sound_effect(self, num_lines: int, fraction: float) -> list[int]:
 
 
110
  """Select % of the lines randomly for sound effect generation."""
111
  return random.sample(range(num_lines), k=int(fraction * num_lines))
112
 
@@ -159,7 +160,7 @@ class AudioGeneratorWithEffects:
159
  async def _generate_tts_audio(
160
  self,
161
  text_split: SplitTextOutput,
162
- modified_texts: list[dict],
163
  character_to_voice: dict[str, str],
164
  ) -> tuple[list[str], list[str]]:
165
  """Generate TTS audio for modified text."""
@@ -174,20 +175,18 @@ class AudioGeneratorWithEffects:
174
  # bytes_ = await consume_aiter(iter_)
175
  return bytes_
176
 
177
- for idx, (modified_text, character_phrase) in enumerate(
178
- zip(modified_texts, text_split.phrases)
179
  ):
180
  voice_id = character_to_voice[character_phrase.character]
181
 
182
- # Use the semaphore-protected TTS function
183
  task = tts_astream_with_semaphore(
184
  voice_id=voice_id,
185
- text=modified_text["modified_text"],
186
- params=modified_text["params"],
187
  )
188
  tasks_for_tts.append(task)
189
 
190
- # Gather all TTS results
191
  tts_results = await asyncio.gather(*tasks_for_tts)
192
 
193
  # Save the results to temporary files
@@ -206,38 +205,65 @@ class AudioGeneratorWithEffects:
206
  self,
207
  tts_audio_files: list[str],
208
  lines_for_sound_effect: list[int],
209
- sound_emotion_results: list[dict],
210
  temp_files: list[str],
211
  ) -> list[str]:
212
  """Add sound effects to the selected lines."""
213
- audio_chunks = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  for idx, tts_filename in enumerate(tts_audio_files):
215
- # If the line has sound emotion data, generate sound effect and overlay
216
- if idx in lines_for_sound_effect:
217
- # Get next sound effect data
218
- sound_effect_data = sound_emotion_results.pop(0)
219
- sound_effect_filename = f"sound_effect_{idx}.wav"
220
 
221
- # Generate sound effect asynchronously
222
- sound_result = await consume_aiter(
223
- sound_generation_astream(sound_effect_data)
224
- )
225
- with open(sound_effect_filename, "wb") as ab:
226
- for chunk in sound_result:
227
- ab.write(chunk)
228
-
229
- # Add sound effect overlay
230
- output_filename = add_overlay_for_audio(
231
- main_audio_filename=tts_filename,
232
- sound_effect_filename=sound_effect_filename,
233
- cycling_effect=True,
234
- decrease_effect_volume=5,
235
  )
236
- audio_chunks.append(output_filename)
237
- temp_files.append(sound_effect_filename) # Track temp files
238
- temp_files.append(output_filename)
239
  else:
240
- audio_chunks.append(tts_filename)
 
 
 
 
 
 
 
 
 
 
 
 
241
 
242
  return audio_chunks
243
 
 
9
  from pydub import AudioSegment
10
 
11
  from src.lc_callbacks import LCMessageLoggerAsync
12
+ from src.tts import tts_astream_consumed, sound_generation_consumed
13
+ from src.utils import consume_aiter
14
  from src.emotions.generation import (
15
  EffectGeneratorAsync,
16
  TextPreparationForTTSTaskOutput,
 
76
  """Main method to generate the audiobook with TTS, emotion, and sound effects."""
77
  num_lines = len(text_split.phrases)
78
  lines_for_sound_effect = self._select_lines_for_sound_effect(
79
+ num_lines,
80
+ fraction=float(0.2 * generate_effects),
81
  )
82
  logger.info(f"{generate_effects = }, {lines_for_sound_effect = }")
83
 
84
+ data_for_tts, data_for_sound_effects = await self._prepare_text_for_tts(
85
  text_split, lines_for_sound_effect
86
  )
87
 
88
  tts_results, self.temp_files = await self._generate_tts_audio(
89
+ text_split, data_for_tts, character_to_voice
90
  )
91
 
 
92
  audio_chunks = await self._add_sound_effects(
93
+ tts_results, lines_for_sound_effect, data_for_sound_effects, self.temp_files
94
  )
95
 
 
96
  normalized_audio_chunks = self._normalize_audio_chunks(
97
  audio_chunks, self.temp_files
98
  )
99
+
100
  final_output = self._merge_audio_files(
101
  normalized_audio_chunks, save_path=out_path
102
  )
103
 
 
104
  self._cleanup_temp_files(self.temp_files)
105
 
106
  return final_output
107
 
108
+ def _select_lines_for_sound_effect(
109
+ self, num_lines: int, fraction: float
110
+ ) -> list[int]:
111
  """Select % of the lines randomly for sound effect generation."""
112
  return random.sample(range(num_lines), k=int(fraction * num_lines))
113
 
 
160
  async def _generate_tts_audio(
161
  self,
162
  text_split: SplitTextOutput,
163
+ data_for_tts: list[dict],
164
  character_to_voice: dict[str, str],
165
  ) -> tuple[list[str], list[str]]:
166
  """Generate TTS audio for modified text."""
 
175
  # bytes_ = await consume_aiter(iter_)
176
  return bytes_
177
 
178
+ for idx, (data_item, character_phrase) in enumerate(
179
+ zip(data_for_tts, text_split.phrases)
180
  ):
181
  voice_id = character_to_voice[character_phrase.character]
182
 
 
183
  task = tts_astream_with_semaphore(
184
  voice_id=voice_id,
185
+ text=data_item["modified_text"],
186
+ params=data_item["params"],
187
  )
188
  tasks_for_tts.append(task)
189
 
 
190
  tts_results = await asyncio.gather(*tasks_for_tts)
191
 
192
  # Save the results to temporary files
 
205
  self,
206
  tts_audio_files: list[str],
207
  lines_for_sound_effect: list[int],
208
+ data_for_sound_effects: list[dict],
209
  temp_files: list[str],
210
  ) -> list[str]:
211
  """Add sound effects to the selected lines."""
212
+
213
+ semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)
214
+
215
+ async def _process_single_phrase(
216
+ tts_filename: str,
217
+ sound_effect_data: dict | None,
218
+ sound_effect_filename: str,
219
+ ):
220
+ if sound_effect_data is None:
221
+ return (tts_filename, [])
222
+
223
+ async with semaphore:
224
+ sound_result = await sound_generation_consumed(sound_effect_data)
225
+
226
+ # save to file
227
+ with open(sound_effect_filename, "wb") as ab:
228
+ for chunk in sound_result:
229
+ ab.write(chunk)
230
+
231
+ # overlay sound effect on TTS audio
232
+ tts_with_effects_filename = add_overlay_for_audio(
233
+ main_audio_filename=tts_filename,
234
+ sound_effect_filename=sound_effect_filename,
235
+ cycling_effect=True,
236
+ decrease_effect_volume=5,
237
+ )
238
+ tmp_files = [sound_effect_filename, tts_with_effects_filename]
239
+ return (tts_with_effects_filename, tmp_files)
240
+
241
+ tasks = []
242
  for idx, tts_filename in enumerate(tts_audio_files):
243
+ sound_effect_filename = f"sound_effect_{idx}.wav"
 
 
 
 
244
 
245
+ if idx not in lines_for_sound_effect:
246
+ tasks.append(
247
+ _process_single_phrase(
248
+ tts_filename=tts_filename,
249
+ sound_effect_data=None,
250
+ sound_effect_filename=sound_effect_filename,
251
+ )
 
 
 
 
 
 
 
252
  )
 
 
 
253
  else:
254
+ sound_effect_data = data_for_sound_effects.pop(0)
255
+ tasks.append(
256
+ _process_single_phrase(
257
+ tts_filename=tts_filename,
258
+ sound_effect_data=sound_effect_data,
259
+ sound_effect_filename=sound_effect_filename,
260
+ )
261
+ )
262
+
263
+ outputs = await asyncio.gather(*tasks)
264
+ audio_chunks = [x[0] for x in outputs]
265
+ tmp_files_to_add = [item for x in outputs for item in x[1]]
266
+ temp_files.extend(tmp_files_to_add)
267
 
268
  return audio_chunks
269
 
src/config.py CHANGED
@@ -17,3 +17,20 @@ ELEVENLABS_MAX_PARALLEL = 15 # current limitation of available subscription
17
 
18
  # VOICES_CSV_FP = "data/11labs_available_tts_voices.csv"
19
  VOICES_CSV_FP = "data/11labs_available_tts_voices.reviewed.csv"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  # VOICES_CSV_FP = "data/11labs_available_tts_voices.csv"
19
  VOICES_CSV_FP = "data/11labs_available_tts_voices.reviewed.csv"
20
+
21
+ MAX_TEXT_LEN = 5000
22
+
23
+ DESCRIPTION = """\
24
+ # AI Audiobooks Generator
25
+
26
+ Create an audiobook from the input text automatically, using Gen-AI!
27
+
28
+ All you need to do is input the book text or select it from the provided Sample Inputs.
29
+
30
+ AI will do the rest:
31
+ - split text into characters
32
+ - assign each character a voice
33
+ - preprocess text to better convey emotions during Text-to-Speech
34
+ - (optionally) add sound effects to create an immersive atmosphere
35
+ - generate audiobook using Text-to-Speech model
36
+ """
src/tts.py CHANGED
@@ -39,7 +39,10 @@ async def tts_astream(
39
  style=params.get("style"),
40
  )
41
 
42
- logger.info(f"call to 11labs TTS endpoint with params: {params_all}")
 
 
 
43
  async_iter = ELEVEN_CLIENT_ASYNC.text_to_speech.convert(**params_all)
44
  async for chunk in async_iter:
45
  if chunk:
@@ -57,11 +60,23 @@ async def tts_astream_consumed(
57
  async def sound_generation_astream(
58
  sound_generation_data: dict,
59
  ) -> t.AsyncIterator[bytes]:
 
 
 
 
 
 
60
  async_iter = ELEVEN_CLIENT_ASYNC.text_to_sound_effects.convert(
61
- text=sound_generation_data["text"],
62
  duration_seconds=sound_generation_data["duration_seconds"],
63
  prompt_influence=sound_generation_data["prompt_influence"],
64
  )
65
  async for chunk in async_iter:
66
  if chunk:
67
  yield chunk
 
 
 
 
 
 
 
39
  style=params.get("style"),
40
  )
41
 
42
+ logger.info(
43
+ f"request to 11labs TTS endpoint with params {params_all} "
44
+ f'for the following text: "{text}"'
45
+ )
46
  async_iter = ELEVEN_CLIENT_ASYNC.text_to_speech.convert(**params_all)
47
  async for chunk in async_iter:
48
  if chunk:
 
60
  async def sound_generation_astream(
61
  sound_generation_data: dict,
62
  ) -> t.AsyncIterator[bytes]:
63
+ text = sound_generation_data.pop("text")
64
+ logger.info(
65
+ f"request to 11labs sound effect generation with params {sound_generation_data} "
66
+ f'for the following text: "{text}"'
67
+ )
68
+
69
  async_iter = ELEVEN_CLIENT_ASYNC.text_to_sound_effects.convert(
70
+ text=text,
71
  duration_seconds=sound_generation_data["duration_seconds"],
72
  prompt_influence=sound_generation_data["prompt_influence"],
73
  )
74
  async for chunk in async_iter:
75
  if chunk:
76
  yield chunk
77
+
78
+
79
+ @auto_retry
80
+ async def sound_generation_consumed(sound_generation_data: dict):
81
+ aiterator = sound_generation_astream(sound_generation_data=sound_generation_data)
82
+ return [x async for x in aiterator]