{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## initialize" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/ales/dev/repos/ai-audio-books\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/ales/dev/python-venvs/ai-audio-books/lib/python3.12/site-packages/IPython/core/magics/osm.py:417: UserWarning: This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.\n", " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n" ] } ], "source": [ "# `%cd ..` is relative: running it a second time in the same kernel would\n", "# climb above the repo root and break the `src` / `data` imports below.\n", "# Guard it with a flag so the directory change happens once per kernel session.\n", "if \"_CWD_MOVED_TO_REPO_ROOT\" not in globals():\n", "    %cd ..\n", "    _CWD_MOVED_TO_REPO_ROOT = True" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "from pprint import pprint\n", "\n", "import dotenv\n", "import pandas as pd\n", "from httpx import Timeout\n", "from pydantic import BaseModel\n", "from langchain_core.prompts import (\n", " ChatPromptTemplate,\n", " SystemMessagePromptTemplate,\n", " HumanMessagePromptTemplate,\n", ")\n", "from langchain_openai import ChatOpenAI\n", "from langchain_community.callbacks import get_openai_callback\n", "\n", "from IPython.display import Audio\n", "\n", "import data.samples_to_split as samples\n", "\n", "from src.lc_callbacks import LCMessageLoggerAsync\n", "from src.schemas import AudioOutputFormat, TTSParams, TTSTimestampsResponse, TTSTimestampsAlignment\n", "from src.text_split_chain import create_split_text_chain\n", "from src import tts\n", "from src.utils import GPTModels" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dotenv.load_dotenv()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
"source": [] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "text = samples.GATSBY_2\n", "# text = \"\"\"\\\n", "# Margaret: hello, how are you Tom?\n", "# Tom: nice, thanks. And you?\n", "# \"\"\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## split text into character phrases" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-11-03 17:49:47,976 [INFO] audio-books (lc_callbacks.py): call to gpt-4o with 2 messages:\n", "{'role': 'system', 'content': 'you are provided with the book sample.\\nplease rewrite it and insert xml tags indicating character to whom current phrase belongs.\\nfor example: I looked at herWhat are you looking at?\\n\\nNotes:\\n- sometimes narrator is one of characters taking part in the action.\\nin this case use narrator\\'s name (if available) instead of \"narrator\"\\n- if it\\'s impossible to identify character name from the text provided, use codes \"c1\", \"c2\", etc,\\nwhere \"c\" prefix means character and number is used to enumerate unknown characters\\n- all quotes of direct speech must be attributed to characters, for example:\\n“She’s a nice girl,”said Tom after a moment.\\nmind that sometimes narrator could also be a character.\\n- use ALL available context to determine the character.\\nsometimes the character name becomes clear from the following phrases\\n- DO NOT include in your response anything except for the original text with character xml tags!!!\\n'}\n", "{'role': 'human', 'content': 'Here is the book sample:\\n---\\nInside, the crimson room bloomed with light. Tom and Miss Baker sat at\\neither end of the long couch and she read aloud to him from the\\nSaturday Evening Post—the words, murmurous and uninflected, running\\ntogether in a soothing tune. 
The lamplight, bright on his boots and\\ndull on the autumn-leaf yellow of her hair, glinted along the paper as\\nshe turned a page with a flutter of slender muscles in her arms.\\n\\nWhen we came in she held us silent for a moment with a lifted hand.\\n\\n“To be continued,” she said, tossing the magazine on the table, “in\\nour very next issue.”\\n\\nHer body asserted itself with a restless movement of her knee, and she\\nstood up.\\n\\n“Ten o’clock,” she remarked, apparently finding the time on the\\nceiling. “Time for this good girl to go to bed.”\\n\\n“Jordan’s going to play in the tournament tomorrow,” explained Daisy,\\n“over at Westchester.”\\n\\n“Oh—you’re Jordan Baker.”\\n\\nI knew now why her face was familiar—its pleasing contemptuous\\nexpression had looked out at me from many rotogravure pictures of the\\nsporting life at Asheville and Hot Springs and Palm Beach. I had heard\\nsome story of her too, a critical, unpleasant story, but what it was I\\nhad forgotten long ago.\\n\\n“Good night,” she said softly. “Wake me at eight, won’t you.”\\n\\n“If you’ll get up.”\\n\\n“I will. Good night, Mr. Carraway. See you anon.”\\n\\n“Of course you will,” confirmed Daisy. “In fact I think I’ll arrange a\\nmarriage. Come over often, Nick, and I’ll sort of—oh—fling you\\ntogether. You know—lock you up accidentally in linen closets and push\\nyou out to sea in a boat, and all that sort of thing—”\\n\\n“Good night,” called Miss Baker from the stairs. “I haven’t heard a\\nword.”\\n\\n“She’s a nice girl,” said Tom after a moment. “They oughtn’t to let\\nher run around the country this way.”\\n\\n“Who oughtn’t to?” inquired Daisy coldly.\\n\\n“Her family.”\\n\\n“Her family is one aunt about a thousand years old. Besides, Nick’s\\ngoing to look after her, aren’t you, Nick? She’s going to spend lots\\nof weekends out here this summer. 
I think the home influence will be\\nvery good for her.”\\n\\nDaisy and Tom looked at each other for a moment in silence.\\n\\n“Is she from New York?” I asked quickly.\\n\\n“From Louisville. Our white girlhood was passed together there. Our\\nbeautiful white—”\\n\\n“Did you give Nick a little heart to heart talk on the veranda?”\\ndemanded Tom suddenly.\\n\\n“Did I?” She looked at me. “I can’t seem to remember, but I think we\\ntalked about the Nordic race. Yes, I’m sure we did. It sort of crept\\nup on us and first thing you know—”\\n\\n“Don’t believe everything you hear, Nick,” he advised me.\\n'}\n", "2024-11-03 17:50:03,338 [INFO] httpx (_client.py): HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:03,348 [INFO] audio-books (lc_callbacks.py): raw LLM response: \"Inside, the crimson room bloomed with light. Tom and Miss Baker sat at either end of the long couch and she read aloud to him from the Saturday Evening Post—the words, murmurous and uninflected, running together in a soothing tune. The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, glinted along the paper as she turned a page with a flutter of slender muscles in her arms.\n", "\n", "When we came in she held us silent for a moment with a lifted hand.\n", "\n", "“To be continued,” she said, tossing the magazine on the table, “in our very next issue.”\n", "\n", "Her body asserted itself with a restless movement of her knee, and she stood up.\n", "\n", "“Ten o’clock,” she remarked, apparently finding the time on the ceiling. “Time for this good girl to go to bed.”\n", "\n", "“Jordan’s going to play in the tournament tomorrow,” explained Daisy, “over at Westchester.”\n", "\n", "“Oh—you’re Jordan Baker.”\n", "\n", "I knew now why her face was familiar—its pleasing contemptuous expression had looked out at me from many rotogravure pictures of the sporting life at Asheville and Hot Springs and Palm Beach. 
I had heard some story of her too, a critical, unpleasant story, but what it was I had forgotten long ago.\n", "\n", "“Good night,” she said softly. “Wake me at eight, won’t you.”\n", "\n", "“If you’ll get up.”\n", "\n", "“I will. Good night, Mr. Carraway. See you anon.”\n", "\n", "“Of course you will,” confirmed Daisy. “In fact I think I’ll arrange a marriage. Come over often, Nick, and I’ll sort of—oh—fling you together. You know—lock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thing—”\n", "\n", "“Good night,” called Miss Baker from the stairs. “I haven’t heard a word.”\n", "\n", "“She’s a nice girl,” said Tom after a moment. “They oughtn’t to let her run around the country this way.”\n", "\n", "“Who oughtn’t to?” inquired Daisy coldly.\n", "\n", "“Her family.”\n", "\n", "“Her family is one aunt about a thousand years old. Besides, Nick’s going to look after her, aren’t you, Nick? She’s going to spend lots of weekends out here this summer. I think the home influence will be very good for her.”\n", "\n", "Daisy and Tom looked at each other for a moment in silence.\n", "\n", "“Is she from New York?” I asked quickly.\n", "\n", "“From Louisville. Our white girlhood was passed together there. Our beautiful white—”\n", "\n", "“Did you give Nick a little heart to heart talk on the veranda?” demanded Tom suddenly.\n", "\n", "“Did I?” She looked at me. “I can’t seem to remember, but I think we talked about the Nordic race. Yes, I’m sure we did. 
It sort of crept up on us and first thing you know—”\n", "\n", "“Don’t believe everything you hear, Nick,” he advised me.\"\n" ] } ], "source": [ "# Dedicated name for the text-splitting chain: the voice-mapping section below\n", "# assigns its own chain to `chain`, so sharing that name would make re-running\n", "# this cell silently swap the chain used by the later `chain.invoke(...)` cell.\n", "split_text_chain = create_split_text_chain(llm_model=GPTModels.GPT_4o)\n", "# split_text_chain = create_split_text_chain(llm_model=GPTModels.GPT_4_TURBO_2024_04_09)\n", "\n", "# `cb` collects the OpenAI usage of the call made inside the block; it is printed further down.\n", "with get_openai_callback() as cb:\n", "    res = split_text_chain.invoke(\n", "        {\"text\": text}, config={\"callbacks\": [LCMessageLoggerAsync()]}\n", "    )" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Jordan', 'Tom', 'Daisy', 'narrator']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.characters" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Inside, the crimson room bloomed with light. Tom and Miss Baker sat at either end of the long couch and she read aloud to him from the Saturday Evening Post—the words, murmurous and uninflected, running together in a soothing tune. The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, glinted along the paper as she turned a page with a flutter of slender muscles in her arms.\n", "\n", "When we came in she held us silent for a moment with a lifted hand.\n", "\n", "“To be continued,” she said, tossing the magazine on the table, “in our very next issue.”\n", "\n", "Her body asserted itself with a restless movement of her knee, and she stood up.\n", "\n", "“Ten o’clock,” she remarked, apparently finding the time on the ceiling. “Time for this good girl to go to bed.”\n", "\n", "“Jordan’s going to play in the tournament tomorrow,” explained Daisy, “over at Westchester.”\n", "\n", "“Oh—you’re Jordan Baker.”\n", "\n", "I knew now why her face was familiar—its pleasing contemptuous expression had looked out at me from many rotogravure pictures of the sporting life at Asheville and Hot Springs and Palm Beach.
I had heard some story of her too, a critical, unpleasant story, but what it was I had forgotten long ago.\n", "\n", "“Good night,” she said softly. “Wake me at eight, won’t you.”\n", "\n", "“If you’ll get up.”\n", "\n", "“I will. Good night, Mr. Carraway. See you anon.”\n", "\n", "“Of course you will,” confirmed Daisy. “In fact I think I’ll arrange a marriage. Come over often, Nick, and I’ll sort of—oh—fling you together. You know—lock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thing—”\n", "\n", "“Good night,” called Miss Baker from the stairs. “I haven’t heard a word.”\n", "\n", "“She’s a nice girl,” said Tom after a moment. “They oughtn’t to let her run around the country this way.”\n", "\n", "“Who oughtn’t to?” inquired Daisy coldly.\n", "\n", "“Her family.”\n", "\n", "“Her family is one aunt about a thousand years old. Besides, Nick’s going to look after her, aren’t you, Nick? She’s going to spend lots of weekends out here this summer. I think the home influence will be very good for her.”\n", "\n", "Daisy and Tom looked at each other for a moment in silence.\n", "\n", "“Is she from New York?” I asked quickly.\n", "\n", "“From Louisville. Our white girlhood was passed together there. Our beautiful white—”\n", "\n", "“Did you give Nick a little heart to heart talk on the veranda?” demanded Tom suddenly.\n", "\n", "“Did I?” She looked at me. “I can’t seem to remember, but I think we talked about the Nordic race. Yes, I’m sure we did. It sort of crept up on us and first thing you know—”\n", "\n", "“Don’t believe everything you hear, Nick,” he advised me.\n" ] } ], "source": [ "print(res.text_annotated)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[CharacterPhrase(character='narrator', text='Inside, the crimson room bloomed with light. 
Tom and Miss Baker sat at either end of the long couch and she read aloud to him from the Saturday Evening Post—the words, murmurous and uninflected, running together in a soothing tune. The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, glinted along the paper as she turned a page with a flutter of slender muscles in her arms.'),\n", " CharacterPhrase(character='narrator', text='When we came in she held us silent for a moment with a lifted hand.'),\n", " CharacterPhrase(character='Jordan', text='“To be continued,”'),\n", " CharacterPhrase(character='narrator', text='she said, tossing the magazine on the table,'),\n", " CharacterPhrase(character='Jordan', text='“in our very next issue.”'),\n", " CharacterPhrase(character='narrator', text='Her body asserted itself with a restless movement of her knee, and she stood up.'),\n", " CharacterPhrase(character='Jordan', text='“Ten o’clock,”'),\n", " CharacterPhrase(character='narrator', text='she remarked, apparently finding the time on the ceiling.'),\n", " CharacterPhrase(character='Jordan', text='“Time for this good girl to go to bed.”'),\n", " CharacterPhrase(character='Daisy', text='“Jordan’s going to play in the tournament tomorrow,”'),\n", " CharacterPhrase(character='narrator', text='explained Daisy,'),\n", " CharacterPhrase(character='Daisy', text='“over at Westchester.”'),\n", " CharacterPhrase(character='narrator', text='“Oh—you’re Jordan Baker.”'),\n", " CharacterPhrase(character='narrator', text='I knew now why her face was familiar—its pleasing contemptuous expression had looked out at me from many rotogravure pictures of the sporting life at Asheville and Hot Springs and Palm Beach. 
I had heard some story of her too, a critical, unpleasant story, but what it was I had forgotten long ago.'),\n", " CharacterPhrase(character='Jordan', text='“Good night,”'),\n", " CharacterPhrase(character='narrator', text='she said softly.'),\n", " CharacterPhrase(character='Jordan', text='“Wake me at eight, won’t you.”'),\n", " CharacterPhrase(character='Tom', text='“If you’ll get up.”'),\n", " CharacterPhrase(character='Jordan', text='“I will. Good night, Mr. Carraway. See you anon.”'),\n", " CharacterPhrase(character='Daisy', text='“Of course you will,”'),\n", " CharacterPhrase(character='narrator', text='confirmed Daisy.'),\n", " CharacterPhrase(character='Daisy', text='“In fact I think I’ll arrange a marriage. Come over often, Nick, and I’ll sort of—oh—fling you together. You know—lock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thing—”'),\n", " CharacterPhrase(character='Jordan', text='“Good night,”'),\n", " CharacterPhrase(character='narrator', text='called Miss Baker from the stairs.'),\n", " CharacterPhrase(character='Jordan', text='“I haven’t heard a word.”'),\n", " CharacterPhrase(character='Tom', text='“She’s a nice girl,”'),\n", " CharacterPhrase(character='narrator', text='said Tom after a moment.'),\n", " CharacterPhrase(character='Tom', text='“They oughtn’t to let her run around the country this way.”'),\n", " CharacterPhrase(character='Daisy', text='“Who oughtn’t to?”'),\n", " CharacterPhrase(character='narrator', text='inquired Daisy coldly.'),\n", " CharacterPhrase(character='Tom', text='“Her family.”'),\n", " CharacterPhrase(character='Daisy', text='“Her family is one aunt about a thousand years old. Besides, Nick’s going to look after her, aren’t you, Nick? She’s going to spend lots of weekends out here this summer. 
I think the home influence will be very good for her.”'),\n", " CharacterPhrase(character='narrator', text='Daisy and Tom looked at each other for a moment in silence.'),\n", " CharacterPhrase(character='narrator', text='“Is she from New York?”'),\n", " CharacterPhrase(character='narrator', text='I asked quickly.'),\n", " CharacterPhrase(character='Daisy', text='“From Louisville. Our white girlhood was passed together there. Our beautiful white—”'),\n", " CharacterPhrase(character='Tom', text='“Did you give Nick a little heart to heart talk on the veranda?”'),\n", " CharacterPhrase(character='narrator', text='demanded Tom suddenly.'),\n", " CharacterPhrase(character='Daisy', text='“Did I?”'),\n", " CharacterPhrase(character='narrator', text='She looked at me.'),\n", " CharacterPhrase(character='Daisy', text='“I can’t seem to remember, but I think we talked about the Nordic race. Yes, I’m sure we did. It sort of crept up on us and first thing you know—”'),\n", " CharacterPhrase(character='Tom', text='“Don’t believe everything you hear, Nick,”'),\n", " CharacterPhrase(character='narrator', text='he advised me.')]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.phrases" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Inside, the crimson room bloomed with light. Tom and Miss Baker sat at\n", "either end of the long couch and she read aloud to him from the\n", "Saturday Evening Post—the words, murmurous and uninflected, running\n", "together in a soothing tune. 
The lamplight, bright on his boots and\n", "dull on the autumn-leaf yellow of her hair, glinted along the paper as\n", "she turned a page with a flutter of slender muscles in her arms.\n", "\n", "When we came in she held us silent for a moment with a lifted hand.\n", "\n", "“To be continued,” she said, tossing the magazine on the table, “in\n", "our very next issue.”\n", "\n", "Her body asserted itself with a restless movement of her knee, and she\n", "stood up.\n", "\n", "“Ten o’clock,” she remarked, apparently finding the time on the\n", "ceiling. “Time for this good girl to go to bed.”\n", "\n", "“Jordan’s going to play in the tournament tomorrow,” explained Daisy,\n", "“over at Westchester.”\n", "\n", "“Oh—you’re Jordan Baker.”\n", "\n", "I knew now why her face was familiar—its pleasing contemptuous\n", "expression had looked out at me from many rotogravure pictures of the\n", "sporting life at Asheville and Hot Springs and Palm Beach. I had heard\n", "some story of her too, a critical, unpleasant story, but what it was I\n", "had forgotten long ago.\n", "\n", "“Good night,” she said softly. “Wake me at eight, won’t you.”\n", "\n", "“If you’ll get up.”\n", "\n", "“I will. Good night, Mr. Carraway. See you anon.”\n", "\n", "“Of course you will,” confirmed Daisy. “In fact I think I’ll arrange a\n", "marriage. Come over often, Nick, and I’ll sort of—oh—fling you\n", "together. You know—lock you up accidentally in linen closets and push\n", "you out to sea in a boat, and all that sort of thing—”\n", "\n", "“Good night,” called Miss Baker from the stairs. “I haven’t heard a\n", "word.”\n", "\n", "“She’s a nice girl,” said Tom after a moment. “They oughtn’t to let\n", "her run around the country this way.”\n", "\n", "“Who oughtn’t to?” inquired Daisy coldly.\n", "\n", "“Her family.”\n", "\n", "“Her family is one aunt about a thousand years old. Besides, Nick’s\n", "going to look after her, aren’t you, Nick? 
She’s going to spend lots\n", "of weekends out here this summer. I think the home influence will be\n", "very good for her.”\n", "\n", "Daisy and Tom looked at each other for a moment in silence.\n", "\n", "“Is she from New York?” I asked quickly.\n", "\n", "“From Louisville. Our white girlhood was passed together there. Our\n", "beautiful white—”\n", "\n", "“Did you give Nick a little heart to heart talk on the veranda?”\n", "demanded Tom suddenly.\n", "\n", "“Did I?” She looked at me. “I can’t seem to remember, but I think we\n", "talked about the Nordic race. Yes, I’m sure we did. It sort of crept\n", "up on us and first thing you know—”\n", "\n", "“Don’t believe everything you hear, Nick,” he advised me.\n", "\n" ] } ], "source": [ "print(res.text_raw)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "characters: ['Jordan', 'Tom', 'Daisy', 'narrator']\n", "--------------------\n", "[narrator] Inside, the crimson room bloomed with light. Tom and Miss Baker sat at either end of the long couch and she read aloud to him from the Saturday Evening Post—the words, murmurous and uninflected, running together in a soothing tune. 
The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, glinted along the paper as she turned a page with a flutter of slender muscles in her arms.\n", "[narrator] When we came in she held us silent for a moment with a lifted hand.\n", "[Jordan] “To be continued,”\n", "[narrator] she said, tossing the magazine on the table,\n", "[Jordan] “in our very next issue.”\n", "[narrator] Her body asserted itself with a restless movement of her knee, and she stood up.\n", "[Jordan] “Ten o’clock,”\n", "[narrator] she remarked, apparently finding the time on the ceiling.\n", "[Jordan] “Time for this good girl to go to bed.”\n", "[Daisy] “Jordan’s going to play in the tournament tomorrow,”\n", "[narrator] explained Daisy,\n", "[Daisy] “over at Westchester.”\n", "[narrator] “Oh—you’re Jordan Baker.”\n", "[narrator] I knew now why her face was familiar—its pleasing contemptuous expression had looked out at me from many rotogravure pictures of the sporting life at Asheville and Hot Springs and Palm Beach. I had heard some story of her too, a critical, unpleasant story, but what it was I had forgotten long ago.\n", "[Jordan] “Good night,”\n", "[narrator] she said softly.\n", "[Jordan] “Wake me at eight, won’t you.”\n", "[Tom] “If you’ll get up.”\n", "[Jordan] “I will. Good night, Mr. Carraway. See you anon.”\n", "[Daisy] “Of course you will,”\n", "[narrator] confirmed Daisy.\n", "[Daisy] “In fact I think I’ll arrange a marriage. Come over often, Nick, and I’ll sort of—oh—fling you together. 
You know—lock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thing—”\n", "[Jordan] “Good night,”\n", "[narrator] called Miss Baker from the stairs.\n", "[Jordan] “I haven’t heard a word.”\n", "[Tom] “She’s a nice girl,”\n", "[narrator] said Tom after a moment.\n", "[Tom] “They oughtn’t to let her run around the country this way.”\n", "[Daisy] “Who oughtn’t to?”\n", "[narrator] inquired Daisy coldly.\n", "[Tom] “Her family.”\n", "[Daisy] “Her family is one aunt about a thousand years old. Besides, Nick’s going to look after her, aren’t you, Nick? She’s going to spend lots of weekends out here this summer. I think the home influence will be very good for her.”\n", "[narrator] Daisy and Tom looked at each other for a moment in silence.\n", "[narrator] “Is she from New York?”\n", "[narrator] I asked quickly.\n", "[Daisy] “From Louisville. Our white girlhood was passed together there. Our beautiful white—”\n", "[Tom] “Did you give Nick a little heart to heart talk on the veranda?”\n", "[narrator] demanded Tom suddenly.\n", "[Daisy] “Did I?”\n", "[narrator] She looked at me.\n", "[Daisy] “I can’t seem to remember, but I think we talked about the Nordic race. Yes, I’m sure we did. 
It sort of crept up on us and first thing you know—”\n", "[Tom] “Don’t believe everything you hear, Nick,”\n", "[narrator] he advised me.\n" ] } ], "source": [ "print(res.to_pretty_text())" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LLM usage:\n", "\n", "Tokens Used: 1816\n", "\tPrompt Tokens: 877\n", "\tCompletion Tokens: 939\n", "Successful Requests: 1\n", "Total Cost (USD): $0.011582499999999999\n" ] } ], "source": [ "print(f'LLM usage:\\n\\n{cb}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## map characters to voices" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "from src.select_voice_chain import VoiceSelector" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-11-03 17:50:03,555 [INFO] audio-books (select_voice_chain.py): reading voice data from: \"data/11labs_available_tts_voices.reviewed.csv\"\n", "2024-11-03 17:50:03,560 [INFO] audio-books (select_voice_chain.py): df.shape=(34, 15)\n", "2024-11-03 17:50:03,562 [INFO] audio-books (select_voice_chain.py): filtering df by \"manual_quality_review\" column\n", "2024-11-03 17:50:03,566 [INFO] audio-books (select_voice_chain.py): df.shape after filtering voices: (25, 15)\n" ] } ], "source": [ "vs = VoiceSelector()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "chain = vs.create_voice_mapping_chain(llm_model=GPTModels.GPT_4o)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RunnableAssign(mapper={\n", " charater_props: ChatPromptTemplate(input_variables=['characters', 'text'], input_types={}, partial_variables={'available_genders': '\"male\", \"female\"', 
'available_age_groups': '\"middle_aged\", \"young\", \"old\"', 'format_instructions': 'The output should be formatted as a JSON instance that conforms to the JSON schema below.\\n\\nAs an example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]}\\nthe object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\\n\\nHere is the output schema:\\n```\\n{\"$defs\": {\"CharacterProperties\": {\"properties\": {\"gender\": {\"title\": \"Gender\", \"type\": \"string\"}, \"age_group\": {\"title\": \"Age Group\", \"type\": \"string\"}}, \"required\": [\"gender\", \"age_group\"], \"title\": \"CharacterProperties\", \"type\": \"object\"}}, \"properties\": {\"character2props\": {\"additionalProperties\": {\"$ref\": \"#/$defs/CharacterProperties\"}, \"title\": \"Character2Props\", \"type\": \"object\"}}, \"required\": [\"character2props\"]}\\n```'}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['available_age_groups', 'available_genders', 'format_instructions'], input_types={}, partial_variables={}, template='You are a helpful assistant proficient in literature and psychology.\\nOur goal is to create an audio book from the given text.\\nFor that we need to hire voice actors.\\nPlease help us to find the right actor for each character present in the text.\\n\\nYou are provided with the text split by the characters\\nto whom text parts belong to.\\n\\nYour task is to assign available properties to each character provided.\\nList of available properties:\\n- gender: {available_genders}\\n- age_group: {available_age_groups}\\n\\nNOTES:\\n- assign EXACTLY ONE property value for each property\\n- select properties values ONLY from the list of AVAILABLE property values\\n- fill properties for ALL characters from the 
list provided\\n- DO NOT include any characters absent in the list provided\\n\\n{format_instructions}\\n'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['characters', 'text'], input_types={}, partial_variables={}, template='\\n{text}\\n\\n\\n\\n{characters}\\n\\n'), additional_kwargs={})])\n", " | RunnableBinding(bound=ChatOpenAI(client=, async_client=, root_client=, root_async_client=, model_name='gpt-4o', temperature=0.0, model_kwargs={}, openai_api_key=SecretStr('**********'), request_timeout=Timeout(connect=4, read=60, write=60, pool=60)), kwargs={'response_format': {'type': 'json_object'}}, config={}, config_factories=[])\n", " | PydanticOutputParser(pydantic_object=)\n", " | RunnableLambda(remove_hallucinations)\n", "})\n", "| RunnableAssign(mapper={\n", " character2voice: RunnableLambda(get_voices)\n", " })\n", "| RunnableLambda(pack_results)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chain" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-11-03 17:50:03,650 [INFO] audio-books (lc_callbacks.py): call to gpt-4o with 2 messages:\n", "{'role': 'system', 'content': 'You are a helpful assistant proficient in literature and psychology.\\nOur goal is to create an audio book from the given text.\\nFor that we need to hire voice actors.\\nPlease help us to find the right actor for each character present in the text.\\n\\nYou are provided with the text split by the characters\\nto whom text parts belong to.\\n\\nYour task is to assign available properties to each character provided.\\nList of available properties:\\n- gender: \"male\", \"female\"\\n- age_group: \"middle_aged\", \"young\", \"old\"\\n\\nNOTES:\\n- assign EXACTLY ONE property value for each property\\n- select properties values ONLY from the list of AVAILABLE property values\\n- fill properties for ALL characters from 
the list provided\\n- DO NOT include any characters absent in the list provided\\n\\nThe output should be formatted as a JSON instance that conforms to the JSON schema below.\\n\\nAs an example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]}\\nthe object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\\n\\nHere is the output schema:\\n```\\n{\"$defs\": {\"CharacterProperties\": {\"properties\": {\"gender\": {\"title\": \"Gender\", \"type\": \"string\"}, \"age_group\": {\"title\": \"Age Group\", \"type\": \"string\"}}, \"required\": [\"gender\", \"age_group\"], \"title\": \"CharacterProperties\", \"type\": \"object\"}}, \"properties\": {\"character2props\": {\"additionalProperties\": {\"$ref\": \"#/$defs/CharacterProperties\"}, \"title\": \"Character2Props\", \"type\": \"object\"}}, \"required\": [\"character2props\"]}\\n```\\n'}\n", "{'role': 'human', 'content': \"\\nInside, the crimson room bloomed with light. Tom and Miss Baker sat at either end of the long couch and she read aloud to him from the Saturday Evening Post—the words, murmurous and uninflected, running together in a soothing tune. The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, glinted along the paper as she turned a page with a flutter of slender muscles in her arms.\\n\\nWhen we came in she held us silent for a moment with a lifted hand.\\n\\n“To be continued,” she said, tossing the magazine on the table, “in our very next issue.”\\n\\nHer body asserted itself with a restless movement of her knee, and she stood up.\\n\\n“Ten o’clock,” she remarked, apparently finding the time on the ceiling. 
“Time for this good girl to go to bed.”\\n\\n“Jordan’s going to play in the tournament tomorrow,” explained Daisy, “over at Westchester.”\\n\\n“Oh—you’re Jordan Baker.”\\n\\nI knew now why her face was familiar—its pleasing contemptuous expression had looked out at me from many rotogravure pictures of the sporting life at Asheville and Hot Springs and Palm Beach. I had heard some story of her too, a critical, unpleasant story, but what it was I had forgotten long ago.\\n\\n“Good night,” she said softly. “Wake me at eight, won’t you.”\\n\\n“If you’ll get up.”\\n\\n“I will. Good night, Mr. Carraway. See you anon.”\\n\\n“Of course you will,” confirmed Daisy. “In fact I think I’ll arrange a marriage. Come over often, Nick, and I’ll sort of—oh—fling you together. You know—lock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thing—”\\n\\n“Good night,” called Miss Baker from the stairs. “I haven’t heard a word.”\\n\\n“She’s a nice girl,” said Tom after a moment. “They oughtn’t to let her run around the country this way.”\\n\\n“Who oughtn’t to?” inquired Daisy coldly.\\n\\n“Her family.”\\n\\n“Her family is one aunt about a thousand years old. Besides, Nick’s going to look after her, aren’t you, Nick? She’s going to spend lots of weekends out here this summer. I think the home influence will be very good for her.”\\n\\nDaisy and Tom looked at each other for a moment in silence.\\n\\n“Is she from New York?” I asked quickly.\\n\\n“From Louisville. Our white girlhood was passed together there. Our beautiful white—”\\n\\n“Did you give Nick a little heart to heart talk on the veranda?” demanded Tom suddenly.\\n\\n“Did I?” She looked at me. “I can’t seem to remember, but I think we talked about the Nordic race. Yes, I’m sure we did. 
It sort of crept up on us and first thing you know—”\\n\\n“Don’t believe everything you hear, Nick,” he advised me.\\n\\n\\n\\n['Jordan', 'Tom', 'Daisy', 'narrator']\\n\\n\"}\n", "2024-11-03 17:50:05,411 [INFO] httpx (_client.py): HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:05,430 [INFO] audio-books (lc_callbacks.py): raw LLM response: \"{\n", " \"character2props\": {\n", " \"Jordan\": {\n", " \"gender\": \"female\",\n", " \"age_group\": \"young\"\n", " },\n", " \"Tom\": {\n", " \"gender\": \"male\",\n", " \"age_group\": \"middle_aged\"\n", " },\n", " \"Daisy\": {\n", " \"gender\": \"female\",\n", " \"age_group\": \"young\"\n", " },\n", " \"narrator\": {\n", " \"gender\": \"male\",\n", " \"age_group\": \"young\"\n", " }\n", " }\n", "}\"\n" ] } ], "source": [ "res2 = chain.invoke(\n", " {\"text\": res.text_annotated, \"characters\": res.characters},\n", " config={\"callbacks\": [LCMessageLoggerAsync()]},\n", ")" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "SelectVoiceChainOutput(character2props={'Jordan': CharacterPropertiesNullable(gender='female', age_group='young'), 'Tom': CharacterPropertiesNullable(gender='male', age_group='middle_aged'), 'Daisy': CharacterPropertiesNullable(gender='female', age_group='young'), 'narrator': CharacterPropertiesNullable(gender='male', age_group='young')}, character2voice={'Jordan': '1btZhL2wthuOhUqvI0bB', 'Daisy': 'K5DRk4s8l1HFKsggS25u', 'Tom': 'RPEIZnKMqlQiZyZd1Dae', 'narrator': 'xSI29a9HAKdsWv3idXSN'})" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res2" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'Jordan': '1btZhL2wthuOhUqvI0bB',\n", " 'Daisy': 'K5DRk4s8l1HFKsggS25u',\n", " 'Tom': 'RPEIZnKMqlQiZyZd1Dae',\n", " 'narrator': 'xSI29a9HAKdsWv3idXSN'}" ] }, "execution_count": 19, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "character2voice = res2.character2voice\n", "character2voice" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## generate audio" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[CharacterPhrase(character='narrator', text='Inside, the crimson room bloomed with light. Tom and Miss Baker sat at either end of the long couch and she read aloud to him from the Saturday Evening Post—the words, murmurous and uninflected, running together in a soothing tune. The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, glinted along the paper as she turned a page with a flutter of slender muscles in her arms.'),\n", " CharacterPhrase(character='narrator', text='When we came in she held us silent for a moment with a lifted hand.'),\n", " CharacterPhrase(character='Jordan', text='“To be continued,”'),\n", " CharacterPhrase(character='narrator', text='she said, tossing the magazine on the table,'),\n", " CharacterPhrase(character='Jordan', text='“in our very next issue.”'),\n", " CharacterPhrase(character='narrator', text='Her body asserted itself with a restless movement of her knee, and she stood up.'),\n", " CharacterPhrase(character='Jordan', text='“Ten o’clock,”'),\n", " CharacterPhrase(character='narrator', text='she remarked, apparently finding the time on the ceiling.'),\n", " CharacterPhrase(character='Jordan', text='“Time for this good girl to go to bed.”'),\n", " CharacterPhrase(character='Daisy', text='“Jordan’s going to play in the tournament tomorrow,”'),\n", " CharacterPhrase(character='narrator', text='explained Daisy,'),\n", " CharacterPhrase(character='Daisy', text='“over at Westchester.”'),\n", " CharacterPhrase(character='narrator', text='“Oh—you’re Jordan Baker.”'),\n", " CharacterPhrase(character='narrator', text='I knew 
now why her face was familiar—its pleasing contemptuous expression had looked out at me from many rotogravure pictures of the sporting life at Asheville and Hot Springs and Palm Beach. I had heard some story of her too, a critical, unpleasant story, but what it was I had forgotten long ago.'),\n", " CharacterPhrase(character='Jordan', text='“Good night,”'),\n", " CharacterPhrase(character='narrator', text='she said softly.'),\n", " CharacterPhrase(character='Jordan', text='“Wake me at eight, won’t you.”'),\n", " CharacterPhrase(character='Tom', text='“If you’ll get up.”'),\n", " CharacterPhrase(character='Jordan', text='“I will. Good night, Mr. Carraway. See you anon.”'),\n", " CharacterPhrase(character='Daisy', text='“Of course you will,”'),\n", " CharacterPhrase(character='narrator', text='confirmed Daisy.'),\n", " CharacterPhrase(character='Daisy', text='“In fact I think I’ll arrange a marriage. Come over often, Nick, and I’ll sort of—oh—fling you together. You know—lock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thing—”'),\n", " CharacterPhrase(character='Jordan', text='“Good night,”'),\n", " CharacterPhrase(character='narrator', text='called Miss Baker from the stairs.'),\n", " CharacterPhrase(character='Jordan', text='“I haven’t heard a word.”'),\n", " CharacterPhrase(character='Tom', text='“She’s a nice girl,”'),\n", " CharacterPhrase(character='narrator', text='said Tom after a moment.'),\n", " CharacterPhrase(character='Tom', text='“They oughtn’t to let her run around the country this way.”'),\n", " CharacterPhrase(character='Daisy', text='“Who oughtn’t to?”'),\n", " CharacterPhrase(character='narrator', text='inquired Daisy coldly.'),\n", " CharacterPhrase(character='Tom', text='“Her family.”'),\n", " CharacterPhrase(character='Daisy', text='“Her family is one aunt about a thousand years old. Besides, Nick’s going to look after her, aren’t you, Nick? 
She’s going to spend lots of weekends out here this summer. I think the home influence will be very good for her.”'),\n", " CharacterPhrase(character='narrator', text='Daisy and Tom looked at each other for a moment in silence.'),\n", " CharacterPhrase(character='narrator', text='“Is she from New York?”'),\n", " CharacterPhrase(character='narrator', text='I asked quickly.'),\n", " CharacterPhrase(character='Daisy', text='“From Louisville. Our white girlhood was passed together there. Our beautiful white—”'),\n", " CharacterPhrase(character='Tom', text='“Did you give Nick a little heart to heart talk on the veranda?”'),\n", " CharacterPhrase(character='narrator', text='demanded Tom suddenly.'),\n", " CharacterPhrase(character='Daisy', text='“Did I?”'),\n", " CharacterPhrase(character='narrator', text='She looked at me.'),\n", " CharacterPhrase(character='Daisy', text='“I can’t seem to remember, but I think we talked about the Nordic race. Yes, I’m sure we did. It sort of crept up on us and first thing you know—”'),\n", " CharacterPhrase(character='Tom', text='“Don’t believe everything you hear, Nick,”'),\n", " CharacterPhrase(character='narrator', text='he advised me.')]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.phrases" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-11-03 17:50:05,529 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"Inside, the crimson room bloomed with light. Tom and Miss Baker sat at either end of the long couch and she read aloud to him from the Saturday Evening Post—the words, murmurous and uninflected, running together in a soothing tune. 
The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, glinted along the paper as she turned a page with a flutter of slender muscles in her arms.\"\n", "2024-11-03 17:50:13,974 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:14,120 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"When we came in she held us silent for a moment with a lifted hand.\"\n", "2024-11-03 17:50:15,040 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:15,053 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '1btZhL2wthuOhUqvI0bB', 'output_format': } for the following text: \"“To be continued,”\"\n", "2024-11-03 17:50:15,994 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/1btZhL2wthuOhUqvI0bB/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:16,002 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"she said, tossing the magazine on the table,\"\n", "2024-11-03 17:50:16,801 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:16,824 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '1btZhL2wthuOhUqvI0bB', 'output_format': } for the following text: \"“in our very next issue.”\"\n", "2024-11-03 17:50:17,479 [INFO] httpx (_client.py): HTTP Request: POST 
https://api.elevenlabs.io/v1/text-to-speech/1btZhL2wthuOhUqvI0bB/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:17,584 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"Her body asserted itself with a restless movement of her knee, and she stood up.\"\n", "2024-11-03 17:50:18,644 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:18,839 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '1btZhL2wthuOhUqvI0bB', 'output_format': } for the following text: \"“Ten o’clock,”\"\n", "2024-11-03 17:50:19,427 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/1btZhL2wthuOhUqvI0bB/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:19,440 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"she remarked, apparently finding the time on the ceiling.\"\n", "2024-11-03 17:50:20,518 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:20,551 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '1btZhL2wthuOhUqvI0bB', 'output_format': } for the following text: \"“Time for this good girl to go to bed.”\"\n", "2024-11-03 17:50:21,366 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/1btZhL2wthuOhUqvI0bB/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:21,374 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 
'K5DRk4s8l1HFKsggS25u', 'output_format': } for the following text: \"“Jordan’s going to play in the tournament tomorrow,”\"\n", "2024-11-03 17:50:22,516 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/K5DRk4s8l1HFKsggS25u/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:22,527 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"explained Daisy,\"\n", "2024-11-03 17:50:23,121 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:23,166 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'K5DRk4s8l1HFKsggS25u', 'output_format': } for the following text: \"“over at Westchester.”\"\n", "2024-11-03 17:50:23,805 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/K5DRk4s8l1HFKsggS25u/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:23,833 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"“Oh—you’re Jordan Baker.”\"\n", "2024-11-03 17:50:24,493 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:24,502 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"I knew now why her face was familiar—its pleasing contemptuous expression had looked out at me from many rotogravure pictures of the sporting life at Asheville and Hot Springs and Palm Beach. 
I had heard some story of her too, a critical, unpleasant story, but what it was I had forgotten long ago.\"\n", "2024-11-03 17:50:28,203 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:28,458 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '1btZhL2wthuOhUqvI0bB', 'output_format': } for the following text: \"“Good night,”\"\n", "2024-11-03 17:50:29,088 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/1btZhL2wthuOhUqvI0bB/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:29,130 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"she said softly.\"\n", "2024-11-03 17:50:29,708 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:29,715 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '1btZhL2wthuOhUqvI0bB', 'output_format': } for the following text: \"“Wake me at eight, won’t you.”\"\n", "2024-11-03 17:50:30,444 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/1btZhL2wthuOhUqvI0bB/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:30,453 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'RPEIZnKMqlQiZyZd1Dae', 'output_format': } for the following text: \"“If you’ll get up.”\"\n", "2024-11-03 17:50:31,164 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/RPEIZnKMqlQiZyZd1Dae/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:31,170 [INFO] audio-books 
(tts.py): request to 11labs TTS endpoint with params {'voice_id': '1btZhL2wthuOhUqvI0bB', 'output_format': } for the following text: \"“I will. Good night, Mr. Carraway. See you anon.”\"\n", "2024-11-03 17:50:32,090 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/1btZhL2wthuOhUqvI0bB/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:32,102 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'K5DRk4s8l1HFKsggS25u', 'output_format': } for the following text: \"“Of course you will,”\"\n", "2024-11-03 17:50:32,731 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/K5DRk4s8l1HFKsggS25u/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:32,740 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"confirmed Daisy.\"\n", "2024-11-03 17:50:33,380 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:33,387 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'K5DRk4s8l1HFKsggS25u', 'output_format': } for the following text: \"“In fact I think I’ll arrange a marriage. Come over often, Nick, and I’ll sort of—oh—fling you together. 
You know—lock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thing—”\"\n", "2024-11-03 17:50:35,916 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/K5DRk4s8l1HFKsggS25u/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:36,114 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '1btZhL2wthuOhUqvI0bB', 'output_format': } for the following text: \"“Good night,”\"\n", "2024-11-03 17:50:36,868 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/1btZhL2wthuOhUqvI0bB/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:36,882 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"called Miss Baker from the stairs.\"\n", "2024-11-03 17:50:37,621 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:37,631 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '1btZhL2wthuOhUqvI0bB', 'output_format': } for the following text: \"“I haven’t heard a word.”\"\n", "2024-11-03 17:50:38,293 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/1btZhL2wthuOhUqvI0bB/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:38,301 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'RPEIZnKMqlQiZyZd1Dae', 'output_format': } for the following text: \"“She’s a nice girl,”\"\n", "2024-11-03 17:50:38,930 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/RPEIZnKMqlQiZyZd1Dae/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 
17:50:38,936 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"said Tom after a moment.\"\n", "2024-11-03 17:50:39,539 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:39,545 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'RPEIZnKMqlQiZyZd1Dae', 'output_format': } for the following text: \"“They oughtn’t to let her run around the country this way.”\"\n", "2024-11-03 17:50:40,436 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/RPEIZnKMqlQiZyZd1Dae/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:40,441 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'K5DRk4s8l1HFKsggS25u', 'output_format': } for the following text: \"“Who oughtn’t to?”\"\n", "2024-11-03 17:50:41,045 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/K5DRk4s8l1HFKsggS25u/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:41,051 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"inquired Daisy coldly.\"\n", "2024-11-03 17:50:41,710 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:41,717 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'RPEIZnKMqlQiZyZd1Dae', 'output_format': } for the following text: \"“Her family.”\"\n", "2024-11-03 17:50:42,341 [INFO] httpx (_client.py): HTTP Request: POST 
https://api.elevenlabs.io/v1/text-to-speech/RPEIZnKMqlQiZyZd1Dae/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:42,348 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'K5DRk4s8l1HFKsggS25u', 'output_format': } for the following text: \"“Her family is one aunt about a thousand years old. Besides, Nick’s going to look after her, aren’t you, Nick? She’s going to spend lots of weekends out here this summer. I think the home influence will be very good for her.”\"\n", "2024-11-03 17:50:44,869 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/K5DRk4s8l1HFKsggS25u/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:44,891 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"Daisy and Tom looked at each other for a moment in silence.\"\n", "2024-11-03 17:50:45,772 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:45,779 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"“Is she from New York?”\"\n", "2024-11-03 17:50:46,417 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:46,422 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"I asked quickly.\"\n", "2024-11-03 17:50:46,999 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 
\"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:47,005 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'K5DRk4s8l1HFKsggS25u', 'output_format': } for the following text: \"“From Louisville. Our white girlhood was passed together there. Our beautiful white—”\"\n", "2024-11-03 17:50:48,191 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/K5DRk4s8l1HFKsggS25u/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:48,227 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'RPEIZnKMqlQiZyZd1Dae', 'output_format': } for the following text: \"“Did you give Nick a little heart to heart talk on the veranda?”\"\n", "2024-11-03 17:50:49,169 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/RPEIZnKMqlQiZyZd1Dae/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:49,228 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"demanded Tom suddenly.\"\n", "2024-11-03 17:50:49,954 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:49,961 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'K5DRk4s8l1HFKsggS25u', 'output_format': } for the following text: \"“Did I?”\"\n", "2024-11-03 17:50:50,522 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/K5DRk4s8l1HFKsggS25u/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:50,526 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"She looked at me.\"\n", "2024-11-03 17:50:51,110 [INFO] 
httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:51,116 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'K5DRk4s8l1HFKsggS25u', 'output_format': } for the following text: \"“I can’t seem to remember, but I think we talked about the Nordic race. Yes, I’m sure we did. It sort of crept up on us and first thing you know—”\"\n", "2024-11-03 17:50:53,124 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/K5DRk4s8l1HFKsggS25u/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:53,144 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'RPEIZnKMqlQiZyZd1Dae', 'output_format': } for the following text: \"“Don’t believe everything you hear, Nick,”\"\n", "2024-11-03 17:50:53,940 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/RPEIZnKMqlQiZyZd1Dae/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:50:53,956 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': 'xSI29a9HAKdsWv3idXSN', 'output_format': } for the following text: \"he advised me.\"\n", "2024-11-03 17:50:54,561 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/xSI29a9HAKdsWv3idXSN/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n" ] } ], "source": [ "tts_responses = []\n", "\n", "for phrase in res.phrases:\n", " voice_id = character2voice[phrase.character]\n", " tts_params = TTSParams(voice_id=voice_id, text=phrase.text)\n", " response = await tts.tts_w_timestamps(tts_params)\n", " tts_responses.append(response)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
charstartend
0I0.0000.058
1n0.0580.174
2s0.1740.255
3i0.2550.406
4d0.4060.453
............
2532d147.860147.884
2533147.884147.918
2534m147.918147.953
2535e147.953148.035
2536.148.035148.278
\n", "

2537 rows × 3 columns

\n", "
" ], "text/plain": [ " char start end\n", "0 I 0.000 0.058\n", "1 n 0.058 0.174\n", "2 s 0.174 0.255\n", "3 i 0.255 0.406\n", "4 d 0.406 0.453\n", "... ... ... ...\n", "2532 d 147.860 147.884\n", "2533 147.884 147.918\n", "2534 m 147.918 147.953\n", "2535 e 147.953 148.035\n", "2536 . 148.035 148.278\n", "\n", "[2537 rows x 3 columns]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# merge the per-phrase alignments into one character-level timeline\n", "alignments = [response.alignment for response in tts_responses]\n", "char2time = TTSTimestampsAlignment.combine_alignments(alignments=alignments)\n", "char2time.to_dataframe()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "len of original text: 2560\n", "len of joined character phrases: 2537\n", "len of char2time mapping: 2537\n" ] } ], "source": [ "print(f'len of original text: {len(text)}')\n", "# add 1 extra char imitating space between character phrases\n", "joined_phrases = ' '.join([x.text for x in res.phrases])\n", "print(f'len of joined character phrases: {len(joined_phrases)}')\n", "print(f'len of char2time mapping: {len(char2time.characters)}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## design effects" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "from src.sound_effects_design import create_sound_effects_design_chain, SoundEffectDescription" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-11-03 17:50:54,763 [INFO] audio-books (lc_callbacks.py): call to gpt-4o with 2 messages:\n", "{'role': 'system', 'content': 'You are an expert in directing audiobooks creation.\\nYour task is to design sound 
effects layed over the audio book.\\nYou are provided with the audiobook text chunk -\\ninsert XML tags describing sound effects, their place and duration.\\n\\nXML effect tags must have following structure:\\noriginal line from the text\\n\\nAdditional requirements:\\n- In the very beginning, analyze the whole text chunk provided in order to understand events and atmosphere.\\n- Prompts you place inside XML tags are going to be passes to text-to-sound-effect AI model.\\nThus, it\\'s required to write prompts rich in details.\\n- Do not generate long-running background or ambient music, crowd talking\\n- Aim for episodical sound effects, highlighting atmosphere and characters\\' actions.\\nFor example, cracking of stairs, wind blowing, car honks, air breeze.\\n- The reason is that text-to-sound-effects model is able to generate only short audio files, \\nup to 5 seconds long\\n- Sound effects must evoke immersive experience in listener.\\n- Generated sound effects will start playing with the first letter inside XML tag\\nand will end with the last letter inside XML tag\\n- You MUST use XML tags positions to control start and end of sound effects.\\n\\nResponse with the original text with selected phrases wrapped inside emotion XML tags.\\nDo not modify original text!\\nDo not include anythin else in your answer.\\n'}\n", "{'role': 'human', 'content': 'Inside, the crimson room bloomed with light. Tom and Miss Baker sat at\\neither end of the long couch and she read aloud to him from the\\nSaturday Evening Post—the words, murmurous and uninflected, running\\ntogether in a soothing tune. 
The lamplight, bright on his boots and\\ndull on the autumn-leaf yellow of her hair, glinted along the paper as\\nshe turned a page with a flutter of slender muscles in her arms.\\n\\nWhen we came in she held us silent for a moment with a lifted hand.\\n\\n“To be continued,” she said, tossing the magazine on the table, “in\\nour very next issue.”\\n\\nHer body asserted itself with a restless movement of her knee, and she\\nstood up.\\n\\n“Ten o’clock,” she remarked, apparently finding the time on the\\nceiling. “Time for this good girl to go to bed.”\\n\\n“Jordan’s going to play in the tournament tomorrow,” explained Daisy,\\n“over at Westchester.”\\n\\n“Oh—you’re Jordan Baker.”\\n\\nI knew now why her face was familiar—its pleasing contemptuous\\nexpression had looked out at me from many rotogravure pictures of the\\nsporting life at Asheville and Hot Springs and Palm Beach. I had heard\\nsome story of her too, a critical, unpleasant story, but what it was I\\nhad forgotten long ago.\\n\\n“Good night,” she said softly. “Wake me at eight, won’t you.”\\n\\n“If you’ll get up.”\\n\\n“I will. Good night, Mr. Carraway. See you anon.”\\n\\n“Of course you will,” confirmed Daisy. “In fact I think I’ll arrange a\\nmarriage. Come over often, Nick, and I’ll sort of—oh—fling you\\ntogether. You know—lock you up accidentally in linen closets and push\\nyou out to sea in a boat, and all that sort of thing—”\\n\\n“Good night,” called Miss Baker from the stairs. “I haven’t heard a\\nword.”\\n\\n“She’s a nice girl,” said Tom after a moment. “They oughtn’t to let\\nher run around the country this way.”\\n\\n“Who oughtn’t to?” inquired Daisy coldly.\\n\\n“Her family.”\\n\\n“Her family is one aunt about a thousand years old. Besides, Nick’s\\ngoing to look after her, aren’t you, Nick? She’s going to spend lots\\nof weekends out here this summer. 
I think the home influence will be\\nvery good for her.”\\n\\nDaisy and Tom looked at each other for a moment in silence.\\n\\n“Is she from New York?” I asked quickly.\\n\\n“From Louisville. Our white girlhood was passed together there. Our\\nbeautiful white—”\\n\\n“Did you give Nick a little heart to heart talk on the veranda?”\\ndemanded Tom suddenly.\\n\\n“Did I?” She looked at me. “I can’t seem to remember, but I think we\\ntalked about the Nordic race. Yes, I’m sure we did. It sort of crept\\nup on us and first thing you know—”\\n\\n“Don’t believe everything you hear, Nick,” he advised me.\\n\\n'}\n", "2024-11-03 17:51:08,770 [INFO] httpx (_client.py): HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", "2024-11-03 17:51:08,774 [INFO] audio-books (lc_callbacks.py): raw LLM response: \"Inside, the crimson room bloomed with light. Tom and Miss Baker sat at either end of the long couch and she read aloud to him from the Saturday Evening Post—the words, murmurous and uninflected, running together in a soothing tune. The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, glinted along the paper as she turned a page with a flutter of slender muscles in her arms.\n", "\n", "When we came in she held us silent for a moment with a lifted hand.\n", "\n", "“To be continued,” she said, tossing the magazine on the table, “in our very next issue.”\n", "\n", "Her body asserted itself with a restless movement of her knee, and she stood up.\n", "\n", "“Ten o’clock,” she remarked, apparently finding the time on the ceiling. “Time for this good girl to go to bed.”\n", "\n", "“Jordan’s going to play in the tournament tomorrow,” explained Daisy, “over at Westchester.”\n", "\n", "“Oh—you’re Jordan Baker.”\n", "\n", "I knew now why her face was familiar—its pleasing contemptuous expression had looked out at me from many rotogravure pictures of the sporting life at Asheville and Hot Springs and Palm Beach. 
I had heard some story of her too, a critical, unpleasant story, but what it was I had forgotten long ago.\n", "\n", "“Good night,” she said softly. “Wake me at eight, won’t you.”\n", "\n", "“If you’ll get up.”\n", "\n", "“I will. Good night, Mr. Carraway. See you anon.”\n", "\n", "“Of course you will,” confirmed Daisy. “In fact I think I’ll arrange a marriage. Come over often, Nick, and I’ll sort of—oh—fling you together. You know—lock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thing—”\n", "\n", "“Good night,” called Miss Baker from the stairs. “I haven’t heard a word.”\n", "\n", "“She’s a nice girl,” said Tom after a moment. “They oughtn’t to let her run around the country this way.”\n", "\n", "“Who oughtn’t to?” inquired Daisy coldly.\n", "\n", "“Her family.”\n", "\n", "“Her family is one aunt about a thousand years old. Besides, Nick’s going to look after her, aren’t you, Nick? She’s going to spend lots of weekends out here this summer. I think the home influence will be very good for her.”\n", "\n", "Daisy and Tom looked at each other for a moment in silence.\n", "\n", "“Is she from New York?” I asked quickly.\n", "\n", "“From Louisville. Our white girlhood was passed together there. Our beautiful white—”\n", "\n", "“Did you give Nick a little heart to heart talk on the veranda?” demanded Tom suddenly.\n", "\n", "“Did I?” She looked at me. “I can’t seem to remember, but I think we talked about the Nordic race. Yes, I’m sure we did. 
It sort of crept up on us and first thing you know—”\n", "\n", "“Don’t believe everything you hear, Nick,” he advised me.\"\n" ] } ], "source": [ "chain = create_sound_effects_design_chain(llm_model=GPTModels.GPT_4o)\n", "\n", "with get_openai_callback() as cb:\n", " res = chain.invoke(\n", " {\"text\": text}, config={\"callbacks\": [LCMessageLoggerAsync()]}\n", " )" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# print(res.text_raw)\n", "# print(res.text_annotated)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[SoundEffectDescription(prompt='room filled with warm, inviting light', text_between_tags='crimson room bloomed with light', ix_start_llm_response=12, ix_end_llm_response=107, ix_start_orig_text=12, ix_end_orig_text=43, duration=-1),\n", " SoundEffectDescription(prompt='soft rustling of paper as a page is turned', text_between_tags='glinted along the paper as she turned a page with a flutter of slender muscles in her arms', ix_start_llm_response=379, ix_end_llm_response=538, ix_start_orig_text=315, ix_end_orig_text=405, duration=-1),\n", " SoundEffectDescription(prompt='gentle, commanding gesture', text_between_tags='lifted hand', ix_start_llm_response=596, ix_end_llm_response=660, ix_start_orig_text=463, ix_end_orig_text=474, duration=-1),\n", " SoundEffectDescription(prompt='magazine landing softly on a table', text_between_tags='tossing the magazine on the table', ix_start_llm_response=692, ix_end_llm_response=786, ix_start_orig_text=506, ix_end_orig_text=539, duration=-1),\n", " SoundEffectDescription(prompt='restless movement, fabric rustling', text_between_tags='restless movement of her knee', ix_start_llm_response=847, ix_end_llm_response=937, ix_start_orig_text=600, ix_end_orig_text=629, duration=-1),\n", " SoundEffectDescription(prompt='glancing upwards, searching for the time', text_between_tags='finding the time on the ceiling', 
ix_start_llm_response=998, ix_end_llm_response=1096, ix_start_orig_text=690, ix_end_orig_text=721, duration=-1),\n", " SoundEffectDescription(prompt='playful, mischievous tone', text_between_tags='lock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thing', ix_start_llm_response=1848, ix_end_llm_response=2003, ix_start_orig_text=1473, ix_end_orig_text=1576, duration=-1),\n", " SoundEffectDescription(prompt='footsteps ascending wooden stairs', text_between_tags='stairs', ix_start_llm_response=2048, ix_end_llm_response=2114, ix_start_orig_text=1621, ix_end_orig_text=1627, duration=-1),\n", " SoundEffectDescription(prompt='moment of silence, tension in the air', text_between_tags='Daisy and Tom looked at each other for a moment in silence', ix_start_llm_response=2535, ix_end_llm_response=2657, ix_start_orig_text=2048, ix_end_orig_text=2106, duration=-1)]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.sound_effects_descriptions" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sed.duration=-1\n" ] } ], "source": [ "sed = res.sound_effects_descriptions[0]\n", "print(f'{sed.duration=}')" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "sound_effects_descriptions = res.sound_effects_descriptions" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "# TODO: move to function\n", "# char2time is indexed by characters of the narrated text, so effect spans are looked up via the\n", "# original-text indices; the LLM-response indices also count the inserted XML tags and overshoot.\n", "# NOTE: the original text and the joined phrases differ slightly in length, so timings are approximate\n", "last_char_ix = len(char2time.characters) - 1\n", "for sed in sound_effects_descriptions:\n", "    # ix_end_orig_text is an exclusive slice end, the time lookup expects the index of the last char\n", "    ix_start = min(sed.ix_start_orig_text, last_char_ix)\n", "    ix_end = min(sed.ix_end_orig_text - 1, last_char_ix)\n", "    time_start = char2time.get_start_time_by_char_ix(ix_start)\n", "    time_end = char2time.get_end_time_by_char_ix(ix_end)\n", "    sed.duration = time_end - time_start" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "from src.config 
import logger" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-11-03 17:57:44,223 [INFO] audio-books (789017501.py): 8 out of 9 original sound effects are kept after filtering by min duration: 1\n" ] } ], "source": [ "# keep only effects whose narrated span is long enough to be worth generating\n", "min_sound_effect_durations = 1  # seconds\n", "sound_effects_descriptions_filtered = [\n", "    sed for sed in sound_effects_descriptions if sed.duration > min_sound_effect_durations\n", "]\n", "len_orig = len(sound_effects_descriptions)\n", "len_new = len(sound_effects_descriptions_filtered)\n", "logger.info(\n", "    f'{len_new} out of {len_orig} original sound effects are kept after filtering by min duration: '\n", "    f'{min_sound_effect_durations}'\n", ")" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sed.prompt='room filled with warm, inviting light'\n", "text_bw_tags='crimson room bloomed with light'\n", "orig_text_from_calc_indices='crimson room bloomed with light'\n", "orig_text_from_calc_indices == text_bw_tags=True\n", "\n", "sed.prompt='soft rustling of paper as a page is turned'\n", "text_bw_tags='glinted along the paper as she turned a page with a flutter of slender muscles in her arms'\n", "orig_text_from_calc_indices='glinted along the paper as\\nshe turned a page with a flutter of slender muscles in her arms'\n", "orig_text_from_calc_indices == text_bw_tags=False\n", "\n", "sed.prompt='gentle, commanding gesture'\n", "text_bw_tags='lifted hand'\n", "orig_text_from_calc_indices='lifted hand'\n", "orig_text_from_calc_indices == text_bw_tags=True\n", "\n", "sed.prompt='magazine landing softly on a table'\n", "text_bw_tags='tossing the magazine on the table'\n", "orig_text_from_calc_indices='tossing the magazine on the table'\n", "orig_text_from_calc_indices == text_bw_tags=True\n", "\n", "sed.prompt='restless movement, fabric rustling'\n", 
"text_bw_tags='restless movement of her knee'\n", "orig_text_from_calc_indices='restless movement of her knee'\n", "orig_text_from_calc_indices == text_bw_tags=True\n", "\n", "sed.prompt='glancing upwards, searching for the time'\n", "text_bw_tags='finding the time on the ceiling'\n", "orig_text_from_calc_indices='finding the time on the\\nceiling'\n", "orig_text_from_calc_indices == text_bw_tags=False\n", "\n", "sed.prompt='playful, mischievous tone'\n", "text_bw_tags='lock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thing'\n", "orig_text_from_calc_indices='lock you up accidentally in linen closets and push\\nyou out to sea in a boat, and all that sort of thing'\n", "orig_text_from_calc_indices == text_bw_tags=False\n", "\n", "sed.prompt='footsteps ascending wooden stairs'\n", "text_bw_tags='stairs'\n", "orig_text_from_calc_indices='stairs'\n", "orig_text_from_calc_indices == text_bw_tags=True\n", "\n" ] } ], "source": [ "for sed in sound_effects_descriptions_filtered:\n", " text_bw_tags = sed.text_between_tags\n", " orig_text_from_calc_indices = text[sed.ix_start_orig_text : sed.ix_end_orig_text]\n", " print(f'{sed.prompt=}')\n", " print(f'{text_bw_tags=}')\n", " print(f'{orig_text_from_calc_indices=}')\n", " print(f'{orig_text_from_calc_indices == text_bw_tags=}')\n", " print(f'')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## generate effects" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "from src.schemas import SoundEffectsParams" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "SoundEffectDescription(prompt='room filled with warm, inviting light', text_between_tags='crimson room bloomed with light', ix_start_llm_response=12, ix_end_llm_response=107, ix_start_orig_text=12, ix_end_orig_text=43, 
duration=5.271)" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# first effect that survived the duration filter, used for a single test generation\n", "sed = sound_effects_descriptions_filtered[0]\n", "sed" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "params = SoundEffectsParams(\n", "    text=sed.prompt,\n", "    duration_seconds=sed.duration,\n", "    prompt_influence=0.75,  # NOTE: default, seems to work fine\n", ")" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-11-03 18:14:57,872 [INFO] audio-books (tts.py): request to 11labs sound effect generation with params {'duration_seconds': 5.271, 'prompt_influence': 0.75} for the following text: \"room filled with warm, inviting light\"\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2024-11-03 18:15:03,710 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/sound-generation \"HTTP/1.1 200 OK\"\n" ] } ], "source": [ "sound_effect_response = await tts.sound_generation_consumed(params=params)" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "bytes" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(sound_effect_response[0])" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "from src import utils" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "# effects_dp = os.path.join('data', 'tmp', 'sound_effects')\n", "# os.makedirs(effects_dp, exist_ok=True)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-11-03 18:25:48,886 [INFO] audio-books (utils.py): saving to: \"data/tmp/sound_effects/sounf_effect.wav\"\n" ] } ], "source": [ "# out_fp = os.path.join(effects_dp, 'sounf_effect.wav')\n", "# 
utils.write_chunked_bytes(data=sound_effect_response, fp=out_fp)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## generate audio with timestamps" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# text = samples.ARCH_WIKI_1[:200]\n", "# pprint(text)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# text = '''\\\n", "# hello, this is the test when I am voicing 123 different phrases (some in parentheses),\n", "# with newlines\n", "# some unreadable characters: #!@%*&\n", "# LooLLL123\n", "# how is it??? going!!\n", "# and some smiles: :))\n", "# '''" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "

\n", " \n", " Inside, the crimson room bloomed with light.\n", " \n", " \n", " Tom and Miss Baker sat at either end of the long couch, and she read aloud to him from the Saturday Evening Post—\n", " the words, murmurous and uninflected, running together in a soothing tune.\n", " \n", " \n", " The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, \n", " glinted along the paper as she turned a page with a flutter of slender muscles in her arms.\n", " \n", "

\n", "
\n" ] } ], "source": [ "text = '\\n

\\n \\n Inside, the crimson room bloomed with light.\\n \\n \\n Tom and Miss Baker sat at either end of the long couch, and she read aloud to him from the Saturday Evening Post—\\n the words, murmurous and uninflected, running together in a soothing tune.\\n \\n \\n The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, \\n glinted along the paper as she turned a page with a flutter of slender muscles in her arms.\\n \\n

\\n
'\n", "print(text)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "out_dp = os.path.join('data', 'intonation_experiment', '1')\n", "os.makedirs(out_dp, exist_ok=True)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-11-05 01:19:44,111 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '8opUN7sGOKbyojnjvNdl', 'output_format': , 'seed': 672} for the following text: \"Inside, the crimson room bloomed with light.\"\n", "2024-11-05 01:19:45,277 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-05 01:19:45,344 [INFO] audio-books (utils.py): saving to: \"data/intonation_experiment/1/no_ssml.ix0.mp3\"\n", "2024-11-05 01:19:45,349 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '8opUN7sGOKbyojnjvNdl', 'output_format': , 'seed': 672} for the following text: \"Inside, the crimson room bloomed with light.\"\n", "2024-11-05 01:19:46,211 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-05 01:19:46,268 [INFO] audio-books (utils.py): saving to: \"data/intonation_experiment/1/no_ssml.ix1.mp3\"\n", "2024-11-05 01:19:46,270 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '8opUN7sGOKbyojnjvNdl', 'output_format': , 'seed': 672} for the following text: \"Inside, the crimson room bloomed with light.\"\n", "2024-11-05 01:19:47,110 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-05 01:19:47,260 [INFO] audio-books (utils.py): saving to: 
\"data/intonation_experiment/1/no_ssml.ix2.mp3\"\n" ] } ], "source": [ "text = 'Inside, the crimson room bloomed with light.'\n", "params = TTSParams(voice_id=\"8opUN7sGOKbyojnjvNdl\", text=text, seed=672)\n", "\n", "for ix in range(3):\n", " out_fp = os.path.join(out_dp, f'no_ssml.ix{ix}')\n", " response = await tts.tts_w_timestamps(params)\n", " response.write_audio_to_file(out_fp, params.output_format)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-11-05 01:20:43,720 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '8opUN7sGOKbyojnjvNdl', 'output_format': , 'seed': 672} for the following text: \"Inside, the crimson room bloomed with light.\"\n", "2024-11-05 01:20:44,585 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-05 01:20:44,629 [INFO] audio-books (utils.py): saving to: \"data/intonation_experiment/1/with_ssml.ix0.mp3\"\n", "2024-11-05 01:20:44,630 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '8opUN7sGOKbyojnjvNdl', 'output_format': , 'seed': 672} for the following text: \"Inside, the crimson room bloomed with light.\"\n", "2024-11-05 01:20:45,461 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-05 01:20:45,526 [INFO] audio-books (utils.py): saving to: \"data/intonation_experiment/1/with_ssml.ix1.mp3\"\n", "2024-11-05 01:20:45,527 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '8opUN7sGOKbyojnjvNdl', 'output_format': , 'seed': 672} for the following text: \"Inside, the crimson room bloomed with light.\"\n", "2024-11-05 01:20:46,415 [INFO] httpx (_client.py): HTTP Request: POST 
https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-05 01:20:46,435 [INFO] audio-books (utils.py): saving to: \"data/intonation_experiment/1/with_ssml.ix2.mp3\"\n" ] } ], "source": [ "text = 'Inside, the crimson room bloomed with light.'\n", "params = TTSParams(voice_id=\"8opUN7sGOKbyojnjvNdl\", text=text, seed=672)\n", "\n", "for ix in range(3):\n", " out_fp = os.path.join(out_dp, f'with_ssml.ix{ix}')\n", " response = await tts.tts_w_timestamps(params)\n", " response.write_audio_to_file(out_fp, params.output_format)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-11-05 01:32:44,099 [INFO] audio-books (tts.py): request to 11labs TTS endpoint with params {'voice_id': '8opUN7sGOKbyojnjvNdl', 'output_format': , 'seed': 672} for the following text: \"\n", "Inside, the crimson room bloomed with light.\n", "Tom and Miss Baker sat at either end of the long couch,\n", " and she read aloud to him from the Saturday Evening Post\n", "- the words, murmurous and uninflected\n", "\"\n", "2024-11-05 01:32:46,529 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-11-05 01:32:46,612 [INFO] audio-books (utils.py): saving to: \"data/intonation_experiment/1/with_prosody_rate.ix0.mp3\"\n" ] } ], "source": [ "text = '''\n", "Inside, the crimson room bloomed with light.\n", "Tom and Miss Baker sat at either end of the long couch,\n", " and she read aloud to him from the Saturday Evening Post\n", "- the words, murmurous and uninflected\n", "'''\n", "\n", "params = TTSParams(voice_id=\"8opUN7sGOKbyojnjvNdl\", text=text, seed=672)\n", "\n", "for ix in range(1):\n", " out_fp = os.path.join(out_dp, f'with_prosody_rate.ix{ix}')\n", " response = await tts.tts_w_timestamps(params)\n", 
" response.write_audio_to_file(out_fp, params.output_format)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/ales/dev/python-venvs/ai-audio-books/lib/python3.12/site-packages/pydantic/main.py:390: UserWarning: Pydantic serializer warnings:\n", " Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected\n", " Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected\n", " Expected `VoiceSettings` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected\n", " Expected `int` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected\n", " Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected\n", " Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected\n", " return self.__pydantic_serializer__.to_python(\n", "2024-10-27 21:44:22,111 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n" ] } ], "source": [ "response_raw = await tts.ELEVEN_CLIENT_ASYNC.text_to_speech.convert_with_timestamps(\n", " **params.to_dict()\n", ")" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "response_parsed = TTSTimestampsResponse.model_validate(response_raw)" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'hello, this is the test when I am voicing 123 different phrases (some in parentheses),\\nwith newlines\\nsome unreadable characters: #!@%*&\\nLooLLL123\\nhow is it??? 
going!!\\nand some smiles: :))\\n'" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "t = response_parsed.alignment.text_joined\n", "t" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "from src.schemas import TTSTimestampsAlignment" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "# three toy alignments with identical local timings, used to check how combine_alignments offsets them\n", "a1 = TTSTimestampsAlignment(\n", " characters=list('abc'),\n", " character_start_times_seconds=[0.1, 0.2, 0.3],\n", " character_end_times_seconds=[0.15, 0.25, 0.35],\n", ")\n", "a2 = TTSTimestampsAlignment(\n", " characters=list('def'),\n", " character_start_times_seconds=[0.1, 0.2, 0.3],\n", " character_end_times_seconds=[0.15, 0.25, 0.35],\n", ")\n", "a3 = TTSTimestampsAlignment(\n", " characters=list(\"ghi\"),\n", " character_start_times_seconds=[0.1, 0.2, 0.3],\n", " character_end_times_seconds=[0.15, 0.25, 0.35],\n", ")" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [], "source": [ "a = TTSTimestampsAlignment.combine_alignments(alignments=[a1, a2, a3])" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
charstartend
0a0.100.15
1b0.200.25
2c0.300.35
3d0.450.50
4e0.550.60
5f0.650.70
6g0.800.85
7h0.900.95
8i1.001.05
\n", "
" ], "text/plain": [ " char start end\n", "0 a 0.10 0.15\n", "1 b 0.20 0.25\n", "2 c 0.30 0.35\n", "3 d 0.45 0.50\n", "4 e 0.55 0.60\n", "5 f 0.65 0.70\n", "6 g 0.80 0.85\n", "7 h 0.90 0.95\n", "8 i 1.00 1.05" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a.to_dataframe()" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.44999999999999996" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a.get_start_time_by_char_ix(3)" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.85" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a.get_end_time_by_char_ix(6)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## compare audio quality for different formats" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('This document is an annotated index of popular articles and important '\n", " 'information for improving and adding functionalities to the installed Arch '\n", " 'system. Readers are assumed to have read and followed t')\n" ] } ], "source": [ "text = samples.ARCH_WIKI_1[:200]\n", "pprint(text)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "params_base = TTSParams(\n", " voice_id=\"8opUN7sGOKbyojnjvNdl\",\n", " text=\"hello, how are you doing? 
this is the test aiming to decide which audio quality option to use\",\n", " # text=text,\n", ")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/ales/dev/python-venvs/ai-audio-books/lib/python3.12/site-packages/pydantic/main.py:390: UserWarning: Pydantic serializer warnings:\n", " Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected\n", " Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected\n", " Expected `VoiceSettings` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected\n", " Expected `int` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected\n", " Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected\n", " Expected `str` but got `ellipsis` with value `Ellipsis` - serialized value may not be as expected\n", " return self.__pydantic_serializer__.to_python(\n", "2024-10-27 18:16:54,779 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_22050_32 \"HTTP/1.1 200 OK\"\n", "2024-10-27 18:16:54,807 [INFO] audio-books (utils.py): saving to: \"data/compare_audio_quality2/compare.mp3_22050_32.mp3\"\n", "2024-10-27 18:16:56,094 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_32 \"HTTP/1.1 200 OK\"\n", "2024-10-27 18:16:56,110 [INFO] audio-books (utils.py): saving to: \"data/compare_audio_quality2/compare.mp3_44100_32.mp3\"\n", "2024-10-27 18:16:57,444 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_64 \"HTTP/1.1 200 OK\"\n", "2024-10-27 18:16:57,472 [INFO] audio-books (utils.py): saving to: 
\"data/compare_audio_quality2/compare.mp3_44100_64.mp3\"\n", "2024-10-27 18:16:59,066 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_96 \"HTTP/1.1 200 OK\"\n", "2024-10-27 18:16:59,106 [INFO] audio-books (utils.py): saving to: \"data/compare_audio_quality2/compare.mp3_44100_96.mp3\"\n", "2024-10-27 18:17:00,483 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_128 \"HTTP/1.1 200 OK\"\n", "2024-10-27 18:17:00,513 [INFO] audio-books (utils.py): saving to: \"data/compare_audio_quality2/compare.mp3_44100_128.mp3\"\n", "2024-10-27 18:17:01,877 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=mp3_44100_192 \"HTTP/1.1 200 OK\"\n", "2024-10-27 18:17:01,898 [INFO] audio-books (utils.py): saving to: \"data/compare_audio_quality2/compare.mp3_44100_192.mp3\"\n", "2024-10-27 18:17:03,164 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=pcm_16000 \"HTTP/1.1 200 OK\"\n", "2024-10-27 18:17:03,213 [INFO] audio-books (utils.py): saving to: \"data/compare_audio_quality2/compare.pcm_16000.wav\"\n", "2024-10-27 18:17:04,584 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=pcm_22050 \"HTTP/1.1 200 OK\"\n", "2024-10-27 18:17:04,651 [INFO] audio-books (utils.py): saving to: \"data/compare_audio_quality2/compare.pcm_22050.wav\"\n", "2024-10-27 18:17:05,986 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=pcm_24000 \"HTTP/1.1 200 OK\"\n", "2024-10-27 18:17:06,074 [INFO] audio-books (utils.py): saving to: 
\"data/compare_audio_quality2/compare.pcm_24000.wav\"\n", "2024-10-27 18:17:07,600 [INFO] httpx (_client.py): HTTP Request: POST https://api.elevenlabs.io/v1/text-to-speech/8opUN7sGOKbyojnjvNdl/with-timestamps?output_format=pcm_44100 \"HTTP/1.1 200 OK\"\n", "2024-10-27 18:17:07,698 [INFO] audio-books (utils.py): saving to: \"data/compare_audio_quality2/compare.pcm_44100.wav\"\n" ] } ], "source": [ "# out_dp = \"data/compare_audio_quality2\"\n", "# os.makedirs(out_dp, exist_ok=True)\n", "\n", "# for audio_format in AudioOutputFormat:\n", "# if audio_format is AudioOutputFormat.ULAW_8000:\n", "# continue\n", "\n", "# params = params_base.model_copy(deep=True)\n", "# params.output_format = audio_format\n", "\n", "# response_raw = await ELEVEN_CLIENT_ASYNC.text_to_speech.convert_with_timestamps(\n", "# **params.to_dict()\n", "# )\n", "# response_parsed = TTSTimestampsResponse.model_validate(response_raw)\n", "\n", "# filepath_no_ext = os.path.join(out_dp, f\"compare.{audio_format}\")\n", "# out_fp = response_parsed.write_audio_to_file(\n", "# filepath_no_ext=filepath_no_ext, audio_format=audio_format\n", "# )" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ffprobe version 7.0.1 Copyright (c) 2007-2024 the FFmpeg developers\n", " built with Apple clang version 15.0.0 (clang-1500.3.9.4)\n", " configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.0.1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx 
--enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enable-libzimg --disable-libjack --disable-indev=jack --enable-videotoolbox --enable-audiotoolbox --enable-neon\n", " libavutil 59. 8.100 / 59. 8.100\n", " libavcodec 61. 3.100 / 61. 3.100\n", " libavformat 61. 1.100 / 61. 1.100\n", " libavdevice 61. 1.100 / 61. 1.100\n", " libavfilter 10. 1.100 / 10. 1.100\n", " libswscale 8. 1.100 / 8. 1.100\n", " libswresample 5. 1.100 / 5. 1.100\n", " libpostproc 58. 1.100 / 58. 1.100\n", "\u001b[0;35m[mp3 @ 0x134724c80] \u001b[0m\u001b[0;33mEstimating duration from bitrate, this may be inaccurate\n", "\u001b[0mInput #0, mp3, from 'data/compare_audio_quality/compare.mp3_44100_64.mp3':\n", " Duration: 00:00:12.56, start: 0.000000, bitrate: 64 kb/s\n", " Stream #0:0: Audio: mp3 (mp3float), 44100 Hz, mono, fltp, 64 kb/s\n" ] } ], "source": [ "!ffprobe data/compare_audio_quality/compare.mp3_44100_64.mp3" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## normalize audio\n", "\n", "Listen to one clip after a fixed gain cut, after pydub's built-in `normalize`, and after the project helper `utils.normalize_audio` at several target levels." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# NOTE(review): section-local imports, kept together so the section runs top to bottom;\n", "# consider moving them into the notebook's top import cell.\n", "import pydub\n", "import pydub.effects\n", "from IPython.display import display\n", "\n", "from src import utils" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "fp_in = 'data/tmp/tmp.mp3'" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "audio = pydub.AudioSegment.from_file(fp_in)" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "audio" ] },
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pydub overloads `-` on AudioSegment: subtracting a number lowers the gain by that many dB\n", "audio - 5" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pydub's built-in effect, called with its default headroom\n", "pydub.effects.normalize(audio)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Target levels to audition with the project helper.\n", "# NOTE(review): the exact meaning of target_dBFS is defined in src/utils.py - confirm there.\n", "TARGET_LEVELS_DBFS = (-20, -25)\n", "\n", "for target_dbfs in TARGET_LEVELS_DBFS:\n", "    print(f\"target_dBFS={target_dbfs}\")\n", "    display(utils.normalize_audio(audio, target_dBFS=target_dbfs))" ] }
], "metadata": { "kernelspec": { "display_name": "ai-audio-books", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 }