navalnica committed · Commit d984557 · 1 Parent(s): 655c692

use openai api key instead of mlapi api key; format

Files changed:
- .env.template +0 -1
- src/audio_generators.py +17 -8
- src/config.py +1 -2
- src/emotions/generation.py +77 -35
- src/emotions/prompts.py +1 -1
- src/emotions/utils.py +19 -14
.env.template CHANGED
@@ -1,5 +1,4 @@
 OPENAI_API_KEY="..."
 ELEVEN_LABS_API_KEY="..."
-AIML_API_KEY="..."
 AUTH_USERS="admin,community_user"
 AUTH_PASS="..."
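The template now carries only the four variables the Space still reads. A minimal sketch of how they are typically consumed, assuming python-dotenv; the repo's src/config.py reads os.environ directly, so any mechanism that populates the process environment works:

# Hypothetical startup snippet: load .env into the process environment and
# read the same keys src/config.py expects. python-dotenv is an assumption.
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]  # raises KeyError if missing
ELEVEN_LABS_API_KEY = os.environ["ELEVEN_LABS_API_KEY"]
AUTH_USERS = os.environ["AUTH_USERS"].split(",")  # ["admin", "community_user"]
AUTH_PASS = os.environ["AUTH_PASS"]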
src/audio_generators.py CHANGED
@@ -13,7 +13,7 @@ from src.tts import tts_astream, sound_generation_astream
 from src.utils import consume_aiter
 from src.emotions.generation import EffectGeneratorAsync
 from src.emotions.utils import add_overlay_for_audio
-from src.config import
+from src.config import ELEVENLABS_MAX_PARALLEL, logger
 from src.text_split_chain import SplitTextOutput
 
 
@@ -58,7 +58,7 @@ class AudioGeneratorSimple:
 class AudioGeneratorWithEffects:
 
     def __init__(self):
-        self.effect_generator = EffectGeneratorAsync(
+        self.effect_generator = EffectGeneratorAsync()
         self.semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)
         self.temp_files = []
 
@@ -87,7 +87,9 @@ class AudioGeneratorWithEffects:
         )
 
         # Step 4: Merge audio files
-        normalized_audio_chunks = self._normalize_audio_chunks(
+        normalized_audio_chunks = self._normalize_audio_chunks(
+            audio_chunks, self.temp_files
+        )
         final_output = self._merge_audio_files(normalized_audio_chunks)
 
         # Clean up temporary files
@@ -184,11 +186,14 @@ class AudioGeneratorWithEffects:
         for idx, tts_filename in enumerate(tts_audio_files):
             # If the line has sound emotion data, generate sound effect and overlay
             if idx in lines_for_sound_effect:
-
+                # Get next sound effect data
+                sound_effect_data = sound_emotion_results.pop(0)
                 sound_effect_filename = f"sound_effect_{idx}.wav"
 
                 # Generate sound effect asynchronously
-                sound_result = await consume_aiter(
+                sound_result = await consume_aiter(
+                    sound_generation_astream(sound_effect_data)
+                )
                 with open(sound_effect_filename, "wb") as ab:
                     for chunk in sound_result:
                         ab.write(chunk)
@@ -208,12 +213,16 @@ class AudioGeneratorWithEffects:
 
         return audio_chunks
 
-    def _normalize_audio(
+    def _normalize_audio(
+        self, audio_segment: AudioSegment, target_dBFS: float = -20.0
+    ) -> AudioSegment:
         """Normalize an audio segment to the target dBFS level."""
         change_in_dBFS = target_dBFS - audio_segment.dBFS
         return audio_segment.apply_gain(change_in_dBFS)
 
-    def _normalize_audio_chunks(
+    def _normalize_audio_chunks(
+        self, audio_filenames: list[str], temp_files, target_dBFS: float = -20.0
+    ) -> list[str]:
         """Normalize all audio chunks to the target volume level."""
         normalized_files = []
         for audio_file in audio_filenames:
@@ -246,4 +255,4 @@ class AudioGeneratorWithEffects:
             try:
                 os.remove(temp_file)
             except FileNotFoundError:
-                continue
+                continue
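The new _normalize_audio / _normalize_audio_chunks helpers are plain pydub gain matching: shift each chunk so its loudness (dBFS) lands on a common target before merging. A standalone sketch of the same idea; the file names are placeholders:

# Sketch of the dBFS normalization the new helpers perform.
# AudioSegment.dBFS is the segment's loudness relative to full scale;
# apply_gain shifts it to the target.
from pydub import AudioSegment


def normalize(segment: AudioSegment, target_dBFS: float = -20.0) -> AudioSegment:
    return segment.apply_gain(target_dBFS - segment.dBFS)


chunk = AudioSegment.from_file("tts_chunk_0.wav")  # placeholder file name
normalized = normalize(chunk)
normalized.export("tts_chunk_0_normalized.wav", format="wav")
print(round(normalized.dBFS, 1))  # ~ -20.0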
src/config.py CHANGED
@@ -7,9 +7,8 @@ logging.basicConfig(
 )
 logger = logging.getLogger("audio-books")
 
-
+OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
 ELEVENLABS_API_KEY = os.environ["ELEVEN_LABS_API_KEY"]
-AI_ML_API_KEY = os.environ["AIML_API_KEY"]
 
 FILE_SIZE_MAX = 0.5  # in mb
 
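The replacement key keeps the config module's fail-fast pattern: os.environ[...] raises KeyError at import time when a variable is unset, instead of deferring the failure to the first API call. A small sketch of the trade-off; the SystemExit wrapper is illustrative, not from the repo:

# os.environ[...] fails at import time; os.getenv() defers the failure.
import os

try:
    OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]  # KeyError if unset
except KeyError as e:
    raise SystemExit(f"Missing required environment variable: {e}") from None

# A silent default would only surface much later, as an authentication error:
maybe_key = os.getenv("OPENAI_API_KEY")  # None if unset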
src/emotions/generation.py CHANGED
@@ -3,31 +3,45 @@ import json
 from requests import HTTPError
 from abc import ABC, abstractmethod
 
-from .prompts import
+from .prompts import (
+    SOUND_EFFECT_GENERATION,
+    SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION,
+    TEXT_MODIFICATION,
+    TEXT_MODIFICATION_WITH_SSML,
+)
 from .utils import get_audio_duration
-from src.config import logger
+from src.config import logger, OPENAI_API_KEY
 
 
 class AbstractEffectGenerator(ABC):
     @abstractmethod
-    def generate_text_for_sound_effect(self, text)-> dict:
+    def generate_text_for_sound_effect(self, text) -> dict:
         pass
 
     @abstractmethod
-    def generate_parameters_for_sound_effect(
+    def generate_parameters_for_sound_effect(
+        self, text: str, generated_audio_file: str
+    ) -> dict:
         pass
 
     @abstractmethod
     def add_emotion_to_text(self, text: str) -> dict:
         pass
 
+
 class EffectGenerator(AbstractEffectGenerator):
-    def __init__(self,
-        self.client = openai.OpenAI(api_key=
-        self.sound_effect_prompt =
+    def __init__(self, predict_duration: bool = True, model_type: str = "gpt-4o"):
+        self.client = openai.OpenAI(api_key=OPENAI_API_KEY)
+        self.sound_effect_prompt = (
+            SOUND_EFFECT_GENERATION
+            if predict_duration
+            else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
+        )
         self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
         self.model_type = model_type
+        logger.info(
+            f"EffectGenerator initialized with model_type: {model_type}, predict_duration: {predict_duration}"
+        )
 
     def generate_text_for_sound_effect(self, text: str) -> dict:
         """Generate sound effect description and parameters based on input text."""
@@ -36,21 +50,25 @@ class EffectGenerator(AbstractEffectGenerator):
                 model=self.model_type,
                 messages=[
                     {"role": "system", "content": self.sound_effect_prompt},
-                    {"role": "user", "content": text}
+                    {"role": "user", "content": text},
                 ],
-                response_format={"type": "json_object"}
+                response_format={"type": "json_object"},
             )
             # Extracting the output
             chatgpt_output = completion.choices[0].message.content
 
             # Parse and return JSON response
             output_dict = json.loads(chatgpt_output)
-            logger.info(
+            logger.info(
+                "Successfully generated sound effect description: %s", output_dict
+            )
             return output_dict
 
         except json.JSONDecodeError as e:
             logger.error("Failed to parse the output text as JSON: %s", e)
-            raise RuntimeError(
+            raise RuntimeError(
+                f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}"
+            )
 
         except HTTPError as e:
             logger.error("HTTP error occurred: %s", e)
@@ -60,24 +78,33 @@ class EffectGenerator(AbstractEffectGenerator):
             logger.error("Unexpected error occurred: %s", e)
             raise RuntimeError(f"Unexpected Error: {e}")
 
-    def generate_parameters_for_sound_effect(
+    def generate_parameters_for_sound_effect(
+        self, text: str, generated_audio_file: str = None
+    ) -> dict:
         llm_output = self.generate_text_for_sound_effect(text)
         if generated_audio_file is not None:
-            llm_output[
-            logger.info(
+            llm_output["duration_seconds"] = get_audio_duration(generated_audio_file)
+            logger.info(
+                "Added duration_seconds to output based on generated audio file: %s",
+                generated_audio_file,
+            )
         return llm_output
 
     def add_emotion_to_text(self, text: str) -> dict:
         completion = self.client.chat.completions.create(
             model=self.model_type,
-            messages=[
-
-
+            messages=[
+                {"role": "system", "content": self.text_modification_prompt},
+                {"role": "user", "content": text},
+            ],
+            response_format={"type": "json_object"},
         )
         chatgpt_output = completion.choices[0].message.content
         try:
             output_dict = json.loads(chatgpt_output)
-            logger.info(
+            logger.info(
+                "Successfully modified text with emotional cues: %s", output_dict
+            )
             return output_dict
         except json.JSONDecodeError as e:
             logger.error("Error in parsing the modified text: %s", e)
@@ -85,9 +112,13 @@ class EffectGenerator(AbstractEffectGenerator):
 
 
 class EffectGeneratorAsync(AbstractEffectGenerator):
-    def __init__(self,
-        self.client = openai.AsyncOpenAI(api_key=
-        self.sound_effect_prompt =
+    def __init__(self, predict_duration: bool = True, model_type: str = "gpt-4o"):
+        self.client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)
+        self.sound_effect_prompt = (
+            SOUND_EFFECT_GENERATION
+            if predict_duration
+            else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
+        )
         self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
         self.model_type = model_type
 
@@ -98,21 +129,25 @@ class EffectGeneratorAsync(AbstractEffectGenerator):
                 model=self.model_type,
                 messages=[
                     {"role": "system", "content": self.sound_effect_prompt},
-                    {"role": "user", "content": text}
+                    {"role": "user", "content": text},
                 ],
-                response_format={"type": "json_object"}
+                response_format={"type": "json_object"},
             )
             # Extracting the output
             chatgpt_output = completion.choices[0].message.content
 
             # Parse and return JSON response
             output_dict = json.loads(chatgpt_output)
-            logger.info(
+            logger.info(
+                "Successfully generated sound effect description: %s", output_dict
+            )
             return output_dict
 
         except json.JSONDecodeError as e:
             logger.error("Failed to parse the output text as JSON: %s", e)
-            raise RuntimeError(
+            raise RuntimeError(
+                f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}"
+            )
 
         except HTTPError as e:
             logger.error("HTTP error occurred: %s", e)
@@ -122,27 +157,34 @@ class EffectGeneratorAsync(AbstractEffectGenerator):
             logger.error("Unexpected error occurred: %s", e)
             raise RuntimeError(f"Unexpected Error: {e}")
 
-
-
+    async def generate_parameters_for_sound_effect(
+        self, text: str, generated_audio_file: str = None
+    ) -> dict:
         llm_output = await self.generate_text_for_sound_effect(text)
         if generated_audio_file is not None:
-            llm_output[
-            logger.info(
+            llm_output["duration_seconds"] = get_audio_duration(generated_audio_file)
+            logger.info(
+                "Added duration_seconds to output based on generated audio file: %s",
+                generated_audio_file,
+            )
         return llm_output
 
     async def add_emotion_to_text(self, text: str) -> dict:
         completion = await self.client.chat.completions.create(
             model=self.model_type,
-            messages=[
-
-
+            messages=[
+                {"role": "system", "content": self.text_modification_prompt},
+                {"role": "user", "content": text},
+            ],
+            response_format={"type": "json_object"},
        )
         chatgpt_output = completion.choices[0].message.content
         try:
             output_dict = json.loads(chatgpt_output)
-            logger.info(
+            logger.info(
+                "Successfully modified text with emotional cues: %s", output_dict
+            )
             return output_dict
         except json.JSONDecodeError as e:
             logger.error("Error in parsing the modified text: %s", e)
             raise f"error, output_text: {chatgpt_output}"
-
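Both generator classes lean on OpenAI's JSON mode: response_format={"type": "json_object"} constrains the reply so the json.loads call normally succeeds. A minimal sketch of the pattern, assuming the openai v1 client; prompts and key are placeholders. One bug the commit leaves in place: the async add_emotion_to_text still ends with raise f"error, output_text: {chatgpt_output}", which is itself a TypeError in Python 3 (only exception instances can be raised); the RuntimeError form used elsewhere, and in the handler below, is the correct one.

# Minimal sketch of the JSON-mode call pattern both generators use,
# assuming the openai v1 client; prompt text and key are placeholders.
import json

import openai

client = openai.OpenAI(api_key="...")

completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "Describe a sound effect as a JSON object."},
        {"role": "user", "content": "The old door creaked open."},
    ],
    response_format={"type": "json_object"},  # constrains the reply to valid JSON
)

try:
    output = json.loads(completion.choices[0].message.content)
except json.JSONDecodeError as e:
    # Raising a plain f-string is a TypeError in Python 3;
    # wrap the message in an exception instead.
    raise RuntimeError(f"Model did not return valid JSON: {e}") from e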
src/emotions/prompts.py CHANGED
@@ -154,4 +154,4 @@ with higher values indicating more emphasis on the voice similarity.
 The "style" parameter should also range from 0 to 1,
 where lower values indicate a neutral tone and higher values reflect more stylized or emotional delivery.
 Adjust both according to the emotional intensity of the text.
-"""
+"""
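The prompt's closing instructions ask the model for voice-similarity and "style" values in [0, 1]; presumably these feed ElevenLabs voice settings downstream. A hypothetical consumer, assuming the elevenlabs SDK's VoiceSettings; llm_params is a made-up example of what the model might return:

# Hypothetical consumer of the prompt's output: map the suggested values
# onto ElevenLabs voice settings. Field names follow the elevenlabs SDK.
from elevenlabs import VoiceSettings

llm_params = {"similarity_boost": 0.8, "style": 0.6}  # example LLM output

settings = VoiceSettings(
    stability=0.5,  # not governed by this prompt
    similarity_boost=llm_params["similarity_boost"],  # 0..1, voice similarity
    style=llm_params["style"],  # 0..1, neutral -> stylized delivery
)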
src/emotions/utils.py CHANGED
@@ -18,11 +18,13 @@ def get_audio_duration(filepath: str) -> float:
     return round(duration_in_seconds, 1)
 
 
-def add_overlay_for_audio(main_audio_filename: str,
-
-
-
-
+def add_overlay_for_audio(
+    main_audio_filename: str,
+    sound_effect_filename: str,
+    output_filename: str = None,
+    cycling_effect: bool = True,
+    decrease_effect_volume: int = 0,
+) -> str:
     try:
         main_audio = AudioSegment.from_file(main_audio_filename)
         effect_audio = AudioSegment.from_file(sound_effect_filename)
@@ -33,14 +35,16 @@ def add_overlay_for_audio(main_audio_filename: str,
         while len(effect_audio) < len(main_audio):
             effect_audio += effect_audio
 
-        effect_audio = effect_audio[:len(main_audio)]
+        effect_audio = effect_audio[: len(main_audio)]
 
         if decrease_effect_volume > 0:
             effect_audio = effect_audio - decrease_effect_volume
         combined_audio = main_audio.overlay(effect_audio)
 
         if output_filename is None:
-            output_filename =
+            output_filename = (
+                f"{Path(main_audio_filename).stem}_{Path(sound_effect_filename).stem}.wav"
+            )
         combined_audio.export(output_filename, format="wav")
         return output_filename
 
@@ -50,21 +54,22 @@ def sound_generation(sound_generation_data: dict, output_file: str):
         api_key="YOUR_API_KEY",
     )
     audio = client.text_to_sound_effects.convert(
-        text=sound_generation_data[
-        duration_seconds=sound_generation_data[
-        prompt_influence=sound_generation_data[
+        text=sound_generation_data["text"],
+        duration_seconds=sound_generation_data["duration_seconds"],
+        prompt_influence=sound_generation_data["prompt_influence"],
     )
     save(audio, output_file)
     logger.error("Successfully generated sound effect to file: %s", output_file)
 
+
 async def sound_generation_async(sound_generation_data: dict, output_file: str):
     client = AsyncElevenLabs(
         api_key="YOUR_API_KEY",
     )
     audio = await client.text_to_sound_effects.convert(
-        text=sound_generation_data[
-        duration_seconds=sound_generation_data[
-        prompt_influence=sound_generation_data[
+        text=sound_generation_data["text"],
+        duration_seconds=sound_generation_data["duration_seconds"],
+        prompt_influence=sound_generation_data["prompt_influence"],
    )
     save(audio, output_file)
-    logger.error("Successfully generated sound effect to file: %s", output_file)
+    logger.error("Successfully generated sound effect to file: %s", output_file)
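A usage sketch for the reworked overlay helper; file names are placeholders. With no output_filename it derives "<main-stem>_<effect-stem>.wav", loops the effect to cover the speech, and ducks it by the requested dB. Worth noting, though the commit leaves it as-is: both sound_generation variants report success through logger.error; logger.info looks like the intended level.

# Usage sketch for add_overlay_for_audio (placeholder file names).
from src.emotions.utils import add_overlay_for_audio

mixed = add_overlay_for_audio(
    main_audio_filename="speech_0.wav",
    sound_effect_filename="rain.wav",
    decrease_effect_volume=6,  # duck the effect 6 dB under the speech
)
print(mixed)  # speech_0_rain.wav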