# Spaces: ales / ai-audio-books
# Running
# File size: 11,262 Bytes

import asyncio
import os
import re
from pathlib import Path
from uuid import uuid4
import random

from langchain_community.callbacks import get_openai_callback
from pydub import AudioSegment

from src.lc_callbacks import LCMessageLoggerAsync
from src.tts import tts_astream_consumed, sound_generation_consumed
from src.utils import consume_aiter
from src.emotions.generation import (
    EffectGeneratorAsync,
    TextPreparationForTTSTaskOutput,
)
from src.emotions.utils import add_overlay_for_audio
from src.config import ELEVENLABS_MAX_PARALLEL, logger, OPENAI_MAX_PARALLEL
from src.text_split_chain import SplitTextOutput


class AudioGeneratorSimple:
    """Generate an audio book by running TTS on every phrase, without effects."""

    async def generate_audio(
        self,
        text_split: SplitTextOutput,
        character_to_voice: dict[str, str],
    ) -> Path:
        """Synthesize every phrase and write the results into one file.

        Args:
            text_split: Split text; each item of ``text_split.phrases`` must
                expose ``character`` and ``text``.
            character_to_voice: Maps a character name to the voice id passed
                to the TTS call.

        Returns:
            Path of the written file, ``data/books/<uuid4>.wav``.

        Raises:
            KeyError: If a phrase's character has no entry in
                ``character_to_voice``.
        """
        # Bounds the number of TTS requests that are in flight at once.
        semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)

        async def tts_astream_with_semaphore(voice_id: str, text: str):
            async with semaphore:
                return await tts_astream_consumed(voice_id=voice_id, text=text)

        # Resolve every voice id BEFORE creating any coroutine: a missing
        # character then raises KeyError without leaving already-created
        # coroutines un-awaited.
        requests = [
            (character_to_voice[character_phrase.character], character_phrase.text)
            for character_phrase in text_split.phrases
        ]
        tasks = [
            tts_astream_with_semaphore(voice_id=voice_id, text=text)
            for voice_id, text in requests
        ]

        # gather() returns results in the order the tasks were passed in,
        # so the audio keeps the phrase order.
        results = await asyncio.gather(*tasks)

        save_dir = Path("data") / "books"
        # parents=True: "data" itself may not exist yet on a fresh checkout.
        save_dir.mkdir(parents=True, exist_ok=True)
        audio_combined_fp = save_dir / f"{uuid4()}.wav"

        logger.info(f'saving generated audio book to: "{audio_combined_fp}"')
        # The chunks are written back-to-back without re-encoding.
        # NOTE(review): whether the concatenation is a valid WAV stream
        # depends on what tts_astream_consumed returns -- confirm.
        with open(audio_combined_fp, "wb") as ab:
            for result in results:
                for chunk in result:
                    ab.write(chunk)

        return audio_combined_fp


class AudioGeneratorWithEffects:

    def __init__(self) -> None:
        """Set up the per-instance state used by the generation pipeline."""
        # Filenames of intermediate audio files produced by the last run.
        self.temp_files = []
        # Limits concurrent TTS requests issued through this instance.
        self.semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)
        # Provides the emotion / sound-effect text preparation calls.
        self.effect_generator = EffectGeneratorAsync()

    async def generate_audio(
        self,
        text_split: SplitTextOutput,
        character_to_voice: dict[str, str],
        out_path: Path | None = None,
        *,
        generate_effects: bool = True,
    ) -> Path:
        """Main method to generate the audiobook with TTS, emotion, and sound effects.

        Pipeline: pick the lines that get a sound effect, prepare the text,
        run TTS, overlay the effects, normalize the volume of every chunk,
        merge the chunks into one file and delete the intermediate files.

        Args:
            text_split: Split text whose ``phrases`` are voiced one by one.
            character_to_voice: Maps a character name to a TTS voice id.
            out_path: Destination of the merged file; when ``None`` the merge
                step picks a path itself.
            generate_effects: When false, no line is selected for effects.

        Returns:
            Path of the merged audio file.
        """
        num_lines = len(text_split.phrases)
        # 20% of the lines get a sound effect, or none when effects are off.
        effect_fraction = 0.2 if generate_effects else 0.0
        lines_for_sound_effect = self._select_lines_for_sound_effect(
            num_lines,
            fraction=effect_fraction,
        )
        logger.info(f"{generate_effects = }, {lines_for_sound_effect = }")

        data_for_tts, data_for_sound_effects = await self._prepare_text_for_tts(
            text_split, lines_for_sound_effect
        )

        # Track this run's temp files in a local list so that an overlapping
        # call on the same instance cannot swap the list out from under us.
        # ``self.temp_files`` still points at it for backward compatibility.
        temp_files: list[str] = []
        self.temp_files = temp_files

        try:
            tts_results, tts_temp_files = await self._generate_tts_audio(
                text_split, data_for_tts, character_to_voice
            )
            temp_files.extend(tts_temp_files)

            audio_chunks = await self._add_sound_effects(
                tts_results, lines_for_sound_effect, data_for_sound_effects, temp_files
            )

            normalized_audio_chunks = self._normalize_audio_chunks(
                audio_chunks, temp_files
            )

            final_output = self._merge_audio_files(
                normalized_audio_chunks, save_path=out_path
            )
        finally:
            # Always remove intermediate files, including when a stage fails.
            self._cleanup_temp_files(temp_files)

        return final_output

    def _select_lines_for_sound_effect(
        self, num_lines: int, fraction: float
    ) -> list[int]:
        """Select % of the lines randomly for sound effect generation."""
        return random.sample(range(num_lines), k=int(fraction * num_lines))

    async def _prepare_text_for_tts(
        self, text_split: SplitTextOutput, lines_for_sound_effect: list[int]
    ) -> tuple[list[dict], list[dict]]:
        """Prepare every phrase for TTS and build sound-effect parameters.

        For each phrase one "add emotion" call is scheduled; phrases whose
        index is in ``lines_for_sound_effect`` additionally get a "sound
        effect parameters" call. All calls run concurrently, bounded by
        ``OPENAI_MAX_PARALLEL``.

        Returns:
            ``(emotion_outputs, effect_outputs)``: the ``output`` of every
            result whose ``task`` is ``"add_emotion"`` / ``"add_effects"``
            respectively, each in phrase order.
        """
        semaphore = asyncio.Semaphore(OPENAI_MAX_PARALLEL)

        async def run_task_with_semaphore(func, **params):
            async with semaphore:
                return await func(**params)

        # Results are split by the ``task`` attribute they carry.
        # NOTE(review): presumably the effect generator tags its outputs
        # with exactly these codes -- confirm against EffectGeneratorAsync.
        task_emotion_code = "add_emotion"
        task_effects_code = "add_effects"

        # Built once so the per-phrase membership test below is O(1).
        effect_line_indices = set(lines_for_sound_effect)

        tasks = []
        for idx, character_phrase in enumerate(text_split.phrases):
            character_text = character_phrase.text.strip().lower()

            tasks.append(
                run_task_with_semaphore(
                    func=self.effect_generator.add_emotion_to_text,
                    text=character_text,
                )
            )

            # If this line needs sound effects, generate parameters
            if idx in effect_line_indices:
                tasks.append(
                    run_task_with_semaphore(
                        func=self.effect_generator.generate_parameters_for_sound_effect,
                        text=character_text,
                    )
                )

        # gather() keeps the submission order, so each filtered list below
        # stays in phrase order.
        tasks_results: list[TextPreparationForTTSTaskOutput] = await asyncio.gather(
            *tasks
        )

        emotion_tasks_results = [
            x.output for x in tasks_results if x.task == task_emotion_code
        ]
        effects_tasks_results = [
            x.output for x in tasks_results if x.task == task_effects_code
        ]

        return emotion_tasks_results, effects_tasks_results

    async def _generate_tts_audio(
        self,
        text_split: SplitTextOutput,
        data_for_tts: list[dict],
        character_to_voice: dict[str, str],
    ) -> tuple[list[str], list[str]]:
        """Generate TTS audio for modified text."""
        tasks_for_tts = []
        temp_files = []

        async def tts_astream_with_semaphore(voice_id: str, text: str, params: dict):
            async with self.semaphore:
                bytes_ = await tts_astream_consumed(
                    voice_id=voice_id, text=text, params=params
                )
                # bytes_ = await consume_aiter(iter_)
                return bytes_

        for idx, (data_item, character_phrase) in enumerate(
            zip(data_for_tts, text_split.phrases)
        ):
            voice_id = character_to_voice[character_phrase.character]

            task = tts_astream_with_semaphore(
                voice_id=voice_id,
                text=data_item["modified_text"],
                params=data_item["params"],
            )
            tasks_for_tts.append(task)

        tts_results = await asyncio.gather(*tasks_for_tts)

        # Save the results to temporary files
        tts_audio_files = []
        for idx, tts_result in enumerate(tts_results):
            tts_filename = f"tts_output_{idx}.wav"
            with open(tts_filename, "wb") as ab:
                for chunk in tts_result:
                    ab.write(chunk)
            tts_audio_files.append(tts_filename)
            temp_files.append(tts_filename)

        return tts_audio_files, temp_files

    async def _add_sound_effects(
        self,
        tts_audio_files: list[str],
        lines_for_sound_effect: list[int],
        data_for_sound_effects: list[dict],
        temp_files: list[str],
    ) -> list[str]:
        """Add sound effects to the selected lines.

        Args:
            tts_audio_files: One TTS audio filename per line, in line order.
            lines_for_sound_effect: Indices of the lines that get an effect.
            data_for_sound_effects: Effect parameters, consumed in order: the
                k-th selected line (by ascending index) uses item ``k``.
                This list is not modified.
            temp_files: Bookkeeping list; every file created here is appended
                to it (mutated in place).

        Returns:
            One filename per line, in line order: the overlaid file for lines
            with an effect, the untouched TTS file otherwise.

        Raises:
            IndexError: If there are fewer effect parameter items than
                selected lines.
        """

        semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)

        async def _process_single_phrase(
            tts_filename: str,
            sound_effect_data: dict | None,
            sound_effect_filename: str,
        ):
            # No effect requested for this line: pass the TTS file through.
            if sound_effect_data is None:
                return (tts_filename, [])

            # Only the remote generation call is rate limited.
            async with semaphore:
                sound_result = await sound_generation_consumed(sound_effect_data)

            # save to file
            with open(sound_effect_filename, "wb") as ab:
                for chunk in sound_result:
                    ab.write(chunk)

            # overlay sound effect on TTS audio
            # NOTE(review): the return value is used as the filename of the
            # mixed audio -- confirm against add_overlay_for_audio.
            tts_with_effects_filename = add_overlay_for_audio(
                main_audio_filename=tts_filename,
                sound_effect_filename=sound_effect_filename,
                cycling_effect=True,
                decrease_effect_volume=5,
            )
            tmp_files = [sound_effect_filename, tts_with_effects_filename]
            return (tts_with_effects_filename, tmp_files)

        # Built once so the per-line membership test is O(1).
        effect_line_indices = set(lines_for_sound_effect)
        # A per-call token keeps temp names unique across overlapping runs.
        run_token = uuid4().hex

        # Plan every call before creating any coroutine. Reading by position
        # instead of pop(0) leaves the caller's list intact, and an
        # IndexError here cannot leave coroutines un-awaited.
        planned_calls = []
        next_effect_pos = 0
        for idx, tts_filename in enumerate(tts_audio_files):
            sound_effect_data = None
            if idx in effect_line_indices:
                sound_effect_data = data_for_sound_effects[next_effect_pos]
                next_effect_pos += 1
            planned_calls.append(
                (
                    tts_filename,
                    sound_effect_data,
                    f"sound_effect_{run_token}_{idx}.wav",
                )
            )

        outputs = await asyncio.gather(
            *(
                _process_single_phrase(
                    tts_filename=tts_filename,
                    sound_effect_data=sound_effect_data,
                    sound_effect_filename=sound_effect_filename,
                )
                for tts_filename, sound_effect_data, sound_effect_filename in planned_calls
            )
        )
        audio_chunks = [x[0] for x in outputs]
        tmp_files_to_add = [item for x in outputs for item in x[1]]
        temp_files.extend(tmp_files_to_add)

        return audio_chunks

    def _normalize_audio(
        self, audio_segment: AudioSegment, target_dBFS: float = -20.0
    ) -> AudioSegment:
        """Normalize an audio segment to the target dBFS level."""
        change_in_dBFS = target_dBFS - audio_segment.dBFS
        return audio_segment.apply_gain(change_in_dBFS)

    def _normalize_audio_chunks(
        self, audio_filenames: list[str], temp_files, target_dBFS: float = -20.0
    ) -> list[str]:
        """Normalize all audio chunks to the target volume level."""
        normalized_files = []
        for audio_file in audio_filenames:
            audio_segment = AudioSegment.from_file(audio_file)
            normalized_audio = self._normalize_audio(audio_segment, target_dBFS)

            normalized_filename = f"normalized_{Path(audio_file).stem}.wav"
            normalized_audio.export(normalized_filename, format="wav")
            normalized_files.append(normalized_filename)
            temp_files.append(normalized_filename)

        return normalized_files

    def _merge_audio_files(
        self, audio_filenames: list[str], save_path: Path | None = None
    ) -> Path:
        """Helper function to merge multiple audio files into one."""
        combined = AudioSegment.from_file(audio_filenames[0])
        for filename in audio_filenames[1:]:
            next_audio = AudioSegment.from_file(filename)
            combined += next_audio  # Concatenate the audio

        if save_path is None:
            save_dir = Path("data") / "books"
            save_dir.mkdir(exist_ok=True)
            save_path = save_dir / f"{uuid4()}.wav"
        combined.export(save_path, format="wav")
        return Path(save_path)

    def _cleanup_temp_files(self, temp_files: list[str]) -> None:
        """Helper function to delete all temporary files."""
        for temp_file in temp_files:
            try:
                os.remove(temp_file)
            except FileNotFoundError:
                continue