File size: 6,826 Bytes
93a309d
 
 
 
 
3ee8f12
93a309d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ee8f12
93a309d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ee8f12
93a309d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2fa877
93a309d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import openai
import json
from requests import HTTPError
from abc import ABC, abstractmethod

from .prompts import SOUND_EFFECT_GENERATION, SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION, TEXT_MODIFICATION, TEXT_MODIFICATION_WITH_SSML
from .utils import get_audio_duration
from src.config import logger


class AbstractEffectGenerator(ABC):
    """Interface implemented by the effect generators in this module.

    Every method takes the source text and returns a ``dict``. The
    signatures mirror the concrete implementations in this file
    (``EffectGenerator`` and ``EffectGeneratorAsync``); note that the
    async implementation overrides these methods as coroutines.
    """

    @abstractmethod
    def generate_text_for_sound_effect(self, text: str) -> dict:
        """Return a dict describing a sound effect for ``text``."""

    @abstractmethod
    def generate_parameters_for_sound_effect(self, text: str, generated_audio_file: str = None) -> dict:
        """Return sound-effect parameters for ``text``.

        Args:
            text: Source text the sound effect is generated for.
            generated_audio_file: Optional path of an already generated
                audio file. Defaults to ``None``, matching the concrete
                implementations in this file.
        """

    @abstractmethod
    def add_emotion_to_text(self, text: str) -> dict:
        """Return a dict holding the modified version of ``text``."""

class EffectGenerator(AbstractEffectGenerator):
    """Synchronous effect generator backed by the OpenAI chat-completions API.

    Each request sends a system prompt plus the user text and asks for a
    JSON-object response, which is decoded and returned as a ``dict``.
    """

    def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
        """Create the OpenAI client and select the prompts to use.

        Args:
            api_key: API key passed to ``openai.OpenAI``.
            predict_duration: When true, ``SOUND_EFFECT_GENERATION`` is used
                as the sound-effect system prompt; otherwise
                ``SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION``.
            model_type: Model name sent with every completion request.
        """
        self.client = openai.OpenAI(api_key=api_key)
        self.sound_effect_prompt = (
            SOUND_EFFECT_GENERATION
            if predict_duration
            else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
        )
        self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
        self.model_type = model_type
        logger.info(f"EffectGenerator initialized with model_type: {model_type}, predict_duration: {predict_duration}")

    def generate_text_for_sound_effect(self, text: str) -> dict:
        """Generate sound effect description and parameters based on input text.

        Args:
            text: User text sent to the model alongside
                ``self.sound_effect_prompt``.

        Returns:
            The model reply decoded from JSON.

        Raises:
            RuntimeError: On any failure (request error, malformed JSON,
                anything unexpected). The original exception is attached
                as ``__cause__``.
        """
        # Bound before the try so the JSON error handler can always format it,
        # even if the decode error were raised before the reply was extracted.
        chatgpt_output = None
        try:
            completion = self.client.chat.completions.create(
                model=self.model_type,
                messages=[
                    {"role": "system", "content": self.sound_effect_prompt},
                    {"role": "user", "content": text}
                ],
                response_format={"type": "json_object"}
            )
            # Extracting the output
            chatgpt_output = completion.choices[0].message.content

            # Parse and return JSON response
            output_dict = json.loads(chatgpt_output)
            logger.info("Successfully generated sound effect description: %s", output_dict)
            return output_dict

        except json.JSONDecodeError as e:
            logger.error("Failed to parse the output text as JSON: %s", e)
            raise RuntimeError(f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}") from e

        # NOTE(review): this catches requests.HTTPError; errors raised by the
        # openai client itself presumably land in the generic branch below --
        # confirm whether this branch is still reachable.
        except HTTPError as e:
            logger.error("HTTP error occurred: %s", e)
            raise RuntimeError(f"HTTP Error: {e}") from e

        except Exception as e:
            logger.error("Unexpected error occurred: %s", e)
            raise RuntimeError(f"Unexpected Error: {e}") from e

    def generate_parameters_for_sound_effect(self, text: str, generated_audio_file: str = None)-> dict:
        """Return the model output for ``text``, optionally with a measured duration.

        Args:
            text: Text forwarded to ``generate_text_for_sound_effect``.
            generated_audio_file: Optional audio file; when given, the
                ``duration_seconds`` entry of the result is overwritten with
                ``get_audio_duration(generated_audio_file)``.

        Returns:
            The dict produced by ``generate_text_for_sound_effect``.

        Raises:
            RuntimeError: Propagated from ``generate_text_for_sound_effect``.
        """
        llm_output = self.generate_text_for_sound_effect(text)
        if generated_audio_file is not None:
            llm_output['duration_seconds'] = get_audio_duration(generated_audio_file)
            logger.info("Added duration_seconds to output based on generated audio file: %s", generated_audio_file)
        return llm_output

    def add_emotion_to_text(self, text: str) -> dict:
        """Ask the model to rewrite ``text`` using ``self.text_modification_prompt``.

        Args:
            text: User text sent to the model.

        Returns:
            The model reply decoded from JSON.

        Raises:
            RuntimeError: If the reply is not valid JSON; the decode error is
                attached as ``__cause__``. Errors raised by the API call
                itself are not wrapped and propagate unchanged.
        """
        completion = self.client.chat.completions.create(
            model=self.model_type,
            messages=[{"role": "system", "content": self.text_modification_prompt},
                      {"role": "user", "content": text}],
            response_format={"type": "json_object"}
        )
        chatgpt_output = completion.choices[0].message.content
        try:
            output_dict = json.loads(chatgpt_output)
        except json.JSONDecodeError as e:
            logger.error("Error in parsing the modified text: %s", e)
            # Raising a plain string is itself a TypeError in Python 3 and
            # hides the message; raise a real exception, as the sibling
            # method does.
            raise RuntimeError(f"error, output_text: {chatgpt_output}") from e
        logger.info("Successfully modified text with emotional cues: %s", output_dict)
        return output_dict


class EffectGeneratorAsync(AbstractEffectGenerator):
    """Asynchronous effect generator backed by the OpenAI chat-completions API.

    Mirrors ``EffectGenerator`` but uses ``openai.AsyncOpenAI``; every
    public method is a coroutine and must be awaited.
    """

    def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
        """Create the async OpenAI client and select the prompts to use.

        Args:
            api_key: API key passed to ``openai.AsyncOpenAI``.
            predict_duration: When true, ``SOUND_EFFECT_GENERATION`` is used
                as the sound-effect system prompt; otherwise
                ``SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION``.
            model_type: Model name sent with every completion request.
        """
        self.client = openai.AsyncOpenAI(api_key=api_key)
        self.sound_effect_prompt = (
            SOUND_EFFECT_GENERATION
            if predict_duration
            else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
        )
        self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
        self.model_type = model_type
        # Log construction the same way the synchronous generator does.
        logger.info(
            "EffectGeneratorAsync initialized with model_type: %s, predict_duration: %s",
            model_type,
            predict_duration,
        )

    async def generate_text_for_sound_effect(self, text: str) -> dict:
        """Asynchronous version to generate sound effect description.

        Args:
            text: User text sent to the model alongside
                ``self.sound_effect_prompt``.

        Returns:
            The model reply decoded from JSON.

        Raises:
            RuntimeError: On any failure (request error, malformed JSON,
                anything unexpected). The original exception is attached
                as ``__cause__``.
        """
        # Bound before the try so the JSON error handler can always format it,
        # even if the decode error were raised before the reply was extracted.
        chatgpt_output = None
        try:
            completion = await self.client.chat.completions.create(
                model=self.model_type,
                messages=[
                    {"role": "system", "content": self.sound_effect_prompt},
                    {"role": "user", "content": text}
                ],
                response_format={"type": "json_object"}
            )
            # Extracting the output
            chatgpt_output = completion.choices[0].message.content

            # Parse and return JSON response
            output_dict = json.loads(chatgpt_output)
            logger.info("Successfully generated sound effect description: %s", output_dict)
            return output_dict

        except json.JSONDecodeError as e:
            logger.error("Failed to parse the output text as JSON: %s", e)
            raise RuntimeError(f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}") from e

        # NOTE(review): this catches requests.HTTPError; errors raised by the
        # openai client itself presumably land in the generic branch below --
        # confirm whether this branch is still reachable.
        except HTTPError as e:
            logger.error("HTTP error occurred: %s", e)
            raise RuntimeError(f"HTTP Error: {e}") from e

        except Exception as e:
            logger.error("Unexpected error occurred: %s", e)
            raise RuntimeError(f"Unexpected Error: {e}") from e

    async def generate_parameters_for_sound_effect(self, text: str, generated_audio_file: str = None) -> dict:
        """Return the model output for ``text``, optionally with a measured duration.

        Args:
            text: Text forwarded to ``generate_text_for_sound_effect``.
            generated_audio_file: Optional audio file; when given, the
                ``duration_seconds`` entry of the result is overwritten with
                ``get_audio_duration(generated_audio_file)``.

        Returns:
            The dict produced by ``generate_text_for_sound_effect``.

        Raises:
            RuntimeError: Propagated from ``generate_text_for_sound_effect``.
        """
        llm_output = await self.generate_text_for_sound_effect(text)
        if generated_audio_file is not None:
            # NOTE(review): get_audio_duration is called synchronously; if it
            # performs file I/O it will block the event loop -- confirm.
            llm_output['duration_seconds'] = get_audio_duration(generated_audio_file)
            logger.info("Added duration_seconds to output based on generated audio file: %s", generated_audio_file)
        return llm_output

    async def add_emotion_to_text(self, text: str) -> dict:
        """Ask the model to rewrite ``text`` using ``self.text_modification_prompt``.

        Args:
            text: User text sent to the model.

        Returns:
            The model reply decoded from JSON.

        Raises:
            RuntimeError: If the reply is not valid JSON; the decode error is
                attached as ``__cause__``. Errors raised by the API call
                itself are not wrapped and propagate unchanged.
        """
        completion = await self.client.chat.completions.create(
            model=self.model_type,
            messages=[{"role": "system", "content": self.text_modification_prompt},
                      {"role": "user", "content": text}],
            response_format={"type": "json_object"}
        )
        chatgpt_output = completion.choices[0].message.content
        try:
            output_dict = json.loads(chatgpt_output)
        except json.JSONDecodeError as e:
            logger.error("Error in parsing the modified text: %s", e)
            # Raising a plain string is itself a TypeError in Python 3 and
            # hides the message; raise a real exception, as the sibling
            # method does.
            raise RuntimeError(f"error, output_text: {chatgpt_output}") from e
        logger.info("Successfully modified text with emotional cues: %s", output_dict)
        return output_dict