Spaces:
Running
Running
File size: 6,826 Bytes
93a309d 3ee8f12 93a309d 3ee8f12 93a309d 3ee8f12 93a309d c2fa877 93a309d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
import openai
import json
from requests import HTTPError
from abc import ABC, abstractmethod
from .prompts import SOUND_EFFECT_GENERATION, SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION, TEXT_MODIFICATION, TEXT_MODIFICATION_WITH_SSML
from .utils import get_audio_duration
from src.config import logger
class AbstractEffectGenerator(ABC):
    """Interface for objects that turn text into sound-effect and emotion data.

    Each method is declared to return a ``dict``. Note that
    ``EffectGeneratorAsync`` in this module overrides all three methods as
    coroutines, so callers of that implementation must ``await`` the result.
    """

    @abstractmethod
    def generate_text_for_sound_effect(self, text: str) -> dict:
        """Return a sound-effect description derived from ``text``."""

    @abstractmethod
    def generate_parameters_for_sound_effect(self, text: str, generated_audio_file: str = None) -> dict:
        """Return sound-effect parameters for ``text``.

        ``generated_audio_file`` is optional so the abstract signature agrees
        with both concrete implementations in this module, which default it to
        ``None``.
        """

    @abstractmethod
    def add_emotion_to_text(self, text: str) -> dict:
        """Return a modified version of ``text`` as a ``dict``."""
class EffectGenerator(AbstractEffectGenerator):
    """Synchronous effect generator that queries an OpenAI chat model.

    Every request sends a system prompt plus the user's text and asks for a
    JSON-object response, which is parsed into a ``dict``.
    """

    def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
        """Create the OpenAI client and select the prompts.

        Args:
            api_key: Key passed to ``openai.OpenAI``.
            predict_duration: Selects ``SOUND_EFFECT_GENERATION`` when true,
                otherwise ``SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION``.
            model_type: Model name sent with every chat completion request.
        """
        self.client = openai.OpenAI(api_key=api_key)
        self.sound_effect_prompt = SOUND_EFFECT_GENERATION if predict_duration else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
        self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
        self.model_type = model_type
        logger.info(f"EffectGenerator initialized with model_type: {model_type}, predict_duration: {predict_duration}")

    def generate_text_for_sound_effect(self, text: str) -> dict:
        """Generate sound effect description and parameters based on input text.

        Returns:
            The model's JSON reply parsed into a ``dict``.

        Raises:
            RuntimeError: On any failure (invalid JSON, HTTP error, or anything
                else). The original exception is chained as ``__cause__``.
        """
        try:
            completion = self.client.chat.completions.create(
                model=self.model_type,
                messages=[
                    {"role": "system", "content": self.sound_effect_prompt},
                    {"role": "user", "content": text}
                ],
                response_format={"type": "json_object"}
            )
            # Extracting the output
            chatgpt_output = completion.choices[0].message.content
            # Parse and return JSON response
            output_dict = json.loads(chatgpt_output)
            logger.info("Successfully generated sound effect description: %s", output_dict)
            return output_dict
        except json.JSONDecodeError as e:
            # Only json.loads raises this, so chatgpt_output is always bound here.
            logger.error("Failed to parse the output text as JSON: %s", e)
            raise RuntimeError(f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}") from e
        except HTTPError as e:
            logger.error("HTTP error occurred: %s", e)
            raise RuntimeError(f"HTTP Error: {e}") from e
        except Exception as e:
            logger.error("Unexpected error occurred: %s", e)
            raise RuntimeError(f"Unexpected Error: {e}") from e

    def generate_parameters_for_sound_effect(self, text: str, generated_audio_file: str = None) -> dict:
        """Return the sound-effect description, optionally with a measured duration.

        When ``generated_audio_file`` is given, ``duration_seconds`` in the
        result is overwritten with ``get_audio_duration(generated_audio_file)``.

        Raises:
            RuntimeError: Propagated from ``generate_text_for_sound_effect``.
        """
        llm_output = self.generate_text_for_sound_effect(text)
        if generated_audio_file is not None:
            llm_output['duration_seconds'] = get_audio_duration(generated_audio_file)
            logger.info("Added duration_seconds to output based on generated audio file: %s", generated_audio_file)
        return llm_output

    def add_emotion_to_text(self, text: str) -> dict:
        """Ask the model to rewrite ``text`` using the text-modification prompt.

        Returns:
            The model's JSON reply parsed into a ``dict``.

        Raises:
            RuntimeError: If the reply is not valid JSON. Errors from the API
                call itself are not wrapped and propagate unchanged.
        """
        completion = self.client.chat.completions.create(
            model=self.model_type,
            messages=[{"role": "system", "content": self.text_modification_prompt},
                      {"role": "user", "content": text}],
            response_format={"type": "json_object"}
        )
        chatgpt_output = completion.choices[0].message.content
        try:
            output_dict = json.loads(chatgpt_output)
        except json.JSONDecodeError as e:
            logger.error("Error in parsing the modified text: %s", e)
            # Raising a bare string is itself a TypeError in Python 3 and would
            # hide the parse failure; raise a real exception instead.
            raise RuntimeError(f"error, output_text: {chatgpt_output}") from e
        logger.info("Successfully modified text with emotional cues: %s", output_dict)
        return output_dict
class EffectGeneratorAsync(AbstractEffectGenerator):
    """Asynchronous effect generator that queries an OpenAI chat model.

    Same requests as the synchronous variant, issued through
    ``openai.AsyncOpenAI``. All public methods are coroutines and must be
    awaited.
    """

    def __init__(self, api_key: str, predict_duration: bool = True, model_type: str = 'gpt-4o'):
        """Create the async OpenAI client and select the prompts.

        Args:
            api_key: Key passed to ``openai.AsyncOpenAI``.
            predict_duration: Selects ``SOUND_EFFECT_GENERATION`` when true,
                otherwise ``SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION``.
            model_type: Model name sent with every chat completion request.
        """
        self.client = openai.AsyncOpenAI(api_key=api_key)
        self.sound_effect_prompt = SOUND_EFFECT_GENERATION if predict_duration else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
        self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
        self.model_type = model_type

    async def generate_text_for_sound_effect(self, text: str) -> dict:
        """Asynchronous version to generate sound effect description.

        Returns:
            The model's JSON reply parsed into a ``dict``.

        Raises:
            RuntimeError: On any failure (invalid JSON, HTTP error, or anything
                else). The original exception is chained as ``__cause__``.
        """
        try:
            completion = await self.client.chat.completions.create(
                model=self.model_type,
                messages=[
                    {"role": "system", "content": self.sound_effect_prompt},
                    {"role": "user", "content": text}
                ],
                response_format={"type": "json_object"}
            )
            # Extracting the output
            chatgpt_output = completion.choices[0].message.content
            # Parse and return JSON response
            output_dict = json.loads(chatgpt_output)
            logger.info("Successfully generated sound effect description: %s", output_dict)
            return output_dict
        except json.JSONDecodeError as e:
            # Only json.loads raises this, so chatgpt_output is always bound here.
            logger.error("Failed to parse the output text as JSON: %s", e)
            raise RuntimeError(f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}") from e
        except HTTPError as e:
            logger.error("HTTP error occurred: %s", e)
            raise RuntimeError(f"HTTP Error: {e}") from e
        except Exception as e:
            logger.error("Unexpected error occurred: %s", e)
            raise RuntimeError(f"Unexpected Error: {e}") from e

    async def generate_parameters_for_sound_effect(self, text: str, generated_audio_file: str = None) -> dict:
        """Return the sound-effect description, optionally with a measured duration.

        When ``generated_audio_file`` is given, ``duration_seconds`` in the
        result is overwritten with ``get_audio_duration(generated_audio_file)``.

        Raises:
            RuntimeError: Propagated from ``generate_text_for_sound_effect``.
        """
        llm_output = await self.generate_text_for_sound_effect(text)
        if generated_audio_file is not None:
            # NOTE(review): get_audio_duration is called synchronously inside a
            # coroutine; if it does blocking I/O it will stall the event loop - confirm.
            llm_output['duration_seconds'] = get_audio_duration(generated_audio_file)
            logger.info("Added duration_seconds to output based on generated audio file: %s", generated_audio_file)
        return llm_output

    async def add_emotion_to_text(self, text: str) -> dict:
        """Ask the model to rewrite ``text`` using the text-modification prompt.

        Returns:
            The model's JSON reply parsed into a ``dict``.

        Raises:
            RuntimeError: If the reply is not valid JSON. Errors from the API
                call itself are not wrapped and propagate unchanged.
        """
        completion = await self.client.chat.completions.create(
            model=self.model_type,
            messages=[{"role": "system", "content": self.text_modification_prompt},
                      {"role": "user", "content": text}],
            response_format={"type": "json_object"}
        )
        chatgpt_output = completion.choices[0].message.content
        try:
            output_dict = json.loads(chatgpt_output)
        except json.JSONDecodeError as e:
            logger.error("Error in parsing the modified text: %s", e)
            # Raising a bare string is itself a TypeError in Python 3 and would
            # hide the parse failure; raise a real exception instead.
            raise RuntimeError(f"error, output_text: {chatgpt_output}") from e
        logger.info("Successfully modified text with emotional cues: %s", output_dict)
        return output_dict
|