Spaces:

ales
/

ai-audio-books

Running

ai-audio-books / src / emotions / generation.py

navalnica

use openai api key instead of mlapi api key; format

d984557 4 months ago

7.34 kB

	import openai
	import json
	from requests import HTTPError
	from abc import ABC, abstractmethod

	from .prompts import (
	SOUND_EFFECT_GENERATION,
	SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION,
	TEXT_MODIFICATION,
	TEXT_MODIFICATION_WITH_SSML,
	)
	from .utils import get_audio_duration
	from src.config import logger, OPENAI_API_KEY


	class AbstractEffectGenerator(ABC):
	    """Interface for generators of sound-effect descriptions and emotional text.

	    Concrete subclasses must implement all three methods below; the class
	    itself cannot be instantiated.
	    """

	    @abstractmethod
	    def generate_text_for_sound_effect(self, text: str) -> dict:
	        """Produce a sound-effect description for ``text``.

	        Args:
	            text: Source text the sound effect should be derived from.

	        Returns:
	            A dict describing the sound effect.
	        """

	    @abstractmethod
	    def generate_parameters_for_sound_effect(
	        self, text: str, generated_audio_file: str = None
	    ) -> dict:
	        """Produce sound-effect parameters for ``text``.

	        Args:
	            text: Source text the sound effect should be derived from.
	            generated_audio_file: Optional reference to an already generated
	                audio file; ``None`` when no such file is available.

	        Returns:
	            A dict of sound-effect parameters.
	        """

	    @abstractmethod
	    def add_emotion_to_text(self, text: str) -> dict:
	        """Produce a version of ``text`` enriched with emotional cues.

	        Args:
	            text: Text to be modified.

	        Returns:
	            A dict holding the modified text.
	        """


	class EffectGenerator(AbstractEffectGenerator):
	    """Synchronous effect generator backed by the OpenAI chat-completions API.

	    Every LLM call sends a system prompt plus the user text and requests a
	    JSON object response, which is then parsed into a ``dict``.
	    """

	    def __init__(self, predict_duration: bool = True, model_type: str = "gpt-4o"):
	        """Create the OpenAI client and select the prompts to use.

	        Args:
	            predict_duration: When true, use ``SOUND_EFFECT_GENERATION`` as the
	                sound-effect system prompt; otherwise use
	                ``SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION``.
	            model_type: Model name passed to the chat-completions endpoint.
	        """
	        self.client = openai.OpenAI(api_key=OPENAI_API_KEY)
	        self.sound_effect_prompt = (
	            SOUND_EFFECT_GENERATION
	            if predict_duration
	            else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
	        )
	        self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
	        self.model_type = model_type
	        logger.info(
	            f"EffectGenerator initialized with model_type: {model_type}, predict_duration: {predict_duration}"
	        )

	    def generate_text_for_sound_effect(self, text: str) -> dict:
	        """Generate sound effect description and parameters based on input text.

	        Args:
	            text: User text sent to the model alongside the sound-effect prompt.

	        Returns:
	            The model's JSON response parsed into a dict.

	        Raises:
	            RuntimeError: If the API call fails, the response is not valid
	                JSON, or any other error occurs. The original exception is
	                attached as ``__cause__``.
	        """
	        # Bound up front so the JSON error message can always reference it.
	        chatgpt_output = None
	        try:
	            completion = self.client.chat.completions.create(
	                model=self.model_type,
	                messages=[
	                    {"role": "system", "content": self.sound_effect_prompt},
	                    {"role": "user", "content": text},
	                ],
	                response_format={"type": "json_object"},
	            )
	            # Extracting the output
	            chatgpt_output = completion.choices[0].message.content

	            # Parse and return JSON response
	            output_dict = json.loads(chatgpt_output)
	            logger.info(
	                "Successfully generated sound effect description: %s", output_dict
	            )
	            return output_dict

	        except json.JSONDecodeError as e:
	            logger.error("Failed to parse the output text as JSON: %s", e)
	            raise RuntimeError(
	                f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}"
	            ) from e

	        # The OpenAI client raises its own openai.APIError family rather than
	        # requests.HTTPError, so both are treated as transport/API failures.
	        except (HTTPError, openai.APIError) as e:
	            logger.error("HTTP error occurred: %s", e)
	            raise RuntimeError(f"HTTP Error: {e}") from e

	        except Exception as e:
	            logger.error("Unexpected error occurred: %s", e)
	            raise RuntimeError(f"Unexpected Error: {e}") from e

	    def generate_parameters_for_sound_effect(
	        self, text: str, generated_audio_file: str = None
	    ) -> dict:
	        """Generate sound-effect parameters, optionally overriding the duration.

	        Args:
	            text: User text forwarded to ``generate_text_for_sound_effect``.
	            generated_audio_file: Optional audio file reference. When given,
	                ``duration_seconds`` in the result is set to
	                ``get_audio_duration(generated_audio_file)``.

	        Returns:
	            The dict returned by ``generate_text_for_sound_effect``, with
	            ``duration_seconds`` replaced when an audio file was supplied.

	        Raises:
	            RuntimeError: Propagated from ``generate_text_for_sound_effect``.
	        """
	        llm_output = self.generate_text_for_sound_effect(text)
	        if generated_audio_file is not None:
	            llm_output["duration_seconds"] = get_audio_duration(generated_audio_file)
	            logger.info(
	                "Added duration_seconds to output based on generated audio file: %s",
	                generated_audio_file,
	            )
	        return llm_output

	    def add_emotion_to_text(self, text: str) -> dict:
	        """Ask the model to rewrite ``text`` using the text-modification prompt.

	        Args:
	            text: User text sent to the model.

	        Returns:
	            The model's JSON response parsed into a dict.

	        Raises:
	            RuntimeError: If the model output cannot be parsed as JSON
	                (including the case where the message content is ``None``).
	        """
	        completion = self.client.chat.completions.create(
	            model=self.model_type,
	            messages=[
	                {"role": "system", "content": self.text_modification_prompt},
	                {"role": "user", "content": text},
	            ],
	            response_format={"type": "json_object"},
	        )
	        chatgpt_output = completion.choices[0].message.content
	        try:
	            output_dict = json.loads(chatgpt_output)
	        except (json.JSONDecodeError, TypeError) as e:
	            # json.loads raises TypeError when the content is None.
	            logger.error("Error in parsing the modified text: %s", e)
	            # Raising a plain string is itself a TypeError in Python 3, so a
	            # real exception type is required to surface this message.
	            raise RuntimeError(f"error, output_text: {chatgpt_output}") from e
	        logger.info(
	            "Successfully modified text with emotional cues: %s", output_dict
	        )
	        return output_dict


	class EffectGeneratorAsync(AbstractEffectGenerator):
	    """Asynchronous effect generator backed by the OpenAI chat-completions API.

	    All three interface methods are coroutines and must be awaited. Every LLM
	    call sends a system prompt plus the user text and requests a JSON object
	    response, which is then parsed into a ``dict``.
	    """

	    def __init__(self, predict_duration: bool = True, model_type: str = "gpt-4o"):
	        """Create the async OpenAI client and select the prompts to use.

	        Args:
	            predict_duration: When true, use ``SOUND_EFFECT_GENERATION`` as the
	                sound-effect system prompt; otherwise use
	                ``SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION``.
	            model_type: Model name passed to the chat-completions endpoint.
	        """
	        self.client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)
	        self.sound_effect_prompt = (
	            SOUND_EFFECT_GENERATION
	            if predict_duration
	            else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
	        )
	        self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
	        self.model_type = model_type
	        # Mirrors the initialization log emitted by the synchronous generator.
	        logger.info(
	            "EffectGeneratorAsync initialized with model_type: %s, predict_duration: %s",
	            model_type,
	            predict_duration,
	        )

	    async def generate_text_for_sound_effect(self, text: str) -> dict:
	        """Asynchronous version to generate sound effect description.

	        Args:
	            text: User text sent to the model alongside the sound-effect prompt.

	        Returns:
	            The model's JSON response parsed into a dict.

	        Raises:
	            RuntimeError: If the API call fails, the response is not valid
	                JSON, or any other error occurs. The original exception is
	                attached as ``__cause__``.
	        """
	        # Bound up front so the JSON error message can always reference it.
	        chatgpt_output = None
	        try:
	            completion = await self.client.chat.completions.create(
	                model=self.model_type,
	                messages=[
	                    {"role": "system", "content": self.sound_effect_prompt},
	                    {"role": "user", "content": text},
	                ],
	                response_format={"type": "json_object"},
	            )
	            # Extracting the output
	            chatgpt_output = completion.choices[0].message.content

	            # Parse and return JSON response
	            output_dict = json.loads(chatgpt_output)
	            logger.info(
	                "Successfully generated sound effect description: %s", output_dict
	            )
	            return output_dict

	        except json.JSONDecodeError as e:
	            logger.error("Failed to parse the output text as JSON: %s", e)
	            raise RuntimeError(
	                f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}"
	            ) from e

	        # The OpenAI client raises its own openai.APIError family rather than
	        # requests.HTTPError, so both are treated as transport/API failures.
	        except (HTTPError, openai.APIError) as e:
	            logger.error("HTTP error occurred: %s", e)
	            raise RuntimeError(f"HTTP Error: {e}") from e

	        except Exception as e:
	            logger.error("Unexpected error occurred: %s", e)
	            raise RuntimeError(f"Unexpected Error: {e}") from e

	    async def generate_parameters_for_sound_effect(
	        self, text: str, generated_audio_file: str = None
	    ) -> dict:
	        """Generate sound-effect parameters, optionally overriding the duration.

	        Args:
	            text: User text forwarded to ``generate_text_for_sound_effect``.
	            generated_audio_file: Optional audio file reference. When given,
	                ``duration_seconds`` in the result is set to
	                ``get_audio_duration(generated_audio_file)``.

	        Returns:
	            The dict returned by ``generate_text_for_sound_effect``, with
	            ``duration_seconds`` replaced when an audio file was supplied.

	        Raises:
	            RuntimeError: Propagated from ``generate_text_for_sound_effect``.
	        """
	        llm_output = await self.generate_text_for_sound_effect(text)
	        if generated_audio_file is not None:
	            llm_output["duration_seconds"] = get_audio_duration(generated_audio_file)
	            logger.info(
	                "Added duration_seconds to output based on generated audio file: %s",
	                generated_audio_file,
	            )
	        return llm_output

	    async def add_emotion_to_text(self, text: str) -> dict:
	        """Ask the model to rewrite ``text`` using the text-modification prompt.

	        Args:
	            text: User text sent to the model.

	        Returns:
	            The model's JSON response parsed into a dict.

	        Raises:
	            RuntimeError: If the model output cannot be parsed as JSON
	                (including the case where the message content is ``None``).
	        """
	        completion = await self.client.chat.completions.create(
	            model=self.model_type,
	            messages=[
	                {"role": "system", "content": self.text_modification_prompt},
	                {"role": "user", "content": text},
	            ],
	            response_format={"type": "json_object"},
	        )
	        chatgpt_output = completion.choices[0].message.content
	        try:
	            output_dict = json.loads(chatgpt_output)
	        except (json.JSONDecodeError, TypeError) as e:
	            # json.loads raises TypeError when the content is None.
	            logger.error("Error in parsing the modified text: %s", e)
	            # Raising a plain string is itself a TypeError in Python 3, so a
	            # real exception type is required to surface this message.
	            raise RuntimeError(f"error, output_text: {chatgpt_output}") from e
	        logger.info(
	            "Successfully modified text with emotional cues: %s", output_dict
	        )
	        return output_dict