navalnica committed · Commit d984557 · 1 Parent(s): 655c692

use openai api key instead of mlapi api key; format

Files changed:
- .env.template +0 -1
- src/audio_generators.py +17 -8
- src/config.py +1 -2
- src/emotions/generation.py +77 -35
- src/emotions/prompts.py +1 -1
- src/emotions/utils.py +19 -14
.env.template CHANGED
@@ -1,5 +1,4 @@
 OPENAI_API_KEY="..."
 ELEVEN_LABS_API_KEY="..."
-AIML_API_KEY="..."
 AUTH_USERS="admin,community_user"
 AUTH_PASS="..."
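The template now carries only the four variables the Space still reads. A minimal sketch of how they are typically consumed, assuming python-dotenv; the repo's src/config.py reads os.environ directly, so any mechanism that populates the process environment works:

# Hypothetical startup snippet: load .env into the process environment and
# read the same keys src/config.py expects. python-dotenv is an assumption.
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]  # raises KeyError if missing
ELEVEN_LABS_API_KEY = os.environ["ELEVEN_LABS_API_KEY"]
AUTH_USERS = os.environ["AUTH_USERS"].split(",")  # ["admin", "community_user"]
AUTH_PASS = os.environ["AUTH_PASS"]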
src/audio_generators.py CHANGED
@@ -13,7 +13,7 @@ from src.tts import tts_astream, sound_generation_astream
 from src.utils import consume_aiter
 from src.emotions.generation import EffectGeneratorAsync
 from src.emotions.utils import add_overlay_for_audio
-from src.config import
+from src.config import ELEVENLABS_MAX_PARALLEL, logger
 from src.text_split_chain import SplitTextOutput
 
 
@@ -58,7 +58,7 @@ class AudioGeneratorSimple:
 class AudioGeneratorWithEffects:
 
     def __init__(self):
-        self.effect_generator = EffectGeneratorAsync(
+        self.effect_generator = EffectGeneratorAsync()
         self.semaphore = asyncio.Semaphore(ELEVENLABS_MAX_PARALLEL)
         self.temp_files = []
 
@@ -87,7 +87,9 @@ class AudioGeneratorWithEffects:
         )
 
         # Step 4: Merge audio files
-        normalized_audio_chunks = self._normalize_audio_chunks(
+        normalized_audio_chunks = self._normalize_audio_chunks(
+            audio_chunks, self.temp_files
+        )
         final_output = self._merge_audio_files(normalized_audio_chunks)
 
         # Clean up temporary files
@@ -184,11 +186,14 @@ class AudioGeneratorWithEffects:
         for idx, tts_filename in enumerate(tts_audio_files):
             # If the line has sound emotion data, generate sound effect and overlay
             if idx in lines_for_sound_effect:
-
+                # Get next sound effect data
+                sound_effect_data = sound_emotion_results.pop(0)
                 sound_effect_filename = f"sound_effect_{idx}.wav"
 
                 # Generate sound effect asynchronously
-                sound_result = await consume_aiter(
+                sound_result = await consume_aiter(
+                    sound_generation_astream(sound_effect_data)
+                )
                 with open(sound_effect_filename, "wb") as ab:
                     for chunk in sound_result:
                         ab.write(chunk)
@@ -208,12 +213,16 @@ class AudioGeneratorWithEffects:
 
         return audio_chunks
 
-    def _normalize_audio(
+    def _normalize_audio(
+        self, audio_segment: AudioSegment, target_dBFS: float = -20.0
+    ) -> AudioSegment:
         """Normalize an audio segment to the target dBFS level."""
         change_in_dBFS = target_dBFS - audio_segment.dBFS
         return audio_segment.apply_gain(change_in_dBFS)
 
-    def _normalize_audio_chunks(
+    def _normalize_audio_chunks(
+        self, audio_filenames: list[str], temp_files, target_dBFS: float = -20.0
+    ) -> list[str]:
         """Normalize all audio chunks to the target volume level."""
         normalized_files = []
         for audio_file in audio_filenames:
@@ -246,4 +255,4 @@ class AudioGeneratorWithEffects:
             try:
                 os.remove(temp_file)
             except FileNotFoundError:
-                continue
+                continue
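The new _normalize_audio / _normalize_audio_chunks helpers are plain pydub gain matching: shift each chunk so its loudness (dBFS) lands on a common target before merging. A standalone sketch of the same idea; the file names are placeholders:

# Sketch of the dBFS normalization the new helpers perform.
# AudioSegment.dBFS is the segment's loudness relative to full scale;
# apply_gain shifts it to the target.
from pydub import AudioSegment


def normalize(segment: AudioSegment, target_dBFS: float = -20.0) -> AudioSegment:
    return segment.apply_gain(target_dBFS - segment.dBFS)


chunk = AudioSegment.from_file("tts_chunk_0.wav")  # placeholder file name
normalized = normalize(chunk)
normalized.export("tts_chunk_0_normalized.wav", format="wav")
print(round(normalized.dBFS, 1))  # ~ -20.0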
src/config.py CHANGED
@@ -7,9 +7,8 @@ logging.basicConfig(
 )
 logger = logging.getLogger("audio-books")
 
-
+OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
 ELEVENLABS_API_KEY = os.environ["ELEVEN_LABS_API_KEY"]
-AI_ML_API_KEY = os.environ["AIML_API_KEY"]
 
 FILE_SIZE_MAX = 0.5  # in mb
 
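The replacement key keeps the config module's fail-fast pattern: os.environ[...] raises KeyError at import time when a variable is unset, instead of deferring the failure to the first API call. A small sketch of the trade-off; the SystemExit wrapper is illustrative, not from the repo:

# os.environ[...] fails at import time; os.getenv() defers the failure.
import os

try:
    OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]  # KeyError if unset
except KeyError as e:
    raise SystemExit(f"Missing required environment variable: {e}") from None

# A silent default would only surface much later, as an authentication error:
maybe_key = os.getenv("OPENAI_API_KEY")  # None if unset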
src/emotions/generation.py CHANGED
@@ -3,31 +3,45 @@ import json
 from requests import HTTPError
 from abc import ABC, abstractmethod
 
-from .prompts import
+from .prompts import (
+    SOUND_EFFECT_GENERATION,
+    SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION,
+    TEXT_MODIFICATION,
+    TEXT_MODIFICATION_WITH_SSML,
+)
 from .utils import get_audio_duration
-from src.config import logger
+from src.config import logger, OPENAI_API_KEY
 
 
 class AbstractEffectGenerator(ABC):
     @abstractmethod
-    def generate_text_for_sound_effect(self, text)-> dict:
+    def generate_text_for_sound_effect(self, text) -> dict:
         pass
 
     @abstractmethod
-    def generate_parameters_for_sound_effect(
+    def generate_parameters_for_sound_effect(
+        self, text: str, generated_audio_file: str
+    ) -> dict:
         pass
 
     @abstractmethod
     def add_emotion_to_text(self, text: str) -> dict:
         pass
 
+
 class EffectGenerator(AbstractEffectGenerator):
-    def __init__(self,
-        self.client = openai.OpenAI(api_key=
-        self.sound_effect_prompt =
+    def __init__(self, predict_duration: bool = True, model_type: str = "gpt-4o"):
+        self.client = openai.OpenAI(api_key=OPENAI_API_KEY)
+        self.sound_effect_prompt = (
+            SOUND_EFFECT_GENERATION
+            if predict_duration
+            else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
+        )
         self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
         self.model_type = model_type
+        logger.info(
+            f"EffectGenerator initialized with model_type: {model_type}, predict_duration: {predict_duration}"
+        )
 
     def generate_text_for_sound_effect(self, text: str) -> dict:
         """Generate sound effect description and parameters based on input text."""
@@ -36,21 +50,25 @@ class EffectGenerator(AbstractEffectGenerator):
                 model=self.model_type,
                 messages=[
                     {"role": "system", "content": self.sound_effect_prompt},
-                    {"role": "user", "content": text}
+                    {"role": "user", "content": text},
                 ],
-                response_format={"type": "json_object"}
+                response_format={"type": "json_object"},
             )
             # Extracting the output
             chatgpt_output = completion.choices[0].message.content
 
             # Parse and return JSON response
             output_dict = json.loads(chatgpt_output)
-            logger.info(
+            logger.info(
+                "Successfully generated sound effect description: %s", output_dict
+            )
             return output_dict
 
         except json.JSONDecodeError as e:
             logger.error("Failed to parse the output text as JSON: %s", e)
-            raise RuntimeError(
+            raise RuntimeError(
+                f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}"
+            )
 
         except HTTPError as e:
             logger.error("HTTP error occurred: %s", e)
@@ -60,24 +78,33 @@ class EffectGenerator(AbstractEffectGenerator):
             logger.error("Unexpected error occurred: %s", e)
             raise RuntimeError(f"Unexpected Error: {e}")
 
-    def generate_parameters_for_sound_effect(
+    def generate_parameters_for_sound_effect(
+        self, text: str, generated_audio_file: str = None
+    ) -> dict:
         llm_output = self.generate_text_for_sound_effect(text)
         if generated_audio_file is not None:
-            llm_output[
-            logger.info(
+            llm_output["duration_seconds"] = get_audio_duration(generated_audio_file)
+            logger.info(
+                "Added duration_seconds to output based on generated audio file: %s",
+                generated_audio_file,
+            )
         return llm_output
 
     def add_emotion_to_text(self, text: str) -> dict:
         completion = self.client.chat.completions.create(
             model=self.model_type,
-            messages=[
-
-
+            messages=[
+                {"role": "system", "content": self.text_modification_prompt},
+                {"role": "user", "content": text},
+            ],
+            response_format={"type": "json_object"},
         )
         chatgpt_output = completion.choices[0].message.content
         try:
             output_dict = json.loads(chatgpt_output)
-            logger.info(
+            logger.info(
+                "Successfully modified text with emotional cues: %s", output_dict
+            )
             return output_dict
         except json.JSONDecodeError as e:
             logger.error("Error in parsing the modified text: %s", e)
@@ -85,9 +112,13 @@ class EffectGenerator(AbstractEffectGenerator):
 
 
 class EffectGeneratorAsync(AbstractEffectGenerator):
-    def __init__(self,
-        self.client = openai.AsyncOpenAI(api_key=
-        self.sound_effect_prompt =
+    def __init__(self, predict_duration: bool = True, model_type: str = "gpt-4o"):
+        self.client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)
+        self.sound_effect_prompt = (
+            SOUND_EFFECT_GENERATION
+            if predict_duration
+            else SOUND_EFFECT_GENERATION_WITHOUT_DURATION_PREDICTION
+        )
         self.text_modification_prompt = TEXT_MODIFICATION_WITH_SSML
         self.model_type = model_type
 
@@ -98,21 +129,25 @@ class EffectGeneratorAsync(AbstractEffectGenerator):
                 model=self.model_type,
                 messages=[
                     {"role": "system", "content": self.sound_effect_prompt},
-                    {"role": "user", "content": text}
+                    {"role": "user", "content": text},
                 ],
-                response_format={"type": "json_object"}
+                response_format={"type": "json_object"},
             )
             # Extracting the output
             chatgpt_output = completion.choices[0].message.content
 
             # Parse and return JSON response
             output_dict = json.loads(chatgpt_output)
-            logger.info(
+            logger.info(
+                "Successfully generated sound effect description: %s", output_dict
+            )
             return output_dict
 
         except json.JSONDecodeError as e:
             logger.error("Failed to parse the output text as JSON: %s", e)
-            raise RuntimeError(
+            raise RuntimeError(
+                f"Error: Failed to parse the output text as JSON.\nOutput: {chatgpt_output}"
+            )
 
         except HTTPError as e:
             logger.error("HTTP error occurred: %s", e)
@@ -122,27 +157,34 @@ class EffectGeneratorAsync(AbstractEffectGenerator):
             logger.error("Unexpected error occurred: %s", e)
             raise RuntimeError(f"Unexpected Error: {e}")
 
-
-
+    async def generate_parameters_for_sound_effect(
+        self, text: str, generated_audio_file: str = None
+    ) -> dict:
         llm_output = await self.generate_text_for_sound_effect(text)
         if generated_audio_file is not None:
-            llm_output[
-            logger.info(
+            llm_output["duration_seconds"] = get_audio_duration(generated_audio_file)
+            logger.info(
+                "Added duration_seconds to output based on generated audio file: %s",
+                generated_audio_file,
+            )
         return llm_output
 
     async def add_emotion_to_text(self, text: str) -> dict:
         completion = await self.client.chat.completions.create(
             model=self.model_type,
-            messages=[
-
-
+            messages=[
+                {"role": "system", "content": self.text_modification_prompt},
+                {"role": "user", "content": text},
+            ],
+            response_format={"type": "json_object"},
        )
         chatgpt_output = completion.choices[0].message.content
         try:
             output_dict = json.loads(chatgpt_output)
-            logger.info(
+            logger.info(
+                "Successfully modified text with emotional cues: %s", output_dict
+            )
             return output_dict
         except json.JSONDecodeError as e:
             logger.error("Error in parsing the modified text: %s", e)
             raise f"error, output_text: {chatgpt_output}"
-
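Both generator classes lean on OpenAI's JSON mode: response_format={"type": "json_object"} constrains the reply so the json.loads call normally succeeds. A minimal sketch of the pattern, assuming the openai v1 client; prompts and key are placeholders. One bug the commit leaves in place: the async add_emotion_to_text still ends with raise f"error, output_text: {chatgpt_output}", which is itself a TypeError in Python 3 (only exception instances can be raised); the RuntimeError form used elsewhere, and in the handler below, is the correct one.

# Minimal sketch of the JSON-mode call pattern both generators use,
# assuming the openai v1 client; prompt text and key are placeholders.
import json

import openai

client = openai.OpenAI(api_key="...")

completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "Describe a sound effect as a JSON object."},
        {"role": "user", "content": "The old door creaked open."},
    ],
    response_format={"type": "json_object"},  # constrains the reply to valid JSON
)

try:
    output = json.loads(completion.choices[0].message.content)
except json.JSONDecodeError as e:
    # Raising a plain f-string is a TypeError in Python 3;
    # wrap the message in an exception instead.
    raise RuntimeError(f"Model did not return valid JSON: {e}") from e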
src/emotions/prompts.py CHANGED
@@ -154,4 +154,4 @@ with higher values indicating more emphasis on the voice similarity.
 The "style" parameter should also range from 0 to 1,
 where lower values indicate a neutral tone and higher values reflect more stylized or emotional delivery.
 Adjust both according to the emotional intensity of the text.
-"""
+"""
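The prompt's closing instructions ask the model for voice-similarity and "style" values in [0, 1]; presumably these feed ElevenLabs voice settings downstream. A hypothetical consumer, assuming the elevenlabs SDK's VoiceSettings; llm_params is a made-up example of what the model might return:

# Hypothetical consumer of the prompt's output: map the suggested values
# onto ElevenLabs voice settings. Field names follow the elevenlabs SDK.
from elevenlabs import VoiceSettings

llm_params = {"similarity_boost": 0.8, "style": 0.6}  # example LLM output

settings = VoiceSettings(
    stability=0.5,  # not governed by this prompt
    similarity_boost=llm_params["similarity_boost"],  # 0..1, voice similarity
    style=llm_params["style"],  # 0..1, neutral -> stylized delivery
)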
src/emotions/utils.py CHANGED
@@ -18,11 +18,13 @@ def get_audio_duration(filepath: str) -> float:
     return round(duration_in_seconds, 1)
 
 
-def add_overlay_for_audio(main_audio_filename: str,
-
-
-
-
+def add_overlay_for_audio(
+    main_audio_filename: str,
+    sound_effect_filename: str,
+    output_filename: str = None,
+    cycling_effect: bool = True,
+    decrease_effect_volume: int = 0,
+) -> str:
     try:
         main_audio = AudioSegment.from_file(main_audio_filename)
         effect_audio = AudioSegment.from_file(sound_effect_filename)
@@ -33,14 +35,16 @@ def add_overlay_for_audio(main_audio_filename: str,
         while len(effect_audio) < len(main_audio):
             effect_audio += effect_audio
 
-        effect_audio = effect_audio[:len(main_audio)]
+        effect_audio = effect_audio[: len(main_audio)]
 
         if decrease_effect_volume > 0:
             effect_audio = effect_audio - decrease_effect_volume
         combined_audio = main_audio.overlay(effect_audio)
 
         if output_filename is None:
-            output_filename =
+            output_filename = (
+                f"{Path(main_audio_filename).stem}_{Path(sound_effect_filename).stem}.wav"
+            )
         combined_audio.export(output_filename, format="wav")
         return output_filename
 
@@ -50,21 +54,22 @@ def sound_generation(sound_generation_data: dict, output_file: str):
         api_key="YOUR_API_KEY",
     )
     audio = client.text_to_sound_effects.convert(
-        text=sound_generation_data[
-        duration_seconds=sound_generation_data[
-        prompt_influence=sound_generation_data[
+        text=sound_generation_data["text"],
+        duration_seconds=sound_generation_data["duration_seconds"],
+        prompt_influence=sound_generation_data["prompt_influence"],
     )
     save(audio, output_file)
     logger.error("Successfully generated sound effect to file: %s", output_file)
 
+
 async def sound_generation_async(sound_generation_data: dict, output_file: str):
     client = AsyncElevenLabs(
         api_key="YOUR_API_KEY",
     )
     audio = await client.text_to_sound_effects.convert(
-        text=sound_generation_data[
-        duration_seconds=sound_generation_data[
-        prompt_influence=sound_generation_data[
+        text=sound_generation_data["text"],
+        duration_seconds=sound_generation_data["duration_seconds"],
+        prompt_influence=sound_generation_data["prompt_influence"],
    )
     save(audio, output_file)
-    logger.error("Successfully generated sound effect to file: %s", output_file)
+    logger.error("Successfully generated sound effect to file: %s", output_file)
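A usage sketch for the reworked overlay helper; file names are placeholders. With no output_filename it derives "<main-stem>_<effect-stem>.wav", loops the effect to cover the speech, and ducks it by the requested dB. Worth noting, though the commit leaves it as-is: both sound_generation variants report success through logger.error; logger.info looks like the intended level.

# Usage sketch for add_overlay_for_audio (placeholder file names).
from src.emotions.utils import add_overlay_for_audio

mixed = add_overlay_for_audio(
    main_audio_filename="speech_0.wav",
    sound_effect_filename="rain.wav",
    decrease_effect_volume=6,  # duck the effect 6 dB under the speech
)
print(mixed)  # speech_0_rain.wav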