debug long sounds
Browse files- README.md +0 -35
- api.py +108 -45
- audiocraft/builders.py +4 -3
- audiocraft/lm.py +11 -6
- demo.py +5 -5
- live_api.py +5 -2
README.md
CHANGED
@@ -65,41 +65,6 @@ The following needs `api.py` to be already running on a tmux session.
|
|
65 |
python landscape2soundscape.py
|
66 |
```
|
67 |
|
68 |
-
# Videos / Examples
|
69 |
-
|
70 |
-
Native voice Replaced with English TTS Voice
|
71 |
-
|
72 |
-
|
73 |
-
[![Same video w. Native voice replaced with English TTS](assets/tts_video_thumb.png)](https://www.youtube.com/watch?v=geI1Vqn4QpY)
|
74 |
-
|
75 |
-
```python
|
76 |
-
# https://github.com/audeering/shift
|
77 |
-
CUDA_VISIBLE_DEVICES=7 python tts.py --text ANBPR/BUNICA\ NESTIUTA\ Adriana\ Andrei\ \[tmo2UbKYAqc\].en.srt --video ANBPR/BUNICA\ NESTIUTA\ Adriana\ Andrei\ \[tmo2UbKYAqc\].webm
|
78 |
-
```
|
79 |
-
|
80 |
-
## Joint Application of D3.1 & D3.2
|
81 |
-
|
82 |
-
<a href="https://youtu.be/wWC8DpOKVvQ" rel="Subtitles to Video">![Foo4](assets/caption_to_video_thumb.png)</a>
|
83 |
-
|
84 |
-
|
85 |
-
From an image and text create a video:
|
86 |
-
|
87 |
-
```python
|
88 |
-
|
89 |
-
python tts.py --text sample.txt --image assets/image_from_T31.jpg
|
90 |
-
```
|
91 |
-
|
92 |
-
## Landscape 2 Soundscape
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
```python
|
99 |
-
# Loads image & text & sound-scene text and creates .mp4
|
100 |
-
python landscape2soundscape.py
|
101 |
-
```
|
102 |
-
|
103 |
For SHIFT demo / Collaboration with [SMB](https://www.smb.museum/home/)
|
104 |
- YouTube Videos
|
105 |
|
|
|
65 |
python landscape2soundscape.py
|
66 |
```
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
For SHIFT demo / Collaboration with [SMB](https://www.smb.museum/home/)
|
69 |
- YouTube Videos
|
70 |
|
api.py
CHANGED
@@ -16,12 +16,13 @@ from types import SimpleNamespace
|
|
16 |
from flask import Flask, request, send_from_directory
|
17 |
from flask_cors import CORS
|
18 |
from moviepy.editor import *
|
19 |
-
from audiocraft.
|
20 |
CACHE_DIR = 'flask_cache/'
|
21 |
-
|
22 |
-
|
23 |
-
sound_generator
|
24 |
-
|
|
|
25 |
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
|
26 |
|
27 |
import nltk
|
@@ -32,7 +33,43 @@ nltk.download('punkt')
|
|
32 |
# ssh-add ~/.ssh/id_ed25519_github2024
|
33 |
#
|
34 |
# git remote set-url origin [email protected]:audeering/shift
|
35 |
-
# ==
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
def _shift(x):
|
38 |
n = x.shape[0]
|
@@ -45,43 +82,37 @@ def _shift(x):
|
|
45 |
return x #* fade_in # silence this
|
46 |
|
47 |
def overlay(x, scene=None):
|
48 |
-
|
49 |
if scene is not None:
|
50 |
|
51 |
-
#
|
52 |
-
print('
|
53 |
-
|
54 |
-
[scene]
|
55 |
-
|
56 |
|
57 |
-
#
|
58 |
|
59 |
-
# upsample to 24kHZ of StyleTTS
|
60 |
print('Resampling')
|
61 |
-
back = [audresample.resample(i,
|
62 |
-
original_rate=sound_generator.sample_rate, # 16000
|
63 |
-
target_rate=24000
|
64 |
-
)[0, :] for i in back]
|
65 |
-
print('Cloning backgrounds')
|
66 |
-
# clone/elongate by 4x
|
67 |
-
back = [(_shift(np.concatenate([_shift(single_gen)] * 4))) for single_gen in back]
|
68 |
|
69 |
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
|
|
|
|
74 |
|
75 |
-
#
|
76 |
-
n_repeat = len(x) //
|
77 |
|
78 |
-
#
|
79 |
print(f'Additional Repeat {n_repeat=}')
|
80 |
-
|
81 |
-
|
82 |
-
print(f'\n====SOUND BACKGROUND SHAPE\n{
|
83 |
-
f'{np.abs(
|
84 |
-
x = .1 * x + .9 *
|
85 |
else:
|
86 |
print('sound_background = None')
|
87 |
return x
|
@@ -113,6 +144,8 @@ def tts_multi_sentence(precomputed_style_vector=None,
|
|
113 |
embedding_scale=1))
|
114 |
x = np.concatenate(x)
|
115 |
|
|
|
|
|
116 |
return overlay(x, scene=scene)
|
117 |
|
118 |
# Fallback - Mimic-3
|
@@ -122,7 +155,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
|
|
122 |
x, fs = soundfile.read('_tmp.wav')
|
123 |
x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :] # reshapes (64,) -> (1,64)
|
124 |
|
125 |
-
return overlay(x,
|
126 |
|
127 |
|
128 |
|
@@ -229,7 +262,7 @@ def serve_wav():
|
|
229 |
# ====SILENT VIDEO====
|
230 |
|
231 |
if args.video is not None:
|
232 |
-
# banner
|
233 |
frame_tts = np.zeros((104, 1920, 3), dtype=np.uint8)
|
234 |
font = cv2.FONT_HERSHEY_SIMPLEX
|
235 |
bottomLeftCornerOfText = (240, 74) # w,h
|
@@ -260,6 +293,10 @@ def serve_wav():
|
|
260 |
fontColor,
|
261 |
thickness,
|
262 |
lineType)
|
|
|
|
|
|
|
|
|
263 |
# ====SILENT VIDEO EXTRACT====
|
264 |
# DOWNLOAD SRT from YouTube
|
265 |
#
|
@@ -274,39 +311,65 @@ def serve_wav():
|
|
274 |
#
|
275 |
video_file = args.video
|
276 |
vf = VideoFileClip(video_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
277 |
try:
|
278 |
-
|
|
|
279 |
num = x_native.shape[0]
|
280 |
is_tts = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, num) + 9.4)) # fade heaviside
|
281 |
-
|
282 |
def inpaint_banner(get_frame, t):
|
283 |
'''blend banner - (now plays) tts or native voic
|
284 |
'''
|
285 |
-
|
|
|
|
|
286 |
|
287 |
ix = int(t * 24000)
|
288 |
|
289 |
-
if is_tts[ix] > .5:
|
290 |
-
frame = frame_tts
|
|
|
|
|
291 |
else:
|
292 |
frame = frame_orig
|
293 |
-
|
294 |
# im[-h:, -w:, :] = (.4 * im[-h:, -w:, :] + .6 * frame_orig).astype(np.uint8)
|
|
|
|
|
|
|
295 |
offset_h = 24
|
|
|
|
|
|
|
|
|
|
|
|
|
296 |
im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h + offset_h, :w, :]
|
297 |
+ .6 * frame).astype(np.uint8)
|
298 |
-
|
299 |
# im2 = np.concatenate([im, frame_tts], 0)
|
300 |
# cv2.imshow('t', im2); cv2.waitKey(); cv2.destroyAllWindows()
|
301 |
return im # np.concatenate([im, frane_ttts], 0)
|
|
|
302 |
except UnboundLocalError: # args.native == False
|
|
|
303 |
def inpaint_banner(get_frame, t):
|
|
|
304 |
im = np.copy(get_frame(t))
|
305 |
-
|
306 |
-
h, w, _ =
|
|
|
|
|
307 |
offset_h = 24
|
308 |
im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h+offset_h, :w, :]
|
309 |
-
+ .6 *
|
310 |
return im
|
311 |
vf = vf.fl(inpaint_banner)
|
312 |
vf.write_videofile(SILENT_VIDEO)
|
|
|
16 |
from flask import Flask, request, send_from_directory
|
17 |
from flask_cors import CORS
|
18 |
from moviepy.editor import *
|
19 |
+
from audiocraft.builders import AudioGen
|
20 |
CACHE_DIR = 'flask_cache/'
|
21 |
+
NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same scene for long video)
|
22 |
+
|
23 |
+
sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
|
24 |
+
|
25 |
+
|
26 |
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
|
27 |
|
28 |
import nltk
|
|
|
33 |
# ssh-add ~/.ssh/id_ed25519_github2024
|
34 |
#
|
35 |
# git remote set-url origin [email protected]:audeering/shift
|
36 |
+
# ==
|
37 |
+
|
38 |
+
|
39 |
+
|
40 |
+
def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
    '''Resize `image` to a target width or height, keeping its aspect ratio.

    Adapted from
    https://github.com/PyImageSearch/imutils/blob/master/imutils/convenience.py

    Args:
        image: array whose first two axes are (rows, cols).
        width: target width in pixels; when given, `height` is ignored.
        height: target height in pixels; used only when `width` is None.
        inter: interpolation flag forwarded to `cv2.resize`.

    Returns:
        `image` itself when neither `width` nor `height` is given,
        otherwise the output of `cv2.resize`.
    '''
    src_h, src_w = image.shape[:2]

    # nothing requested -> hand back the input untouched
    if width is None and height is None:
        return image

    if width is not None:
        # scale factor derived from the requested width
        scale = width / float(src_w)
        target = (width, int(src_h * scale))
    else:
        # scale factor derived from the requested height
        scale = height / float(src_h)
        target = (int(src_w * scale), height)

    # cv2 expects the size as (width, height)
    return cv2.resize(image, target, interpolation=inter)
|
71 |
+
|
72 |
+
|
73 |
|
74 |
def _shift(x):
|
75 |
n = x.shape[0]
|
|
|
82 |
return x #* fade_in # silence this
|
83 |
|
84 |
def overlay(x, scene=None):
    '''Mix a generated background soundscape under the signal `x`.

    Args:
        x: 1-D array of samples (the TTS output); the background is cut to
            `len(x)` before mixing.
        scene: text prompt for the module-level `sound_generator`; when None
            no background is generated.

    Returns:
        `x` itself when `scene` is None, otherwise the new array
        `.1 * x + .9 * background`.
    '''
    if scene is not None:

        # SOUNDS - generate NUM_SOUND_GENERATIONS clips of the same prompt and
        # flatten them into a single 1-D waveform
        print(f'AudioGen {NUM_SOUND_GENERATIONS} x {scene}')
        background = sound_generator.generate(
            [scene] * NUM_SOUND_GENERATIONS
        ).reshape(-1).detach().cpu().numpy()

        # upsample 16 kHz AudioGen to 24 kHz StyleTTS
        print('Resampling')
        background = audresample.resample(
            background,
            original_rate=16000,  # sound_generator.sample_rate,
            target_rate=24000)[0, :]  # resample() returns 2-D, keep row 0

        # amplitude normalisation is expected to happen inside
        # sound_generator.generate() - NOTE(review): confirm

        # replicate the AudioGen clip often enough to cover the TTS length
        # (+2 guarantees the tiled background is longer than x)
        n_repeat = len(x) // background.shape[0] + 2
        print(f'Additional Repeat {n_repeat=}')
        background = np.concatenate(n_repeat * [background])
        # background = _shift(background)

        # report the true peak magnitude: abs() first, then max()
        print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
              f'{np.abs(background).max()=}\n{x.shape=}')
        x = .1 * x + .9 * background[:len(x)]
    else:
        print('sound_background = None')
    return x
|
|
|
144 |
embedding_scale=1))
|
145 |
x = np.concatenate(x)
|
146 |
|
147 |
+
x /= np.abs(x).max() + 1e-7 # amplify speech to full [-1,1]
|
148 |
+
|
149 |
return overlay(x, scene=scene)
|
150 |
|
151 |
# Fallback - Mimic-3
|
|
|
155 |
x, fs = soundfile.read('_tmp.wav')
|
156 |
x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :] # reshapes (64,) -> (1,64)
|
157 |
|
158 |
+
return overlay(x, scene=scene)
|
159 |
|
160 |
|
161 |
|
|
|
262 |
# ====SILENT VIDEO====
|
263 |
|
264 |
if args.video is not None:
|
265 |
+
# banner - precomputed @ 1920 pixels
|
266 |
frame_tts = np.zeros((104, 1920, 3), dtype=np.uint8)
|
267 |
font = cv2.FONT_HERSHEY_SIMPLEX
|
268 |
bottomLeftCornerOfText = (240, 74) # w,h
|
|
|
293 |
fontColor,
|
294 |
thickness,
|
295 |
lineType)
|
296 |
+
|
297 |
+
print(f'\n______________________________\n'
|
298 |
+
f'Gen Banners for TTS/Native Title {frame_tts.shape=} {frame_orig.shape=}'
|
299 |
+
f'\n______________________________\n')
|
300 |
# ====SILENT VIDEO EXTRACT====
|
301 |
# DOWNLOAD SRT from YouTube
|
302 |
#
|
|
|
311 |
#
|
312 |
video_file = args.video
|
313 |
vf = VideoFileClip(video_file)
|
314 |
+
|
315 |
+
# GET 1st FRAME to OBTAIN frame RESOLUTION
|
316 |
+
h, w, _ = vf.get_frame(0).shape
|
317 |
+
frame_tts = _resize(frame_tts, width=w)
|
318 |
+
frame_orig = _resize(frame_orig, width=w)
|
319 |
+
h, w, _ = frame_orig.shape
|
320 |
+
|
321 |
try:
|
322 |
+
|
323 |
+
# inpaint banner to say if native voice
|
324 |
num = x_native.shape[0]
|
325 |
is_tts = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, num) + 9.4)) # fade heaviside
|
326 |
+
|
327 |
def inpaint_banner(get_frame, t):
    '''Blend the "now playing" banner (TTS or native voice) into one frame.

    Args:
        get_frame: callable returning the video frame for a timestamp.
        t: timestamp in seconds of the frame being processed.

    Returns:
        A copy of the frame with the banner blended in (40 % frame, 60 %
        banner), placed `offset_h` rows below the top edge.

    Reads `is_tts`, `frame_tts`, `frame_orig`, `h` and `w` from the enclosing
    scope. None of them may be assigned here, otherwise Python would treat
    them as locals and raise "referenced before assignment".
    '''
    im = np.copy(get_frame(t))  # pic - never modify the clip's own buffer

    # Look up the mask at sample index t * 24000 (assumes the mask runs at
    # 24 kHz - TODO confirm). Clamp it so frames whose timestamp runs past
    # the end of the mask reuse its last value rather than raise IndexError.
    ix = min(int(t * 24000), len(is_tts) - 1)

    # mask == 1 => tts / mask == 0 -> native
    frame = frame_tts if is_tts[ix] > .5 else frame_orig

    offset_h = 24

    print(f' > inpaint_banner() HAS NATIVE: {frame.shape=} {im.shape=}\n\n\n\n')

    im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h + offset_h, :w, :]
                                        + .6 * frame).astype(np.uint8)
    return im
|
360 |
+
|
361 |
except UnboundLocalError: # args.native == False
|
362 |
+
|
363 |
def inpaint_banner(get_frame, t):
    '''Blend the TTS banner into one video frame (no native voice track).

    Args:
        get_frame: callable returning the video frame for a timestamp.
        t: timestamp in seconds of the frame being processed.

    Returns:
        A copy of the frame with the banner blended in (40 % frame, 60 %
        banner), placed `offset_h` rows below the top edge.

    Reads `frame_tts` from the enclosing scope; it is never assigned here so
    that it stays a closure variable.
    '''
    im = np.copy(get_frame(t))  # never modify the clip's own buffer

    # Always bind the banner, then resize only if its width does not fit the
    # video. Binding it only inside the `if` left it undefined whenever the
    # widths already matched.
    local_frame = frame_tts
    if local_frame.shape[1] != im.shape[1]:  # rsz banner to fit video w
        local_frame = _resize(frame_tts, width=im.shape[1])

    # take h / w from the banner actually blended so the slice shapes agree
    h, w, _ = local_frame.shape

    offset_h = 24
    im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h + offset_h, :w, :]
                                        + .6 * local_frame).astype(np.uint8)
    return im
|
374 |
vf = vf.fl(inpaint_banner)
|
375 |
vf.write_videofile(SILENT_VIDEO)
|
audiocraft/builders.py
CHANGED
@@ -79,9 +79,10 @@ class AudioGen(nn.Module):
|
|
79 |
conditions=attributes,
|
80 |
max_gen_len=int(self.duration * self.frame_rate)) # [bs, 4, 37 * self.lm.n_draw]
|
81 |
x = self.compression_model.decode(gen_tokens, None) #[bs, 1, 11840]
|
82 |
-
print('______________\nGENTOk 5', gen_tokens)
|
83 |
-
print('GENAUD 5', x.sum())
|
84 |
-
|
|
|
85 |
|
86 |
# == BUILD Fn
|
87 |
def get_quantizer(self, quantizer, cfg, dimension):
|
|
|
79 |
conditions=attributes,
|
80 |
max_gen_len=int(self.duration * self.frame_rate)) # [bs, 4, 37 * self.lm.n_draw]
|
81 |
x = self.compression_model.decode(gen_tokens, None) #[bs, 1, 11840]
|
82 |
+
# print('______________\nGENTOk 5', gen_tokens)
|
83 |
+
print('GENAUD 5', x.sum(), x.shape)
|
84 |
+
|
85 |
+
return x / x.abs().max(2, keepdims=True)[0] + 1e-7
|
86 |
|
87 |
# == BUILD Fn
|
88 |
def get_quantizer(self, quantizer, cfg, dimension):
|
audiocraft/lm.py
CHANGED
@@ -12,9 +12,13 @@ from audiocraft.activations import get_activation_fn
|
|
12 |
import numpy as np
|
13 |
|
14 |
def _shift(x):
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
18 |
return x
|
19 |
|
20 |
|
@@ -160,7 +164,7 @@ class LMModel(nn.Module):
|
|
160 |
self.cfg_coef = cfg_coef
|
161 |
self.condition_provider = condition_provider
|
162 |
self.card = card # 2048 ?
|
163 |
-
self.n_draw =
|
164 |
embed_dim = self.card + 1
|
165 |
self.n_q = n_q
|
166 |
self.dim = dim
|
@@ -353,11 +357,12 @@ class LMModel(nn.Module):
|
|
353 |
out_codes = out_codes.reshape(bs, self.n_draw, 4, new_len)
|
354 |
out_codes = out_codes.transpose(1, 2).reshape(bs, 4, self.n_draw * new_len)
|
355 |
print(out_codes.shape, 'o')
|
356 |
-
|
|
|
357 |
|
358 |
# Clear Transformer k/v history (Different history is kept by 48x selfattn)
|
359 |
for lay in self.transformer.layers:
|
360 |
lay.self_attn.k_history = None
|
361 |
lay.self_attn.v_history = None
|
362 |
|
363 |
-
return out_codes
|
|
|
12 |
import numpy as np
|
13 |
|
14 |
def _shift(x):
|
15 |
+
# cyclic shift of [1, 4, seq_len] slices from [bs, 4, seq_len]
|
16 |
+
print(x.shape, 'SHIFT\n= = = = = ')
|
17 |
+
for i, _slice in enumerate(x):
|
18 |
+
n = x.shape[2]
|
19 |
+
offset = np.random.randint(.24 * n, max(1, .74 * n)) # high should be above >= 0 TBD
|
20 |
+
print(offset)
|
21 |
+
x[i, :, :] = torch.roll(_slice, offset, dims=1)
|
22 |
return x
|
23 |
|
24 |
|
|
|
164 |
self.cfg_coef = cfg_coef
|
165 |
self.condition_provider = condition_provider
|
166 |
self.card = card # 2048 ?
|
167 |
+
self.n_draw = 2 # replicate so many times the generation of each text in batch
|
168 |
embed_dim = self.card + 1
|
169 |
self.n_q = n_q
|
170 |
self.dim = dim
|
|
|
357 |
out_codes = out_codes.reshape(bs, self.n_draw, 4, new_len)
|
358 |
out_codes = out_codes.transpose(1, 2).reshape(bs, 4, self.n_draw * new_len)
|
359 |
print(out_codes.shape, 'o')
|
360 |
+
for _ in range(7):
|
361 |
+
out_codes = _shift(out_codes)
|
362 |
|
363 |
# Clear Transformer k/v history (Different history is kept by 48x selfattn)
|
364 |
for lay in self.transformer.layers:
|
365 |
lay.self_attn.k_history = None
|
366 |
lay.self_attn.v_history = None
|
367 |
|
368 |
+
return out_codes
|
demo.py
CHANGED
@@ -1,14 +1,14 @@
|
|
1 |
import audiofile
|
2 |
import numpy as np
|
3 |
from audiocraft import AudioGen
|
4 |
-
text_list = ['dogs barging in the street',
|
5 |
-
'
|
6 |
|
7 |
-
sound_generator = AudioGen(duration
|
8 |
device='cuda:0').to('cuda:0').eval()
|
9 |
x = sound_generator.generate(text_list) # [bs, 1, 7680]
|
10 |
# print('demo', x.shape)
|
11 |
-
x = x[
|
12 |
-
x /= np.abs(x).max() + 1e-7
|
13 |
|
14 |
audiofile.write('del_seane.wav', x, 16000)
|
|
|
1 |
import audiofile
|
2 |
import numpy as np
|
3 |
from audiocraft import AudioGen
|
4 |
+
# Prompts fed to AudioGen as one batch (one generated clip per prompt).
text_list = [
    'dogs barging in the street',
    'cats meowing',
]

# Build the generator on the first GPU and switch it to inference mode.
sound_generator = AudioGen(duration=0.24, device='cuda:0')
sound_generator = sound_generator.to('cuda:0').eval()

x = sound_generator.generate(text_list)  # [bs, 1, 7680]

# Keep only the second prompt's clip and move it to host memory.
x = x[1, :, :]
x = x.detach().cpu().numpy()
# amplitude normalisation now happens inside generate()

audiofile.write('del_seane.wav', x, 16000)
|
live_api.py
CHANGED
@@ -14,7 +14,7 @@ from types import SimpleNamespace
|
|
14 |
from flask import Flask, request, send_from_directory
|
15 |
from flask_cors import CORS
|
16 |
from audiocraft.builders import AudioGen #, audio_write
|
17 |
-
|
18 |
sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
|
19 |
|
20 |
|
@@ -46,7 +46,10 @@ Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
|
|
46 |
def tts_multi_sentence(scene=None):
|
47 |
if scene is not None and len(scene) >= 4:
|
48 |
print(f'Processing: {scene} ..')
|
49 |
-
x = sound_generator.generate([scene])[0, :, :].detach().cpu().numpy()
|
|
|
|
|
|
|
50 |
|
51 |
x /= np.abs(x).max() + 1e-7
|
52 |
# is 16kHz - AUdiogen Fs
|
|
|
14 |
from flask import Flask, request, send_from_directory
|
15 |
from flask_cors import CORS
|
16 |
from audiocraft.builders import AudioGen #, audio_write
|
17 |
+
NUM_SOUND_GENERATIONS = 1 # they differ a lot and are unnatural to concatenate, prefer lm.n_draw
|
18 |
sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
|
19 |
|
20 |
|
|
|
46 |
def tts_multi_sentence(scene=None):
|
47 |
if scene is not None and len(scene) >= 4:
|
48 |
print(f'Processing: {scene} ..')
|
49 |
+
# x = sound_generator.generate([scene])[0, :, :].detach().cpu().numpy()
|
50 |
+
x = sound_generator.generate(
|
51 |
+
[scene] * NUM_SOUND_GENERATIONS
|
52 |
+
).reshape(1, -1).detach().cpu().numpy() # bs, 11400
|
53 |
|
54 |
x /= np.abs(x).max() + 1e-7
|
55 |
# is 16kHz - AUdiogen Fs
|