soundscape discard last 1s from AudioGen - avoids splash sound
Browse files- api.py +41 -13
- audiocraft/builders.py +5 -1
- audiocraft/lm.py +1 -1
- audiocraft/transformer.py +4 -4
api.py
CHANGED
@@ -20,7 +20,7 @@ from audiocraft.builders import AudioGen
|
|
20 |
CACHE_DIR = 'flask_cache/'
|
21 |
NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same soundscape for long video)
|
22 |
|
23 |
-
sound_generator = AudioGen(duration
|
24 |
|
25 |
|
26 |
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
|
@@ -87,11 +87,11 @@ def overlay(x, soundscape=None):
|
|
87 |
if soundscape is not None:
|
88 |
|
89 |
# SOUNDS
|
90 |
-
|
91 |
background = sound_generator.generate(
|
92 |
[soundscape] * NUM_SOUND_GENERATIONS
|
93 |
-
).reshape(-1).detach().cpu().numpy() # bs, 11400
|
94 |
-
|
95 |
# upsample 16 kHz AudioGen to 24kHZ StyleTTS
|
96 |
|
97 |
print('Resampling')
|
@@ -100,20 +100,48 @@ def overlay(x, soundscape=None):
|
|
100 |
background = audresample.resample(
|
101 |
background,
|
102 |
original_rate=16000, # sound_generator.sample_rate,
|
103 |
-
target_rate=24000)[0,
|
104 |
|
105 |
# background /= np.abs(background).max() + 1e-7 Apply in sound_generator()
|
106 |
|
107 |
-
# replicat audiogen to match TTS
|
108 |
-
n_repeat = len(x) // background.shape[0] + 2
|
109 |
|
110 |
-
|
111 |
-
|
112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
# background = _shift(background)
|
114 |
-
print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
|
115 |
-
|
116 |
-
|
|
|
|
|
117 |
else:
|
118 |
print('sound_background = None')
|
119 |
return x
|
|
|
20 |
CACHE_DIR = 'flask_cache/'
|
21 |
NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same soundscape for long video)
|
22 |
|
23 |
+
sound_generator = AudioGen(duration=4.74, device='cuda:0').to('cuda:0').eval()
|
24 |
|
25 |
|
26 |
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
|
|
|
87 |
if soundscape is not None:
|
88 |
|
89 |
# SOUNDS
|
90 |
+
|
91 |
background = sound_generator.generate(
|
92 |
[soundscape] * NUM_SOUND_GENERATIONS
|
93 |
+
).reshape(-1).detach().cpu().numpy() # bs, 11400 @.74s
|
94 |
+
# sound_generator._flush() # already done in lm.generate(); the Encodec does not seem to have transformers, thus no kv cache to clean up from the previous soundscape
|
95 |
# upsample 16 kHz AudioGen to 24kHZ StyleTTS
|
96 |
|
97 |
print('Resampling')
|
|
|
100 |
background = audresample.resample(
|
101 |
background,
|
102 |
original_rate=16000, # sound_generator.sample_rate,
|
103 |
+
target_rate=24000)[0, :-25000] # discard last samples as they have the splash sound / polarity change;
|
104 |
|
105 |
# background /= np.abs(background).max() + 1e-7 Apply in sound_generator()
|
106 |
|
|
|
|
|
107 |
|
108 |
+
|
109 |
+
|
110 |
+
k = background.shape[0]
|
111 |
+
|
112 |
+
|
113 |
+
|
114 |
+
|
115 |
+
|
116 |
+
|
117 |
+
|
118 |
+
|
119 |
+
hop = int(.7 * k) # hop is 70% of the clip length, so consecutive copies overlap by ~30%
|
120 |
+
n_repeat = len(x) // hop
|
121 |
+
total = np.zeros( hop * (n_repeat + 2)) # add some extra pad space for last frame to fit
|
122 |
+
|
123 |
+
m = np.ones(k)
|
124 |
+
overlap = k - hop
|
125 |
+
m[hop:] = np.linspace(1, 0, overlap) # linear fade-out (1 -> 0) over the overlapping tail of each copy
|
126 |
+
# m[:overlap] = np.linspace(0, 1, overlap)
|
127 |
+
|
128 |
+
for j in range(n_repeat):
|
129 |
+
# total[j*k + hop:(j+1)*k + hop] += background
|
130 |
+
# total[j*k + hop:(j+1)*k + hop] = total[j*k + hop:(j+1)*k + hop] + m *background # the total is already smoothly falling due to the previous mask. Is only the new added signal that needs to rise smoothl
|
131 |
+
# total[j * (k+hop):(j+1) * k + j*hop] =background
|
132 |
+
total[j*hop:j*hop + k] += m * background # the total is already smoothly falling due to the previous mask; only the newly added signal needs to rise smoothly
|
133 |
+
# total = total.clip(-1, 1) # if too many signals were added on top of each other
|
134 |
+
# print(total[40000:70000].tolist())
|
135 |
+
print(np.logical_and(total > .1, total < .9).sum(), total.shape, 'ev')
|
136 |
+
|
137 |
+
# background = np.concatenate(n_repeat * [background])
|
138 |
+
|
139 |
# background = _shift(background)
|
140 |
+
# print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
|
141 |
+
# f'{np.abs(background.max())=}\n{x.shape=}')
|
142 |
+
total /= np.abs(total).max() + 1e-7 # normalize the tiled background to peak amplitude ~1 (range [-1, 1])
|
143 |
+
x = .4 * x + .6 * total[:len(x)]
|
144 |
+
|
145 |
else:
|
146 |
print('sound_background = None')
|
147 |
return x
|
audiocraft/builders.py
CHANGED
@@ -252,4 +252,8 @@ class AudioGen(nn.Module):
|
|
252 |
model.load_state_dict(pkg['best_state'])
|
253 |
model.cfg = cfg
|
254 |
# return model
|
255 |
-
self.lm = model.to(torch.float)
|
|
|
|
|
|
|
|
|
|
252 |
model.load_state_dict(pkg['best_state'])
|
253 |
model.cfg = cfg
|
254 |
# return model
|
255 |
+
self.lm = model.to(torch.float)
|
256 |
+
|
257 |
+
# def _flush(self):
|
258 |
+
# self.lm._flush() # already done in lm generate at end
|
259 |
+
|
audiocraft/lm.py
CHANGED
@@ -164,7 +164,7 @@ class LMModel(nn.Module):
|
|
164 |
self.cfg_coef = cfg_coef
|
165 |
self.condition_provider = condition_provider
|
166 |
self.card = card # 2048 ?
|
167 |
-
self.n_draw =
|
168 |
embed_dim = self.card + 1
|
169 |
self.n_q = n_q
|
170 |
self.dim = dim
|
|
|
164 |
self.cfg_coef = cfg_coef
|
165 |
self.condition_provider = condition_provider
|
166 |
self.card = card # 2048 ?
|
167 |
+
self.n_draw = 1 # replicate so many times the generation of each text in batch
|
168 |
embed_dim = self.card + 1
|
169 |
self.n_q = n_q
|
170 |
self.dim = dim
|
audiocraft/transformer.py
CHANGED
@@ -175,7 +175,7 @@ class StreamingMultiheadAttention(nn.Module):
|
|
175 |
v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
|
176 |
|
177 |
q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
|
178 |
-
print(q.shape, k.shape, v.shape, q.sum(), k.sum(), v.sum(),'CROSS A5')
|
179 |
else:
|
180 |
# 1st projected makes k,v (instantaneous)
|
181 |
# 2nd cat
|
@@ -213,7 +213,7 @@ class StreamingMultiheadAttention(nn.Module):
|
|
213 |
|
214 |
|
215 |
# KV COMPLETION ONLY ON SELF ATTENTION
|
216 |
-
print('KV5', self.k_history.sum(), self.v_history.sum(), self.k_history.shape, self.v_history.shape)
|
217 |
|
218 |
|
219 |
if self.memory_efficient:
|
@@ -386,7 +386,7 @@ class StreamingTransformer(nn.Module):
|
|
386 |
|
387 |
|
388 |
for j, lay in enumerate(self.layers):
|
389 |
-
print(f'
|
390 |
-
x = lay(x, cross_attention_src=kwargs["cross_attention_src"]) # txt
|
391 |
# each layer (mha) keeps history of its own k,v for all tokens
|
392 |
return x
|
|
|
175 |
v = nn.functional.linear(value, self.in_proj_weight[2 * dim:], bias_v)
|
176 |
|
177 |
q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
|
178 |
+
# print(q.shape, k.shape, v.shape, q.sum(), k.sum(), v.sum(),'CROSS A5')
|
179 |
else:
|
180 |
# 1st projected makes k,v (instantaneous)
|
181 |
# 2nd cat
|
|
|
213 |
|
214 |
|
215 |
# KV COMPLETION ONLY ON SELF ATTENTION
|
216 |
+
# print('KV5', self.k_history.sum(), self.v_history.sum(), self.k_history.shape, self.v_history.shape)
|
217 |
|
218 |
|
219 |
if self.memory_efficient:
|
|
|
386 |
|
387 |
|
388 |
for j, lay in enumerate(self.layers):
|
389 |
+
# print(f'Transf Layer{j} {pos_emb.sum()=} {pos_emb.shape=}{x.shape=}___________________')
|
390 |
+
x = lay(x, cross_attention_src=kwargs["cross_attention_src"]) # cross_attention_src = txt-cond
|
391 |
# each layer (mha) keeps history of its own k,v for all tokens
|
392 |
return x
|