Dionyssos committed on
Commit 3ac9f34 · 1 Parent(s): 459d7a3

debug long sounds

Files changed (6)
  1. README.md +0 -35
  2. api.py +108 -45
  3. audiocraft/builders.py +4 -3
  4. audiocraft/lm.py +11 -6
  5. demo.py +5 -5
  6. live_api.py +5 -2
README.md CHANGED
@@ -65,41 +65,6 @@ The following needs `api.py` to be already running on a tmux session.
   python landscape2soundscape.py
   ```
 
- # Videos / Examples
-
- Native voice Replaced with English TTS Voice
-
- [![Same video w. Native voice replaced with English TTS](assets/tts_video_thumb.png)](https://www.youtube.com/watch?v=geI1Vqn4QpY)
-
- ```python
- # https://github.com/audeering/shift
- CUDA_VISIBLE_DEVICES=7 python tts.py --text ANBPR/BUNICA\ NESTIUTA\ Adriana\ Andrei\ \[tmo2UbKYAqc\].en.srt --video ANBPR/BUNICA\ NESTIUTA\ Adriana\ Andrei\ \[tmo2UbKYAqc\].webm
- ```
-
- ## Joint Application of D3.1 & D3.2
-
- <a href="https://youtu.be/wWC8DpOKVvQ" rel="Subtitles to Video">![Foo4](assets/caption_to_video_thumb.png)</a>
-
- From an image and text create a video:
-
- ```python
- python tts.py --text sample.txt --image assets/image_from_T31.jpg
- ```
-
- ## Landscape 2 Soundscape
-
- ```python
- # Loads image & text & sound-scene text and creates .mp4
- python landscape2soundscape.py
- ```
-
  For SHIFT demo / Collaboration with [SMB](https://www.smb.museum/home/)
  - YouTube Videos
 
api.py CHANGED
@@ -16,12 +16,13 @@ from types import SimpleNamespace
 from flask import Flask, request, send_from_directory
 from flask_cors import CORS
 from moviepy.editor import *
-from audiocraft.audiogen import AudioGen
+from audiocraft.builders import AudioGen
 CACHE_DIR = 'flask_cache/'
-SOUNDSCAPE_DURATION = 4.4
-sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration=SOUNDSCAPE_DURATION)
-print(f'{sound_generator.sample_rate=}')
+NUM_SOUND_GENERATIONS = 1  # batch size for generating the same text (same scene for a long video)
+
+sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
+
 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
 
 import nltk
@@ -32,7 +33,43 @@ nltk.download('punkt')
 # ssh-add ~/.ssh/id_ed25519_github2024
 #
 # git remote set-url origin [email protected]:audeering/shift
-# ==
+# ==
+
+
+def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
+    '''https://github.com/PyImageSearch/imutils/blob/master/imutils/convenience.py'''
+    # initialize the dimensions of the image to be resized and
+    # grab the image size
+    dim = None
+    (h, w) = image.shape[:2]
+
+    # if both the width and height are None, then return the
+    # original image
+    if width is None and height is None:
+        return image
+
+    # check to see if the width is None
+    if width is None:
+        # calculate the ratio of the height and construct the
+        # dimensions
+        r = height / float(h)
+        dim = (int(w * r), height)
+
+    # otherwise, the height is None
+    else:
+        # calculate the ratio of the width and construct the
+        # dimensions
+        r = width / float(w)
+        dim = (width, int(h * r))
+
+    # resize the image
+    resized = cv2.resize(image, dim, interpolation=inter)
+
+    # return the resized image
+    return resized
+
 
 def _shift(x):
     n = x.shape[0]
@@ -45,43 +82,37 @@ def _shift(x):
     return x  # * fade_in  # silence this
 
 def overlay(x, scene=None):
-    print(f'{scene=} OVERLAY')
+
     if scene is not None:
 
-        # generate 4
-        print('Generating AudioCraft')
-        back = [sound_generator.generate(
-            [scene]
-            )[0].detach().cpu().numpy()[0, :] for _ in range(1)]
-
-        # print([j.shape for j in back], len(back), 'BACK')
-
-        # upsample to 24 kHz of StyleTTS
+        # SOUNDS
+        print(f'AudioGen {NUM_SOUND_GENERATIONS} x {scene}')
+        background = sound_generator.generate(
+            [scene] * NUM_SOUND_GENERATIONS
+        ).reshape(-1).detach().cpu().numpy()  # bs, 11400
+
+        # upsample 16 kHz AudioGen to 24 kHz StyleTTS
         print('Resampling')
-        back = [audresample.resample(i,
-                                     original_rate=sound_generator.sample_rate,  # 16000
-                                     target_rate=24000)[0, :] for i in back]
-        print('Cloning backgrounds')
-        # clone/elongate by 4x
-        back = [_shift(np.concatenate([_shift(single_gen)] * 4)) for single_gen in back]
-
-        # long ~30s
-        back = np.concatenate(back)
-        for _ in range(4):
-            back = _shift(back)
-
-        # clone to the exact length of the TTS
-        n_repeat = len(x) // back.shape[0] + 2
-
-        # Additional Repeat - reach the full length of the TTS
+        background = audresample.resample(
+            background,
+            original_rate=16000,  # sound_generator.sample_rate
+            target_rate=24000)[0, :]
+
+        # background /= np.abs(background).max() + 1e-7  -> applied inside sound_generator()
+
+        # replicate the AudioGen output to match the TTS length
+        n_repeat = len(x) // background.shape[0] + 2
+
+        # reach the full length of the TTS by cloning
         print(f'Additional Repeat {n_repeat=}')
-        back = np.concatenate(n_repeat * [back])
-        back = _shift(back)
-        print(f'\n====SOUND BACKGROUND SHAPE\n{back.shape=}',
-              f'{np.abs(back.max())=}\n{x.shape=}')
-        x = .1 * x + .9 * back[:len(x)]
+        background = np.concatenate(n_repeat * [background])
+        # background = _shift(background)
+        print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
+              f'{np.abs(background.max())=}\n{x.shape=}')
+        x = .1 * x + .9 * background[:len(x)]
     else:
         print('sound_background = None')
     return x
@@ -113,6 +144,8 @@ def tts_multi_sentence(precomputed_style_vector=None,
                                   embedding_scale=1))
     x = np.concatenate(x)
 
+    x /= np.abs(x).max() + 1e-7  # amplify speech to the full [-1, 1] range
+
     return overlay(x, scene=scene)
 
 # Fallback - Mimic-3
@@ -122,7 +155,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
     x, fs = soundfile.read('_tmp.wav')
     x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :]  # reshapes (64,) -> (1,64)
 
-    return overlay(x, sound_background)
+    return overlay(x, scene=scene)
 
@@ -229,7 +262,7 @@ def serve_wav():
     # ====SILENT VIDEO====
 
     if args.video is not None:
-        # banner
+        # banner - precomputed @ 1920 pixels
         frame_tts = np.zeros((104, 1920, 3), dtype=np.uint8)
         font = cv2.FONT_HERSHEY_SIMPLEX
         bottomLeftCornerOfText = (240, 74)  # w,h
@@ -260,6 +293,10 @@ def serve_wav():
                     fontColor,
                     thickness,
                     lineType)
+
+        print(f'\n______________________________\n'
+              f'Gen Banners for TTS/Native Title {frame_tts.shape=} {frame_orig.shape=}'
+              f'\n______________________________\n')
         # ====SILENT VIDEO EXTRACT====
         # DOWNLOAD SRT from youtube
         #
@@ -274,39 +311,65 @@ def serve_wav():
         #
         video_file = args.video
         vf = VideoFileClip(video_file)
+
+        # GET 1st FRAME to OBTAIN the frame RESOLUTION
+        h, w, _ = vf.get_frame(0).shape
+        frame_tts = _resize(frame_tts, width=w)
+        frame_orig = _resize(frame_orig, width=w)
+        h, w, _ = frame_orig.shape
+
         try:
-            # inpaint banners if native voice
+            # inpaint banner to say whether the native voice plays
             num = x_native.shape[0]
             is_tts = .5 + .5 * np.tanh(4 * (np.linspace(-10, 10, num) + 9.4))  # fade heaviside
 
             def inpaint_banner(get_frame, t):
                 '''blend banner - (now plays) tts or native voice'''
 
-                im = np.copy(get_frame(t))
+                im = np.copy(get_frame(t))  # pic
 
                 ix = int(t * 24000)
 
-                if is_tts[ix] > .5:  # mask is 1 thus tts else native
-                    frame = frame_tts
+                if is_tts[ix] > .5:  # mask == 1 => tts / mask == 0 => native
+                    # do not assign to frame_tts here: assignment would make it a local
+                    # variable that hides the outer one, raising "referenced before assignment"
+                    frame = frame_tts
                 else:
                     frame = frame_orig
 
-                h, w, _ = frame.shape
                 # im[-h:, -w:, :] = (.4 * im[-h:, -w:, :] + .6 * frame_orig).astype(np.uint8)
                 offset_h = 24
+
+                print(f' > inpaint_banner() HAS NATIVE: {frame.shape=} {im.shape=}\n\n\n\n')
+
                 im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h + offset_h, :w, :]
                                                     + .6 * frame).astype(np.uint8)
 
                 # im2 = np.concatenate([im, frame_tts], 0)
                 # cv2.imshow('t', im2); cv2.waitKey(); cv2.destroyAllWindows()
                 return im  # np.concatenate([im, frame_tts], 0)
 
         except UnboundLocalError:  # args.native == False
 
             def inpaint_banner(get_frame, t):
 
                 im = np.copy(get_frame(t))
 
-                frame = frame_tts
-                h, w, _ = frame.shape
+                h, w, _ = frame_tts.shape  # frame = banner
+                if w != im.shape[1]:  # resize the banner to fit the video width
+                    local_frame = _resize(frame_tts, width=im.shape[1])
+                else:  # widths already match (banners were resized above)
+                    local_frame = frame_tts
                 offset_h = 24
                 im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h + offset_h, :w, :]
-                                                    + .6 * frame).astype(np.uint8)
+                                                    + .6 * local_frame).astype(np.uint8)
                 return im
         vf = vf.fl(inpaint_banner)
         vf.write_videofile(SILENT_VIDEO)
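The background mixing in `overlay()` reduces to: tile a short AudioGen clip past the TTS length, crop, and mix at a 10/90 ratio. A minimal self-contained sketch of that step, assuming both signals are 1-D float32 arrays at the same rate (`mix_background` and its weight parameters are illustrative names, not the repo's API):

```python
import numpy as np

def mix_background(speech: np.ndarray, background: np.ndarray,
                   w_speech: float = .1, w_background: float = .9) -> np.ndarray:
    '''Tile the short background clip past the speech length, crop, and mix.'''
    n_repeat = len(speech) // len(background) + 2         # overshoot on purpose
    background = np.concatenate(n_repeat * [background])  # now >= len(speech)
    return w_speech * speech + w_background * background[:len(speech)]
```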
audiocraft/builders.py CHANGED
@@ -79,9 +79,10 @@ class AudioGen(nn.Module):
             conditions=attributes,
             max_gen_len=int(self.duration * self.frame_rate))  # [bs, 4, 37 * self.lm.n_draw]
         x = self.compression_model.decode(gen_tokens, None)  # [bs, 1, 11840]
-        print('______________\nGENTOk 5', gen_tokens)
-        print('GENAUD 5', x.sum())
-        return x
+        # print('______________\nGENTOk 5', gen_tokens)
+        print('GENAUD 5', x.sum(), x.shape)
+
+        return x / (x.abs().max(dim=2, keepdim=True)[0] + 1e-7)  # per-item peak normalization
 
     # == BUILD Fn
     def get_quantizer(self, quantizer, cfg, dimension):
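The new return value peak-normalizes each batch item; note the epsilon has to sit inside the denominator (as written above), otherwise it is added to the normalized signal instead of guarding the division. A sketch of the same operation in isolation, assuming `x` is a `[bs, 1, time]` tensor:

```python
import torch

def peak_normalize(x: torch.Tensor) -> torch.Tensor:
    # x: [bs, 1, time] -> scale every item so its peak amplitude is ~1
    peak = x.abs().max(dim=2, keepdim=True)[0]  # [bs, 1, 1] per-item maxima
    return x / (peak + 1e-7)                    # epsilon guards silent outputs
```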
audiocraft/lm.py CHANGED
@@ -12,9 +12,13 @@ from audiocraft.activations import get_activation_fn
 import numpy as np
 
 def _shift(x):
-    n = x.shape[0]
-    i = np.random.randint(.24 * n, max(1, .74 * n))  # high should be > low; TBD: do we have very short segments?
-    x = torch.roll(x, i, dims=2)
+    # cyclic shift of each [4, seq_len] slice of the [bs, 4, seq_len] batch
+    print(x.shape, 'SHIFT\n= = = = = ')
+    for i, _slice in enumerate(x):
+        n = x.shape[2]
+        offset = np.random.randint(.24 * n, max(1, .74 * n))  # high should be > low; TBD for very short segments
+        print(offset)
+        x[i, :, :] = torch.roll(_slice, offset, dims=1)
     return x
 
 
@@ -160,7 +164,7 @@ class LMModel(nn.Module):
         self.cfg_coef = cfg_coef
         self.condition_provider = condition_provider
         self.card = card  # 2048 ?
-        self.n_draw = 14  # replicate the generation of each text in the batch this many times
+        self.n_draw = 2  # replicate the generation of each text in the batch this many times
         embed_dim = self.card + 1
         self.n_q = n_q
         self.dim = dim
@@ -353,11 +357,12 @@ class LMModel(nn.Module):
         out_codes = out_codes.reshape(bs, self.n_draw, 4, new_len)
         out_codes = out_codes.transpose(1, 2).reshape(bs, 4, self.n_draw * new_len)
         print(out_codes.shape, 'o')
-        out_codes = _shift(out_codes)
+        for _ in range(7):
+            out_codes = _shift(out_codes)
 
         # Clear Transformer k/v history (different history is kept by 48x selfattn)
         for lay in self.transformer.layers:
             lay.self_attn.k_history = None
             lay.self_attn.v_history = None
 
         return out_codes
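`_shift` now rolls every batch item independently, so the `n_draw` takes that get concatenated along time no longer share a loop point; calling it 7 times simply re-randomizes the offsets. A standalone sketch of one pass, with the offset bounds made integer-safe (a hypothetical helper, shapes as in the comments above):

```python
import numpy as np
import torch

def cyclic_shift(x: torch.Tensor) -> torch.Tensor:
    # x: [bs, 4, seq_len] token codes; roll each [4, seq_len] slice in time
    n = x.shape[2]
    for i in range(x.shape[0]):
        low = int(.24 * n)
        offset = np.random.randint(low, max(low + 1, int(.74 * n)))
        x[i] = torch.roll(x[i], shifts=offset, dims=1)
    return x
```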
demo.py CHANGED
@@ -1,14 +1,14 @@
 import audiofile
 import numpy as np
 from audiocraft import AudioGen
-text_list = ['dogs barging in the street',
-             'music']
+text_list = ['dogs barging in the street',
+             'cats meowing']
 
-sound_generator = AudioGen(duration=.74,
+sound_generator = AudioGen(duration=0.24,
                            device='cuda:0').to('cuda:0').eval()
 x = sound_generator.generate(text_list)  # [bs, 1, 7680]
 # print('demo', x.shape)
-x = x[0, :, :].detach().cpu().numpy()
-x /= np.abs(x).max() + 1e-7
+x = x[1, :, :].detach().cpu().numpy()
+# x /= np.abs(x).max() + 1e-7  # inside generate()
 
 audiofile.write('del_seane.wav', x, 16000)
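`generate()` returns one clip per prompt in batch order, so indexing `x[1, :, :]` above writes the 'cats meowing' take. A hedged sketch for saving every prompt's clip (filenames are illustrative):

```python
# assumes x = sound_generator.generate(text_list) -> torch.Tensor [bs, 1, T] at 16 kHz,
# already peak-normalized inside generate() per audiocraft/builders.py above
for i, text in enumerate(text_list):
    clip = x[i, :, :].detach().cpu().numpy()        # [1, T] clip for prompt i
    audiofile.write(f'sound_{i}.wav', clip, 16000)  # illustrative filenames
```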
live_api.py CHANGED
@@ -14,7 +14,7 @@ from types import SimpleNamespace
 from flask import Flask, request, send_from_directory
 from flask_cors import CORS
 from audiocraft.builders import AudioGen  #, audio_write
-
+NUM_SOUND_GENERATIONS = 1  # the generations differ a lot and are unnatural to concatenate; prefer lm.n_draw
 sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
 
 
@@ -46,7 +46,10 @@ Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
 def tts_multi_sentence(scene=None):
     if scene is not None and len(scene) >= 4:
         print(f'Processing: {scene} ..')
-        x = sound_generator.generate([scene])[0, :, :].detach().cpu().numpy()
+        # x = sound_generator.generate([scene])[0, :, :].detach().cpu().numpy()
+        x = sound_generator.generate(
+            [scene] * NUM_SOUND_GENERATIONS
+        ).reshape(1, -1).detach().cpu().numpy()  # bs, 11400
 
         x /= np.abs(x).max() + 1e-7
         # is 16 kHz - AudioGen Fs
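`reshape(1, -1)` concatenates the batched takes end-to-end into one mono track, which is why the comment above prefers `lm.n_draw` over batching the same text. The shape bookkeeping, as a sketch with illustrative sizes:

```python
import torch

bs, T = 3, 11840           # e.g. three ~0.74 s takes at 16 kHz (illustrative)
x = torch.randn(bs, 1, T)  # stand-in for sound_generator.generate([scene] * bs)
mono = x.reshape(1, -1)    # [1, bs * T]: takes laid out one after another
assert mono.shape == (1, bs * T)
```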