debug long sounds

Browse files

Files changed (6) hide show

README.md +0 -35
api.py +108 -45
audiocraft/builders.py +4 -3
audiocraft/lm.py +11 -6
demo.py +5 -5
live_api.py +5 -2

README.md CHANGED Viewed

@@ -65,41 +65,6 @@ The following needs `api.py` to be already running on a tmux session.
 python landscape2soundscape.py
 ```
-# Videos / Examples
-Native voice Replaced with English TTS Voice
-[![Same video w. Native voice replaced with English TTS](assets/tts_video_thumb.png)](https://www.youtube.com/watch?v=geI1Vqn4QpY)
-```python
-# https://github.com/audeering/shift
-CUDA_VISIBLE_DEVICES=7 python tts.py --text ANBPR/BUNICA\ NESTIUTA\ Adriana\ Andrei\ \[tmo2UbKYAqc\].en.srt --video ANBPR/BUNICA\ NESTIUTA\ Adriana\ Andrei\ \[tmo2UbKYAqc\].webm
-```
-## Joint Application of D3.1 & D3.2
-<a href="https://youtu.be/wWC8DpOKVvQ" rel="Subtitles to Video">![Foo4](assets/caption_to_video_thumb.png)</a>
-From an image and text create a video:
-```python
-python tts.py --text sample.txt --image assets/image_from_T31.jpg
-```
-## Landscape 2 Soundscape
-```python
-# Loads image & text & sound-scene text and creates .mp4
-python landscape2soundscape.py
-```
 For SHIFT demo / Collaboration with [SMB](https://www.smb.museum/home/)
   - YouTube Videos

 python landscape2soundscape.py
 ```
 For SHIFT demo / Collaboration with [SMB](https://www.smb.museum/home/)
   - YouTube Videos

api.py CHANGED Viewed

@@ -16,12 +16,13 @@ from types import SimpleNamespace
 from flask import Flask, request, send_from_directory
 from flask_cors import CORS
 from moviepy.editor import *
-from audiocraft.audiogen import AudioGen
 CACHE_DIR = 'flask_cache/'
-SOUNDSCAPE_DURATION = 4.4
-sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration=SOUNDSCAPE_DURATION)
-print(f'{sound_generator.sample_rate=}')
 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
 import nltk
@@ -32,7 +33,43 @@ nltk.download('punkt')
 #   ssh-add ~/.ssh/id_ed25519_github2024
 #
 #   git remote set-url origin [email protected]:audeering/shift
-# ==
 def _shift(x):
     n = x.shape[0]
@@ -45,43 +82,37 @@ def _shift(x):
     return x  #* fade_in   # silence this
 def overlay(x, scene=None):
-    print(f'{scene=} OVERLAY')
     if scene is not None:
-        # generate 4
-        print('Generating AudioCraft')
-        back = [sound_generator.generate(
-                                        [scene]
-                                            )[0].detach().cpu().numpy()[0, :] for _ in range(1)]
-        #print([j.shape for j in back], len(back), 'BACK')
-        # upsample to 24kHZ of StyleTTS
         print('Resampling')
-        back = [audresample.resample(i,
-            original_rate=sound_generator.sample_rate,  # 16000
-            target_rate=24000
-            )[0, :] for i in back]
-        print('Cloning backgrounds')
-        # clone/elongate by 4x
-        back = [(_shift(np.concatenate([_shift(single_gen)] * 4))) for single_gen in back]
-        # long ~30s
-        back = np.concatenate(back)
-        for _ in range(4):
-            back = _shift(back)
-        # clone to exact len of TTS
-        n_repeat = len(x) // back.shape[0] + 2
-        # Additional Repeat - Reach full length of TTS
         print(f'Additional Repeat {n_repeat=}')
-        back = np.concatenate(n_repeat * [back])
-        back = _shift(back)
-        print(f'\n====SOUND BACKGROUND SHAPE\n{back.shape=}',
-              f'{np.abs(back.max())=}\n{x.shape=}')
-        x = .1 * x + .9 * back[:len(x)]
     else:
         print('sound_background = None')
     return x
@@ -113,6 +144,8 @@ def tts_multi_sentence(precomputed_style_vector=None,
                                     embedding_scale=1))
         x = np.concatenate(x)
         return overlay(x, scene=scene)
     # Fallback - Mimic-3
@@ -122,7 +155,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
     x, fs = soundfile.read('_tmp.wav')
     x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :]  # reshapes (64,) -> (1,64)
-    return overlay(x, sound_background)
@@ -229,7 +262,7 @@ def serve_wav():
     # ====SILENT VIDEO====
     if args.video is not None:
-        # banner
         frame_tts = np.zeros((104, 1920, 3), dtype=np.uint8)
         font                   = cv2.FONT_HERSHEY_SIMPLEX
         bottomLeftCornerOfText = (240, 74)  # w,h
@@ -260,6 +293,10 @@ def serve_wav():
             fontColor,
             thickness,
             lineType)
         # ====SILENT VIDEO EXTRACT====
         # DOWNLOAD SRT from youtube
         #
@@ -274,39 +311,65 @@ def serve_wav():
         #
         video_file = args.video
         vf = VideoFileClip(video_file)
         try:
-            # inpaint banners if native voice
             num = x_native.shape[0]
             is_tts = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, num) + 9.4))  # fade heaviside
             def inpaint_banner(get_frame, t):
                 '''blend banner - (now plays) tts or native voic
                 '''
-                im = np.copy(get_frame(t))
                 ix = int(t * 24000)
-                if is_tts[ix] > .5:  # mask is 1 thus tts else native
-                    frame = frame_tts
                 else:
                     frame = frame_orig
-                h, w, _ = frame.shape
                 # im[-h:, -w:, :] = (.4 * im[-h:, -w:, :] + .6 * frame_orig).astype(np.uint8)
                 offset_h = 24
                 im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h + offset_h, :w, :]
                                                     + .6 * frame).astype(np.uint8)
                 # im2 = np.concatenate([im, frame_tts], 0)
                 # cv2.imshow('t', im2); cv2.waitKey(); cv2.destroyAllWindows()
                 return im  # np.concatenate([im, frane_ttts], 0)
         except UnboundLocalError:  # args.native == False
             def inpaint_banner(get_frame, t):
                 im = np.copy(get_frame(t))
-                frame = frame_tts
-                h, w, _ = frame.shape
                 offset_h = 24
                 im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h+offset_h, :w, :]
-                                                    + .6 * frame).astype(np.uint8)
                 return im
         vf = vf.fl(inpaint_banner)
         vf.write_videofile(SILENT_VIDEO)

 from flask import Flask, request, send_from_directory
 from flask_cors import CORS
 from moviepy.editor import *
+from audiocraft.builders import AudioGen
 CACHE_DIR = 'flask_cache/'
+NUM_SOUND_GENERATIONS = 1  # batch size to generate same text (same scene for long video)
+sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
 import nltk
 #   ssh-add ~/.ssh/id_ed25519_github2024
 #
 #   git remote set-url origin [email protected]:audeering/shift
+# ==
+def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
+    '''https://github.com/PyImageSearch/imutils/blob/master/imutils/convenience.py'''
+    # initialize the dimensions of the image to be resized and
+    # grab the image size
+    dim = None
+    (h, w) = image.shape[:2]
+    # if both the width and height are None, then return the
+    # original image
+    if width is None and height is None:
+        return image
+    # check to see if the width is None
+    if width is None:
+        # calculate the ratio of the height and construct the
+        # dimensions
+        r = height / float(h)
+        dim = (int(w * r), height)
+    # otherwise, the height is None
+    else:
+        # calculate the ratio of the width and construct the
+        # dimensions
+        r = width / float(w)
+        dim = (width, int(h * r))
+    # resize the image
+    resized = cv2.resize(image, dim, interpolation=inter)
+    # return the resized image
+    return resized
 def _shift(x):
     n = x.shape[0]
     return x  #* fade_in   # silence this
 def overlay(x, scene=None):
     if scene is not None:
+        # SOUNDS
+        print(f'AudioGen {NUM_SOUND_GENERATIONS} x {scene}')
+        background = sound_generator.generate(
+                                        [scene] * NUM_SOUND_GENERATIONS
+                                        ).reshape(-1).detach().cpu().numpy() # bs, 11400
+        # upsample 16 kHz AudioGen to 24kHZ StyleTTS
         print('Resampling')
+        background = audresample.resample(
+            background,
+            original_rate=16000, # sound_generator.sample_rate,
+            target_rate=24000)[0, :]
+        # background /= np.abs(background).max() + 1e-7  Apply in sound_generator()
+        # replicat audiogen to match TTS
+        n_repeat = len(x) // background.shape[0] + 2
+        # Reach the full length of TTS by cloning
         print(f'Additional Repeat {n_repeat=}')
+        background = np.concatenate(n_repeat * [background])
+        # background = _shift(background)
+        print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
+              f'{np.abs(background.max())=}\n{x.shape=}')
+        x = .1 * x + .9 * background[:len(x)]
     else:
         print('sound_background = None')
     return x
                                     embedding_scale=1))
         x = np.concatenate(x)
+        x /= np.abs(x).max() + 1e-7  # amplify speech to full [-1,1]
         return overlay(x, scene=scene)
     # Fallback - Mimic-3
     x, fs = soundfile.read('_tmp.wav')
     x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :]  # reshapes (64,) -> (1,64)
+    return overlay(x, scene=scene)
     # ====SILENT VIDEO====
     if args.video is not None:
+        # banner - precomputed @ 1920 pixels
         frame_tts = np.zeros((104, 1920, 3), dtype=np.uint8)
         font                   = cv2.FONT_HERSHEY_SIMPLEX
         bottomLeftCornerOfText = (240, 74)  # w,h
             fontColor,
             thickness,
             lineType)
+        print(f'\n______________________________\n'
+              f'Gen Banners for TTS/Native Title {frame_tts.shape=} {frame_orig.shape=}'
+              f'\n______________________________\n')
         # ====SILENT VIDEO EXTRACT====
         # DOWNLOAD SRT from youtube
         #
         #
         video_file = args.video
         vf = VideoFileClip(video_file)
+        # GET 1st FRAME to OBTAIN frame RESOLUTION
+        h, w, _ = vf.get_frame(0).shape
+        frame_tts = _resize(frame_tts, width=w)
+        frame_orig = _resize(frame_orig, width=w)
+        h, w, _ = frame_orig.shape
         try:
+            # inpaint banner to say if native voice
             num = x_native.shape[0]
             is_tts = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, num) + 9.4))  # fade heaviside
             def inpaint_banner(get_frame, t):
                 '''blend banner - (now plays) tts or native voic
                 '''
+                im = np.copy(get_frame(t))  # pic
                 ix = int(t * 24000)
+                if is_tts[ix] > .5:     # mask == 1 => tts / mask == 0 -> native
+                    frame = frame_tts   # rename frame to rsz_frame_... because if frame_tts is mod
+                                        # then is considered a "local variable" thus the "outer var"
+                                        # is not observed by python raising referenced before assign
                 else:
                     frame = frame_orig
                 # im[-h:, -w:, :] = (.4 * im[-h:, -w:, :] + .6 * frame_orig).astype(np.uint8)
                 offset_h = 24
+                print(f'  > inpaint_banner() HAS NATIVE:  {frame.shape=} {im.shape=}\n\n\n\n')
                 im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h + offset_h, :w, :]
                                                     + .6 * frame).astype(np.uint8)
                 # im2 = np.concatenate([im, frame_tts], 0)
                 # cv2.imshow('t', im2); cv2.waitKey(); cv2.destroyAllWindows()
                 return im  # np.concatenate([im, frane_ttts], 0)
         except UnboundLocalError:  # args.native == False
             def inpaint_banner(get_frame, t):
                 im = np.copy(get_frame(t))
+                h, w, _ = frame_tts.shape      # frame = banner
+                if w != im.shape[1]:        # rsz banners to fit video w
+                    local_frame = _resize(frame_tts, width=im.shape[1])
                 offset_h = 24
                 im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h+offset_h, :w, :]
+                                                    + .6 * local_frame).astype(np.uint8)
                 return im
         vf = vf.fl(inpaint_banner)
         vf.write_videofile(SILENT_VIDEO)

audiocraft/builders.py CHANGED Viewed

@@ -79,9 +79,10 @@ class AudioGen(nn.Module):
                 conditions=attributes,
                 max_gen_len=int(self.duration * self.frame_rate)) # [bs, 4, 37 * self.lm.n_draw]
             x = self.compression_model.decode(gen_tokens, None)   #[bs, 1, 11840]
-            print('______________\nGENTOk 5', gen_tokens)
-            print('GENAUD 5', x.sum())
-        return x
     # == BUILD Fn
     def get_quantizer(self, quantizer, cfg, dimension):

                 conditions=attributes,
                 max_gen_len=int(self.duration * self.frame_rate)) # [bs, 4, 37 * self.lm.n_draw]
             x = self.compression_model.decode(gen_tokens, None)   #[bs, 1, 11840]
+            # print('______________\nGENTOk 5', gen_tokens)
+            print('GENAUD 5', x.sum(), x.shape)
+        return x / x.abs().max(2, keepdims=True)[0] + 1e-7
     # == BUILD Fn
     def get_quantizer(self, quantizer, cfg, dimension):

audiocraft/lm.py CHANGED Viewed

@@ -12,9 +12,13 @@ from audiocraft.activations import get_activation_fn
 import numpy as np
 def _shift(x):
-    n = x.shape[0]
-    i = np.random.randint(.24 * n, max(1, .74 * n))  # high should be above >= 0 TBD do we have very short segments
-    x = torch.roll(x, i, dims=2)
     return x
@@ -160,7 +164,7 @@ class LMModel(nn.Module):
         self.cfg_coef = cfg_coef
         self.condition_provider = condition_provider
         self.card = card  # 2048 ?
-        self.n_draw = 14  # replicate so many times the generation of each text in batch
         embed_dim = self.card + 1
         self.n_q = n_q
         self.dim = dim
@@ -353,11 +357,12 @@ class LMModel(nn.Module):
         out_codes = out_codes.reshape(bs, self.n_draw, 4, new_len)
         out_codes = out_codes.transpose(1, 2).reshape(bs, 4, self.n_draw * new_len)
         print(out_codes.shape, 'o')
-        out_codes = _shift(out_codes)
         # Clear Transformer k/v history (Different history is kept by 48x selfattn)
         for lay in self.transformer.layers:
              lay.self_attn.k_history = None
              lay.self_attn.v_history = None
-        return out_codes

 import numpy as np
 def _shift(x):
+    # cyclic shift of [1, 4, seq_len] slices from [bs, 4, seq_len]
+    print(x.shape, 'SHIFT\n= = = = = ')
+    for i, _slice in enumerate(x):
+        n = x.shape[2]
+        offset = np.random.randint(.24 * n, max(1, .74 * n))  # high should be above >= 0 TBD
+        print(offset)
+        x[i, :, :] = torch.roll(_slice, offset, dims=1)
     return x
         self.cfg_coef = cfg_coef
         self.condition_provider = condition_provider
         self.card = card  # 2048 ?
+        self.n_draw = 2  # replicate so many times the generation of each text in batch
         embed_dim = self.card + 1
         self.n_q = n_q
         self.dim = dim
         out_codes = out_codes.reshape(bs, self.n_draw, 4, new_len)
         out_codes = out_codes.transpose(1, 2).reshape(bs, 4, self.n_draw * new_len)
         print(out_codes.shape, 'o')
+        for _ in range(7):
+            out_codes = _shift(out_codes)
         # Clear Transformer k/v history (Different history is kept by 48x selfattn)
         for lay in self.transformer.layers:
              lay.self_attn.k_history = None
              lay.self_attn.v_history = None
+        return out_codes

demo.py CHANGED Viewed

@@ -1,14 +1,14 @@
 import audiofile
 import numpy as np
 from audiocraft import AudioGen
-text_list = ['dogs barging in the street',
-             'music']
-sound_generator = AudioGen(duration=.74,
                            device='cuda:0').to('cuda:0').eval()
 x = sound_generator.generate(text_list)  # [bs, 1, 7680]
 # print('demo', x.shape)
-x = x[0, :, :].detach().cpu().numpy()
-x /= np.abs(x).max() + 1e-7
 audiofile.write('del_seane.wav', x, 16000)

 import audiofile
 import numpy as np
 from audiocraft import AudioGen
+# Prompts generated together as one batch.
+text_list = ['dogs barging in the street',
+             'cats meowing']
+# Build the generator on the first GPU and switch it to eval mode.
+sound_generator = AudioGen(duration=0.24,
                            device='cuda:0').to('cuda:0').eval()
+# NOTE(review): the shape comment below predates duration=0.24 - confirm the length.
 x = sound_generator.generate(text_list)  # [bs, 1, 7680]
 # print('demo', x.shape)
+# Keep batch item 1 only (presumably the second prompt - verify batch order).
+x = x[1, :, :].detach().cpu().numpy()
+# x /= np.abs(x).max() + 1e-7   # inside generate()
+# Written with a sampling rate of 16000.
 audiofile.write('del_seane.wav', x, 16000)

live_api.py CHANGED Viewed

@@ -14,7 +14,7 @@ from types import SimpleNamespace
 from flask import Flask, request, send_from_directory
 from flask_cors import CORS
 from audiocraft.builders import AudioGen #, audio_write
 sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
@@ -46,7 +46,10 @@ Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
 def tts_multi_sentence(scene=None):
     if scene is not None and len(scene) >= 4:
         print(f'Processing: {scene} ..')
-        x = sound_generator.generate([scene])[0, :, :].detach().cpu().numpy()
         x /= np.abs(x).max() + 1e-7
         # is 16kHz - AUdiogen Fs

 from flask import Flask, request, send_from_directory
 from flask_cors import CORS
 from audiocraft.builders import AudioGen #, audio_write
+NUM_SOUND_GENERATIONS = 1  # they differ a lot and are unnatural to concatenate, prefer lm.n_draw
 sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
 def tts_multi_sentence(scene=None):
     if scene is not None and len(scene) >= 4:
         print(f'Processing: {scene} ..')
+        # x = sound_generator.generate([scene])[0, :, :].detach().cpu().numpy()
+        x = sound_generator.generate(
+                                        [scene] * NUM_SOUND_GENERATIONS
+                                        ).reshape(1, -1).detach().cpu().numpy() # bs, 11400
         x /= np.abs(x).max() + 1e-7
         # is 16kHz - AUdiogen Fs