Dionyssos committed
Commit 267f0b7 · 1 Parent(s): 9d6172b
Files changed (4)
  1. README.md +4 -4
  2. api.py +12 -6
  3. assets/audiobook_TTS.docx +2 -2
  4. models.py +0 -18
README.md CHANGED
@@ -39,7 +39,7 @@ Beta version of [SHIFT](https://shift-europe.eu/) TTS tool with [AudioGen sounds
 
 <details>
 <summary>
-Build virtualenv & start API
+Build virtualenv / run `api.py`
 </summary>
 
 Clone
@@ -62,11 +62,11 @@ Flask `tmux-session`
 CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=./hf_home CUDA_VISIBLE_DEVICES=0 python api.py
 ```
 
-Inference via Flask. Following examples need `api.py` to be running. [Set this IP](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/tts.py#L85) to the IP shown when starting `api.py`.
+Following examples need `api.py` to be running. [Set this IP](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/tts.py#L85) to the IP shown when starting `api.py`.
 
 </details>
 
-# SHIFT Demos
+Inference via `api.py`
 
 ## Landscape 2 Soundscapes
 
@@ -133,6 +133,6 @@ CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=
 Create audiobook from `.docx`. Listen to it - YouTube [male voice](https://www.youtube.com/watch?v=5-cpf7u18JE) / [female voice](https://www.youtube.com/watch?v=pzrLYCaWD2A)
 
 ```python
-# generated audiobook will be saved in ./tts_audiobooks
+# audiobook will be saved in ./tts_audiobooks
 python audiobook.py
 ```
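The README now frames the demos as "Inference via `api.py`". For orientation, a minimal client sketch of such a request, assuming a plain Flask POST endpoint: the URL, route, and form-field names here are illustrative guesses, not the repo's documented interface (the repo's `tts.py` is the real client).

```python
# Hypothetical client for a running api.py Flask server. This is a sketch:
# the URL, route, and field names are assumptions, not the repo's actual API.
import requests

API_URL = 'http://127.0.0.1:5000'  # assumed: replace with the IP printed when api.py starts

resp = requests.post(
    API_URL,
    data={
        'voice': 'en_US/vctk_low#p236',        # assumed voice-id format
        'soundscape': 'dogs barking',          # AudioGen sound description
        'speed': 1.0,
        'text': 'Hello from the SHIFT TTS tool.',
    },
)
with open('out.wav', 'wb') as f:
    f.write(resp.content)  # assumed: server responds with 24 kHz WAV bytes
```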
api.py CHANGED
@@ -150,7 +150,8 @@ def tts_multi_sentence(precomputed_style_vector=None,
                        text=None,
                        voice=None,
                        soundscape=None,
-                       speed=None):
+                       speed=None,
+                       diffusion_steps=7):
     '''create 24kHZ np.array with tts
 
     precomputed_style_vector : required if en_US or en_UK in voice, so
@@ -269,6 +270,8 @@ def serve_wav():
     # ====STYLE VECTOR====
 
     precomputed_style_vector = None
+    diffusion_steps = 7 # 7=native / 5=non-native
+
     if args.native: # Voice Cloning
         try:
             precomputed_style_vector = msinference.compute_style(args.native)
@@ -289,7 +292,6 @@ def serve_wav():
     if precomputed_style_vector is None:
 
         if 'en_US' in args.voice or 'en_UK' in args.voice:
-            diffusion_steps = 7 # native
             _dir = '/' if args.affective else '_v2/'
             precomputed_style_vector = msinference.compute_style(
                 'assets/wavs/style_vector' + _dir + args.voice.replace(
@@ -446,7 +448,8 @@ def serve_wav():
                 precomputed_style_vector=precomputed_style_vector,
                 voice=args.voice,
                 soundscape=args.soundscape,
-                speed=args.speed)
+                speed=args.speed,
+                diffusion_steps=diffusion_steps)
             )
         total = np.concatenate(pieces, 0)
         # x = audresample.resample(x.astype(np.float32), 24000, 22050) # reshapes (64,) -> (1,64)
@@ -467,7 +470,8 @@ def serve_wav():
                 precomputed_style_vector=precomputed_style_vector,
                 voice=args.voice,
                 soundscape=args.soundscape,
-                speed=args.speed)
+                speed=args.speed,
+                diffusion_steps=diffusion_steps)
         soundfile.write(AUDIO_TRACK, x, 24000)
 
         # IMAGE 2 SPEECH
@@ -486,7 +490,8 @@ def serve_wav():
                 precomputed_style_vector=precomputed_style_vector,
                 voice=args.voice,
                 soundscape=args.soundscape,
-                speed=args.speed
+                speed=args.speed,
+                diffusion_steps=diffusion_steps
             )
         soundfile.write(AUDIO_TRACK, x, 24000)
         if args.video or args.image:
@@ -515,7 +520,8 @@ def serve_wav():
                 precomputed_style_vector=precomputed_style_vector,
                 voice=args.voice,
                 soundscape=args.soundscape,
-                speed=args.speed)
+                speed=args.speed,
+                diffusion_steps=diffusion_steps)
         OUT_FILE = 'tmp.wav'
         soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
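Net effect of the api.py hunks: `diffusion_steps` is no longer set inside the en_US/en_UK branch but chosen once in `serve_wav()` and threaded explicitly through every `tts_multi_sentence()` call. A minimal, self-contained sketch of the new signature and call pattern; the stub body and the example argument values are placeholders, and only the keyword names, the default of 7, and the 7-vs-5 convention come from the diff.

```python
# Sketch of the parameter threading this commit introduces. Names mirror the
# diff; the stub body below is a placeholder, not api.py's implementation.
import numpy as np

def tts_multi_sentence(precomputed_style_vector=None,
                       text=None,
                       voice=None,
                       soundscape=None,
                       speed=None,
                       diffusion_steps=7):
    """Stub with the post-commit signature; the real body lives in api.py."""
    return np.zeros(24000, dtype=np.float32)  # placeholder: 1 s of 24 kHz audio

# serve_wav() now picks the step count once and passes it at every call site:
diffusion_steps = 7  # 7=native / 5=non-native, per the new comment
x = tts_multi_sentence(text='Hello.',
                       voice='en_US/vctk_low#p236',  # assumed voice-id format
                       soundscape='dogs barking',
                       speed=1.0,
                       diffusion_steps=diffusion_steps)
```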
 
assets/audiobook_TTS.docx CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:96d6503beb8a3c8c792ef28d723e9c828bc94c73242b071e9376622e27d0ccf7
-size 221278
+oid sha256:bbdcb8fe14e0713954e3fa49dc53eaca041c2ac6cfa6de098e892f5a7da38c27
+size 221307
models.py CHANGED
@@ -41,24 +41,6 @@ class LearnedDownSample(nn.Module):
     def forward(self, x):
         return self.conv(x)
 
-class LearnedUpSample(nn.Module):
-    def __init__(self, layer_type, dim_in):
-        super().__init__()
-        self.layer_type = layer_type
-
-        if self.layer_type == 'none':
-            self.conv = nn.Identity()
-        elif self.layer_type == 'timepreserve':
-            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
-        elif self.layer_type == 'half':
-            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
-        else:
-            raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-
-    def forward(self, x):
-        return self.conv(x)
-
 class DownSample(nn.Module):
     def __init__(self, layer_type):
         super().__init__()
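The deleted `LearnedUpSample` looks like dead code; nothing in this commit's hunks references it. For the record, a self-contained shape check of the two transposed-convolution branches the removed class wrapped, reconstructed from the deleted lines (tensor sizes and the axis-layout comment are illustrative assumptions):

```python
# Shape check for the deleted LearnedUpSample branches. 'timepreserve'
# doubles the second-to-last axis and keeps the last (time) axis fixed;
# 'half' doubles both, i.e. the inverse of the corresponding downsamplers.
import torch
import torch.nn as nn

dim_in = 8
x = torch.randn(1, dim_in, 16, 16)  # assumed layout: (batch, channels, freq, time)

timepreserve = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1),
                                  groups=dim_in, output_padding=(1, 0), padding=(1, 0))
half = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2),
                          groups=dim_in, output_padding=1, padding=1)

print(timepreserve(x).shape)  # torch.Size([1, 8, 32, 16]): freq doubled, time kept
print(half(x).shape)          # torch.Size([1, 8, 32, 32]): both axes doubled
```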