Dionyssos committed
Commit 267f0b7 · 1 Parent(s): 9d6172b
Files changed (4)
  1. README.md +4 -4
  2. api.py +12 -6
  3. assets/audiobook_TTS.docx +2 -2
  4. models.py +0 -18
README.md CHANGED
@@ -39,7 +39,7 @@ Beta version of [SHIFT](https://shift-europe.eu/) TTS tool with [AudioGen sounds
 
 <details>
 <summary>
-Build virtualenv & start API
+Build virtualenv / run `api.py`
 </summary>
 
 Clone
@@ -62,11 +62,11 @@ Flask `tmux-session`
 CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=./hf_home CUDA_VISIBLE_DEVICES=0 python api.py
 ```
 
-Inference via Flask. Following examples need `api.py` to be running. [Set this IP](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/tts.py#L85) to the IP shown when starting `api.py`.
+Following examples need `api.py` to be running. [Set this IP](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/tts.py#L85) to the IP shown when starting `api.py`.
 
 </details>
 
-# SHIFT Demos
+Inference via `api.py`
 
 ## Landscape 2 Soundscapes
 
@@ -133,6 +133,6 @@ CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=
 Create audiobook from `.docx`. Listen to it - YouTube [male voice](https://www.youtube.com/watch?v=5-cpf7u18JE) / [female voice](https://www.youtube.com/watch?v=pzrLYCaWD2A)
 
 ```python
-# generated audiobook will be saved in ./tts_audiobooks
+# audiobook will be saved in ./tts_audiobooks
 python audiobook.py
 ```
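The README now frames the demos as "Inference via `api.py`". For orientation, a minimal client sketch of such a request, assuming a plain Flask POST endpoint: the URL, route, and form-field names here are illustrative guesses, not the repo's documented interface (the repo's `tts.py` is the real client).

```python
# Hypothetical client for a running api.py Flask server. This is a sketch:
# the URL, route, and field names are assumptions, not the repo's actual API.
import requests

API_URL = 'http://127.0.0.1:5000'  # assumed: replace with the IP printed when api.py starts

resp = requests.post(
    API_URL,
    data={
        'voice': 'en_US/vctk_low#p236',        # assumed voice-id format
        'soundscape': 'dogs barking',          # AudioGen sound description
        'speed': 1.0,
        'text': 'Hello from the SHIFT TTS tool.',
    },
)
with open('out.wav', 'wb') as f:
    f.write(resp.content)  # assumed: server responds with 24 kHz WAV bytes
```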
api.py CHANGED
@@ -150,7 +150,8 @@ def tts_multi_sentence(precomputed_style_vector=None,
                        text=None,
                        voice=None,
                        soundscape=None,
-                       speed=None):
+                       speed=None,
+                       diffusion_steps=7):
     '''create 24kHZ np.array with tts
 
     precomputed_style_vector : required if en_US or en_UK in voice, so
@@ -269,6 +270,8 @@ def serve_wav():
     # ====STYLE VECTOR====
 
     precomputed_style_vector = None
+    diffusion_steps = 7 # 7=native / 5=non-native
+
     if args.native: # Voice Cloning
         try:
             precomputed_style_vector = msinference.compute_style(args.native)
@@ -289,7 +292,6 @@ def serve_wav():
     if precomputed_style_vector is None:
 
         if 'en_US' in args.voice or 'en_UK' in args.voice:
-            diffusion_steps = 7 # native
             _dir = '/' if args.affective else '_v2/'
             precomputed_style_vector = msinference.compute_style(
                 'assets/wavs/style_vector' + _dir + args.voice.replace(
@@ -446,7 +448,8 @@ def serve_wav():
                 precomputed_style_vector=precomputed_style_vector,
                 voice=args.voice,
                 soundscape=args.soundscape,
-                speed=args.speed)
+                speed=args.speed,
+                diffusion_steps=diffusion_steps)
             )
         total = np.concatenate(pieces, 0)
         # x = audresample.resample(x.astype(np.float32), 24000, 22050) # reshapes (64,) -> (1,64)
@@ -467,7 +470,8 @@ def serve_wav():
                 precomputed_style_vector=precomputed_style_vector,
                 voice=args.voice,
                 soundscape=args.soundscape,
-                speed=args.speed)
+                speed=args.speed,
+                diffusion_steps=diffusion_steps)
         soundfile.write(AUDIO_TRACK, x, 24000)
 
         # IMAGE 2 SPEECH
@@ -486,7 +490,8 @@ def serve_wav():
                 precomputed_style_vector=precomputed_style_vector,
                 voice=args.voice,
                 soundscape=args.soundscape,
-                speed=args.speed
+                speed=args.speed,
+                diffusion_steps=diffusion_steps
             )
         soundfile.write(AUDIO_TRACK, x, 24000)
         if args.video or args.image:
@@ -515,7 +520,8 @@ def serve_wav():
                 precomputed_style_vector=precomputed_style_vector,
                 voice=args.voice,
                 soundscape=args.soundscape,
-                speed=args.speed)
+                speed=args.speed,
+                diffusion_steps=diffusion_steps)
         OUT_FILE = 'tmp.wav'
         soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
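Net effect of the api.py hunks: `diffusion_steps` is no longer set inside the en_US/en_UK branch but chosen once in `serve_wav()` and threaded explicitly through every `tts_multi_sentence()` call. A minimal, self-contained sketch of the new signature and call pattern; the stub body and the example argument values are placeholders, and only the keyword names, the default of 7, and the 7-vs-5 convention come from the diff.

```python
# Sketch of the parameter threading this commit introduces. Names mirror the
# diff; the stub body below is a placeholder, not api.py's implementation.
import numpy as np

def tts_multi_sentence(precomputed_style_vector=None,
                       text=None,
                       voice=None,
                       soundscape=None,
                       speed=None,
                       diffusion_steps=7):
    """Stub with the post-commit signature; the real body lives in api.py."""
    return np.zeros(24000, dtype=np.float32)  # placeholder: 1 s of 24 kHz audio

# serve_wav() now picks the step count once and passes it at every call site:
diffusion_steps = 7  # 7=native / 5=non-native, per the new comment
x = tts_multi_sentence(text='Hello.',
                       voice='en_US/vctk_low#p236',  # assumed voice-id format
                       soundscape='dogs barking',
                       speed=1.0,
                       diffusion_steps=diffusion_steps)
```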
 
assets/audiobook_TTS.docx CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:96d6503beb8a3c8c792ef28d723e9c828bc94c73242b071e9376622e27d0ccf7
-size 221278
+oid sha256:bbdcb8fe14e0713954e3fa49dc53eaca041c2ac6cfa6de098e892f5a7da38c27
+size 221307
models.py CHANGED
@@ -41,24 +41,6 @@ class LearnedDownSample(nn.Module):
     def forward(self, x):
         return self.conv(x)
 
-class LearnedUpSample(nn.Module):
-    def __init__(self, layer_type, dim_in):
-        super().__init__()
-        self.layer_type = layer_type
-
-        if self.layer_type == 'none':
-            self.conv = nn.Identity()
-        elif self.layer_type == 'timepreserve':
-            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
-        elif self.layer_type == 'half':
-            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
-        else:
-            raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-
-    def forward(self, x):
-        return self.conv(x)
-
 class DownSample(nn.Module):
     def __init__(self, layer_type):
         super().__init__()
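The deleted `LearnedUpSample` looks like dead code; nothing in this commit's hunks references it. For the record, a self-contained shape check of the two transposed-convolution branches the removed class wrapped, reconstructed from the deleted lines (tensor sizes and the axis-layout comment are illustrative assumptions):

```python
# Shape check for the deleted LearnedUpSample branches. 'timepreserve'
# doubles the second-to-last axis and keeps the last (time) axis fixed;
# 'half' doubles both, i.e. the inverse of the corresponding downsamplers.
import torch
import torch.nn as nn

dim_in = 8
x = torch.randn(1, dim_in, 16, 16)  # assumed layout: (batch, channels, freq, time)

timepreserve = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1),
                                  groups=dim_in, output_padding=(1, 0), padding=(1, 0))
half = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2),
                          groups=dim_in, output_padding=1, padding=1)

print(timepreserve(x).shape)  # torch.Size([1, 8, 32, 16]): freq doubled, time kept
print(half(x).shape)          # torch.Size([1, 8, 32, 32]): both axes doubled
```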