fx .docx
- README.md  +4 -4
- api.py  +12 -6
- assets/audiobook_TTS.docx  +2 -2
- models.py  +0 -18
README.md
CHANGED
@@ -39,7 +39,7 @@ Beta version of [SHIFT](https://shift-europe.eu/) TTS tool with [AudioGen sounds
 
 <details>
 <summary>
-Build virtualenv
+Build virtualenv / run `api.py`
 </summary>
 
 Clone
@@ -62,11 +62,11 @@ Flask `tmux-session`
 CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=./hf_home CUDA_VISIBLE_DEVICES=0 python api.py
 ```
 
-
+Following examples need `api.py` to be running. [Set this IP](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/tts.py#L85) to the IP shown when starting `api.py`.
 
 </details>
 
-
+Inference via `api.py`
 
 ## Landscape 2 Soundscapes
 
@@ -133,6 +133,6 @@ CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=
 Create audiobook from `.docx`. Listen to it - YouTube [male voice](https://www.youtube.com/watch?v=5-cpf7u18JE) / [female voice](https://www.youtube.com/watch?v=pzrLYCaWD2A)
 
 ```python
-#
+# audiobook will be saved in ./tts_audiobooks
 python audiobook.py
 ```
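The README line added above assumes a client posting text to the running `api.py` Flask service. A minimal request sketch follows; the root route, form-field names, and voice id are illustrative assumptions only — `tts.py` in this repo is the actual client and shows the real request format.

```python
# Sketch only: the route and field names ('text', 'voice', 'speed') are assumptions
# for illustration; see tts.py in this repo for the real client implementation.
import requests

API_IP = 'http://192.168.0.10:5000'  # replace with the IP printed when api.py starts

resp = requests.post(API_IP,
                     data={'text': 'Hello from the SHIFT TTS tool.',
                           'voice': 'en_US/m-ailabs_low#mary_ann',  # illustrative voice id
                           'speed': 1.0})
resp.raise_for_status()

with open('out.wav', 'wb') as f:
    f.write(resp.content)  # server replies with 24 kHz wav audio
```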
api.py
CHANGED
@@ -150,7 +150,8 @@ def tts_multi_sentence(precomputed_style_vector=None,
                        text=None,
                        voice=None,
                        soundscape=None,
-                       speed=None):
+                       speed=None,
+                       diffusion_steps=7):
     '''create 24kHZ np.array with tts
 
     precomputed_style_vector : required if en_US or en_UK in voice, so
@@ -269,6 +270,8 @@ def serve_wav():
     # ====STYLE VECTOR====
 
     precomputed_style_vector = None
+    diffusion_steps = 7  # 7=native / 5=non-native
+
     if args.native:  # Voice Cloning
         try:
             precomputed_style_vector = msinference.compute_style(args.native)
@@ -289,7 +292,6 @@ def serve_wav():
     if precomputed_style_vector is None:
 
         if 'en_US' in args.voice or 'en_UK' in args.voice:
-            diffusion_steps = 7  # native
             _dir = '/' if args.affective else '_v2/'
             precomputed_style_vector = msinference.compute_style(
                 'assets/wavs/style_vector' + _dir + args.voice.replace(
@@ -446,7 +448,8 @@ def serve_wav():
                 precomputed_style_vector=precomputed_style_vector,
                 voice=args.voice,
                 soundscape=args.soundscape,
-                speed=args.speed
+                speed=args.speed,
+                diffusion_steps=diffusion_steps)
                 )
         total = np.concatenate(pieces, 0)
         # x = audresample.resample(x.astype(np.float32), 24000, 22050)  # reshapes (64,) -> (1,64)
@@ -467,7 +470,8 @@ def serve_wav():
             precomputed_style_vector=precomputed_style_vector,
             voice=args.voice,
             soundscape=args.soundscape,
-            speed=args.speed)
+            speed=args.speed,
+            diffusion_steps=diffusion_steps)
         soundfile.write(AUDIO_TRACK, x, 24000)
 
     # IMAGE 2 SPEECH
@@ -486,7 +490,8 @@ def serve_wav():
             precomputed_style_vector=precomputed_style_vector,
             voice=args.voice,
             soundscape=args.soundscape,
-            speed=args.speed
+            speed=args.speed,
+            diffusion_steps=diffusion_steps
             )
         soundfile.write(AUDIO_TRACK, x, 24000)
         if args.video or args.image:
@@ -515,7 +520,8 @@ def serve_wav():
             precomputed_style_vector=precomputed_style_vector,
             voice=args.voice,
             soundscape=args.soundscape,
-            speed=args.speed)
+            speed=args.speed,
+            diffusion_steps=diffusion_steps)
         OUT_FILE = 'tmp.wav'
         soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
 
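Net effect of the `api.py` change: `serve_wav` now picks `diffusion_steps` once (7 for native en_US/en_UK voices, 5 for non-native, per the new comment) and threads it into every `tts_multi_sentence` call. A minimal sketch of calling the updated function directly is below; the parameter names come from the new signature, while the style-vector path and voice id are illustrative assumptions.

```python
# Sketch only: parameter names match the updated tts_multi_sentence signature;
# the style-vector wav path and voice id below are illustrative assumptions.
import soundfile
import msinference
from api import tts_multi_sentence

style = msinference.compute_style('assets/wavs/style_vector/en_US_m-ailabs_mary_ann.wav')

x = tts_multi_sentence(text='Hello from the SHIFT TTS tool.',
                       precomputed_style_vector=style,
                       voice='en_US_m-ailabs_mary_ann',
                       soundscape=None,
                       diffusion_steps=7)  # new kwarg: 7 = native en_US/en_UK, 5 = non-native

soundfile.write('sample.wav', x, 24000)  # tts_multi_sentence returns a 24 kHz np.array
```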
assets/audiobook_TTS.docx
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:bbdcb8fe14e0713954e3fa49dc53eaca041c2ac6cfa6de098e892f5a7da38c27
+size 221307
models.py
CHANGED
@@ -41,24 +41,6 @@ class LearnedDownSample(nn.Module):
     def forward(self, x):
         return self.conv(x)
 
-class LearnedUpSample(nn.Module):
-    def __init__(self, layer_type, dim_in):
-        super().__init__()
-        self.layer_type = layer_type
-
-        if self.layer_type == 'none':
-            self.conv = nn.Identity()
-        elif self.layer_type == 'timepreserve':
-            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
-        elif self.layer_type == 'half':
-            self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
-        else:
-            raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-
-
-    def forward(self, x):
-        return self.conv(x)
-
 class DownSample(nn.Module):
     def __init__(self, layer_type):
         super().__init__()