tune voice for audiobook
Browse files
- .gitattributes +1 -1
- README.md +5 -8
- api.py +6 -3
- audiobook.py +4 -3
- msinference.py +1 -1
.gitattributes
CHANGED
@@ -46,4 +46,4 @@ uc_spk_Landscape2Soundscape_Masterpieces_pics/10_Boecklin_967648_NG2-80_001_rsz.
|
|
46 |
uc_spk_Landscape2Soundscape_Masterpieces_pics/03_Schinkel_WS200-002.jpg filter=lfs diff=lfs merge=lfs -text
|
47 |
uc_spk_Landscape2Soundscape_Masterpieces_pics/11_Liebermann_NG4-94_001.jpg filter=lfs diff=lfs merge=lfs -text
|
48 |
uc_spk_Landscape2Soundscape_Masterpieces_pics/12_Slevogt_AII1022_001.jpg filter=lfs diff=lfs merge=lfs -text
|
49 |
-
audiobook_TTS.docx filter=lfs diff=lfs merge=lfs -text
|
|
|
46 |
uc_spk_Landscape2Soundscape_Masterpieces_pics/03_Schinkel_WS200-002.jpg filter=lfs diff=lfs merge=lfs -text
|
47 |
uc_spk_Landscape2Soundscape_Masterpieces_pics/11_Liebermann_NG4-94_001.jpg filter=lfs diff=lfs merge=lfs -text
|
48 |
uc_spk_Landscape2Soundscape_Masterpieces_pics/12_Slevogt_AII1022_001.jpg filter=lfs diff=lfs merge=lfs -text
|
49 |
+
assets/audiobook_TTS.docx filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -26,10 +26,10 @@ Beta version of [SHIFT](https://shift-europe.eu/) TTS tool with [AudioGen sounds
|
|
26 |
- [Analysis of emotion of SHIFT TTS](https://huggingface.co/dkounadis/artificial-styletts2/discussions/2)
|
27 |
- [Listen Also foreign languages](https://huggingface.co/dkounadis/artificial-styletts2/discussions/4) synthesized via [MMS TTS](https://huggingface.co/facebook/mms-tts)
|
28 |
|
29 |
-
## Available Voices
|
30 |
|
31 |
|
32 |
-
<a href="https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#67854dcbd3e6beb1a78f7f20">Native English
|
33 |
|
34 |
##
|
35 |
|
@@ -62,16 +62,13 @@ Flask `tmux-session`
|
|
62 |
CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=./hf_home CUDA_VISIBLE_DEVICES=0 python api.py
|
63 |
```
|
64 |
|
65 |
-
Inference via Flask
|
66 |
-
|
67 |
-
Following examples need `api.py` to be running. [Set this IP](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/tts.py#L85) to the IP shown when starting `api.py`.
|
68 |
-
|
69 |
-
## Examples For Cultural Heritage Partners - SHIFT
|
70 |
|
71 |
</details>
|
72 |
|
|
|
73 |
|
74 |
-
## Landscape 2
|
75 |
|
76 |
The following needs `api.py` to be already running on a tmux session.
|
77 |
|
|
|
26 |
- [Analysis of emotion of SHIFT TTS](https://huggingface.co/dkounadis/artificial-styletts2/discussions/2)
|
27 |
- [Listen Also foreign languages](https://huggingface.co/dkounadis/artificial-styletts2/discussions/4) synthesized via [MMS TTS](https://huggingface.co/facebook/mms-tts)
|
28 |
|
29 |
+
## Listen to Available Voices!
|
30 |
|
31 |
|
32 |
+
<a href="https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#67854dcbd3e6beb1a78f7f20">Native English</a> / <a href="https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6">Non-native English: Accents</a> / <a href="https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv">Foreign languages</a>
|
33 |
|
34 |
##
|
35 |
|
|
|
62 |
CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=./hf_home CUDA_VISIBLE_DEVICES=0 python api.py
|
63 |
```
|
64 |
|
65 |
+
Inference via Flask. The following examples need `api.py` to be running. [Set this IP](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/tts.py#L85) to the IP shown when starting `api.py`.
|
|
|
|
|
|
|
|
|
66 |
|
67 |
</details>
|
68 |
|
69 |
+
# SHIFT Demos
|
70 |
|
71 |
+
## Landscape 2 Soundscapes
|
72 |
|
73 |
The following needs `api.py` to be already running on a tmux session.
|
74 |
|
api.py
CHANGED
@@ -170,7 +170,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
|
|
170 |
precomputed_style_vector,
|
171 |
alpha=0.3,
|
172 |
beta=0.7,
|
173 |
-
diffusion_steps=
|
174 |
embedding_scale=1))
|
175 |
x = np.concatenate(x)
|
176 |
|
@@ -286,8 +286,10 @@ def serve_wav():
|
|
286 |
|
287 |
# Native Eng
|
288 |
|
289 |
-
if precomputed_style_vector is None:
|
|
|
290 |
if 'en_US' in args.voice or 'en_UK' in args.voice:
|
|
|
291 |
_dir = '/' if args.affective else '_v2/'
|
292 |
precomputed_style_vector = msinference.compute_style(
|
293 |
'assets/wavs/style_vector' + _dir + args.voice.replace(
|
@@ -298,11 +300,12 @@ def serve_wav():
|
|
298 |
|
299 |
# Non-Native Eng
|
300 |
|
301 |
-
elif '_' in args.voice:
|
302 |
precomputed_style_vector = msinference.compute_style('assets/wavs/mimic3_foreign_4x/' + args.voice.replace(
|
303 |
'/', '_').replace('#', '_').replace(
|
304 |
'cmu-arctic', 'cmu_arctic').replace(
|
305 |
'_low', '') + '.wav')
|
|
|
306 |
|
307 |
# Foreign Lang - MMS/TTS
|
308 |
else:
|
|
|
170 |
precomputed_style_vector,
|
171 |
alpha=0.3,
|
172 |
beta=0.7,
|
173 |
+
diffusion_steps=diffusion_steps,
|
174 |
embedding_scale=1))
|
175 |
x = np.concatenate(x)
|
176 |
|
|
|
286 |
|
287 |
# Native Eng
|
288 |
|
289 |
+
if precomputed_style_vector is None:
|
290 |
+
|
291 |
if 'en_US' in args.voice or 'en_UK' in args.voice:
|
292 |
+
diffusion_steps = 7 # native
|
293 |
_dir = '/' if args.affective else '_v2/'
|
294 |
precomputed_style_vector = msinference.compute_style(
|
295 |
'assets/wavs/style_vector' + _dir + args.voice.replace(
|
|
|
300 |
|
301 |
# Non-Native Eng
|
302 |
|
303 |
+
elif '_' in args.voice:
|
304 |
precomputed_style_vector = msinference.compute_style('assets/wavs/mimic3_foreign_4x/' + args.voice.replace(
|
305 |
'/', '_').replace('#', '_').replace(
|
306 |
'cmu-arctic', 'cmu_arctic').replace(
|
307 |
'_low', '') + '.wav')
|
308 |
+
diffusion_steps = 5 # non-native
|
309 |
|
310 |
# Foreign Lang - MMS/TTS
|
311 |
else:
|
audiobook.py
CHANGED
@@ -19,13 +19,14 @@ Path(ROOT_DIR).mkdir(parents=True,
|
|
19 |
exist_ok=True)
|
20 |
voices = [
|
21 |
# 'en_US/vctk_low#p228', # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#67854dcbd3e6beb1a78f7f20
|
22 |
-
'af_ZA_google-nwu_0184', # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
|
23 |
-
|
|
|
24 |
] # select any voice from - https://audeering.github.io/shift/
|
25 |
|
26 |
#urllib.request.urlretrieve("https://github.com/audeering/shift/raw/refs/heads/main/assets/INCLUSION_IN_MUSEUMS_audiobook.docx", "audiobook_TTS.docx")
|
27 |
|
28 |
-
d = docx.Document('audiobook_TTS.docx') # slightly changed from the original .docx to be audible as by adding extra 'by them from this of etc.'
|
29 |
|
30 |
last_paragraph_was_silence = False # to know to add silence only once after only at the 1st empty paragraph we detect
|
31 |
|
|
|
19 |
exist_ok=True)
|
20 |
voices = [
|
21 |
# 'en_US/vctk_low#p228', # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#67854dcbd3e6beb1a78f7f20
|
22 |
+
# 'af_ZA_google-nwu_0184', # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
|
23 |
+
'en_US/vctk_low#p326', # Native voice
|
24 |
+
# 'jv_ID_google-gmu_06207',
|
25 |
] # select any voice from - https://audeering.github.io/shift/
|
26 |
|
27 |
#urllib.request.urlretrieve("https://github.com/audeering/shift/raw/refs/heads/main/assets/INCLUSION_IN_MUSEUMS_audiobook.docx", "audiobook_TTS.docx")
|
28 |
|
29 |
+
d = docx.Document('assets/audiobook_TTS.docx') # slightly changed from the original .docx to be audible, i.e. by adding extra words ('by', 'them', 'from', 'this', 'of', etc.)
|
30 |
|
31 |
last_paragraph_was_silence = False # to know to add silence only once after only at the 1st empty paragraph we detect
|
32 |
|
msinference.py
CHANGED
@@ -173,7 +173,7 @@ def inference(text,
|
|
173 |
ref_s,
|
174 |
alpha = 0.3,
|
175 |
beta = 0.7,
|
176 |
-
diffusion_steps=
|
177 |
embedding_scale=1,
|
178 |
use_gruut=False):
|
179 |
text = text.strip()
|
|
|
173 |
ref_s,
|
174 |
alpha = 0.3,
|
175 |
beta = 0.7,
|
176 |
+
diffusion_steps=7, # 7 if voice is native English else 5 for non-native
|
177 |
embedding_scale=1,
|
178 |
use_gruut=False):
|
179 |
text = text.strip()
|