Dionyssos committed
Commit 9d6172b · 1 Parent(s): 2a3699d

tune voice for audiobook

Files changed (5):
  1. .gitattributes +1 -1
  2. README.md +5 -8
  3. api.py +6 -3
  4. audiobook.py +4 -3
  5. msinference.py +1 -1
.gitattributes CHANGED
@@ -46,4 +46,4 @@ uc_spk_Landscape2Soundscape_Masterpieces_pics/10_Boecklin_967648_NG2-80_001_rsz.
 uc_spk_Landscape2Soundscape_Masterpieces_pics/03_Schinkel_WS200-002.jpg filter=lfs diff=lfs merge=lfs -text
 uc_spk_Landscape2Soundscape_Masterpieces_pics/11_Liebermann_NG4-94_001.jpg filter=lfs diff=lfs merge=lfs -text
 uc_spk_Landscape2Soundscape_Masterpieces_pics/12_Slevogt_AII1022_001.jpg filter=lfs diff=lfs merge=lfs -text
-audiobook_TTS.docx filter=lfs diff=lfs merge=lfs -text
+assets/audiobook_TTS.docx filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -26,10 +26,10 @@ Beta version of [SHIFT](https://shift-europe.eu/) TTS tool with [AudioGen sounds
 - [Analysis of emotion of SHIFT TTS](https://huggingface.co/dkounadis/artificial-styletts2/discussions/2)
 - [Listen Also foreign languages](https://huggingface.co/dkounadis/artificial-styletts2/discussions/4) synthesized via [MMS TTS](https://huggingface.co/facebook/mms-tts)
 
-## Available Voices
+## Listen to Available Voices!
 
 
-<a href="https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#67854dcbd3e6beb1a78f7f20">Native English!</a> / <a href="https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6">Non-native English accents!</a> / <a href="https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv">Foreign languages</a>
+<a href="https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#67854dcbd3e6beb1a78f7f20">Native English</a> / <a href="https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6">Non-native English: Accents</a> / <a href="https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv">Foreign languages</a>
 
 ##
 
@@ -62,16 +62,13 @@ Flask `tmux-session`
 CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=./hf_home CUDA_VISIBLE_DEVICES=0 python api.py
 ```
 
-Inference via Flask
-
-Following examples need `api.py` to be running. [Set this IP](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/tts.py#L85) to the IP shown when starting `api.py`.
-
-## Examples For Cultural Heritage Partners - SHIFT
+Inference via Flask. Following examples need `api.py` to be running. [Set this IP](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/tts.py#L85) to the IP shown when starting `api.py`.
 
 </details>
 
+# SHIFT Demos
 
-## Landscape 2 Soundscape
+## Landscape 2 Soundscapes
 
 The following needs `api.py` to be already running on a tmux session.
 
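For context on the README's Flask instructions: once `api.py` is running, client scripts in the repo (e.g. `tts.py`) send requests to the printed IP. The sketch below only illustrates such a client; the actual route, payload field names, and port are defined in `api.py`/`tts.py` and are assumptions here.

```python
# Hypothetical client sketch -- route, field names and port are placeholders;
# check tts.py for the real request format used against api.py.
import requests

API_URL = "http://192.168.0.10:5000/"   # replace with the IP printed when api.py starts

payload = {
    "text": "Welcome to the SHIFT audiobook demo.",
    "voice": "en_US/vctk_low#p326",      # native-English voice, as selected in audiobook.py
}

resp = requests.post(API_URL, data=payload, timeout=300)
with open("out.wav", "wb") as f:
    f.write(resp.content)                # serve_wav() is expected to return audio bytes
```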
api.py CHANGED
@@ -170,7 +170,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
                 precomputed_style_vector,
                 alpha=0.3,
                 beta=0.7,
-                diffusion_steps=7,
+                diffusion_steps=diffusion_steps,
                 embedding_scale=1))
     x = np.concatenate(x)
 
@@ -286,8 +286,10 @@ def serve_wav():
 
     # Native Eng
 
-    if precomputed_style_vector is None:  # TODO 7 diffusion steps
+    if precomputed_style_vector is None:
+
         if 'en_US' in args.voice or 'en_UK' in args.voice:
+            diffusion_steps = 7  # native
             _dir = '/' if args.affective else '_v2/'
             precomputed_style_vector = msinference.compute_style(
                 'assets/wavs/style_vector' + _dir + args.voice.replace(
@@ -298,11 +300,12 @@
 
         # Non-Native Eng
 
-        elif '_' in args.voice:  # TODO 5 diffusion steps
+        elif '_' in args.voice:
             precomputed_style_vector = msinference.compute_style('assets/wavs/mimic3_foreign_4x/' + args.voice.replace(
                 '/', '_').replace('#', '_').replace(
                 'cmu-arctic', 'cmu_arctic').replace(
                 '_low', '') + '.wav')
+            diffusion_steps = 5  # non-native
 
         # Foreign Lang - MMS/TTS
         else:
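The `api.py` change replaces the hard-coded `diffusion_steps=7` with a value chosen per voice: 7 steps for native English voices, 5 for non-native accents, while foreign-language voices are routed to MMS TTS and never reach the StyleTTS2 sampler. A minimal sketch of that selection (the helper name is illustrative, not part of the repo):

```python
# Sketch of the per-voice setting introduced by this commit; not the full serve_wav() logic.
def pick_diffusion_steps(voice: str):
    if 'en_US' in voice or 'en_UK' in voice:
        return 7      # native English voices
    if '_' in voice:
        return 5      # non-native English accents (mimic3 foreign speakers)
    return None       # foreign languages use MMS TTS, so no StyleTTS2 diffusion steps apply
```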
audiobook.py CHANGED
@@ -19,13 +19,14 @@ Path(ROOT_DIR).mkdir(parents=True,
                      exist_ok=True)
 voices = [
     # 'en_US/vctk_low#p228',  # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#67854dcbd3e6beb1a78f7f20
-    'af_ZA_google-nwu_0184',  # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
-    # 'en_US/vctk_low#p326',
+    # 'af_ZA_google-nwu_0184',  # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
+    'en_US/vctk_low#p326',  # Native voice
+    # 'jv_ID_google-gmu_06207',
 ]  # select any voice from - https://audeering.github.io/shift/
 
 #urllib.request.urlretrieve("https://github.com/audeering/shift/raw/refs/heads/main/assets/INCLUSION_IN_MUSEUMS_audiobook.docx", "audiobook_TTS.docx")
 
-d = docx.Document('audiobook_TTS.docx')  # slightly edited from the original .docx so it reads well aloud (small words such as 'by', 'them', 'from', 'this', 'of' added)
+d = docx.Document('assets/audiobook_TTS.docx')  # slightly edited from the original .docx so it reads well aloud (small words such as 'by', 'them', 'from', 'this', 'of' added)
 
 last_paragraph_was_silence = False  # add silence only once, at the first empty paragraph we detect
 
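`last_paragraph_was_silence` exists so that a run of several empty paragraphs in the .docx produces a single pause rather than one pause per empty paragraph. A rough sketch of that loop, assuming 24 kHz output and a placeholder `synth()` standing in for the actual TTS call:

```python
import docx
import numpy as np

SR = 24000                                   # assumed sample rate of the TTS output
d = docx.Document('assets/audiobook_TTS.docx')

def synth(text: str) -> np.ndarray:
    # Placeholder for the real TTS call (e.g. msinference.inference with a style vector).
    return np.zeros(SR, dtype=np.float32)

audio = []
last_paragraph_was_silence = False
for p in d.paragraphs:
    text = p.text.strip()
    if text:
        audio.append(synth(text))
        last_paragraph_was_silence = False
    elif not last_paragraph_was_silence:
        audio.append(np.zeros(SR // 2, dtype=np.float32))  # one 0.5 s pause per gap (duration assumed)
        last_paragraph_was_silence = True

full = np.concatenate(audio)
```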
msinference.py CHANGED
@@ -173,7 +173,7 @@ def inference(text,
               ref_s,
               alpha = 0.3,
               beta = 0.7,
-              diffusion_steps=5,  # diffusion_steps=5 sounds more pleasant than 7
+              diffusion_steps=7,  # 7 if voice is native English else 5 for non-native
               embedding_scale=1,
               use_gruut=False):
     text = text.strip()
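With the new default, calling `msinference.inference()` without an explicit `diffusion_steps` now uses 7 steps, matching what `api.py` passes for native English voices. A minimal direct-call sketch (the reference wav path is a placeholder; `api.py` builds the real path from the voice name):

```python
import msinference

# Placeholder reference wav for the style vector.
style = msinference.compute_style('assets/wavs/style_vector/en_US_vctk_low_p326.wav')

wav = msinference.inference('A short test sentence.',
                            style,
                            alpha=0.3,
                            beta=0.7,
                            diffusion_steps=7,   # 7 for native English, 5 for non-native
                            embedding_scale=1)
```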