Dionyssos committed
Commit 9d6172b · 1 Parent(s): 2a3699d

tune voice for audiobook

Files changed (5):
  1. .gitattributes +1 -1
  2. README.md +5 -8
  3. api.py +6 -3
  4. audiobook.py +4 -3
  5. msinference.py +1 -1
.gitattributes CHANGED
@@ -46,4 +46,4 @@ uc_spk_Landscape2Soundscape_Masterpieces_pics/10_Boecklin_967648_NG2-80_001_rsz.
 uc_spk_Landscape2Soundscape_Masterpieces_pics/03_Schinkel_WS200-002.jpg filter=lfs diff=lfs merge=lfs -text
 uc_spk_Landscape2Soundscape_Masterpieces_pics/11_Liebermann_NG4-94_001.jpg filter=lfs diff=lfs merge=lfs -text
 uc_spk_Landscape2Soundscape_Masterpieces_pics/12_Slevogt_AII1022_001.jpg filter=lfs diff=lfs merge=lfs -text
-audiobook_TTS.docx filter=lfs diff=lfs merge=lfs -text
+assets/audiobook_TTS.docx filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -26,10 +26,10 @@ Beta version of [SHIFT](https://shift-europe.eu/) TTS tool with [AudioGen sounds
 - [Analysis of emotion of SHIFT TTS](https://huggingface.co/dkounadis/artificial-styletts2/discussions/2)
 - [Listen Also foreign languages](https://huggingface.co/dkounadis/artificial-styletts2/discussions/4) synthesized via [MMS TTS](https://huggingface.co/facebook/mms-tts)
 
-## Available Voices
+## Listen to Available Voices!
 
 
-<a href="https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#67854dcbd3e6beb1a78f7f20">Native English!</a> / <a href="https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6">Non-native English accents!</a> / <a href="https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv">Foreign languages</a>
+<a href="https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#67854dcbd3e6beb1a78f7f20">Native English</a> / <a href="https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6">Non-native English: Accents</a> / <a href="https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv">Foreign languages</a>
 
 ##
 
@@ -62,16 +62,13 @@ Flask `tmux-session`
 CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=./hf_home CUDA_VISIBLE_DEVICES=0 python api.py
 ```
 
-Inference via Flask
-
-Following examples need `api.py` to be running. [Set this IP](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/tts.py#L85) to the IP shown when starting `api.py`.
-
-## Examples For Cultural Heritage Partners - SHIFT
+Inference via Flask. Following examples need `api.py` to be running. [Set this IP](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/tts.py#L85) to the IP shown when starting `api.py`.
 
 </details>
 
+# SHIFT Demos
 
-## Landscape 2 Soundscape
+## Landscape 2 Soundscapes
 
 The following needs `api.py` to be already running on a tmux session.
 
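For context on the README's Flask instructions: once `api.py` is running, client scripts in the repo (e.g. `tts.py`) send requests to the printed IP. The sketch below only illustrates such a client; the actual route, payload field names, and port are defined in `api.py`/`tts.py` and are assumptions here.

```python
# Hypothetical client sketch -- route, field names and port are placeholders;
# check tts.py for the real request format used against api.py.
import requests

API_URL = "http://192.168.0.10:5000/"   # replace with the IP printed when api.py starts

payload = {
    "text": "Welcome to the SHIFT audiobook demo.",
    "voice": "en_US/vctk_low#p326",      # native-English voice, as selected in audiobook.py
}

resp = requests.post(API_URL, data=payload, timeout=300)
with open("out.wav", "wb") as f:
    f.write(resp.content)                # serve_wav() is expected to return audio bytes
```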
api.py CHANGED
@@ -170,7 +170,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
                 precomputed_style_vector,
                 alpha=0.3,
                 beta=0.7,
-                diffusion_steps=7,
+                diffusion_steps=diffusion_steps,
                 embedding_scale=1))
     x = np.concatenate(x)
 
@@ -286,8 +286,10 @@ def serve_wav():
 
     # Native Eng
 
-    if precomputed_style_vector is None:  # TODO 7 diffusion steps
+    if precomputed_style_vector is None:
+
         if 'en_US' in args.voice or 'en_UK' in args.voice:
+            diffusion_steps = 7  # native
             _dir = '/' if args.affective else '_v2/'
             precomputed_style_vector = msinference.compute_style(
                 'assets/wavs/style_vector' + _dir + args.voice.replace(
@@ -298,11 +300,12 @@
 
         # Non-Native Eng
 
-        elif '_' in args.voice:  # TODO 5 diffusion steps
+        elif '_' in args.voice:
             precomputed_style_vector = msinference.compute_style('assets/wavs/mimic3_foreign_4x/' + args.voice.replace(
                 '/', '_').replace('#', '_').replace(
                 'cmu-arctic', 'cmu_arctic').replace(
                 '_low', '') + '.wav')
+            diffusion_steps = 5  # non-native
 
         # Foreign Lang - MMS/TTS
         else:
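The `api.py` change replaces the hard-coded `diffusion_steps=7` with a value chosen per voice: 7 steps for native English voices, 5 for non-native accents, while foreign-language voices are routed to MMS TTS and never reach the StyleTTS2 sampler. A minimal sketch of that selection (the helper name is illustrative, not part of the repo):

```python
# Sketch of the per-voice setting introduced by this commit; not the full serve_wav() logic.
def pick_diffusion_steps(voice: str):
    if 'en_US' in voice or 'en_UK' in voice:
        return 7      # native English voices
    if '_' in voice:
        return 5      # non-native English accents (mimic3 foreign speakers)
    return None       # foreign languages use MMS TTS, so no StyleTTS2 diffusion steps apply
```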
audiobook.py CHANGED
@@ -19,13 +19,14 @@ Path(ROOT_DIR).mkdir(parents=True,
                      exist_ok=True)
 voices = [
     # 'en_US/vctk_low#p228',  # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#67854dcbd3e6beb1a78f7f20
-    'af_ZA_google-nwu_0184',  # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
-    # 'en_US/vctk_low#p326',
+    # 'af_ZA_google-nwu_0184',  # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
+    'en_US/vctk_low#p326',  # Native voice
+    # 'jv_ID_google-gmu_06207',
 ]  # select any voice from - https://audeering.github.io/shift/
 
 #urllib.request.urlretrieve("https://github.com/audeering/shift/raw/refs/heads/main/assets/INCLUSION_IN_MUSEUMS_audiobook.docx", "audiobook_TTS.docx")
 
-d = docx.Document('audiobook_TTS.docx')  # slightly edited from the original .docx so it reads well aloud (small words such as 'by', 'them', 'from', 'this', 'of' added)
+d = docx.Document('assets/audiobook_TTS.docx')  # slightly edited from the original .docx so it reads well aloud (small words such as 'by', 'them', 'from', 'this', 'of' added)
 
 last_paragraph_was_silence = False  # add silence only once, at the first empty paragraph we detect
 
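`last_paragraph_was_silence` exists so that a run of several empty paragraphs in the .docx produces a single pause rather than one pause per empty paragraph. A rough sketch of that loop, assuming 24 kHz output and a placeholder `synth()` standing in for the actual TTS call:

```python
import docx
import numpy as np

SR = 24000                                   # assumed sample rate of the TTS output
d = docx.Document('assets/audiobook_TTS.docx')

def synth(text: str) -> np.ndarray:
    # Placeholder for the real TTS call (e.g. msinference.inference with a style vector).
    return np.zeros(SR, dtype=np.float32)

audio = []
last_paragraph_was_silence = False
for p in d.paragraphs:
    text = p.text.strip()
    if text:
        audio.append(synth(text))
        last_paragraph_was_silence = False
    elif not last_paragraph_was_silence:
        audio.append(np.zeros(SR // 2, dtype=np.float32))  # one 0.5 s pause per gap (duration assumed)
        last_paragraph_was_silence = True

full = np.concatenate(audio)
```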
msinference.py CHANGED
@@ -173,7 +173,7 @@ def inference(text,
               ref_s,
               alpha = 0.3,
               beta = 0.7,
-              diffusion_steps=5,  # diffusion_steps=5 sounds more pleasant than 7
+              diffusion_steps=7,  # 7 if voice is native English else 5 for non-native
               embedding_scale=1,
               use_gruut=False):
     text = text.strip()
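With the new default, calling `msinference.inference()` without an explicit `diffusion_steps` now uses 7 steps, matching what `api.py` passes for native English voices. A minimal direct-call sketch (the reference wav path is a placeholder; `api.py` builds the real path from the voice name):

```python
import msinference

# Placeholder reference wav for the style vector.
style = msinference.compute_style('assets/wavs/style_vector/en_US_vctk_low_p326.wav')

wav = msinference.inference('A short test sentence.',
                            style,
                            alpha=0.3,
                            beta=0.7,
                            diffusion_steps=7,   # 7 for native English, 5 for non-native
                            embedding_scale=1)
```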