ccoreilly committed on
Commit
f64d86f
·
1 Parent(s): e2574bf

Add MMS inference

Browse files
Files changed (3) hide show
  1. Dockerfile +8 -1
  2. app.py +30 -11
  3. mms.py +84 -0
Dockerfile CHANGED
@@ -5,7 +5,7 @@ RUN apt-get update && apt-get install -y gnupg && \
5
  echo "deb http://ppa.launchpad.net/zeehio/festcat/ubuntu bionic main" >> /etc/apt/sources.list && \
6
  echo "deb-src http://ppa.launchpad.net/zeehio/festcat/ubuntu bionic main" >> /etc/apt/sources.list && \
7
  apt-get update && \
8
- apt-get -y install festival festvox-ca-ona-hts festvox-ca-pau-hts lame git make autoconf automake libtool pkg-config gcc libsonic-dev ronn kramdown libpcaudio-dev
9
 
10
  RUN git clone -b ca-to-pr https://github.com/projecte-aina/espeak-ng
11
 
@@ -31,7 +31,14 @@ COPY --chown=user models models
31
 
32
  RUN pip install -r requirements.txt
33
 
 
 
 
 
 
 
34
  COPY --chown=user engine.py .
 
35
  COPY --chown=user festival.py .
36
  COPY --chown=user app.py .
37
 
 
5
  echo "deb http://ppa.launchpad.net/zeehio/festcat/ubuntu bionic main" >> /etc/apt/sources.list && \
6
  echo "deb-src http://ppa.launchpad.net/zeehio/festcat/ubuntu bionic main" >> /etc/apt/sources.list && \
7
  apt-get update && \
8
+ apt-get -y install festival festvox-ca-ona-hts festvox-ca-pau-hts lame git make autoconf automake libtool pkg-config gcc libsonic-dev ronn kramdown libpcaudio-dev libatlas-base-dev gfortran
9
 
10
  RUN git clone -b ca-to-pr https://github.com/projecte-aina/espeak-ng
11
 
 
31
 
32
  RUN pip install -r requirements.txt
33
 
34
+ RUN git clone https://github.com/jaywalnut310/vits.git && \
35
+ cd vits && sed s/torch==1.6.0/torch==1.7.0/ requirements.txt > requirements.txt && pip install -r requirements.txt && cd monotonic_align && \
36
+ python setup.py build_ext --inplace && cd /home/user
37
+
38
+ ENV PYTHONPATH=$PYTHONPATH:/home/user/app/vits
39
+
40
  COPY --chown=user engine.py .
41
+ COPY --chown=user mms.py .
42
  COPY --chown=user festival.py .
43
  COPY --chown=user app.py .
44
 
app.py CHANGED
@@ -1,14 +1,11 @@
1
  import tempfile
2
- from typing import Optional
3
- from TTS.config import load_config
4
  import gradio as gr
5
- import numpy as np
6
  import os
7
- from TTS.utils.manage import ModelManager
8
  from TTS.utils.synthesizer import Synthesizer
9
  from espeak_phonemizer import Phonemizer
10
  from engine import Piper
11
  from festival import festival_synthesize
 
12
 
13
  MAX_TXT_LEN = 325
14
 
@@ -41,6 +38,9 @@ def carrega_collectivat():
41
  def carrega_piper():
42
  return Piper(os.getcwd() + "/models/piper/ca-upc_ona-x-low.onnx")
43
 
 
 
 
44
 
45
  model_bsc = carrega_bsc()
46
  SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names
@@ -49,6 +49,10 @@ model_collectivat = carrega_collectivat()
49
 
50
  model_piper = carrega_piper()
51
 
 
 
 
 
52
  def tts(text, festival_voice, speaker_idx):
53
  if len(text) > MAX_TXT_LEN:
54
  text = text[:MAX_TXT_LEN]
@@ -60,9 +64,6 @@ def tts(text, festival_voice, speaker_idx):
60
  wav_coll = model_collectivat.tts(text)
61
  wav_piper = model_piper.synthesize(text)
62
 
63
- #return (model_bsc.tts_config.audio["sample_rate"], wav_bsc), (22000, wav_coll), (16000, wav_piper)
64
-
65
- # return output
66
  fp_bsc = ""
67
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
68
  model_bsc.save_wav(wav_bsc, fp)
@@ -77,12 +78,20 @@ def tts(text, festival_voice, speaker_idx):
77
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
78
  fp.write(wav_piper)
79
  fp_piper = fp.name
 
 
 
 
 
80
 
81
  fonemes = fonemitzador.phonemize(text, keep_clause_breakers=True)
82
 
83
  fp_festival = festival_synthesize(text, festival_voice)
84
 
85
- return fonemes, fp_festival, fp_bsc, fp_coll, fp_piper
 
 
 
86
 
87
 
88
  description="""
@@ -91,8 +100,11 @@ Amb aquesta aplicació podeu sintetitzar text a veu amb els últims models neuro
91
  1. Model multi-parlant VITS entrenat pel BSC (Projecte Aina) [enllaç](https://huggingface.co/projecte-aina/tts-ca-coqui-vits-multispeaker)
92
  2. Model Fastspeech entrenat per Col·lectivat [enllaç](https://github.com/CollectivaT-dev/TTS-API)
93
  3. Model VITS entrenat per Piper/Home Assistant [enllaç](https://github.com/rhasspy/piper)
 
94
 
95
- Els dós últims models han estat entrenats amb la veu d'Ona de FestCAT. El primer model ha estat entrenat amb totes les veus de FestCAT, els talls de Common Voice 8 i un altre corpus pel que conté moltes veus de qualitat variable. La veu d'Ona està seleccionada per defecte per la comparativa però podeu provar les altres.
 
 
96
 
97
  Aquesta aplicació fa servir l'últim estat de l'espeak millorat per Carme Armentano del BSC
98
  https://github.com/projecte-aina/espeak-ng
@@ -116,13 +128,20 @@ iface = gr.Interface(
116
  gr.Audio(label="Festival",type="filepath"),
117
  gr.Audio(label="BSC VITS",type="filepath"),
118
  gr.Audio(label="Collectivat Fastspeech",type="filepath"),
119
- gr.Audio(label="Piper VITS",type="filepath")
 
120
  ],
121
  title="Comparativa de síntesi lliure en català️",
122
  description=description,
123
  article=article,
124
  allow_flagging="never",
125
  layout="vertical",
126
- live=False
 
 
 
 
 
 
127
  )
128
  iface.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import tempfile
 
 
2
  import gradio as gr
 
3
  import os
 
4
  from TTS.utils.synthesizer import Synthesizer
5
  from espeak_phonemizer import Phonemizer
6
  from engine import Piper
7
  from festival import festival_synthesize
8
+ from mms import MMS
9
 
10
  MAX_TXT_LEN = 325
11
 
 
38
  def carrega_piper():
39
  return Piper(os.getcwd() + "/models/piper/ca-upc_ona-x-low.onnx")
40
 
41
+ def carrega_mms():
42
+ return MMS(os.getcwd() + "/models/mms")
43
+
44
 
45
  model_bsc = carrega_bsc()
46
  SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names
 
49
 
50
  model_piper = carrega_piper()
51
 
52
+ model_mms = carrega_mms()
53
+
54
+ request_count = 0
55
+
56
  def tts(text, festival_voice, speaker_idx):
57
  if len(text) > MAX_TXT_LEN:
58
  text = text[:MAX_TXT_LEN]
 
64
  wav_coll = model_collectivat.tts(text)
65
  wav_piper = model_piper.synthesize(text)
66
 
 
 
 
67
  fp_bsc = ""
68
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
69
  model_bsc.save_wav(wav_bsc, fp)
 
78
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
79
  fp.write(wav_piper)
80
  fp_piper = fp.name
81
+
82
+ fp_mms = ""
83
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
84
+ model_mms.synthesize(fp.name, text)
85
+ fp_mms = fp.name
86
 
87
  fonemes = fonemitzador.phonemize(text, keep_clause_breakers=True)
88
 
89
  fp_festival = festival_synthesize(text, festival_voice)
90
 
91
+ global request_count
92
+ request_count += 1
93
+ print(f"Requests: {request_count}")
94
+ return fonemes, fp_festival, fp_bsc, fp_coll, fp_piper, fp_mms
95
 
96
 
97
  description="""
 
100
  1. Model multi-parlant VITS entrenat pel BSC (Projecte Aina) [enllaç](https://huggingface.co/projecte-aina/tts-ca-coqui-vits-multispeaker)
101
  2. Model Fastspeech entrenat per Col·lectivat [enllaç](https://github.com/CollectivaT-dev/TTS-API)
102
  3. Model VITS entrenat per Piper/Home Assistant [enllaç](https://github.com/rhasspy/piper)
103
+ 3. Model VITS entrenat per Meta (llicència CC-BY-NC) [enllaç](https://github.com/facebookresearch/fairseq/tree/main/examples/mms)
104
 
105
+ El primer model ha estat entrenat amb totes les veus de FestCAT, els talls de Common Voice 8 i un altre corpus pel que conté moltes veus de qualitat variable. La veu d'Ona està seleccionada per defecte per la comparativa però podeu provar les altres.
106
+ Els models 2 i 3 han estat entrenats amb la veu d'Ona de FestCAT.
107
+ El model 4, anomenat MMS, de Meta (Facebook) ha estat entrenat a partir de dades d'un [audiollibre](http://live.bible.is/bible/CATBSS/LUK/1) de la Bíblia
108
 
109
  Aquesta aplicació fa servir l'últim estat de l'espeak millorat per Carme Armentano del BSC
110
  https://github.com/projecte-aina/espeak-ng
 
128
  gr.Audio(label="Festival",type="filepath"),
129
  gr.Audio(label="BSC VITS",type="filepath"),
130
  gr.Audio(label="Collectivat Fastspeech",type="filepath"),
131
+ gr.Audio(label="Piper VITS",type="filepath"),
132
+ gr.Audio(label="Meta MMS VITS",type="filepath")
133
  ],
134
  title="Comparativa de síntesi lliure en català️",
135
  description=description,
136
  article=article,
137
  allow_flagging="never",
138
  layout="vertical",
139
+ live=False,
140
+ examples=[
141
+ ["Duc pà sec al sac, m'assec on sóc i el suco amb suc", "ona", "ona"],
142
+ ["Un plat pla blanc, ple de pebre negre n’era. Un plat blanc pla, ple de pebre negre està", "ona", "ona"],
143
+ ["Visc al bosc i busco vesc i visc del vesc que busco al bosc", "ona", "ona"],
144
+ ["Una polla xica, pica, pellarica, camatorta i becarica va tenir sis polls xics, pics, pellarics, camacurts i becarics. Si la polla no hagués sigut xica, pica, pellarica, camatorta i becarica, els sis polls no haurien sigut xics, pics, pellarics, camacurts i becarics.", "ona", "ona"]
145
+ ]
146
  )
147
  iface.launch(server_name="0.0.0.0", server_port=7860)
mms.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import torch
8
+ import commons
9
+ import utils
10
+ from models import SynthesizerTrn
11
+ from scipy.io.wavfile import write
12
+ from pathlib import Path
13
+ from typing import Union
14
+
15
class TextMapper(object):
    """Character-level text front end backed by a vocabulary file.

    The vocabulary file lists one symbol per line; a symbol's id is the
    zero-based index of its line.
    """

    def __init__(self, vocab_file):
        """Load the symbol table from *vocab_file*.

        Args:
            vocab_file: path to a text file holding one symbol per line. One
                of the lines must be a single space character.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if the vocabulary contains no space symbol.
        """
        # Decode explicitly as UTF-8 so accented symbols load the same way
        # regardless of the process locale, and close the handle promptly.
        with open(vocab_file, encoding="utf-8") as vocab:
            self.symbols = [line.replace("\n", "") for line in vocab]
        self.SPACE_ID = self.symbols.index(" ")
        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text, cleaner_names):
        """Convert a string to the list of ids of its characters.

        Leading and trailing whitespace is stripped before mapping.

        Args:
            text: string to convert to a sequence.
            cleaner_names: names of cleaner functions; accepted for interface
                compatibility but not used by this implementation.

        Returns:
            List of integers corresponding to the symbols in the text.

        Raises:
            KeyError: if the text contains a character that is not in the
                vocabulary (see ``filter_oov``).
        """
        return [self._symbol_to_id[symbol] for symbol in text.strip()]

    def get_text(self, text, hps):
        """Return *text* encoded as a ``torch.LongTensor`` of symbol ids.

        Args:
            text: string to encode; every character must be in the vocabulary.
            hps: hyper-parameter object; ``hps.data.text_cleaners`` and
                ``hps.data.add_blank`` are read.
        """
        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
        if hps.data.add_blank:
            # presumably interleaves the blank id 0 with the symbol ids --
            # confirm against commons.intersperse
            text_norm = commons.intersperse(text_norm, 0)
        return torch.LongTensor(text_norm)

    def filter_oov(self, text):
        """Return *text* with every out-of-vocabulary character removed.

        The filtered text is also printed to stdout.
        """
        known = self._symbol_to_id
        txt_filt = "".join(ch for ch in text if ch in known)
        print(f"text after filtering OOV: {txt_filt}")
        return txt_filt
49
+
50
class MMS():
    """Text-to-speech wrapper around a VITS checkpoint directory.

    The directory must contain ``config.json``, ``vocab.txt`` and the
    generator weights ``G_100000.pth``.
    """

    def __init__(self, model_path: Union[str, Path]):
        """Build the synthesizer and load its weights.

        Args:
            model_path: directory holding the checkpoint files.

        Raises:
            FileNotFoundError: if ``config.json`` is missing from the
                directory.
        """
        ckpt_dir = model_path
        vocab_file = f"{ckpt_dir}/vocab.txt"
        config_file = f"{ckpt_dir}/config.json"
        # Explicit raise instead of ``assert``: asserts are stripped when
        # Python runs with -O, which would defer the failure to a less clear
        # error further down.
        if not os.path.isfile(config_file):
            raise FileNotFoundError(f"{config_file} doesn't exist")
        self.hps = utils.get_hparams_from_file(config_file)
        self.text_mapper = TextMapper(vocab_file)
        self.net_g = SynthesizerTrn(
            len(self.text_mapper.symbols),
            self.hps.data.filter_length // 2 + 1,
            self.hps.train.segment_size // self.hps.data.hop_length,
            **self.hps.model)
        # The model is only used for inference: put it in eval mode so that
        # train-only behaviour such as dropout is disabled.
        self.net_g.eval()
        g_pth = f"{ckpt_dir}/G_100000.pth"
        print(f"load {g_pth}")

        utils.load_checkpoint(g_pth, self.net_g, None)

    def synthesize(self, wav_path: str, txt, *, noise_scale=0.667,
                   noise_scale_w=0.8, length_scale=1.0):
        """Synthesize *txt* and write the audio to *wav_path*.

        The text is lower-cased and characters missing from the vocabulary
        are dropped before synthesis.

        Args:
            wav_path: destination WAV file; its parent directory is created
                if it does not exist.
            txt: text to synthesize.
            noise_scale: forwarded to ``net_g.infer``.
            noise_scale_w: forwarded to ``net_g.infer``.
            length_scale: forwarded to ``net_g.infer``.

        Returns:
            None. The waveform is written to *wav_path* at
            ``hps.data.sampling_rate``.
        """
        print(f"text: {txt}")
        txt = txt.lower()
        txt = self.text_mapper.filter_oov(txt)
        stn_tst = self.text_mapper.get_text(txt, self.hps)
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
            hyp = self.net_g.infer(
                x_tst, x_tst_lengths, noise_scale=noise_scale,
                noise_scale_w=noise_scale_w, length_scale=length_scale
            )[0][0, 0].cpu().float().numpy()

        # A bare file name has an empty dirname, and os.makedirs("") raises,
        # so only create the directory when there is one to create.
        out_dir = os.path.dirname(wav_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        print(f"wav: {wav_path}")
        write(wav_path, self.hps.data.sampling_rate, hyp)