ccoreilly committed on
Commit
7514dcc
·
1 Parent(s): 7f0efc6

update app

Browse files
Files changed (3) hide show
  1. Dockerfile +3 -1
  2. app.py +21 -17
  3. engine.py +144 -0
Dockerfile CHANGED
@@ -11,11 +11,13 @@ RUN cd espeak-ng && \
11
  make install
12
 
13
  COPY requirements.txt .
14
- COPY app.py .
15
  COPY models .
16
 
17
  RUN pip install -r requirements.txt
18
 
 
 
 
19
  RUN mkdir -p cache && chmod 777 cache
20
 
21
  ENV NUMBA_CACHE_DIR=./cache
 
11
  make install
12
 
13
  COPY requirements.txt .
 
14
  COPY models .
15
 
16
  RUN pip install -r requirements.txt
17
 
18
+ COPY engine.py .
19
+ COPY app.py .
20
+
21
  RUN mkdir -p cache && chmod 777 cache
22
 
23
  ENV NUMBA_CACHE_DIR=./cache
app.py CHANGED
@@ -7,42 +7,48 @@ import os
7
  import json
8
  from TTS.utils.manage import ModelManager
9
  from TTS.utils.synthesizer import Synthesizer
10
-
11
 
12
  MAX_TXT_LEN = 100
13
 
14
  SPEAKERS = ['f_cen_05', 'f_cen_81', 'f_occ_31', 'f_occ_de', 'f_sep_31', 'm_cen_08', 'm_occ_44', 'm_val_89']
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def tts(text, speaker_idx):
17
  if len(text) > MAX_TXT_LEN:
18
  text = text[:MAX_TXT_LEN]
19
  print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
20
  print(text)
21
 
22
- model_path = os.getcwd() + "/best_model.pth"
23
- config_path = os.getcwd() + "/config.json"
24
- speakers_file_path = os.getcwd() + "/speakers.pth"
25
  speakers_maping_path = os.getcwd() + "/speaker_map.json"
26
- vocoder_path = None
27
- vocoder_config_path = None
28
-
29
- synthesizer = Synthesizer(
30
- model_path, config_path, speakers_file_path, None, vocoder_path, vocoder_config_path,
31
- )
32
 
33
  # Map speaker aliases to speaker ids
34
  with open(speakers_maping_path, 'r') as fp:
35
  maping = json.load(fp)
36
 
37
- speaker_idx = maping[speaker_idx]
38
 
39
  # synthesize
40
- if synthesizer is None:
41
- raise NameError("model not found")
42
- wavs = synthesizer.tts(text, speaker_idx)
43
  # return output
44
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
45
- synthesizer.save_wav(wavs, fp)
46
  return fp.name
47
 
48
 
@@ -66,11 +72,9 @@ iface = gr.Interface(
66
  ],
67
  outputs=gr.outputs.Audio(label="Output",type="filepath"),
68
  title="🗣️ TTS Català Multi Parlant - VITS 🗣️",
69
- theme="grass",
70
  description=description,
71
  article=article,
72
  allow_flagging="never",
73
- flagging_options=['error', 'bad-quality', 'wrong-pronounciation'],
74
  layout="vertical",
75
  live=False
76
  )
 
7
  import json
8
  from TTS.utils.manage import ModelManager
9
  from TTS.utils.synthesizer import Synthesizer
10
+ from .engine import Piper
11
 
12
  MAX_TXT_LEN = 100
13
 
14
  SPEAKERS = ['f_cen_05', 'f_cen_81', 'f_occ_31', 'f_occ_de', 'f_sep_31', 'm_cen_08', 'm_occ_44', 'm_val_89']
15
 
16
def carrega_bsc():
    """Build the Synthesizer for the BSC model stored under ``models/bsc``.

    The model, config and speakers files are looked up relative to the
    current working directory. No vocoder is configured.

    Returns:
        The constructed ``Synthesizer`` instance.
    """
    model_path = os.getcwd() + "/models/bsc/best_model.pth"
    config_path = os.getcwd() + "/models/bsc/config.json"
    speakers_file_path = os.getcwd() + "/models/bsc/speakers.pth"
    vocoder_path = None
    vocoder_config_path = None

    # The argument list was truncated in the source ("No$"); it is restored
    # to the same six positional arguments the earlier version of this call
    # used (fourth argument None, then the two vocoder paths).
    synthesizer = Synthesizer(
        model_path, config_path, speakers_file_path, None, vocoder_path, vocoder_config_path,
    )

    return synthesizer


# Load the model once at import time so every request reuses the same instance.
model_bsc = carrega_bsc()
# NOTE(review): assumes the Synthesizer object exposes a `speakers`
# attribute -- confirm against the installed TTS version.
SPEAKERS = model_bsc.speakers
31
+
32
+
33
def tts(text, speaker_idx):
    """Synthesize ``text`` with the preloaded BSC model and return a WAV path.

    Args:
        text: Text to speak. Anything beyond ``MAX_TXT_LEN`` characters is
            dropped, and a notice is printed when that happens.
        speaker_idx: Speaker selector, forwarded unchanged to
            ``model_bsc.tts``.

    Returns:
        Path of a temporary ``.wav`` file holding the synthesized audio.
        The file is created with ``delete=False``, so it is not removed
        automatically.
    """
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
    print(text)

    # The speaker alias map is no longer consulted (its lookup was commented
    # out), so speaker_map.json is not opened or parsed on every request.

    # synthesize
    wavs = model_bsc.tts(text, speaker_idx)

    # return output
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        model_bsc.save_wav(wavs, fp)
        return fp.name
53
 
54
 
 
72
  ],
73
  outputs=gr.outputs.Audio(label="Output",type="filepath"),
74
  title="🗣️ TTS Català Multi Parlant - VITS 🗣️",
 
75
  description=description,
76
  article=article,
77
  allow_flagging="never",
 
78
  layout="vertical",
79
  live=False
80
  )
engine.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import json
3
+ import os
4
+ import wave
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import List, Mapping, Optional, Sequence, Union
8
+
9
+ import numpy as np
10
+ import onnxruntime
11
+ from espeak_phonemizer import Phonemizer
12
+
13
+ _BOS = "^"
14
+ _EOS = "$"
15
+ _PAD = "_"
16
+
17
+
18
@dataclass
class PiperConfig:
    """Settings for one Piper voice, as loaded from its JSON config file."""

    # Value of the config's "num_symbols" entry.
    num_symbols: int
    # Number of speakers in the model; values above 1 make synthesis pick a
    # speaker id.
    num_speakers: int
    # Frame rate written into the generated WAV data.
    sample_rate: int
    # Voice name handed to the espeak phonemizer.
    espeak_voice: str
    # Default inference scales; each can be overridden per synthesize() call.
    length_scale: float
    noise_scale: float
    noise_w: float
    # Maps each phoneme character (and the BOS/EOS/PAD markers) to the
    # sequence of ids fed to the model.
    phoneme_id_map: Mapping[str, Sequence[int]]
28
+
29
+
30
class Piper:
    """Text-to-speech voice backed by an ONNX model and an espeak phonemizer."""

    def __init__(
        self,
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ):
        """Load the voice config and open an ONNX inference session.

        Args:
            model_path: Path of the ONNX model file.
            config_path: Path of the JSON config file. Defaults to
                ``f"{model_path}.json"``.
            use_cuda: Request ``CUDAExecutionProvider`` instead of
                ``CPUExecutionProvider``.
        """
        if config_path is None:
            config_path = f"{model_path}.json"

        self.config = load_config(config_path)
        self.phonemizer = Phonemizer(self.config.espeak_voice)
        self.onnx_options = onnxruntime.SessionOptions()
        # Leave one core free, but never request fewer than one thread;
        # os.cpu_count() may return None when the count is unknown.
        self.onnx_options.intra_op_num_threads = max(1, (os.cpu_count() or 1) - 1)
        self.model = onnxruntime.InferenceSession(
            str(model_path),
            sess_options=self.onnx_options,
            providers=["CPUExecutionProvider"]
            if not use_cuda
            else ["CUDAExecutionProvider"],
        )

    def synthesize(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> bytes:
        """Synthesize WAV audio from text.

        Args:
            text: Text to phonemize and speak.
            speaker_id: Speaker to use. When omitted on a model with more
                than one speaker, speaker 0 is used.
            length_scale: Overrides the configured length scale when given.
            noise_scale: Overrides the configured noise scale when given.
            noise_w: Overrides the configured noise width when given.

        Returns:
            The bytes of a mono, 16-bit WAV file at the configured sample
            rate.

        Raises:
            KeyError: If a phoneme is missing from ``phoneme_id_map``.
        """
        if length_scale is None:
            length_scale = self.config.length_scale

        if noise_scale is None:
            noise_scale = self.config.noise_scale

        if noise_w is None:
            noise_w = self.config.noise_w

        phonemes_str = self.phonemizer.phonemize(text)
        phonemes = [_BOS] + list(phonemes_str)
        phoneme_ids: List[int] = []

        # Every phoneme (including BOS) is followed by the PAD ids; EOS is
        # appended once at the end without padding.
        for phoneme in phonemes:
            phoneme_ids.extend(self.config.phoneme_id_map[phoneme])
            phoneme_ids.extend(self.config.phoneme_id_map[_PAD])

        phoneme_ids.extend(self.config.phoneme_id_map[_EOS])

        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_w],
            dtype=np.float32,
        )

        # Fall back to the default speaker only when the caller did not
        # choose one; an explicit speaker_id must be honoured.
        if (self.config.num_speakers > 1) and (speaker_id is None):
            # Default speaker
            speaker_id = 0

        sid = None

        if speaker_id is not None:
            sid = np.array([speaker_id], dtype=np.int64)

        # Synthesize through Onnx
        # NOTE(review): "sid" is fed as None when no speaker applies --
        # confirm the runtime accepts that for single-speaker models.
        audio = self.model.run(
            None,
            {
                "input": phoneme_ids_array,
                "input_lengths": phoneme_ids_lengths,
                "scales": scales,
                "sid": sid,
            },
        )[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())

        # Convert to WAV
        with io.BytesIO() as wav_io:
            wav_file: wave.Wave_write = wave.open(wav_io, "wb")
            with wav_file:
                wav_file.setframerate(self.config.sample_rate)
                wav_file.setsampwidth(2)
                wav_file.setnchannels(1)
                wav_file.writeframes(audio.tobytes())

            # Read the buffer after the wave writer has closed (so the header
            # is final) but before the BytesIO itself is closed.
            return wav_io.getvalue()
118
+
119
+
120
def load_config(config_path: Union[str, Path]) -> PiperConfig:
    """Read a voice's JSON config file and build a ``PiperConfig`` from it.

    Args:
        config_path: Path of the UTF-8 encoded JSON config file.

    Returns:
        A ``PiperConfig`` populated from the file. The three inference
        scales fall back to 0.667 (noise_scale), 1.0 (length_scale) and
        0.8 (noise_w) when the optional "inference" section omits them.

    Raises:
        KeyError: If a required entry is missing from the file.
    """
    with open(config_path, "r", encoding="utf-8") as handle:
        raw = json.load(handle)

    # The "inference" section is optional; an empty dict yields the defaults.
    inference_opts = raw.get("inference", {})

    fields = {
        "num_symbols": raw["num_symbols"],
        "num_speakers": raw["num_speakers"],
        "sample_rate": raw["audio"]["sample_rate"],
        "espeak_voice": raw["espeak"]["voice"],
        "noise_scale": inference_opts.get("noise_scale", 0.667),
        "length_scale": inference_opts.get("length_scale", 1.0),
        "noise_w": inference_opts.get("noise_w", 0.8),
        "phoneme_id_map": raw["phoneme_id_map"],
    }
    return PiperConfig(**fields)
135
+
136
+
137
def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio and convert to int16 range.

    The signal is scaled so that its largest absolute sample maps to
    ``max_wav_value``. The peak is floored at 0.01 so near-silent input is
    not amplified without bound.

    Args:
        audio: Floating-point samples.
        max_wav_value: Target peak magnitude after scaling.

    Returns:
        The scaled samples, clipped to ``[-max_wav_value, max_wav_value]``
        and cast to int16. An empty input yields an empty int16 array.
    """
    # np.max raises ValueError on a zero-size array, so handle it up front.
    if audio.size == 0:
        return audio.astype("int16")

    audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    audio_norm = audio_norm.astype("int16")
    return audio_norm