Spaces:
Runtime error
Runtime error
ANYANTUDRE
commited on
Commit
·
7c7161e
1
Parent(s):
cfe3883
fixed typo in goai_stt_ttt_pipeline
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- Makefile +0 -3
- TTS/.models.json +0 -938
- TTS/VERSION +0 -1
- TTS/__init__.py +0 -6
- TTS/__pycache__/__init__.cpython-310.pyc +0 -0
- TTS/__pycache__/model.cpython-310.pyc +0 -0
- TTS/api.py +0 -458
- TTS/bin/__init__.py +0 -0
- TTS/bin/collect_env_info.py +0 -48
- TTS/bin/compute_attention_masks.py +0 -165
- TTS/bin/compute_embeddings.py +0 -197
- TTS/bin/compute_statistics.py +0 -96
- TTS/bin/eval_encoder.py +0 -88
- TTS/bin/extract_tts_spectrograms.py +0 -287
- TTS/bin/find_unique_chars.py +0 -45
- TTS/bin/find_unique_phonemes.py +0 -74
- TTS/bin/remove_silence_using_vad.py +0 -124
- TTS/bin/resample.py +0 -90
- TTS/bin/synthesize.py +0 -494
- TTS/bin/train_encoder.py +0 -332
- TTS/bin/train_tts.py +0 -71
- TTS/bin/train_vocoder.py +0 -77
- TTS/bin/tune_wavegrad.py +0 -103
- TTS/config/__init__.py +0 -135
- TTS/config/__pycache__/__init__.cpython-310.pyc +0 -0
- TTS/config/__pycache__/shared_configs.cpython-310.pyc +0 -0
- TTS/config/shared_configs.py +0 -268
- TTS/demos/xtts_ft_demo/requirements.txt +0 -2
- TTS/demos/xtts_ft_demo/utils/formatter.py +0 -160
- TTS/demos/xtts_ft_demo/utils/gpt_train.py +0 -172
- TTS/demos/xtts_ft_demo/xtts_demo.py +0 -415
- TTS/encoder/README.md +0 -18
- TTS/encoder/__init__.py +0 -0
- TTS/encoder/__pycache__/__init__.cpython-310.pyc +0 -0
- TTS/encoder/__pycache__/losses.cpython-310.pyc +0 -0
- TTS/encoder/configs/base_encoder_config.py +0 -61
- TTS/encoder/configs/emotion_encoder_config.py +0 -12
- TTS/encoder/configs/speaker_encoder_config.py +0 -11
- TTS/encoder/dataset.py +0 -147
- TTS/encoder/losses.py +0 -226
- TTS/encoder/models/__pycache__/base_encoder.cpython-310.pyc +0 -0
- TTS/encoder/models/__pycache__/lstm.cpython-310.pyc +0 -0
- TTS/encoder/models/__pycache__/resnet.cpython-310.pyc +0 -0
- TTS/encoder/models/base_encoder.py +0 -161
- TTS/encoder/models/lstm.py +0 -99
- TTS/encoder/models/resnet.py +0 -198
- TTS/encoder/requirements.txt +0 -2
- TTS/encoder/utils/__init__.py +0 -0
- TTS/encoder/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- TTS/encoder/utils/__pycache__/generic_utils.cpython-310.pyc +0 -0
Makefile
CHANGED
@@ -5,9 +5,6 @@ install:
|
|
5 |
test:
|
6 |
python app.py
|
7 |
|
8 |
-
debug:
|
9 |
-
#python -m pytest -vv --pdb #Debugger is invoked
|
10 |
-
|
11 |
format:
|
12 |
#black *.py
|
13 |
|
|
|
5 |
test:
|
6 |
python app.py
|
7 |
|
|
|
|
|
|
|
8 |
format:
|
9 |
#black *.py
|
10 |
|
TTS/.models.json
DELETED
@@ -1,938 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"tts_models": {
|
3 |
-
"multilingual": {
|
4 |
-
"multi-dataset": {
|
5 |
-
"xtts_v2": {
|
6 |
-
"description": "XTTS-v2.0.3 by Coqui with 17 languages.",
|
7 |
-
"hf_url": [
|
8 |
-
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
|
9 |
-
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
|
10 |
-
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
|
11 |
-
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5",
|
12 |
-
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/speakers_xtts.pth"
|
13 |
-
],
|
14 |
-
"model_hash": "10f92b55c512af7a8d39d650547a15a7",
|
15 |
-
"default_vocoder": null,
|
16 |
-
"commit": "480a6cdf7",
|
17 |
-
"license": "CPML",
|
18 |
-
"contact": "[email protected]",
|
19 |
-
"tos_required": true
|
20 |
-
},
|
21 |
-
"xtts_v1.1": {
|
22 |
-
"description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.",
|
23 |
-
"hf_url": [
|
24 |
-
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth",
|
25 |
-
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/config.json",
|
26 |
-
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json",
|
27 |
-
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/hash.md5"
|
28 |
-
],
|
29 |
-
"model_hash": "7c62beaf58d39b729de287330dc254e7b515677416839b649a50e7cf74c3df59",
|
30 |
-
"default_vocoder": null,
|
31 |
-
"commit": "82910a63",
|
32 |
-
"license": "CPML",
|
33 |
-
"contact": "[email protected]",
|
34 |
-
"tos_required": true
|
35 |
-
},
|
36 |
-
"your_tts": {
|
37 |
-
"description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
|
38 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
|
39 |
-
"default_vocoder": null,
|
40 |
-
"commit": "e9a1953e",
|
41 |
-
"license": "CC BY-NC-ND 4.0",
|
42 |
-
"contact": "[email protected]"
|
43 |
-
},
|
44 |
-
"bark": {
|
45 |
-
"description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
|
46 |
-
"hf_url": [
|
47 |
-
"https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
|
48 |
-
"https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
|
49 |
-
"https://coqui.gateway.scarf.sh/hf/text_2.pt",
|
50 |
-
"https://coqui.gateway.scarf.sh/hf/bark/config.json",
|
51 |
-
"https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
|
52 |
-
"https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
|
53 |
-
],
|
54 |
-
"default_vocoder": null,
|
55 |
-
"commit": "e9a1953e",
|
56 |
-
"license": "MIT",
|
57 |
-
"contact": "https://www.suno.ai/"
|
58 |
-
}
|
59 |
-
}
|
60 |
-
},
|
61 |
-
"bg": {
|
62 |
-
"cv": {
|
63 |
-
"vits": {
|
64 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
|
65 |
-
"default_vocoder": null,
|
66 |
-
"commit": null,
|
67 |
-
"author": "@NeonGeckoCom",
|
68 |
-
"license": "bsd-3-clause"
|
69 |
-
}
|
70 |
-
}
|
71 |
-
},
|
72 |
-
"cs": {
|
73 |
-
"cv": {
|
74 |
-
"vits": {
|
75 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
|
76 |
-
"default_vocoder": null,
|
77 |
-
"commit": null,
|
78 |
-
"author": "@NeonGeckoCom",
|
79 |
-
"license": "bsd-3-clause"
|
80 |
-
}
|
81 |
-
}
|
82 |
-
},
|
83 |
-
"da": {
|
84 |
-
"cv": {
|
85 |
-
"vits": {
|
86 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
|
87 |
-
"default_vocoder": null,
|
88 |
-
"commit": null,
|
89 |
-
"author": "@NeonGeckoCom",
|
90 |
-
"license": "bsd-3-clause"
|
91 |
-
}
|
92 |
-
}
|
93 |
-
},
|
94 |
-
"et": {
|
95 |
-
"cv": {
|
96 |
-
"vits": {
|
97 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
|
98 |
-
"default_vocoder": null,
|
99 |
-
"commit": null,
|
100 |
-
"author": "@NeonGeckoCom",
|
101 |
-
"license": "bsd-3-clause"
|
102 |
-
}
|
103 |
-
}
|
104 |
-
},
|
105 |
-
"ga": {
|
106 |
-
"cv": {
|
107 |
-
"vits": {
|
108 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
|
109 |
-
"default_vocoder": null,
|
110 |
-
"commit": null,
|
111 |
-
"author": "@NeonGeckoCom",
|
112 |
-
"license": "bsd-3-clause"
|
113 |
-
}
|
114 |
-
}
|
115 |
-
},
|
116 |
-
"en": {
|
117 |
-
"ek1": {
|
118 |
-
"tacotron2": {
|
119 |
-
"description": "EK1 en-rp tacotron2 by NMStoker",
|
120 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
|
121 |
-
"default_vocoder": "vocoder_models/en/ek1/wavegrad",
|
122 |
-
"commit": "c802255",
|
123 |
-
"license": "apache 2.0"
|
124 |
-
}
|
125 |
-
},
|
126 |
-
"ljspeech": {
|
127 |
-
"tacotron2-DDC": {
|
128 |
-
"description": "Tacotron2 with Double Decoder Consistency.",
|
129 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
|
130 |
-
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
131 |
-
"commit": "bae2ad0f",
|
132 |
-
"author": "Eren Gölge @erogol",
|
133 |
-
"license": "apache 2.0",
|
134 |
-
"contact": "[email protected]"
|
135 |
-
},
|
136 |
-
"tacotron2-DDC_ph": {
|
137 |
-
"description": "Tacotron2 with Double Decoder Consistency with phonemes.",
|
138 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
|
139 |
-
"default_vocoder": "vocoder_models/en/ljspeech/univnet",
|
140 |
-
"commit": "3900448",
|
141 |
-
"author": "Eren Gölge @erogol",
|
142 |
-
"license": "apache 2.0",
|
143 |
-
"contact": "[email protected]"
|
144 |
-
},
|
145 |
-
"glow-tts": {
|
146 |
-
"description": "",
|
147 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
|
148 |
-
"stats_file": null,
|
149 |
-
"default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
|
150 |
-
"commit": "",
|
151 |
-
"author": "Eren Gölge @erogol",
|
152 |
-
"license": "MPL",
|
153 |
-
"contact": "[email protected]"
|
154 |
-
},
|
155 |
-
"speedy-speech": {
|
156 |
-
"description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
|
157 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
|
158 |
-
"stats_file": null,
|
159 |
-
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
160 |
-
"commit": "4581e3d",
|
161 |
-
"author": "Eren Gölge @erogol",
|
162 |
-
"license": "apache 2.0",
|
163 |
-
"contact": "[email protected]"
|
164 |
-
},
|
165 |
-
"tacotron2-DCA": {
|
166 |
-
"description": "",
|
167 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
|
168 |
-
"default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
|
169 |
-
"commit": "",
|
170 |
-
"author": "Eren Gölge @erogol",
|
171 |
-
"license": "MPL",
|
172 |
-
"contact": "[email protected]"
|
173 |
-
},
|
174 |
-
"vits": {
|
175 |
-
"description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
|
176 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
|
177 |
-
"default_vocoder": null,
|
178 |
-
"commit": "3900448",
|
179 |
-
"author": "Eren Gölge @erogol",
|
180 |
-
"license": "apache 2.0",
|
181 |
-
"contact": "[email protected]"
|
182 |
-
},
|
183 |
-
"vits--neon": {
|
184 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
|
185 |
-
"default_vocoder": null,
|
186 |
-
"author": "@NeonGeckoCom",
|
187 |
-
"license": "bsd-3-clause",
|
188 |
-
"contact": null,
|
189 |
-
"commit": null
|
190 |
-
},
|
191 |
-
"fast_pitch": {
|
192 |
-
"description": "FastPitch model trained on LJSpeech using the Aligner Network",
|
193 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
|
194 |
-
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
195 |
-
"commit": "b27b3ba",
|
196 |
-
"author": "Eren Gölge @erogol",
|
197 |
-
"license": "apache 2.0",
|
198 |
-
"contact": "[email protected]"
|
199 |
-
},
|
200 |
-
"overflow": {
|
201 |
-
"description": "Overflow model trained on LJSpeech",
|
202 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
|
203 |
-
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
204 |
-
"commit": "3b1a28f",
|
205 |
-
"author": "Eren Gölge @erogol",
|
206 |
-
"license": "apache 2.0",
|
207 |
-
"contact": "[email protected]"
|
208 |
-
},
|
209 |
-
"neural_hmm": {
|
210 |
-
"description": "Neural HMM model trained on LJSpeech",
|
211 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.11.0_models/tts_models--en--ljspeech--neural_hmm.zip",
|
212 |
-
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
213 |
-
"commit": "3b1a28f",
|
214 |
-
"author": "Shivam Metha @shivammehta25",
|
215 |
-
"license": "apache 2.0",
|
216 |
-
"contact": "d83ee8fe45e3c0d776d4a865aca21d7c2ac324c4"
|
217 |
-
}
|
218 |
-
},
|
219 |
-
"vctk": {
|
220 |
-
"vits": {
|
221 |
-
"description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
|
222 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
|
223 |
-
"default_vocoder": null,
|
224 |
-
"commit": "3900448",
|
225 |
-
"author": "Eren @erogol",
|
226 |
-
"license": "apache 2.0",
|
227 |
-
"contact": "[email protected]"
|
228 |
-
},
|
229 |
-
"fast_pitch": {
|
230 |
-
"description": "FastPitch model trained on VCTK dataseset.",
|
231 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
|
232 |
-
"default_vocoder": null,
|
233 |
-
"commit": "bdab788d",
|
234 |
-
"author": "Eren @erogol",
|
235 |
-
"license": "CC BY-NC-ND 4.0",
|
236 |
-
"contact": "[email protected]"
|
237 |
-
}
|
238 |
-
},
|
239 |
-
"sam": {
|
240 |
-
"tacotron-DDC": {
|
241 |
-
"description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.",
|
242 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
|
243 |
-
"default_vocoder": "vocoder_models/en/sam/hifigan_v2",
|
244 |
-
"commit": "bae2ad0f",
|
245 |
-
"author": "Eren Gölge @erogol",
|
246 |
-
"license": "apache 2.0",
|
247 |
-
"contact": "[email protected]"
|
248 |
-
}
|
249 |
-
},
|
250 |
-
"blizzard2013": {
|
251 |
-
"capacitron-t2-c50": {
|
252 |
-
"description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
|
253 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
|
254 |
-
"commit": "d6284e7",
|
255 |
-
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
|
256 |
-
"author": "Adam Froghyar @a-froghyar",
|
257 |
-
"license": "apache 2.0",
|
258 |
-
"contact": "[email protected]"
|
259 |
-
},
|
260 |
-
"capacitron-t2-c150_v2": {
|
261 |
-
"description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
|
262 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
|
263 |
-
"commit": "a67039d",
|
264 |
-
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
|
265 |
-
"author": "Adam Froghyar @a-froghyar",
|
266 |
-
"license": "apache 2.0",
|
267 |
-
"contact": "[email protected]"
|
268 |
-
}
|
269 |
-
},
|
270 |
-
"multi-dataset": {
|
271 |
-
"tortoise-v2": {
|
272 |
-
"description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
|
273 |
-
"github_rls_url": [
|
274 |
-
"https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
|
275 |
-
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
|
276 |
-
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
|
277 |
-
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
|
278 |
-
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
|
279 |
-
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
|
280 |
-
"https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
|
281 |
-
"https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
|
282 |
-
"https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
|
283 |
-
],
|
284 |
-
"commit": "c1875f6",
|
285 |
-
"default_vocoder": null,
|
286 |
-
"author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
|
287 |
-
"license": "apache 2.0"
|
288 |
-
}
|
289 |
-
},
|
290 |
-
"jenny": {
|
291 |
-
"jenny": {
|
292 |
-
"description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
|
293 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
|
294 |
-
"default_vocoder": null,
|
295 |
-
"commit": "ba40a1c",
|
296 |
-
"license": "custom - see https://github.com/dioco-group/jenny-tts-dataset#important",
|
297 |
-
"author": "@noml4u"
|
298 |
-
}
|
299 |
-
}
|
300 |
-
},
|
301 |
-
"es": {
|
302 |
-
"mai": {
|
303 |
-
"tacotron2-DDC": {
|
304 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
|
305 |
-
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
|
306 |
-
"commit": "",
|
307 |
-
"author": "Eren Gölge @erogol",
|
308 |
-
"license": "MPL",
|
309 |
-
"contact": "[email protected]"
|
310 |
-
}
|
311 |
-
},
|
312 |
-
"css10": {
|
313 |
-
"vits": {
|
314 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
|
315 |
-
"default_vocoder": null,
|
316 |
-
"commit": null,
|
317 |
-
"author": "@NeonGeckoCom",
|
318 |
-
"license": "bsd-3-clause"
|
319 |
-
}
|
320 |
-
}
|
321 |
-
},
|
322 |
-
"fr": {
|
323 |
-
"mai": {
|
324 |
-
"tacotron2-DDC": {
|
325 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
|
326 |
-
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
|
327 |
-
"commit": null,
|
328 |
-
"author": "Eren Gölge @erogol",
|
329 |
-
"license": "MPL",
|
330 |
-
"contact": "[email protected]"
|
331 |
-
}
|
332 |
-
},
|
333 |
-
"css10": {
|
334 |
-
"vits": {
|
335 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
|
336 |
-
"default_vocoder": null,
|
337 |
-
"commit": null,
|
338 |
-
"author": "@NeonGeckoCom",
|
339 |
-
"license": "bsd-3-clause"
|
340 |
-
}
|
341 |
-
}
|
342 |
-
},
|
343 |
-
"uk": {
|
344 |
-
"mai": {
|
345 |
-
"glow-tts": {
|
346 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
|
347 |
-
"author": "@robinhad",
|
348 |
-
"commit": "bdab788d",
|
349 |
-
"license": "MIT",
|
350 |
-
"contact": "",
|
351 |
-
"default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
|
352 |
-
},
|
353 |
-
"vits": {
|
354 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
|
355 |
-
"default_vocoder": null,
|
356 |
-
"commit": null,
|
357 |
-
"author": "@NeonGeckoCom",
|
358 |
-
"license": "bsd-3-clause"
|
359 |
-
}
|
360 |
-
}
|
361 |
-
},
|
362 |
-
"zh-CN": {
|
363 |
-
"baker": {
|
364 |
-
"tacotron2-DDC-GST": {
|
365 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
|
366 |
-
"commit": "unknown",
|
367 |
-
"author": "@kirianguiller",
|
368 |
-
"license": "apache 2.0",
|
369 |
-
"default_vocoder": null
|
370 |
-
}
|
371 |
-
}
|
372 |
-
},
|
373 |
-
"nl": {
|
374 |
-
"mai": {
|
375 |
-
"tacotron2-DDC": {
|
376 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
|
377 |
-
"author": "@r-dh",
|
378 |
-
"license": "apache 2.0",
|
379 |
-
"default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
|
380 |
-
"stats_file": null,
|
381 |
-
"commit": "540d811"
|
382 |
-
}
|
383 |
-
},
|
384 |
-
"css10": {
|
385 |
-
"vits": {
|
386 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
|
387 |
-
"default_vocoder": null,
|
388 |
-
"commit": null,
|
389 |
-
"author": "@NeonGeckoCom",
|
390 |
-
"license": "bsd-3-clause"
|
391 |
-
}
|
392 |
-
}
|
393 |
-
},
|
394 |
-
"de": {
|
395 |
-
"thorsten": {
|
396 |
-
"tacotron2-DCA": {
|
397 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
|
398 |
-
"default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
|
399 |
-
"author": "@thorstenMueller",
|
400 |
-
"license": "apache 2.0",
|
401 |
-
"commit": "unknown"
|
402 |
-
},
|
403 |
-
"vits": {
|
404 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--de--thorsten--vits.zip",
|
405 |
-
"default_vocoder": null,
|
406 |
-
"author": "@thorstenMueller",
|
407 |
-
"license": "apache 2.0",
|
408 |
-
"commit": "unknown"
|
409 |
-
},
|
410 |
-
"tacotron2-DDC": {
|
411 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
|
412 |
-
"default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
|
413 |
-
"description": "Thorsten-Dec2021-22k-DDC",
|
414 |
-
"author": "@thorstenMueller",
|
415 |
-
"license": "apache 2.0",
|
416 |
-
"commit": "unknown"
|
417 |
-
}
|
418 |
-
},
|
419 |
-
"css10": {
|
420 |
-
"vits-neon": {
|
421 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
|
422 |
-
"default_vocoder": null,
|
423 |
-
"author": "@NeonGeckoCom",
|
424 |
-
"license": "bsd-3-clause",
|
425 |
-
"commit": null
|
426 |
-
}
|
427 |
-
}
|
428 |
-
},
|
429 |
-
"ja": {
|
430 |
-
"kokoro": {
|
431 |
-
"tacotron2-DDC": {
|
432 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
|
433 |
-
"default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
|
434 |
-
"description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
|
435 |
-
"author": "@kaiidams",
|
436 |
-
"license": "apache 2.0",
|
437 |
-
"commit": "401fbd89"
|
438 |
-
}
|
439 |
-
}
|
440 |
-
},
|
441 |
-
"tr": {
|
442 |
-
"common-voice": {
|
443 |
-
"glow-tts": {
|
444 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
|
445 |
-
"default_vocoder": "vocoder_models/tr/common-voice/hifigan",
|
446 |
-
"license": "MIT",
|
447 |
-
"description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
|
448 |
-
"author": "Fatih Akademi",
|
449 |
-
"commit": null
|
450 |
-
}
|
451 |
-
}
|
452 |
-
},
|
453 |
-
"it": {
|
454 |
-
"mai_female": {
|
455 |
-
"glow-tts": {
|
456 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
|
457 |
-
"default_vocoder": null,
|
458 |
-
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
459 |
-
"author": "@nicolalandro",
|
460 |
-
"license": "apache 2.0",
|
461 |
-
"commit": null
|
462 |
-
},
|
463 |
-
"vits": {
|
464 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
|
465 |
-
"default_vocoder": null,
|
466 |
-
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
467 |
-
"author": "@nicolalandro",
|
468 |
-
"license": "apache 2.0",
|
469 |
-
"commit": null
|
470 |
-
}
|
471 |
-
},
|
472 |
-
"mai_male": {
|
473 |
-
"glow-tts": {
|
474 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
|
475 |
-
"default_vocoder": null,
|
476 |
-
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
477 |
-
"author": "@nicolalandro",
|
478 |
-
"license": "apache 2.0",
|
479 |
-
"commit": null
|
480 |
-
},
|
481 |
-
"vits": {
|
482 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
|
483 |
-
"default_vocoder": null,
|
484 |
-
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
485 |
-
"author": "@nicolalandro",
|
486 |
-
"license": "apache 2.0",
|
487 |
-
"commit": null
|
488 |
-
}
|
489 |
-
}
|
490 |
-
},
|
491 |
-
"ewe": {
|
492 |
-
"openbible": {
|
493 |
-
"vits": {
|
494 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
|
495 |
-
"default_vocoder": null,
|
496 |
-
"license": "CC-BY-SA 4.0",
|
497 |
-
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
498 |
-
"author": "@coqui_ai",
|
499 |
-
"commit": "1b22f03"
|
500 |
-
}
|
501 |
-
}
|
502 |
-
},
|
503 |
-
"hau": {
|
504 |
-
"openbible": {
|
505 |
-
"vits": {
|
506 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
|
507 |
-
"default_vocoder": null,
|
508 |
-
"license": "CC-BY-SA 4.0",
|
509 |
-
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
510 |
-
"author": "@coqui_ai",
|
511 |
-
"commit": "1b22f03"
|
512 |
-
}
|
513 |
-
}
|
514 |
-
},
|
515 |
-
"lin": {
|
516 |
-
"openbible": {
|
517 |
-
"vits": {
|
518 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
|
519 |
-
"default_vocoder": null,
|
520 |
-
"license": "CC-BY-SA 4.0",
|
521 |
-
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
522 |
-
"author": "@coqui_ai",
|
523 |
-
"commit": "1b22f03"
|
524 |
-
}
|
525 |
-
}
|
526 |
-
},
|
527 |
-
"tw_akuapem": {
|
528 |
-
"openbible": {
|
529 |
-
"vits": {
|
530 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
|
531 |
-
"default_vocoder": null,
|
532 |
-
"license": "CC-BY-SA 4.0",
|
533 |
-
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
534 |
-
"author": "@coqui_ai",
|
535 |
-
"commit": "1b22f03"
|
536 |
-
}
|
537 |
-
}
|
538 |
-
},
|
539 |
-
"tw_asante": {
|
540 |
-
"openbible": {
|
541 |
-
"vits": {
|
542 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
|
543 |
-
"default_vocoder": null,
|
544 |
-
"license": "CC-BY-SA 4.0",
|
545 |
-
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
546 |
-
"author": "@coqui_ai",
|
547 |
-
"commit": "1b22f03"
|
548 |
-
}
|
549 |
-
}
|
550 |
-
},
|
551 |
-
"yor": {
|
552 |
-
"openbible": {
|
553 |
-
"vits": {
|
554 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
|
555 |
-
"default_vocoder": null,
|
556 |
-
"license": "CC-BY-SA 4.0",
|
557 |
-
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
558 |
-
"author": "@coqui_ai",
|
559 |
-
"commit": "1b22f03"
|
560 |
-
}
|
561 |
-
}
|
562 |
-
},
|
563 |
-
"hu": {
|
564 |
-
"css10": {
|
565 |
-
"vits": {
|
566 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
|
567 |
-
"default_vocoder": null,
|
568 |
-
"commit": null,
|
569 |
-
"author": "@NeonGeckoCom",
|
570 |
-
"license": "bsd-3-clause"
|
571 |
-
}
|
572 |
-
}
|
573 |
-
},
|
574 |
-
"el": {
|
575 |
-
"cv": {
|
576 |
-
"vits": {
|
577 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
|
578 |
-
"default_vocoder": null,
|
579 |
-
"commit": null,
|
580 |
-
"author": "@NeonGeckoCom",
|
581 |
-
"license": "bsd-3-clause"
|
582 |
-
}
|
583 |
-
}
|
584 |
-
},
|
585 |
-
"fi": {
|
586 |
-
"css10": {
|
587 |
-
"vits": {
|
588 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
|
589 |
-
"default_vocoder": null,
|
590 |
-
"commit": null,
|
591 |
-
"author": "@NeonGeckoCom",
|
592 |
-
"license": "bsd-3-clause"
|
593 |
-
}
|
594 |
-
}
|
595 |
-
},
|
596 |
-
"hr": {
|
597 |
-
"cv": {
|
598 |
-
"vits": {
|
599 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
|
600 |
-
"default_vocoder": null,
|
601 |
-
"commit": null,
|
602 |
-
"author": "@NeonGeckoCom",
|
603 |
-
"license": "bsd-3-clause"
|
604 |
-
}
|
605 |
-
}
|
606 |
-
},
|
607 |
-
"lt": {
|
608 |
-
"cv": {
|
609 |
-
"vits": {
|
610 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
|
611 |
-
"default_vocoder": null,
|
612 |
-
"commit": null,
|
613 |
-
"author": "@NeonGeckoCom",
|
614 |
-
"license": "bsd-3-clause"
|
615 |
-
}
|
616 |
-
}
|
617 |
-
},
|
618 |
-
"lv": {
|
619 |
-
"cv": {
|
620 |
-
"vits": {
|
621 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
|
622 |
-
"default_vocoder": null,
|
623 |
-
"commit": null,
|
624 |
-
"author": "@NeonGeckoCom",
|
625 |
-
"license": "bsd-3-clause"
|
626 |
-
}
|
627 |
-
}
|
628 |
-
},
|
629 |
-
"mt": {
|
630 |
-
"cv": {
|
631 |
-
"vits": {
|
632 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
|
633 |
-
"default_vocoder": null,
|
634 |
-
"commit": null,
|
635 |
-
"author": "@NeonGeckoCom",
|
636 |
-
"license": "bsd-3-clause"
|
637 |
-
}
|
638 |
-
}
|
639 |
-
},
|
640 |
-
"pl": {
|
641 |
-
"mai_female": {
|
642 |
-
"vits": {
|
643 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
|
644 |
-
"default_vocoder": null,
|
645 |
-
"commit": null,
|
646 |
-
"author": "@NeonGeckoCom",
|
647 |
-
"license": "bsd-3-clause"
|
648 |
-
}
|
649 |
-
}
|
650 |
-
},
|
651 |
-
"pt": {
|
652 |
-
"cv": {
|
653 |
-
"vits": {
|
654 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
|
655 |
-
"default_vocoder": null,
|
656 |
-
"commit": null,
|
657 |
-
"author": "@NeonGeckoCom",
|
658 |
-
"license": "bsd-3-clause"
|
659 |
-
}
|
660 |
-
}
|
661 |
-
},
|
662 |
-
"ro": {
|
663 |
-
"cv": {
|
664 |
-
"vits": {
|
665 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
|
666 |
-
"default_vocoder": null,
|
667 |
-
"commit": null,
|
668 |
-
"author": "@NeonGeckoCom",
|
669 |
-
"license": "bsd-3-clause"
|
670 |
-
}
|
671 |
-
}
|
672 |
-
},
|
673 |
-
"sk": {
|
674 |
-
"cv": {
|
675 |
-
"vits": {
|
676 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
|
677 |
-
"default_vocoder": null,
|
678 |
-
"commit": null,
|
679 |
-
"author": "@NeonGeckoCom",
|
680 |
-
"license": "bsd-3-clause"
|
681 |
-
}
|
682 |
-
}
|
683 |
-
},
|
684 |
-
"sl": {
|
685 |
-
"cv": {
|
686 |
-
"vits": {
|
687 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
|
688 |
-
"default_vocoder": null,
|
689 |
-
"commit": null,
|
690 |
-
"author": "@NeonGeckoCom",
|
691 |
-
"license": "bsd-3-clause"
|
692 |
-
}
|
693 |
-
}
|
694 |
-
},
|
695 |
-
"sv": {
|
696 |
-
"cv": {
|
697 |
-
"vits": {
|
698 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
|
699 |
-
"default_vocoder": null,
|
700 |
-
"commit": null,
|
701 |
-
"author": "@NeonGeckoCom",
|
702 |
-
"license": "bsd-3-clause"
|
703 |
-
}
|
704 |
-
}
|
705 |
-
},
|
706 |
-
"ca": {
|
707 |
-
"custom": {
|
708 |
-
"vits": {
|
709 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
|
710 |
-
"default_vocoder": null,
|
711 |
-
"commit": null,
|
712 |
-
"description": " It is trained from zero with 101460 utterances consisting of 257 speakers, approx 138 hours of speech. We used three datasets;\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
|
713 |
-
"author": "@gullabi",
|
714 |
-
"license": "CC-BY-4.0"
|
715 |
-
}
|
716 |
-
}
|
717 |
-
},
|
718 |
-
"fa": {
|
719 |
-
"custom": {
|
720 |
-
"glow-tts": {
|
721 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
|
722 |
-
"default_vocoder": null,
|
723 |
-
"commit": null,
|
724 |
-
"description": "persian-tts-female-glow_tts model for text to speech purposes. Single-speaker female voice Trained on persian-tts-dataset-famale. \nThis model has no compatible vocoder thus the output quality is not very good. \nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
|
725 |
-
"author": "@karim23657",
|
726 |
-
"license": "CC-BY-4.0"
|
727 |
-
}
|
728 |
-
}
|
729 |
-
},
|
730 |
-
"bn": {
|
731 |
-
"custom": {
|
732 |
-
"vits-male": {
|
733 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
|
734 |
-
"default_vocoder": null,
|
735 |
-
"commit": null,
|
736 |
-
"description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
|
737 |
-
"author": "@mobassir94",
|
738 |
-
"license": "Apache 2.0"
|
739 |
-
},
|
740 |
-
"vits-female": {
|
741 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
|
742 |
-
"default_vocoder": null,
|
743 |
-
"commit": null,
|
744 |
-
"description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
|
745 |
-
"author": "@mobassir94",
|
746 |
-
"license": "Apache 2.0"
|
747 |
-
}
|
748 |
-
}
|
749 |
-
},
|
750 |
-
"be": {
|
751 |
-
"common-voice": {
|
752 |
-
"glow-tts":{
|
753 |
-
"description": "Belarusian GlowTTS model created by @alex73 (Github).",
|
754 |
-
"github_rls_url":"https://coqui.gateway.scarf.sh/v0.16.6/tts_models--be--common-voice--glow-tts.zip",
|
755 |
-
"default_vocoder": "vocoder_models/be/common-voice/hifigan",
|
756 |
-
"commit": "c0aabb85",
|
757 |
-
"license": "CC-BY-SA 4.0",
|
758 |
-
"contact": "[email protected]"
|
759 |
-
}
|
760 |
-
}
|
761 |
-
}
|
762 |
-
},
|
763 |
-
"vocoder_models": {
|
764 |
-
"universal": {
|
765 |
-
"libri-tts": {
|
766 |
-
"wavegrad": {
|
767 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
|
768 |
-
"commit": "ea976b0",
|
769 |
-
"author": "Eren Gölge @erogol",
|
770 |
-
"license": "MPL",
|
771 |
-
"contact": "[email protected]"
|
772 |
-
},
|
773 |
-
"fullband-melgan": {
|
774 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
|
775 |
-
"commit": "4132240",
|
776 |
-
"author": "Eren Gölge @erogol",
|
777 |
-
"license": "MPL",
|
778 |
-
"contact": "[email protected]"
|
779 |
-
}
|
780 |
-
}
|
781 |
-
},
|
782 |
-
"en": {
|
783 |
-
"ek1": {
|
784 |
-
"wavegrad": {
|
785 |
-
"description": "EK1 en-rp wavegrad by NMStoker",
|
786 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
|
787 |
-
"commit": "c802255",
|
788 |
-
"license": "apache 2.0"
|
789 |
-
}
|
790 |
-
},
|
791 |
-
"ljspeech": {
|
792 |
-
"multiband-melgan": {
|
793 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
|
794 |
-
"commit": "ea976b0",
|
795 |
-
"author": "Eren Gölge @erogol",
|
796 |
-
"license": "MPL",
|
797 |
-
"contact": "[email protected]"
|
798 |
-
},
|
799 |
-
"hifigan_v2": {
|
800 |
-
"description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
|
801 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
|
802 |
-
"commit": "bae2ad0f",
|
803 |
-
"author": "@erogol",
|
804 |
-
"license": "apache 2.0",
|
805 |
-
"contact": "[email protected]"
|
806 |
-
},
|
807 |
-
"univnet": {
|
808 |
-
"description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
|
809 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
|
810 |
-
"commit": "4581e3d",
|
811 |
-
"author": "Eren @erogol",
|
812 |
-
"license": "apache 2.0",
|
813 |
-
"contact": "[email protected]"
|
814 |
-
}
|
815 |
-
},
|
816 |
-
"blizzard2013": {
|
817 |
-
"hifigan_v2": {
|
818 |
-
"description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
|
819 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
|
820 |
-
"commit": "d6284e7",
|
821 |
-
"author": "Adam Froghyar @a-froghyar",
|
822 |
-
"license": "apache 2.0",
|
823 |
-
"contact": "[email protected]"
|
824 |
-
}
|
825 |
-
},
|
826 |
-
"vctk": {
|
827 |
-
"hifigan_v2": {
|
828 |
-
"description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
|
829 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
|
830 |
-
"commit": "2f07160",
|
831 |
-
"author": "Edresson Casanova",
|
832 |
-
"license": "apache 2.0",
|
833 |
-
"contact": ""
|
834 |
-
}
|
835 |
-
},
|
836 |
-
"sam": {
|
837 |
-
"hifigan_v2": {
|
838 |
-
"description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
|
839 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
|
840 |
-
"commit": "2f07160",
|
841 |
-
"author": "Eren Gölge @erogol",
|
842 |
-
"license": "apache 2.0",
|
843 |
-
"contact": "[email protected]"
|
844 |
-
}
|
845 |
-
}
|
846 |
-
},
|
847 |
-
"nl": {
|
848 |
-
"mai": {
|
849 |
-
"parallel-wavegan": {
|
850 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
|
851 |
-
"author": "@r-dh",
|
852 |
-
"license": "apache 2.0",
|
853 |
-
"commit": "unknown"
|
854 |
-
}
|
855 |
-
}
|
856 |
-
},
|
857 |
-
"de": {
|
858 |
-
"thorsten": {
|
859 |
-
"wavegrad": {
|
860 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
|
861 |
-
"author": "@thorstenMueller",
|
862 |
-
"license": "apache 2.0",
|
863 |
-
"commit": "unknown"
|
864 |
-
},
|
865 |
-
"fullband-melgan": {
|
866 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
|
867 |
-
"author": "@thorstenMueller",
|
868 |
-
"license": "apache 2.0",
|
869 |
-
"commit": "unknown"
|
870 |
-
},
|
871 |
-
"hifigan_v1": {
|
872 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
|
873 |
-
"description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
|
874 |
-
"author": "@thorstenMueller",
|
875 |
-
"license": "apache 2.0",
|
876 |
-
"commit": "unknown"
|
877 |
-
}
|
878 |
-
}
|
879 |
-
},
|
880 |
-
"ja": {
|
881 |
-
"kokoro": {
|
882 |
-
"hifigan_v1": {
|
883 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
|
884 |
-
"description": "HifiGAN model trained for kokoro dataset by @kaiidams",
|
885 |
-
"author": "@kaiidams",
|
886 |
-
"license": "apache 2.0",
|
887 |
-
"commit": "3900448"
|
888 |
-
}
|
889 |
-
}
|
890 |
-
},
|
891 |
-
"uk": {
|
892 |
-
"mai": {
|
893 |
-
"multiband-melgan": {
|
894 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
|
895 |
-
"author": "@robinhad",
|
896 |
-
"commit": "bdab788d",
|
897 |
-
"license": "MIT",
|
898 |
-
"contact": ""
|
899 |
-
}
|
900 |
-
}
|
901 |
-
},
|
902 |
-
"tr": {
|
903 |
-
"common-voice": {
|
904 |
-
"hifigan": {
|
905 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
|
906 |
-
"description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
|
907 |
-
"author": "Fatih Akademi",
|
908 |
-
"license": "MIT",
|
909 |
-
"commit": null
|
910 |
-
}
|
911 |
-
}
|
912 |
-
},
|
913 |
-
"be": {
|
914 |
-
"common-voice": {
|
915 |
-
"hifigan": {
|
916 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.16.6/vocoder_models--be--common-voice--hifigan.zip",
|
917 |
-
"description": "Belarusian HiFiGAN model created by @alex73 (Github).",
|
918 |
-
"author": "@alex73",
|
919 |
-
"license": "CC-BY-SA 4.0",
|
920 |
-
"commit": "c0aabb85"
|
921 |
-
}
|
922 |
-
}
|
923 |
-
}
|
924 |
-
},
|
925 |
-
"voice_conversion_models": {
|
926 |
-
"multilingual": {
|
927 |
-
"vctk": {
|
928 |
-
"freevc24": {
|
929 |
-
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
|
930 |
-
"description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
|
931 |
-
"author": "Jing-Yi Li @OlaWod",
|
932 |
-
"license": "MIT",
|
933 |
-
"commit": null
|
934 |
-
}
|
935 |
-
}
|
936 |
-
}
|
937 |
-
}
|
938 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/VERSION
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
0.22.0
|
|
|
|
TTS/__init__.py
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
|
3 |
-
with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
|
4 |
-
version = f.read().strip()
|
5 |
-
|
6 |
-
__version__ = version
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (375 Bytes)
|
|
TTS/__pycache__/model.cpython-310.pyc
DELETED
Binary file (2.6 kB)
|
|
TTS/api.py
DELETED
@@ -1,458 +0,0 @@
|
|
1 |
-
import tempfile
|
2 |
-
import warnings
|
3 |
-
from pathlib import Path
|
4 |
-
from typing import Union
|
5 |
-
|
6 |
-
import numpy as np
|
7 |
-
from torch import nn
|
8 |
-
|
9 |
-
from TTS.utils.audio.numpy_transforms import save_wav
|
10 |
-
from TTS.utils.manage import ModelManager
|
11 |
-
from TTS.utils.synthesizer import Synthesizer
|
12 |
-
from TTS.config import load_config
|
13 |
-
|
14 |
-
|
15 |
-
class TTS(nn.Module):
|
16 |
-
"""TODO: Add voice conversion and Capacitron support."""
|
17 |
-
|
18 |
-
def __init__(
|
19 |
-
self,
|
20 |
-
model_name: str = "",
|
21 |
-
model_path: str = None,
|
22 |
-
config_path: str = None,
|
23 |
-
vocoder_path: str = None,
|
24 |
-
vocoder_config_path: str = None,
|
25 |
-
progress_bar: bool = True,
|
26 |
-
gpu=False,
|
27 |
-
):
|
28 |
-
"""🐸TTS python interface that allows to load and use the released models.
|
29 |
-
|
30 |
-
Example with a multi-speaker model:
|
31 |
-
>>> from TTS.api import TTS
|
32 |
-
>>> tts = TTS(TTS.list_models()[0])
|
33 |
-
>>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
|
34 |
-
>>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
|
35 |
-
|
36 |
-
Example with a single-speaker model:
|
37 |
-
>>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
|
38 |
-
>>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
|
39 |
-
|
40 |
-
Example loading a model from a path:
|
41 |
-
>>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False)
|
42 |
-
>>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
|
43 |
-
|
44 |
-
Example voice cloning with YourTTS in English, French and Portuguese:
|
45 |
-
>>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
|
46 |
-
>>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav")
|
47 |
-
>>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
|
48 |
-
>>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")
|
49 |
-
|
50 |
-
Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html):
|
51 |
-
>>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True)
|
52 |
-
>>> tts.tts_to_file("This is a test.", file_path="output.wav")
|
53 |
-
|
54 |
-
Args:
|
55 |
-
model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
|
56 |
-
model_path (str, optional): Path to the model checkpoint. Defaults to None.
|
57 |
-
config_path (str, optional): Path to the model config. Defaults to None.
|
58 |
-
vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
|
59 |
-
vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
|
60 |
-
progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True.
|
61 |
-
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
|
62 |
-
"""
|
63 |
-
super().__init__()
|
64 |
-
self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
|
65 |
-
self.config = load_config(config_path) if config_path else None
|
66 |
-
self.synthesizer = None
|
67 |
-
self.voice_converter = None
|
68 |
-
self.model_name = ""
|
69 |
-
if gpu:
|
70 |
-
warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")
|
71 |
-
|
72 |
-
if model_name is not None and len(model_name) > 0:
|
73 |
-
if "tts_models" in model_name:
|
74 |
-
self.load_tts_model_by_name(model_name, gpu)
|
75 |
-
elif "voice_conversion_models" in model_name:
|
76 |
-
self.load_vc_model_by_name(model_name, gpu)
|
77 |
-
else:
|
78 |
-
self.load_model_by_name(model_name, gpu)
|
79 |
-
|
80 |
-
if model_path:
|
81 |
-
self.load_tts_model_by_path(
|
82 |
-
model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
|
83 |
-
)
|
84 |
-
|
85 |
-
@property
|
86 |
-
def models(self):
|
87 |
-
return self.manager.list_tts_models()
|
88 |
-
|
89 |
-
@property
|
90 |
-
def is_multi_speaker(self):
|
91 |
-
if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
|
92 |
-
return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
|
93 |
-
return False
|
94 |
-
|
95 |
-
@property
|
96 |
-
def is_multi_lingual(self):
|
97 |
-
# Not sure what sets this to None, but applied a fix to prevent crashing.
|
98 |
-
if (
|
99 |
-
isinstance(self.model_name, str)
|
100 |
-
and "xtts" in self.model_name
|
101 |
-
or self.config
|
102 |
-
and ("xtts" in self.config.model or len(self.config.languages) > 1)
|
103 |
-
):
|
104 |
-
return True
|
105 |
-
if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
|
106 |
-
return self.synthesizer.tts_model.language_manager.num_languages > 1
|
107 |
-
return False
|
108 |
-
|
109 |
-
@property
|
110 |
-
def speakers(self):
|
111 |
-
if not self.is_multi_speaker:
|
112 |
-
return None
|
113 |
-
return self.synthesizer.tts_model.speaker_manager.speaker_names
|
114 |
-
|
115 |
-
@property
|
116 |
-
def languages(self):
|
117 |
-
if not self.is_multi_lingual:
|
118 |
-
return None
|
119 |
-
return self.synthesizer.tts_model.language_manager.language_names
|
120 |
-
|
121 |
-
@staticmethod
|
122 |
-
def get_models_file_path():
|
123 |
-
return Path(__file__).parent / ".models.json"
|
124 |
-
|
125 |
-
def list_models(self):
|
126 |
-
return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
|
127 |
-
|
128 |
-
def download_model_by_name(self, model_name: str):
|
129 |
-
model_path, config_path, model_item = self.manager.download_model(model_name)
|
130 |
-
if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
|
131 |
-
# return model directory if there are multiple files
|
132 |
-
# we assume that the model knows how to load itself
|
133 |
-
return None, None, None, None, model_path
|
134 |
-
if model_item.get("default_vocoder") is None:
|
135 |
-
return model_path, config_path, None, None, None
|
136 |
-
vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
|
137 |
-
return model_path, config_path, vocoder_path, vocoder_config_path, None
|
138 |
-
|
139 |
-
def load_model_by_name(self, model_name: str, gpu: bool = False):
|
140 |
-
"""Load one of the 🐸TTS models by name.
|
141 |
-
|
142 |
-
Args:
|
143 |
-
model_name (str): Model name to load. You can list models by ```tts.models```.
|
144 |
-
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
|
145 |
-
"""
|
146 |
-
self.load_tts_model_by_name(model_name, gpu)
|
147 |
-
|
148 |
-
def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
|
149 |
-
"""Load one of the voice conversion models by name.
|
150 |
-
|
151 |
-
Args:
|
152 |
-
model_name (str): Model name to load. You can list models by ```tts.models```.
|
153 |
-
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
|
154 |
-
"""
|
155 |
-
self.model_name = model_name
|
156 |
-
model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
|
157 |
-
self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)
|
158 |
-
|
159 |
-
def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
|
160 |
-
"""Load one of 🐸TTS models by name.
|
161 |
-
|
162 |
-
Args:
|
163 |
-
model_name (str): Model name to load. You can list models by ```tts.models```.
|
164 |
-
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
|
165 |
-
|
166 |
-
TODO: Add tests
|
167 |
-
"""
|
168 |
-
self.synthesizer = None
|
169 |
-
self.model_name = model_name
|
170 |
-
|
171 |
-
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
|
172 |
-
model_name
|
173 |
-
)
|
174 |
-
|
175 |
-
# init synthesizer
|
176 |
-
# None values are fetch from the model
|
177 |
-
self.synthesizer = Synthesizer(
|
178 |
-
tts_checkpoint=model_path,
|
179 |
-
tts_config_path=config_path,
|
180 |
-
tts_speakers_file=None,
|
181 |
-
tts_languages_file=None,
|
182 |
-
vocoder_checkpoint=vocoder_path,
|
183 |
-
vocoder_config=vocoder_config_path,
|
184 |
-
encoder_checkpoint=None,
|
185 |
-
encoder_config=None,
|
186 |
-
model_dir=model_dir,
|
187 |
-
use_cuda=gpu,
|
188 |
-
)
|
189 |
-
|
190 |
-
def load_tts_model_by_path(
|
191 |
-
self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
|
192 |
-
):
|
193 |
-
"""Load a model from a path.
|
194 |
-
|
195 |
-
Args:
|
196 |
-
model_path (str): Path to the model checkpoint.
|
197 |
-
config_path (str): Path to the model config.
|
198 |
-
vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
|
199 |
-
vocoder_config (str, optional): Path to the vocoder config. Defaults to None.
|
200 |
-
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
|
201 |
-
"""
|
202 |
-
|
203 |
-
self.synthesizer = Synthesizer(
|
204 |
-
tts_checkpoint=model_path,
|
205 |
-
tts_config_path=config_path,
|
206 |
-
tts_speakers_file=None,
|
207 |
-
tts_languages_file=None,
|
208 |
-
vocoder_checkpoint=vocoder_path,
|
209 |
-
vocoder_config=vocoder_config,
|
210 |
-
encoder_checkpoint=None,
|
211 |
-
encoder_config=None,
|
212 |
-
use_cuda=gpu,
|
213 |
-
)
|
214 |
-
|
215 |
-
def _check_arguments(
|
216 |
-
self,
|
217 |
-
speaker: str = None,
|
218 |
-
language: str = None,
|
219 |
-
speaker_wav: str = None,
|
220 |
-
emotion: str = None,
|
221 |
-
speed: float = None,
|
222 |
-
**kwargs,
|
223 |
-
) -> None:
|
224 |
-
"""Check if the arguments are valid for the model."""
|
225 |
-
# check for the coqui tts models
|
226 |
-
if self.is_multi_speaker and (speaker is None and speaker_wav is None):
|
227 |
-
raise ValueError("Model is multi-speaker but no `speaker` is provided.")
|
228 |
-
if self.is_multi_lingual and language is None:
|
229 |
-
raise ValueError("Model is multi-lingual but no `language` is provided.")
|
230 |
-
if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
|
231 |
-
raise ValueError("Model is not multi-speaker but `speaker` is provided.")
|
232 |
-
if not self.is_multi_lingual and language is not None:
|
233 |
-
raise ValueError("Model is not multi-lingual but `language` is provided.")
|
234 |
-
if not emotion is None and not speed is None:
|
235 |
-
raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.")
|
236 |
-
|
237 |
-
def tts(
|
238 |
-
self,
|
239 |
-
text: str,
|
240 |
-
speaker: str = None,
|
241 |
-
language: str = None,
|
242 |
-
speaker_wav: str = None,
|
243 |
-
emotion: str = None,
|
244 |
-
speed: float = None,
|
245 |
-
split_sentences: bool = True,
|
246 |
-
**kwargs,
|
247 |
-
):
|
248 |
-
"""Convert text to speech.
|
249 |
-
|
250 |
-
Args:
|
251 |
-
text (str):
|
252 |
-
Input text to synthesize.
|
253 |
-
speaker (str, optional):
|
254 |
-
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
|
255 |
-
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
|
256 |
-
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
|
257 |
-
supported by `XTTS` model.
|
258 |
-
speaker_wav (str, optional):
|
259 |
-
Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
|
260 |
-
Defaults to None.
|
261 |
-
emotion (str, optional):
|
262 |
-
Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
|
263 |
-
speed (float, optional):
|
264 |
-
Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
|
265 |
-
Defaults to None.
|
266 |
-
split_sentences (bool, optional):
|
267 |
-
Split text into sentences, synthesize them separately and concatenate the file audio.
|
268 |
-
Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
|
269 |
-
applicable to the 🐸TTS models. Defaults to True.
|
270 |
-
kwargs (dict, optional):
|
271 |
-
Additional arguments for the model.
|
272 |
-
"""
|
273 |
-
self._check_arguments(
|
274 |
-
speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
|
275 |
-
)
|
276 |
-
wav = self.synthesizer.tts(
|
277 |
-
text=text,
|
278 |
-
speaker_name=speaker,
|
279 |
-
language_name=language,
|
280 |
-
speaker_wav=speaker_wav,
|
281 |
-
reference_wav=None,
|
282 |
-
style_wav=None,
|
283 |
-
style_text=None,
|
284 |
-
reference_speaker_name=None,
|
285 |
-
split_sentences=split_sentences,
|
286 |
-
**kwargs,
|
287 |
-
)
|
288 |
-
return wav
|
289 |
-
|
290 |
-
def tts_to_file(
|
291 |
-
self,
|
292 |
-
text: str,
|
293 |
-
speaker: str = None,
|
294 |
-
language: str = None,
|
295 |
-
speaker_wav: str = None,
|
296 |
-
emotion: str = None,
|
297 |
-
speed: float = 1.0,
|
298 |
-
pipe_out=None,
|
299 |
-
file_path: str = "output.wav",
|
300 |
-
split_sentences: bool = True,
|
301 |
-
**kwargs,
|
302 |
-
):
|
303 |
-
"""Convert text to speech.
|
304 |
-
|
305 |
-
Args:
|
306 |
-
text (str):
|
307 |
-
Input text to synthesize.
|
308 |
-
speaker (str, optional):
|
309 |
-
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
|
310 |
-
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
|
311 |
-
language (str, optional):
|
312 |
-
Language code for multi-lingual models. You can check whether loaded model is multi-lingual
|
313 |
-
`tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
|
314 |
-
speaker_wav (str, optional):
|
315 |
-
Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
|
316 |
-
Defaults to None.
|
317 |
-
emotion (str, optional):
|
318 |
-
Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
|
319 |
-
speed (float, optional):
|
320 |
-
Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
|
321 |
-
pipe_out (BytesIO, optional):
|
322 |
-
Flag to stdout the generated TTS wav file for shell pipe.
|
323 |
-
file_path (str, optional):
|
324 |
-
Output file path. Defaults to "output.wav".
|
325 |
-
split_sentences (bool, optional):
|
326 |
-
Split text into sentences, synthesize them separately and concatenate the file audio.
|
327 |
-
Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
|
328 |
-
applicable to the 🐸TTS models. Defaults to True.
|
329 |
-
kwargs (dict, optional):
|
330 |
-
Additional arguments for the model.
|
331 |
-
"""
|
332 |
-
self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
|
333 |
-
|
334 |
-
wav = self.tts(
|
335 |
-
text=text,
|
336 |
-
speaker=speaker,
|
337 |
-
language=language,
|
338 |
-
speaker_wav=speaker_wav,
|
339 |
-
split_sentences=split_sentences,
|
340 |
-
**kwargs,
|
341 |
-
)
|
342 |
-
self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
|
343 |
-
return file_path
|
344 |
-
|
345 |
-
def voice_conversion(
|
346 |
-
self,
|
347 |
-
source_wav: str,
|
348 |
-
target_wav: str,
|
349 |
-
):
|
350 |
-
"""Voice conversion with FreeVC. Convert source wav to target speaker.
|
351 |
-
|
352 |
-
Args:``
|
353 |
-
source_wav (str):
|
354 |
-
Path to the source wav file.
|
355 |
-
target_wav (str):`
|
356 |
-
Path to the target wav file.
|
357 |
-
"""
|
358 |
-
wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)
|
359 |
-
return wav
|
360 |
-
|
361 |
-
def voice_conversion_to_file(
|
362 |
-
self,
|
363 |
-
source_wav: str,
|
364 |
-
target_wav: str,
|
365 |
-
file_path: str = "output.wav",
|
366 |
-
):
|
367 |
-
"""Voice conversion with FreeVC. Convert source wav to target speaker.
|
368 |
-
|
369 |
-
Args:
|
370 |
-
source_wav (str):
|
371 |
-
Path to the source wav file.
|
372 |
-
target_wav (str):
|
373 |
-
Path to the target wav file.
|
374 |
-
file_path (str, optional):
|
375 |
-
Output file path. Defaults to "output.wav".
|
376 |
-
"""
|
377 |
-
wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav)
|
378 |
-
save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
|
379 |
-
return file_path
|
380 |
-
|
381 |
-
def tts_with_vc(
|
382 |
-
self,
|
383 |
-
text: str,
|
384 |
-
language: str = None,
|
385 |
-
speaker_wav: str = None,
|
386 |
-
speaker: str = None,
|
387 |
-
split_sentences: bool = True,
|
388 |
-
):
|
389 |
-
"""Convert text to speech with voice conversion.
|
390 |
-
|
391 |
-
It combines tts with voice conversion to fake voice cloning.
|
392 |
-
|
393 |
-
- Convert text to speech with tts.
|
394 |
-
- Convert the output wav to target speaker with voice conversion.
|
395 |
-
|
396 |
-
Args:
|
397 |
-
text (str):
|
398 |
-
Input text to synthesize.
|
399 |
-
language (str, optional):
|
400 |
-
Language code for multi-lingual models. You can check whether loaded model is multi-lingual
|
401 |
-
`tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
|
402 |
-
speaker_wav (str, optional):
|
403 |
-
Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
|
404 |
-
Defaults to None.
|
405 |
-
speaker (str, optional):
|
406 |
-
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
|
407 |
-
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
|
408 |
-
split_sentences (bool, optional):
|
409 |
-
Split text into sentences, synthesize them separately and concatenate the file audio.
|
410 |
-
Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
|
411 |
-
applicable to the 🐸TTS models. Defaults to True.
|
412 |
-
"""
|
413 |
-
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
|
414 |
-
# Lazy code... save it to a temp file to resample it while reading it for VC
|
415 |
-
self.tts_to_file(
|
416 |
-
text=text, speaker=speaker, language=language, file_path=fp.name, split_sentences=split_sentences
|
417 |
-
)
|
418 |
-
if self.voice_converter is None:
|
419 |
-
self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
|
420 |
-
wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
|
421 |
-
return wav
|
422 |
-
|
423 |
-
def tts_with_vc_to_file(
    self,
    text: str,
    language: str = None,
    speaker_wav: str = None,
    file_path: str = "output.wav",
    speaker: str = None,
    split_sentences: bool = True,
):
    """Synthesize `text`, convert it to the reference voice and write the audio to `file_path`.

    The synthesis and voice conversion are delegated to `tts_with_vc`; this method only
    saves the returned waveform.

    Args:
        text (str):
            Input text to synthesize.
        language (str, optional):
            Language code for multi-lingual models. You can check whether loaded model is multi-lingual
            `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
        speaker_wav (str, optional):
            Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
            Defaults to None.
        file_path (str, optional):
            Output file path. Defaults to "output.wav".
        speaker (str, optional):
            Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
            `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
        split_sentences (bool, optional):
            Split text into sentences, synthesize them separately and concatenate the file audio.
            Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
            applicable to the 🐸TTS models. Defaults to True.
    """
    synthesis_options = {
        "text": text,
        "language": language,
        "speaker_wav": speaker_wav,
        "speaker": speaker,
        "split_sentences": split_sentences,
    }
    converted_wav = self.tts_with_vc(**synthesis_options)
    # Read the sample rate only after `tts_with_vc` has run: that call loads the voice converter
    # when `self.voice_converter` is still None.
    output_rate = self.voice_converter.vc_config.audio.output_sample_rate
    save_wav(wav=converted_wav, path=file_path, sample_rate=output_rate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/bin/__init__.py
DELETED
File without changes
|
TTS/bin/collect_env_info.py
DELETED
@@ -1,48 +0,0 @@
|
|
1 |
-
"""Get detailed info about the working environment."""
|
2 |
-
import os
|
3 |
-
import platform
|
4 |
-
import sys
|
5 |
-
|
6 |
-
import numpy
|
7 |
-
import torch
|
8 |
-
|
9 |
-
sys.path += [os.path.abspath(".."), os.path.abspath(".")]
|
10 |
-
import json
|
11 |
-
|
12 |
-
import TTS
|
13 |
-
|
14 |
-
|
15 |
-
def system_info():
    """Return a dict describing the host: OS name, architecture, OS version, processor and Python version."""
    # (report key, zero-argument `platform` probe) pairs, queried in this order.
    probes = (
        ("OS", platform.system),
        ("architecture", platform.architecture),
        ("version", platform.version),
        ("processor", platform.processor),
        ("python", platform.python_version),
    )
    return {label: probe() for label, probe in probes}
|
23 |
-
|
24 |
-
|
25 |
-
def cuda_info():
    """Return a dict with the names of all CUDA devices, CUDA availability and the CUDA version torch reports."""
    device_names = []
    for device_index in range(torch.cuda.device_count()):
        device_names.append(torch.cuda.get_device_name(device_index))

    report = {"GPU": device_names}
    report["available"] = torch.cuda.is_available()
    report["version"] = torch.version.cuda
    return report
|
31 |
-
|
32 |
-
|
33 |
-
def package_info():
    """Return a dict with the numpy, PyTorch and TTS versions plus the PyTorch debug-build flag."""
    versions = {}
    versions["numpy"] = numpy.__version__
    versions["PyTorch_version"] = torch.__version__
    versions["PyTorch_debug"] = torch.version.debug
    versions["TTS"] = TTS.__version__
    return versions
|
40 |
-
|
41 |
-
|
42 |
-
def main():
    """Collect the system, CUDA and package reports and print them as indented, key-sorted JSON."""
    report = {}
    report["System"] = system_info()
    report["CUDA"] = cuda_info()
    report["Packages"] = package_info()
    print(json.dumps(report, indent=4, sort_keys=True))


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/bin/compute_attention_masks.py
DELETED
@@ -1,165 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
import importlib
|
3 |
-
import os
|
4 |
-
from argparse import RawTextHelpFormatter
|
5 |
-
|
6 |
-
import numpy as np
|
7 |
-
import torch
|
8 |
-
from torch.utils.data import DataLoader
|
9 |
-
from tqdm import tqdm
|
10 |
-
|
11 |
-
from TTS.config import load_config
|
12 |
-
from TTS.tts.datasets.TTSDataset import TTSDataset
|
13 |
-
from TTS.tts.models import setup_model
|
14 |
-
from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
|
15 |
-
from TTS.utils.audio import AudioProcessor
|
16 |
-
from TTS.utils.io import load_checkpoint
|
17 |
-
|
18 |
-
if __name__ == "__main__":
    # Script entry point: run a trained Tacotron/Tacotron2 checkpoint over a dataset, save the
    # attention alignment of every utterance as "<wav name>_attn.npy" beside its wav file, and
    # write a metafile that pairs each wav path with its alignment path.

    def _str2bool(value):
        """Parse an explicit true/false command-line value.

        `type=bool` must not be used with argparse: `bool("False")` is True, so every
        non-empty value would enable the option.
        """
        lowered = value.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        if lowered in ("false", "f", "no", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Extract attention masks from trained Tacotron/Tacotron2 models.
These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
        """Each attention mask is written next to the input wav file with an "_attn.npy" suffix.
(e.g. path/bla.wav (wav file) --> path/bla_attn.npy (attention mask))\n"""
        """
Example run:
    CUDA_VISIBLE_DEVICES="0" python TTS/bin/compute_attention_masks.py
        --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
        --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
        --dataset_metafile metadata.csv
        --data_path /root/LJSpeech-1.1/
        --batch_size 32
        --dataset ljspeech
        --use_cuda True
""",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
    parser.add_argument(
        "--config_path",
        type=str,
        required=True,
        help="Path to Tacotron/Tacotron2 config file.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default="",
        required=True,
        help="Target dataset formatter name from TTS.tts.datasets.formatters.",
    )

    parser.add_argument(
        "--dataset_metafile",
        type=str,
        default="",
        required=True,
        help="Dataset metafile including file paths with transcripts.",
    )
    parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
    parser.add_argument("--use_cuda", type=_str2bool, default=False, help="enable/disable cuda.")

    parser.add_argument(
        "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
    )
    args = parser.parse_args()

    C = load_config(args.config_path)
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if "characters" in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    # load the model
    # NOTE(review): num_chars is not read anywhere below in this script.
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    # TODO: handle multi-speaker
    model = setup_model(C)
    model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)

    # data loader
    preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
    preprocessor = getattr(preprocessor, args.dataset)
    meta_data = preprocessor(args.data_path, args.dataset_metafile)
    dataset = TTSDataset(
        model.decoder.r,
        C.text_cleaner,
        compute_linear_spec=False,
        ap=ap,
        meta_data=meta_data,
        characters=C.characters if "characters" in C.keys() else None,
        add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
        use_phonemes=C.use_phonemes,
        phoneme_cache_path=C.phoneme_cache_path,
        phoneme_language=C.phoneme_language,
        enable_eos_bos=C.enable_eos_bos_chars,
    )

    dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
    loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=4,
        collate_fn=dataset.collate_fn,
        shuffle=False,
        drop_last=False,
    )

    # compute attentions
    file_paths = []
    with torch.no_grad():
        for data in tqdm(loader):
            # setup input data (only the batch fields this script actually uses)
            text_input = data[0]
            text_lengths = data[1]
            mel_input = data[4]
            mel_lengths = data[5]
            item_idxs = data[7]

            # dispatch data to GPU
            if args.use_cuda:
                text_input = text_input.cuda()
                text_lengths = text_lengths.cuda()
                mel_input = mel_input.cuda()
                mel_lengths = mel_lengths.cuda()

            model_outputs = model.forward(text_input, text_lengths, mel_input)

            alignments = model_outputs["alignments"].detach()
            for idx, alignment in enumerate(alignments):
                item_idx = item_idxs[idx]
                # interpolate if r > 1
                alignment = (
                    torch.nn.functional.interpolate(
                        alignment.transpose(0, 1).unsqueeze(0),
                        size=None,
                        scale_factor=model.decoder.r,
                        mode="nearest",
                        align_corners=None,
                        recompute_scale_factor=None,
                    )
                    .squeeze(0)
                    .transpose(0, 1)
                )
                # remove paddings
                alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
                # set file paths; build the path from the directory part so that a directory
                # whose name contains the wav file name is never rewritten.
                wav_file_name = os.path.basename(item_idx)
                align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
                file_path = os.path.join(os.path.dirname(item_idx), align_file_name)
                # save output
                wav_file_abs_path = os.path.abspath(item_idx)
                file_abs_path = os.path.abspath(file_path)
                file_paths.append([wav_file_abs_path, file_abs_path])
                np.save(file_path, alignment)

        # output metafile
        metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")

        with open(metafile, "w", encoding="utf-8") as f:
            for p in file_paths:
                f.write(f"{p[0]}|{p[1]}\n")
        print(f" >> Metafile created: {metafile}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/bin/compute_embeddings.py
DELETED
@@ -1,197 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
import os
|
3 |
-
from argparse import RawTextHelpFormatter
|
4 |
-
|
5 |
-
import torch
|
6 |
-
from tqdm import tqdm
|
7 |
-
|
8 |
-
from TTS.config import load_config
|
9 |
-
from TTS.config.shared_configs import BaseDatasetConfig
|
10 |
-
from TTS.tts.datasets import load_tts_samples
|
11 |
-
from TTS.tts.utils.managers import save_file
|
12 |
-
from TTS.tts.utils.speakers import SpeakerManager
|
13 |
-
|
14 |
-
|
15 |
-
def compute_embeddings(
    model_path,
    config_path,
    output_path,
    old_speakers_file=None,
    old_append=False,
    config_dataset_path=None,
    formatter_name=None,
    dataset_name=None,
    dataset_path=None,
    meta_file_train=None,
    meta_file_val=None,
    disable_cuda=False,
    no_eval=False,
):
    """Compute one encoder embedding per audio clip of a dataset and save the mapping to disk.

    The dataset comes either from the config at `config_dataset_path` or, when that is None, from a
    `BaseDatasetConfig` filled with `formatter_name`, `dataset_name`, `dataset_path` and the
    optional meta files. Embeddings already present in `old_speakers_file` are reused instead of
    recomputed; with `old_append` the old mapping itself is extended.

    Args:
        model_path: Encoder checkpoint handed to `SpeakerManager`.
        config_path: Encoder config handed to `SpeakerManager`.
        output_path: Output file, or an existing directory in which "speakers.pth" is written.
        old_speakers_file: Optional existing embedding file to reuse embeddings from.
        old_append: Start from the embeddings of `old_speakers_file` instead of an empty mapping.
        config_dataset_path: Optional dataset config file.
        formatter_name: Formatter name, used when no dataset config is given.
        dataset_name: Dataset name, used when no dataset config is given.
        dataset_path: Dataset path, used when no dataset config is given.
        meta_file_train: Optional train meta file override.
        meta_file_val: Optional eval meta file override.
        disable_cuda: Do not use CUDA even when it is available.
        no_eval: Load the samples without an eval split.
    """
    use_cuda = torch.cuda.is_available() and not disable_cuda
    want_eval_split = not no_eval

    if config_dataset_path is None:
        # No dataset config: describe the dataset from the individual arguments.
        c_dataset = BaseDatasetConfig()
        c_dataset.formatter = formatter_name
        c_dataset.dataset_name = dataset_name
        c_dataset.path = dataset_path
        if meta_file_train is not None:
            c_dataset.meta_file_train = meta_file_train
        if meta_file_val is not None:
            c_dataset.meta_file_val = meta_file_val
        meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=want_eval_split)
    else:
        c_dataset = load_config(config_dataset_path)
        meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=want_eval_split)

    samples = meta_data_train if meta_data_eval is None else meta_data_train + meta_data_eval

    encoder_manager = SpeakerManager(
        encoder_model_path=model_path,
        encoder_config_path=config_path,
        d_vectors_file_path=old_speakers_file,
        use_cuda=use_cuda,
    )

    class_name_key = encoder_manager.encoder_config.class_name_key
    has_old_file = old_speakers_file is not None

    # compute speaker embeddings
    speaker_mapping = encoder_manager.embeddings if has_old_file and old_append else {}

    for sample in tqdm(samples):
        class_name = sample[class_name_key]
        audio_file = sample["audio_file"]
        embedding_key = sample["audio_unique_name"]

        # Only update the speaker name when the embedding is already in the mapping.
        if embedding_key in speaker_mapping:
            speaker_mapping[embedding_key]["name"] = class_name
            continue

        if has_old_file and embedding_key in encoder_manager.clip_ids:
            # reuse the embedding stored in the old file
            embedd = encoder_manager.get_embedding_by_clip(embedding_key)
        else:
            # extract the embedding from the audio
            embedd = encoder_manager.compute_embedding_from_clip(audio_file)

        speaker_mapping[embedding_key] = {"name": class_name, "embedding": embedd}

    if not speaker_mapping:
        # nothing to save
        return

    # An existing directory gets the default file name; anything else is used as the file path.
    if os.path.isdir(output_path):
        mapping_file_path = os.path.join(output_path, "speakers.pth")
    else:
        mapping_file_path = output_path

    parent_dir = os.path.dirname(mapping_file_path)
    if parent_dir != "":
        os.makedirs(parent_dir, exist_ok=True)

    save_file(speaker_mapping, mapping_file_path)
    print("Speaker embeddings saved at:", mapping_file_path)
|
100 |
-
|
101 |
-
|
102 |
-
if __name__ == "__main__":
    # Script entry point: parse the command line and forward every option to `compute_embeddings`.

    def _str2bool(value):
        """Parse an explicit true/false command-line value.

        `type=bool` must not be used with argparse: `bool("False")` is True, so
        `--disable_cuda False` would still disable CUDA.
        """
        lowered = value.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        if lowered in ("false", "f", "no", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser(
        description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
        """
        Example runs:
        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json  --config_dataset_path dataset_config.json

        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json  --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
        """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument(
        "--model_path",
        type=str,
        help="Path to model checkpoint file. It defaults to the released speaker encoder.",
        default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
    )
    parser.add_argument(
        "--config_path",
        type=str,
        help="Path to model config file. It defaults to the released speaker encoder config.",
        default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
    )
    parser.add_argument(
        "--config_dataset_path",
        type=str,
        help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
        default=None,
    )
    parser.add_argument(
        "--output_path",
        type=str,
        help="Path for output `pth` or `json` file.",
        default="speakers.pth",
    )
    parser.add_argument(
        "--old_file",
        type=str,
        help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.",
        default=None,
    )
    parser.add_argument(
        "--old_append",
        help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False",
        default=False,
        action="store_true",
    )
    # Still takes an explicit value (e.g. `--disable_cuda True`), but "False" now really means False.
    parser.add_argument("--disable_cuda", type=_str2bool, help="Flag to disable cuda.", default=False)
    parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
    parser.add_argument(
        "--formatter_name",
        type=str,
        help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--dataset_path",
        type=str,
        help="Path to the dataset. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--meta_file_train",
        type=str,
        help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--meta_file_val",
        type=str,
        help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    args = parser.parse_args()

    compute_embeddings(
        args.model_path,
        args.config_path,
        args.output_path,
        old_speakers_file=args.old_file,
        old_append=args.old_append,
        config_dataset_path=args.config_dataset_path,
        formatter_name=args.formatter_name,
        dataset_name=args.dataset_name,
        dataset_path=args.dataset_path,
        meta_file_train=args.meta_file_train,
        meta_file_val=args.meta_file_val,
        disable_cuda=args.disable_cuda,
        no_eval=args.no_eval,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/bin/compute_statistics.py
DELETED
@@ -1,96 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
# -*- coding: utf-8 -*-
|
3 |
-
|
4 |
-
import argparse
|
5 |
-
import glob
|
6 |
-
import os
|
7 |
-
|
8 |
-
import numpy as np
|
9 |
-
from tqdm import tqdm
|
10 |
-
|
11 |
-
# from TTS.utils.io import load_config
|
12 |
-
from TTS.config import load_config
|
13 |
-
from TTS.tts.datasets import load_tts_samples
|
14 |
-
from TTS.utils.audio import AudioProcessor
|
15 |
-
|
16 |
-
|
17 |
-
def main():
    """Compute mean and standard deviation of mel and linear spectrograms and save them as a stats file.

    Command line: `config_path out_path [--data_path DIR]`; unknown arguments are forwarded to the
    config as overrides. With `--data_path`, every `*.wav` found recursively below it is used;
    otherwise the train samples of the datasets declared in the config are used.

    Statistics are accumulated per row of each spectrogram (axis 0; presumably the frequency
    bins - TODO confirm) over all frames of all files. The saved dict holds `mel_mean`, `mel_std`,
    `linear_mean`, `linear_std` and the audio config (`audio_config`) used to compute them.

    Raises:
        RuntimeError: If no spectrogram frame could be accumulated (e.g. no audio files found).
    """
    parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogram features.")
    parser.add_argument("config_path", type=str, help="TTS config file path to define audio processing parameters.")
    parser.add_argument("out_path", type=str, help="save path (directory and filename).")
    parser.add_argument(
        "--data_path",
        type=str,
        required=False,
        help="folder including the target set of wavs overriding dataset config.",
    )
    args, overrides = parser.parse_known_args()

    CONFIG = load_config(args.config_path)
    CONFIG.parse_known_args(overrides, relaxed_parser=True)

    # load config
    CONFIG.audio.signal_norm = False  # do not apply earlier normalization
    CONFIG.audio.stats_path = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio.to_dict())

    # load the meta data of target dataset
    if args.data_path:
        dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
    else:
        dataset_items = load_tts_samples(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # compute features; items are plain paths (glob) or sample dicts (dataset loader)
        wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # compute stats
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel**2).sum(axis=1)
        linear_square_sum += (linear**2).sum(axis=1)

    if N == 0:
        # Without this guard the divisions below fail with an unhelpful ZeroDivisionError.
        raise RuntimeError("No spectrogram frames were found; check the dataset path or dataset config.")

    mel_mean = mel_sum / N
    linear_mean = linear_sum / N
    # E[x^2] - E[x]^2 can come out slightly negative through rounding; clamp it so that
    # sqrt never stores NaN in the stats file.
    mel_scale = np.sqrt(np.maximum(mel_square_sum / N - mel_mean**2, 0))
    linear_scale = np.sqrt(np.maximum(linear_square_sum / N - linear_mean**2, 0))

    # np.save appends ".npy" when it is missing; normalise the path first so that the stats path
    # recorded in the config and the printed path match the file that is actually written.
    output_file_path = args.out_path
    if not output_file_path.endswith(".npy"):
        output_file_path += ".npy"

    stats = {}
    stats["mel_mean"] = mel_mean
    stats["mel_std"] = mel_scale
    stats["linear_mean"] = linear_mean
    stats["linear_std"] = linear_scale

    print(f" > Avg mel spec mean: {mel_mean.mean()}")
    print(f" > Avg mel spec scale: {mel_scale.mean()}")
    print(f" > Avg linear spec mean: {linear_mean.mean()}")
    print(f" > Avg linear spec scale: {linear_scale.mean()}")

    # set default config values for mean-var scaling
    CONFIG.audio.stats_path = output_file_path
    CONFIG.audio.signal_norm = True
    # remove redundant values
    del CONFIG.audio.max_norm
    del CONFIG.audio.min_level_db
    del CONFIG.audio.symmetric_norm
    del CONFIG.audio.clip_norm
    stats["audio_config"] = CONFIG.audio.to_dict()
    np.save(output_file_path, stats, allow_pickle=True)
    print(f" > stats saved to {output_file_path}")


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/bin/eval_encoder.py
DELETED
@@ -1,88 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
from argparse import RawTextHelpFormatter
|
3 |
-
|
4 |
-
import torch
|
5 |
-
from tqdm import tqdm
|
6 |
-
|
7 |
-
from TTS.config import load_config
|
8 |
-
from TTS.tts.datasets import load_tts_samples
|
9 |
-
from TTS.tts.utils.speakers import SpeakerManager
|
10 |
-
|
11 |
-
|
12 |
-
def compute_encoder_accuracy(dataset_items, encoder_manager):
    """Print the encoder's per-class and class-averaged classification accuracy.

    For every item an embedding is computed from its audio file, classified with the softmax head
    of the encoder criterion and mapped to a class name through the config's
    `map_classid_to_classname`; the prediction is compared with the item's own class name.

    Args:
        dataset_items: Iterable of sample dicts holding "audio_file" and the class-name key named by
            `encoder_manager.encoder_config.class_name_key`.
        encoder_manager: Manager that provides `encoder_config`, `encoder_criterion`, `use_cuda`
            and `compute_embedding_from_clip`.

    Raises:
        RuntimeError: If labels cannot be predicted (no criterion or no class-id mapping), if an
            item's class name or predicted label is None, or if there is nothing to evaluate.
    """
    class_name_key = encoder_manager.encoder_config.class_name_key
    map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)

    # Without both of these every prediction would be None; fail before computing any embedding.
    if encoder_manager.encoder_criterion is None or map_classid_to_classname is None:
        raise RuntimeError(
            "Error: the encoder has no criterion or no `map_classid_to_classname`, labels cannot be predicted"
        )

    class_acc_dict = {}

    # compute embeddings for all wav_files
    for item in tqdm(dataset_items):
        class_name = item[class_name_key]
        wav_file = item["audio_file"]

        # extract the embedding and classify it
        embedd = encoder_manager.compute_embedding_from_clip(wav_file)
        embedding = torch.FloatTensor(embedd).unsqueeze(0)
        if encoder_manager.use_cuda:
            embedding = embedding.cuda()

        class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
        predicted_label = map_classid_to_classname[str(class_id)]

        if class_name is None or predicted_label is None:
            raise RuntimeError("Error: class_name or/and predicted_label are None")

        # 1 for a correct prediction, 0 otherwise, grouped by the true class
        class_acc_dict.setdefault(class_name, []).append(int(class_name == predicted_label))

    if not class_acc_dict:
        # An empty dataset used to end in a bare ZeroDivisionError below.
        raise RuntimeError("Error: no samples were evaluated, the accuracy cannot be computed")

    acc_avg = 0
    for key, values in class_acc_dict.items():
        acc = sum(values) / len(values)
        print("Class", key, "Accuracy:", acc)
        acc_avg += acc

    print("Average Accuracy:", acc_avg / len(class_acc_dict))
|
51 |
-
|
52 |
-
|
53 |
-
if __name__ == "__main__":
    # Script entry point: load the dataset samples and the encoder, then print the accuracy.

    def _str2bool(value):
        """Parse an explicit true/false command-line value.

        `type=bool` must not be used with argparse: `bool("False")` is True, so
        `--use_cuda False` / `--eval False` would have no effect.
        """
        lowered = value.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        if lowered in ("false", "f", "no", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser(
        description="""Compute the accuracy of the encoder.\n\n"""
        """
        Example runs:
        python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json  dataset_config.json
        """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
    parser.add_argument(
        "config_path",
        type=str,
        help="Path to model config file.",
    )

    parser.add_argument(
        "config_dataset_path",
        type=str,
        help="Path to dataset config file.",
    )
    parser.add_argument("--use_cuda", type=_str2bool, help="flag to set cuda.", default=True)
    parser.add_argument("--eval", type=_str2bool, help="compute eval.", default=True)

    args = parser.parse_args()

    c_dataset = load_config(args.config_dataset_path)

    meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
    # The eval part can be None (compute_embeddings.py guards the same call the same way), and
    # list + None would raise a TypeError.
    items = meta_data_train if meta_data_eval is None else meta_data_train + meta_data_eval

    enc_manager = SpeakerManager(
        encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
    )

    compute_encoder_accuracy(items, enc_manager)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/bin/extract_tts_spectrograms.py
DELETED
@@ -1,287 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
"""Extract Mel spectrograms with teacher forcing."""
|
3 |
-
|
4 |
-
import argparse
|
5 |
-
import os
|
6 |
-
|
7 |
-
import numpy as np
|
8 |
-
import torch
|
9 |
-
from torch.utils.data import DataLoader
|
10 |
-
from tqdm import tqdm
|
11 |
-
|
12 |
-
from TTS.config import load_config
|
13 |
-
from TTS.tts.datasets import TTSDataset, load_tts_samples
|
14 |
-
from TTS.tts.models import setup_model
|
15 |
-
from TTS.tts.utils.speakers import SpeakerManager
|
16 |
-
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
17 |
-
from TTS.utils.audio import AudioProcessor
|
18 |
-
from TTS.utils.audio.numpy_transforms import quantize
|
19 |
-
from TTS.utils.generic_utils import count_parameters
|
20 |
-
|
21 |
-
use_cuda = torch.cuda.is_available()
|
22 |
-
|
23 |
-
|
24 |
-
def setup_loader(ap, r, verbose=False):
    """Build a non-shuffling `DataLoader` over the module-level `meta_data` samples.

    Reads the module globals `c` (config), `meta_data` and `speaker_manager`.

    Args:
        ap: Audio processor handed to the dataset.
        r: Passed to the dataset as `outputs_per_step`.
        verbose: Passed through to the dataset.

    Returns:
        The `DataLoader` wrapping the prepared `TTSDataset`.
    """
    tokenizer, _ = TTSTokenizer.init_from_config(c)

    # Speaker information is only handed to the dataset when the config enables it.
    speaker_id_mapping = speaker_manager.name_to_id if c.use_speaker_embedding else None
    d_vector_mapping = speaker_manager.embeddings if c.use_d_vector_file else None

    dataset = TTSDataset(
        outputs_per_step=r,
        compute_linear_spec=False,
        samples=meta_data,
        tokenizer=tokenizer,
        ap=ap,
        batch_group_size=0,
        min_text_len=c.min_text_len,
        max_text_len=c.max_text_len,
        min_audio_len=c.min_audio_len,
        max_audio_len=c.max_audio_len,
        phoneme_cache_path=c.phoneme_cache_path,
        precompute_num_workers=0,
        use_noise_augment=False,
        verbose=verbose,
        speaker_id_mapping=speaker_id_mapping,
        d_vector_mapping=d_vector_mapping,
    )

    wants_phoneme_cache = c.use_phonemes and c.compute_input_seq_cache
    if wants_phoneme_cache:
        # precompute phonemes to have a better estimate of sequence lengths.
        dataset.compute_input_seq(c.num_loader_workers)
    dataset.preprocess_samples()

    return DataLoader(
        dataset,
        batch_size=c.batch_size,
        shuffle=False,
        collate_fn=dataset.collate_fn,
        drop_last=False,
        sampler=None,
        num_workers=c.num_loader_workers,
        pin_memory=False,
    )
|
61 |
-
|
62 |
-
|
63 |
-
def set_filename(wav_path, out_path):
    """Derive the output paths for one clip and create the output folders.

    The ``quant``, ``mel``, ``wav_gl`` and ``wav`` sub-directories of
    ``out_path`` are created if they do not exist yet.

    Args:
        wav_path: Path of the source audio file.
        out_path: Root directory that receives the extracted files.

    Returns:
        A tuple ``(file_name, wavq_path, mel_path, wav_gl_path, wav_path)``.
        ``wavq_path`` and ``mel_path`` carry no extension; the two wav paths
        end in ``.wav``.
    """
    # Strip only the final extension. Splitting on the first "." would turn
    # "utt.001.wav" into "utt" and make different clips overwrite each other.
    file_name = os.path.splitext(os.path.basename(wav_path))[0]

    for sub_dir in ("quant", "mel", "wav_gl", "wav"):
        os.makedirs(os.path.join(out_path, sub_dir), exist_ok=True)

    wavq_path = os.path.join(out_path, "quant", file_name)
    mel_path = os.path.join(out_path, "mel", file_name)
    wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
    wav_path = os.path.join(out_path, "wav", file_name + ".wav")
    return file_name, wavq_path, mel_path, wav_gl_path, wav_path
|
75 |
-
|
76 |
-
|
77 |
-
def format_data(data):
    """Unpack one collated batch and move it to the GPU when CUDA is in use.

    Args:
        data: Mapping with the keys ``token_id``, ``token_id_lengths``,
            ``mel``, ``mel_lengths``, ``item_idxs``, ``d_vectors``,
            ``speaker_ids`` and ``attns``.

    Returns:
        A 10-tuple ``(text_input, text_lengths, mel_input, mel_lengths,
        speaker_ids, d_vectors, avg_text_length, avg_spec_length, attn_mask,
        item_idx)``.
    """
    # setup input data
    text_input, text_lengths = data["token_id"], data["token_id_lengths"]
    mel_input, mel_lengths = data["mel"], data["mel_lengths"]
    item_idx = data["item_idxs"]
    d_vectors = data["d_vectors"]
    speaker_ids = data["speaker_ids"]
    attn_mask = data["attns"]

    # The averages are taken before any device transfer happens.
    avg_text_length = torch.mean(text_lengths.float())
    avg_spec_length = torch.mean(mel_lengths.float())

    # dispatch data to GPU
    if use_cuda:
        text_input, text_lengths, mel_input, mel_lengths = (
            tensor.cuda(non_blocking=True) for tensor in (text_input, text_lengths, mel_input, mel_lengths)
        )
        # These three entries may be absent (None) and are then passed through.
        speaker_ids, d_vectors, attn_mask = (
            None if tensor is None else tensor.cuda(non_blocking=True)
            for tensor in (speaker_ids, d_vectors, attn_mask)
        )

    return (
        text_input,
        text_lengths,
        mel_input,
        mel_lengths,
        speaker_ids,
        d_vectors,
        avg_text_length,
        avg_spec_length,
        attn_mask,
        item_idx,
    )
|
114 |
-
|
115 |
-
|
116 |
-
@torch.no_grad()
def inference(
    model_name,
    model,
    ap,
    text_input,
    text_lengths,
    mel_input,
    mel_lengths,
    speaker_ids=None,
    d_vectors=None,
):
    """Run a forward pass of the model and return its output as a NumPy array.

    Args:
        model_name: One of ``"glow_tts"``, ``"tacotron"`` or ``"tacotron2"``.
        model: Model object. ``inference_with_MAS`` is called for
            ``"glow_tts"``; the model itself is called otherwise.
        ap: Audio processor; only used for ``"tacotron"``, whose outputs are
            passed through ``ap.out_linear_to_mel``.
        text_input: Forwarded to the model.
        text_lengths: Forwarded to the model.
        mel_input: Forwarded to the model.
        mel_lengths: Forwarded to the model.
        speaker_ids: Optional speaker ids forwarded via ``aux_input``.
        d_vectors: Optional d-vectors forwarded via ``aux_input``.

    Returns:
        The model output converted to a NumPy array on the CPU.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    supported_models = ("glow_tts", "tacotron", "tacotron2")
    if model_name not in supported_models:
        # Fail before the forward pass; otherwise an unknown name would reach
        # the final return with no output bound (UnboundLocalError).
        raise ValueError(f"Unsupported model '{model_name}'; expected one of {supported_models}.")

    if model_name == "glow_tts":
        # Speaker ids take precedence over d-vectors as conditioning input.
        # NOTE(review): the chosen value is passed under the "d_vectors" key
        # even when it holds speaker ids - confirm this is intended.
        speaker_c = speaker_ids if speaker_ids is not None else d_vectors
        outputs = model.inference_with_MAS(
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
        )
        return outputs["model_outputs"].detach().cpu().numpy()

    aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
    outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
    postnet_outputs = outputs["model_outputs"]

    if model_name == "tacotron2":
        return postnet_outputs.detach().cpu().numpy()

    # normalize tacotron output
    postnet_outputs = postnet_outputs.data.cpu().numpy()
    mel_specs = [
        torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T) for postnet_output in postnet_outputs
    ]
    return torch.stack(mel_specs).cpu().numpy()
|
160 |
-
|
161 |
-
|
162 |
-
def extract_spectrograms(
    data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
):
    """Run the model over every batch and save one spectrogram per sample.

    Each sample's model output is stored with ``np.save`` under the ``mel``
    folder of ``output_path``. Once all batches are processed, a metadata file
    pairing each source wav path with its saved spectrogram is written.

    Args:
        data_loader: Iterable of batches accepted by ``format_data``.
        model: Model to run; it is switched to eval mode first.
        ap: Audio processor used to load and save audio.
        output_path: Root directory for all generated files.
        quantize_bits: When greater than zero, the quantized waveform is saved
            as well.
        save_audio: When true, the loaded waveform is saved as a wav file.
        debug: When true, audio inverted from the saved spectrogram is written
            too.
        metada_name: File name of the metadata file inside ``output_path``.
    """
    model.eval()
    metadata_rows = []

    for batch in tqdm(data_loader, total=len(data_loader)):
        # format data
        formatted = format_data(batch)
        text_input, text_lengths, mel_input, mel_lengths, speaker_ids, d_vectors = formatted[:6]
        item_idx = formatted[9]

        model_output = inference(
            c.model.lower(),
            model,
            ap,
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            speaker_ids,
            d_vectors,
        )

        batch_size = text_input.shape[0]
        for sample_no in range(batch_size):
            wav_file_path = item_idx[sample_no]
            wav = ap.load_wav(wav_file_path)
            _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)

            # quantize and save wav
            if quantize_bits > 0:
                np.save(wavq_path, quantize(wav, quantize_bits))

            # save TTS mel: cut to this sample's length, then transpose
            mel_length = mel_lengths[sample_no]
            mel = model_output[sample_no][:mel_length, :].T
            np.save(mel_path, mel)

            metadata_rows.append((wav_file_path, mel_path))
            if save_audio:
                ap.save_wav(wav, wav_path)

            if debug:
                print("Audio for debug saved at:", wav_gl_path)
                ap.save_wav(ap.inv_melspectrogram(mel), wav_gl_path)

    metadata_file = os.path.join(output_path, metada_name)
    with open(metadata_file, "w", encoding="utf-8") as f:
        # mel_path is stored without the ".npy" suffix, so it is added here.
        f.writelines(f"{source_wav}|{saved_mel + '.npy'}\n" for source_wav, saved_mel in metadata_rows)
|
222 |
-
|
223 |
-
|
224 |
-
def main(args):  # pylint: disable=redefined-outer-name
    """Load the data and model, then extract spectrograms for every sample.

    Reads the module-level config ``c`` and publishes ``meta_data`` and
    ``speaker_manager`` as module globals because ``setup_loader`` reads them.

    Args:
        args: Parsed command-line arguments; ``eval``, ``checkpoint_path``,
            ``output_path``, ``quantize_bits``, ``save_audio`` and ``debug``
            are used.
    """
    # pylint: disable=global-variable-undefined
    global meta_data, speaker_manager

    # Audio processor
    ap = AudioProcessor(**c.audio)

    # load data instances
    meta_data_train, meta_data_eval = load_tts_samples(
        c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )

    # use eval and training partitions
    # NOTE(review): the eval partition is presumably missing (None) when
    # --eval is false; `or []` keeps the concatenation valid - confirm against
    # load_tts_samples.
    meta_data = meta_data_train + (meta_data_eval or [])

    # init speaker manager
    if c.use_speaker_embedding:
        speaker_manager = SpeakerManager(data_items=meta_data)
    elif c.use_d_vector_file:
        speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
    else:
        speaker_manager = None

    # setup model
    model = setup_model(c)

    # restore model
    model.load_checkpoint(c, args.checkpoint_path, eval=True)

    if use_cuda:
        model.cuda()

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)
    # set r
    r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
    own_loader = setup_loader(ap, r, verbose=True)

    extract_spectrograms(
        own_loader,
        model,
        ap,
        args.output_path,
        quantize_bits=args.quantize_bits,
        save_audio=args.save_audio,
        debug=args.debug,
        metada_name="metada.txt",
    )


if __name__ == "__main__":

    def _str2bool(value):
        """Parse a command-line boolean such as "true"/"false" or "1"/"0"."""
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ("yes", "true", "t", "y", "1"):
            return True
        # The empty string stays false, as it was under the old `type=bool`.
        if lowered in ("no", "false", "f", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}.")

    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
    parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
    parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
    parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
    parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
    parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
    # `type=bool` would turn any non-empty string, including "False", into True.
    parser.add_argument("--eval", type=_str2bool, help="compute eval.", default=True)
    args = parser.parse_args()

    c = load_config(args.config_path)
    c.audio.trim_silence = False
    main(args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/bin/find_unique_chars.py
DELETED
@@ -1,45 +0,0 @@
|
|
1 |
-
"""Find all the unique characters in a dataset"""
|
2 |
-
import argparse
|
3 |
-
from argparse import RawTextHelpFormatter
|
4 |
-
|
5 |
-
from TTS.config import load_config
|
6 |
-
from TTS.tts.datasets import load_tts_samples
|
7 |
-
|
8 |
-
|
9 |
-
def main():
    """Print the unique characters found in the texts of a dataset.

    The dataset is described by the config file given with ``--config_path``;
    train and eval items are both included.
    """
    # NOTE(review): the whitespace inside the help text below was lost in this
    # copy of the file and has been reconstructed - confirm.
    usage_text = (
        """Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_chars.py --config_path config.json
    """
    )
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(description=usage_text, formatter_class=RawTextHelpFormatter)
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    args = parser.parse_args()

    c = load_config(args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )
    items = train_items + eval_items

    unique_chars = {char for item in items for char in item["text"]}
    lowercase_only = [char for char in unique_chars if char.islower()]
    forced_lowercase = {char.lower() for char in unique_chars}

    print(f" > Number of unique characters: {len(unique_chars)}")
    print(f" > Unique characters: {''.join(sorted(unique_chars))}")
    print(f" > Unique lower characters: {''.join(sorted(lowercase_only))}")
    print(f" > Unique all forced to lower characters: {''.join(sorted(forced_lowercase))}")


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/bin/find_unique_phonemes.py
DELETED
@@ -1,74 +0,0 @@
|
|
1 |
-
"""Find all the unique characters in a dataset"""
|
2 |
-
import argparse
|
3 |
-
import multiprocessing
|
4 |
-
from argparse import RawTextHelpFormatter
|
5 |
-
|
6 |
-
from tqdm.contrib.concurrent import process_map
|
7 |
-
|
8 |
-
from TTS.config import load_config
|
9 |
-
from TTS.tts.datasets import load_tts_samples
|
10 |
-
from TTS.tts.utils.text.phonemizers import Gruut
|
11 |
-
|
12 |
-
|
13 |
-
def compute_phonemes(item):
    """Return the set of symbols in the phonemized text of ``item``.

    Uses the module-level ``phonemizer``; every ``|`` is removed from its
    output before the symbols are collected.
    """
    phonemized = phonemizer.phonemize(item["text"])
    return set(phonemized.replace("|", ""))
|
17 |
-
|
18 |
-
|
19 |
-
def main():
    """Print the unique phonemes found in the texts of a dataset.

    The dataset is described by the config file given with ``--config_path``.
    All items must share a single language, which is used to build the
    phonemizer.

    Raises:
        ValueError: If the phoneme language is missing from the config or from
            any item, or if the items use more than one language.
    """
    # pylint: disable=W0601
    global c, phonemizer
    # NOTE(review): the whitespace inside the help text below was lost in this
    # copy of the file and has been reconstructed - confirm.
    usage_text = (
        """Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_phonemes.py --config_path config.json
    """
    )
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(description=usage_text, formatter_class=RawTextHelpFormatter)
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    args = parser.parse_args()

    c = load_config(args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )
    items = train_items + eval_items
    print("Num items:", len(items))

    language_list = [item["language"] for item in items]

    if not (c.phoneme_language and all(language_list)):
        raise ValueError("Phoneme language must be defined in config.")

    dataset_language = language_list[0]
    if any(language != dataset_language for language in language_list):
        raise ValueError(
            "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
        )

    phonemizer = Gruut(language=dataset_language, keep_puncs=True)

    per_item_phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)

    # Merge the per-item sets into one set of unique phonemes.
    phones = {phone for item_phones in per_item_phonemes for phone in item_phones}
    lowercase_only = [phone for phone in phones if phone.islower()]
    forced_lowercase = {phone.lower() for phone in phones}

    print(f" > Number of unique phonemes: {len(phones)}")
    print(f" > Unique phonemes: {''.join(sorted(phones))}")
    print(f" > Unique lower phonemes: {''.join(sorted(lowercase_only))}")
    print(f" > Unique all forced to lower phonemes: {''.join(sorted(forced_lowercase))}")


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/bin/remove_silence_using_vad.py
DELETED
@@ -1,124 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
import glob
|
3 |
-
import multiprocessing
|
4 |
-
import os
|
5 |
-
import pathlib
|
6 |
-
|
7 |
-
import torch
|
8 |
-
from tqdm import tqdm
|
9 |
-
|
10 |
-
from TTS.utils.vad import get_vad_model_and_utils, remove_silence
|
11 |
-
|
12 |
-
torch.set_num_threads(1)
|
13 |
-
|
14 |
-
|
15 |
-
def adjust_path_and_remove_silence(audio_path):
    """Map ``audio_path`` into the output directory and strip its silence.

    Reads the module-level ``args`` and ``model_and_utils``.

    Args:
        audio_path: Path of an audio file located under ``args.input_dir``.

    Returns:
        A tuple ``(output_path, is_speech)``. When the output file already
        exists and ``--force`` is not set, nothing is processed and the second
        element is ``False``.
    """
    input_prefix = os.path.join(args.input_dir, "")
    output_prefix = os.path.join(args.output_dir, "")
    # Swap only the leading directory. An unlimited replace would also rewrite
    # later occurrences of the same string (e.g. "data/data/x.wav").
    output_path = audio_path.replace(input_prefix, output_prefix, 1)

    # ignore if the file exists
    # NOTE(review): skipped files come back with is_speech=False, so the
    # caller lists them among the files without speech - confirm this is
    # intended.
    if os.path.exists(output_path) and not args.force:
        return output_path, False

    # create all directory structure
    pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    # remove the silence and save the audio
    output_path, is_speech = remove_silence(
        model_and_utils,
        audio_path,
        output_path,
        trim_just_beginning_and_end=args.trim_just_beginning_and_end,
        use_cuda=args.use_cuda,
    )
    return output_path, is_speech
|
32 |
-
|
33 |
-
|
34 |
-
def preprocess_audios():
    """Remove silence from every matching audio file under the input folder.

    Files are found with ``args.glob`` inside ``args.input_dir`` and processed
    either in a process pool (``args.num_processes > 1``) or one by one. The
    output paths reported as not containing speech are written to
    ``filtered_files.txt`` in ``args.output_dir``.
    """
    files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
    print("> Number of files: ", len(files))
    if not args.force:
        print("> Ignoring files that already exist in the output idrectory.")

    if args.trim_just_beginning_and_end:
        print("> Trimming just the beginning and the end with nonspeech parts.")
    else:
        print("> Trimming all nonspeech parts.")

    if not files:
        print("> No files Found !")
        return

    if args.num_processes > 1:
        with multiprocessing.Pool(processes=args.num_processes) as pool:
            results = list(
                tqdm(
                    pool.imap_unordered(adjust_path_and_remove_silence, files),
                    total=len(files),
                    desc="Processing audio files",
                )
            )
    else:
        results = [adjust_path_and_remove_silence(audio_file) for audio_file in tqdm(files)]

    filtered_files = [output_path for output_path, is_speech in results if not is_speech]

    # write files that do not have speech
    report_path = os.path.join(args.output_dir, "filtered_files.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        f.writelines(str(filtered_file) + "\n" for filtered_file in filtered_files)
|
75 |
-
|
76 |
-
|
77 |
-
if __name__ == "__main__":

    def _str2bool(value):
        """Parse a command-line boolean such as "True"/"False" or "1"/"0"."""
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ("yes", "true", "t", "y", "1"):
            return True
        # The empty string stays false, as it was under the old `type=bool`.
        if lowered in ("no", "false", "f", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}.")

    parser = argparse.ArgumentParser(
        description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
    )
    parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
    parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
    parser.add_argument(
        "-g",
        "--glob",
        type=str,
        default="**/*.wav",
        help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav",
    )
    # The three boolean options below take an explicit value ("True"/"False").
    # `type=bool` would turn any non-empty string, including "False", into
    # True, so a real parser is used instead.
    parser.add_argument(
        "-t",
        "--trim_just_beginning_and_end",
        type=_str2bool,
        default=True,
        help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
    )
    parser.add_argument(
        "-c",
        "--use_cuda",
        type=_str2bool,
        default=False,
        help="If True use cuda",
    )
    parser.add_argument(
        "--use_onnx",
        type=_str2bool,
        default=False,
        help="If True use onnx",
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=1,
        help="Number of processes to use",
    )
    args = parser.parse_args()

    # Without an explicit output folder the files are processed in place.
    if args.output_dir == "":
        args.output_dir = args.input_dir

    # load the model and utils
    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
    preprocess_audios()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/bin/resample.py
DELETED
@@ -1,90 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
import glob
|
3 |
-
import os
|
4 |
-
from argparse import RawTextHelpFormatter
|
5 |
-
from multiprocessing import Pool
|
6 |
-
from shutil import copytree
|
7 |
-
|
8 |
-
import librosa
|
9 |
-
import soundfile as sf
|
10 |
-
from tqdm import tqdm
|
11 |
-
|
12 |
-
|
13 |
-
def resample_file(func_args):
    """Resample one audio file and overwrite it in place.

    Args:
        func_args: A ``(filename, output_sr)`` pair, packed into one argument
            so the function can be mapped over a pool.
    """
    audio_path, target_sr = func_args
    samples, sample_rate = librosa.load(audio_path, sr=target_sr)
    sf.write(audio_path, samples, sample_rate)
|
17 |
-
|
18 |
-
|
19 |
-
def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
    """Resample every matching audio file below a folder.

    Args:
        input_dir: Folder that is searched recursively for audio files.
        output_sr: Target sample rate.
        output_dir: If given, ``input_dir`` is first copied there and the copy
            is resampled; otherwise the files are resampled in place.
        file_ext: Extension of the files to resample.
        n_jobs: Number of worker processes in the pool.
    """
    work_dir = input_dir
    if output_dir:
        print("Recursively copying the input folder...")
        copytree(input_dir, output_dir)
        work_dir = output_dir

    print("Resampling the audio files...")
    found_files = glob.glob(os.path.join(work_dir, f"**/*.{file_ext}"), recursive=True)
    print(f"Found {len(found_files)} files...")

    # Each job carries its own target rate because the worker takes one argument.
    jobs = [(audio_path, output_sr) for audio_path in found_files]
    with Pool(processes=n_jobs) as pool, tqdm(total=len(jobs)) as progress:
        for _ in pool.imap_unordered(resample_file, jobs):
            progress.update()

    print("Done !")
|
35 |
-
|
36 |
-
|
37 |
-
if __name__ == "__main__":
    # Command-line entry point: parse the options and resample the folder.
    # NOTE(review): the whitespace inside the description text was lost in
    # this copy of the file and has been reconstructed - confirm.
    parser = argparse.ArgumentParser(
        description="""Resample a folder recusively with librosa
                       Can be used in place or create a copy of the folder as an output.\n\n
                       Example run:
                            python TTS/bin/resample.py
                                --input_dir /root/LJSpeech-1.1/
                                --output_sr 22050
                                --output_dir /root/resampled_LJSpeech-1.1/
                                --file_ext wav
                                --n_jobs 24
                    """,
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--input_dir", type=str, required=True, help="Path of the folder containing the audio files to resample"
    )
    parser.add_argument(
        "--output_sr", type=int, default=22050, help="Samlple rate to which the audio files should be resampled"
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        help="Path of the destination folder. If not defined, the operation is done in place",
    )
    parser.add_argument("--file_ext", type=str, default="wav", help="Extension of the audio files to resample")
    parser.add_argument("--n_jobs", type=int, help="Number of threads to use, by default it uses all cores")

    args = parser.parse_args()

    resample_files(
        input_dir=args.input_dir,
        output_sr=args.output_sr,
        output_dir=args.output_dir,
        file_ext=args.file_ext,
        n_jobs=args.n_jobs,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/bin/synthesize.py
DELETED
@@ -1,494 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
# -*- coding: utf-8 -*-
|
3 |
-
|
4 |
-
import argparse
|
5 |
-
import contextlib
|
6 |
-
import sys
|
7 |
-
from argparse import RawTextHelpFormatter
|
8 |
-
|
9 |
-
# pylint: disable=redefined-outer-name, unused-argument
|
10 |
-
from pathlib import Path
|
11 |
-
|
12 |
-
description = """
|
13 |
-
Synthesize speech on command line.
|
14 |
-
|
15 |
-
You can either use your trained model or choose a model from the provided list.
|
16 |
-
|
17 |
-
If you don't specify any models, then it uses LJSpeech based English model.
|
18 |
-
|
19 |
-
#### Single Speaker Models
|
20 |
-
|
21 |
-
- List provided models:
|
22 |
-
|
23 |
-
```
|
24 |
-
$ tts --list_models
|
25 |
-
```
|
26 |
-
|
27 |
-
- Get model info (for both tts_models and vocoder_models):
|
28 |
-
|
29 |
-
- Query by type/name:
|
30 |
-
The model_info_by_name uses the name as it from the --list_models.
|
31 |
-
```
|
32 |
-
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
|
33 |
-
```
|
34 |
-
For example:
|
35 |
-
```
|
36 |
-
$ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
|
37 |
-
$ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
|
38 |
-
```
|
39 |
-
- Query by type/idx:
|
40 |
-
The model_query_idx uses the corresponding idx from --list_models.
|
41 |
-
|
42 |
-
```
|
43 |
-
$ tts --model_info_by_idx "<model_type>/<model_query_idx>"
|
44 |
-
```
|
45 |
-
|
46 |
-
For example:
|
47 |
-
|
48 |
-
```
|
49 |
-
$ tts --model_info_by_idx tts_models/3
|
50 |
-
```
|
51 |
-
|
52 |
-
- Query info for model info by full name:
|
53 |
-
```
|
54 |
-
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
|
55 |
-
```
|
56 |
-
|
57 |
-
- Run TTS with default models:
|
58 |
-
|
59 |
-
```
|
60 |
-
$ tts --text "Text for TTS" --out_path output/path/speech.wav
|
61 |
-
```
|
62 |
-
|
63 |
-
- Run TTS and pipe out the generated TTS wav file data:
|
64 |
-
|
65 |
-
```
|
66 |
-
$ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
|
67 |
-
```
|
68 |
-
|
69 |
-
- Run a TTS model with its default vocoder model:
|
70 |
-
|
71 |
-
```
|
72 |
-
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
|
73 |
-
```
|
74 |
-
|
75 |
-
For example:
|
76 |
-
|
77 |
-
```
|
78 |
-
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
|
79 |
-
```
|
80 |
-
|
81 |
-
- Run with specific TTS and vocoder models from the list:
|
82 |
-
|
83 |
-
```
|
84 |
-
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
|
85 |
-
```
|
86 |
-
|
87 |
-
For example:
|
88 |
-
|
89 |
-
```
|
90 |
-
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
|
91 |
-
```
|
92 |
-
|
93 |
-
- Run your own TTS model (Using Griffin-Lim Vocoder):
|
94 |
-
|
95 |
-
```
|
96 |
-
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
|
97 |
-
```
|
98 |
-
|
99 |
-
- Run your own TTS and Vocoder models:
|
100 |
-
|
101 |
-
```
|
102 |
-
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
|
103 |
-
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
|
104 |
-
```
|
105 |
-
|
106 |
-
#### Multi-speaker Models
|
107 |
-
|
108 |
-
- List the available speakers and choose a <speaker_id> among them:
|
109 |
-
|
110 |
-
```
|
111 |
-
$ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
|
112 |
-
```
|
113 |
-
|
114 |
-
- Run the multi-speaker TTS model with the target speaker ID:
|
115 |
-
|
116 |
-
```
|
117 |
-
$ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
|
118 |
-
```
|
119 |
-
|
120 |
-
- Run your own multi-speaker TTS model:
|
121 |
-
|
122 |
-
```
|
123 |
-
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
|
124 |
-
```
|
125 |
-
|
126 |
-
### Voice Conversion Models
|
127 |
-
|
128 |
-
```
|
129 |
-
$ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
|
130 |
-
```
|
131 |
-
"""
|
132 |
-
|
133 |
-
|
134 |
-
def str2bool(v):
    """Convert a command-line value to a boolean.

    Args:
        v: A ``bool`` (returned unchanged) or a string, compared without
            regard to case.

    Returns:
        ``True`` for "yes", "true", "t", "y", "1"; ``False`` for "no",
        "false", "f", "n", "0".

    Raises:
        argparse.ArgumentTypeError: If the string is none of the above.
    """
    if isinstance(v, bool):
        return v

    accepted = (
        (("yes", "true", "t", "y", "1"), True),
        (("no", "false", "f", "n", "0"), False),
    )
    for words, result in accepted:
        if v.lower() in words:
            return result
    raise argparse.ArgumentTypeError("Boolean value expected.")
|
142 |
-
|
143 |
-
|
144 |
-
def main():
|
145 |
-
parser = argparse.ArgumentParser(
|
146 |
-
description=description.replace(" ```\n", ""),
|
147 |
-
formatter_class=RawTextHelpFormatter,
|
148 |
-
)
|
149 |
-
|
150 |
-
parser.add_argument(
|
151 |
-
"--list_models",
|
152 |
-
type=str2bool,
|
153 |
-
nargs="?",
|
154 |
-
const=True,
|
155 |
-
default=False,
|
156 |
-
help="list available pre-trained TTS and vocoder models.",
|
157 |
-
)
|
158 |
-
|
159 |
-
parser.add_argument(
|
160 |
-
"--model_info_by_idx",
|
161 |
-
type=str,
|
162 |
-
default=None,
|
163 |
-
help="model info using query format: <model_type>/<model_query_idx>",
|
164 |
-
)
|
165 |
-
|
166 |
-
parser.add_argument(
|
167 |
-
"--model_info_by_name",
|
168 |
-
type=str,
|
169 |
-
default=None,
|
170 |
-
help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
|
171 |
-
)
|
172 |
-
|
173 |
-
parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")
|
174 |
-
|
175 |
-
# Args for running pre-trained TTS models.
|
176 |
-
parser.add_argument(
|
177 |
-
"--model_name",
|
178 |
-
type=str,
|
179 |
-
default="tts_models/en/ljspeech/tacotron2-DDC",
|
180 |
-
help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
|
181 |
-
)
|
182 |
-
parser.add_argument(
|
183 |
-
"--vocoder_name",
|
184 |
-
type=str,
|
185 |
-
default=None,
|
186 |
-
help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
|
187 |
-
)
|
188 |
-
|
189 |
-
# Args for running custom models
|
190 |
-
parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
|
191 |
-
parser.add_argument(
|
192 |
-
"--model_path",
|
193 |
-
type=str,
|
194 |
-
default=None,
|
195 |
-
help="Path to model file.",
|
196 |
-
)
|
197 |
-
parser.add_argument(
|
198 |
-
"--out_path",
|
199 |
-
type=str,
|
200 |
-
default="tts_output.wav",
|
201 |
-
help="Output wav file path.",
|
202 |
-
)
|
203 |
-
parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
|
204 |
-
parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
|
205 |
-
parser.add_argument(
|
206 |
-
"--vocoder_path",
|
207 |
-
type=str,
|
208 |
-
help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
|
209 |
-
default=None,
|
210 |
-
)
|
211 |
-
parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
|
212 |
-
parser.add_argument(
|
213 |
-
"--encoder_path",
|
214 |
-
type=str,
|
215 |
-
help="Path to speaker encoder model file.",
|
216 |
-
default=None,
|
217 |
-
)
|
218 |
-
parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
|
219 |
-
parser.add_argument(
|
220 |
-
"--pipe_out",
|
221 |
-
help="stdout the generated TTS wav file for shell pipe.",
|
222 |
-
type=str2bool,
|
223 |
-
nargs="?",
|
224 |
-
const=True,
|
225 |
-
default=False,
|
226 |
-
)
|
227 |
-
|
228 |
-
# args for multi-speaker synthesis
|
229 |
-
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
|
230 |
-
parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
|
231 |
-
parser.add_argument(
|
232 |
-
"--speaker_idx",
|
233 |
-
type=str,
|
234 |
-
help="Target speaker ID for a multi-speaker TTS model.",
|
235 |
-
default=None,
|
236 |
-
)
|
237 |
-
parser.add_argument(
|
238 |
-
"--language_idx",
|
239 |
-
type=str,
|
240 |
-
help="Target language ID for a multi-lingual TTS model.",
|
241 |
-
default=None,
|
242 |
-
)
|
243 |
-
parser.add_argument(
|
244 |
-
"--speaker_wav",
|
245 |
-
nargs="+",
|
246 |
-
help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
|
247 |
-
default=None,
|
248 |
-
)
|
249 |
-
parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
|
250 |
-
parser.add_argument(
|
251 |
-
"--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
|
252 |
-
)
|
253 |
-
parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
|
254 |
-
parser.add_argument(
|
255 |
-
"--list_speaker_idxs",
|
256 |
-
help="List available speaker ids for the defined multi-speaker model.",
|
257 |
-
type=str2bool,
|
258 |
-
nargs="?",
|
259 |
-
const=True,
|
260 |
-
default=False,
|
261 |
-
)
|
262 |
-
parser.add_argument(
|
263 |
-
"--list_language_idxs",
|
264 |
-
help="List available language ids for the defined multi-lingual model.",
|
265 |
-
type=str2bool,
|
266 |
-
nargs="?",
|
267 |
-
const=True,
|
268 |
-
default=False,
|
269 |
-
)
|
270 |
-
# aux args
|
271 |
-
parser.add_argument(
|
272 |
-
"--save_spectogram",
|
273 |
-
type=bool,
|
274 |
-
help="If true save raw spectogram for further (vocoder) processing in out_path.",
|
275 |
-
default=False,
|
276 |
-
)
|
277 |
-
parser.add_argument(
|
278 |
-
"--reference_wav",
|
279 |
-
type=str,
|
280 |
-
help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
|
281 |
-
default=None,
|
282 |
-
)
|
283 |
-
parser.add_argument(
|
284 |
-
"--reference_speaker_idx",
|
285 |
-
type=str,
|
286 |
-
help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
|
287 |
-
default=None,
|
288 |
-
)
|
289 |
-
parser.add_argument(
|
290 |
-
"--progress_bar",
|
291 |
-
type=str2bool,
|
292 |
-
help="If true shows a progress bar for the model download. Defaults to True",
|
293 |
-
default=True,
|
294 |
-
)
|
295 |
-
|
296 |
-
# voice conversion args
|
297 |
-
parser.add_argument(
|
298 |
-
"--source_wav",
|
299 |
-
type=str,
|
300 |
-
default=None,
|
301 |
-
help="Original audio file to convert in the voice of the target_wav",
|
302 |
-
)
|
303 |
-
parser.add_argument(
|
304 |
-
"--target_wav",
|
305 |
-
type=str,
|
306 |
-
default=None,
|
307 |
-
help="Target audio file to convert in the voice of the source_wav",
|
308 |
-
)
|
309 |
-
|
310 |
-
parser.add_argument(
|
311 |
-
"--voice_dir",
|
312 |
-
type=str,
|
313 |
-
default=None,
|
314 |
-
help="Voice dir for tortoise model",
|
315 |
-
)
|
316 |
-
|
317 |
-
args = parser.parse_args()
|
318 |
-
|
319 |
-
# print the description if either text or list_models is not set
|
320 |
-
check_args = [
|
321 |
-
args.text,
|
322 |
-
args.list_models,
|
323 |
-
args.list_speaker_idxs,
|
324 |
-
args.list_language_idxs,
|
325 |
-
args.reference_wav,
|
326 |
-
args.model_info_by_idx,
|
327 |
-
args.model_info_by_name,
|
328 |
-
args.source_wav,
|
329 |
-
args.target_wav,
|
330 |
-
]
|
331 |
-
if not any(check_args):
|
332 |
-
parser.parse_args(["-h"])
|
333 |
-
|
334 |
-
pipe_out = sys.stdout if args.pipe_out else None
|
335 |
-
|
336 |
-
with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
|
337 |
-
# Late-import to make things load faster
|
338 |
-
from TTS.api import TTS
|
339 |
-
from TTS.utils.manage import ModelManager
|
340 |
-
from TTS.utils.synthesizer import Synthesizer
|
341 |
-
|
342 |
-
# load model manager
|
343 |
-
path = Path(__file__).parent / "../.models.json"
|
344 |
-
manager = ModelManager(path, progress_bar=args.progress_bar)
|
345 |
-
api = TTS()
|
346 |
-
|
347 |
-
tts_path = None
|
348 |
-
tts_config_path = None
|
349 |
-
speakers_file_path = None
|
350 |
-
language_ids_file_path = None
|
351 |
-
vocoder_path = None
|
352 |
-
vocoder_config_path = None
|
353 |
-
encoder_path = None
|
354 |
-
encoder_config_path = None
|
355 |
-
vc_path = None
|
356 |
-
vc_config_path = None
|
357 |
-
model_dir = None
|
358 |
-
|
359 |
-
# CASE1 #list : list pre-trained TTS models
|
360 |
-
if args.list_models:
|
361 |
-
manager.list_models()
|
362 |
-
sys.exit()
|
363 |
-
|
364 |
-
# CASE2 #info : model info for pre-trained TTS models
|
365 |
-
if args.model_info_by_idx:
|
366 |
-
model_query = args.model_info_by_idx
|
367 |
-
manager.model_info_by_idx(model_query)
|
368 |
-
sys.exit()
|
369 |
-
|
370 |
-
if args.model_info_by_name:
|
371 |
-
model_query_full_name = args.model_info_by_name
|
372 |
-
manager.model_info_by_full_name(model_query_full_name)
|
373 |
-
sys.exit()
|
374 |
-
|
375 |
-
# CASE3: load pre-trained model paths
|
376 |
-
if args.model_name is not None and not args.model_path:
|
377 |
-
model_path, config_path, model_item = manager.download_model(args.model_name)
|
378 |
-
# tts model
|
379 |
-
if model_item["model_type"] == "tts_models":
|
380 |
-
tts_path = model_path
|
381 |
-
tts_config_path = config_path
|
382 |
-
if "default_vocoder" in model_item:
|
383 |
-
args.vocoder_name = (
|
384 |
-
model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
|
385 |
-
)
|
386 |
-
|
387 |
-
# voice conversion model
|
388 |
-
if model_item["model_type"] == "voice_conversion_models":
|
389 |
-
vc_path = model_path
|
390 |
-
vc_config_path = config_path
|
391 |
-
|
392 |
-
# tts model with multiple files to be loaded from the directory path
|
393 |
-
if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
|
394 |
-
model_dir = model_path
|
395 |
-
tts_path = None
|
396 |
-
tts_config_path = None
|
397 |
-
args.vocoder_name = None
|
398 |
-
|
399 |
-
# load vocoder
|
400 |
-
if args.vocoder_name is not None and not args.vocoder_path:
|
401 |
-
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
|
402 |
-
|
403 |
-
# CASE4: set custom model paths
|
404 |
-
if args.model_path is not None:
|
405 |
-
tts_path = args.model_path
|
406 |
-
tts_config_path = args.config_path
|
407 |
-
speakers_file_path = args.speakers_file_path
|
408 |
-
language_ids_file_path = args.language_ids_file_path
|
409 |
-
|
410 |
-
if args.vocoder_path is not None:
|
411 |
-
vocoder_path = args.vocoder_path
|
412 |
-
vocoder_config_path = args.vocoder_config_path
|
413 |
-
|
414 |
-
if args.encoder_path is not None:
|
415 |
-
encoder_path = args.encoder_path
|
416 |
-
encoder_config_path = args.encoder_config_path
|
417 |
-
|
418 |
-
device = args.device
|
419 |
-
if args.use_cuda:
|
420 |
-
device = "cuda"
|
421 |
-
|
422 |
-
# load models
|
423 |
-
synthesizer = Synthesizer(
|
424 |
-
tts_path,
|
425 |
-
tts_config_path,
|
426 |
-
speakers_file_path,
|
427 |
-
language_ids_file_path,
|
428 |
-
vocoder_path,
|
429 |
-
vocoder_config_path,
|
430 |
-
encoder_path,
|
431 |
-
encoder_config_path,
|
432 |
-
vc_path,
|
433 |
-
vc_config_path,
|
434 |
-
model_dir,
|
435 |
-
args.voice_dir,
|
436 |
-
).to(device)
|
437 |
-
|
438 |
-
# query speaker ids of a multi-speaker model.
|
439 |
-
if args.list_speaker_idxs:
|
440 |
-
print(
|
441 |
-
" > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
|
442 |
-
)
|
443 |
-
print(synthesizer.tts_model.speaker_manager.name_to_id)
|
444 |
-
return
|
445 |
-
|
446 |
-
# query langauge ids of a multi-lingual model.
|
447 |
-
if args.list_language_idxs:
|
448 |
-
print(
|
449 |
-
" > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
|
450 |
-
)
|
451 |
-
print(synthesizer.tts_model.language_manager.name_to_id)
|
452 |
-
return
|
453 |
-
|
454 |
-
# check the arguments against a multi-speaker model.
|
455 |
-
if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
|
456 |
-
print(
|
457 |
-
" [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
|
458 |
-
"select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
|
459 |
-
)
|
460 |
-
return
|
461 |
-
|
462 |
-
# RUN THE SYNTHESIS
|
463 |
-
if args.text:
|
464 |
-
print(" > Text: {}".format(args.text))
|
465 |
-
|
466 |
-
# kick it
|
467 |
-
if tts_path is not None:
|
468 |
-
wav = synthesizer.tts(
|
469 |
-
args.text,
|
470 |
-
speaker_name=args.speaker_idx,
|
471 |
-
language_name=args.language_idx,
|
472 |
-
speaker_wav=args.speaker_wav,
|
473 |
-
reference_wav=args.reference_wav,
|
474 |
-
style_wav=args.capacitron_style_wav,
|
475 |
-
style_text=args.capacitron_style_text,
|
476 |
-
reference_speaker_name=args.reference_speaker_idx,
|
477 |
-
)
|
478 |
-
elif vc_path is not None:
|
479 |
-
wav = synthesizer.voice_conversion(
|
480 |
-
source_wav=args.source_wav,
|
481 |
-
target_wav=args.target_wav,
|
482 |
-
)
|
483 |
-
elif model_dir is not None:
|
484 |
-
wav = synthesizer.tts(
|
485 |
-
args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
|
486 |
-
)
|
487 |
-
|
488 |
-
# save the results
|
489 |
-
print(" > Saving output to {}".format(args.out_path))
|
490 |
-
synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
|
491 |
-
|
492 |
-
|
493 |
-
if __name__ == "__main__":
|
494 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/bin/train_encoder.py
DELETED
@@ -1,332 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
# -*- coding: utf-8 -*-
|
3 |
-
|
4 |
-
import os
|
5 |
-
import sys
|
6 |
-
import time
|
7 |
-
import traceback
|
8 |
-
|
9 |
-
import torch
|
10 |
-
from torch.utils.data import DataLoader
|
11 |
-
from trainer.io import copy_model_files, save_best_model, save_checkpoint
|
12 |
-
from trainer.torch import NoamLR
|
13 |
-
from trainer.trainer_utils import get_optimizer
|
14 |
-
|
15 |
-
from TTS.encoder.dataset import EncoderDataset
|
16 |
-
from TTS.encoder.utils.generic_utils import setup_encoder_model
|
17 |
-
from TTS.encoder.utils.training import init_training
|
18 |
-
from TTS.encoder.utils.visual import plot_embeddings
|
19 |
-
from TTS.tts.datasets import load_tts_samples
|
20 |
-
from TTS.utils.audio import AudioProcessor
|
21 |
-
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
|
22 |
-
from TTS.utils.samplers import PerfectBatchSampler
|
23 |
-
from TTS.utils.training import check_update
|
24 |
-
|
25 |
-
torch.backends.cudnn.enabled = True
|
26 |
-
torch.backends.cudnn.benchmark = True
|
27 |
-
torch.manual_seed(54321)
|
28 |
-
use_cuda = torch.cuda.is_available()
|
29 |
-
num_gpus = torch.cuda.device_count()
|
30 |
-
print(" > Using CUDA: ", use_cuda)
|
31 |
-
print(" > Number of GPUs: ", num_gpus)
|
32 |
-
|
33 |
-
|
34 |
-
def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
|
35 |
-
num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
|
36 |
-
num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch
|
37 |
-
|
38 |
-
dataset = EncoderDataset(
|
39 |
-
c,
|
40 |
-
ap,
|
41 |
-
meta_data_eval if is_val else meta_data_train,
|
42 |
-
voice_len=c.voice_len,
|
43 |
-
num_utter_per_class=num_utter_per_class,
|
44 |
-
num_classes_in_batch=num_classes_in_batch,
|
45 |
-
verbose=verbose,
|
46 |
-
augmentation_config=c.audio_augmentation if not is_val else None,
|
47 |
-
use_torch_spec=c.model_params.get("use_torch_spec", False),
|
48 |
-
)
|
49 |
-
# get classes list
|
50 |
-
classes = dataset.get_class_list()
|
51 |
-
|
52 |
-
sampler = PerfectBatchSampler(
|
53 |
-
dataset.items,
|
54 |
-
classes,
|
55 |
-
batch_size=num_classes_in_batch * num_utter_per_class, # total batch size
|
56 |
-
num_classes_in_batch=num_classes_in_batch,
|
57 |
-
num_gpus=1,
|
58 |
-
shuffle=not is_val,
|
59 |
-
drop_last=True,
|
60 |
-
)
|
61 |
-
|
62 |
-
if len(classes) < num_classes_in_batch:
|
63 |
-
if is_val:
|
64 |
-
raise RuntimeError(
|
65 |
-
f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
|
66 |
-
)
|
67 |
-
raise RuntimeError(
|
68 |
-
f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
|
69 |
-
)
|
70 |
-
|
71 |
-
# set the classes to avoid get wrong class_id when the number of training and eval classes are not equal
|
72 |
-
if is_val:
|
73 |
-
dataset.set_classes(train_classes)
|
74 |
-
|
75 |
-
loader = DataLoader(
|
76 |
-
dataset,
|
77 |
-
num_workers=c.num_loader_workers,
|
78 |
-
batch_sampler=sampler,
|
79 |
-
collate_fn=dataset.collate_fn,
|
80 |
-
)
|
81 |
-
|
82 |
-
return loader, classes, dataset.get_map_classid_to_classname()
|
83 |
-
|
84 |
-
|
85 |
-
def evaluation(model, criterion, data_loader, global_step):
|
86 |
-
eval_loss = 0
|
87 |
-
for _, data in enumerate(data_loader):
|
88 |
-
with torch.no_grad():
|
89 |
-
# setup input data
|
90 |
-
inputs, labels = data
|
91 |
-
|
92 |
-
# agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
|
93 |
-
labels = torch.transpose(
|
94 |
-
labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
|
95 |
-
).reshape(labels.shape)
|
96 |
-
inputs = torch.transpose(
|
97 |
-
inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
|
98 |
-
).reshape(inputs.shape)
|
99 |
-
|
100 |
-
# dispatch data to GPU
|
101 |
-
if use_cuda:
|
102 |
-
inputs = inputs.cuda(non_blocking=True)
|
103 |
-
labels = labels.cuda(non_blocking=True)
|
104 |
-
|
105 |
-
# forward pass model
|
106 |
-
outputs = model(inputs)
|
107 |
-
|
108 |
-
# loss computation
|
109 |
-
loss = criterion(
|
110 |
-
outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
|
111 |
-
)
|
112 |
-
|
113 |
-
eval_loss += loss.item()
|
114 |
-
|
115 |
-
eval_avg_loss = eval_loss / len(data_loader)
|
116 |
-
# save stats
|
117 |
-
dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
|
118 |
-
# plot the last batch in the evaluation
|
119 |
-
figures = {
|
120 |
-
"UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
|
121 |
-
}
|
122 |
-
dashboard_logger.eval_figures(global_step, figures)
|
123 |
-
return eval_avg_loss
|
124 |
-
|
125 |
-
|
126 |
-
def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
|
127 |
-
model.train()
|
128 |
-
best_loss = {"train_loss": None, "eval_loss": float("inf")}
|
129 |
-
avg_loader_time = 0
|
130 |
-
end_time = time.time()
|
131 |
-
for epoch in range(c.epochs):
|
132 |
-
tot_loss = 0
|
133 |
-
epoch_time = 0
|
134 |
-
for _, data in enumerate(data_loader):
|
135 |
-
start_time = time.time()
|
136 |
-
|
137 |
-
# setup input data
|
138 |
-
inputs, labels = data
|
139 |
-
# agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
|
140 |
-
labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
|
141 |
-
labels.shape
|
142 |
-
)
|
143 |
-
inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
|
144 |
-
inputs.shape
|
145 |
-
)
|
146 |
-
# ToDo: move it to a unit test
|
147 |
-
# labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
|
148 |
-
# inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
|
149 |
-
# idx = 0
|
150 |
-
# for j in range(0, c.num_classes_in_batch, 1):
|
151 |
-
# for i in range(j, len(labels), c.num_classes_in_batch):
|
152 |
-
# if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
|
153 |
-
# print("Invalid")
|
154 |
-
# print(labels)
|
155 |
-
# exit()
|
156 |
-
# idx += 1
|
157 |
-
# labels = labels_converted
|
158 |
-
# inputs = inputs_converted
|
159 |
-
|
160 |
-
loader_time = time.time() - end_time
|
161 |
-
global_step += 1
|
162 |
-
|
163 |
-
# setup lr
|
164 |
-
if c.lr_decay:
|
165 |
-
scheduler.step()
|
166 |
-
optimizer.zero_grad()
|
167 |
-
|
168 |
-
# dispatch data to GPU
|
169 |
-
if use_cuda:
|
170 |
-
inputs = inputs.cuda(non_blocking=True)
|
171 |
-
labels = labels.cuda(non_blocking=True)
|
172 |
-
|
173 |
-
# forward pass model
|
174 |
-
outputs = model(inputs)
|
175 |
-
|
176 |
-
# loss computation
|
177 |
-
loss = criterion(
|
178 |
-
outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
|
179 |
-
)
|
180 |
-
loss.backward()
|
181 |
-
grad_norm, _ = check_update(model, c.grad_clip)
|
182 |
-
optimizer.step()
|
183 |
-
|
184 |
-
step_time = time.time() - start_time
|
185 |
-
epoch_time += step_time
|
186 |
-
|
187 |
-
# acumulate the total epoch loss
|
188 |
-
tot_loss += loss.item()
|
189 |
-
|
190 |
-
# Averaged Loader Time
|
191 |
-
num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
|
192 |
-
avg_loader_time = (
|
193 |
-
1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
|
194 |
-
if avg_loader_time != 0
|
195 |
-
else loader_time
|
196 |
-
)
|
197 |
-
current_lr = optimizer.param_groups[0]["lr"]
|
198 |
-
|
199 |
-
if global_step % c.steps_plot_stats == 0:
|
200 |
-
# Plot Training Epoch Stats
|
201 |
-
train_stats = {
|
202 |
-
"loss": loss.item(),
|
203 |
-
"lr": current_lr,
|
204 |
-
"grad_norm": grad_norm,
|
205 |
-
"step_time": step_time,
|
206 |
-
"avg_loader_time": avg_loader_time,
|
207 |
-
}
|
208 |
-
dashboard_logger.train_epoch_stats(global_step, train_stats)
|
209 |
-
figures = {
|
210 |
-
"UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
|
211 |
-
}
|
212 |
-
dashboard_logger.train_figures(global_step, figures)
|
213 |
-
|
214 |
-
if global_step % c.print_step == 0:
|
215 |
-
print(
|
216 |
-
" | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
|
217 |
-
"StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
|
218 |
-
global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr
|
219 |
-
),
|
220 |
-
flush=True,
|
221 |
-
)
|
222 |
-
|
223 |
-
if global_step % c.save_step == 0:
|
224 |
-
# save model
|
225 |
-
save_checkpoint(
|
226 |
-
c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict()
|
227 |
-
)
|
228 |
-
|
229 |
-
end_time = time.time()
|
230 |
-
|
231 |
-
print("")
|
232 |
-
print(
|
233 |
-
">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
|
234 |
-
"EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
|
235 |
-
epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
|
236 |
-
),
|
237 |
-
flush=True,
|
238 |
-
)
|
239 |
-
# evaluation
|
240 |
-
if c.run_eval:
|
241 |
-
model.eval()
|
242 |
-
eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
|
243 |
-
print("\n\n")
|
244 |
-
print("--> EVAL PERFORMANCE")
|
245 |
-
print(
|
246 |
-
" | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
|
247 |
-
flush=True,
|
248 |
-
)
|
249 |
-
# save the best checkpoint
|
250 |
-
best_loss = save_best_model(
|
251 |
-
{"train_loss": None, "eval_loss": eval_loss},
|
252 |
-
best_loss,
|
253 |
-
c,
|
254 |
-
model,
|
255 |
-
optimizer,
|
256 |
-
None,
|
257 |
-
global_step,
|
258 |
-
epoch,
|
259 |
-
OUT_PATH,
|
260 |
-
criterion=criterion.state_dict(),
|
261 |
-
)
|
262 |
-
model.train()
|
263 |
-
|
264 |
-
return best_loss, global_step
|
265 |
-
|
266 |
-
|
267 |
-
def main(args): # pylint: disable=redefined-outer-name
|
268 |
-
# pylint: disable=global-variable-undefined
|
269 |
-
global meta_data_train
|
270 |
-
global meta_data_eval
|
271 |
-
global train_classes
|
272 |
-
|
273 |
-
ap = AudioProcessor(**c.audio)
|
274 |
-
model = setup_encoder_model(c)
|
275 |
-
|
276 |
-
optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)
|
277 |
-
|
278 |
-
# pylint: disable=redefined-outer-name
|
279 |
-
meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)
|
280 |
-
|
281 |
-
train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
|
282 |
-
if c.run_eval:
|
283 |
-
eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
|
284 |
-
else:
|
285 |
-
eval_data_loader = None
|
286 |
-
|
287 |
-
num_classes = len(train_classes)
|
288 |
-
criterion = model.get_criterion(c, num_classes)
|
289 |
-
|
290 |
-
if c.loss == "softmaxproto" and c.model != "speaker_encoder":
|
291 |
-
c.map_classid_to_classname = map_classid_to_classname
|
292 |
-
copy_model_files(c, OUT_PATH, new_fields={})
|
293 |
-
|
294 |
-
if args.restore_path:
|
295 |
-
criterion, args.restore_step = model.load_checkpoint(
|
296 |
-
c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
|
297 |
-
)
|
298 |
-
print(" > Model restored from step %d" % args.restore_step, flush=True)
|
299 |
-
else:
|
300 |
-
args.restore_step = 0
|
301 |
-
|
302 |
-
if c.lr_decay:
|
303 |
-
scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
|
304 |
-
else:
|
305 |
-
scheduler = None
|
306 |
-
|
307 |
-
num_params = count_parameters(model)
|
308 |
-
print("\n > Model has {} parameters".format(num_params), flush=True)
|
309 |
-
|
310 |
-
if use_cuda:
|
311 |
-
model = model.cuda()
|
312 |
-
criterion.cuda()
|
313 |
-
|
314 |
-
global_step = args.restore_step
|
315 |
-
_, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)
|
316 |
-
|
317 |
-
|
318 |
-
if __name__ == "__main__":
|
319 |
-
args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()
|
320 |
-
|
321 |
-
try:
|
322 |
-
main(args)
|
323 |
-
except KeyboardInterrupt:
|
324 |
-
remove_experiment_folder(OUT_PATH)
|
325 |
-
try:
|
326 |
-
sys.exit(0)
|
327 |
-
except SystemExit:
|
328 |
-
os._exit(0) # pylint: disable=protected-access
|
329 |
-
except Exception: # pylint: disable=broad-except
|
330 |
-
remove_experiment_folder(OUT_PATH)
|
331 |
-
traceback.print_exc()
|
332 |
-
sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/bin/train_tts.py
DELETED
@@ -1,71 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
from dataclasses import dataclass, field
|
3 |
-
|
4 |
-
from trainer import Trainer, TrainerArgs
|
5 |
-
|
6 |
-
from TTS.config import load_config, register_config
|
7 |
-
from TTS.tts.datasets import load_tts_samples
|
8 |
-
from TTS.tts.models import setup_model
|
9 |
-
|
10 |
-
|
11 |
-
@dataclass
|
12 |
-
class TrainTTSArgs(TrainerArgs):
|
13 |
-
config_path: str = field(default=None, metadata={"help": "Path to the config file."})
|
14 |
-
|
15 |
-
|
16 |
-
def main():
|
17 |
-
"""Run `tts` model training directly by a `config.json` file."""
|
18 |
-
# init trainer args
|
19 |
-
train_args = TrainTTSArgs()
|
20 |
-
parser = train_args.init_argparse(arg_prefix="")
|
21 |
-
|
22 |
-
# override trainer args from comman-line args
|
23 |
-
args, config_overrides = parser.parse_known_args()
|
24 |
-
train_args.parse_args(args)
|
25 |
-
|
26 |
-
# load config.json and register
|
27 |
-
if args.config_path or args.continue_path:
|
28 |
-
if args.config_path:
|
29 |
-
# init from a file
|
30 |
-
config = load_config(args.config_path)
|
31 |
-
if len(config_overrides) > 0:
|
32 |
-
config.parse_known_args(config_overrides, relaxed_parser=True)
|
33 |
-
elif args.continue_path:
|
34 |
-
# continue from a prev experiment
|
35 |
-
config = load_config(os.path.join(args.continue_path, "config.json"))
|
36 |
-
if len(config_overrides) > 0:
|
37 |
-
config.parse_known_args(config_overrides, relaxed_parser=True)
|
38 |
-
else:
|
39 |
-
# init from console args
|
40 |
-
from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
|
41 |
-
|
42 |
-
config_base = BaseTrainingConfig()
|
43 |
-
config_base.parse_known_args(config_overrides)
|
44 |
-
config = register_config(config_base.model)()
|
45 |
-
|
46 |
-
# load training samples
|
47 |
-
train_samples, eval_samples = load_tts_samples(
|
48 |
-
config.datasets,
|
49 |
-
eval_split=True,
|
50 |
-
eval_split_max_size=config.eval_split_max_size,
|
51 |
-
eval_split_size=config.eval_split_size,
|
52 |
-
)
|
53 |
-
|
54 |
-
# init the model from config
|
55 |
-
model = setup_model(config, train_samples + eval_samples)
|
56 |
-
|
57 |
-
# init the trainer and 🚀
|
58 |
-
trainer = Trainer(
|
59 |
-
train_args,
|
60 |
-
model.config,
|
61 |
-
config.output_path,
|
62 |
-
model=model,
|
63 |
-
train_samples=train_samples,
|
64 |
-
eval_samples=eval_samples,
|
65 |
-
parse_command_line_args=False,
|
66 |
-
)
|
67 |
-
trainer.fit()
|
68 |
-
|
69 |
-
|
70 |
-
if __name__ == "__main__":
|
71 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/bin/train_vocoder.py
DELETED
@@ -1,77 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
from dataclasses import dataclass, field
|
3 |
-
|
4 |
-
from trainer import Trainer, TrainerArgs
|
5 |
-
|
6 |
-
from TTS.config import load_config, register_config
|
7 |
-
from TTS.utils.audio import AudioProcessor
|
8 |
-
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
|
9 |
-
from TTS.vocoder.models import setup_model
|
10 |
-
|
11 |
-
|
12 |
-
@dataclass
class TrainVocoderArgs(TrainerArgs):
    """Trainer arguments extended with the location of the config file."""

    # Path of the config file to load; stays None when not given.
    config_path: str = field(metadata={"help": "Path to the config file."}, default=None)
|
15 |
-
|
16 |
-
|
17 |
-
def main():
    """Run vocoder model training from a config file or from console arguments.

    The config comes from ``--config_path`` when given, otherwise from
    ``config.json`` inside ``--continue_path``. When neither is given, a
    default-constructed config of the class registered for the model named in
    the console overrides is used.
    """
    # init trainer args
    train_args = TrainVocoderArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args; arguments the trainer
    # parser does not know are kept aside as config overrides
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        # an explicit config file takes precedence over the config of a
        # previous experiment
        config_file = args.config_path or os.path.join(args.continue_path, "config.json")
        config = load_config(config_file)
        if len(config_overrides) > 0:
            config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # init from console args
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        # NOTE(review): the overrides are parsed into `config_base` only; the
        # config built below is default-constructed — confirm this is intended.
        config = register_config(config_base.model)()

    # load training samples (both loaders are unpacked as eval first, train second)
    if "feature_path" in config and config.feature_path:
        # load pre-computed features
        print(f" > Loading features from: {config.feature_path}")
        eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
    else:
        # load data raw wav files
        eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # init the model from config
    model = setup_model(config)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()
|
74 |
-
|
75 |
-
|
76 |
-
if __name__ == "__main__":
|
77 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/bin/tune_wavegrad.py
DELETED
@@ -1,103 +0,0 @@
|
|
1 |
-
"""Search a good noise schedule for WaveGrad for a given number of inference iterations"""
|
2 |
-
import argparse
|
3 |
-
from itertools import product as cartesian_product
|
4 |
-
|
5 |
-
import numpy as np
|
6 |
-
import torch
|
7 |
-
from torch.utils.data import DataLoader
|
8 |
-
from tqdm import tqdm
|
9 |
-
|
10 |
-
from TTS.config import load_config
|
11 |
-
from TTS.utils.audio import AudioProcessor
|
12 |
-
from TTS.vocoder.datasets.preprocess import load_wav_data
|
13 |
-
from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
|
14 |
-
from TTS.vocoder.models import setup_model
|
15 |
-
|
16 |
-
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
    parser.add_argument("--config_path", type=str, help="Path to model config file.")
    parser.add_argument("--data_path", type=str, help="Path to data directory.")
    parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
    parser.add_argument(
        "--num_iter",
        type=int,
        help="Number of model inference iterations that you like to optimize noise schedule for.",
    )
    parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.")
    parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.")
    parser.add_argument(
        "--search_depth",
        type=int,
        default=3,
        help="Search granularity. Increasing this increases the run-time exponentially.",
    )

    # load config
    args = parser.parse_args()
    config = load_config(args.config_path)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # load dataset; only the first `num_samples` items are used for the search
    _, train_data = load_wav_data(args.data_path, 0)
    train_data = train_data[: args.num_samples]
    dataset = WaveGradDataset(
        ap=ap,
        items=train_data,
        seq_len=-1,
        hop_len=ap.hop_length,
        pad_short=config.pad_short,
        conv_pad=config.conv_pad,
        is_training=True,
        return_segments=False,
        use_noise_augment=False,
        use_cache=False,
        verbose=True,
    )
    loader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=dataset.collate_full_clips,
        drop_last=False,
        num_workers=config.num_loader_workers,
        pin_memory=False,
    )

    # setup the model
    # NOTE(review): `--model_path` is parsed but never read in this script, so
    # no checkpoint is loaded here — confirm whether that is intended.
    model = setup_model(config)
    if args.use_cuda:
        model.cuda()

    # setup optimization parameters
    base_values = sorted(10 * np.random.uniform(size=args.search_depth))
    print(f" > base values: {base_values}")
    exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
    best_error = float("inf")
    best_schedule = None  # pylint: disable=C0103
    total_search_iter = len(base_values) ** args.num_iter
    for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
        beta = exponents * base
        model.compute_noise_level(beta)

        # Accumulate the squared error of this schedule over every clip, so a
        # schedule is judged on the whole sample set and not on whichever
        # single clip happens to be easiest.
        schedule_error = 0.0
        num_clips = 0
        for mel, _ in loader:
            y_hat = model.inference(mel.cuda() if args.use_cuda else mel)

            if args.use_cuda:
                y_hat = y_hat.cpu()
            y_hat = y_hat.numpy()

            # re-compute the mel of every generated waveform; the last frame
            # is dropped before comparing with the input mel
            mel_hat = torch.stack(
                [torch.from_numpy(ap.melspectrogram(y_hat[i, 0])[:, :-1]) for i in range(y_hat.shape[0])]
            )
            schedule_error += torch.sum((mel - mel_hat) ** 2).item()
            num_clips += 1

        if num_clips > 0 and schedule_error < best_error:
            best_error = schedule_error
            best_schedule = {"beta": beta}
            print(f" > Found a better schedule. - MSE: {schedule_error}")
            # persist right away so an interrupted search keeps the best result
            np.save(args.output_path, best_schedule)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/config/__init__.py
DELETED
@@ -1,135 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
import os
|
3 |
-
import re
|
4 |
-
from typing import Dict
|
5 |
-
|
6 |
-
import fsspec
|
7 |
-
import yaml
|
8 |
-
from coqpit import Coqpit
|
9 |
-
|
10 |
-
from TTS.config.shared_configs import *
|
11 |
-
from TTS.utils.generic_utils import find_module
|
12 |
-
|
13 |
-
|
14 |
-
def read_json_with_comments(json_path):
    """Load a JSON file that may contain ``//`` and ``/* ... */`` comments.

    Kept for backward compatibility with old config files.

    Args:
        json_path (str): Path of the file, opened through ``fsspec``.

    Returns:
        The object decoded by ``json.loads`` after the comments are removed.
    """
    # fallback to json
    with fsspec.open(json_path, "r", encoding="utf-8") as f:
        input_str = f.read()
    # handle comments but not urls with //
    # The alternation tries string literals first, so a "//" or "/*" inside a
    # quoted value is consumed as part of the string and kept. `[\s\S]` is used
    # for block comments because `.` does not match newlines.
    pattern = r"(\"(?:[^\"\\]|\\.)*\")|(/\*[\s\S]*?\*/)|(//.*)"
    # keep string literals (group 1); drop both kinds of comment
    input_str = re.sub(pattern, lambda m: m.group(1) or "", input_str)
    return json.loads(input_str)
|
22 |
-
|
23 |
-
def register_config(model_name: str) -> Coqpit:
    """Look up the config class that belongs to a model name.

    Args:
        model_name (str): Model name.

    Raises:
        ModuleNotFoundError: No matching config for the model name.

    Returns:
        Coqpit: config class.
    """
    wanted = model_name + "_config"
    found = None

    # TODO: fix this
    if model_name == "xtts":
        from TTS.tts.configs.xtts_config import XttsConfig

        found = XttsConfig

    # Every package is probed in turn; a later hit replaces an earlier one.
    for package in ("TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"):
        try:
            found = find_module(package, wanted)
        except ModuleNotFoundError:
            continue

    if found is not None:
        return found
    raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
|
52 |
-
|
53 |
-
|
54 |
-
def _process_model_name(config_dict: Dict) -> str:
    """Return the model name in the expected format.

    This is a band-aid for the old `vocoder` model names.

    Args:
        config_dict (Dict): A dictionary including the config fields.

    Returns:
        str: The value of ``"model"`` (or of ``"generator_model"`` when
            ``"model"`` is absent) without ``_generator`` / ``_discriminator``.
    """
    if "model" in config_dict:
        name = config_dict["model"]
    else:
        name = config_dict["generator_model"]
    for suffix in ("_generator", "_discriminator"):
        name = name.replace(suffix, "")
    return name
|
66 |
-
|
67 |
-
|
68 |
-
def load_config(config_path: str) -> Coqpit:
    """Import `json` or `yaml` files as TTS configs.

    The file is first loaded as a `dict`; its model name selects the Config
    class, which is then instantiated and filled from the dict.

    Args:
        config_path (str): path to the config file.

    Raises:
        TypeError: given config file has an unknown type.

    Returns:
        Coqpit: TTS config object.
    """
    ext = os.path.splitext(config_path)[1]
    if ext == ".json":
        try:
            with fsspec.open(config_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except json.decoder.JSONDecodeError:
            # backwards compat.
            data = read_json_with_comments(config_path)
    elif ext in (".yml", ".yaml"):
        with fsspec.open(config_path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    else:
        raise TypeError(f" [!] Unknown config file type {ext}")

    config_dict = {}
    config_dict.update(data)
    # the lower-cased model name selects the config class
    config_class = register_config(_process_model_name(config_dict).lower())
    config = config_class()
    config.from_dict(config_dict)
    return config
|
101 |
-
|
102 |
-
|
103 |
-
def check_config_and_model_args(config, arg_name, value):
    """Tell whether an argument equals `value` in `config.model_args` or in `config`.

    `config.model_args` is consulted first when it exists and contains the
    argument; otherwise `config` itself is consulted.

    Return False if the argument does not exist in `config.model_args` or `config`.
    This is to patch up the compatibility between models with and without `model_args`.

    TODO: Remove this in the future with a unified approach.
    """
    if hasattr(config, "model_args") and arg_name in config.model_args:
        return config.model_args[arg_name] == value
    if not hasattr(config, arg_name):
        return False
    return config[arg_name] == value
|
118 |
-
|
119 |
-
|
120 |
-
def get_from_config_or_model_args(config, arg_name):
    """Return the argument from `config.model_args` when present there, else from `config`."""
    in_model_args = hasattr(config, "model_args") and arg_name in config.model_args
    if not in_model_args:
        return config[arg_name]
    return config.model_args[arg_name]
|
126 |
-
|
127 |
-
|
128 |
-
def get_from_config_or_model_args_with_default(config, arg_name, def_val):
    """Return the argument from `config.model_args` or `config`, else `def_val`.

    `config.model_args` is consulted first when it exists and contains the
    argument, then `config` itself; `def_val` is returned when neither has it.
    """
    if hasattr(config, "model_args") and arg_name in config.model_args:
        return config.model_args[arg_name]
    if not hasattr(config, arg_name):
        return def_val
    return config[arg_name]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/config/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (4.26 kB)
|
|
TTS/config/__pycache__/shared_configs.cpython-310.pyc
DELETED
Binary file (9.53 kB)
|
|
TTS/config/shared_configs.py
DELETED
@@ -1,268 +0,0 @@
|
|
1 |
-
from dataclasses import asdict, dataclass
|
2 |
-
from typing import List
|
3 |
-
|
4 |
-
from coqpit import Coqpit, check_argument
|
5 |
-
from trainer import TrainerConfig
|
6 |
-
|
7 |
-
|
8 |
-
@dataclass
class BaseAudioConfig(Coqpit):
    """Base config to define audio processing parameters. It is used to initialize
    ```TTS.utils.audio.AudioProcessor.```

    Args:
        fft_size (int):
            Number of STFT frequency levels aka. size of the linear spectrogram frame. Defaults to 1024.

        win_length (int):
            Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
            ```fft_size```. Defaults to 1024.

        hop_length (int):
            Number of audio samples between adjacent STFT columns. Defaults to 256.

        frame_shift_ms (int):
            Set ```hop_length``` based on milliseconds and sampling rate. Defaults to None.

        frame_length_ms (int):
            Set ```win_length``` based on milliseconds and sampling rate. Defaults to None.

        stft_pad_mode (str):
            Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.

        sample_rate (int):
            Audio sampling rate. Defaults to 22050.

        resample (bool):
            Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.

        preemphasis (float):
            Preemphasis coefficient. Defaults to 0.0.

        ref_level_db (int):
            Reference dB level to rebase the audio signal and ignore the level below. 20 dB is assumed the sound of
            air. Defaults to 20.

        do_sound_norm (bool):
            Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.

        log_func (str):
            Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.

        do_trim_silence (bool):
            Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.

        do_amp_to_db_linear (bool, optional):
            enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.

        do_amp_to_db_mel (bool, optional):
            enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.

        pitch_fmax (float, optional):
            Maximum frequency of the F0 frames. Defaults to ```640```.

        pitch_fmin (float, optional):
            Minimum frequency of the F0 frames. Defaults to ```1```.

        trim_db (int):
            Silence threshold used for silence trimming. Defaults to 45.

        do_rms_norm (bool, optional):
            enable/disable RMS volume normalization when loading an audio file. Defaults to False.

        db_level (float, optional):
            dB level used for rms normalization. The range is -99 to 0. Defaults to None.

        power (float):
            Exponent used for expanding spectrogram levels before running Griffin Lim. It helps to reduce the
            artifacts in the synthesized voice. Defaults to 1.5.

        griffin_lim_iters (int):
            Number of Griffin Lim iterations. Defaults to 60.

        num_mels (int):
            Number of mel-basis frames that defines the frame lengths of each mel-spectrogram frame. Defaults to 80.

        mel_fmin (float):
            Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
            It needs to be adjusted for a dataset. Defaults to 0.

        mel_fmax (float):
            Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
            Defaults to None.

        spec_gain (int):
            Gain applied when converting amplitude to DB. Defaults to 20.

        signal_norm (bool):
            enable/disable signal normalization. Defaults to True.

        min_level_db (int):
            minimum db threshold for the computed melspectrograms. Defaults to -100.

        symmetric_norm (bool):
            enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else
            [0, k], Defaults to True.

        max_norm (float):
            ```k``` defining the normalization range. Defaults to 4.0.

        clip_norm (bool):
            enable/disable clipping the out of range values in the normalized audio signal. Defaults to True.

        stats_path (str):
            Path to the computed stats file. Defaults to None.
    """

    # stft parameters
    fft_size: int = 1024
    win_length: int = 1024
    hop_length: int = 256
    frame_shift_ms: int = None
    frame_length_ms: int = None
    stft_pad_mode: str = "reflect"
    # audio processing parameters
    sample_rate: int = 22050
    resample: bool = False
    preemphasis: float = 0.0
    ref_level_db: int = 20
    do_sound_norm: bool = False
    log_func: str = "np.log10"
    # silence trimming
    do_trim_silence: bool = True
    trim_db: int = 45
    # rms volume normalization
    do_rms_norm: bool = False
    db_level: float = None
    # griffin-lim params
    power: float = 1.5
    griffin_lim_iters: int = 60
    # mel-spec params
    num_mels: int = 80
    mel_fmin: float = 0.0
    mel_fmax: float = None
    spec_gain: int = 20
    do_amp_to_db_linear: bool = True
    do_amp_to_db_mel: bool = True
    # f0 params
    pitch_fmax: float = 640.0
    pitch_fmin: float = 1.0
    # normalization params
    signal_norm: bool = True
    min_level_db: int = -100
    symmetric_norm: bool = True
    max_norm: float = 4.0
    clip_norm: bool = True
    stats_path: str = None

    def check_values(
        self,
    ):
        """Check config fields"""
        c = asdict(self)
        check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
        check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
        check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
        # frame_length_ms / frame_shift_ms are the millisecond alternatives to
        # win_length / hop_length, hence the `alternative` argument
        check_argument(
            "frame_length_ms",
            c,
            restricted=True,
            min_val=10,
            max_val=1000,
            alternative="win_length",
        )
        check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
        check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
        check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
        check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
        check_argument("power", c, restricted=True, min_val=1, max_val=5)
        check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)

        # normalization parameters
        check_argument("signal_norm", c, restricted=True)
        check_argument("symmetric_norm", c, restricted=True)
        check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
        check_argument("clip_norm", c, restricted=True)
        check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
        check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
        check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
        check_argument("do_trim_silence", c, restricted=True)
        check_argument("trim_db", c, restricted=True)
|
189 |
-
|
190 |
-
|
191 |
-
@dataclass
class BaseDatasetConfig(Coqpit):
    """Base config for TTS datasets.

    Args:
        formatter (str):
            Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.

        dataset_name (str):
            Unique name for the dataset. Defaults to `""`.

        path (str):
            Root path to the dataset files. Defaults to `""`.

        meta_file_train (str):
            Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
            Defaults to `""`.

        ignored_speakers (List):
            List of speakers IDs that are not used at the training. Default None.

        language (str):
            Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.

        phonemizer (str):
            Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`.

        meta_file_val (str):
            Name of the dataset meta file that defines the instances used at validation.

        meta_file_attn_mask (str):
            Path to the file that lists the attention mask files used with models that require attention masks to
            train the duration predictor.
    """

    formatter: str = ""
    dataset_name: str = ""
    path: str = ""
    meta_file_train: str = ""
    ignored_speakers: List[str] = None
    language: str = ""
    phonemizer: str = ""
    meta_file_val: str = ""
    meta_file_attn_mask: str = ""

    def check_values(
        self,
    ):
        """Check config fields"""
        values = asdict(self)
        # checked with restricted=True
        for field_name in ("formatter", "path", "meta_file_train"):
            check_argument(field_name, values, restricted=True)
        # checked with restricted=False
        for field_name in ("meta_file_val", "meta_file_attn_mask"):
            check_argument(field_name, values, restricted=False)
|
246 |
-
|
247 |
-
|
248 |
-
@dataclass
class BaseTrainingConfig(TrainerConfig):
    """Base config to define the basic 🐸TTS training parameters that are shared
    among all the models. It is based on ```trainer.TrainerConfig```.

    Args:
        model (str):
            Name of the model that is used in the training. Defaults to None.

        num_loader_workers (int):
            Number of workers for training time dataloader. Defaults to 0.

        num_eval_loader_workers (int):
            Number of workers for evaluation time dataloader. Defaults to 0.

        use_noise_augment (bool):
            Presumably enables noise augmentation of the training data; the flag is only declared here and is
            consumed elsewhere. Defaults to False.
    """

    model: str = None
    # dataloading
    num_loader_workers: int = 0
    num_eval_loader_workers: int = 0
    use_noise_augment: bool = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/demos/xtts_ft_demo/requirements.txt
DELETED
@@ -1,2 +0,0 @@
|
|
1 |
-
faster_whisper==0.9.0
|
2 |
-
gradio==4.7.1
|
|
|
|
|
|
TTS/demos/xtts_ft_demo/utils/formatter.py
DELETED
@@ -1,160 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import gc
|
3 |
-
import torchaudio
|
4 |
-
import pandas
|
5 |
-
from faster_whisper import WhisperModel
|
6 |
-
from glob import glob
|
7 |
-
|
8 |
-
from tqdm import tqdm
|
9 |
-
|
10 |
-
import torch
|
11 |
-
import torchaudio
|
12 |
-
# torch.set_num_threads(1)
|
13 |
-
|
14 |
-
from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners
|
15 |
-
|
16 |
-
torch.set_num_threads(16)
|
17 |
-
|
18 |
-
|
19 |
-
import os
|
20 |
-
|
21 |
-
audio_types = (".wav", ".mp3", ".flac")
|
22 |
-
|
23 |
-
|
24 |
-
def list_audios(basePath, contains=None):
    """Iterate over the files under `basePath` whose extension is in `audio_types`.

    `contains`, when given, keeps only file names that include that substring.
    """
    # delegate to the generic walker, restricted to the audio extensions
    return list_files(basePath, contains=contains, validExts=audio_types)
|
27 |
-
|
28 |
-
def list_files(basePath, validExts=None, contains=None):
    """Yield the paths of the files found under `basePath`.

    Args:
        basePath: Directory that is walked recursively.
        validExts: Extension, or iterable of extensions, to keep (compared
            against the lower-cased file extension). None keeps every file.
        contains: When not None, only file names that include this substring
            are kept.

    Yields:
        str: `os.path.join` of the directory and the file name.
    """
    # `str.endswith` only takes a str or a tuple, so normalise once up front
    if validExts is None:
        extensions = None
    elif isinstance(validExts, str):
        extensions = (validExts,)
    else:
        extensions = tuple(validExts)

    # loop over the directory structure
    for rootDir, _dirNames, filenames in os.walk(basePath):
        # loop over the filenames in the current directory
        for filename in filenames:
            # if the contains string is not none and the filename does not contain
            # the supplied string, then ignore the file
            if contains is not None and contains not in filename:
                continue

            # determine the file extension of the current file; a name without
            # a dot has no extension (rfind returns -1 there, which must not be
            # used as a slice start)
            dot_index = filename.rfind(".")
            ext = filename[dot_index:].lower() if dot_index != -1 else ""

            # check to see if the file has a wanted extension and should be processed
            if extensions is None or ext.endswith(extensions):
                # construct the path to the file and yield it
                yield os.path.join(rootDir, filename)
|
46 |
-
|
47 |
-
def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):
    """Cut audio files into sentence clips and write train/eval metadata files.

    Each file is transcribed with word timestamps; words are joined until one
    ends with ``!``, ``.`` or ``?``, and the matching audio span is saved under
    ``<out_path>/wavs``. Clips shorter than a third of a second are dropped.

    Args:
        audio_files: Iterable of audio file paths.
        target_language: Language passed to the transcriber and the text cleaner.
        out_path: Output directory; created when missing.
        buffer: Seconds of padding allowed before and after a sentence.
        eval_percentage: Fraction of the shuffled clips written to the eval file.
        speaker_name: Value stored in the ``speaker_name`` column.
        gradio_progress: Optional object whose ``tqdm`` method wraps the file loop.

    Returns:
        tuple: ``(train_metadata_path, eval_metadata_path, audio_total_size)``
        where the last item is the summed duration of the inputs in seconds.
    """
    audio_total_size = 0
    # make sure that the output directory exists
    os.makedirs(out_path, exist_ok=True)

    # Loading Whisper
    device = "cuda" if torch.cuda.is_available() else "cpu"

    print("Loading Whisper Model!")
    # NOTE(review): float16 is requested even when device is "cpu" — confirm
    # the backend accepts that combination.
    asr_model = WhisperModel("large-v2", device=device, compute_type="float16")

    metadata = {"audio_file": [], "text": [], "speaker_name": []}

    if gradio_progress is not None:
        tqdm_object = gradio_progress.tqdm(audio_files, desc="Formatting...")
    else:
        tqdm_object = tqdm(audio_files)

    for audio_path in tqdm_object:
        wav, sr = torchaudio.load(audio_path)
        # stereo to mono if needed
        if wav.size(0) != 1:
            wav = torch.mean(wav, dim=0, keepdim=True)

        wav = wav.squeeze()
        audio_total_size += (wav.size(-1) / sr)

        segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
        segments = list(segments)
        i = 0
        sentence = ""
        sentence_start = None
        first_word = True
        # gather the words of all segments in a single list
        words_list = []
        for segment in segments:
            words_list.extend(segment.words)

        # process each word
        for word_idx, word in enumerate(words_list):
            if first_word:
                sentence_start = word.start
                # If it is the first sentence, add buffer or get the beginning of the file
                if word_idx == 0:
                    sentence_start = max(sentence_start - buffer, 0)  # Add buffer to the sentence start
                else:
                    # get previous sentence end
                    previous_word_end = words_list[word_idx - 1].end
                    # add buffer or get the middle of the silence between the previous sentence and the current one
                    sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start) / 2)

                sentence = word.word
                first_word = False
            else:
                sentence += word.word

            # `endswith` is also safe for an empty token, unlike `word.word[-1]`
            if word.word.endswith(("!", ".", "?")):
                # drop the single leading space of the first token, but never a
                # real character when the token has no leading space
                if sentence.startswith(" "):
                    sentence = sentence[1:]
                # Expand number and abbreviations plus normalization
                sentence = multilingual_cleaners(sentence, target_language)
                audio_file_name, _ = os.path.splitext(os.path.basename(audio_path))

                audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"

                # Check for the next word's existence
                if word_idx + 1 < len(words_list):
                    next_word_start = words_list[word_idx + 1].start
                else:
                    # No more words: this is the last sentence, so use the audio length as next word start
                    next_word_start = (wav.shape[0] - 1) / sr

                # Average the current word end and next word start
                word_end = min((word.end + next_word_start) / 2, word.end + buffer)

                absolute_path = os.path.join(out_path, audio_file)
                os.makedirs(os.path.dirname(absolute_path), exist_ok=True)
                # the clip index advances even when the clip is skipped below
                i += 1
                first_word = True

                audio = wav[int(sr * sentence_start):int(sr * word_end)].unsqueeze(0)
                # if the audio is too short ignore it (i.e < 0.33 seconds)
                if audio.size(-1) < sr / 3:
                    continue
                torchaudio.save(absolute_path, audio, sr)

                metadata["audio_file"].append(audio_file)
                metadata["text"].append(sentence)
                metadata["speaker_name"].append(speaker_name)

    df = pandas.DataFrame(metadata)
    # shuffle before splitting so the eval set is a random sample
    df = df.sample(frac=1)
    num_val_samples = int(len(df) * eval_percentage)

    df_eval = df[:num_val_samples]
    df_train = df[num_val_samples:]

    df_train = df_train.sort_values('audio_file')
    train_metadata_path = os.path.join(out_path, "metadata_train.csv")
    df_train.to_csv(train_metadata_path, sep="|", index=False)

    eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
    df_eval = df_eval.sort_values('audio_file')
    df_eval.to_csv(eval_metadata_path, sep="|", index=False)

    # deallocate VRAM and RAM
    del asr_model, df_train, df_eval, df, metadata
    gc.collect()

    return train_metadata_path, eval_metadata_path, audio_total_size
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/demos/xtts_ft_demo/utils/gpt_train.py
DELETED
@@ -1,172 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import gc
|
3 |
-
|
4 |
-
from trainer import Trainer, TrainerArgs
|
5 |
-
|
6 |
-
from TTS.config.shared_configs import BaseDatasetConfig
|
7 |
-
from TTS.tts.datasets import load_tts_samples
|
8 |
-
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
|
9 |
-
from TTS.utils.manage import ModelManager
|
10 |
-
|
11 |
-
|
12 |
-
def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path, max_audio_length=255995):
    """Fine-tune the XTTS v2 GPT module on a coqui-formatted dataset.

    The base XTTS v2 files (DVAE checkpoint, mel statistics, tokenizer, model checkpoint and
    config) are downloaded into ``<output_path>/run/training/XTTS_v2.0_original_model_files/``
    when they are not already present, then a ``Trainer`` is built and run.

    Args:
        language: Language code forwarded to the dataset config.
        num_epochs: Number of training epochs.
        batch_size: Batch size used for both training and evaluation.
        grad_acumm: Number of gradient accumulation steps.
        train_csv: Path to the training metadata file; its directory is used as the dataset path.
        eval_csv: Path to the evaluation metadata file.
        output_path: Base directory; checkpoints go under ``<output_path>/run/training``.
        max_audio_length: Maximum waveform length in samples (the default is ~11.6 seconds
            at the 22050 Hz sample rate configured below).

    Returns:
        A tuple ``(xtts_config_file, xtts_checkpoint, tokenizer_file, trainer_out_path, speaker_ref)``
        where the first three are the paths of the downloaded base-model files,
        ``trainer_out_path`` is the trainer's output directory and ``speaker_ref`` is the audio
        file of the training sample whose text has the most space-separated words.
    """
    # Logging parameters
    RUN_NAME = "GPT_XTTS_FT"
    PROJECT_NAME = "XTTS_trainer"
    DASHBOARD_LOGGER = "tensorboard"
    LOGGER_URI = None

    # Path where the checkpoints will be saved. Default: ./run/training/
    OUT_PATH = os.path.join(output_path, "run", "training")

    # Training parameters
    OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-gpu training please make it False
    START_WITH_EVAL = False  # if True it will start with evaluation
    BATCH_SIZE = batch_size  # set here the batch size
    GRAD_ACUMM_STEPS = grad_acumm  # set here the grad accumulation steps

    # Dataset used for the fine-tuning.
    config_dataset = BaseDatasetConfig(
        formatter="coqui",
        dataset_name="ft_dataset",
        path=os.path.dirname(train_csv),
        meta_file_train=train_csv,
        meta_file_val=eval_csv,
        language=language,
    )

    # Add here the configs of the datasets
    DATASETS_CONFIG_LIST = [config_dataset]

    # Path where the XTTS v2.0.1 files will be downloaded
    CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
    os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)

    # DVAE files
    DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
    MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"

    # Local paths of the downloaded DVAE files
    DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
    MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))

    # download DVAE files if needed
    if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
        print(" > Downloading DVAE files!")
        ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)

    # XTTS v2.0 checkpoint files
    TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
    XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
    XTTS_CONFIG_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json"

    # XTTS transfer learning parameters: paths of the XTTS model files used as the fine-tuning starting point.
    TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
    XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file
    XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK))  # config.json file

    # download XTTS v2.0 files if needed; config.json is checked as well because its path is
    # returned to the caller and must exist even when the other two files are already present
    xtts_files = (TOKENIZER_FILE, XTTS_CHECKPOINT, XTTS_CONFIG_FILE)
    if not all(os.path.isfile(path) for path in xtts_files):
        print(" > Downloading XTTS v2.0 files!")
        ModelManager._download_model_files(
            [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK, XTTS_CONFIG_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
        )

    # init args and config
    model_args = GPTArgs(
        max_conditioning_length=132300,  # 6 secs
        min_conditioning_length=66150,  # 3 secs
        debug_loading_failures=False,
        max_wav_length=max_audio_length,  # ~11.6 seconds
        max_text_length=200,
        mel_norm_file=MEL_NORM_FILE,
        dvae_checkpoint=DVAE_CHECKPOINT,
        xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
        tokenizer_file=TOKENIZER_FILE,
        gpt_num_audio_tokens=1026,
        gpt_start_audio_token=1024,
        gpt_stop_audio_token=1025,
        gpt_use_masking_gt_prompt_approach=True,
        gpt_use_perceiver_resampler=True,
    )
    # define audio config
    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
    # training parameters config
    config = GPTTrainerConfig(
        epochs=num_epochs,
        output_path=OUT_PATH,
        model_args=model_args,
        run_name=RUN_NAME,
        project_name=PROJECT_NAME,
        run_description="""
            GPT XTTS training
            """,
        dashboard_logger=DASHBOARD_LOGGER,
        logger_uri=LOGGER_URI,
        audio=audio_config,
        batch_size=BATCH_SIZE,
        batch_group_size=48,
        eval_batch_size=BATCH_SIZE,
        num_loader_workers=8,
        eval_split_max_size=256,
        print_step=50,
        plot_step=100,
        log_model_step=100,
        save_step=1000,
        save_n_checkpoints=1,
        save_checkpoints=True,
        # target_loss="loss",
        print_eval=False,
        # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
        optimizer="AdamW",
        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
        lr=5e-06,  # learning rate
        lr_scheduler="MultiStepLR",
        # it was adjusted accordingly for the new step scheme
        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
        test_sentences=[],
    )

    # init the model from config
    model = GPTTrainer.init_from_config(config)

    # load training samples
    train_samples, eval_samples = load_tts_samples(
        DATASETS_CONFIG_LIST,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # init the trainer and run the training
    trainer = Trainer(
        TrainerArgs(
            restore_path=None,  # the xtts checkpoint is restored via the xtts_checkpoint key, so restore_path is not needed
            skip_train_epoch=False,
            start_with_eval=START_WITH_EVAL,
            grad_accum_steps=GRAD_ACUMM_STEPS,
        ),
        config,
        output_path=OUT_PATH,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
    )
    trainer.fit()

    # use the audio file with the longest text (in words) as speaker reference
    samples_len = [len(item["text"].split(" ")) for item in train_samples]
    longest_text_idx = samples_len.index(max(samples_len))
    speaker_ref = train_samples[longest_text_idx]["audio_file"]

    trainer_out_path = trainer.output_path

    # deallocate VRAM and RAM
    del model, trainer, train_samples, eval_samples
    gc.collect()

    return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/demos/xtts_ft_demo/xtts_demo.py
DELETED
@@ -1,415 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
import os
|
3 |
-
import sys
|
4 |
-
import tempfile
|
5 |
-
|
6 |
-
import gradio as gr
|
7 |
-
import librosa.display
|
8 |
-
import numpy as np
|
9 |
-
|
10 |
-
import os
|
11 |
-
import torch
|
12 |
-
import torchaudio
|
13 |
-
import traceback
|
14 |
-
from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
|
15 |
-
from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
|
16 |
-
|
17 |
-
from TTS.tts.configs.xtts_config import XttsConfig
|
18 |
-
from TTS.tts.models.xtts import Xtts
|
19 |
-
|
20 |
-
|
21 |
-
def clear_gpu_cache():
    """Empty PyTorch's CUDA cache; do nothing when CUDA is not available."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
|
25 |
-
|
26 |
-
# Module-wide handle to the XTTS model; stays None until load_model() assigns it.
XTTS_MODEL = None


def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
    """Build an XTTS model from a config file and load its checkpoint into ``XTTS_MODEL``.

    Args:
        xtts_checkpoint: Path of the model checkpoint to load.
        xtts_config: Path of the JSON config file.
        xtts_vocab: Path of the vocabulary file.

    Returns:
        A status message: a hint to fill in the paths when any of them is empty,
        otherwise ``"Model Loaded!"``.
    """
    global XTTS_MODEL
    clear_gpu_cache()

    required_paths = (xtts_checkpoint, xtts_config, xtts_vocab)
    if not all(required_paths):
        return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"

    config = XttsConfig()
    config.load_json(xtts_config)
    XTTS_MODEL = Xtts.init_from_config(config)
    print("Loading XTTS model! ")
    XTTS_MODEL.load_checkpoint(
        config,
        checkpoint_path=xtts_checkpoint,
        vocab_path=xtts_vocab,
        use_deepspeed=False,
    )
    # Move the model to the GPU only when one is present.
    if torch.cuda.is_available():
        XTTS_MODEL.cuda()

    status = "Model Loaded!"
    print(status)
    return status
|
42 |
-
|
43 |
-
def run_tts(lang, tts_text, speaker_audio_file):
    """Synthesize ``tts_text`` with the loaded XTTS model and write it to a temporary wav file.

    Args:
        lang: Language code passed to the model's ``inference`` call.
        tts_text: Text to synthesize.
        speaker_audio_file: Path of the reference audio used to compute the conditioning latents.

    Returns:
        ``(message, output_wav_path, speaker_audio_file)``; the last two are ``None`` when no
        model is loaded or no reference audio was given.
    """
    if XTTS_MODEL is None or not speaker_audio_file:
        return "You need to run the previous step to load the model !!", None, None

    model_config = XTTS_MODEL.config
    gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
        audio_path=speaker_audio_file,
        gpt_cond_len=model_config.gpt_cond_len,
        max_ref_length=model_config.max_ref_len,
        sound_norm_refs=model_config.sound_norm_refs,
    )
    # Sampling parameters are taken from the model config; add custom parameters here.
    out = XTTS_MODEL.inference(
        text=tts_text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=model_config.temperature,
        length_penalty=model_config.length_penalty,
        repetition_penalty=model_config.repetition_penalty,
        top_k=model_config.top_k,
        top_p=model_config.top_p,
    )

    # delete=False keeps the file on disk after the handle is closed so the path can be returned.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        out["wav"] = torch.tensor(out["wav"]).unsqueeze(0)
        out_path = fp.name
        torchaudio.save(out_path, out["wav"], 24000)

    return "Speech generated !", out_path, speaker_audio_file
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
# define a logger to redirect
|
71 |
-
class Logger:
    """Stream-like object that duplicates every write to the current stdout and to a log file."""

    def __init__(self, filename="log.out"):
        # Path is kept as an attribute so the file can be reopened for reading elsewhere.
        self.log_file = filename
        # Capture whatever stdout is at construction time.
        self.terminal = sys.stdout
        self.log = open(self.log_file, "w")

    def write(self, message):
        """Write ``message`` to the terminal first, then to the log file."""
        for sink in (self.terminal, self.log):
            sink.write(message)

    def flush(self):
        """Flush the terminal first, then the log file."""
        for sink in (self.terminal, self.log):
            sink.flush()

    def isatty(self):
        """Always report that this stream is not a TTY."""
        return False
|
87 |
-
|
88 |
-
# Replace stdout with a Logger (which also writes to its log file) and point stderr at the same object.
sys.stdout = Logger()
sys.stderr = sys.stdout

# Send log records through the redirected stdout as well.
import logging

logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
|
102 |
-
|
103 |
-
def read_logs():
    """Flush ``sys.stdout`` and return the full text of the file named by its ``log_file`` attribute."""
    sys.stdout.flush()
    with open(sys.stdout.log_file, "r") as log_handle:
        contents = log_handle.read()
    return contents
|
107 |
-
|
108 |
-
|
109 |
-
if __name__ == "__main__":
    # Local import: used to copy files without building a shell command from user-supplied paths.
    import shutil

    parser = argparse.ArgumentParser(
        description="""XTTS fine-tuning demo\n\n"""
        """
        Example runs:
        python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port
        """,
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--port",
        type=int,
        help="Port to run the gradio demo. Default: 5003",
        default=5003,
    )
    parser.add_argument(
        "--out_path",
        type=str,
        help="Output path (where data and checkpoints will be saved) Default: /tmp/xtts_ft/",
        default="/tmp/xtts_ft/",
    )

    parser.add_argument(
        "--num_epochs",
        type=int,
        help="Number of epochs to train. Default: 10",
        default=10,
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        help="Batch size. Default: 4",
        default=4,
    )
    parser.add_argument(
        "--grad_acumm",
        type=int,
        help="Grad accumulation steps. Default: 1",
        default=1,
    )
    parser.add_argument(
        "--max_audio_length",
        type=int,
        help="Max permitted audio size in seconds. Default: 11",
        default=11,
    )

    args = parser.parse_args()

    # Language codes offered by both the dataset and the inference dropdowns.
    language_choices = (
        "en",
        "es",
        "fr",
        "de",
        "it",
        "pt",
        "pl",
        "tr",
        "ru",
        "nl",
        "cs",
        "ar",
        "zh",
        "hu",
        "ko",
        "ja",
    )

    with gr.Blocks() as demo:
        with gr.Tab("1 - Data processing"):
            out_path = gr.Textbox(
                label="Output path (where data and checkpoints will be saved):",
                value=args.out_path,
            )
            upload_file = gr.File(
                file_count="multiple",
                label="Select here the audio files that you want to use for XTTS trainining (Supported formats: wav, mp3, and flac)",
            )
            lang = gr.Dropdown(
                label="Dataset Language",
                value="en",
                choices=list(language_choices),
            )
            progress_data = gr.Label(label="Progress:")
            logs = gr.Textbox(
                label="Logs:",
                interactive=False,
            )
            # Refresh the log view every second.
            demo.load(read_logs, None, logs, every=1)

            prompt_compute_btn = gr.Button(value="Step 1 - Create dataset")

            def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
                """Build the train/eval metadata files from the uploaded audio.

                Returns ``(status_message, train_csv_path, eval_csv_path)``; both paths are
                empty strings when processing did not succeed.
                """
                clear_gpu_cache()
                out_path = os.path.join(out_path, "dataset")
                os.makedirs(out_path, exist_ok=True)
                # Covers both "nothing uploaded yet" (None) and an empty upload list.
                if not audio_path:
                    return "You should provide one or multiple audio files! If you provided it, probably the upload of the files is not finished yet!", "", ""

                try:
                    train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
                except Exception:  # report any processing failure in the UI instead of crashing the callback
                    traceback.print_exc()
                    error = traceback.format_exc()
                    return f"The data processing was interrupted due an error !! Please check the console to verify the full error message! \n Error summary: {error}", "", ""

                clear_gpu_cache()

                # reject datasets whose total audio length is below 2 minutes
                if audio_total_size < 120:
                    message = "The sum of the duration of the audios that you provided should be at least 2 minutes!"
                    print(message)
                    return message, "", ""

                print("Dataset Processed!")
                return "Dataset Processed!", train_meta, eval_meta

        with gr.Tab("2 - Fine-tuning XTTS Encoder"):
            train_csv = gr.Textbox(
                label="Train CSV:",
            )
            eval_csv = gr.Textbox(
                label="Eval CSV:",
            )
            num_epochs = gr.Slider(
                label="Number of epochs:",
                minimum=1,
                maximum=100,
                step=1,
                value=args.num_epochs,
            )
            batch_size = gr.Slider(
                label="Batch size:",
                minimum=2,
                maximum=512,
                step=1,
                value=args.batch_size,
            )
            grad_acumm = gr.Slider(
                label="Grad accumulation steps:",
                # The CLI default is 1, so the slider range has to include it.
                minimum=1,
                maximum=128,
                step=1,
                value=args.grad_acumm,
            )
            max_audio_length = gr.Slider(
                label="Max permitted audio size in seconds:",
                minimum=2,
                maximum=20,
                step=1,
                value=args.max_audio_length,
            )
            progress_train = gr.Label(label="Progress:")
            logs_tts_train = gr.Textbox(
                label="Logs:",
                interactive=False,
            )
            demo.load(read_logs, None, logs_tts_train, every=1)
            train_btn = gr.Button(value="Step 2 - Run the training")

            def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length):
                """Run the fine-tuning and return the paths needed by the inference tab.

                Returns ``(status_message, config_path, vocab_path, checkpoint_path, speaker_wav)``;
                the four paths are empty strings when training did not run or failed.
                """
                clear_gpu_cache()
                if not train_csv or not eval_csv:
                    return "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", "", "", "", ""
                try:
                    # convert seconds to waveform frames
                    max_audio_length = int(max_audio_length * 22050)
                    config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path, max_audio_length=max_audio_length)
                except Exception:  # report any training failure in the UI instead of crashing the callback
                    traceback.print_exc()
                    error = traceback.format_exc()
                    return f"The training was interrupted due an error !! Please check the console to check the full error message! \n Error summary: {error}", "", "", "", ""

                # copy original files to avoid parameters changes issues; a failed copy is logged
                # but does not discard the training result
                for original_file in (config_path, vocab_file):
                    try:
                        shutil.copy(original_file, exp_path)
                    except OSError:
                        traceback.print_exc()

                ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
                print("Model training done!")
                clear_gpu_cache()
                return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav

        with gr.Tab("3 - Inference"):
            with gr.Row():
                with gr.Column() as col1:
                    xtts_checkpoint = gr.Textbox(
                        label="XTTS checkpoint path:",
                        value="",
                    )
                    xtts_config = gr.Textbox(
                        label="XTTS config path:",
                        value="",
                    )

                    xtts_vocab = gr.Textbox(
                        label="XTTS vocab path:",
                        value="",
                    )
                    progress_load = gr.Label(label="Progress:")
                    load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model")

                with gr.Column() as col2:
                    speaker_reference_audio = gr.Textbox(
                        label="Speaker reference audio:",
                        value="",
                    )
                    tts_language = gr.Dropdown(
                        label="Language",
                        value="en",
                        choices=list(language_choices),
                    )
                    tts_text = gr.Textbox(
                        label="Input Text.",
                        value="This model sounds really good and above all, it's reasonably fast.",
                    )
                    tts_btn = gr.Button(value="Step 4 - Inference")

                with gr.Column() as col3:
                    progress_gen = gr.Label(label="Progress:")
                    tts_output_audio = gr.Audio(label="Generated Audio.")
                    reference_audio = gr.Audio(label="Reference audio used.")

        # Wire the buttons to their callbacks; each step feeds the input fields of the next tab.
        prompt_compute_btn.click(
            fn=preprocess_dataset,
            inputs=[
                upload_file,
                lang,
                out_path,
            ],
            outputs=[
                progress_data,
                train_csv,
                eval_csv,
            ],
        )

        train_btn.click(
            fn=train_model,
            inputs=[
                lang,
                train_csv,
                eval_csv,
                num_epochs,
                batch_size,
                grad_acumm,
                out_path,
                max_audio_length,
            ],
            outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
        )

        load_btn.click(
            fn=load_model,
            inputs=[
                xtts_checkpoint,
                xtts_config,
                xtts_vocab,
            ],
            outputs=[progress_load],
        )

        tts_btn.click(
            fn=run_tts,
            inputs=[
                tts_language,
                tts_text,
                speaker_reference_audio,
            ],
            outputs=[progress_gen, tts_output_audio, reference_audio],
        )

    demo.launch(
        share=True,
        debug=False,
        server_port=args.port,
        server_name="0.0.0.0",
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/encoder/README.md
DELETED
@@ -1,18 +0,0 @@
|
|
1 |
-
### Speaker Encoder
|
2 |
-
|
3 |
-
This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
|
4 |
-
|
5 |
-
With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
|
6 |
-
|
7 |
-
Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
|
8 |
-
|
9 |
-
![](umap.png)
|
10 |
-
|
11 |
-
Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
|
12 |
-
|
13 |
-
To run the code, you need to follow the same flow as in TTS.
|
14 |
-
|
15 |
-
- Define 'config.json' for your needs. Note that the audio parameters must match those of your TTS model.
|
16 |
-
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
|
17 |
-
- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
|
18 |
-
- Watch training on Tensorboard as in TTS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/encoder/__init__.py
DELETED
File without changes
|
TTS/encoder/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (165 Bytes)
|
|
TTS/encoder/__pycache__/losses.cpython-310.pyc
DELETED
Binary file (7.81 kB)
|
|
TTS/encoder/configs/base_encoder_config.py
DELETED
@@ -1,61 +0,0 @@
|
|
1 |
-
from dataclasses import asdict, dataclass, field
|
2 |
-
from typing import Dict, List
|
3 |
-
|
4 |
-
from coqpit import MISSING
|
5 |
-
|
6 |
-
from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
|
7 |
-
|
8 |
-
|
9 |
-
@dataclass
class BaseEncoderConfig(BaseTrainingConfig):
    """Defines parameters for a Generic Encoder model."""

    model: str = None
    audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
    datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
    # model params
    model_params: Dict = field(
        default_factory=lambda: {
            "model_name": "lstm",
            "input_dim": 80,
            "proj_dim": 256,
            "lstm_dim": 768,
            "num_lstm_layers": 3,
            "use_lstm_with_projection": True,
        }
    )

    # augmentation settings; empty by default
    audio_augmentation: Dict = field(default_factory=lambda: {})

    # training params
    epochs: int = 10000
    loss: str = "angleproto"
    grad_clip: float = 3.0
    lr: float = 0.0001
    optimizer: str = "radam"
    optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
    lr_decay: bool = False
    warmup_steps: int = 4000

    # logging params
    tb_model_param_stats: bool = False
    steps_plot_stats: int = 10
    save_step: int = 1000
    print_step: int = 20
    run_eval: bool = False

    # data loader (fields set to MISSING have no default here)
    num_classes_in_batch: int = MISSING
    num_utter_per_class: int = MISSING
    eval_num_classes_in_batch: int = None
    eval_num_utter_per_class: int = None

    num_loader_workers: int = MISSING
    voice_len: float = 1.6

    def check_values(self):
        """Run the parent checks, then verify the model input size matches the mel-spectrogram size.

        Raises:
            AssertionError: If ``model_params["input_dim"]`` differs from ``audio.num_mels``.
        """
        super().check_values()
        # Read the single value needed directly rather than converting the whole config to a dict.
        assert (
            self.model_params["input_dim"] == self.audio.num_mels
        ), " [!] model input dimension must be equal to melspectrogram dimension."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/encoder/configs/emotion_encoder_config.py
DELETED
@@ -1,12 +0,0 @@
|
|
1 |
-
from dataclasses import asdict, dataclass
|
2 |
-
|
3 |
-
from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
|
4 |
-
|
5 |
-
|
6 |
-
@dataclass
class EmotionEncoderConfig(BaseEncoderConfig):
    """Defines parameters for Emotion Encoder model."""

    # Model identifier for this config.
    model: str = "emotion_encoder"
    # Class-id to class-name mapping; unset by default.
    map_classid_to_classname: dict = None
    # Presumably the dataset-item key that holds the class label - confirm against the dataset code.
    class_name_key: str = "emotion_name"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/encoder/configs/speaker_encoder_config.py
DELETED
@@ -1,11 +0,0 @@
|
|
1 |
-
from dataclasses import asdict, dataclass
|
2 |
-
|
3 |
-
from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
|
4 |
-
|
5 |
-
|
6 |
-
@dataclass
class SpeakerEncoderConfig(BaseEncoderConfig):
    """Defines parameters for Speaker Encoder model."""

    # Model identifier for this config.
    model: str = "speaker_encoder"
    # Presumably the dataset-item key that holds the class label - confirm against the dataset code.
    class_name_key: str = "speaker_name"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/encoder/dataset.py
DELETED
@@ -1,147 +0,0 @@
|
|
1 |
-
import random
|
2 |
-
|
3 |
-
import torch
|
4 |
-
from torch.utils.data import Dataset
|
5 |
-
|
6 |
-
from TTS.encoder.utils.generic_utils import AugmentWAV
|
7 |
-
|
8 |
-
|
9 |
-
class EncoderDataset(Dataset):
|
10 |
-
def __init__(
|
11 |
-
self,
|
12 |
-
config,
|
13 |
-
ap,
|
14 |
-
meta_data,
|
15 |
-
voice_len=1.6,
|
16 |
-
num_classes_in_batch=64,
|
17 |
-
num_utter_per_class=10,
|
18 |
-
verbose=False,
|
19 |
-
augmentation_config=None,
|
20 |
-
use_torch_spec=None,
|
21 |
-
):
|
22 |
-
"""
|
23 |
-
Args:
|
24 |
-
ap (TTS.tts.utils.AudioProcessor): audio processor object.
|
25 |
-
meta_data (list): list of dataset instances.
|
26 |
-
seq_len (int): voice segment length in seconds.
|
27 |
-
verbose (bool): print diagnostic information.
|
28 |
-
"""
|
29 |
-
super().__init__()
|
30 |
-
self.config = config
|
31 |
-
self.items = meta_data
|
32 |
-
self.sample_rate = ap.sample_rate
|
33 |
-
self.seq_len = int(voice_len * self.sample_rate)
|
34 |
-
self.num_utter_per_class = num_utter_per_class
|
35 |
-
self.ap = ap
|
36 |
-
self.verbose = verbose
|
37 |
-
self.use_torch_spec = use_torch_spec
|
38 |
-
self.classes, self.items = self.__parse_items()
|
39 |
-
|
40 |
-
self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
|
41 |
-
|
42 |
-
# Data Augmentation
|
43 |
-
self.augmentator = None
|
44 |
-
self.gaussian_augmentation_config = None
|
45 |
-
if augmentation_config:
|
46 |
-
self.data_augmentation_p = augmentation_config["p"]
|
47 |
-
if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
|
48 |
-
self.augmentator = AugmentWAV(ap, augmentation_config)
|
49 |
-
|
50 |
-
if "gaussian" in augmentation_config.keys():
|
51 |
-
self.gaussian_augmentation_config = augmentation_config["gaussian"]
|
52 |
-
|
53 |
-
if self.verbose:
|
54 |
-
print("\n > DataLoader initialization")
|
55 |
-
print(f" | > Classes per Batch: {num_classes_in_batch}")
|
56 |
-
print(f" | > Number of instances : {len(self.items)}")
|
57 |
-
print(f" | > Sequence length: {self.seq_len}")
|
58 |
-
print(f" | > Num Classes: {len(self.classes)}")
|
59 |
-
print(f" | > Classes: {self.classes}")
|
60 |
-
|
61 |
-
def load_wav(self, filename):
|
62 |
-
audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
|
63 |
-
return audio
|
64 |
-
|
65 |
-
def __parse_items(self):
    """Group utterances by class and drop the ones that cannot be sampled.

    The class of an item is read from ``item[self.config.class_name_key]`` in
    both passes, so the filtering and the labelling always agree.

    Returns:
        tuple: ``(classes, new_items)``. ``classes`` is the sorted list of
        class names owning at least ``self.num_utter_per_class`` utterances.
        ``new_items`` holds one ``{"wav_file_path", "class_name"}`` dict for
        every utterance of a kept class whose waveform is longer than
        ``self.seq_len`` samples.
    """
    class_key = self.config.class_name_key

    # collect the audio paths of every class
    class_to_utters = {}
    for item in self.items:
        class_to_utters.setdefault(item[class_key], []).append(item["audio_file"])

    # keep only the classes with at least `num_utter_per_class` utterances
    classes = sorted(
        name for name, utters in class_to_utters.items() if len(utters) >= self.num_utter_per_class
    )
    kept_classes = set(classes)  # constant-time membership test in the loop below

    new_items = []
    for item in self.items:
        path_ = item["audio_file"]
        # same key as the grouping pass; a hard-coded "speaker_name"/"emotion_name"
        # would disagree with it whenever a custom `class_name_key` is configured
        class_name = item[class_key]
        # ignore filtered classes
        if class_name not in kept_classes:
            continue
        # ignore audios that are not longer than one training segment
        if self.load_wav(path_).shape[0] - self.seq_len <= 0:
            continue

        new_items.append({"wav_file_path": path_, "class_name": class_name})

    return classes, new_items
|
97 |
-
|
98 |
-
def __len__(self):
    """Return how many entries ``self.items`` holds."""
    utterances = self.items
    return len(utterances)
|
100 |
-
|
101 |
-
def get_num_classes(self):
    """Return the number of entries in ``self.classes``."""
    class_names = self.classes
    return len(class_names)
|
103 |
-
|
104 |
-
def get_class_list(self):
    """Return ``self.classes`` itself (the stored object, not a copy)."""
    class_names = self.classes
    return class_names
|
106 |
-
|
107 |
-
def set_classes(self, classes):
    """Replace the class list and rebuild the class-name -> class-id lookup.

    Each id is the position of the name inside ``classes``.
    """
    self.classes = classes
    lookup = {}
    for class_id, class_name in enumerate(self.classes):
        lookup[class_name] = class_id
    self.classname_to_classid = lookup
|
110 |
-
|
111 |
-
def get_map_classid_to_classname(self):
    """Return the inverse of ``self.classname_to_classid`` (class id -> class name)."""
    return {class_id: class_name for class_name, class_id in self.classname_to_classid.items()}
|
113 |
-
|
114 |
-
def __getitem__(self, idx):
    """Return the entry of ``self.items`` stored at ``idx``.

    The entry is returned as stored; audio loading happens in ``collate_fn``.
    """
    entries = self.items
    return entries[idx]
|
116 |
-
|
117 |
-
def collate_fn(self, batch):
    """Turn a list of dataset items into a ``(features, labels)`` tensor pair.

    For every item the waveform is loaded, a random window of ``self.seq_len``
    samples is cut out, the augmentator is applied with probability
    ``self.data_augmentation_p`` (only when an augmentator exists), and the
    window is stored either raw (``self.use_torch_spec`` truthy) or as the mel
    spectrogram computed by ``self.ap.melspectrogram``.

    Returns:
        tuple: stacked ``torch.FloatTensor`` features and a ``torch.LongTensor``
        with the class id of every item.
    """
    class_ids = []
    features = []
    for sample in batch:
        wav_path = sample["wav_file_path"]
        label = self.classname_to_classid[sample["class_name"]]

        # cut a random segment of `seq_len` samples out of the utterance
        signal = self.load_wav(wav_path)
        start = random.randint(0, signal.shape[0] - self.seq_len)
        segment = signal[start : start + self.seq_len]

        # `data_augmentation_p` is only read when an augmentator was built
        if self.augmentator is not None and self.data_augmentation_p:
            if random.random() < self.data_augmentation_p:
                segment = self.augmentator.apply_one(segment)

        # raw audio when the model computes the spectrogram itself
        source = segment if self.use_torch_spec else self.ap.melspectrogram(segment)
        features.append(torch.FloatTensor(source))
        class_ids.append(label)

    return torch.stack(features), torch.LongTensor(class_ids)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/encoder/losses.py
DELETED
@@ -1,226 +0,0 @@
|
|
1 |
-
import torch
|
2 |
-
import torch.nn.functional as F
|
3 |
-
from torch import nn
|
4 |
-
|
5 |
-
|
6 |
-
# adapted from https://github.com/cvqluu/GE2E-Loss
|
7 |
-
class GE2ELoss(nn.Module):
    def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
        """
        Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
        Accepts an input of size (N, M, D)
            where N is the number of speakers in the batch,
            M is the number of utterances per speaker,
            and D is the dimensionality of the embedding vector (e.g. d-vector)
        Args:
            - init_w (float): defines the initial value of w in Equation (5) of [1]
            - init_b (float): defines the initial value of b in Equation (5) of [1]
            - loss_method (str): "softmax" or "contrast"; selects the per-embedding loss
        """
        super().__init__()
        # pylint: disable=E1102
        self.w = nn.Parameter(torch.tensor(init_w))
        # pylint: disable=E1102
        self.b = nn.Parameter(torch.tensor(init_b))
        self.loss_method = loss_method

        print(" > Initialized Generalized End-to-End loss")

        assert self.loss_method in ["softmax", "contrast"]

        if self.loss_method == "softmax":
            self.embed_loss = self.embed_loss_softmax
        if self.loss_method == "contrast":
            self.embed_loss = self.embed_loss_contrast

    # pylint: disable=R0201
    def calc_new_centroids(self, dvecs, centroids, spkr, utt):
        """
        Calculates the new centroids excluding the reference utterance

        The centroid of speaker ``spkr`` is replaced by the mean of that speaker's
        embeddings without utterance ``utt``; all other centroids are kept.
        """
        excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
        excl = torch.mean(excl, 0)
        new_centroids = []
        for i, centroid in enumerate(centroids):
            if i == spkr:
                new_centroids.append(excl)
            else:
                new_centroids.append(centroid)
        return torch.stack(new_centroids)

    def calc_cosine_sim(self, dvecs, centroids):
        """
        Make the cosine similarity matrix with dims (N,M,N)

        Every similarity is clamped from below at 1e-6.
        """
        cos_sim_matrix = []
        for spkr_idx, speaker in enumerate(dvecs):
            cs_row = []
            for utt_idx, utterance in enumerate(speaker):
                new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx)
                # vector based cosine similarity for speed
                cs_row.append(
                    torch.clamp(
                        torch.mm(
                            utterance.unsqueeze(1).transpose(0, 1),
                            new_centroids.transpose(0, 1),
                        )
                        / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
                        1e-6,
                    )
                )
            cs_row = torch.cat(cs_row, dim=0)
            cos_sim_matrix.append(cs_row)
        return torch.stack(cos_sim_matrix)

    # pylint: disable=R0201
    def embed_loss_softmax(self, dvecs, cos_sim_matrix):
        """
        Calculates the loss on each embedding $L(e_{ji})$ by taking softmax

        Returns a tensor of shape (N, M).
        """
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):
            L_row = []
            for i in range(M):
                L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
            L_row = torch.stack(L_row)
            L.append(L_row)
        return torch.stack(L)

    # pylint: disable=R0201
    def embed_loss_contrast(self, dvecs, cos_sim_matrix):
        """
        Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid

        Returns a tensor of shape (N, M).
        """
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):
            L_row = []
            for i in range(M):
                centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
                excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j + 1 :]))
                L_row.append(1.0 - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
            L_row = torch.stack(L_row)
            L.append(L_row)
        return torch.stack(L)

    def forward(self, x, _label=None):
        """
        Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)

        ``_label`` is accepted for interface parity with the other losses and ignored.
        Returns the mean of the per-embedding losses as a scalar tensor.
        """

        assert x.size()[1] >= 2

        centroids = torch.mean(x, 1)
        cos_sim_matrix = self.calc_cosine_sim(x, centroids)
        # torch.clamp is out-of-place: its result has to be used, otherwise the
        # scale is never actually kept positive.
        w = torch.clamp(self.w, min=1e-6)
        cos_sim_matrix = w * cos_sim_matrix + self.b
        L = self.embed_loss(x, cos_sim_matrix)
        return L.mean()
|
119 |
-
|
120 |
-
|
121 |
-
# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
|
122 |
-
class AngleProtoLoss(nn.Module):
    """
    Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
        Accepts an input of size (N, M, D)
            where N is the number of speakers in the batch,
            M is the number of utterances per speaker,
            and D is the dimensionality of the embedding vector
        Args:
            - init_w (float): defines the initial value of w
            - init_b (float): defines the initial value of b
    """

    def __init__(self, init_w=10.0, init_b=-5.0):
        super().__init__()
        # pylint: disable=E1102
        self.w = nn.Parameter(torch.tensor(init_w))
        # pylint: disable=E1102
        self.b = nn.Parameter(torch.tensor(init_b))
        self.criterion = torch.nn.CrossEntropyLoss()

        print(" > Initialized Angular Prototypical loss")

    def forward(self, x, _label=None):
        """
        Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)

        The first utterance of every speaker is compared against the mean of the
        speaker's remaining utterances. ``_label`` is accepted for interface
        parity with the other losses and ignored.
        """

        assert x.size()[1] >= 2

        out_anchor = torch.mean(x[:, 1:, :], 1)
        out_positive = x[:, 0, :]
        num_speakers = out_anchor.size()[0]

        # (N, N) matrix: entry [i, j] compares positive i with anchor j
        cos_sim_matrix = F.cosine_similarity(
            out_positive.unsqueeze(-1).expand(-1, -1, num_speakers),
            out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2),
        )
        # torch.clamp is out-of-place: its result has to be used, otherwise the
        # scale is never actually kept positive.
        w = torch.clamp(self.w, min=1e-6)
        cos_sim_matrix = cos_sim_matrix * w + self.b
        # the matching speaker sits on the diagonal
        label = torch.arange(num_speakers).to(cos_sim_matrix.device)
        L = self.criterion(cos_sim_matrix, label)
        return L
|
164 |
-
|
165 |
-
|
166 |
-
class SoftmaxLoss(nn.Module):
    """
    Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982
        Args:
            - embedding_dim (int): speaker embedding dim
            - n_speakers (int): number of speakers
    """

    def __init__(self, embedding_dim, n_speakers):
        super().__init__()

        self.criterion = torch.nn.CrossEntropyLoss()
        self.fc = nn.Linear(embedding_dim, n_speakers)

        print("Initialised Softmax Loss")

    def forward(self, x, label=None):
        """Return the cross-entropy between the classified embeddings and ``label``.

        ``x`` is flattened to ``(-1, embedding_dim)`` and ``label`` to ``(-1,)``
        before classification.

        Raises:
            ValueError: if ``label`` is not given; the default only exists so the
                signature matches the other losses.
        """
        if label is None:
            raise ValueError("SoftmaxLoss.forward requires `label`")

        # reshape for compatibility
        x = x.reshape(-1, x.size()[-1])
        label = label.reshape(-1)

        x = self.fc(x)
        L = self.criterion(x, label)

        return L

    def inference(self, embedding):
        """Return the predicted class id for ``embedding``.

        A single embedding of shape ``(1, embedding_dim)`` gives a 0-dim tensor;
        a batch ``(B, embedding_dim)`` with ``B > 1`` gives one id per row.
        """
        x = self.fc(embedding)
        activations = torch.nn.functional.softmax(x, dim=1).squeeze(0)
        # argmax over the class axis, so a batch is not flattened into one index
        class_id = torch.argmax(activations, dim=-1)
        return class_id
|
197 |
-
|
198 |
-
|
199 |
-
class SoftmaxAngleProtoLoss(nn.Module):
    """
    Sum of the Softmax loss and the Angular Prototypical loss, as defined in https://arxiv.org/abs/2009.14153
        Args:
            - embedding_dim (float): speaker embedding dim
            - n_speakers (float): number of speakers
            - init_w (float): defines the initial value of w
            - init_b (float): defines the initial value of b
    """

    def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
        super().__init__()

        # classification head first, prototypical term second
        self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
        self.angleproto = AngleProtoLoss(init_w, init_b)

        print("Initialised SoftmaxAnglePrototypical Loss")

    def forward(self, x, label=None):
        """
        Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)

        Only the softmax term receives ``label``.
        """
        proto_term = self.angleproto(x)
        softmax_term = self.softmax(x, label)
        return softmax_term + proto_term
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/encoder/models/__pycache__/base_encoder.cpython-310.pyc
DELETED
Binary file (4.53 kB)
|
|
TTS/encoder/models/__pycache__/lstm.cpython-310.pyc
DELETED
Binary file (3.61 kB)
|
|
TTS/encoder/models/__pycache__/resnet.cpython-310.pyc
DELETED
Binary file (5.84 kB)
|
|
TTS/encoder/models/base_encoder.py
DELETED
@@ -1,161 +0,0 @@
|
|
1 |
-
import numpy as np
|
2 |
-
import torch
|
3 |
-
import torchaudio
|
4 |
-
from coqpit import Coqpit
|
5 |
-
from torch import nn
|
6 |
-
|
7 |
-
from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
|
8 |
-
from TTS.utils.generic_utils import set_init_dict
|
9 |
-
from TTS.utils.io import load_fsspec
|
10 |
-
|
11 |
-
|
12 |
-
class PreEmphasis(nn.Module):
    """Pre-emphasis filter ``y[t] = x[t] - coefficient * x[t - 1]`` for 2-D inputs.

    The input is reflect-padded by one sample on the left so the output keeps
    the input length.
    """

    def __init__(self, coefficient=0.97):
        super().__init__()
        self.coefficient = coefficient
        # kernel of shape (1, 1, 2), kept as a buffer
        kernel = torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0)
        self.register_buffer("filter", kernel)

    def forward(self, x):
        """Filter ``x``, which must have exactly two dimensions."""
        assert len(x.size()) == 2

        padded = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
        filtered = torch.nn.functional.conv1d(padded, self.filter)
        return filtered.squeeze(1)
|
23 |
-
|
24 |
-
|
25 |
-
class BaseEncoder(nn.Module):
    """Base `encoder` class. Every new `encoder` model must inherit this.

    It defines common `encoder` specific functions. Several methods read
    ``self.use_torch_spec`` and ``self.audio_config`` and call ``self.forward``;
    none of these are defined here, so the inheriting class has to provide them.
    """

    # pylint: disable=W0102
    def __init__(self):
        super(BaseEncoder, self).__init__()

    def get_torch_mel_spectrogram_class(self, audio_config):
        """Build the pre-emphasis + mel-spectrogram module described by ``audio_config``.

        ``audio_config`` must provide the keys ``preemphasis``, ``sample_rate``,
        ``fft_size``, ``win_length``, ``hop_length`` and ``num_mels``.
        """
        return torch.nn.Sequential(
            PreEmphasis(audio_config["preemphasis"]),
            # TorchSTFT(
            #     n_fft=audio_config["fft_size"],
            #     hop_length=audio_config["hop_length"],
            #     win_length=audio_config["win_length"],
            #     sample_rate=audio_config["sample_rate"],
            #     window="hamming_window",
            #     mel_fmin=0.0,
            #     mel_fmax=None,
            #     use_htk=True,
            #     do_amp_to_db=False,
            #     n_mels=audio_config["num_mels"],
            #     power=2.0,
            #     use_mel=True,
            #     mel_norm=None,
            # )
            torchaudio.transforms.MelSpectrogram(
                sample_rate=audio_config["sample_rate"],
                n_fft=audio_config["fft_size"],
                win_length=audio_config["win_length"],
                hop_length=audio_config["hop_length"],
                window_fn=torch.hamming_window,
                n_mels=audio_config["num_mels"],
            ),
        )

    @torch.no_grad()
    def inference(self, x, l2_norm=True):
        """Run ``forward`` without tracking gradients."""
        return self.forward(x, l2_norm)

    @torch.no_grad()
    def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True):
        """
        Generate embeddings for a batch of utterances
        x: 1xTxD

        ``num_eval`` windows of ``num_frames`` steps are cut at evenly spaced
        offsets along dimension 1 and embedded as one batch. When
        ``self.use_torch_spec`` is set, ``num_frames`` is first multiplied by the
        hop length to express it in waveform samples. With ``return_mean`` the
        window embeddings are averaged into a single row.
        """
        # map to the waveform size
        if self.use_torch_spec:
            num_frames = num_frames * self.audio_config["hop_length"]

        max_len = x.shape[1]

        # inputs shorter than one window are embedded as a whole
        if max_len < num_frames:
            num_frames = max_len

        offsets = np.linspace(0, max_len - num_frames, num=num_eval)

        frames_batch = []
        for offset in offsets:
            offset = int(offset)
            end_offset = int(offset + num_frames)
            frames = x[:, offset:end_offset]
            frames_batch.append(frames)

        frames_batch = torch.cat(frames_batch, dim=0)
        embeddings = self.inference(frames_batch, l2_norm=l2_norm)

        if return_mean:
            embeddings = torch.mean(embeddings, dim=0, keepdim=True)
        return embeddings

    def get_criterion(self, c: Coqpit, num_classes=None):
        """Instantiate the loss selected by ``c.loss``.

        Supported values are ``"ge2e"``, ``"angleproto"`` and ``"softmaxproto"``;
        the last one also needs ``c.model_params["proj_dim"]`` and ``num_classes``.

        Raises:
            Exception: for any other value of ``c.loss``.
        """
        if c.loss == "ge2e":
            criterion = GE2ELoss(loss_method="softmax")
        elif c.loss == "angleproto":
            criterion = AngleProtoLoss()
        elif c.loss == "softmaxproto":
            criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes)
        else:
            raise Exception("The %s  not is a loss supported" % c.loss)
        return criterion

    def load_checkpoint(
        self,
        config: Coqpit,
        checkpoint_path: str,
        eval: bool = False,
        use_cuda: bool = False,
        criterion=None,
        cache=False,
    ):
        """Restore the model (and optionally the criterion) from a checkpoint.

        Args:
            config (Coqpit): model config; used for the partial restore and to
                build a criterion at inference time.
            checkpoint_path (str): location passed to ``load_fsspec``.
            eval (bool): switch the model to eval mode; a state-dict mismatch is
                re-raised instead of falling back to a partial restore.
            use_cuda (bool): move the model and the criterion to CUDA.
            criterion: loss module whose state is restored when the checkpoint
                contains a ``"criterion"`` entry.
            cache (bool): forwarded to ``load_fsspec``.

        Returns:
            ``criterion`` when ``eval`` is true, otherwise the tuple
            ``(criterion, state["step"])``.
        """
        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
        try:
            self.load_state_dict(state["model"])
            print(" > Model fully restored. ")
        except (KeyError, RuntimeError) as error:
            # If eval raise the error
            if eval:
                raise error

            print(" > Partial model initialization.")
            model_dict = self.state_dict()
            # pass the config received by this method (the name `c` does not exist here)
            model_dict = set_init_dict(model_dict, state["model"], config)
            self.load_state_dict(model_dict)
            del model_dict

        # load the criterion for restore_path
        if criterion is not None and "criterion" in state:
            try:
                criterion.load_state_dict(state["criterion"])
            except (KeyError, RuntimeError) as error:
                print(" > Criterion load ignored because of:", error)

        # instance and load the criterion for the encoder classifier in inference time
        if (
            eval
            and criterion is None
            and "criterion" in state
            and getattr(config, "map_classid_to_classname", None) is not None
        ):
            criterion = self.get_criterion(config, len(config.map_classid_to_classname))
            criterion.load_state_dict(state["criterion"])

        if use_cuda:
            self.cuda()
            if criterion is not None:
                criterion = criterion.cuda()

        if eval:
            self.eval()
            assert not self.training

        if not eval:
            return criterion, state["step"]
        return criterion
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/encoder/models/lstm.py
DELETED
@@ -1,99 +0,0 @@
|
|
1 |
-
import torch
|
2 |
-
from torch import nn
|
3 |
-
|
4 |
-
from TTS.encoder.models.base_encoder import BaseEncoder
|
5 |
-
|
6 |
-
|
7 |
-
class LSTMWithProjection(nn.Module):
    """Single batch-first LSTM layer followed by a bias-free linear projection."""

    def __init__(self, input_size, hidden_size, proj_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.proj_size = proj_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, proj_size, bias=False)

    def forward(self, x):
        """Project the LSTM output of every time step to ``proj_size`` features."""
        self.lstm.flatten_parameters()
        step_outputs = self.lstm(x)[0]
        projected = self.linear(step_outputs)
        return projected
|
20 |
-
|
21 |
-
|
22 |
-
class LSTMWithoutProjection(nn.Module):
    """Stacked batch-first LSTM whose last hidden state goes through a linear layer and a ReLU."""

    def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=lstm_dim,
            num_layers=num_lstm_layers,
            batch_first=True,
        )
        self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(linear(h))`` where ``h`` is the final hidden state of the top LSTM layer."""
        final_states = self.lstm(x)[1]
        top_hidden = final_states[0][-1]
        return self.relu(self.linear(top_hidden))
|
32 |
-
|
33 |
-
|
34 |
-
class LSTMSpeakerEncoder(BaseEncoder):
    """LSTM based speaker encoder.

    Args:
        input_dim (int): number of input features per frame.
        proj_dim (int): size of the produced embedding.
        lstm_dim (int): hidden size of every LSTM layer.
        num_lstm_layers (int): number of LSTM layers.
        use_lstm_with_projection (bool): stack ``LSTMWithProjection`` layers
            instead of using one ``LSTMWithoutProjection`` module.
        use_torch_spec (bool): compute the spectrogram inside the model from a
            raw waveform input.
        audio_config (dict): audio settings used to build the spectrogram module
            when ``use_torch_spec`` is set.
    """

    def __init__(
        self,
        input_dim,
        proj_dim=256,
        lstm_dim=768,
        num_lstm_layers=3,
        use_lstm_with_projection=True,
        use_torch_spec=False,
        audio_config=None,
    ):
        super().__init__()
        self.use_lstm_with_projection = use_lstm_with_projection
        self.use_torch_spec = use_torch_spec
        self.audio_config = audio_config
        self.proj_dim = proj_dim

        layers = []
        # choose the LSTM layer type
        if use_lstm_with_projection:
            layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
            for _ in range(num_lstm_layers - 1):
                layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
            self.layers = nn.Sequential(*layers)
        else:
            self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)

        self.instancenorm = nn.InstanceNorm1d(input_dim)

        if self.use_torch_spec:
            self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
        else:
            self.torch_spec = None

        self._init_layers()

    def _init_layers(self):
        """Zero every bias and Xavier-initialise every weight of ``self.layers``."""
        for name, param in self.layers.named_parameters():
            if "bias" in name:
                nn.init.constant_(param, 0.0)
            elif "weight" in name:
                nn.init.xavier_normal_(param)

    def forward(self, x, l2_norm=True):
        """Forward pass of the model.

        Args:
            x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
                to compute the spectrogram on-the-fly.
            l2_norm (bool): Whether to L2-normalize the outputs.

        Shapes:
            - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
        """
        # feature extraction is done without gradients and outside autocast
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=False):
                if self.use_torch_spec:
                    # out-of-place squeeze: do not reshape the caller's tensor
                    x = x.squeeze(1)
                    x = self.torch_spec(x)
                x = self.instancenorm(x).transpose(1, 2)
        d = self.layers(x)
        if self.use_lstm_with_projection:
            # keep the projection of the last time step only
            d = d[:, -1]
        if l2_norm:
            d = torch.nn.functional.normalize(d, p=2, dim=1)
        return d
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/encoder/models/resnet.py
DELETED
@@ -1,198 +0,0 @@
|
|
1 |
-
import torch
|
2 |
-
from torch import nn
|
3 |
-
|
4 |
-
# from TTS.utils.audio.torch_transforms import TorchSTFT
|
5 |
-
from TTS.encoder.models.base_encoder import BaseEncoder
|
6 |
-
|
7 |
-
|
8 |
-
class SELayer(nn.Module):
    """Channel gating: global average pool -> bottleneck MLP -> sigmoid scale per channel."""

    def __init__(self, channel, reduction=8):
        super().__init__()
        bottleneck = channel // reduction
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, bottleneck),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, channel),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Scale every channel of the 4-D input ``x`` by its learned gate."""
        batch, channels, _, _ = x.size()
        pooled = self.avg_pool(x).view(batch, channels)
        gates = self.fc(pooled).view(batch, channels, 1, 1)
        return x * gates
|
24 |
-
|
25 |
-
|
26 |
-
class SEBasicBlock(nn.Module):
    """Residual block of two 3x3 convolutions with an ``SELayer`` gate on the main path.

    ``downsample``, when given, is applied to the block input to produce the
    shortcut that is added to the gated output.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
        super().__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.se = SELayer(planes, reduction)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        # main path: conv -> relu -> bn, then conv -> bn -> channel gate
        hidden = self.bn1(self.relu(self.conv1(x)))
        hidden = self.se(self.bn2(self.conv2(hidden)))

        shortcut = x if self.downsample is None else self.downsample(x)

        hidden += shortcut
        return self.relu(hidden)
|
57 |
-
|
58 |
-
|
59 |
-
class ResNetSpeakerEncoder(BaseEncoder):
    """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
    Adapted from: https://github.com/clovaai/voxceleb_trainer

    Args:
        input_dim (int): number of spectrogram bins of the input.
        proj_dim (int): size of the produced embedding.
        layers (list): number of residual blocks in each of the four stages.
        num_filters (list): number of channels in each of the four stages.
        encoder_type (str): pooling type, ``"SAP"`` or ``"ASP"``.
        log_input (bool): take the log of the spectrogram before normalisation.
        use_torch_spec (bool): compute the spectrogram inside the model from a
            raw waveform input.
        audio_config (dict): audio settings used to build the spectrogram module
            when ``use_torch_spec`` is set.

    Raises:
        ValueError: if ``encoder_type`` is neither ``"SAP"`` nor ``"ASP"``.
    """

    # pylint: disable=W0102
    def __init__(
        self,
        input_dim=64,
        proj_dim=512,
        layers=[3, 4, 6, 3],
        num_filters=[32, 64, 128, 256],
        encoder_type="ASP",
        log_input=False,
        use_torch_spec=False,
        audio_config=None,
    ):
        super(ResNetSpeakerEncoder, self).__init__()

        self.encoder_type = encoder_type
        self.input_dim = input_dim
        self.log_input = log_input
        self.use_torch_spec = use_torch_spec
        self.audio_config = audio_config
        self.proj_dim = proj_dim

        self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU(inplace=True)
        self.bn1 = nn.BatchNorm2d(num_filters[0])

        # `create_layer` reads and updates `self.inplanes` as stages are chained
        self.inplanes = num_filters[0]
        self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0])
        self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2))
        self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2))
        self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2))

        self.instancenorm = nn.InstanceNorm1d(input_dim)

        if self.use_torch_spec:
            self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
        else:
            self.torch_spec = None

        # three stride-2 stages shrink the frequency axis by a factor of 8
        outmap_size = int(self.input_dim / 8)

        self.attention = nn.Sequential(
            nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1),
            nn.Softmax(dim=2),
        )

        if self.encoder_type == "SAP":
            out_dim = num_filters[3] * outmap_size
        elif self.encoder_type == "ASP":
            # ASP concatenates the weighted mean and the weighted std
            out_dim = num_filters[3] * outmap_size * 2
        else:
            raise ValueError("Undefined encoder")

        self.fc = nn.Linear(out_dim, proj_dim)

        self._init_layers()

    def _init_layers(self):
        """Kaiming-initialise the 2-D convolutions and reset the 2-D batch norms to identity."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def create_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks with ``planes`` output channels.

        Only the first block uses ``stride``; it also gets a 1x1 conv + batch
        norm shortcut when the stride or the channel count changes.
        ``self.inplanes`` is updated to the stage's output channel count.
        """
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    # pylint: disable=R0201
    def new_parameter(self, *size):
        """Return a new Xavier-initialised float parameter of the given size."""
        out = nn.Parameter(torch.FloatTensor(*size))
        nn.init.xavier_normal_(out)
        return out

    def forward(self, x, l2_norm=False):
        """Forward pass of the model.

        Args:
            x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
                to compute the spectrogram on-the-fly.
            l2_norm (bool): Whether to L2-normalize the outputs.

        Shapes:
            - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
        """
        # out-of-place squeeze: do not reshape the caller's tensor
        x = x.squeeze(1)
        # compute the spectrogram here when `use_torch_spec` is set; otherwise
        # the input is the mel spectrogram computed by the AP
        if self.use_torch_spec:
            x = self.torch_spec(x)

        if self.log_input:
            x = (x + 1e-6).log()
        x = self.instancenorm(x).unsqueeze(1)

        x = self.conv1(x)
        x = self.relu(x)
        x = self.bn1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # merge the channel and frequency axes: (N, C * F, T)
        x = x.reshape(x.size()[0], -1, x.size()[-1])

        w = self.attention(x)

        if self.encoder_type == "SAP":
            x = torch.sum(x * w, dim=2)
        elif self.encoder_type == "ASP":
            mu = torch.sum(x * w, dim=2)
            # clamp keeps the variance positive before the square root
            sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5))
            x = torch.cat((mu, sg), 1)

        x = x.view(x.size()[0], -1)
        x = self.fc(x)

        if l2_norm:
            x = torch.nn.functional.normalize(x, p=2, dim=1)
        return x
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
TTS/encoder/requirements.txt
DELETED
@@ -1,2 +0,0 @@
|
|
1 |
-
umap-learn
|
2 |
-
numpy>=1.17.0
|
|
|
|
|
|
TTS/encoder/utils/__init__.py
DELETED
File without changes
|
TTS/encoder/utils/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (171 Bytes)
|
|
TTS/encoder/utils/__pycache__/generic_utils.cpython-310.pyc
DELETED
Binary file (3.7 kB)
|
|