ANYANTUDRE committed
Commit 7c7161e · 1 Parent(s): cfe3883

fixed typo in goai_stt_ttt_pipeline

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full changeset.
Files changed (50)
  1. Makefile +0 -3
  2. TTS/.models.json +0 -938
  3. TTS/VERSION +0 -1
  4. TTS/__init__.py +0 -6
  5. TTS/__pycache__/__init__.cpython-310.pyc +0 -0
  6. TTS/__pycache__/model.cpython-310.pyc +0 -0
  7. TTS/api.py +0 -458
  8. TTS/bin/__init__.py +0 -0
  9. TTS/bin/collect_env_info.py +0 -48
  10. TTS/bin/compute_attention_masks.py +0 -165
  11. TTS/bin/compute_embeddings.py +0 -197
  12. TTS/bin/compute_statistics.py +0 -96
  13. TTS/bin/eval_encoder.py +0 -88
  14. TTS/bin/extract_tts_spectrograms.py +0 -287
  15. TTS/bin/find_unique_chars.py +0 -45
  16. TTS/bin/find_unique_phonemes.py +0 -74
  17. TTS/bin/remove_silence_using_vad.py +0 -124
  18. TTS/bin/resample.py +0 -90
  19. TTS/bin/synthesize.py +0 -494
  20. TTS/bin/train_encoder.py +0 -332
  21. TTS/bin/train_tts.py +0 -71
  22. TTS/bin/train_vocoder.py +0 -77
  23. TTS/bin/tune_wavegrad.py +0 -103
  24. TTS/config/__init__.py +0 -135
  25. TTS/config/__pycache__/__init__.cpython-310.pyc +0 -0
  26. TTS/config/__pycache__/shared_configs.cpython-310.pyc +0 -0
  27. TTS/config/shared_configs.py +0 -268
  28. TTS/demos/xtts_ft_demo/requirements.txt +0 -2
  29. TTS/demos/xtts_ft_demo/utils/formatter.py +0 -160
  30. TTS/demos/xtts_ft_demo/utils/gpt_train.py +0 -172
  31. TTS/demos/xtts_ft_demo/xtts_demo.py +0 -415
  32. TTS/encoder/README.md +0 -18
  33. TTS/encoder/__init__.py +0 -0
  34. TTS/encoder/__pycache__/__init__.cpython-310.pyc +0 -0
  35. TTS/encoder/__pycache__/losses.cpython-310.pyc +0 -0
  36. TTS/encoder/configs/base_encoder_config.py +0 -61
  37. TTS/encoder/configs/emotion_encoder_config.py +0 -12
  38. TTS/encoder/configs/speaker_encoder_config.py +0 -11
  39. TTS/encoder/dataset.py +0 -147
  40. TTS/encoder/losses.py +0 -226
  41. TTS/encoder/models/__pycache__/base_encoder.cpython-310.pyc +0 -0
  42. TTS/encoder/models/__pycache__/lstm.cpython-310.pyc +0 -0
  43. TTS/encoder/models/__pycache__/resnet.cpython-310.pyc +0 -0
  44. TTS/encoder/models/base_encoder.py +0 -161
  45. TTS/encoder/models/lstm.py +0 -99
  46. TTS/encoder/models/resnet.py +0 -198
  47. TTS/encoder/requirements.txt +0 -2
  48. TTS/encoder/utils/__init__.py +0 -0
  49. TTS/encoder/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  50. TTS/encoder/utils/__pycache__/generic_utils.cpython-310.pyc +0 -0
Makefile CHANGED
@@ -5,9 +5,6 @@ install:
 test:
 	python app.py
 
-debug:
-	#python -m pytest -vv --pdb #Debugger is invoked
-
 format:
 	#black *.py
 
TTS/.models.json DELETED
@@ -1,938 +0,0 @@
1
- {
2
- "tts_models": {
3
- "multilingual": {
4
- "multi-dataset": {
5
- "xtts_v2": {
6
- "description": "XTTS-v2.0.3 by Coqui with 17 languages.",
7
- "hf_url": [
8
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
9
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
10
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
11
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5",
12
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/speakers_xtts.pth"
13
- ],
14
- "model_hash": "10f92b55c512af7a8d39d650547a15a7",
15
- "default_vocoder": null,
16
- "commit": "480a6cdf7",
17
- "license": "CPML",
18
- "contact": "[email protected]",
19
- "tos_required": true
20
- },
21
- "xtts_v1.1": {
22
- "description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.",
23
- "hf_url": [
24
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth",
25
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/config.json",
26
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json",
27
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/hash.md5"
28
- ],
29
- "model_hash": "7c62beaf58d39b729de287330dc254e7b515677416839b649a50e7cf74c3df59",
30
- "default_vocoder": null,
31
- "commit": "82910a63",
32
- "license": "CPML",
33
- "contact": "[email protected]",
34
- "tos_required": true
35
- },
36
- "your_tts": {
37
- "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
38
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
39
- "default_vocoder": null,
40
- "commit": "e9a1953e",
41
- "license": "CC BY-NC-ND 4.0",
42
- "contact": "[email protected]"
43
- },
44
- "bark": {
45
- "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
46
- "hf_url": [
47
- "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
48
- "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
49
- "https://coqui.gateway.scarf.sh/hf/text_2.pt",
50
- "https://coqui.gateway.scarf.sh/hf/bark/config.json",
51
- "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
52
- "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
53
- ],
54
- "default_vocoder": null,
55
- "commit": "e9a1953e",
56
- "license": "MIT",
57
- "contact": "https://www.suno.ai/"
58
- }
59
- }
60
- },
61
- "bg": {
62
- "cv": {
63
- "vits": {
64
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
65
- "default_vocoder": null,
66
- "commit": null,
67
- "author": "@NeonGeckoCom",
68
- "license": "bsd-3-clause"
69
- }
70
- }
71
- },
72
- "cs": {
73
- "cv": {
74
- "vits": {
75
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
76
- "default_vocoder": null,
77
- "commit": null,
78
- "author": "@NeonGeckoCom",
79
- "license": "bsd-3-clause"
80
- }
81
- }
82
- },
83
- "da": {
84
- "cv": {
85
- "vits": {
86
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
87
- "default_vocoder": null,
88
- "commit": null,
89
- "author": "@NeonGeckoCom",
90
- "license": "bsd-3-clause"
91
- }
92
- }
93
- },
94
- "et": {
95
- "cv": {
96
- "vits": {
97
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
98
- "default_vocoder": null,
99
- "commit": null,
100
- "author": "@NeonGeckoCom",
101
- "license": "bsd-3-clause"
102
- }
103
- }
104
- },
105
- "ga": {
106
- "cv": {
107
- "vits": {
108
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
109
- "default_vocoder": null,
110
- "commit": null,
111
- "author": "@NeonGeckoCom",
112
- "license": "bsd-3-clause"
113
- }
114
- }
115
- },
116
- "en": {
117
- "ek1": {
118
- "tacotron2": {
119
- "description": "EK1 en-rp tacotron2 by NMStoker",
120
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
121
- "default_vocoder": "vocoder_models/en/ek1/wavegrad",
122
- "commit": "c802255",
123
- "license": "apache 2.0"
124
- }
125
- },
126
- "ljspeech": {
127
- "tacotron2-DDC": {
128
- "description": "Tacotron2 with Double Decoder Consistency.",
129
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
130
- "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
131
- "commit": "bae2ad0f",
132
- "author": "Eren Gölge @erogol",
133
- "license": "apache 2.0",
134
- "contact": "[email protected]"
135
- },
136
- "tacotron2-DDC_ph": {
137
- "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
138
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
139
- "default_vocoder": "vocoder_models/en/ljspeech/univnet",
140
- "commit": "3900448",
141
- "author": "Eren Gölge @erogol",
142
- "license": "apache 2.0",
143
- "contact": "[email protected]"
144
- },
145
- "glow-tts": {
146
- "description": "",
147
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
148
- "stats_file": null,
149
- "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
150
- "commit": "",
151
- "author": "Eren Gölge @erogol",
152
- "license": "MPL",
153
- "contact": "[email protected]"
154
- },
155
- "speedy-speech": {
156
- "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
157
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
158
- "stats_file": null,
159
- "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
160
- "commit": "4581e3d",
161
- "author": "Eren Gölge @erogol",
162
- "license": "apache 2.0",
163
- "contact": "[email protected]"
164
- },
165
- "tacotron2-DCA": {
166
- "description": "",
167
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
168
- "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
169
- "commit": "",
170
- "author": "Eren Gölge @erogol",
171
- "license": "MPL",
172
- "contact": "[email protected]"
173
- },
174
- "vits": {
175
- "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
176
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
177
- "default_vocoder": null,
178
- "commit": "3900448",
179
- "author": "Eren Gölge @erogol",
180
- "license": "apache 2.0",
181
- "contact": "[email protected]"
182
- },
183
- "vits--neon": {
184
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
185
- "default_vocoder": null,
186
- "author": "@NeonGeckoCom",
187
- "license": "bsd-3-clause",
188
- "contact": null,
189
- "commit": null
190
- },
191
- "fast_pitch": {
192
- "description": "FastPitch model trained on LJSpeech using the Aligner Network",
193
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
194
- "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
195
- "commit": "b27b3ba",
196
- "author": "Eren Gölge @erogol",
197
- "license": "apache 2.0",
198
- "contact": "[email protected]"
199
- },
200
- "overflow": {
201
- "description": "Overflow model trained on LJSpeech",
202
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
203
- "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
204
- "commit": "3b1a28f",
205
- "author": "Eren Gölge @erogol",
206
- "license": "apache 2.0",
207
- "contact": "[email protected]"
208
- },
209
- "neural_hmm": {
210
- "description": "Neural HMM model trained on LJSpeech",
211
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.11.0_models/tts_models--en--ljspeech--neural_hmm.zip",
212
- "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
213
- "commit": "3b1a28f",
214
- "author": "Shivam Metha @shivammehta25",
215
- "license": "apache 2.0",
216
- "contact": "d83ee8fe45e3c0d776d4a865aca21d7c2ac324c4"
217
- }
218
- },
219
- "vctk": {
220
- "vits": {
221
- "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
222
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
223
- "default_vocoder": null,
224
- "commit": "3900448",
225
- "author": "Eren @erogol",
226
- "license": "apache 2.0",
227
- "contact": "[email protected]"
228
- },
229
- "fast_pitch": {
230
- "description": "FastPitch model trained on VCTK dataseset.",
231
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
232
- "default_vocoder": null,
233
- "commit": "bdab788d",
234
- "author": "Eren @erogol",
235
- "license": "CC BY-NC-ND 4.0",
236
- "contact": "[email protected]"
237
- }
238
- },
239
- "sam": {
240
- "tacotron-DDC": {
241
- "description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.",
242
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
243
- "default_vocoder": "vocoder_models/en/sam/hifigan_v2",
244
- "commit": "bae2ad0f",
245
- "author": "Eren Gölge @erogol",
246
- "license": "apache 2.0",
247
- "contact": "[email protected]"
248
- }
249
- },
250
- "blizzard2013": {
251
- "capacitron-t2-c50": {
252
- "description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
253
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
254
- "commit": "d6284e7",
255
- "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
256
- "author": "Adam Froghyar @a-froghyar",
257
- "license": "apache 2.0",
258
- "contact": "[email protected]"
259
- },
260
- "capacitron-t2-c150_v2": {
261
- "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
262
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
263
- "commit": "a67039d",
264
- "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
265
- "author": "Adam Froghyar @a-froghyar",
266
- "license": "apache 2.0",
267
- "contact": "[email protected]"
268
- }
269
- },
270
- "multi-dataset": {
271
- "tortoise-v2": {
272
- "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
273
- "github_rls_url": [
274
- "https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
275
- "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
276
- "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
277
- "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
278
- "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
279
- "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
280
- "https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
281
- "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
282
- "https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
283
- ],
284
- "commit": "c1875f6",
285
- "default_vocoder": null,
286
- "author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
287
- "license": "apache 2.0"
288
- }
289
- },
290
- "jenny": {
291
- "jenny": {
292
- "description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
293
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
294
- "default_vocoder": null,
295
- "commit": "ba40a1c",
296
- "license": "custom - see https://github.com/dioco-group/jenny-tts-dataset#important",
297
- "author": "@noml4u"
298
- }
299
- }
300
- },
301
- "es": {
302
- "mai": {
303
- "tacotron2-DDC": {
304
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
305
- "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
306
- "commit": "",
307
- "author": "Eren Gölge @erogol",
308
- "license": "MPL",
309
- "contact": "[email protected]"
310
- }
311
- },
312
- "css10": {
313
- "vits": {
314
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
315
- "default_vocoder": null,
316
- "commit": null,
317
- "author": "@NeonGeckoCom",
318
- "license": "bsd-3-clause"
319
- }
320
- }
321
- },
322
- "fr": {
323
- "mai": {
324
- "tacotron2-DDC": {
325
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
326
- "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
327
- "commit": null,
328
- "author": "Eren Gölge @erogol",
329
- "license": "MPL",
330
- "contact": "[email protected]"
331
- }
332
- },
333
- "css10": {
334
- "vits": {
335
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
336
- "default_vocoder": null,
337
- "commit": null,
338
- "author": "@NeonGeckoCom",
339
- "license": "bsd-3-clause"
340
- }
341
- }
342
- },
343
- "uk": {
344
- "mai": {
345
- "glow-tts": {
346
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
347
- "author": "@robinhad",
348
- "commit": "bdab788d",
349
- "license": "MIT",
350
- "contact": "",
351
- "default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
352
- },
353
- "vits": {
354
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
355
- "default_vocoder": null,
356
- "commit": null,
357
- "author": "@NeonGeckoCom",
358
- "license": "bsd-3-clause"
359
- }
360
- }
361
- },
362
- "zh-CN": {
363
- "baker": {
364
- "tacotron2-DDC-GST": {
365
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
366
- "commit": "unknown",
367
- "author": "@kirianguiller",
368
- "license": "apache 2.0",
369
- "default_vocoder": null
370
- }
371
- }
372
- },
373
- "nl": {
374
- "mai": {
375
- "tacotron2-DDC": {
376
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
377
- "author": "@r-dh",
378
- "license": "apache 2.0",
379
- "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
380
- "stats_file": null,
381
- "commit": "540d811"
382
- }
383
- },
384
- "css10": {
385
- "vits": {
386
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
387
- "default_vocoder": null,
388
- "commit": null,
389
- "author": "@NeonGeckoCom",
390
- "license": "bsd-3-clause"
391
- }
392
- }
393
- },
394
- "de": {
395
- "thorsten": {
396
- "tacotron2-DCA": {
397
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
398
- "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
399
- "author": "@thorstenMueller",
400
- "license": "apache 2.0",
401
- "commit": "unknown"
402
- },
403
- "vits": {
404
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--de--thorsten--vits.zip",
405
- "default_vocoder": null,
406
- "author": "@thorstenMueller",
407
- "license": "apache 2.0",
408
- "commit": "unknown"
409
- },
410
- "tacotron2-DDC": {
411
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
412
- "default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
413
- "description": "Thorsten-Dec2021-22k-DDC",
414
- "author": "@thorstenMueller",
415
- "license": "apache 2.0",
416
- "commit": "unknown"
417
- }
418
- },
419
- "css10": {
420
- "vits-neon": {
421
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
422
- "default_vocoder": null,
423
- "author": "@NeonGeckoCom",
424
- "license": "bsd-3-clause",
425
- "commit": null
426
- }
427
- }
428
- },
429
- "ja": {
430
- "kokoro": {
431
- "tacotron2-DDC": {
432
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
433
- "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
434
- "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
435
- "author": "@kaiidams",
436
- "license": "apache 2.0",
437
- "commit": "401fbd89"
438
- }
439
- }
440
- },
441
- "tr": {
442
- "common-voice": {
443
- "glow-tts": {
444
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
445
- "default_vocoder": "vocoder_models/tr/common-voice/hifigan",
446
- "license": "MIT",
447
- "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
448
- "author": "Fatih Akademi",
449
- "commit": null
450
- }
451
- }
452
- },
453
- "it": {
454
- "mai_female": {
455
- "glow-tts": {
456
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
457
- "default_vocoder": null,
458
- "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
459
- "author": "@nicolalandro",
460
- "license": "apache 2.0",
461
- "commit": null
462
- },
463
- "vits": {
464
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
465
- "default_vocoder": null,
466
- "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
467
- "author": "@nicolalandro",
468
- "license": "apache 2.0",
469
- "commit": null
470
- }
471
- },
472
- "mai_male": {
473
- "glow-tts": {
474
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
475
- "default_vocoder": null,
476
- "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
477
- "author": "@nicolalandro",
478
- "license": "apache 2.0",
479
- "commit": null
480
- },
481
- "vits": {
482
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
483
- "default_vocoder": null,
484
- "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
485
- "author": "@nicolalandro",
486
- "license": "apache 2.0",
487
- "commit": null
488
- }
489
- }
490
- },
491
- "ewe": {
492
- "openbible": {
493
- "vits": {
494
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
495
- "default_vocoder": null,
496
- "license": "CC-BY-SA 4.0",
497
- "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
498
- "author": "@coqui_ai",
499
- "commit": "1b22f03"
500
- }
501
- }
502
- },
503
- "hau": {
504
- "openbible": {
505
- "vits": {
506
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
507
- "default_vocoder": null,
508
- "license": "CC-BY-SA 4.0",
509
- "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
510
- "author": "@coqui_ai",
511
- "commit": "1b22f03"
512
- }
513
- }
514
- },
515
- "lin": {
516
- "openbible": {
517
- "vits": {
518
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
519
- "default_vocoder": null,
520
- "license": "CC-BY-SA 4.0",
521
- "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
522
- "author": "@coqui_ai",
523
- "commit": "1b22f03"
524
- }
525
- }
526
- },
527
- "tw_akuapem": {
528
- "openbible": {
529
- "vits": {
530
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
531
- "default_vocoder": null,
532
- "license": "CC-BY-SA 4.0",
533
- "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
534
- "author": "@coqui_ai",
535
- "commit": "1b22f03"
536
- }
537
- }
538
- },
539
- "tw_asante": {
540
- "openbible": {
541
- "vits": {
542
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
543
- "default_vocoder": null,
544
- "license": "CC-BY-SA 4.0",
545
- "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
546
- "author": "@coqui_ai",
547
- "commit": "1b22f03"
548
- }
549
- }
550
- },
551
- "yor": {
552
- "openbible": {
553
- "vits": {
554
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
555
- "default_vocoder": null,
556
- "license": "CC-BY-SA 4.0",
557
- "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
558
- "author": "@coqui_ai",
559
- "commit": "1b22f03"
560
- }
561
- }
562
- },
563
- "hu": {
564
- "css10": {
565
- "vits": {
566
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
567
- "default_vocoder": null,
568
- "commit": null,
569
- "author": "@NeonGeckoCom",
570
- "license": "bsd-3-clause"
571
- }
572
- }
573
- },
574
- "el": {
575
- "cv": {
576
- "vits": {
577
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
578
- "default_vocoder": null,
579
- "commit": null,
580
- "author": "@NeonGeckoCom",
581
- "license": "bsd-3-clause"
582
- }
583
- }
584
- },
585
- "fi": {
586
- "css10": {
587
- "vits": {
588
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
589
- "default_vocoder": null,
590
- "commit": null,
591
- "author": "@NeonGeckoCom",
592
- "license": "bsd-3-clause"
593
- }
594
- }
595
- },
596
- "hr": {
597
- "cv": {
598
- "vits": {
599
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
600
- "default_vocoder": null,
601
- "commit": null,
602
- "author": "@NeonGeckoCom",
603
- "license": "bsd-3-clause"
604
- }
605
- }
606
- },
607
- "lt": {
608
- "cv": {
609
- "vits": {
610
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
611
- "default_vocoder": null,
612
- "commit": null,
613
- "author": "@NeonGeckoCom",
614
- "license": "bsd-3-clause"
615
- }
616
- }
617
- },
618
- "lv": {
619
- "cv": {
620
- "vits": {
621
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
622
- "default_vocoder": null,
623
- "commit": null,
624
- "author": "@NeonGeckoCom",
625
- "license": "bsd-3-clause"
626
- }
627
- }
628
- },
629
- "mt": {
630
- "cv": {
631
- "vits": {
632
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
633
- "default_vocoder": null,
634
- "commit": null,
635
- "author": "@NeonGeckoCom",
636
- "license": "bsd-3-clause"
637
- }
638
- }
639
- },
640
- "pl": {
641
- "mai_female": {
642
- "vits": {
643
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
644
- "default_vocoder": null,
645
- "commit": null,
646
- "author": "@NeonGeckoCom",
647
- "license": "bsd-3-clause"
648
- }
649
- }
650
- },
651
- "pt": {
652
- "cv": {
653
- "vits": {
654
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
655
- "default_vocoder": null,
656
- "commit": null,
657
- "author": "@NeonGeckoCom",
658
- "license": "bsd-3-clause"
659
- }
660
- }
661
- },
662
- "ro": {
663
- "cv": {
664
- "vits": {
665
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
666
- "default_vocoder": null,
667
- "commit": null,
668
- "author": "@NeonGeckoCom",
669
- "license": "bsd-3-clause"
670
- }
671
- }
672
- },
673
- "sk": {
674
- "cv": {
675
- "vits": {
676
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
677
- "default_vocoder": null,
678
- "commit": null,
679
- "author": "@NeonGeckoCom",
680
- "license": "bsd-3-clause"
681
- }
682
- }
683
- },
684
- "sl": {
685
- "cv": {
686
- "vits": {
687
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
688
- "default_vocoder": null,
689
- "commit": null,
690
- "author": "@NeonGeckoCom",
691
- "license": "bsd-3-clause"
692
- }
693
- }
694
- },
695
- "sv": {
696
- "cv": {
697
- "vits": {
698
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
699
- "default_vocoder": null,
700
- "commit": null,
701
- "author": "@NeonGeckoCom",
702
- "license": "bsd-3-clause"
703
- }
704
- }
705
- },
706
- "ca": {
707
- "custom": {
708
- "vits": {
709
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
710
- "default_vocoder": null,
711
- "commit": null,
712
- "description": " It is trained from zero with 101460 utterances consisting of 257 speakers, approx 138 hours of speech. We used three datasets;\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
713
- "author": "@gullabi",
714
- "license": "CC-BY-4.0"
715
- }
716
- }
717
- },
718
- "fa": {
719
- "custom": {
720
- "glow-tts": {
721
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
722
- "default_vocoder": null,
723
- "commit": null,
724
- "description": "persian-tts-female-glow_tts model for text to speech purposes. Single-speaker female voice Trained on persian-tts-dataset-famale. \nThis model has no compatible vocoder thus the output quality is not very good. \nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
725
- "author": "@karim23657",
726
- "license": "CC-BY-4.0"
727
- }
728
- }
729
- },
730
- "bn": {
731
- "custom": {
732
- "vits-male": {
733
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
734
- "default_vocoder": null,
735
- "commit": null,
736
- "description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
737
- "author": "@mobassir94",
738
- "license": "Apache 2.0"
739
- },
740
- "vits-female": {
741
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
742
- "default_vocoder": null,
743
- "commit": null,
744
- "description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
745
- "author": "@mobassir94",
746
- "license": "Apache 2.0"
747
- }
748
- }
749
- },
750
- "be": {
751
- "common-voice": {
752
- "glow-tts":{
753
- "description": "Belarusian GlowTTS model created by @alex73 (Github).",
754
- "github_rls_url":"https://coqui.gateway.scarf.sh/v0.16.6/tts_models--be--common-voice--glow-tts.zip",
755
- "default_vocoder": "vocoder_models/be/common-voice/hifigan",
756
- "commit": "c0aabb85",
757
- "license": "CC-BY-SA 4.0",
758
- "contact": "[email protected]"
759
- }
760
- }
761
- }
762
- },
763
- "vocoder_models": {
764
- "universal": {
765
- "libri-tts": {
766
- "wavegrad": {
767
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
768
- "commit": "ea976b0",
769
- "author": "Eren Gölge @erogol",
770
- "license": "MPL",
771
- "contact": "[email protected]"
772
- },
773
- "fullband-melgan": {
774
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
775
- "commit": "4132240",
776
- "author": "Eren Gölge @erogol",
777
- "license": "MPL",
778
- "contact": "[email protected]"
779
- }
780
- }
781
- },
782
- "en": {
783
- "ek1": {
784
- "wavegrad": {
785
- "description": "EK1 en-rp wavegrad by NMStoker",
786
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
787
- "commit": "c802255",
788
- "license": "apache 2.0"
789
- }
790
- },
791
- "ljspeech": {
792
- "multiband-melgan": {
793
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
794
- "commit": "ea976b0",
795
- "author": "Eren Gölge @erogol",
796
- "license": "MPL",
797
- "contact": "[email protected]"
798
- },
799
- "hifigan_v2": {
800
- "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
801
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
802
- "commit": "bae2ad0f",
803
- "author": "@erogol",
804
- "license": "apache 2.0",
805
- "contact": "[email protected]"
806
- },
807
- "univnet": {
808
- "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
809
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
810
- "commit": "4581e3d",
811
- "author": "Eren @erogol",
812
- "license": "apache 2.0",
813
- "contact": "[email protected]"
814
- }
815
- },
816
- "blizzard2013": {
817
- "hifigan_v2": {
818
- "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
819
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
820
- "commit": "d6284e7",
821
- "author": "Adam Froghyar @a-froghyar",
822
- "license": "apache 2.0",
823
- "contact": "[email protected]"
824
- }
825
- },
826
- "vctk": {
827
- "hifigan_v2": {
828
- "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
829
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
830
- "commit": "2f07160",
831
- "author": "Edresson Casanova",
832
- "license": "apache 2.0",
833
- "contact": ""
834
- }
835
- },
836
- "sam": {
837
- "hifigan_v2": {
838
- "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
839
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
840
- "commit": "2f07160",
841
- "author": "Eren Gölge @erogol",
842
- "license": "apache 2.0",
843
- "contact": "[email protected]"
844
- }
845
- }
846
- },
847
- "nl": {
848
- "mai": {
849
- "parallel-wavegan": {
850
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
851
- "author": "@r-dh",
852
- "license": "apache 2.0",
853
- "commit": "unknown"
854
- }
855
- }
856
- },
857
- "de": {
858
- "thorsten": {
859
- "wavegrad": {
860
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
861
- "author": "@thorstenMueller",
862
- "license": "apache 2.0",
863
- "commit": "unknown"
864
- },
865
- "fullband-melgan": {
866
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
867
- "author": "@thorstenMueller",
868
- "license": "apache 2.0",
869
- "commit": "unknown"
870
- },
871
- "hifigan_v1": {
872
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
873
- "description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
874
- "author": "@thorstenMueller",
875
- "license": "apache 2.0",
876
- "commit": "unknown"
877
- }
878
- }
879
- },
880
- "ja": {
881
- "kokoro": {
882
- "hifigan_v1": {
883
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
884
- "description": "HifiGAN model trained for kokoro dataset by @kaiidams",
885
- "author": "@kaiidams",
886
- "license": "apache 2.0",
887
- "commit": "3900448"
888
- }
889
- }
890
- },
891
- "uk": {
892
- "mai": {
893
- "multiband-melgan": {
894
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
895
- "author": "@robinhad",
896
- "commit": "bdab788d",
897
- "license": "MIT",
898
- "contact": ""
899
- }
900
- }
901
- },
902
- "tr": {
903
- "common-voice": {
904
- "hifigan": {
905
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
906
- "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
907
- "author": "Fatih Akademi",
908
- "license": "MIT",
909
- "commit": null
910
- }
911
- }
912
- },
913
- "be": {
914
- "common-voice": {
915
- "hifigan": {
916
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.16.6/vocoder_models--be--common-voice--hifigan.zip",
917
- "description": "Belarusian HiFiGAN model created by @alex73 (Github).",
918
- "author": "@alex73",
919
- "license": "CC-BY-SA 4.0",
920
- "commit": "c0aabb85"
921
- }
922
- }
923
- }
924
- },
925
- "voice_conversion_models": {
926
- "multilingual": {
927
- "vctk": {
928
- "freevc24": {
929
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
930
- "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
931
- "author": "Jing-Yi Li @OlaWod",
932
- "license": "MIT",
933
- "commit": null
934
- }
935
- }
936
- }
937
- }
938
- }
 
TTS/VERSION DELETED
@@ -1 +0,0 @@
-0.22.0
 
 
TTS/__init__.py DELETED
@@ -1,6 +0,0 @@
-import os
-
-with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
-    version = f.read().strip()
-
-__version__ = version
 
TTS/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (375 Bytes)
 
TTS/__pycache__/model.cpython-310.pyc DELETED
Binary file (2.6 kB)
 
TTS/api.py DELETED
@@ -1,458 +0,0 @@
1
- import tempfile
2
- import warnings
3
- from pathlib import Path
4
- from typing import Union
5
-
6
- import numpy as np
7
- from torch import nn
8
-
9
- from TTS.utils.audio.numpy_transforms import save_wav
10
- from TTS.utils.manage import ModelManager
11
- from TTS.utils.synthesizer import Synthesizer
12
- from TTS.config import load_config
13
-
14
-
15
- class TTS(nn.Module):
16
- """TODO: Add voice conversion and Capacitron support."""
17
-
18
- def __init__(
19
- self,
20
- model_name: str = "",
21
- model_path: str = None,
22
- config_path: str = None,
23
- vocoder_path: str = None,
24
- vocoder_config_path: str = None,
25
- progress_bar: bool = True,
26
- gpu=False,
27
- ):
28
- """🐸TTS python interface that allows to load and use the released models.
29
-
30
- Example with a multi-speaker model:
31
- >>> from TTS.api import TTS
32
- >>> tts = TTS(TTS.list_models()[0])
33
- >>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
34
- >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
35
-
36
- Example with a single-speaker model:
37
- >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
38
- >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
39
-
40
- Example loading a model from a path:
41
- >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False)
42
- >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
43
-
44
- Example voice cloning with YourTTS in English, French and Portuguese:
45
- >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
46
- >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav")
47
- >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
48
- >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")
49
-
50
- Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html):
51
- >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True)
52
- >>> tts.tts_to_file("This is a test.", file_path="output.wav")
53
-
54
- Args:
55
- model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
56
- model_path (str, optional): Path to the model checkpoint. Defaults to None.
57
- config_path (str, optional): Path to the model config. Defaults to None.
58
- vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
59
- vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
60
- progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True.
61
- gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
62
- """
63
- super().__init__()
64
- self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
65
- self.config = load_config(config_path) if config_path else None
66
- self.synthesizer = None
67
- self.voice_converter = None
68
- self.model_name = ""
69
- if gpu:
70
- warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")
71
-
72
- if model_name is not None and len(model_name) > 0:
73
- if "tts_models" in model_name:
74
- self.load_tts_model_by_name(model_name, gpu)
75
- elif "voice_conversion_models" in model_name:
76
- self.load_vc_model_by_name(model_name, gpu)
77
- else:
78
- self.load_model_by_name(model_name, gpu)
79
-
80
- if model_path:
81
- self.load_tts_model_by_path(
82
- model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
83
- )
84
-
85
- @property
86
- def models(self):
87
- return self.manager.list_tts_models()
88
-
89
- @property
90
- def is_multi_speaker(self):
91
- if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
92
- return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
93
- return False
94
-
95
- @property
96
- def is_multi_lingual(self):
97
- # Not sure what sets this to None, but applied a fix to prevent crashing.
98
- if (
99
- isinstance(self.model_name, str)
100
- and "xtts" in self.model_name
101
- or self.config
102
- and ("xtts" in self.config.model or len(self.config.languages) > 1)
103
- ):
104
- return True
105
- if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
106
- return self.synthesizer.tts_model.language_manager.num_languages > 1
107
- return False
108
-
109
- @property
110
- def speakers(self):
111
- if not self.is_multi_speaker:
112
- return None
113
- return self.synthesizer.tts_model.speaker_manager.speaker_names
114
-
115
- @property
116
- def languages(self):
117
- if not self.is_multi_lingual:
118
- return None
119
- return self.synthesizer.tts_model.language_manager.language_names
120
-
121
- @staticmethod
122
- def get_models_file_path():
123
- return Path(__file__).parent / ".models.json"
124
-
125
- def list_models(self):
126
- return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
127
-
128
- def download_model_by_name(self, model_name: str):
129
- model_path, config_path, model_item = self.manager.download_model(model_name)
130
- if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
131
- # return model directory if there are multiple files
132
- # we assume that the model knows how to load itself
133
- return None, None, None, None, model_path
134
- if model_item.get("default_vocoder") is None:
135
- return model_path, config_path, None, None, None
136
- vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
137
- return model_path, config_path, vocoder_path, vocoder_config_path, None
138
-
139
- def load_model_by_name(self, model_name: str, gpu: bool = False):
140
- """Load one of the 🐸TTS models by name.
141
-
142
- Args:
143
- model_name (str): Model name to load. You can list models by ```tts.models```.
144
- gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
145
- """
146
- self.load_tts_model_by_name(model_name, gpu)
147
-
148
- def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
149
- """Load one of the voice conversion models by name.
150
-
151
- Args:
152
- model_name (str): Model name to load. You can list models by ```tts.models```.
153
- gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
154
- """
155
- self.model_name = model_name
156
- model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
157
- self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)
158
-
159
- def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
160
- """Load one of 🐸TTS models by name.
161
-
162
- Args:
163
- model_name (str): Model name to load. You can list models by ```tts.models```.
164
- gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
165
-
166
- TODO: Add tests
167
- """
168
- self.synthesizer = None
169
- self.model_name = model_name
170
-
171
- model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
172
- model_name
173
- )
174
-
175
- # init synthesizer
176
- # None values are fetch from the model
177
- self.synthesizer = Synthesizer(
178
- tts_checkpoint=model_path,
179
- tts_config_path=config_path,
180
- tts_speakers_file=None,
181
- tts_languages_file=None,
182
- vocoder_checkpoint=vocoder_path,
183
- vocoder_config=vocoder_config_path,
184
- encoder_checkpoint=None,
185
- encoder_config=None,
186
- model_dir=model_dir,
187
- use_cuda=gpu,
188
- )
189
-
190
- def load_tts_model_by_path(
191
- self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
192
- ):
193
- """Load a model from a path.
194
-
195
- Args:
196
- model_path (str): Path to the model checkpoint.
197
- config_path (str): Path to the model config.
198
- vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
199
- vocoder_config (str, optional): Path to the vocoder config. Defaults to None.
200
- gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
201
- """
202
-
203
- self.synthesizer = Synthesizer(
204
- tts_checkpoint=model_path,
205
- tts_config_path=config_path,
206
- tts_speakers_file=None,
207
- tts_languages_file=None,
208
- vocoder_checkpoint=vocoder_path,
209
- vocoder_config=vocoder_config,
210
- encoder_checkpoint=None,
211
- encoder_config=None,
212
- use_cuda=gpu,
213
- )
214
-
215
- def _check_arguments(
216
- self,
217
- speaker: str = None,
218
- language: str = None,
219
- speaker_wav: str = None,
220
- emotion: str = None,
221
- speed: float = None,
222
- **kwargs,
223
- ) -> None:
224
- """Check if the arguments are valid for the model."""
225
- # check for the coqui tts models
226
- if self.is_multi_speaker and (speaker is None and speaker_wav is None):
227
- raise ValueError("Model is multi-speaker but no `speaker` is provided.")
228
- if self.is_multi_lingual and language is None:
229
- raise ValueError("Model is multi-lingual but no `language` is provided.")
230
- if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
231
- raise ValueError("Model is not multi-speaker but `speaker` is provided.")
232
- if not self.is_multi_lingual and language is not None:
233
- raise ValueError("Model is not multi-lingual but `language` is provided.")
234
- if not emotion is None and not speed is None:
235
- raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.")
236
-
237
- def tts(
238
- self,
239
- text: str,
240
- speaker: str = None,
241
- language: str = None,
242
- speaker_wav: str = None,
243
- emotion: str = None,
244
- speed: float = None,
245
- split_sentences: bool = True,
246
- **kwargs,
247
- ):
248
- """Convert text to speech.
249
-
250
- Args:
251
- text (str):
252
- Input text to synthesize.
253
- speaker (str, optional):
254
- Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
255
- `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
256
- language (str): Language of the text. If None, the default language of the speaker is used. Language is only
257
- supported by `XTTS` model.
258
- speaker_wav (str, optional):
259
- Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
260
- Defaults to None.
261
- emotion (str, optional):
262
- Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
263
- speed (float, optional):
264
- Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
265
- Defaults to None.
266
- split_sentences (bool, optional):
267
- Split text into sentences, synthesize them separately and concatenate the file audio.
268
- Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
269
- applicable to the 🐸TTS models. Defaults to True.
270
- kwargs (dict, optional):
271
- Additional arguments for the model.
272
- """
273
- self._check_arguments(
274
- speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
275
- )
276
- wav = self.synthesizer.tts(
277
- text=text,
278
- speaker_name=speaker,
279
- language_name=language,
280
- speaker_wav=speaker_wav,
281
- reference_wav=None,
282
- style_wav=None,
283
- style_text=None,
284
- reference_speaker_name=None,
285
- split_sentences=split_sentences,
286
- **kwargs,
287
- )
288
- return wav
289
-
290
- def tts_to_file(
291
- self,
292
- text: str,
293
- speaker: str = None,
294
- language: str = None,
295
- speaker_wav: str = None,
296
- emotion: str = None,
297
- speed: float = 1.0,
298
- pipe_out=None,
299
- file_path: str = "output.wav",
300
- split_sentences: bool = True,
301
- **kwargs,
302
- ):
303
- """Convert text to speech.
304
-
305
- Args:
306
- text (str):
307
- Input text to synthesize.
308
- speaker (str, optional):
309
- Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
310
- `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
311
- language (str, optional):
312
- Language code for multi-lingual models. You can check whether loaded model is multi-lingual
313
- `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
314
- speaker_wav (str, optional):
315
- Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
316
- Defaults to None.
317
- emotion (str, optional):
318
- Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
319
- speed (float, optional):
320
- Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
321
- pipe_out (BytesIO, optional):
322
- Flag to stdout the generated TTS wav file for shell pipe.
323
- file_path (str, optional):
324
- Output file path. Defaults to "output.wav".
325
- split_sentences (bool, optional):
326
- Split text into sentences, synthesize them separately and concatenate the file audio.
327
- Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
328
- applicable to the 🐸TTS models. Defaults to True.
329
- kwargs (dict, optional):
330
- Additional arguments for the model.
331
- """
332
- self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
333
-
334
- wav = self.tts(
335
- text=text,
336
- speaker=speaker,
337
- language=language,
338
- speaker_wav=speaker_wav,
339
- split_sentences=split_sentences,
340
- **kwargs,
341
- )
342
- self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
343
- return file_path
344
-
345
- def voice_conversion(
346
- self,
347
- source_wav: str,
348
- target_wav: str,
349
- ):
350
- """Voice conversion with FreeVC. Convert source wav to target speaker.
351
-
352
- Args:``
353
- source_wav (str):
354
- Path to the source wav file.
355
- target_wav (str):`
356
- Path to the target wav file.
357
- """
358
- wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)
359
- return wav
360
-
361
- def voice_conversion_to_file(
362
- self,
363
- source_wav: str,
364
- target_wav: str,
365
- file_path: str = "output.wav",
366
- ):
367
- """Voice conversion with FreeVC. Convert source wav to target speaker.
368
-
369
- Args:
370
- source_wav (str):
371
- Path to the source wav file.
372
- target_wav (str):
373
- Path to the target wav file.
374
- file_path (str, optional):
375
- Output file path. Defaults to "output.wav".
376
- """
377
- wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav)
378
- save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
379
- return file_path
380
-
381
- def tts_with_vc(
382
- self,
383
- text: str,
384
- language: str = None,
385
- speaker_wav: str = None,
386
- speaker: str = None,
387
- split_sentences: bool = True,
388
- ):
389
- """Convert text to speech with voice conversion.
390
-
391
- It combines tts with voice conversion to fake voice cloning.
392
-
393
- - Convert text to speech with tts.
394
- - Convert the output wav to target speaker with voice conversion.
395
-
396
- Args:
397
- text (str):
398
- Input text to synthesize.
399
- language (str, optional):
400
- Language code for multi-lingual models. You can check whether loaded model is multi-lingual
401
- `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
402
- speaker_wav (str, optional):
403
- Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
404
- Defaults to None.
405
- speaker (str, optional):
406
- Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
407
- `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
408
- split_sentences (bool, optional):
409
- Split text into sentences, synthesize them separately and concatenate the file audio.
410
- Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
411
- applicable to the 🐸TTS models. Defaults to True.
412
- """
413
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
414
- # Lazy code... save it to a temp file to resample it while reading it for VC
415
- self.tts_to_file(
416
- text=text, speaker=speaker, language=language, file_path=fp.name, split_sentences=split_sentences
417
- )
418
- if self.voice_converter is None:
419
- self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
420
- wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
421
- return wav
422
-
423
- def tts_with_vc_to_file(
424
- self,
425
- text: str,
426
- language: str = None,
427
- speaker_wav: str = None,
428
- file_path: str = "output.wav",
429
- speaker: str = None,
430
- split_sentences: bool = True,
431
- ):
432
- """Convert text to speech with voice conversion and save to file.
433
-
434
- Check `tts_with_vc` for more details.
435
-
436
- Args:
437
- text (str):
438
- Input text to synthesize.
439
- language (str, optional):
440
- Language code for multi-lingual models. You can check whether loaded model is multi-lingual
441
- `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
442
- speaker_wav (str, optional):
443
- Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
444
- Defaults to None.
445
- file_path (str, optional):
446
- Output file path. Defaults to "output.wav".
447
- speaker (str, optional):
448
- Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
449
- `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
450
- split_sentences (bool, optional):
451
- Split the text into sentences, synthesize them separately, and concatenate the resulting audio.
452
- Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
453
- applicable to the 🐸TTS models. Defaults to True.
454
- """
455
- wav = self.tts_with_vc(
456
- text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences
457
- )
458
- save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
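The removed `voice_conversion*` and `tts_with_vc*` methods above are the high-level entry points for FreeVC-based conversion. A minimal sketch of how they were typically driven through `TTS.api.TTS`, assuming the upstream `TTS` package is installed and `source.wav`/`target.wav` are placeholder files:

```
# Sketch only: assumes the upstream coqui `TTS` package and local placeholder wavs.
from TTS.api import TTS

# Pure voice conversion: keep the speech in source.wav, re-voice it as target.wav.
vc = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24")
vc.voice_conversion_to_file(
    source_wav="source.wav",   # content to preserve
    target_wav="target.wav",   # reference voice to clone
    file_path="converted.wav",
)

# TTS followed by voice conversion, as tts_with_vc_to_file does internally.
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
tts.tts_with_vc_to_file(
    text="Hello from the converted voice.",
    speaker_wav="target.wav",
    file_path="cloned.wav",
)
```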
TTS/bin/__init__.py DELETED
File without changes
TTS/bin/collect_env_info.py DELETED
@@ -1,48 +0,0 @@
1
- """Get detailed info about the working environment."""
2
- import os
3
- import platform
4
- import sys
5
-
6
- import numpy
7
- import torch
8
-
9
- sys.path += [os.path.abspath(".."), os.path.abspath(".")]
10
- import json
11
-
12
- import TTS
13
-
14
-
15
- def system_info():
16
- return {
17
- "OS": platform.system(),
18
- "architecture": platform.architecture(),
19
- "version": platform.version(),
20
- "processor": platform.processor(),
21
- "python": platform.python_version(),
22
- }
23
-
24
-
25
- def cuda_info():
26
- return {
27
- "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
28
- "available": torch.cuda.is_available(),
29
- "version": torch.version.cuda,
30
- }
31
-
32
-
33
- def package_info():
34
- return {
35
- "numpy": numpy.__version__,
36
- "PyTorch_version": torch.__version__,
37
- "PyTorch_debug": torch.version.debug,
38
- "TTS": TTS.__version__,
39
- }
40
-
41
-
42
- def main():
43
- details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
44
- print(json.dumps(details, indent=4, sort_keys=True))
45
-
46
-
47
- if __name__ == "__main__":
48
- main()
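The script above just prints a JSON report of the OS, CUDA and package versions. A small sketch of calling its helpers programmatically, assuming an older checkout where `TTS/bin/collect_env_info.py` still exists:

```
# Sketch: gather the same environment report without going through the CLI.
from TTS.bin.collect_env_info import cuda_info, package_info, system_info

details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
print(details["CUDA"]["available"])  # True when torch can see a GPU
```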
TTS/bin/compute_attention_masks.py DELETED
@@ -1,165 +0,0 @@
1
- import argparse
2
- import importlib
3
- import os
4
- from argparse import RawTextHelpFormatter
5
-
6
- import numpy as np
7
- import torch
8
- from torch.utils.data import DataLoader
9
- from tqdm import tqdm
10
-
11
- from TTS.config import load_config
12
- from TTS.tts.datasets.TTSDataset import TTSDataset
13
- from TTS.tts.models import setup_model
14
- from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
15
- from TTS.utils.audio import AudioProcessor
16
- from TTS.utils.io import load_checkpoint
17
-
18
- if __name__ == "__main__":
19
- # pylint: disable=bad-option-value
20
- parser = argparse.ArgumentParser(
21
- description="""Extract attention masks from trained Tacotron/Tacotron2 models.
22
- These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
23
- """Each attention mask is written to the same path as the input wav file with ".npy" file extension.
24
- (e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n"""
25
- """
26
- Example run:
27
- CUDA_VISIBLE_DEVICES="0" python TTS/bin/compute_attention_masks.py
28
- --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
29
- --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
30
- --dataset_metafile metadata.csv
31
- --data_path /root/LJSpeech-1.1/
32
- --batch_size 32
33
- --dataset ljspeech
34
- --use_cuda True
35
- """,
36
- formatter_class=RawTextHelpFormatter,
37
- )
38
- parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
39
- parser.add_argument(
40
- "--config_path",
41
- type=str,
42
- required=True,
43
- help="Path to Tacotron/Tacotron2 config file.",
44
- )
45
- parser.add_argument(
46
- "--dataset",
47
- type=str,
48
- default="",
49
- required=True,
50
- help="Target dataset processor name from TTS.tts.dataset.preprocess.",
51
- )
52
-
53
- parser.add_argument(
54
- "--dataset_metafile",
55
- type=str,
56
- default="",
57
- required=True,
58
- help="Dataset metafile inclusing file paths with transcripts.",
59
- )
60
- parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
61
- parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")
62
-
63
- parser.add_argument(
64
- "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
65
- )
66
- args = parser.parse_args()
67
-
68
- C = load_config(args.config_path)
69
- ap = AudioProcessor(**C.audio)
70
-
71
- # if the vocabulary was passed, replace the default
72
- if "characters" in C.keys():
73
- symbols, phonemes = make_symbols(**C.characters)
74
-
75
- # load the model
76
- num_chars = len(phonemes) if C.use_phonemes else len(symbols)
77
- # TODO: handle multi-speaker
78
- model = setup_model(C)
79
- model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)
80
-
81
- # data loader
82
- preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
83
- preprocessor = getattr(preprocessor, args.dataset)
84
- meta_data = preprocessor(args.data_path, args.dataset_metafile)
85
- dataset = TTSDataset(
86
- model.decoder.r,
87
- C.text_cleaner,
88
- compute_linear_spec=False,
89
- ap=ap,
90
- meta_data=meta_data,
91
- characters=C.characters if "characters" in C.keys() else None,
92
- add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
93
- use_phonemes=C.use_phonemes,
94
- phoneme_cache_path=C.phoneme_cache_path,
95
- phoneme_language=C.phoneme_language,
96
- enable_eos_bos=C.enable_eos_bos_chars,
97
- )
98
-
99
- dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
100
- loader = DataLoader(
101
- dataset,
102
- batch_size=args.batch_size,
103
- num_workers=4,
104
- collate_fn=dataset.collate_fn,
105
- shuffle=False,
106
- drop_last=False,
107
- )
108
-
109
- # compute attentions
110
- file_paths = []
111
- with torch.no_grad():
112
- for data in tqdm(loader):
113
- # setup input data
114
- text_input = data[0]
115
- text_lengths = data[1]
116
- linear_input = data[3]
117
- mel_input = data[4]
118
- mel_lengths = data[5]
119
- stop_targets = data[6]
120
- item_idxs = data[7]
121
-
122
- # dispatch data to GPU
123
- if args.use_cuda:
124
- text_input = text_input.cuda()
125
- text_lengths = text_lengths.cuda()
126
- mel_input = mel_input.cuda()
127
- mel_lengths = mel_lengths.cuda()
128
-
129
- model_outputs = model.forward(text_input, text_lengths, mel_input)
130
-
131
- alignments = model_outputs["alignments"].detach()
132
- for idx, alignment in enumerate(alignments):
133
- item_idx = item_idxs[idx]
134
- # interpolate if r > 1
135
- alignment = (
136
- torch.nn.functional.interpolate(
137
- alignment.transpose(0, 1).unsqueeze(0),
138
- size=None,
139
- scale_factor=model.decoder.r,
140
- mode="nearest",
141
- align_corners=None,
142
- recompute_scale_factor=None,
143
- )
144
- .squeeze(0)
145
- .transpose(0, 1)
146
- )
147
- # remove paddings
148
- alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
149
- # set file paths
150
- wav_file_name = os.path.basename(item_idx)
151
- align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
152
- file_path = item_idx.replace(wav_file_name, align_file_name)
153
- # save output
154
- wav_file_abs_path = os.path.abspath(item_idx)
155
- file_abs_path = os.path.abspath(file_path)
156
- file_paths.append([wav_file_abs_path, file_abs_path])
157
- np.save(file_path, alignment)
158
-
159
- # output metafile
160
- metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")
161
-
162
- with open(metafile, "w", encoding="utf-8") as f:
163
- for p in file_paths:
164
- f.write(f"{p[0]}|{p[1]}\n")
165
- print(f" >> Metafile created: {metafile}")
TTS/bin/compute_embeddings.py DELETED
@@ -1,197 +0,0 @@
1
- import argparse
2
- import os
3
- from argparse import RawTextHelpFormatter
4
-
5
- import torch
6
- from tqdm import tqdm
7
-
8
- from TTS.config import load_config
9
- from TTS.config.shared_configs import BaseDatasetConfig
10
- from TTS.tts.datasets import load_tts_samples
11
- from TTS.tts.utils.managers import save_file
12
- from TTS.tts.utils.speakers import SpeakerManager
13
-
14
-
15
- def compute_embeddings(
16
- model_path,
17
- config_path,
18
- output_path,
19
- old_speakers_file=None,
20
- old_append=False,
21
- config_dataset_path=None,
22
- formatter_name=None,
23
- dataset_name=None,
24
- dataset_path=None,
25
- meta_file_train=None,
26
- meta_file_val=None,
27
- disable_cuda=False,
28
- no_eval=False,
29
- ):
30
- use_cuda = torch.cuda.is_available() and not disable_cuda
31
-
32
- if config_dataset_path is not None:
33
- c_dataset = load_config(config_dataset_path)
34
- meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not no_eval)
35
- else:
36
- c_dataset = BaseDatasetConfig()
37
- c_dataset.formatter = formatter_name
38
- c_dataset.dataset_name = dataset_name
39
- c_dataset.path = dataset_path
40
- if meta_file_train is not None:
41
- c_dataset.meta_file_train = meta_file_train
42
- if meta_file_val is not None:
43
- c_dataset.meta_file_val = meta_file_val
44
- meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval)
45
-
46
- if meta_data_eval is None:
47
- samples = meta_data_train
48
- else:
49
- samples = meta_data_train + meta_data_eval
50
-
51
- encoder_manager = SpeakerManager(
52
- encoder_model_path=model_path,
53
- encoder_config_path=config_path,
54
- d_vectors_file_path=old_speakers_file,
55
- use_cuda=use_cuda,
56
- )
57
-
58
- class_name_key = encoder_manager.encoder_config.class_name_key
59
-
60
- # compute speaker embeddings
61
- if old_speakers_file is not None and old_append:
62
- speaker_mapping = encoder_manager.embeddings
63
- else:
64
- speaker_mapping = {}
65
-
66
- for fields in tqdm(samples):
67
- class_name = fields[class_name_key]
68
- audio_file = fields["audio_file"]
69
- embedding_key = fields["audio_unique_name"]
70
-
71
- # Only update the speaker name when the embedding is already in the old file.
72
- if embedding_key in speaker_mapping:
73
- speaker_mapping[embedding_key]["name"] = class_name
74
- continue
75
-
76
- if old_speakers_file is not None and embedding_key in encoder_manager.clip_ids:
77
- # get the embedding from the old file
78
- embedd = encoder_manager.get_embedding_by_clip(embedding_key)
79
- else:
80
- # extract the embedding
81
- embedd = encoder_manager.compute_embedding_from_clip(audio_file)
82
-
83
- # create speaker_mapping if target dataset is defined
84
- speaker_mapping[embedding_key] = {}
85
- speaker_mapping[embedding_key]["name"] = class_name
86
- speaker_mapping[embedding_key]["embedding"] = embedd
87
-
88
- if speaker_mapping:
89
- # save speaker_mapping if target dataset is defined
90
- if os.path.isdir(output_path):
91
- mapping_file_path = os.path.join(output_path, "speakers.pth")
92
- else:
93
- mapping_file_path = output_path
94
-
95
- if os.path.dirname(mapping_file_path) != "":
96
- os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
97
-
98
- save_file(speaker_mapping, mapping_file_path)
99
- print("Speaker embeddings saved at:", mapping_file_path)
100
-
101
-
102
- if __name__ == "__main__":
103
- parser = argparse.ArgumentParser(
104
- description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
105
- """
106
- Example runs:
107
- python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
108
-
109
- python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
110
- """,
111
- formatter_class=RawTextHelpFormatter,
112
- )
113
- parser.add_argument(
114
- "--model_path",
115
- type=str,
116
- help="Path to model checkpoint file. It defaults to the released speaker encoder.",
117
- default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
118
- )
119
- parser.add_argument(
120
- "--config_path",
121
- type=str,
122
- help="Path to model config file. It defaults to the released speaker encoder config.",
123
- default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
124
- )
125
- parser.add_argument(
126
- "--config_dataset_path",
127
- type=str,
128
- help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
129
- default=None,
130
- )
131
- parser.add_argument(
132
- "--output_path",
133
- type=str,
134
- help="Path for output `pth` or `json` file.",
135
- default="speakers.pth",
136
- )
137
- parser.add_argument(
138
- "--old_file",
139
- type=str,
140
- help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.",
141
- default=None,
142
- )
143
- parser.add_argument(
144
- "--old_append",
145
- help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False",
146
- default=False,
147
- action="store_true",
148
- )
149
- parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
150
- parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
151
- parser.add_argument(
152
- "--formatter_name",
153
- type=str,
154
- help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
155
- default=None,
156
- )
157
- parser.add_argument(
158
- "--dataset_name",
159
- type=str,
160
- help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
161
- default=None,
162
- )
163
- parser.add_argument(
164
- "--dataset_path",
165
- type=str,
166
- help="Path to the dataset. You either need to provide this or `config_dataset_path`",
167
- default=None,
168
- )
169
- parser.add_argument(
170
- "--meta_file_train",
171
- type=str,
172
- help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
173
- default=None,
174
- )
175
- parser.add_argument(
176
- "--meta_file_val",
177
- type=str,
178
- help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
179
- default=None,
180
- )
181
- args = parser.parse_args()
182
-
183
- compute_embeddings(
184
- args.model_path,
185
- args.config_path,
186
- args.output_path,
187
- old_speakers_file=args.old_file,
188
- old_append=args.old_append,
189
- config_dataset_path=args.config_dataset_path,
190
- formatter_name=args.formatter_name,
191
- dataset_name=args.dataset_name,
192
- dataset_path=args.dataset_path,
193
- meta_file_train=args.meta_file_train,
194
- meta_file_val=args.meta_file_val,
195
- disable_cuda=args.disable_cuda,
196
- no_eval=args.no_eval,
197
- )
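The resulting file maps each clip's `audio_unique_name` to its speaker `name` and `embedding`. A sketch of inspecting `speakers.pth`, under the assumption that `save_file` serializes `.pth` paths with `torch.save` (the usual behaviour in this codebase):

```
# Sketch: load and inspect the embeddings file produced by the script above.
import torch

speaker_mapping = torch.load("speakers.pth")
clip_id, entry = next(iter(speaker_mapping.items()))
print(clip_id, entry["name"], len(entry["embedding"]))  # clip key, speaker, embedding size
```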
TTS/bin/compute_statistics.py DELETED
@@ -1,96 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- import argparse
5
- import glob
6
- import os
7
-
8
- import numpy as np
9
- from tqdm import tqdm
10
-
11
- # from TTS.utils.io import load_config
12
- from TTS.config import load_config
13
- from TTS.tts.datasets import load_tts_samples
14
- from TTS.utils.audio import AudioProcessor
15
-
16
-
17
- def main():
18
- """Run preprocessing process."""
19
- parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
20
- parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
21
- parser.add_argument("out_path", type=str, help="save path (directory and filename).")
22
- parser.add_argument(
23
- "--data_path",
24
- type=str,
25
- required=False,
26
- help="folder including the target set of wavs overriding dataset config.",
27
- )
28
- args, overrides = parser.parse_known_args()
29
-
30
- CONFIG = load_config(args.config_path)
31
- CONFIG.parse_known_args(overrides, relaxed_parser=True)
32
-
33
- # load config
34
- CONFIG.audio.signal_norm = False # do not apply earlier normalization
35
- CONFIG.audio.stats_path = None # discard pre-defined stats
36
-
37
- # load audio processor
38
- ap = AudioProcessor(**CONFIG.audio.to_dict())
39
-
40
- # load the meta data of target dataset
41
- if args.data_path:
42
- dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
43
- else:
44
- dataset_items = load_tts_samples(CONFIG.datasets)[0] # take only train data
45
- print(f" > There are {len(dataset_items)} files.")
46
-
47
- mel_sum = 0
48
- mel_square_sum = 0
49
- linear_sum = 0
50
- linear_square_sum = 0
51
- N = 0
52
- for item in tqdm(dataset_items):
53
- # compute features
54
- wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
55
- linear = ap.spectrogram(wav)
56
- mel = ap.melspectrogram(wav)
57
-
58
- # compute stats
59
- N += mel.shape[1]
60
- mel_sum += mel.sum(1)
61
- linear_sum += linear.sum(1)
62
- mel_square_sum += (mel**2).sum(axis=1)
63
- linear_square_sum += (linear**2).sum(axis=1)
64
-
65
- mel_mean = mel_sum / N
66
- mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
67
- linear_mean = linear_sum / N
68
- linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)
69
-
70
- output_file_path = args.out_path
71
- stats = {}
72
- stats["mel_mean"] = mel_mean
73
- stats["mel_std"] = mel_scale
74
- stats["linear_mean"] = linear_mean
75
- stats["linear_std"] = linear_scale
76
-
77
- print(f" > Avg mel spec mean: {mel_mean.mean()}")
78
- print(f" > Avg mel spec scale: {mel_scale.mean()}")
79
- print(f" > Avg linear spec mean: {linear_mean.mean()}")
80
- print(f" > Avg linear spec scale: {linear_scale.mean()}")
81
-
82
- # set default config values for mean-var scaling
83
- CONFIG.audio.stats_path = output_file_path
84
- CONFIG.audio.signal_norm = True
85
- # remove redundant values
86
- del CONFIG.audio.max_norm
87
- del CONFIG.audio.min_level_db
88
- del CONFIG.audio.symmetric_norm
89
- del CONFIG.audio.clip_norm
90
- stats["audio_config"] = CONFIG.audio.to_dict()
91
- np.save(output_file_path, stats, allow_pickle=True)
92
- print(f" > stats saved to {output_file_path}")
93
-
94
-
95
- if __name__ == "__main__":
96
- main()
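The saved `.npy` holds per-bin means and standard deviations for the mel and linear spectrograms plus the audio config. A sketch of loading it and applying the mean-variance scaling it is meant for (the stats path is a placeholder; spectrograms are `[n_bins, T]` as produced by `AudioProcessor`):

```
# Sketch: mean-variance scale a mel spectrogram with the stats computed above.
import numpy as np

stats = np.load("scale_stats.npy", allow_pickle=True).item()
mel_mean, mel_std = stats["mel_mean"], stats["mel_std"]

def normalize_mel(mel):
    # broadcast the per-bin statistics across the time axis
    return (mel - mel_mean[:, None]) / mel_std[:, None]

def denormalize_mel(mel_norm):
    return mel_norm * mel_std[:, None] + mel_mean[:, None]
```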
TTS/bin/eval_encoder.py DELETED
@@ -1,88 +0,0 @@
1
- import argparse
2
- from argparse import RawTextHelpFormatter
3
-
4
- import torch
5
- from tqdm import tqdm
6
-
7
- from TTS.config import load_config
8
- from TTS.tts.datasets import load_tts_samples
9
- from TTS.tts.utils.speakers import SpeakerManager
10
-
11
-
12
- def compute_encoder_accuracy(dataset_items, encoder_manager):
13
- class_name_key = encoder_manager.encoder_config.class_name_key
14
- map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)
15
-
16
- class_acc_dict = {}
17
-
18
- # compute embeddings for all wav_files
19
- for item in tqdm(dataset_items):
20
- class_name = item[class_name_key]
21
- wav_file = item["audio_file"]
22
-
23
- # extract the embedding
24
- embedd = encoder_manager.compute_embedding_from_clip(wav_file)
25
- if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
26
- embedding = torch.FloatTensor(embedd).unsqueeze(0)
27
- if encoder_manager.use_cuda:
28
- embedding = embedding.cuda()
29
-
30
- class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
31
- predicted_label = map_classid_to_classname[str(class_id)]
32
- else:
33
- predicted_label = None
34
-
35
- if class_name is not None and predicted_label is not None:
36
- is_equal = int(class_name == predicted_label)
37
- if class_name not in class_acc_dict:
38
- class_acc_dict[class_name] = [is_equal]
39
- else:
40
- class_acc_dict[class_name].append(is_equal)
41
- else:
42
- raise RuntimeError("Error: class_name or/and predicted_label are None")
43
-
44
- acc_avg = 0
45
- for key, values in class_acc_dict.items():
46
- acc = sum(values) / len(values)
47
- print("Class", key, "Accuracy:", acc)
48
- acc_avg += acc
49
-
50
- print("Average Accuracy:", acc_avg / len(class_acc_dict))
51
-
52
-
53
- if __name__ == "__main__":
54
- parser = argparse.ArgumentParser(
55
- description="""Compute the accuracy of the encoder.\n\n"""
56
- """
57
- Example runs:
58
- python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
59
- """,
60
- formatter_class=RawTextHelpFormatter,
61
- )
62
- parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
63
- parser.add_argument(
64
- "config_path",
65
- type=str,
66
- help="Path to model config file.",
67
- )
68
-
69
- parser.add_argument(
70
- "config_dataset_path",
71
- type=str,
72
- help="Path to dataset config file.",
73
- )
74
- parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
75
- parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
76
-
77
- args = parser.parse_args()
78
-
79
- c_dataset = load_config(args.config_dataset_path)
80
-
81
- meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
82
- items = meta_data_train + meta_data_eval
83
-
84
- enc_manager = SpeakerManager(
85
- encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
86
- )
87
-
88
- compute_encoder_accuracy(items, enc_manager)
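Note that the final figure is a macro average: accuracy is computed per class first and then averaged over classes, not over clips. A toy sketch of that aggregation:

```
# Sketch: the same per-class averaging used by compute_encoder_accuracy, on toy data.
class_acc_dict = {"spk_a": [1, 1, 0], "spk_b": [1, 0]}
per_class = {name: sum(v) / len(v) for name, v in class_acc_dict.items()}
print(per_class)                                 # {'spk_a': 0.666..., 'spk_b': 0.5}
print(sum(per_class.values()) / len(per_class))  # macro average, roughly 0.583
```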
TTS/bin/extract_tts_spectrograms.py DELETED
@@ -1,287 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Extract Mel spectrograms with teacher forcing."""
3
-
4
- import argparse
5
- import os
6
-
7
- import numpy as np
8
- import torch
9
- from torch.utils.data import DataLoader
10
- from tqdm import tqdm
11
-
12
- from TTS.config import load_config
13
- from TTS.tts.datasets import TTSDataset, load_tts_samples
14
- from TTS.tts.models import setup_model
15
- from TTS.tts.utils.speakers import SpeakerManager
16
- from TTS.tts.utils.text.tokenizer import TTSTokenizer
17
- from TTS.utils.audio import AudioProcessor
18
- from TTS.utils.audio.numpy_transforms import quantize
19
- from TTS.utils.generic_utils import count_parameters
20
-
21
- use_cuda = torch.cuda.is_available()
22
-
23
-
24
- def setup_loader(ap, r, verbose=False):
25
- tokenizer, _ = TTSTokenizer.init_from_config(c)
26
- dataset = TTSDataset(
27
- outputs_per_step=r,
28
- compute_linear_spec=False,
29
- samples=meta_data,
30
- tokenizer=tokenizer,
31
- ap=ap,
32
- batch_group_size=0,
33
- min_text_len=c.min_text_len,
34
- max_text_len=c.max_text_len,
35
- min_audio_len=c.min_audio_len,
36
- max_audio_len=c.max_audio_len,
37
- phoneme_cache_path=c.phoneme_cache_path,
38
- precompute_num_workers=0,
39
- use_noise_augment=False,
40
- verbose=verbose,
41
- speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
42
- d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
43
- )
44
-
45
- if c.use_phonemes and c.compute_input_seq_cache:
46
- # precompute phonemes to have a better estimate of sequence lengths.
47
- dataset.compute_input_seq(c.num_loader_workers)
48
- dataset.preprocess_samples()
49
-
50
- loader = DataLoader(
51
- dataset,
52
- batch_size=c.batch_size,
53
- shuffle=False,
54
- collate_fn=dataset.collate_fn,
55
- drop_last=False,
56
- sampler=None,
57
- num_workers=c.num_loader_workers,
58
- pin_memory=False,
59
- )
60
- return loader
61
-
62
-
63
- def set_filename(wav_path, out_path):
64
- wav_file = os.path.basename(wav_path)
65
- file_name = wav_file.split(".")[0]
66
- os.makedirs(os.path.join(out_path, "quant"), exist_ok=True)
67
- os.makedirs(os.path.join(out_path, "mel"), exist_ok=True)
68
- os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True)
69
- os.makedirs(os.path.join(out_path, "wav"), exist_ok=True)
70
- wavq_path = os.path.join(out_path, "quant", file_name)
71
- mel_path = os.path.join(out_path, "mel", file_name)
72
- wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
73
- wav_path = os.path.join(out_path, "wav", file_name + ".wav")
74
- return file_name, wavq_path, mel_path, wav_gl_path, wav_path
75
-
76
-
77
- def format_data(data):
78
- # setup input data
79
- text_input = data["token_id"]
80
- text_lengths = data["token_id_lengths"]
81
- mel_input = data["mel"]
82
- mel_lengths = data["mel_lengths"]
83
- item_idx = data["item_idxs"]
84
- d_vectors = data["d_vectors"]
85
- speaker_ids = data["speaker_ids"]
86
- attn_mask = data["attns"]
87
- avg_text_length = torch.mean(text_lengths.float())
88
- avg_spec_length = torch.mean(mel_lengths.float())
89
-
90
- # dispatch data to GPU
91
- if use_cuda:
92
- text_input = text_input.cuda(non_blocking=True)
93
- text_lengths = text_lengths.cuda(non_blocking=True)
94
- mel_input = mel_input.cuda(non_blocking=True)
95
- mel_lengths = mel_lengths.cuda(non_blocking=True)
96
- if speaker_ids is not None:
97
- speaker_ids = speaker_ids.cuda(non_blocking=True)
98
- if d_vectors is not None:
99
- d_vectors = d_vectors.cuda(non_blocking=True)
100
- if attn_mask is not None:
101
- attn_mask = attn_mask.cuda(non_blocking=True)
102
- return (
103
- text_input,
104
- text_lengths,
105
- mel_input,
106
- mel_lengths,
107
- speaker_ids,
108
- d_vectors,
109
- avg_text_length,
110
- avg_spec_length,
111
- attn_mask,
112
- item_idx,
113
- )
114
-
115
-
116
- @torch.no_grad()
117
- def inference(
118
- model_name,
119
- model,
120
- ap,
121
- text_input,
122
- text_lengths,
123
- mel_input,
124
- mel_lengths,
125
- speaker_ids=None,
126
- d_vectors=None,
127
- ):
128
- if model_name == "glow_tts":
129
- speaker_c = None
130
- if speaker_ids is not None:
131
- speaker_c = speaker_ids
132
- elif d_vectors is not None:
133
- speaker_c = d_vectors
134
- outputs = model.inference_with_MAS(
135
- text_input,
136
- text_lengths,
137
- mel_input,
138
- mel_lengths,
139
- aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
140
- )
141
- model_output = outputs["model_outputs"]
142
- model_output = model_output.detach().cpu().numpy()
143
-
144
- elif "tacotron" in model_name:
145
- aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
146
- outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
147
- postnet_outputs = outputs["model_outputs"]
148
- # normalize tacotron output
149
- if model_name == "tacotron":
150
- mel_specs = []
151
- postnet_outputs = postnet_outputs.data.cpu().numpy()
152
- for b in range(postnet_outputs.shape[0]):
153
- postnet_output = postnet_outputs[b]
154
- mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T))
155
- model_output = torch.stack(mel_specs).cpu().numpy()
156
-
157
- elif model_name == "tacotron2":
158
- model_output = postnet_outputs.detach().cpu().numpy()
159
- return model_output
160
-
161
-
162
- def extract_spectrograms(
163
- data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
164
- ):
165
- model.eval()
166
- export_metadata = []
167
- for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
168
- # format data
169
- (
170
- text_input,
171
- text_lengths,
172
- mel_input,
173
- mel_lengths,
174
- speaker_ids,
175
- d_vectors,
176
- _,
177
- _,
178
- _,
179
- item_idx,
180
- ) = format_data(data)
181
-
182
- model_output = inference(
183
- c.model.lower(),
184
- model,
185
- ap,
186
- text_input,
187
- text_lengths,
188
- mel_input,
189
- mel_lengths,
190
- speaker_ids,
191
- d_vectors,
192
- )
193
-
194
- for idx in range(text_input.shape[0]):
195
- wav_file_path = item_idx[idx]
196
- wav = ap.load_wav(wav_file_path)
197
- _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)
198
-
199
- # quantize and save wav
200
- if quantize_bits > 0:
201
- wavq = quantize(wav, quantize_bits)
202
- np.save(wavq_path, wavq)
203
-
204
- # save TTS mel
205
- mel = model_output[idx]
206
- mel_length = mel_lengths[idx]
207
- mel = mel[:mel_length, :].T
208
- np.save(mel_path, mel)
209
-
210
- export_metadata.append([wav_file_path, mel_path])
211
- if save_audio:
212
- ap.save_wav(wav, wav_path)
213
-
214
- if debug:
215
- print("Audio for debug saved at:", wav_gl_path)
216
- wav = ap.inv_melspectrogram(mel)
217
- ap.save_wav(wav, wav_gl_path)
218
-
219
- with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f:
220
- for data in export_metadata:
221
- f.write(f"{data[0]}|{data[1]+'.npy'}\n")
222
-
223
-
224
- def main(args): # pylint: disable=redefined-outer-name
225
- # pylint: disable=global-variable-undefined
226
- global meta_data, speaker_manager
227
-
228
- # Audio processor
229
- ap = AudioProcessor(**c.audio)
230
-
231
- # load data instances
232
- meta_data_train, meta_data_eval = load_tts_samples(
233
- c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
234
- )
235
-
236
- # use eval and training partitions
237
- meta_data = meta_data_train + meta_data_eval
238
-
239
- # init speaker manager
240
- if c.use_speaker_embedding:
241
- speaker_manager = SpeakerManager(data_items=meta_data)
242
- elif c.use_d_vector_file:
243
- speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
244
- else:
245
- speaker_manager = None
246
-
247
- # setup model
248
- model = setup_model(c)
249
-
250
- # restore model
251
- model.load_checkpoint(c, args.checkpoint_path, eval=True)
252
-
253
- if use_cuda:
254
- model.cuda()
255
-
256
- num_params = count_parameters(model)
257
- print("\n > Model has {} parameters".format(num_params), flush=True)
258
- # set r
259
- r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
260
- own_loader = setup_loader(ap, r, verbose=True)
261
-
262
- extract_spectrograms(
263
- own_loader,
264
- model,
265
- ap,
266
- args.output_path,
267
- quantize_bits=args.quantize_bits,
268
- save_audio=args.save_audio,
269
- debug=args.debug,
270
- metada_name="metada.txt",
271
- )
272
-
273
-
274
- if __name__ == "__main__":
275
- parser = argparse.ArgumentParser()
276
- parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
277
- parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
278
- parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
279
- parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
280
- parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
281
- parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
282
- parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
283
- args = parser.parse_args()
284
-
285
- c = load_config(args.config_path)
286
- c.audio.trim_silence = False
287
- main(args)
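The extractor stores ground-truth-aligned mels under `<output_path>/mel/` and lists `wav|mel.npy` pairs in `metada.txt` (the spelling used by the script). A sketch of iterating over those pairs, e.g. to feed a vocoder training set (paths are placeholders):

```
# Sketch: load the teacher-forced mels listed in the metafile written above.
import numpy as np

with open("output/metada.txt", encoding="utf-8") as f:
    pairs = [line.strip().split("|") for line in f if line.strip()]

for wav_path, mel_path in pairs[:3]:
    mel = np.load(mel_path)  # [n_mels, mel_frames], already trimmed to length
    print(wav_path, mel.shape)
```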
TTS/bin/find_unique_chars.py DELETED
@@ -1,45 +0,0 @@
1
- """Find all the unique characters in a dataset"""
2
- import argparse
3
- from argparse import RawTextHelpFormatter
4
-
5
- from TTS.config import load_config
6
- from TTS.tts.datasets import load_tts_samples
7
-
8
-
9
- def main():
10
- # pylint: disable=bad-option-value
11
- parser = argparse.ArgumentParser(
12
- description="""Find all the unique characters or phonemes in a dataset.\n\n"""
13
- """
14
- Example runs:
15
-
16
- python TTS/bin/find_unique_chars.py --config_path config.json
17
- """,
18
- formatter_class=RawTextHelpFormatter,
19
- )
20
- parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
21
- args = parser.parse_args()
22
-
23
- c = load_config(args.config_path)
24
-
25
- # load all datasets
26
- train_items, eval_items = load_tts_samples(
27
- c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
28
- )
29
-
30
- items = train_items + eval_items
31
-
32
- texts = "".join(item["text"] for item in items)
33
- chars = set(texts)
34
- lower_chars = filter(lambda c: c.islower(), chars)
35
- chars_force_lower = [c.lower() for c in chars]
36
- chars_force_lower = set(chars_force_lower)
37
-
38
- print(f" > Number of unique characters: {len(chars)}")
39
- print(f" > Unique characters: {''.join(sorted(chars))}")
40
- print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
41
- print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
42
-
43
-
44
- if __name__ == "__main__":
45
- main()
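The character inventory is just the set union over all transcripts; a toy sketch of the same computation:

```
# Sketch: the character inventory computed by the script above, on toy samples.
items = [{"text": "Bonjour."}, {"text": "Hello!"}]
chars = set("".join(item["text"] for item in items))
print(len(chars), "".join(sorted(chars)))
```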
TTS/bin/find_unique_phonemes.py DELETED
@@ -1,74 +0,0 @@
1
- """Find all the unique characters in a dataset"""
2
- import argparse
3
- import multiprocessing
4
- from argparse import RawTextHelpFormatter
5
-
6
- from tqdm.contrib.concurrent import process_map
7
-
8
- from TTS.config import load_config
9
- from TTS.tts.datasets import load_tts_samples
10
- from TTS.tts.utils.text.phonemizers import Gruut
11
-
12
-
13
- def compute_phonemes(item):
14
- text = item["text"]
15
- ph = phonemizer.phonemize(text).replace("|", "")
16
- return set(list(ph))
17
-
18
-
19
- def main():
20
- # pylint: disable=W0601
21
- global c, phonemizer
22
- # pylint: disable=bad-option-value
23
- parser = argparse.ArgumentParser(
24
- description="""Find all the unique characters or phonemes in a dataset.\n\n"""
25
- """
26
- Example runs:
27
-
28
- python TTS/bin/find_unique_phonemes.py --config_path config.json
29
- """,
30
- formatter_class=RawTextHelpFormatter,
31
- )
32
- parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
33
- args = parser.parse_args()
34
-
35
- c = load_config(args.config_path)
36
-
37
- # load all datasets
38
- train_items, eval_items = load_tts_samples(
39
- c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
40
- )
41
- items = train_items + eval_items
42
- print("Num items:", len(items))
43
-
44
- language_list = [item["language"] for item in items]
45
- is_lang_def = all(language_list)
46
-
47
- if not c.phoneme_language or not is_lang_def:
48
- raise ValueError("Phoneme language must be defined in config.")
49
-
50
- if not language_list.count(language_list[0]) == len(language_list):
51
- raise ValueError(
52
- "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
53
- )
54
-
55
- phonemizer = Gruut(language=language_list[0], keep_puncs=True)
56
-
57
- phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
58
- phones = []
59
- for ph in phonemes:
60
- phones.extend(ph)
61
-
62
- phones = set(phones)
63
- lower_phones = filter(lambda c: c.islower(), phones)
64
- phones_force_lower = [c.lower() for c in phones]
65
- phones_force_lower = set(phones_force_lower)
66
-
67
- print(f" > Number of unique phonemes: {len(phones)}")
68
- print(f" > Unique phonemes: {''.join(sorted(phones))}")
69
- print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
70
- print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
71
-
72
-
73
- if __name__ == "__main__":
74
- main()
TTS/bin/remove_silence_using_vad.py DELETED
@@ -1,124 +0,0 @@
1
- import argparse
2
- import glob
3
- import multiprocessing
4
- import os
5
- import pathlib
6
-
7
- import torch
8
- from tqdm import tqdm
9
-
10
- from TTS.utils.vad import get_vad_model_and_utils, remove_silence
11
-
12
- torch.set_num_threads(1)
13
-
14
-
15
- def adjust_path_and_remove_silence(audio_path):
16
- output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
17
- # ignore if the file exists
18
- if os.path.exists(output_path) and not args.force:
19
- return output_path, False
20
-
21
- # create all directory structure
22
- pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
23
- # remove the silence and save the audio
24
- output_path, is_speech = remove_silence(
25
- model_and_utils,
26
- audio_path,
27
- output_path,
28
- trim_just_beginning_and_end=args.trim_just_beginning_and_end,
29
- use_cuda=args.use_cuda,
30
- )
31
- return output_path, is_speech
32
-
33
-
34
- def preprocess_audios():
35
- files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
36
- print("> Number of files: ", len(files))
37
- if not args.force:
38
- print("> Ignoring files that already exist in the output idrectory.")
39
-
40
- if args.trim_just_beginning_and_end:
41
- print("> Trimming just the beginning and the end with nonspeech parts.")
42
- else:
43
- print("> Trimming all nonspeech parts.")
44
-
45
- filtered_files = []
46
- if files:
47
- # create threads
48
- # num_threads = multiprocessing.cpu_count()
49
- # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
50
-
51
- if args.num_processes > 1:
52
- with multiprocessing.Pool(processes=args.num_processes) as pool:
53
- results = list(
54
- tqdm(
55
- pool.imap_unordered(adjust_path_and_remove_silence, files),
56
- total=len(files),
57
- desc="Processing audio files",
58
- )
59
- )
60
- for output_path, is_speech in results:
61
- if not is_speech:
62
- filtered_files.append(output_path)
63
- else:
64
- for f in tqdm(files):
65
- output_path, is_speech = adjust_path_and_remove_silence(f)
66
- if not is_speech:
67
- filtered_files.append(output_path)
68
-
69
- # write files that do not have speech
70
- with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
71
- for file in filtered_files:
72
- f.write(str(file) + "\n")
73
- else:
74
- print("> No files Found !")
75
-
76
-
77
- if __name__ == "__main__":
78
- parser = argparse.ArgumentParser(
79
- description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
80
- )
81
- parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
82
- parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
83
- parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
84
- parser.add_argument(
85
- "-g",
86
- "--glob",
87
- type=str,
88
- default="**/*.wav",
89
- help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav",
90
- )
91
- parser.add_argument(
92
- "-t",
93
- "--trim_just_beginning_and_end",
94
- type=bool,
95
- default=True,
96
- help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
97
- )
98
- parser.add_argument(
99
- "-c",
100
- "--use_cuda",
101
- type=bool,
102
- default=False,
103
- help="If True use cuda",
104
- )
105
- parser.add_argument(
106
- "--use_onnx",
107
- type=bool,
108
- default=False,
109
- help="If True use onnx",
110
- )
111
- parser.add_argument(
112
- "--num_processes",
113
- type=int,
114
- default=1,
115
- help="Number of processes to use",
116
- )
117
- args = parser.parse_args()
118
-
119
- if args.output_dir == "":
120
- args.output_dir = args.input_dir
121
-
122
- # load the model and utils
123
- model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
124
- preprocess_audios()
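Besides the CLI shown in the argparse description, the two VAD helpers can be called directly for a single file. A sketch, assuming an older checkout where `TTS.utils.vad` still exists and using placeholder paths:

```
# Sketch: trim leading/trailing silence from one file with the same helpers.
from TTS.utils.vad import get_vad_model_and_utils, remove_silence

model_and_utils = get_vad_model_and_utils(use_cuda=False, use_onnx=False)
out_path, is_speech = remove_silence(
    model_and_utils,
    "in.wav",
    "out.wav",
    trim_just_beginning_and_end=True,
    use_cuda=False,
)
print(out_path, "speech found" if is_speech else "no speech detected")
```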
TTS/bin/resample.py DELETED
@@ -1,90 +0,0 @@
1
- import argparse
2
- import glob
3
- import os
4
- from argparse import RawTextHelpFormatter
5
- from multiprocessing import Pool
6
- from shutil import copytree
7
-
8
- import librosa
9
- import soundfile as sf
10
- from tqdm import tqdm
11
-
12
-
13
- def resample_file(func_args):
14
- filename, output_sr = func_args
15
- y, sr = librosa.load(filename, sr=output_sr)
16
- sf.write(filename, y, sr)
17
-
18
-
19
- def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
20
- if output_dir:
21
- print("Recursively copying the input folder...")
22
- copytree(input_dir, output_dir)
23
- input_dir = output_dir
24
-
25
- print("Resampling the audio files...")
26
- audio_files = glob.glob(os.path.join(input_dir, f"**/*.{file_ext}"), recursive=True)
27
- print(f"Found {len(audio_files)} files...")
28
- audio_files = list(zip(audio_files, len(audio_files) * [output_sr]))
29
- with Pool(processes=n_jobs) as p:
30
- with tqdm(total=len(audio_files)) as pbar:
31
- for _, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
32
- pbar.update()
33
-
34
- print("Done !")
35
-
36
-
37
- if __name__ == "__main__":
38
- parser = argparse.ArgumentParser(
39
- description="""Resample a folder recusively with librosa
40
- Can be used in place or create a copy of the folder as an output.\n\n
41
- Example run:
42
- python TTS/bin/resample.py
43
- --input_dir /root/LJSpeech-1.1/
44
- --output_sr 22050
45
- --output_dir /root/resampled_LJSpeech-1.1/
46
- --file_ext wav
47
- --n_jobs 24
48
- """,
49
- formatter_class=RawTextHelpFormatter,
50
- )
51
-
52
- parser.add_argument(
53
- "--input_dir",
54
- type=str,
55
- default=None,
56
- required=True,
57
- help="Path of the folder containing the audio files to resample",
58
- )
59
-
60
- parser.add_argument(
61
- "--output_sr",
62
- type=int,
63
- default=22050,
64
- required=False,
65
- help="Samlple rate to which the audio files should be resampled",
66
- )
67
-
68
- parser.add_argument(
69
- "--output_dir",
70
- type=str,
71
- default=None,
72
- required=False,
73
- help="Path of the destination folder. If not defined, the operation is done in place",
74
- )
75
-
76
- parser.add_argument(
77
- "--file_ext",
78
- type=str,
79
- default="wav",
80
- required=False,
81
- help="Extension of the audio files to resample",
82
- )
83
-
84
- parser.add_argument(
85
- "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores"
86
- )
87
-
88
- args = parser.parse_args()
89
-
90
- resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
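`resample_files` can also be driven programmatically instead of through the CLI. A sketch with placeholder paths, assuming an older checkout where `TTS/bin/resample.py` exists:

```
# Sketch: copy a dataset and resample the copy to 16 kHz using the function above.
from TTS.bin.resample import resample_files

resample_files(
    input_dir="/data/LJSpeech-1.1",
    output_sr=16000,
    output_dir="/data/LJSpeech-1.1-16k",  # omit to resample in place
    file_ext="wav",
    n_jobs=8,
)
```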
TTS/bin/synthesize.py DELETED
@@ -1,494 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- import argparse
5
- import contextlib
6
- import sys
7
- from argparse import RawTextHelpFormatter
8
-
9
- # pylint: disable=redefined-outer-name, unused-argument
10
- from pathlib import Path
11
-
12
- description = """
13
- Synthesize speech on command line.
14
-
15
- You can either use your trained model or choose a model from the provided list.
16
-
17
- If you don't specify any models, it uses the LJSpeech-based English model.
18
-
19
- #### Single Speaker Models
20
-
21
- - List provided models:
22
-
23
- ```
24
- $ tts --list_models
25
- ```
26
-
27
- - Get model info (for both tts_models and vocoder_models):
28
-
29
- - Query by type/name:
30
- The model_info_by_name uses the name as it appears in the output of --list_models.
31
- ```
32
- $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
33
- ```
34
- For example:
35
- ```
36
- $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
37
- $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
38
- ```
39
- - Query by type/idx:
40
- The model_query_idx uses the corresponding idx from --list_models.
41
-
42
- ```
43
- $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
44
- ```
45
-
46
- For example:
47
-
48
- ```
49
- $ tts --model_info_by_idx tts_models/3
50
- ```
51
-
52
- - Query model info by its full name:
53
- ```
54
- $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
55
- ```
56
-
57
- - Run TTS with default models:
58
-
59
- ```
60
- $ tts --text "Text for TTS" --out_path output/path/speech.wav
61
- ```
62
-
63
- - Run TTS and pipe out the generated TTS wav file data:
64
-
65
- ```
66
- $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
67
- ```
68
-
69
- - Run a TTS model with its default vocoder model:
70
-
71
- ```
72
- $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
73
- ```
74
-
75
- For example:
76
-
77
- ```
78
- $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
79
- ```
80
-
81
- - Run with specific TTS and vocoder models from the list:
82
-
83
- ```
84
- $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
85
- ```
86
-
87
- For example:
88
-
89
- ```
90
- $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
91
- ```
92
-
93
- - Run your own TTS model (Using Griffin-Lim Vocoder):
94
-
95
- ```
96
- $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
97
- ```
98
-
99
- - Run your own TTS and Vocoder models:
100
-
101
- ```
102
- $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
103
- --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
104
- ```
105
-
106
- #### Multi-speaker Models
107
-
108
- - List the available speakers and choose a <speaker_id> among them:
109
-
110
- ```
111
- $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
112
- ```
113
-
114
- - Run the multi-speaker TTS model with the target speaker ID:
115
-
116
- ```
117
- $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
118
- ```
119
-
120
- - Run your own multi-speaker TTS model:
121
-
122
- ```
123
- $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
124
- ```
125
-
126
- ### Voice Conversion Models
127
-
128
- ```
129
- $ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
130
- ```
131
- """
132
-
133
-
134
- def str2bool(v):
135
- if isinstance(v, bool):
136
- return v
137
- if v.lower() in ("yes", "true", "t", "y", "1"):
138
- return True
139
- if v.lower() in ("no", "false", "f", "n", "0"):
140
- return False
141
- raise argparse.ArgumentTypeError("Boolean value expected.")
142
-
143
-
144
- def main():
145
- parser = argparse.ArgumentParser(
146
- description=description.replace(" ```\n", ""),
147
- formatter_class=RawTextHelpFormatter,
148
- )
149
-
150
- parser.add_argument(
151
- "--list_models",
152
- type=str2bool,
153
- nargs="?",
154
- const=True,
155
- default=False,
156
- help="list available pre-trained TTS and vocoder models.",
157
- )
158
-
159
- parser.add_argument(
160
- "--model_info_by_idx",
161
- type=str,
162
- default=None,
163
- help="model info using query format: <model_type>/<model_query_idx>",
164
- )
165
-
166
- parser.add_argument(
167
- "--model_info_by_name",
168
- type=str,
169
- default=None,
170
- help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
171
- )
172
-
173
- parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")
174
-
175
- # Args for running pre-trained TTS models.
176
- parser.add_argument(
177
- "--model_name",
178
- type=str,
179
- default="tts_models/en/ljspeech/tacotron2-DDC",
180
- help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
181
- )
182
- parser.add_argument(
183
- "--vocoder_name",
184
- type=str,
185
- default=None,
186
- help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
187
- )
188
-
189
- # Args for running custom models
190
- parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
191
- parser.add_argument(
192
- "--model_path",
193
- type=str,
194
- default=None,
195
- help="Path to model file.",
196
- )
197
- parser.add_argument(
198
- "--out_path",
199
- type=str,
200
- default="tts_output.wav",
201
- help="Output wav file path.",
202
- )
203
- parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
204
- parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
205
- parser.add_argument(
206
- "--vocoder_path",
207
- type=str,
208
- help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
209
- default=None,
210
- )
211
- parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
212
- parser.add_argument(
213
- "--encoder_path",
214
- type=str,
215
- help="Path to speaker encoder model file.",
216
- default=None,
217
- )
218
- parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
219
- parser.add_argument(
220
- "--pipe_out",
221
- help="stdout the generated TTS wav file for shell pipe.",
222
- type=str2bool,
223
- nargs="?",
224
- const=True,
225
- default=False,
226
- )
227
-
228
- # args for multi-speaker synthesis
229
- parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
230
- parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
231
- parser.add_argument(
232
- "--speaker_idx",
233
- type=str,
234
- help="Target speaker ID for a multi-speaker TTS model.",
235
- default=None,
236
- )
237
- parser.add_argument(
238
- "--language_idx",
239
- type=str,
240
- help="Target language ID for a multi-lingual TTS model.",
241
- default=None,
242
- )
243
- parser.add_argument(
244
- "--speaker_wav",
245
- nargs="+",
246
- help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
247
- default=None,
248
- )
249
- parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
250
- parser.add_argument(
251
- "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
252
- )
253
- parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
254
- parser.add_argument(
255
- "--list_speaker_idxs",
256
- help="List available speaker ids for the defined multi-speaker model.",
257
- type=str2bool,
258
- nargs="?",
259
- const=True,
260
- default=False,
261
- )
262
- parser.add_argument(
263
- "--list_language_idxs",
264
- help="List available language ids for the defined multi-lingual model.",
265
- type=str2bool,
266
- nargs="?",
267
- const=True,
268
- default=False,
269
- )
270
- # aux args
271
- parser.add_argument(
272
- "--save_spectogram",
273
- type=bool,
274
- help="If true save raw spectogram for further (vocoder) processing in out_path.",
275
- default=False,
276
- )
277
- parser.add_argument(
278
- "--reference_wav",
279
- type=str,
280
- help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
281
- default=None,
282
- )
283
- parser.add_argument(
284
- "--reference_speaker_idx",
285
- type=str,
286
- help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
287
- default=None,
288
- )
289
- parser.add_argument(
290
- "--progress_bar",
291
- type=str2bool,
292
- help="If true shows a progress bar for the model download. Defaults to True",
293
- default=True,
294
- )
295
-
296
- # voice conversion args
297
- parser.add_argument(
298
- "--source_wav",
299
- type=str,
300
- default=None,
301
- help="Original audio file to convert in the voice of the target_wav",
302
- )
303
- parser.add_argument(
304
- "--target_wav",
305
- type=str,
306
- default=None,
307
- help="Target audio file to convert in the voice of the source_wav",
308
- )
309
-
310
- parser.add_argument(
311
- "--voice_dir",
312
- type=str,
313
- default=None,
314
- help="Voice dir for tortoise model",
315
- )
316
-
317
- args = parser.parse_args()
318
-
319
- # print the description if either text or list_models is not set
320
- check_args = [
321
- args.text,
322
- args.list_models,
323
- args.list_speaker_idxs,
324
- args.list_language_idxs,
325
- args.reference_wav,
326
- args.model_info_by_idx,
327
- args.model_info_by_name,
328
- args.source_wav,
329
- args.target_wav,
330
- ]
331
- if not any(check_args):
332
- parser.parse_args(["-h"])
333
-
334
- pipe_out = sys.stdout if args.pipe_out else None
335
-
336
- with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
337
- # Late-import to make things load faster
338
- from TTS.api import TTS
339
- from TTS.utils.manage import ModelManager
340
- from TTS.utils.synthesizer import Synthesizer
341
-
342
- # load model manager
343
- path = Path(__file__).parent / "../.models.json"
344
- manager = ModelManager(path, progress_bar=args.progress_bar)
345
- api = TTS()
346
-
347
- tts_path = None
348
- tts_config_path = None
349
- speakers_file_path = None
350
- language_ids_file_path = None
351
- vocoder_path = None
352
- vocoder_config_path = None
353
- encoder_path = None
354
- encoder_config_path = None
355
- vc_path = None
356
- vc_config_path = None
357
- model_dir = None
358
-
359
- # CASE1 #list : list pre-trained TTS models
360
- if args.list_models:
361
- manager.list_models()
362
- sys.exit()
363
-
364
- # CASE2 #info : model info for pre-trained TTS models
365
- if args.model_info_by_idx:
366
- model_query = args.model_info_by_idx
367
- manager.model_info_by_idx(model_query)
368
- sys.exit()
369
-
370
- if args.model_info_by_name:
371
- model_query_full_name = args.model_info_by_name
372
- manager.model_info_by_full_name(model_query_full_name)
373
- sys.exit()
374
-
375
- # CASE3: load pre-trained model paths
376
- if args.model_name is not None and not args.model_path:
377
- model_path, config_path, model_item = manager.download_model(args.model_name)
378
- # tts model
379
- if model_item["model_type"] == "tts_models":
380
- tts_path = model_path
381
- tts_config_path = config_path
382
- if "default_vocoder" in model_item:
383
- args.vocoder_name = (
384
- model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
385
- )
386
-
387
- # voice conversion model
388
- if model_item["model_type"] == "voice_conversion_models":
389
- vc_path = model_path
390
- vc_config_path = config_path
391
-
392
- # tts model with multiple files to be loaded from the directory path
393
- if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
394
- model_dir = model_path
395
- tts_path = None
396
- tts_config_path = None
397
- args.vocoder_name = None
398
-
399
- # load vocoder
400
- if args.vocoder_name is not None and not args.vocoder_path:
401
- vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
402
-
403
- # CASE4: set custom model paths
404
- if args.model_path is not None:
405
- tts_path = args.model_path
406
- tts_config_path = args.config_path
407
- speakers_file_path = args.speakers_file_path
408
- language_ids_file_path = args.language_ids_file_path
409
-
410
- if args.vocoder_path is not None:
411
- vocoder_path = args.vocoder_path
412
- vocoder_config_path = args.vocoder_config_path
413
-
414
- if args.encoder_path is not None:
415
- encoder_path = args.encoder_path
416
- encoder_config_path = args.encoder_config_path
417
-
418
- device = args.device
419
- if args.use_cuda:
420
- device = "cuda"
421
-
422
- # load models
423
- synthesizer = Synthesizer(
424
- tts_path,
425
- tts_config_path,
426
- speakers_file_path,
427
- language_ids_file_path,
428
- vocoder_path,
429
- vocoder_config_path,
430
- encoder_path,
431
- encoder_config_path,
432
- vc_path,
433
- vc_config_path,
434
- model_dir,
435
- args.voice_dir,
436
- ).to(device)
437
-
438
- # query speaker ids of a multi-speaker model.
439
- if args.list_speaker_idxs:
440
- print(
441
- " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
442
- )
443
- print(synthesizer.tts_model.speaker_manager.name_to_id)
444
- return
445
-
446
- # query language ids of a multi-lingual model.
447
- if args.list_language_idxs:
448
- print(
449
- " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
450
- )
451
- print(synthesizer.tts_model.language_manager.name_to_id)
452
- return
453
-
454
- # check the arguments against a multi-speaker model.
455
- if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
456
- print(
457
- " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
458
- "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
459
- )
460
- return
461
-
462
- # RUN THE SYNTHESIS
463
- if args.text:
464
- print(" > Text: {}".format(args.text))
465
-
466
- # kick it
467
- if tts_path is not None:
468
- wav = synthesizer.tts(
469
- args.text,
470
- speaker_name=args.speaker_idx,
471
- language_name=args.language_idx,
472
- speaker_wav=args.speaker_wav,
473
- reference_wav=args.reference_wav,
474
- style_wav=args.capacitron_style_wav,
475
- style_text=args.capacitron_style_text,
476
- reference_speaker_name=args.reference_speaker_idx,
477
- )
478
- elif vc_path is not None:
479
- wav = synthesizer.voice_conversion(
480
- source_wav=args.source_wav,
481
- target_wav=args.target_wav,
482
- )
483
- elif model_dir is not None:
484
- wav = synthesizer.tts(
485
- args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
486
- )
487
-
488
- # save the results
489
- print(" > Saving output to {}".format(args.out_path))
490
- synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
491
-
492
-
493
- if __name__ == "__main__":
494
- main()
 
TTS/bin/train_encoder.py DELETED
@@ -1,332 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- import os
5
- import sys
6
- import time
7
- import traceback
8
-
9
- import torch
10
- from torch.utils.data import DataLoader
11
- from trainer.io import copy_model_files, save_best_model, save_checkpoint
12
- from trainer.torch import NoamLR
13
- from trainer.trainer_utils import get_optimizer
14
-
15
- from TTS.encoder.dataset import EncoderDataset
16
- from TTS.encoder.utils.generic_utils import setup_encoder_model
17
- from TTS.encoder.utils.training import init_training
18
- from TTS.encoder.utils.visual import plot_embeddings
19
- from TTS.tts.datasets import load_tts_samples
20
- from TTS.utils.audio import AudioProcessor
21
- from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
22
- from TTS.utils.samplers import PerfectBatchSampler
23
- from TTS.utils.training import check_update
24
-
25
- torch.backends.cudnn.enabled = True
26
- torch.backends.cudnn.benchmark = True
27
- torch.manual_seed(54321)
28
- use_cuda = torch.cuda.is_available()
29
- num_gpus = torch.cuda.device_count()
30
- print(" > Using CUDA: ", use_cuda)
31
- print(" > Number of GPUs: ", num_gpus)
32
-
33
-
34
- def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
35
- num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
36
- num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch
37
-
38
- dataset = EncoderDataset(
39
- c,
40
- ap,
41
- meta_data_eval if is_val else meta_data_train,
42
- voice_len=c.voice_len,
43
- num_utter_per_class=num_utter_per_class,
44
- num_classes_in_batch=num_classes_in_batch,
45
- verbose=verbose,
46
- augmentation_config=c.audio_augmentation if not is_val else None,
47
- use_torch_spec=c.model_params.get("use_torch_spec", False),
48
- )
49
- # get classes list
50
- classes = dataset.get_class_list()
51
-
52
- sampler = PerfectBatchSampler(
53
- dataset.items,
54
- classes,
55
- batch_size=num_classes_in_batch * num_utter_per_class, # total batch size
56
- num_classes_in_batch=num_classes_in_batch,
57
- num_gpus=1,
58
- shuffle=not is_val,
59
- drop_last=True,
60
- )
61
-
62
- if len(classes) < num_classes_in_batch:
63
- if is_val:
64
- raise RuntimeError(
65
- f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
66
- )
67
- raise RuntimeError(
68
- f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
69
- )
70
-
71
- # set the classes to avoid getting a wrong class_id when the numbers of training and eval classes are not equal
72
- if is_val:
73
- dataset.set_classes(train_classes)
74
-
75
- loader = DataLoader(
76
- dataset,
77
- num_workers=c.num_loader_workers,
78
- batch_sampler=sampler,
79
- collate_fn=dataset.collate_fn,
80
- )
81
-
82
- return loader, classes, dataset.get_map_classid_to_classname()
83
-
84
-
85
- def evaluation(model, criterion, data_loader, global_step):
86
- eval_loss = 0
87
- for _, data in enumerate(data_loader):
88
- with torch.no_grad():
89
- # setup input data
90
- inputs, labels = data
91
-
92
- # group samples of each class in the batch. the perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
93
- labels = torch.transpose(
94
- labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
95
- ).reshape(labels.shape)
96
- inputs = torch.transpose(
97
- inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
98
- ).reshape(inputs.shape)
99
-
100
- # dispatch data to GPU
101
- if use_cuda:
102
- inputs = inputs.cuda(non_blocking=True)
103
- labels = labels.cuda(non_blocking=True)
104
-
105
- # forward pass model
106
- outputs = model(inputs)
107
-
108
- # loss computation
109
- loss = criterion(
110
- outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
111
- )
112
-
113
- eval_loss += loss.item()
114
-
115
- eval_avg_loss = eval_loss / len(data_loader)
116
- # save stats
117
- dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
118
- # plot the last batch in the evaluation
119
- figures = {
120
- "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
121
- }
122
- dashboard_logger.eval_figures(global_step, figures)
123
- return eval_avg_loss
124
-
125
-
126
- def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
127
- model.train()
128
- best_loss = {"train_loss": None, "eval_loss": float("inf")}
129
- avg_loader_time = 0
130
- end_time = time.time()
131
- for epoch in range(c.epochs):
132
- tot_loss = 0
133
- epoch_time = 0
134
- for _, data in enumerate(data_loader):
135
- start_time = time.time()
136
-
137
- # setup input data
138
- inputs, labels = data
139
- # group samples of each class in the batch. the perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
140
- labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
141
- labels.shape
142
- )
143
- inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
144
- inputs.shape
145
- )
146
- # ToDo: move it to a unit test
147
- # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
148
- # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
149
- # idx = 0
150
- # for j in range(0, c.num_classes_in_batch, 1):
151
- # for i in range(j, len(labels), c.num_classes_in_batch):
152
- # if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
153
- # print("Invalid")
154
- # print(labels)
155
- # exit()
156
- # idx += 1
157
- # labels = labels_converted
158
- # inputs = inputs_converted
159
-
160
- loader_time = time.time() - end_time
161
- global_step += 1
162
-
163
- # setup lr
164
- if c.lr_decay:
165
- scheduler.step()
166
- optimizer.zero_grad()
167
-
168
- # dispatch data to GPU
169
- if use_cuda:
170
- inputs = inputs.cuda(non_blocking=True)
171
- labels = labels.cuda(non_blocking=True)
172
-
173
- # forward pass model
174
- outputs = model(inputs)
175
-
176
- # loss computation
177
- loss = criterion(
178
- outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
179
- )
180
- loss.backward()
181
- grad_norm, _ = check_update(model, c.grad_clip)
182
- optimizer.step()
183
-
184
- step_time = time.time() - start_time
185
- epoch_time += step_time
186
-
187
- # accumulate the total epoch loss
188
- tot_loss += loss.item()
189
-
190
- # Averaged Loader Time
191
- num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
192
- avg_loader_time = (
193
- 1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
194
- if avg_loader_time != 0
195
- else loader_time
196
- )
197
- current_lr = optimizer.param_groups[0]["lr"]
198
-
199
- if global_step % c.steps_plot_stats == 0:
200
- # Plot Training Epoch Stats
201
- train_stats = {
202
- "loss": loss.item(),
203
- "lr": current_lr,
204
- "grad_norm": grad_norm,
205
- "step_time": step_time,
206
- "avg_loader_time": avg_loader_time,
207
- }
208
- dashboard_logger.train_epoch_stats(global_step, train_stats)
209
- figures = {
210
- "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
211
- }
212
- dashboard_logger.train_figures(global_step, figures)
213
-
214
- if global_step % c.print_step == 0:
215
- print(
216
- " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
217
- "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
218
- global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr
219
- ),
220
- flush=True,
221
- )
222
-
223
- if global_step % c.save_step == 0:
224
- # save model
225
- save_checkpoint(
226
- c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict()
227
- )
228
-
229
- end_time = time.time()
230
-
231
- print("")
232
- print(
233
- ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
234
- "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
235
- epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
236
- ),
237
- flush=True,
238
- )
239
- # evaluation
240
- if c.run_eval:
241
- model.eval()
242
- eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
243
- print("\n\n")
244
- print("--> EVAL PERFORMANCE")
245
- print(
246
- " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
247
- flush=True,
248
- )
249
- # save the best checkpoint
250
- best_loss = save_best_model(
251
- {"train_loss": None, "eval_loss": eval_loss},
252
- best_loss,
253
- c,
254
- model,
255
- optimizer,
256
- None,
257
- global_step,
258
- epoch,
259
- OUT_PATH,
260
- criterion=criterion.state_dict(),
261
- )
262
- model.train()
263
-
264
- return best_loss, global_step
265
-
266
-
267
- def main(args): # pylint: disable=redefined-outer-name
268
- # pylint: disable=global-variable-undefined
269
- global meta_data_train
270
- global meta_data_eval
271
- global train_classes
272
-
273
- ap = AudioProcessor(**c.audio)
274
- model = setup_encoder_model(c)
275
-
276
- optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)
277
-
278
- # pylint: disable=redefined-outer-name
279
- meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)
280
-
281
- train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
282
- if c.run_eval:
283
- eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
284
- else:
285
- eval_data_loader = None
286
-
287
- num_classes = len(train_classes)
288
- criterion = model.get_criterion(c, num_classes)
289
-
290
- if c.loss == "softmaxproto" and c.model != "speaker_encoder":
291
- c.map_classid_to_classname = map_classid_to_classname
292
- copy_model_files(c, OUT_PATH, new_fields={})
293
-
294
- if args.restore_path:
295
- criterion, args.restore_step = model.load_checkpoint(
296
- c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
297
- )
298
- print(" > Model restored from step %d" % args.restore_step, flush=True)
299
- else:
300
- args.restore_step = 0
301
-
302
- if c.lr_decay:
303
- scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
304
- else:
305
- scheduler = None
306
-
307
- num_params = count_parameters(model)
308
- print("\n > Model has {} parameters".format(num_params), flush=True)
309
-
310
- if use_cuda:
311
- model = model.cuda()
312
- criterion.cuda()
313
-
314
- global_step = args.restore_step
315
- _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)
316
-
317
-
318
- if __name__ == "__main__":
319
- args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()
320
-
321
- try:
322
- main(args)
323
- except KeyboardInterrupt:
324
- remove_experiment_folder(OUT_PATH)
325
- try:
326
- sys.exit(0)
327
- except SystemExit:
328
- os._exit(0) # pylint: disable=protected-access
329
- except Exception: # pylint: disable=broad-except
330
- remove_experiment_folder(OUT_PATH)
331
- traceback.print_exc()
332
- sys.exit(1)
 
TTS/bin/train_tts.py DELETED
@@ -1,71 +0,0 @@
1
- import os
2
- from dataclasses import dataclass, field
3
-
4
- from trainer import Trainer, TrainerArgs
5
-
6
- from TTS.config import load_config, register_config
7
- from TTS.tts.datasets import load_tts_samples
8
- from TTS.tts.models import setup_model
9
-
10
-
11
- @dataclass
12
- class TrainTTSArgs(TrainerArgs):
13
- config_path: str = field(default=None, metadata={"help": "Path to the config file."})
14
-
15
-
16
- def main():
17
- """Run `tts` model training directly by a `config.json` file."""
18
- # init trainer args
19
- train_args = TrainTTSArgs()
20
- parser = train_args.init_argparse(arg_prefix="")
21
-
22
- # override trainer args from command-line args
23
- args, config_overrides = parser.parse_known_args()
24
- train_args.parse_args(args)
25
-
26
- # load config.json and register
27
- if args.config_path or args.continue_path:
28
- if args.config_path:
29
- # init from a file
30
- config = load_config(args.config_path)
31
- if len(config_overrides) > 0:
32
- config.parse_known_args(config_overrides, relaxed_parser=True)
33
- elif args.continue_path:
34
- # continue from a prev experiment
35
- config = load_config(os.path.join(args.continue_path, "config.json"))
36
- if len(config_overrides) > 0:
37
- config.parse_known_args(config_overrides, relaxed_parser=True)
38
- else:
39
- # init from console args
40
- from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
41
-
42
- config_base = BaseTrainingConfig()
43
- config_base.parse_known_args(config_overrides)
44
- config = register_config(config_base.model)()
45
-
46
- # load training samples
47
- train_samples, eval_samples = load_tts_samples(
48
- config.datasets,
49
- eval_split=True,
50
- eval_split_max_size=config.eval_split_max_size,
51
- eval_split_size=config.eval_split_size,
52
- )
53
-
54
- # init the model from config
55
- model = setup_model(config, train_samples + eval_samples)
56
-
57
- # init the trainer and 🚀
58
- trainer = Trainer(
59
- train_args,
60
- model.config,
61
- config.output_path,
62
- model=model,
63
- train_samples=train_samples,
64
- eval_samples=eval_samples,
65
- parse_command_line_args=False,
66
- )
67
- trainer.fit()
68
-
69
-
70
- if __name__ == "__main__":
71
- main()
 
TTS/bin/train_vocoder.py DELETED
@@ -1,77 +0,0 @@
1
- import os
2
- from dataclasses import dataclass, field
3
-
4
- from trainer import Trainer, TrainerArgs
5
-
6
- from TTS.config import load_config, register_config
7
- from TTS.utils.audio import AudioProcessor
8
- from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
9
- from TTS.vocoder.models import setup_model
10
-
11
-
12
- @dataclass
13
- class TrainVocoderArgs(TrainerArgs):
14
- config_path: str = field(default=None, metadata={"help": "Path to the config file."})
15
-
16
-
17
- def main():
18
- """Run `tts` model training directly by a `config.json` file."""
19
- # init trainer args
20
- train_args = TrainVocoderArgs()
21
- parser = train_args.init_argparse(arg_prefix="")
22
-
23
- # override trainer args from command-line args
24
- args, config_overrides = parser.parse_known_args()
25
- train_args.parse_args(args)
26
-
27
- # load config.json and register
28
- if args.config_path or args.continue_path:
29
- if args.config_path:
30
- # init from a file
31
- config = load_config(args.config_path)
32
- if len(config_overrides) > 0:
33
- config.parse_known_args(config_overrides, relaxed_parser=True)
34
- elif args.continue_path:
35
- # continue from a prev experiment
36
- config = load_config(os.path.join(args.continue_path, "config.json"))
37
- if len(config_overrides) > 0:
38
- config.parse_known_args(config_overrides, relaxed_parser=True)
39
- else:
40
- # init from console args
41
- from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
42
-
43
- config_base = BaseTrainingConfig()
44
- config_base.parse_known_args(config_overrides)
45
- config = register_config(config_base.model)()
46
-
47
- # load training samples
48
- if "feature_path" in config and config.feature_path:
49
- # load pre-computed features
50
- print(f" > Loading features from: {config.feature_path}")
51
- eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
52
- else:
53
- # load raw wav files
54
- eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
55
-
56
- # setup audio processor
57
- ap = AudioProcessor(**config.audio)
58
-
59
- # init the model from config
60
- model = setup_model(config)
61
-
62
- # init the trainer and 🚀
63
- trainer = Trainer(
64
- train_args,
65
- config,
66
- config.output_path,
67
- model=model,
68
- train_samples=train_samples,
69
- eval_samples=eval_samples,
70
- training_assets={"audio_processor": ap},
71
- parse_command_line_args=False,
72
- )
73
- trainer.fit()
74
-
75
-
76
- if __name__ == "__main__":
77
- main()
 
TTS/bin/tune_wavegrad.py DELETED
@@ -1,103 +0,0 @@
1
- """Search a good noise schedule for WaveGrad for a given number of inference iterations"""
2
- import argparse
3
- from itertools import product as cartesian_product
4
-
5
- import numpy as np
6
- import torch
7
- from torch.utils.data import DataLoader
8
- from tqdm import tqdm
9
-
10
- from TTS.config import load_config
11
- from TTS.utils.audio import AudioProcessor
12
- from TTS.vocoder.datasets.preprocess import load_wav_data
13
- from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
14
- from TTS.vocoder.models import setup_model
15
-
16
- if __name__ == "__main__":
17
- parser = argparse.ArgumentParser()
18
- parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
19
- parser.add_argument("--config_path", type=str, help="Path to model config file.")
20
- parser.add_argument("--data_path", type=str, help="Path to data directory.")
21
- parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
22
- parser.add_argument(
23
- "--num_iter",
24
- type=int,
25
- help="Number of model inference iterations that you like to optimize noise schedule for.",
26
- )
27
- parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.")
28
- parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.")
29
- parser.add_argument(
30
- "--search_depth",
31
- type=int,
32
- default=3,
33
- help="Search granularity. Increasing this increases the run-time exponentially.",
34
- )
35
-
36
- # load config
37
- args = parser.parse_args()
38
- config = load_config(args.config_path)
39
-
40
- # setup audio processor
41
- ap = AudioProcessor(**config.audio)
42
-
43
- # load dataset
44
- _, train_data = load_wav_data(args.data_path, 0)
45
- train_data = train_data[: args.num_samples]
46
- dataset = WaveGradDataset(
47
- ap=ap,
48
- items=train_data,
49
- seq_len=-1,
50
- hop_len=ap.hop_length,
51
- pad_short=config.pad_short,
52
- conv_pad=config.conv_pad,
53
- is_training=True,
54
- return_segments=False,
55
- use_noise_augment=False,
56
- use_cache=False,
57
- verbose=True,
58
- )
59
- loader = DataLoader(
60
- dataset,
61
- batch_size=1,
62
- shuffle=False,
63
- collate_fn=dataset.collate_full_clips,
64
- drop_last=False,
65
- num_workers=config.num_loader_workers,
66
- pin_memory=False,
67
- )
68
-
69
- # setup the model
70
- model = setup_model(config)
71
- if args.use_cuda:
72
- model.cuda()
73
-
74
- # setup optimization parameters
75
- base_values = sorted(10 * np.random.uniform(size=args.search_depth))
76
- print(f" > base values: {base_values}")
77
- exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
78
- best_error = float("inf")
79
- best_schedule = None # pylint: disable=C0103
80
- total_search_iter = len(base_values) ** args.num_iter
81
- for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
82
- beta = exponents * base
83
- model.compute_noise_level(beta)
84
- for data in loader:
85
- mel, audio = data
86
- y_hat = model.inference(mel.cuda() if args.use_cuda else mel)
87
-
88
- if args.use_cuda:
89
- y_hat = y_hat.cpu()
90
- y_hat = y_hat.numpy()
91
-
92
- mel_hat = []
93
- for i in range(y_hat.shape[0]):
94
- m = ap.melspectrogram(y_hat[i, 0])[:, :-1]
95
- mel_hat.append(torch.from_numpy(m))
96
-
97
- mel_hat = torch.stack(mel_hat)
98
- mse = torch.sum((mel - mel_hat) ** 2).mean()
99
- if mse.item() < best_error:
100
- best_error = mse.item()
101
- best_schedule = {"beta": beta}
102
- print(f" > Found a better schedule. - MSE: {mse.item()}")
103
- np.save(args.output_path, best_schedule)
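
For context, the schedule search above scores every candidate of the form beta = base * 10**linspace(-6, -1, num_iter) and keeps the one with the lowest mel-spectrogram MSE. A minimal sketch of how one such candidate schedule is built (the base value below is illustrative, not a script default):

    import numpy as np

    num_iter = 6                                         # number of inference iterations to tune for (illustrative)
    exponents = 10 ** np.linspace(-6, -1, num=num_iter)  # log-spaced exponents, as in the script above
    base = 3.2                                           # one candidate base; the script samples these at random
    beta = exponents * base                              # candidate noise schedule passed to model.compute_noise_level
    print(beta)                                          # increasing betas, from ~3e-6 * base up to ~0.1 * base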
 
TTS/config/__init__.py DELETED
@@ -1,135 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from typing import Dict
5
-
6
- import fsspec
7
- import yaml
8
- from coqpit import Coqpit
9
-
10
- from TTS.config.shared_configs import *
11
- from TTS.utils.generic_utils import find_module
12
-
13
-
14
- def read_json_with_comments(json_path):
15
- """for backward compat."""
16
- # fallback to json
17
- with fsspec.open(json_path, "r", encoding="utf-8") as f:
18
- input_str = f.read()
19
- # handle comments but not urls with //
20
- input_str = re.sub(r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str)
21
- return json.loads(input_str)
22
-
23
- def register_config(model_name: str) -> Coqpit:
24
- """Find the right config for the given model name.
25
-
26
- Args:
27
- model_name (str): Model name.
28
-
29
- Raises:
30
- ModuleNotFoundError: No matching config for the model name.
31
-
32
- Returns:
33
- Coqpit: config class.
34
- """
35
- config_class = None
36
- config_name = model_name + "_config"
37
-
38
- # TODO: fix this
39
- if model_name == "xtts":
40
- from TTS.tts.configs.xtts_config import XttsConfig
41
-
42
- config_class = XttsConfig
43
- paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"]
44
- for path in paths:
45
- try:
46
- config_class = find_module(path, config_name)
47
- except ModuleNotFoundError:
48
- pass
49
- if config_class is None:
50
- raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
51
- return config_class
52
-
53
-
54
- def _process_model_name(config_dict: Dict) -> str:
55
- """Format the model name as expected. It is a band-aid for the old `vocoder` model names.
56
-
57
- Args:
58
- config_dict (Dict): A dictionary including the config fields.
59
-
60
- Returns:
61
- str: Formatted model name.
62
- """
63
- model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"]
64
- model_name = model_name.replace("_generator", "").replace("_discriminator", "")
65
- return model_name
66
-
67
-
68
- def load_config(config_path: str) -> Coqpit:
69
- """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name
70
- to find the corresponding Config class. Then initialize the Config.
71
-
72
- Args:
73
- config_path (str): path to the config file.
74
-
75
- Raises:
76
- TypeError: given config file has an unknown type.
77
-
78
- Returns:
79
- Coqpit: TTS config object.
80
- """
81
- config_dict = {}
82
- ext = os.path.splitext(config_path)[1]
83
- if ext in (".yml", ".yaml"):
84
- with fsspec.open(config_path, "r", encoding="utf-8") as f:
85
- data = yaml.safe_load(f)
86
- elif ext == ".json":
87
- try:
88
- with fsspec.open(config_path, "r", encoding="utf-8") as f:
89
- data = json.load(f)
90
- except json.decoder.JSONDecodeError:
91
- # backwards compat.
92
- data = read_json_with_comments(config_path)
93
- else:
94
- raise TypeError(f" [!] Unknown config file type {ext}")
95
- config_dict.update(data)
96
- model_name = _process_model_name(config_dict)
97
- config_class = register_config(model_name.lower())
98
- config = config_class()
99
- config.from_dict(config_dict)
100
- return config
101
-
102
-
103
- def check_config_and_model_args(config, arg_name, value):
104
- """Check the give argument in `config.model_args` if exist or in `config` for
105
- the given value.
106
-
107
- Return False if the argument does not exist in `config.model_args` or `config`.
108
- This is to patch up the compatibility between models with and without `model_args`.
109
-
110
- TODO: Remove this in the future with a unified approach.
111
- """
112
- if hasattr(config, "model_args"):
113
- if arg_name in config.model_args:
114
- return config.model_args[arg_name] == value
115
- if hasattr(config, arg_name):
116
- return config[arg_name] == value
117
- return False
118
-
119
-
120
- def get_from_config_or_model_args(config, arg_name):
121
- """Get the given argument from `config.model_args` if exist or in `config`."""
122
- if hasattr(config, "model_args"):
123
- if arg_name in config.model_args:
124
- return config.model_args[arg_name]
125
- return config[arg_name]
126
-
127
-
128
- def get_from_config_or_model_args_with_default(config, arg_name, def_val):
129
- """Get the given argument from `config.model_args` if exist or in `config`."""
130
- if hasattr(config, "model_args"):
131
- if arg_name in config.model_args:
132
- return config.model_args[arg_name]
133
- if hasattr(config, arg_name):
134
- return config[arg_name]
135
- return def_val
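
As a usage sketch of the helpers above (assuming the package is installed; the config path is hypothetical), `load_config` resolves the right config class from the `model` field, and `get_from_config_or_model_args_with_default` hides whether a field lives in `config` or `config.model_args`:

    from TTS.config import load_config, get_from_config_or_model_args_with_default

    # hypothetical path; any Coqui TTS config.json or .yaml works here
    config = load_config("run/training/config.json")

    # read a field regardless of whether it sits in `config` or `config.model_args`
    num_speakers = get_from_config_or_model_args_with_default(config, "num_speakers", 1)
    print(type(config).__name__, num_speakers)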
 
TTS/config/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (4.26 kB)
 
TTS/config/__pycache__/shared_configs.cpython-310.pyc DELETED
Binary file (9.53 kB)
 
TTS/config/shared_configs.py DELETED
@@ -1,268 +0,0 @@
1
- from dataclasses import asdict, dataclass
2
- from typing import List
3
-
4
- from coqpit import Coqpit, check_argument
5
- from trainer import TrainerConfig
6
-
7
-
8
- @dataclass
9
- class BaseAudioConfig(Coqpit):
10
- """Base config to definge audio processing parameters. It is used to initialize
11
- ```TTS.utils.audio.AudioProcessor.```
12
-
13
- Args:
14
- fft_size (int):
15
- Number of STFT frequency levels, i.e. the size of the linear spectrogram frame. Defaults to 1024.
16
-
17
- win_length (int):
18
- Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
19
- ```fft_size```. Defaults to 1024.
20
-
21
- hop_length (int):
22
- Number of audio samples between adjacent STFT columns. Defaults to 256.
23
-
24
- frame_shift_ms (int):
25
- Set ```hop_length``` based on milliseconds and sampling rate.
26
-
27
- frame_length_ms (int):
28
- Set ```win_length``` based on milliseconds and sampling rate.
29
-
30
- stft_pad_mode (str):
31
- Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
32
-
33
- sample_rate (int):
34
- Audio sampling rate. Defaults to 22050.
35
-
36
- resample (bool):
37
- Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.
38
-
39
- preemphasis (float):
40
- Preemphasis coefficient. Defaults to 0.0.
41
-
42
- ref_level_db (int): 20
43
- Reference dB level to rebase the audio signal and ignore levels below it. 20 dB is assumed to be the sound of air.
44
- Defaults to 20.
45
-
46
- do_sound_norm (bool):
47
- Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.
48
-
49
- log_func (str):
50
- Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.
51
-
52
- do_trim_silence (bool):
53
- Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.
54
-
55
- do_amp_to_db_linear (bool, optional):
56
- enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
57
-
58
- do_amp_to_db_mel (bool, optional):
59
- enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
60
-
61
- pitch_fmax (float, optional):
62
- Maximum frequency of the F0 frames. Defaults to ```640```.
63
-
64
- pitch_fmin (float, optional):
65
- Minimum frequency of the F0 frames. Defaults to ```1```.
66
-
67
- trim_db (int):
68
- Silence threshold used for silence trimming. Defaults to 45.
69
-
70
- do_rms_norm (bool, optional):
71
- enable/disable RMS volume normalization when loading an audio file. Defaults to False.
72
-
73
- db_level (int, optional):
74
- dB level used for rms normalization. The range is -99 to 0. Defaults to None.
75
-
76
- power (float):
77
- Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce the
78
- artifacts in the synthesized voice. Defaults to 1.5.
79
-
80
- griffin_lim_iters (int):
81
- Number of Griffin-Lim iterations. Defaults to 60.
82
-
83
- num_mels (int):
84
- Number of mel-basis frames that defines the frame lengths of each mel-spectrogram frame. Defaults to 80.
85
-
86
- mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
87
- It needs to be adjusted for a dataset. Defaults to 0.
88
-
89
- mel_fmax (float):
90
- Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
91
-
92
- spec_gain (int):
93
- Gain applied when converting amplitude to DB. Defaults to 20.
94
-
95
- signal_norm (bool):
96
- enable/disable signal normalization. Defaults to True.
97
-
98
- min_level_db (int):
99
- minimum db threshold for the computed melspectrograms. Defaults to -100.
100
-
101
- symmetric_norm (bool):
102
- enable/disable symmetric normalization. If set to True, normalization is performed in the range [-k, k], else
103
- [0, k]. Defaults to True.
104
-
105
- max_norm (float):
106
- ```k``` defining the normalization range. Defaults to 4.0.
107
-
108
- clip_norm (bool):
109
- enable/disable clipping of out-of-range values in the normalized audio signal. Defaults to True.
110
-
111
- stats_path (str):
112
- Path to the computed stats file. Defaults to None.
113
- """
114
-
115
- # stft parameters
116
- fft_size: int = 1024
117
- win_length: int = 1024
118
- hop_length: int = 256
119
- frame_shift_ms: int = None
120
- frame_length_ms: int = None
121
- stft_pad_mode: str = "reflect"
122
- # audio processing parameters
123
- sample_rate: int = 22050
124
- resample: bool = False
125
- preemphasis: float = 0.0
126
- ref_level_db: int = 20
127
- do_sound_norm: bool = False
128
- log_func: str = "np.log10"
129
- # silence trimming
130
- do_trim_silence: bool = True
131
- trim_db: int = 45
132
- # rms volume normalization
133
- do_rms_norm: bool = False
134
- db_level: float = None
135
- # griffin-lim params
136
- power: float = 1.5
137
- griffin_lim_iters: int = 60
138
- # mel-spec params
139
- num_mels: int = 80
140
- mel_fmin: float = 0.0
141
- mel_fmax: float = None
142
- spec_gain: int = 20
143
- do_amp_to_db_linear: bool = True
144
- do_amp_to_db_mel: bool = True
145
- # f0 params
146
- pitch_fmax: float = 640.0
147
- pitch_fmin: float = 1.0
148
- # normalization params
149
- signal_norm: bool = True
150
- min_level_db: int = -100
151
- symmetric_norm: bool = True
152
- max_norm: float = 4.0
153
- clip_norm: bool = True
154
- stats_path: str = None
155
-
156
- def check_values(
157
- self,
158
- ):
159
- """Check config fields"""
160
- c = asdict(self)
161
- check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
162
- check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
163
- check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
164
- check_argument(
165
- "frame_length_ms",
166
- c,
167
- restricted=True,
168
- min_val=10,
169
- max_val=1000,
170
- alternative="win_length",
171
- )
172
- check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
173
- check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
174
- check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
175
- check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
176
- check_argument("power", c, restricted=True, min_val=1, max_val=5)
177
- check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)
178
-
179
- # normalization parameters
180
- check_argument("signal_norm", c, restricted=True)
181
- check_argument("symmetric_norm", c, restricted=True)
182
- check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
183
- check_argument("clip_norm", c, restricted=True)
184
- check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
185
- check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
186
- check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
187
- check_argument("do_trim_silence", c, restricted=True)
188
- check_argument("trim_db", c, restricted=True)
189
-
190
-
191
- @dataclass
192
- class BaseDatasetConfig(Coqpit):
193
- """Base config for TTS datasets.
194
-
195
- Args:
196
- formatter (str):
197
- Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.
198
-
199
- dataset_name (str):
200
- Unique name for the dataset. Defaults to `""`.
201
-
202
- path (str):
203
- Root path to the dataset files. Defaults to `""`.
204
-
205
- meta_file_train (str):
206
- Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
207
- Defaults to `""`.
208
-
209
- ignored_speakers (List):
210
- List of speakers IDs that are not used at the training. Default None.
211
-
212
- language (str):
213
- Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.
214
-
215
- phonemizer (str):
216
- Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`.
217
-
218
- meta_file_val (str):
219
- Name of the dataset meta file that defines the instances used at validation.
220
-
221
- meta_file_attn_mask (str):
222
- Path to the file that lists the attention mask files used with models that require attention masks to
223
- train the duration predictor.
224
- """
225
-
226
- formatter: str = ""
227
- dataset_name: str = ""
228
- path: str = ""
229
- meta_file_train: str = ""
230
- ignored_speakers: List[str] = None
231
- language: str = ""
232
- phonemizer: str = ""
233
- meta_file_val: str = ""
234
- meta_file_attn_mask: str = ""
235
-
236
- def check_values(
237
- self,
238
- ):
239
- """Check config fields"""
240
- c = asdict(self)
241
- check_argument("formatter", c, restricted=True)
242
- check_argument("path", c, restricted=True)
243
- check_argument("meta_file_train", c, restricted=True)
244
- check_argument("meta_file_val", c, restricted=False)
245
- check_argument("meta_file_attn_mask", c, restricted=False)
246
-
247
-
248
- @dataclass
249
- class BaseTrainingConfig(TrainerConfig):
250
- """Base config to define the basic 🐸TTS training parameters that are shared
251
- among all the models. It is based on ```Trainer.TrainingConfig```.
252
-
253
- Args:
254
- model (str):
255
- Name of the model that is used in the training.
256
-
257
- num_loader_workers (int):
258
- Number of workers for training time dataloader.
259
-
260
- num_eval_loader_workers (int):
261
- Number of workers for evaluation time dataloader.
262
- """
263
-
264
- model: str = None
265
- # dataloading
266
- num_loader_workers: int = 0
267
- num_eval_loader_workers: int = 0
268
- use_noise_augment: bool = False
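
A minimal sketch of how these dataclasses are typically instantiated and validated (the field values and paths below are illustrative, not project defaults):

    from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig

    audio_config = BaseAudioConfig(sample_rate=22050, num_mels=80, hop_length=256)
    audio_config.check_values()  # raises if a field falls outside the ranges checked above

    dataset_config = BaseDatasetConfig(
        formatter="ljspeech",            # illustrative formatter name
        path="datasets/LJSpeech-1.1/",   # hypothetical dataset root
        meta_file_train="metadata.csv",
    )
    dataset_config.check_values()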
 
TTS/demos/xtts_ft_demo/requirements.txt DELETED
@@ -1,2 +0,0 @@
1
- faster_whisper==0.9.0
2
- gradio==4.7.1
 
 
 
TTS/demos/xtts_ft_demo/utils/formatter.py DELETED
@@ -1,160 +0,0 @@
1
- import os
2
- import gc
3
- import torchaudio
4
- import pandas
5
- from faster_whisper import WhisperModel
6
- from glob import glob
7
-
8
- from tqdm import tqdm
9
-
10
- import torch
11
- import torchaudio
12
- # torch.set_num_threads(1)
13
-
14
- from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners
15
-
16
- torch.set_num_threads(16)
17
-
18
-
19
- import os
20
-
21
- audio_types = (".wav", ".mp3", ".flac")
22
-
23
-
24
- def list_audios(basePath, contains=None):
25
- # return the set of files that are valid
26
- return list_files(basePath, validExts=audio_types, contains=contains)
27
-
28
- def list_files(basePath, validExts=None, contains=None):
29
- # loop over the directory structure
30
- for (rootDir, dirNames, filenames) in os.walk(basePath):
31
- # loop over the filenames in the current directory
32
- for filename in filenames:
33
- # if the contains string is not none and the filename does not contain
34
- # the supplied string, then ignore the file
35
- if contains is not None and filename.find(contains) == -1:
36
- continue
37
-
38
- # determine the file extension of the current file
39
- ext = filename[filename.rfind("."):].lower()
40
-
41
- # check to see if the file is an audio file and should be processed
42
- if validExts is None or ext.endswith(validExts):
43
- # construct the path to the audio and yield it
44
- audioPath = os.path.join(rootDir, filename)
45
- yield audioPath
46
-
47
- def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):
48
- audio_total_size = 0
49
- # make sure that the output directory exists
50
- os.makedirs(out_path, exist_ok=True)
51
-
52
- # Loading Whisper
53
- device = "cuda" if torch.cuda.is_available() else "cpu"
54
-
55
- print("Loading Whisper Model!")
56
- asr_model = WhisperModel("large-v2", device=device, compute_type="float16")
57
-
58
- metadata = {"audio_file": [], "text": [], "speaker_name": []}
59
-
60
- if gradio_progress is not None:
61
- tqdm_object = gradio_progress.tqdm(audio_files, desc="Formatting...")
62
- else:
63
- tqdm_object = tqdm(audio_files)
64
-
65
- for audio_path in tqdm_object:
66
- wav, sr = torchaudio.load(audio_path)
67
- # stereo to mono if needed
68
- if wav.size(0) != 1:
69
- wav = torch.mean(wav, dim=0, keepdim=True)
70
-
71
- wav = wav.squeeze()
72
- audio_total_size += (wav.size(-1) / sr)
73
-
74
- segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
75
- segments = list(segments)
76
- i = 0
77
- sentence = ""
78
- sentence_start = None
79
- first_word = True
80
- # add all segments' words to a single list
81
- words_list = []
82
- for _, segment in enumerate(segments):
83
- words = list(segment.words)
84
- words_list.extend(words)
85
-
86
- # process each word
87
- for word_idx, word in enumerate(words_list):
88
- if first_word:
89
- sentence_start = word.start
90
- # If it is the first sentence, add a buffer or use the beginning of the file
91
- if word_idx == 0:
92
- sentence_start = max(sentence_start - buffer, 0) # Add buffer to the sentence start
93
- else:
94
- # get previous sentence end
95
- previous_word_end = words_list[word_idx - 1].end
96
- # add a buffer or use the middle of the silence between the previous sentence and the current one
97
- sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start)/2)
98
-
99
- sentence = word.word
100
- first_word = False
101
- else:
102
- sentence += word.word
103
-
104
- if word.word[-1] in ["!", ".", "?"]:
105
- sentence = sentence[1:]
106
- # Expand number and abbreviations plus normalization
107
- sentence = multilingual_cleaners(sentence, target_language)
108
- audio_file_name, _ = os.path.splitext(os.path.basename(audio_path))
109
-
110
- audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"
111
-
112
- # Check for the next word's existence
113
- if word_idx + 1 < len(words_list):
114
- next_word_start = words_list[word_idx + 1].start
115
- else:
116
- # If there are no more words, it is the last sentence, so use the audio length as the next word start
117
- next_word_start = (wav.shape[0] - 1) / sr
118
-
119
- # Average the current word end and next word start
120
- word_end = min((word.end + next_word_start) / 2, word.end + buffer)
121
-
122
- absoulte_path = os.path.join(out_path, audio_file)
123
- os.makedirs(os.path.dirname(absoulte_path), exist_ok=True)
124
- i += 1
125
- first_word = True
126
-
127
- audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0)
128
- # if the audio is too short, ignore it (i.e. < 0.33 seconds)
129
- if audio.size(-1) >= sr/3:
130
- torchaudio.save(absoulte_path,
131
- audio,
132
- sr
133
- )
134
- else:
135
- continue
136
-
137
- metadata["audio_file"].append(audio_file)
138
- metadata["text"].append(sentence)
139
- metadata["speaker_name"].append(speaker_name)
140
-
141
- df = pandas.DataFrame(metadata)
142
- df = df.sample(frac=1)
143
- num_val_samples = int(len(df)*eval_percentage)
144
-
145
- df_eval = df[:num_val_samples]
146
- df_train = df[num_val_samples:]
147
-
148
- df_train = df_train.sort_values('audio_file')
149
- train_metadata_path = os.path.join(out_path, "metadata_train.csv")
150
- df_train.to_csv(train_metadata_path, sep="|", index=False)
151
-
152
- eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
153
- df_eval = df_eval.sort_values('audio_file')
154
- df_eval.to_csv(eval_metadata_path, sep="|", index=False)
155
-
156
- # deallocate VRAM and RAM
157
- del asr_model, df_train, df_eval, df, metadata
158
- gc.collect()
159
-
160
- return train_metadata_path, eval_metadata_path, audio_total_size
 
TTS/demos/xtts_ft_demo/utils/gpt_train.py DELETED
@@ -1,172 +0,0 @@
1
- import os
2
- import gc
3
-
4
- from trainer import Trainer, TrainerArgs
5
-
6
- from TTS.config.shared_configs import BaseDatasetConfig
7
- from TTS.tts.datasets import load_tts_samples
8
- from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
9
- from TTS.utils.manage import ModelManager
10
-
11
-
12
- def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path, max_audio_length=255995):
13
- # Logging parameters
14
- RUN_NAME = "GPT_XTTS_FT"
15
- PROJECT_NAME = "XTTS_trainer"
16
- DASHBOARD_LOGGER = "tensorboard"
17
- LOGGER_URI = None
18
-
19
- # Set here the path where the checkpoints will be saved. Default: ./run/training/
20
- OUT_PATH = os.path.join(output_path, "run", "training")
21
-
22
- # Training Parameters
23
- OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # set this to False for multi-GPU training
24
- START_WITH_EVAL = False # if True it will start with evaluation
25
- BATCH_SIZE = batch_size # set here the batch size
26
- GRAD_ACUMM_STEPS = grad_acumm # set here the grad accumulation steps
27
-
28
-
29
- # Define here the dataset that you want to use for fine-tuning.
30
- config_dataset = BaseDatasetConfig(
31
- formatter="coqui",
32
- dataset_name="ft_dataset",
33
- path=os.path.dirname(train_csv),
34
- meta_file_train=train_csv,
35
- meta_file_val=eval_csv,
36
- language=language,
37
- )
38
-
39
- # Add here the configs of the datasets
40
- DATASETS_CONFIG_LIST = [config_dataset]
41
-
42
- # Define the path where XTTS v2.0.1 files will be downloaded
43
- CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
44
- os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
45
-
46
-
47
- # DVAE files
48
- DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
49
- MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
50
-
51
- # Set the path to the downloaded files
52
- DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
53
- MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
54
-
55
- # download DVAE files if needed
56
- if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
57
- print(" > Downloading DVAE files!")
58
- ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
59
-
60
-
61
- # Download XTTS v2.0 checkpoint if needed
62
- TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
63
- XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
64
- XTTS_CONFIG_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json"
65
-
66
- # XTTS transfer learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
67
- TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK)) # vocab.json file
68
- XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK)) # model.pth file
69
- XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK)) # config.json file
70
-
71
- # download XTTS v2.0 files if needed
72
- if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
73
- print(" > Downloading XTTS v2.0 files!")
74
- ModelManager._download_model_files(
75
- [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK, XTTS_CONFIG_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
76
- )
77
-
78
- # init args and config
79
- model_args = GPTArgs(
80
- max_conditioning_length=132300, # 6 secs
81
- min_conditioning_length=66150, # 3 secs
82
- debug_loading_failures=False,
83
- max_wav_length=max_audio_length, # ~11.6 seconds
84
- max_text_length=200,
85
- mel_norm_file=MEL_NORM_FILE,
86
- dvae_checkpoint=DVAE_CHECKPOINT,
87
- xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune
88
- tokenizer_file=TOKENIZER_FILE,
89
- gpt_num_audio_tokens=1026,
90
- gpt_start_audio_token=1024,
91
- gpt_stop_audio_token=1025,
92
- gpt_use_masking_gt_prompt_approach=True,
93
- gpt_use_perceiver_resampler=True,
94
- )
95
- # define audio config
96
- audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
97
- # training parameters config
98
- config = GPTTrainerConfig(
99
- epochs=num_epochs,
100
- output_path=OUT_PATH,
101
- model_args=model_args,
102
- run_name=RUN_NAME,
103
- project_name=PROJECT_NAME,
104
- run_description="""
105
- GPT XTTS training
106
- """,
107
- dashboard_logger=DASHBOARD_LOGGER,
108
- logger_uri=LOGGER_URI,
109
- audio=audio_config,
110
- batch_size=BATCH_SIZE,
111
- batch_group_size=48,
112
- eval_batch_size=BATCH_SIZE,
113
- num_loader_workers=8,
114
- eval_split_max_size=256,
115
- print_step=50,
116
- plot_step=100,
117
- log_model_step=100,
118
- save_step=1000,
119
- save_n_checkpoints=1,
120
- save_checkpoints=True,
121
- # target_loss="loss",
122
- print_eval=False,
123
- # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
124
- optimizer="AdamW",
125
- optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
126
- optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
127
- lr=5e-06, # learning rate
128
- lr_scheduler="MultiStepLR",
129
- # it was adjusted accordingly for the new step scheme
130
- lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
131
- test_sentences=[],
132
- )
133
-
134
- # init the model from config
135
- model = GPTTrainer.init_from_config(config)
136
-
137
- # load training samples
138
- train_samples, eval_samples = load_tts_samples(
139
- DATASETS_CONFIG_LIST,
140
- eval_split=True,
141
- eval_split_max_size=config.eval_split_max_size,
142
- eval_split_size=config.eval_split_size,
143
- )
144
-
145
- # init the trainer and 🚀
146
- trainer = Trainer(
147
- TrainerArgs(
148
- restore_path=None, # the xtts checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it using the Trainer restore_path parameter
149
- skip_train_epoch=False,
150
- start_with_eval=START_WITH_EVAL,
151
- grad_accum_steps=GRAD_ACUMM_STEPS,
152
- ),
153
- config,
154
- output_path=OUT_PATH,
155
- model=model,
156
- train_samples=train_samples,
157
- eval_samples=eval_samples,
158
- )
159
- trainer.fit()
160
-
161
- # get the longest text audio file to use as speaker reference
162
- samples_len = [len(item["text"].split(" ")) for item in train_samples]
163
- longest_text_idx = samples_len.index(max(samples_len))
164
- speaker_ref = train_samples[longest_text_idx]["audio_file"]
165
-
166
- trainer_out_path = trainer.output_path
167
-
168
- # deallocate VRAM and RAM
169
- del model, trainer, train_samples, eval_samples
170
- gc.collect()
171
-
172
- return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref
 
TTS/demos/xtts_ft_demo/xtts_demo.py DELETED
@@ -1,415 +0,0 @@
1
- import argparse
2
- import os
3
- import sys
4
- import tempfile
5
-
6
- import gradio as gr
7
- import librosa.display
8
- import numpy as np
9
-
10
- import os
11
- import torch
12
- import torchaudio
13
- import traceback
14
- from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
15
- from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
16
-
17
- from TTS.tts.configs.xtts_config import XttsConfig
18
- from TTS.tts.models.xtts import Xtts
19
-
20
-
21
- def clear_gpu_cache():
22
- # clear the GPU cache
23
- if torch.cuda.is_available():
24
- torch.cuda.empty_cache()
25
-
26
- XTTS_MODEL = None
27
- def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
28
- global XTTS_MODEL
29
- clear_gpu_cache()
30
- if not xtts_checkpoint or not xtts_config or not xtts_vocab:
31
- return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"
32
- config = XttsConfig()
33
- config.load_json(xtts_config)
34
- XTTS_MODEL = Xtts.init_from_config(config)
35
- print("Loading XTTS model! ")
36
- XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
37
- if torch.cuda.is_available():
38
- XTTS_MODEL.cuda()
39
-
40
- print("Model Loaded!")
41
- return "Model Loaded!"
42
-
43
- def run_tts(lang, tts_text, speaker_audio_file):
44
- if XTTS_MODEL is None or not speaker_audio_file:
45
- return "You need to run the previous step to load the model !!", None, None
46
-
47
- gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(audio_path=speaker_audio_file, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_length=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs)
48
- out = XTTS_MODEL.inference(
49
- text=tts_text,
50
- language=lang,
51
- gpt_cond_latent=gpt_cond_latent,
52
- speaker_embedding=speaker_embedding,
53
- temperature=XTTS_MODEL.config.temperature, # Add custom parameters here
54
- length_penalty=XTTS_MODEL.config.length_penalty,
55
- repetition_penalty=XTTS_MODEL.config.repetition_penalty,
56
- top_k=XTTS_MODEL.config.top_k,
57
- top_p=XTTS_MODEL.config.top_p,
58
- )
59
-
60
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
61
- out["wav"] = torch.tensor(out["wav"]).unsqueeze(0)
62
- out_path = fp.name
63
- torchaudio.save(out_path, out["wav"], 24000)
64
-
65
- return "Speech generated !", out_path, speaker_audio_file
66
-
67
-
68
-
69
-
70
- # define a logger to redirect
71
- class Logger:
72
- def __init__(self, filename="log.out"):
73
- self.log_file = filename
74
- self.terminal = sys.stdout
75
- self.log = open(self.log_file, "w")
76
-
77
- def write(self, message):
78
- self.terminal.write(message)
79
- self.log.write(message)
80
-
81
- def flush(self):
82
- self.terminal.flush()
83
- self.log.flush()
84
-
85
- def isatty(self):
86
- return False
87
-
88
- # redirect stdout and stderr to a file
89
- sys.stdout = Logger()
90
- sys.stderr = sys.stdout
91
-
92
-
93
- # logging.basicConfig(stream=sys.stdout, level=logging.INFO)
94
- import logging
95
- logging.basicConfig(
96
- level=logging.INFO,
97
- format="%(asctime)s [%(levelname)s] %(message)s",
98
- handlers=[
99
- logging.StreamHandler(sys.stdout)
100
- ]
101
- )
102
-
103
- def read_logs():
104
- sys.stdout.flush()
105
- with open(sys.stdout.log_file, "r") as f:
106
- return f.read()
107
-
108
-
109
- if __name__ == "__main__":
110
-
111
- parser = argparse.ArgumentParser(
112
- description="""XTTS fine-tuning demo\n\n"""
113
- """
114
- Example runs:
115
- python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port
116
- """,
117
- formatter_class=argparse.RawTextHelpFormatter,
118
- )
119
- parser.add_argument(
120
- "--port",
121
- type=int,
122
- help="Port to run the gradio demo. Default: 5003",
123
- default=5003,
124
- )
125
- parser.add_argument(
126
- "--out_path",
127
- type=str,
128
- help="Output path (where data and checkpoints will be saved). Default: /tmp/xtts_ft/",
129
- default="/tmp/xtts_ft/",
130
- )
131
-
132
- parser.add_argument(
133
- "--num_epochs",
134
- type=int,
135
- help="Number of epochs to train. Default: 10",
136
- default=10,
137
- )
138
- parser.add_argument(
139
- "--batch_size",
140
- type=int,
141
- help="Batch size. Default: 4",
142
- default=4,
143
- )
144
- parser.add_argument(
145
- "--grad_acumm",
146
- type=int,
147
- help="Grad accumulation steps. Default: 1",
148
- default=1,
149
- )
150
- parser.add_argument(
151
- "--max_audio_length",
152
- type=int,
153
- help="Max permitted audio size in seconds. Default: 11",
154
- default=11,
155
- )
156
-
157
- args = parser.parse_args()
158
-
159
- with gr.Blocks() as demo:
160
- with gr.Tab("1 - Data processing"):
161
- out_path = gr.Textbox(
162
- label="Output path (where data and checkpoints will be saved):",
163
- value=args.out_path,
164
- )
165
- # upload_file = gr.Audio(
166
- # sources="upload",
167
- # label="Select the audio files that you want to use for XTTS training!",
168
- # type="filepath",
169
- # )
170
- upload_file = gr.File(
171
- file_count="multiple",
172
- label="Select the audio files that you want to use for XTTS training (supported formats: wav, mp3, and flac)",
173
- )
174
- lang = gr.Dropdown(
175
- label="Dataset Language",
176
- value="en",
177
- choices=[
178
- "en",
179
- "es",
180
- "fr",
181
- "de",
182
- "it",
183
- "pt",
184
- "pl",
185
- "tr",
186
- "ru",
187
- "nl",
188
- "cs",
189
- "ar",
190
- "zh",
191
- "hu",
192
- "ko",
193
- "ja"
194
- ],
195
- )
196
- progress_data = gr.Label(
197
- label="Progress:"
198
- )
199
- logs = gr.Textbox(
200
- label="Logs:",
201
- interactive=False,
202
- )
203
- demo.load(read_logs, None, logs, every=1)
204
-
205
- prompt_compute_btn = gr.Button(value="Step 1 - Create dataset")
206
-
207
- def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
208
- clear_gpu_cache()
209
- out_path = os.path.join(out_path, "dataset")
210
- os.makedirs(out_path, exist_ok=True)
211
- if audio_path is None:
212
- return "You should provide one or more audio files! If you did, the file upload has probably not finished yet!", "", ""
213
- else:
214
- try:
215
- train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
216
- except:
217
- traceback.print_exc()
218
- error = traceback.format_exc()
219
- return f"The data processing was interrupted due to an error!! Please check the console for the full error message! \n Error summary: {error}", "", ""
220
-
221
- clear_gpu_cache()
222
-
223
- # if audio total len is less than 2 minutes raise an error
224
- if audio_total_size < 120:
225
- message = "The sum of the duration of the audios that you provided should be at least 2 minutes!"
226
- print(message)
227
- return message, "", ""
228
-
229
- print("Dataset Processed!")
230
- return "Dataset Processed!", train_meta, eval_meta
231
-
232
- with gr.Tab("2 - Fine-tuning XTTS Encoder"):
233
- train_csv = gr.Textbox(
234
- label="Train CSV:",
235
- )
236
- eval_csv = gr.Textbox(
237
- label="Eval CSV:",
238
- )
239
- num_epochs = gr.Slider(
240
- label="Number of epochs:",
241
- minimum=1,
242
- maximum=100,
243
- step=1,
244
- value=args.num_epochs,
245
- )
246
- batch_size = gr.Slider(
247
- label="Batch size:",
248
- minimum=2,
249
- maximum=512,
250
- step=1,
251
- value=args.batch_size,
252
- )
253
- grad_acumm = gr.Slider(
254
- label="Grad accumulation steps:",
255
- minimum=2,
256
- maximum=128,
257
- step=1,
258
- value=args.grad_acumm,
259
- )
260
- max_audio_length = gr.Slider(
261
- label="Max permitted audio size in seconds:",
262
- minimum=2,
263
- maximum=20,
264
- step=1,
265
- value=args.max_audio_length,
266
- )
267
- progress_train = gr.Label(
268
- label="Progress:"
269
- )
270
- logs_tts_train = gr.Textbox(
271
- label="Logs:",
272
- interactive=False,
273
- )
274
- demo.load(read_logs, None, logs_tts_train, every=1)
275
- train_btn = gr.Button(value="Step 2 - Run the training")
276
-
277
- def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length):
278
- clear_gpu_cache()
279
- if not train_csv or not eval_csv:
280
- return "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", "", "", "", ""
281
- try:
282
- # convert seconds to waveform frames
283
- max_audio_length = int(max_audio_length * 22050)
284
- config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path, max_audio_length=max_audio_length)
285
- except:
286
- traceback.print_exc()
287
- error = traceback.format_exc()
288
- return f"The training was interrupted due to an error!! Please check the console for the full error message! \n Error summary: {error}", "", "", "", ""
289
-
290
- # copy the original files to avoid issues caused by parameter changes
291
- os.system(f"cp {config_path} {exp_path}")
292
- os.system(f"cp {vocab_file} {exp_path}")
293
-
294
- ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
295
- print("Model training done!")
296
- clear_gpu_cache()
297
- return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
298
-
299
- with gr.Tab("3 - Inference"):
300
- with gr.Row():
301
- with gr.Column() as col1:
302
- xtts_checkpoint = gr.Textbox(
303
- label="XTTS checkpoint path:",
304
- value="",
305
- )
306
- xtts_config = gr.Textbox(
307
- label="XTTS config path:",
308
- value="",
309
- )
310
-
311
- xtts_vocab = gr.Textbox(
312
- label="XTTS vocab path:",
313
- value="",
314
- )
315
- progress_load = gr.Label(
316
- label="Progress:"
317
- )
318
- load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model")
319
-
320
- with gr.Column() as col2:
321
- speaker_reference_audio = gr.Textbox(
322
- label="Speaker reference audio:",
323
- value="",
324
- )
325
- tts_language = gr.Dropdown(
326
- label="Language",
327
- value="en",
328
- choices=[
329
- "en",
330
- "es",
331
- "fr",
332
- "de",
333
- "it",
334
- "pt",
335
- "pl",
336
- "tr",
337
- "ru",
338
- "nl",
339
- "cs",
340
- "ar",
341
- "zh",
342
- "hu",
343
- "ko",
344
- "ja",
345
- ]
346
- )
347
- tts_text = gr.Textbox(
348
- label="Input Text.",
349
- value="This model sounds really good and above all, it's reasonably fast.",
350
- )
351
- tts_btn = gr.Button(value="Step 4 - Inference")
352
-
353
- with gr.Column() as col3:
354
- progress_gen = gr.Label(
355
- label="Progress:"
356
- )
357
- tts_output_audio = gr.Audio(label="Generated Audio.")
358
- reference_audio = gr.Audio(label="Reference audio used.")
359
-
360
- prompt_compute_btn.click(
361
- fn=preprocess_dataset,
362
- inputs=[
363
- upload_file,
364
- lang,
365
- out_path,
366
- ],
367
- outputs=[
368
- progress_data,
369
- train_csv,
370
- eval_csv,
371
- ],
372
- )
373
-
374
-
375
- train_btn.click(
376
- fn=train_model,
377
- inputs=[
378
- lang,
379
- train_csv,
380
- eval_csv,
381
- num_epochs,
382
- batch_size,
383
- grad_acumm,
384
- out_path,
385
- max_audio_length,
386
- ],
387
- outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
388
- )
389
-
390
- load_btn.click(
391
- fn=load_model,
392
- inputs=[
393
- xtts_checkpoint,
394
- xtts_config,
395
- xtts_vocab
396
- ],
397
- outputs=[progress_load],
398
- )
399
-
400
- tts_btn.click(
401
- fn=run_tts,
402
- inputs=[
403
- tts_language,
404
- tts_text,
405
- speaker_reference_audio,
406
- ],
407
- outputs=[progress_gen, tts_output_audio, reference_audio],
408
- )
409
-
410
- demo.launch(
411
- share=True,
412
- debug=False,
413
- server_port=args.port,
414
- server_name="0.0.0.0"
415
- )
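The same inference path wired up by `load_model` and `run_tts` above can be exercised without the Gradio UI; this is a minimal sketch assuming hypothetical checkpoint, config, vocab, and reference-audio paths:

```python
import torch
import torchaudio

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Hypothetical paths produced by the fine-tuning step.
config = XttsConfig()
config.load_json("/tmp/xtts_ft/run/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path="/tmp/xtts_ft/run/best_model.pth",
    vocab_path="/tmp/xtts_ft/run/vocab.json",
    use_deepspeed=False,
)
if torch.cuda.is_available():
    model.cuda()

# Condition on a reference speaker and synthesize, as run_tts() does.
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
    audio_path="speaker_reference.wav",          # placeholder reference clip
    gpt_cond_len=config.gpt_cond_len,
    max_ref_length=config.max_ref_len,
    sound_norm_refs=config.sound_norm_refs,
)
out = model.inference(
    text="This model sounds really good and above all, it's reasonably fast.",
    language="en",
    gpt_cond_latent=gpt_cond_latent,
    speaker_embedding=speaker_embedding,
    temperature=config.temperature,
)
torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
```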
TTS/encoder/README.md DELETED
@@ -1,18 +0,0 @@
1
- ### Speaker Encoder
2
-
3
- This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
4
-
5
- With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
6
-
7
- Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
8
-
9
- ![](umap.png)
10
-
11
- Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
12
-
13
- To run the code, you need to follow the same flow as in TTS.
14
-
15
- - Define 'config.json' for your needs. Note that the audio parameters should match those of your TTS model.
16
- - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
17
- - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
18
- - Watch training on Tensorboard as in TTS
TTS/encoder/__init__.py DELETED
File without changes
TTS/encoder/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (165 Bytes)
 
TTS/encoder/__pycache__/losses.cpython-310.pyc DELETED
Binary file (7.81 kB)
 
TTS/encoder/configs/base_encoder_config.py DELETED
@@ -1,61 +0,0 @@
1
- from dataclasses import asdict, dataclass, field
2
- from typing import Dict, List
3
-
4
- from coqpit import MISSING
5
-
6
- from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
7
-
8
-
9
- @dataclass
10
- class BaseEncoderConfig(BaseTrainingConfig):
11
- """Defines parameters for a Generic Encoder model."""
12
-
13
- model: str = None
14
- audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
15
- datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
16
- # model params
17
- model_params: Dict = field(
18
- default_factory=lambda: {
19
- "model_name": "lstm",
20
- "input_dim": 80,
21
- "proj_dim": 256,
22
- "lstm_dim": 768,
23
- "num_lstm_layers": 3,
24
- "use_lstm_with_projection": True,
25
- }
26
- )
27
-
28
- audio_augmentation: Dict = field(default_factory=lambda: {})
29
-
30
- # training params
31
- epochs: int = 10000
32
- loss: str = "angleproto"
33
- grad_clip: float = 3.0
34
- lr: float = 0.0001
35
- optimizer: str = "radam"
36
- optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
37
- lr_decay: bool = False
38
- warmup_steps: int = 4000
39
-
40
- # logging params
41
- tb_model_param_stats: bool = False
42
- steps_plot_stats: int = 10
43
- save_step: int = 1000
44
- print_step: int = 20
45
- run_eval: bool = False
46
-
47
- # data loader
48
- num_classes_in_batch: int = MISSING
49
- num_utter_per_class: int = MISSING
50
- eval_num_classes_in_batch: int = None
51
- eval_num_utter_per_class: int = None
52
-
53
- num_loader_workers: int = MISSING
54
- voice_len: float = 1.6
55
-
56
- def check_values(self):
57
- super().check_values()
58
- c = asdict(self)
59
- assert (
60
- c["model_params"]["input_dim"] == self.audio.num_mels
61
- ), " [!] model input dimension must be equal to the melspectrogram dimension."
TTS/encoder/configs/emotion_encoder_config.py DELETED
@@ -1,12 +0,0 @@
1
- from dataclasses import asdict, dataclass
2
-
3
- from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4
-
5
-
6
- @dataclass
7
- class EmotionEncoderConfig(BaseEncoderConfig):
8
- """Defines parameters for Emotion Encoder model."""
9
-
10
- model: str = "emotion_encoder"
11
- map_classid_to_classname: dict = None
12
- class_name_key: str = "emotion_name"
TTS/encoder/configs/speaker_encoder_config.py DELETED
@@ -1,11 +0,0 @@
1
- from dataclasses import asdict, dataclass
2
-
3
- from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4
-
5
-
6
- @dataclass
7
- class SpeakerEncoderConfig(BaseEncoderConfig):
8
- """Defines parameters for Speaker Encoder model."""
9
-
10
- model: str = "speaker_encoder"
11
- class_name_key: str = "speaker_name"
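A minimal sketch of instantiating this config; the batch and loader values are illustrative stand-ins for the fields marked MISSING in `BaseEncoderConfig`:

```python
from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig

# Illustrative values for the required (MISSING) fields.
config = SpeakerEncoderConfig(
    num_classes_in_batch=64,
    num_utter_per_class=10,
    num_loader_workers=4,
)
print(config.model, config.model_params["model_name"], config.loss)
# -> speaker_encoder lstm angleproto
```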
TTS/encoder/dataset.py DELETED
@@ -1,147 +0,0 @@
1
- import random
2
-
3
- import torch
4
- from torch.utils.data import Dataset
5
-
6
- from TTS.encoder.utils.generic_utils import AugmentWAV
7
-
8
-
9
- class EncoderDataset(Dataset):
10
- def __init__(
11
- self,
12
- config,
13
- ap,
14
- meta_data,
15
- voice_len=1.6,
16
- num_classes_in_batch=64,
17
- num_utter_per_class=10,
18
- verbose=False,
19
- augmentation_config=None,
20
- use_torch_spec=None,
21
- ):
22
- """
23
- Args:
24
- ap (TTS.tts.utils.AudioProcessor): audio processor object.
25
- meta_data (list): list of dataset instances.
26
- voice_len (float): voice segment length in seconds.
27
- verbose (bool): print diagnostic information.
28
- """
29
- super().__init__()
30
- self.config = config
31
- self.items = meta_data
32
- self.sample_rate = ap.sample_rate
33
- self.seq_len = int(voice_len * self.sample_rate)
34
- self.num_utter_per_class = num_utter_per_class
35
- self.ap = ap
36
- self.verbose = verbose
37
- self.use_torch_spec = use_torch_spec
38
- self.classes, self.items = self.__parse_items()
39
-
40
- self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
41
-
42
- # Data Augmentation
43
- self.augmentator = None
44
- self.gaussian_augmentation_config = None
45
- if augmentation_config:
46
- self.data_augmentation_p = augmentation_config["p"]
47
- if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
48
- self.augmentator = AugmentWAV(ap, augmentation_config)
49
-
50
- if "gaussian" in augmentation_config.keys():
51
- self.gaussian_augmentation_config = augmentation_config["gaussian"]
52
-
53
- if self.verbose:
54
- print("\n > DataLoader initialization")
55
- print(f" | > Classes per Batch: {num_classes_in_batch}")
56
- print(f" | > Number of instances : {len(self.items)}")
57
- print(f" | > Sequence length: {self.seq_len}")
58
- print(f" | > Num Classes: {len(self.classes)}")
59
- print(f" | > Classes: {self.classes}")
60
-
61
- def load_wav(self, filename):
62
- audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
63
- return audio
64
-
65
- def __parse_items(self):
66
- class_to_utters = {}
67
- for item in self.items:
68
- path_ = item["audio_file"]
69
- class_name = item[self.config.class_name_key]
70
- if class_name in class_to_utters.keys():
71
- class_to_utters[class_name].append(path_)
72
- else:
73
- class_to_utters[class_name] = [
74
- path_,
75
- ]
76
-
77
- # keep only classes with at least self.num_utter_per_class samples
78
- class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}
79
-
80
- classes = list(class_to_utters.keys())
81
- classes.sort()
82
-
83
- new_items = []
84
- for item in self.items:
85
- path_ = item["audio_file"]
86
- class_name = item["emotion_name"] if self.config.model == "emotion_encoder" else item["speaker_name"]
87
- # ignore filtered classes
88
- if class_name not in classes:
89
- continue
90
- # ignore small audios
91
- if self.load_wav(path_).shape[0] - self.seq_len <= 0:
92
- continue
93
-
94
- new_items.append({"wav_file_path": path_, "class_name": class_name})
95
-
96
- return classes, new_items
97
-
98
- def __len__(self):
99
- return len(self.items)
100
-
101
- def get_num_classes(self):
102
- return len(self.classes)
103
-
104
- def get_class_list(self):
105
- return self.classes
106
-
107
- def set_classes(self, classes):
108
- self.classes = classes
109
- self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
110
-
111
- def get_map_classid_to_classname(self):
112
- return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items())
113
-
114
- def __getitem__(self, idx):
115
- return self.items[idx]
116
-
117
- def collate_fn(self, batch):
118
- # get the batch class_ids
119
- labels = []
120
- feats = []
121
- for item in batch:
122
- utter_path = item["wav_file_path"]
123
- class_name = item["class_name"]
124
-
125
- # get classid
126
- class_id = self.classname_to_classid[class_name]
127
- # load wav file
128
- wav = self.load_wav(utter_path)
129
- offset = random.randint(0, wav.shape[0] - self.seq_len)
130
- wav = wav[offset : offset + self.seq_len]
131
-
132
- if self.augmentator is not None and self.data_augmentation_p:
133
- if random.random() < self.data_augmentation_p:
134
- wav = self.augmentator.apply_one(wav)
135
-
136
- if not self.use_torch_spec:
137
- mel = self.ap.melspectrogram(wav)
138
- feats.append(torch.FloatTensor(mel))
139
- else:
140
- feats.append(torch.FloatTensor(wav))
141
-
142
- labels.append(class_id)
143
-
144
- feats = torch.stack(feats)
145
- labels = torch.LongTensor(labels)
146
-
147
- return feats, labels
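A minimal sketch of feeding `EncoderDataset` to a plain PyTorch `DataLoader` with its own `collate_fn`; `config`, `ap`, and `meta_data` are assumed to exist already (an encoder config, an AudioProcessor, and a list of items with audio file and speaker/emotion name keys), and the flat batch size is illustrative since the actual training loop uses a class-balanced sampler:

```python
from torch.utils.data import DataLoader

# `config`, `ap`, and `meta_data` are assumed to be defined elsewhere.
dataset = EncoderDataset(
    config,
    ap,
    meta_data,
    voice_len=1.6,
    num_classes_in_batch=64,
    num_utter_per_class=10,
)
loader = DataLoader(
    dataset,
    batch_size=64 * 10,             # illustrative; training uses a class-balanced sampler
    collate_fn=dataset.collate_fn,  # returns (feats, labels) tensors
    num_workers=config.num_loader_workers,
)
feats, labels = next(iter(loader))  # feats: mel or waveform segments, labels: class ids
```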
TTS/encoder/losses.py DELETED
@@ -1,226 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
-
5
-
6
- # adapted from https://github.com/cvqluu/GE2E-Loss
7
- class GE2ELoss(nn.Module):
8
- def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
9
- """
10
- Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
11
- Accepts an input of size (N, M, D)
12
- where N is the number of speakers in the batch,
13
- M is the number of utterances per speaker,
14
- and D is the dimensionality of the embedding vector (e.g. d-vector)
15
- Args:
16
- - init_w (float): defines the initial value of w in Equation (5) of [1]
17
- - init_b (float): defines the initial value of b in Equation (5) of [1]
18
- """
19
- super().__init__()
20
- # pylint: disable=E1102
21
- self.w = nn.Parameter(torch.tensor(init_w))
22
- # pylint: disable=E1102
23
- self.b = nn.Parameter(torch.tensor(init_b))
24
- self.loss_method = loss_method
25
-
26
- print(" > Initialized Generalized End-to-End loss")
27
-
28
- assert self.loss_method in ["softmax", "contrast"]
29
-
30
- if self.loss_method == "softmax":
31
- self.embed_loss = self.embed_loss_softmax
32
- if self.loss_method == "contrast":
33
- self.embed_loss = self.embed_loss_contrast
34
-
35
- # pylint: disable=R0201
36
- def calc_new_centroids(self, dvecs, centroids, spkr, utt):
37
- """
38
- Calculates the new centroids excluding the reference utterance
39
- """
40
- excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
41
- excl = torch.mean(excl, 0)
42
- new_centroids = []
43
- for i, centroid in enumerate(centroids):
44
- if i == spkr:
45
- new_centroids.append(excl)
46
- else:
47
- new_centroids.append(centroid)
48
- return torch.stack(new_centroids)
49
-
50
- def calc_cosine_sim(self, dvecs, centroids):
51
- """
52
- Make the cosine similarity matrix with dims (N,M,N)
53
- """
54
- cos_sim_matrix = []
55
- for spkr_idx, speaker in enumerate(dvecs):
56
- cs_row = []
57
- for utt_idx, utterance in enumerate(speaker):
58
- new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx)
59
- # vector based cosine similarity for speed
60
- cs_row.append(
61
- torch.clamp(
62
- torch.mm(
63
- utterance.unsqueeze(1).transpose(0, 1),
64
- new_centroids.transpose(0, 1),
65
- )
66
- / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
67
- 1e-6,
68
- )
69
- )
70
- cs_row = torch.cat(cs_row, dim=0)
71
- cos_sim_matrix.append(cs_row)
72
- return torch.stack(cos_sim_matrix)
73
-
74
- # pylint: disable=R0201
75
- def embed_loss_softmax(self, dvecs, cos_sim_matrix):
76
- """
77
- Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
78
- """
79
- N, M, _ = dvecs.shape
80
- L = []
81
- for j in range(N):
82
- L_row = []
83
- for i in range(M):
84
- L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
85
- L_row = torch.stack(L_row)
86
- L.append(L_row)
87
- return torch.stack(L)
88
-
89
- # pylint: disable=R0201
90
- def embed_loss_contrast(self, dvecs, cos_sim_matrix):
91
- """
92
- Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
93
- """
94
- N, M, _ = dvecs.shape
95
- L = []
96
- for j in range(N):
97
- L_row = []
98
- for i in range(M):
99
- centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
100
- excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j + 1 :]))
101
- L_row.append(1.0 - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
102
- L_row = torch.stack(L_row)
103
- L.append(L_row)
104
- return torch.stack(L)
105
-
106
- def forward(self, x, _label=None):
107
- """
108
- Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
109
- """
110
-
111
- assert x.size()[1] >= 2
112
-
113
- centroids = torch.mean(x, 1)
114
- cos_sim_matrix = self.calc_cosine_sim(x, centroids)
115
- torch.clamp(self.w, 1e-6)
116
- cos_sim_matrix = self.w * cos_sim_matrix + self.b
117
- L = self.embed_loss(x, cos_sim_matrix)
118
- return L.mean()
119
-
120
-
121
- # adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
122
- class AngleProtoLoss(nn.Module):
123
- """
124
- Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
125
- Accepts an input of size (N, M, D)
126
- where N is the number of speakers in the batch,
127
- M is the number of utterances per speaker,
128
- and D is the dimensionality of the embedding vector
129
- Args:
130
- - init_w (float): defines the initial value of w
131
- - init_b (float): defines the initial value of b
132
- """
133
-
134
- def __init__(self, init_w=10.0, init_b=-5.0):
135
- super().__init__()
136
- # pylint: disable=E1102
137
- self.w = nn.Parameter(torch.tensor(init_w))
138
- # pylint: disable=E1102
139
- self.b = nn.Parameter(torch.tensor(init_b))
140
- self.criterion = torch.nn.CrossEntropyLoss()
141
-
142
- print(" > Initialized Angular Prototypical loss")
143
-
144
- def forward(self, x, _label=None):
145
- """
146
- Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
147
- """
148
-
149
- assert x.size()[1] >= 2
150
-
151
- out_anchor = torch.mean(x[:, 1:, :], 1)
152
- out_positive = x[:, 0, :]
153
- num_speakers = out_anchor.size()[0]
154
-
155
- cos_sim_matrix = F.cosine_similarity(
156
- out_positive.unsqueeze(-1).expand(-1, -1, num_speakers),
157
- out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2),
158
- )
159
- torch.clamp(self.w, 1e-6)
160
- cos_sim_matrix = cos_sim_matrix * self.w + self.b
161
- label = torch.arange(num_speakers).to(cos_sim_matrix.device)
162
- L = self.criterion(cos_sim_matrix, label)
163
- return L
164
-
165
-
166
- class SoftmaxLoss(nn.Module):
167
- """
168
- Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982
169
- Args:
170
- - embedding_dim (float): speaker embedding dim
171
- - n_speakers (float): number of speakers
172
- """
173
-
174
- def __init__(self, embedding_dim, n_speakers):
175
- super().__init__()
176
-
177
- self.criterion = torch.nn.CrossEntropyLoss()
178
- self.fc = nn.Linear(embedding_dim, n_speakers)
179
-
180
- print("Initialised Softmax Loss")
181
-
182
- def forward(self, x, label=None):
183
- # reshape for compatibility
184
- x = x.reshape(-1, x.size()[-1])
185
- label = label.reshape(-1)
186
-
187
- x = self.fc(x)
188
- L = self.criterion(x, label)
189
-
190
- return L
191
-
192
- def inference(self, embedding):
193
- x = self.fc(embedding)
194
- activations = torch.nn.functional.softmax(x, dim=1).squeeze(0)
195
- class_id = torch.argmax(activations)
196
- return class_id
197
-
198
-
199
- class SoftmaxAngleProtoLoss(nn.Module):
200
- """
201
- Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153
202
- Args:
203
- - embedding_dim (float): speaker embedding dim
204
- - n_speakers (float): number of speakers
205
- - init_w (float): defines the initial value of w
206
- - init_b (float): defines the initial value of b
207
- """
208
-
209
- def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
210
- super().__init__()
211
-
212
- self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
213
- self.angleproto = AngleProtoLoss(init_w, init_b)
214
-
215
- print("Initialised SoftmaxAnglePrototypical Loss")
216
-
217
- def forward(self, x, label=None):
218
- """
219
- Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
220
- """
221
-
222
- Lp = self.angleproto(x)
223
-
224
- Ls = self.softmax(x, label)
225
-
226
- return Ls + Lp
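A minimal sketch of how these losses consume embeddings; shapes follow the (N, M, D) convention documented above, with random tensors standing in for real d-vectors:

```python
import torch

from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss

N, M, D = 4, 5, 256                  # speakers per batch, utterances per speaker, embedding dim
dvecs = torch.randn(N, M, D)         # stand-in for encoder outputs

print(GE2ELoss(loss_method="softmax")(dvecs))   # scalar GE2E loss
print(AngleProtoLoss()(dvecs))                  # scalar angular prototypical loss

labels = torch.arange(N).unsqueeze(1).expand(N, M)   # one class id per utterance
print(SoftmaxAngleProtoLoss(embedding_dim=D, n_speakers=N)(dvecs, labels))
```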
TTS/encoder/models/__pycache__/base_encoder.cpython-310.pyc DELETED
Binary file (4.53 kB)
 
TTS/encoder/models/__pycache__/lstm.cpython-310.pyc DELETED
Binary file (3.61 kB)
 
TTS/encoder/models/__pycache__/resnet.cpython-310.pyc DELETED
Binary file (5.84 kB)
 
TTS/encoder/models/base_encoder.py DELETED
@@ -1,161 +0,0 @@
1
- import numpy as np
2
- import torch
3
- import torchaudio
4
- from coqpit import Coqpit
5
- from torch import nn
6
-
7
- from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
8
- from TTS.utils.generic_utils import set_init_dict
9
- from TTS.utils.io import load_fsspec
10
-
11
-
12
- class PreEmphasis(nn.Module):
13
- def __init__(self, coefficient=0.97):
14
- super().__init__()
15
- self.coefficient = coefficient
16
- self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0))
17
-
18
- def forward(self, x):
19
- assert len(x.size()) == 2
20
-
21
- x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
22
- return torch.nn.functional.conv1d(x, self.filter).squeeze(1)
23
-
24
-
25
- class BaseEncoder(nn.Module):
26
- """Base `encoder` class. Every new `encoder` model must inherit this.
27
-
28
- It defines common `encoder` specific functions.
29
- """
30
-
31
- # pylint: disable=W0102
32
- def __init__(self):
33
- super(BaseEncoder, self).__init__()
34
-
35
- def get_torch_mel_spectrogram_class(self, audio_config):
36
- return torch.nn.Sequential(
37
- PreEmphasis(audio_config["preemphasis"]),
38
- # TorchSTFT(
39
- # n_fft=audio_config["fft_size"],
40
- # hop_length=audio_config["hop_length"],
41
- # win_length=audio_config["win_length"],
42
- # sample_rate=audio_config["sample_rate"],
43
- # window="hamming_window",
44
- # mel_fmin=0.0,
45
- # mel_fmax=None,
46
- # use_htk=True,
47
- # do_amp_to_db=False,
48
- # n_mels=audio_config["num_mels"],
49
- # power=2.0,
50
- # use_mel=True,
51
- # mel_norm=None,
52
- # )
53
- torchaudio.transforms.MelSpectrogram(
54
- sample_rate=audio_config["sample_rate"],
55
- n_fft=audio_config["fft_size"],
56
- win_length=audio_config["win_length"],
57
- hop_length=audio_config["hop_length"],
58
- window_fn=torch.hamming_window,
59
- n_mels=audio_config["num_mels"],
60
- ),
61
- )
62
-
63
- @torch.no_grad()
64
- def inference(self, x, l2_norm=True):
65
- return self.forward(x, l2_norm)
66
-
67
- @torch.no_grad()
68
- def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True):
69
- """
70
- Generate embeddings for a batch of utterances
71
- x: 1xTxD
72
- """
73
- # map to the waveform size
74
- if self.use_torch_spec:
75
- num_frames = num_frames * self.audio_config["hop_length"]
76
-
77
- max_len = x.shape[1]
78
-
79
- if max_len < num_frames:
80
- num_frames = max_len
81
-
82
- offsets = np.linspace(0, max_len - num_frames, num=num_eval)
83
-
84
- frames_batch = []
85
- for offset in offsets:
86
- offset = int(offset)
87
- end_offset = int(offset + num_frames)
88
- frames = x[:, offset:end_offset]
89
- frames_batch.append(frames)
90
-
91
- frames_batch = torch.cat(frames_batch, dim=0)
92
- embeddings = self.inference(frames_batch, l2_norm=l2_norm)
93
-
94
- if return_mean:
95
- embeddings = torch.mean(embeddings, dim=0, keepdim=True)
96
- return embeddings
97
-
98
- def get_criterion(self, c: Coqpit, num_classes=None):
99
- if c.loss == "ge2e":
100
- criterion = GE2ELoss(loss_method="softmax")
101
- elif c.loss == "angleproto":
102
- criterion = AngleProtoLoss()
103
- elif c.loss == "softmaxproto":
104
- criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes)
105
- else:
106
- raise Exception("%s is not a supported loss" % c.loss)
107
- return criterion
108
-
109
- def load_checkpoint(
110
- self,
111
- config: Coqpit,
112
- checkpoint_path: str,
113
- eval: bool = False,
114
- use_cuda: bool = False,
115
- criterion=None,
116
- cache=False,
117
- ):
118
- state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
119
- try:
120
- self.load_state_dict(state["model"])
121
- print(" > Model fully restored. ")
122
- except (KeyError, RuntimeError) as error:
123
- # If eval raise the error
124
- if eval:
125
- raise error
126
-
127
- print(" > Partial model initialization.")
128
- model_dict = self.state_dict()
129
- model_dict = set_init_dict(model_dict, state["model"], config)
130
- self.load_state_dict(model_dict)
131
- del model_dict
132
-
133
- # load the criterion for restore_path
134
- if criterion is not None and "criterion" in state:
135
- try:
136
- criterion.load_state_dict(state["criterion"])
137
- except (KeyError, RuntimeError) as error:
138
- print(" > Criterion load ignored because of:", error)
139
-
140
- # instance and load the criterion for the encoder classifier in inference time
141
- if (
142
- eval
143
- and criterion is None
144
- and "criterion" in state
145
- and getattr(config, "map_classid_to_classname", None) is not None
146
- ):
147
- criterion = self.get_criterion(config, len(config.map_classid_to_classname))
148
- criterion.load_state_dict(state["criterion"])
149
-
150
- if use_cuda:
151
- self.cuda()
152
- if criterion is not None:
153
- criterion = criterion.cuda()
154
-
155
- if eval:
156
- self.eval()
157
- assert not self.training
158
-
159
- if not eval:
160
- return criterion, state["step"]
161
- return criterion
TTS/encoder/models/lstm.py DELETED
@@ -1,99 +0,0 @@
1
- import torch
2
- from torch import nn
3
-
4
- from TTS.encoder.models.base_encoder import BaseEncoder
5
-
6
-
7
- class LSTMWithProjection(nn.Module):
8
- def __init__(self, input_size, hidden_size, proj_size):
9
- super().__init__()
10
- self.input_size = input_size
11
- self.hidden_size = hidden_size
12
- self.proj_size = proj_size
13
- self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
14
- self.linear = nn.Linear(hidden_size, proj_size, bias=False)
15
-
16
- def forward(self, x):
17
- self.lstm.flatten_parameters()
18
- o, (_, _) = self.lstm(x)
19
- return self.linear(o)
20
-
21
-
22
- class LSTMWithoutProjection(nn.Module):
23
- def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
24
- super().__init__()
25
- self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True)
26
- self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
27
- self.relu = nn.ReLU()
28
-
29
- def forward(self, x):
30
- _, (hidden, _) = self.lstm(x)
31
- return self.relu(self.linear(hidden[-1]))
32
-
33
-
34
- class LSTMSpeakerEncoder(BaseEncoder):
35
- def __init__(
36
- self,
37
- input_dim,
38
- proj_dim=256,
39
- lstm_dim=768,
40
- num_lstm_layers=3,
41
- use_lstm_with_projection=True,
42
- use_torch_spec=False,
43
- audio_config=None,
44
- ):
45
- super().__init__()
46
- self.use_lstm_with_projection = use_lstm_with_projection
47
- self.use_torch_spec = use_torch_spec
48
- self.audio_config = audio_config
49
- self.proj_dim = proj_dim
50
-
51
- layers = []
52
- # choose the LSTM layer variant
53
- if use_lstm_with_projection:
54
- layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
55
- for _ in range(num_lstm_layers - 1):
56
- layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
57
- self.layers = nn.Sequential(*layers)
58
- else:
59
- self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)
60
-
61
- self.instancenorm = nn.InstanceNorm1d(input_dim)
62
-
63
- if self.use_torch_spec:
64
- self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
65
- else:
66
- self.torch_spec = None
67
-
68
- self._init_layers()
69
-
70
- def _init_layers(self):
71
- for name, param in self.layers.named_parameters():
72
- if "bias" in name:
73
- nn.init.constant_(param, 0.0)
74
- elif "weight" in name:
75
- nn.init.xavier_normal_(param)
76
-
77
- def forward(self, x, l2_norm=True):
78
- """Forward pass of the model.
79
-
80
- Args:
81
- x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
82
- to compute the spectrogram on-the-fly.
83
- l2_norm (bool): Whether to L2-normalize the outputs.
84
-
85
- Shapes:
86
- - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
87
- """
88
- with torch.no_grad():
89
- with torch.cuda.amp.autocast(enabled=False):
90
- if self.use_torch_spec:
91
- x.squeeze_(1)
92
- x = self.torch_spec(x)
93
- x = self.instancenorm(x).transpose(1, 2)
94
- d = self.layers(x)
95
- if self.use_lstm_with_projection:
96
- d = d[:, -1]
97
- if l2_norm:
98
- d = torch.nn.functional.normalize(d, p=2, dim=1)
99
- return d
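A minimal sketch of running the LSTM speaker encoder on pre-computed mel frames (with `use_torch_spec` left at its default of `False`); the shapes follow the docstring above:

```python
import torch

from TTS.encoder.models.lstm import LSTMSpeakerEncoder

model = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
mels = torch.randn(2, 80, 250)        # (N, D_spec, T) mel-spectrogram frames
d_vectors = model(mels, l2_norm=True)
print(d_vectors.shape)                # torch.Size([2, 256]), L2-normalised d-vectors
```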
TTS/encoder/models/resnet.py DELETED
@@ -1,198 +0,0 @@
1
- import torch
2
- from torch import nn
3
-
4
- # from TTS.utils.audio.torch_transforms import TorchSTFT
5
- from TTS.encoder.models.base_encoder import BaseEncoder
6
-
7
-
8
- class SELayer(nn.Module):
9
- def __init__(self, channel, reduction=8):
10
- super(SELayer, self).__init__()
11
- self.avg_pool = nn.AdaptiveAvgPool2d(1)
12
- self.fc = nn.Sequential(
13
- nn.Linear(channel, channel // reduction),
14
- nn.ReLU(inplace=True),
15
- nn.Linear(channel // reduction, channel),
16
- nn.Sigmoid(),
17
- )
18
-
19
- def forward(self, x):
20
- b, c, _, _ = x.size()
21
- y = self.avg_pool(x).view(b, c)
22
- y = self.fc(y).view(b, c, 1, 1)
23
- return x * y
24
-
25
-
26
- class SEBasicBlock(nn.Module):
27
- expansion = 1
28
-
29
- def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
30
- super(SEBasicBlock, self).__init__()
31
- self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
32
- self.bn1 = nn.BatchNorm2d(planes)
33
- self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
34
- self.bn2 = nn.BatchNorm2d(planes)
35
- self.relu = nn.ReLU(inplace=True)
36
- self.se = SELayer(planes, reduction)
37
- self.downsample = downsample
38
- self.stride = stride
39
-
40
- def forward(self, x):
41
- residual = x
42
-
43
- out = self.conv1(x)
44
- out = self.relu(out)
45
- out = self.bn1(out)
46
-
47
- out = self.conv2(out)
48
- out = self.bn2(out)
49
- out = self.se(out)
50
-
51
- if self.downsample is not None:
52
- residual = self.downsample(x)
53
-
54
- out += residual
55
- out = self.relu(out)
56
- return out
57
-
58
-
59
- class ResNetSpeakerEncoder(BaseEncoder):
60
- """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
61
- Adapted from: https://github.com/clovaai/voxceleb_trainer
62
- """
63
-
64
- # pylint: disable=W0102
65
- def __init__(
66
- self,
67
- input_dim=64,
68
- proj_dim=512,
69
- layers=[3, 4, 6, 3],
70
- num_filters=[32, 64, 128, 256],
71
- encoder_type="ASP",
72
- log_input=False,
73
- use_torch_spec=False,
74
- audio_config=None,
75
- ):
76
- super(ResNetSpeakerEncoder, self).__init__()
77
-
78
- self.encoder_type = encoder_type
79
- self.input_dim = input_dim
80
- self.log_input = log_input
81
- self.use_torch_spec = use_torch_spec
82
- self.audio_config = audio_config
83
- self.proj_dim = proj_dim
84
-
85
- self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1)
86
- self.relu = nn.ReLU(inplace=True)
87
- self.bn1 = nn.BatchNorm2d(num_filters[0])
88
-
89
- self.inplanes = num_filters[0]
90
- self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0])
91
- self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2))
92
- self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2))
93
- self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2))
94
-
95
- self.instancenorm = nn.InstanceNorm1d(input_dim)
96
-
97
- if self.use_torch_spec:
98
- self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
99
- else:
100
- self.torch_spec = None
101
-
102
- outmap_size = int(self.input_dim / 8)
103
-
104
- self.attention = nn.Sequential(
105
- nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1),
106
- nn.ReLU(),
107
- nn.BatchNorm1d(128),
108
- nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1),
109
- nn.Softmax(dim=2),
110
- )
111
-
112
- if self.encoder_type == "SAP":
113
- out_dim = num_filters[3] * outmap_size
114
- elif self.encoder_type == "ASP":
115
- out_dim = num_filters[3] * outmap_size * 2
116
- else:
117
- raise ValueError("Undefined encoder")
118
-
119
- self.fc = nn.Linear(out_dim, proj_dim)
120
-
121
- self._init_layers()
122
-
123
- def _init_layers(self):
124
- for m in self.modules():
125
- if isinstance(m, nn.Conv2d):
126
- nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
127
- elif isinstance(m, nn.BatchNorm2d):
128
- nn.init.constant_(m.weight, 1)
129
- nn.init.constant_(m.bias, 0)
130
-
131
- def create_layer(self, block, planes, blocks, stride=1):
132
- downsample = None
133
- if stride != 1 or self.inplanes != planes * block.expansion:
134
- downsample = nn.Sequential(
135
- nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
136
- nn.BatchNorm2d(planes * block.expansion),
137
- )
138
-
139
- layers = []
140
- layers.append(block(self.inplanes, planes, stride, downsample))
141
- self.inplanes = planes * block.expansion
142
- for _ in range(1, blocks):
143
- layers.append(block(self.inplanes, planes))
144
-
145
- return nn.Sequential(*layers)
146
-
147
- # pylint: disable=R0201
148
- def new_parameter(self, *size):
149
- out = nn.Parameter(torch.FloatTensor(*size))
150
- nn.init.xavier_normal_(out)
151
- return out
152
-
153
- def forward(self, x, l2_norm=False):
154
- """Forward pass of the model.
155
-
156
- Args:
157
- x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
158
- to compute the spectrogram on-the-fly.
159
- l2_norm (bool): Whether to L2-normalize the outputs.
160
-
161
- Shapes:
162
- - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
163
- """
164
- x.squeeze_(1)
165
- # if use_torch_spec is enabled, compute the spectrogram here; otherwise use the mel spec computed by the AudioProcessor
166
- if self.use_torch_spec:
167
- x = self.torch_spec(x)
168
-
169
- if self.log_input:
170
- x = (x + 1e-6).log()
171
- x = self.instancenorm(x).unsqueeze(1)
172
-
173
- x = self.conv1(x)
174
- x = self.relu(x)
175
- x = self.bn1(x)
176
-
177
- x = self.layer1(x)
178
- x = self.layer2(x)
179
- x = self.layer3(x)
180
- x = self.layer4(x)
181
-
182
- x = x.reshape(x.size()[0], -1, x.size()[-1])
183
-
184
- w = self.attention(x)
185
-
186
- if self.encoder_type == "SAP":
187
- x = torch.sum(x * w, dim=2)
188
- elif self.encoder_type == "ASP":
189
- mu = torch.sum(x * w, dim=2)
190
- sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5))
191
- x = torch.cat((mu, sg), 1)
192
-
193
- x = x.view(x.size()[0], -1)
194
- x = self.fc(x)
195
-
196
- if l2_norm:
197
- x = torch.nn.functional.normalize(x, p=2, dim=1)
198
- return x
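A minimal sketch of the ResNet speaker encoder on mel input; `input_dim` must match the number of mel bands, and the values below are illustrative:

```python
import torch

from TTS.encoder.models.resnet import ResNetSpeakerEncoder

model = ResNetSpeakerEncoder(input_dim=64, proj_dim=512)
mels = torch.randn(3, 64, 400)        # (N, D_spec, T) mel-spectrogram frames
embeddings = model(mels, l2_norm=True)
print(embeddings.shape)               # torch.Size([3, 512]) speaker embeddings
```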
TTS/encoder/requirements.txt DELETED
@@ -1,2 +0,0 @@
1
- umap-learn
2
- numpy>=1.17.0
TTS/encoder/utils/__init__.py DELETED
File without changes
TTS/encoder/utils/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (171 Bytes)
 
TTS/encoder/utils/__pycache__/generic_utils.cpython-310.pyc DELETED
Binary file (3.7 kB)