ANYANTUDRE committed
Commit 7c7161e · 1 Parent(s): cfe3883

fixed typo in goai_stt_ttt_pipeline

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full changeset.
Files changed (50)
  1. Makefile +0 -3
  2. TTS/.models.json +0 -938
  3. TTS/VERSION +0 -1
  4. TTS/__init__.py +0 -6
  5. TTS/__pycache__/__init__.cpython-310.pyc +0 -0
  6. TTS/__pycache__/model.cpython-310.pyc +0 -0
  7. TTS/api.py +0 -458
  8. TTS/bin/__init__.py +0 -0
  9. TTS/bin/collect_env_info.py +0 -48
  10. TTS/bin/compute_attention_masks.py +0 -165
  11. TTS/bin/compute_embeddings.py +0 -197
  12. TTS/bin/compute_statistics.py +0 -96
  13. TTS/bin/eval_encoder.py +0 -88
  14. TTS/bin/extract_tts_spectrograms.py +0 -287
  15. TTS/bin/find_unique_chars.py +0 -45
  16. TTS/bin/find_unique_phonemes.py +0 -74
  17. TTS/bin/remove_silence_using_vad.py +0 -124
  18. TTS/bin/resample.py +0 -90
  19. TTS/bin/synthesize.py +0 -494
  20. TTS/bin/train_encoder.py +0 -332
  21. TTS/bin/train_tts.py +0 -71
  22. TTS/bin/train_vocoder.py +0 -77
  23. TTS/bin/tune_wavegrad.py +0 -103
  24. TTS/config/__init__.py +0 -135
  25. TTS/config/__pycache__/__init__.cpython-310.pyc +0 -0
  26. TTS/config/__pycache__/shared_configs.cpython-310.pyc +0 -0
  27. TTS/config/shared_configs.py +0 -268
  28. TTS/demos/xtts_ft_demo/requirements.txt +0 -2
  29. TTS/demos/xtts_ft_demo/utils/formatter.py +0 -160
  30. TTS/demos/xtts_ft_demo/utils/gpt_train.py +0 -172
  31. TTS/demos/xtts_ft_demo/xtts_demo.py +0 -415
  32. TTS/encoder/README.md +0 -18
  33. TTS/encoder/__init__.py +0 -0
  34. TTS/encoder/__pycache__/__init__.cpython-310.pyc +0 -0
  35. TTS/encoder/__pycache__/losses.cpython-310.pyc +0 -0
  36. TTS/encoder/configs/base_encoder_config.py +0 -61
  37. TTS/encoder/configs/emotion_encoder_config.py +0 -12
  38. TTS/encoder/configs/speaker_encoder_config.py +0 -11
  39. TTS/encoder/dataset.py +0 -147
  40. TTS/encoder/losses.py +0 -226
  41. TTS/encoder/models/__pycache__/base_encoder.cpython-310.pyc +0 -0
  42. TTS/encoder/models/__pycache__/lstm.cpython-310.pyc +0 -0
  43. TTS/encoder/models/__pycache__/resnet.cpython-310.pyc +0 -0
  44. TTS/encoder/models/base_encoder.py +0 -161
  45. TTS/encoder/models/lstm.py +0 -99
  46. TTS/encoder/models/resnet.py +0 -198
  47. TTS/encoder/requirements.txt +0 -2
  48. TTS/encoder/utils/__init__.py +0 -0
  49. TTS/encoder/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  50. TTS/encoder/utils/__pycache__/generic_utils.cpython-310.pyc +0 -0
Makefile CHANGED
@@ -5,9 +5,6 @@ install:
 test:
 	python app.py
 
-debug:
-	#python -m pytest -vv --pdb #Debugger is invoked
-
 format:
 	#black *.py
 
TTS/.models.json DELETED
@@ -1,938 +0,0 @@
1
- {
2
- "tts_models": {
3
- "multilingual": {
4
- "multi-dataset": {
5
- "xtts_v2": {
6
- "description": "XTTS-v2.0.3 by Coqui with 17 languages.",
7
- "hf_url": [
8
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
9
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
10
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
11
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5",
12
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/speakers_xtts.pth"
13
- ],
14
- "model_hash": "10f92b55c512af7a8d39d650547a15a7",
15
- "default_vocoder": null,
16
- "commit": "480a6cdf7",
17
- "license": "CPML",
18
- "contact": "[email protected]",
19
- "tos_required": true
20
- },
21
- "xtts_v1.1": {
22
- "description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.",
23
- "hf_url": [
24
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth",
25
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/config.json",
26
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json",
27
- "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/hash.md5"
28
- ],
29
- "model_hash": "7c62beaf58d39b729de287330dc254e7b515677416839b649a50e7cf74c3df59",
30
- "default_vocoder": null,
31
- "commit": "82910a63",
32
- "license": "CPML",
33
- "contact": "[email protected]",
34
- "tos_required": true
35
- },
36
- "your_tts": {
37
- "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
38
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
39
- "default_vocoder": null,
40
- "commit": "e9a1953e",
41
- "license": "CC BY-NC-ND 4.0",
42
- "contact": "[email protected]"
43
- },
44
- "bark": {
45
- "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
46
- "hf_url": [
47
- "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
48
- "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
49
- "https://coqui.gateway.scarf.sh/hf/text_2.pt",
50
- "https://coqui.gateway.scarf.sh/hf/bark/config.json",
51
- "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
52
- "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
53
- ],
54
- "default_vocoder": null,
55
- "commit": "e9a1953e",
56
- "license": "MIT",
57
- "contact": "https://www.suno.ai/"
58
- }
59
- }
60
- },
61
- "bg": {
62
- "cv": {
63
- "vits": {
64
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
65
- "default_vocoder": null,
66
- "commit": null,
67
- "author": "@NeonGeckoCom",
68
- "license": "bsd-3-clause"
69
- }
70
- }
71
- },
72
- "cs": {
73
- "cv": {
74
- "vits": {
75
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
76
- "default_vocoder": null,
77
- "commit": null,
78
- "author": "@NeonGeckoCom",
79
- "license": "bsd-3-clause"
80
- }
81
- }
82
- },
83
- "da": {
84
- "cv": {
85
- "vits": {
86
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
87
- "default_vocoder": null,
88
- "commit": null,
89
- "author": "@NeonGeckoCom",
90
- "license": "bsd-3-clause"
91
- }
92
- }
93
- },
94
- "et": {
95
- "cv": {
96
- "vits": {
97
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
98
- "default_vocoder": null,
99
- "commit": null,
100
- "author": "@NeonGeckoCom",
101
- "license": "bsd-3-clause"
102
- }
103
- }
104
- },
105
- "ga": {
106
- "cv": {
107
- "vits": {
108
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
109
- "default_vocoder": null,
110
- "commit": null,
111
- "author": "@NeonGeckoCom",
112
- "license": "bsd-3-clause"
113
- }
114
- }
115
- },
116
- "en": {
117
- "ek1": {
118
- "tacotron2": {
119
- "description": "EK1 en-rp tacotron2 by NMStoker",
120
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
121
- "default_vocoder": "vocoder_models/en/ek1/wavegrad",
122
- "commit": "c802255",
123
- "license": "apache 2.0"
124
- }
125
- },
126
- "ljspeech": {
127
- "tacotron2-DDC": {
128
- "description": "Tacotron2 with Double Decoder Consistency.",
129
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
130
- "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
131
- "commit": "bae2ad0f",
132
- "author": "Eren Gölge @erogol",
133
- "license": "apache 2.0",
134
- "contact": "[email protected]"
135
- },
136
- "tacotron2-DDC_ph": {
137
- "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
138
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
139
- "default_vocoder": "vocoder_models/en/ljspeech/univnet",
140
- "commit": "3900448",
141
- "author": "Eren Gölge @erogol",
142
- "license": "apache 2.0",
143
- "contact": "[email protected]"
144
- },
145
- "glow-tts": {
146
- "description": "",
147
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
148
- "stats_file": null,
149
- "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
150
- "commit": "",
151
- "author": "Eren Gölge @erogol",
152
- "license": "MPL",
153
- "contact": "[email protected]"
154
- },
155
- "speedy-speech": {
156
- "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
157
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
158
- "stats_file": null,
159
- "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
160
- "commit": "4581e3d",
161
- "author": "Eren Gölge @erogol",
162
- "license": "apache 2.0",
163
- "contact": "[email protected]"
164
- },
165
- "tacotron2-DCA": {
166
- "description": "",
167
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
168
- "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
169
- "commit": "",
170
- "author": "Eren Gölge @erogol",
171
- "license": "MPL",
172
- "contact": "[email protected]"
173
- },
174
- "vits": {
175
- "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
176
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
177
- "default_vocoder": null,
178
- "commit": "3900448",
179
- "author": "Eren Gölge @erogol",
180
- "license": "apache 2.0",
181
- "contact": "[email protected]"
182
- },
183
- "vits--neon": {
184
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
185
- "default_vocoder": null,
186
- "author": "@NeonGeckoCom",
187
- "license": "bsd-3-clause",
188
- "contact": null,
189
- "commit": null
190
- },
191
- "fast_pitch": {
192
- "description": "FastPitch model trained on LJSpeech using the Aligner Network",
193
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
194
- "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
195
- "commit": "b27b3ba",
196
- "author": "Eren Gölge @erogol",
197
- "license": "apache 2.0",
198
- "contact": "[email protected]"
199
- },
200
- "overflow": {
201
- "description": "Overflow model trained on LJSpeech",
202
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
203
- "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
204
- "commit": "3b1a28f",
205
- "author": "Eren Gölge @erogol",
206
- "license": "apache 2.0",
207
- "contact": "[email protected]"
208
- },
209
- "neural_hmm": {
210
- "description": "Neural HMM model trained on LJSpeech",
211
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.11.0_models/tts_models--en--ljspeech--neural_hmm.zip",
212
- "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
213
- "commit": "3b1a28f",
214
- "author": "Shivam Metha @shivammehta25",
215
- "license": "apache 2.0",
216
- "contact": "d83ee8fe45e3c0d776d4a865aca21d7c2ac324c4"
217
- }
218
- },
219
- "vctk": {
220
- "vits": {
221
- "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
222
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
223
- "default_vocoder": null,
224
- "commit": "3900448",
225
- "author": "Eren @erogol",
226
- "license": "apache 2.0",
227
- "contact": "[email protected]"
228
- },
229
- "fast_pitch": {
230
- "description": "FastPitch model trained on VCTK dataseset.",
231
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
232
- "default_vocoder": null,
233
- "commit": "bdab788d",
234
- "author": "Eren @erogol",
235
- "license": "CC BY-NC-ND 4.0",
236
- "contact": "[email protected]"
237
- }
238
- },
239
- "sam": {
240
- "tacotron-DDC": {
241
- "description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.",
242
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
243
- "default_vocoder": "vocoder_models/en/sam/hifigan_v2",
244
- "commit": "bae2ad0f",
245
- "author": "Eren Gölge @erogol",
246
- "license": "apache 2.0",
247
- "contact": "[email protected]"
248
- }
249
- },
250
- "blizzard2013": {
251
- "capacitron-t2-c50": {
252
- "description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
253
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
254
- "commit": "d6284e7",
255
- "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
256
- "author": "Adam Froghyar @a-froghyar",
257
- "license": "apache 2.0",
258
- "contact": "[email protected]"
259
- },
260
- "capacitron-t2-c150_v2": {
261
- "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
262
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
263
- "commit": "a67039d",
264
- "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
265
- "author": "Adam Froghyar @a-froghyar",
266
- "license": "apache 2.0",
267
- "contact": "[email protected]"
268
- }
269
- },
270
- "multi-dataset": {
271
- "tortoise-v2": {
272
- "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
273
- "github_rls_url": [
274
- "https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
275
- "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
276
- "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
277
- "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
278
- "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
279
- "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
280
- "https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
281
- "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
282
- "https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
283
- ],
284
- "commit": "c1875f6",
285
- "default_vocoder": null,
286
- "author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
287
- "license": "apache 2.0"
288
- }
289
- },
290
- "jenny": {
291
- "jenny": {
292
- "description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
293
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
294
- "default_vocoder": null,
295
- "commit": "ba40a1c",
296
- "license": "custom - see https://github.com/dioco-group/jenny-tts-dataset#important",
297
- "author": "@noml4u"
298
- }
299
- }
300
- },
301
- "es": {
302
- "mai": {
303
- "tacotron2-DDC": {
304
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
305
- "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
306
- "commit": "",
307
- "author": "Eren Gölge @erogol",
308
- "license": "MPL",
309
- "contact": "[email protected]"
310
- }
311
- },
312
- "css10": {
313
- "vits": {
314
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
315
- "default_vocoder": null,
316
- "commit": null,
317
- "author": "@NeonGeckoCom",
318
- "license": "bsd-3-clause"
319
- }
320
- }
321
- },
322
- "fr": {
323
- "mai": {
324
- "tacotron2-DDC": {
325
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
326
- "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
327
- "commit": null,
328
- "author": "Eren Gölge @erogol",
329
- "license": "MPL",
330
- "contact": "[email protected]"
331
- }
332
- },
333
- "css10": {
334
- "vits": {
335
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
336
- "default_vocoder": null,
337
- "commit": null,
338
- "author": "@NeonGeckoCom",
339
- "license": "bsd-3-clause"
340
- }
341
- }
342
- },
343
- "uk": {
344
- "mai": {
345
- "glow-tts": {
346
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
347
- "author": "@robinhad",
348
- "commit": "bdab788d",
349
- "license": "MIT",
350
- "contact": "",
351
- "default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
352
- },
353
- "vits": {
354
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
355
- "default_vocoder": null,
356
- "commit": null,
357
- "author": "@NeonGeckoCom",
358
- "license": "bsd-3-clause"
359
- }
360
- }
361
- },
362
- "zh-CN": {
363
- "baker": {
364
- "tacotron2-DDC-GST": {
365
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
366
- "commit": "unknown",
367
- "author": "@kirianguiller",
368
- "license": "apache 2.0",
369
- "default_vocoder": null
370
- }
371
- }
372
- },
373
- "nl": {
374
- "mai": {
375
- "tacotron2-DDC": {
376
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
377
- "author": "@r-dh",
378
- "license": "apache 2.0",
379
- "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
380
- "stats_file": null,
381
- "commit": "540d811"
382
- }
383
- },
384
- "css10": {
385
- "vits": {
386
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
387
- "default_vocoder": null,
388
- "commit": null,
389
- "author": "@NeonGeckoCom",
390
- "license": "bsd-3-clause"
391
- }
392
- }
393
- },
394
- "de": {
395
- "thorsten": {
396
- "tacotron2-DCA": {
397
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
398
- "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
399
- "author": "@thorstenMueller",
400
- "license": "apache 2.0",
401
- "commit": "unknown"
402
- },
403
- "vits": {
404
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--de--thorsten--vits.zip",
405
- "default_vocoder": null,
406
- "author": "@thorstenMueller",
407
- "license": "apache 2.0",
408
- "commit": "unknown"
409
- },
410
- "tacotron2-DDC": {
411
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
412
- "default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
413
- "description": "Thorsten-Dec2021-22k-DDC",
414
- "author": "@thorstenMueller",
415
- "license": "apache 2.0",
416
- "commit": "unknown"
417
- }
418
- },
419
- "css10": {
420
- "vits-neon": {
421
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
422
- "default_vocoder": null,
423
- "author": "@NeonGeckoCom",
424
- "license": "bsd-3-clause",
425
- "commit": null
426
- }
427
- }
428
- },
429
- "ja": {
430
- "kokoro": {
431
- "tacotron2-DDC": {
432
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
433
- "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
434
- "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
435
- "author": "@kaiidams",
436
- "license": "apache 2.0",
437
- "commit": "401fbd89"
438
- }
439
- }
440
- },
441
- "tr": {
442
- "common-voice": {
443
- "glow-tts": {
444
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
445
- "default_vocoder": "vocoder_models/tr/common-voice/hifigan",
446
- "license": "MIT",
447
- "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
448
- "author": "Fatih Akademi",
449
- "commit": null
450
- }
451
- }
452
- },
453
- "it": {
454
- "mai_female": {
455
- "glow-tts": {
456
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
457
- "default_vocoder": null,
458
- "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
459
- "author": "@nicolalandro",
460
- "license": "apache 2.0",
461
- "commit": null
462
- },
463
- "vits": {
464
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
465
- "default_vocoder": null,
466
- "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
467
- "author": "@nicolalandro",
468
- "license": "apache 2.0",
469
- "commit": null
470
- }
471
- },
472
- "mai_male": {
473
- "glow-tts": {
474
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
475
- "default_vocoder": null,
476
- "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
477
- "author": "@nicolalandro",
478
- "license": "apache 2.0",
479
- "commit": null
480
- },
481
- "vits": {
482
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
483
- "default_vocoder": null,
484
- "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
485
- "author": "@nicolalandro",
486
- "license": "apache 2.0",
487
- "commit": null
488
- }
489
- }
490
- },
491
- "ewe": {
492
- "openbible": {
493
- "vits": {
494
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
495
- "default_vocoder": null,
496
- "license": "CC-BY-SA 4.0",
497
- "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
498
- "author": "@coqui_ai",
499
- "commit": "1b22f03"
500
- }
501
- }
502
- },
503
- "hau": {
504
- "openbible": {
505
- "vits": {
506
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
507
- "default_vocoder": null,
508
- "license": "CC-BY-SA 4.0",
509
- "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
510
- "author": "@coqui_ai",
511
- "commit": "1b22f03"
512
- }
513
- }
514
- },
515
- "lin": {
516
- "openbible": {
517
- "vits": {
518
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
519
- "default_vocoder": null,
520
- "license": "CC-BY-SA 4.0",
521
- "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
522
- "author": "@coqui_ai",
523
- "commit": "1b22f03"
524
- }
525
- }
526
- },
527
- "tw_akuapem": {
528
- "openbible": {
529
- "vits": {
530
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
531
- "default_vocoder": null,
532
- "license": "CC-BY-SA 4.0",
533
- "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
534
- "author": "@coqui_ai",
535
- "commit": "1b22f03"
536
- }
537
- }
538
- },
539
- "tw_asante": {
540
- "openbible": {
541
- "vits": {
542
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
543
- "default_vocoder": null,
544
- "license": "CC-BY-SA 4.0",
545
- "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
546
- "author": "@coqui_ai",
547
- "commit": "1b22f03"
548
- }
549
- }
550
- },
551
- "yor": {
552
- "openbible": {
553
- "vits": {
554
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
555
- "default_vocoder": null,
556
- "license": "CC-BY-SA 4.0",
557
- "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
558
- "author": "@coqui_ai",
559
- "commit": "1b22f03"
560
- }
561
- }
562
- },
563
- "hu": {
564
- "css10": {
565
- "vits": {
566
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
567
- "default_vocoder": null,
568
- "commit": null,
569
- "author": "@NeonGeckoCom",
570
- "license": "bsd-3-clause"
571
- }
572
- }
573
- },
574
- "el": {
575
- "cv": {
576
- "vits": {
577
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
578
- "default_vocoder": null,
579
- "commit": null,
580
- "author": "@NeonGeckoCom",
581
- "license": "bsd-3-clause"
582
- }
583
- }
584
- },
585
- "fi": {
586
- "css10": {
587
- "vits": {
588
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
589
- "default_vocoder": null,
590
- "commit": null,
591
- "author": "@NeonGeckoCom",
592
- "license": "bsd-3-clause"
593
- }
594
- }
595
- },
596
- "hr": {
597
- "cv": {
598
- "vits": {
599
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
600
- "default_vocoder": null,
601
- "commit": null,
602
- "author": "@NeonGeckoCom",
603
- "license": "bsd-3-clause"
604
- }
605
- }
606
- },
607
- "lt": {
608
- "cv": {
609
- "vits": {
610
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
611
- "default_vocoder": null,
612
- "commit": null,
613
- "author": "@NeonGeckoCom",
614
- "license": "bsd-3-clause"
615
- }
616
- }
617
- },
618
- "lv": {
619
- "cv": {
620
- "vits": {
621
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
622
- "default_vocoder": null,
623
- "commit": null,
624
- "author": "@NeonGeckoCom",
625
- "license": "bsd-3-clause"
626
- }
627
- }
628
- },
629
- "mt": {
630
- "cv": {
631
- "vits": {
632
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
633
- "default_vocoder": null,
634
- "commit": null,
635
- "author": "@NeonGeckoCom",
636
- "license": "bsd-3-clause"
637
- }
638
- }
639
- },
640
- "pl": {
641
- "mai_female": {
642
- "vits": {
643
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
644
- "default_vocoder": null,
645
- "commit": null,
646
- "author": "@NeonGeckoCom",
647
- "license": "bsd-3-clause"
648
- }
649
- }
650
- },
651
- "pt": {
652
- "cv": {
653
- "vits": {
654
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
655
- "default_vocoder": null,
656
- "commit": null,
657
- "author": "@NeonGeckoCom",
658
- "license": "bsd-3-clause"
659
- }
660
- }
661
- },
662
- "ro": {
663
- "cv": {
664
- "vits": {
665
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
666
- "default_vocoder": null,
667
- "commit": null,
668
- "author": "@NeonGeckoCom",
669
- "license": "bsd-3-clause"
670
- }
671
- }
672
- },
673
- "sk": {
674
- "cv": {
675
- "vits": {
676
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
677
- "default_vocoder": null,
678
- "commit": null,
679
- "author": "@NeonGeckoCom",
680
- "license": "bsd-3-clause"
681
- }
682
- }
683
- },
684
- "sl": {
685
- "cv": {
686
- "vits": {
687
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
688
- "default_vocoder": null,
689
- "commit": null,
690
- "author": "@NeonGeckoCom",
691
- "license": "bsd-3-clause"
692
- }
693
- }
694
- },
695
- "sv": {
696
- "cv": {
697
- "vits": {
698
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
699
- "default_vocoder": null,
700
- "commit": null,
701
- "author": "@NeonGeckoCom",
702
- "license": "bsd-3-clause"
703
- }
704
- }
705
- },
706
- "ca": {
707
- "custom": {
708
- "vits": {
709
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
710
- "default_vocoder": null,
711
- "commit": null,
712
- "description": " It is trained from zero with 101460 utterances consisting of 257 speakers, approx 138 hours of speech. We used three datasets;\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
713
- "author": "@gullabi",
714
- "license": "CC-BY-4.0"
715
- }
716
- }
717
- },
718
- "fa": {
719
- "custom": {
720
- "glow-tts": {
721
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
722
- "default_vocoder": null,
723
- "commit": null,
724
- "description": "persian-tts-female-glow_tts model for text to speech purposes. Single-speaker female voice Trained on persian-tts-dataset-famale. \nThis model has no compatible vocoder thus the output quality is not very good. \nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
725
- "author": "@karim23657",
726
- "license": "CC-BY-4.0"
727
- }
728
- }
729
- },
730
- "bn": {
731
- "custom": {
732
- "vits-male": {
733
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
734
- "default_vocoder": null,
735
- "commit": null,
736
- "description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
737
- "author": "@mobassir94",
738
- "license": "Apache 2.0"
739
- },
740
- "vits-female": {
741
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
742
- "default_vocoder": null,
743
- "commit": null,
744
- "description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
745
- "author": "@mobassir94",
746
- "license": "Apache 2.0"
747
- }
748
- }
749
- },
750
- "be": {
751
- "common-voice": {
752
- "glow-tts":{
753
- "description": "Belarusian GlowTTS model created by @alex73 (Github).",
754
- "github_rls_url":"https://coqui.gateway.scarf.sh/v0.16.6/tts_models--be--common-voice--glow-tts.zip",
755
- "default_vocoder": "vocoder_models/be/common-voice/hifigan",
756
- "commit": "c0aabb85",
757
- "license": "CC-BY-SA 4.0",
758
- "contact": "[email protected]"
759
- }
760
- }
761
- }
762
- },
763
- "vocoder_models": {
764
- "universal": {
765
- "libri-tts": {
766
- "wavegrad": {
767
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
768
- "commit": "ea976b0",
769
- "author": "Eren Gölge @erogol",
770
- "license": "MPL",
771
- "contact": "[email protected]"
772
- },
773
- "fullband-melgan": {
774
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
775
- "commit": "4132240",
776
- "author": "Eren Gölge @erogol",
777
- "license": "MPL",
778
- "contact": "[email protected]"
779
- }
780
- }
781
- },
782
- "en": {
783
- "ek1": {
784
- "wavegrad": {
785
- "description": "EK1 en-rp wavegrad by NMStoker",
786
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
787
- "commit": "c802255",
788
- "license": "apache 2.0"
789
- }
790
- },
791
- "ljspeech": {
792
- "multiband-melgan": {
793
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
794
- "commit": "ea976b0",
795
- "author": "Eren Gölge @erogol",
796
- "license": "MPL",
797
- "contact": "[email protected]"
798
- },
799
- "hifigan_v2": {
800
- "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
801
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
802
- "commit": "bae2ad0f",
803
- "author": "@erogol",
804
- "license": "apache 2.0",
805
- "contact": "[email protected]"
806
- },
807
- "univnet": {
808
- "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
809
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
810
- "commit": "4581e3d",
811
- "author": "Eren @erogol",
812
- "license": "apache 2.0",
813
- "contact": "[email protected]"
814
- }
815
- },
816
- "blizzard2013": {
817
- "hifigan_v2": {
818
- "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
819
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
820
- "commit": "d6284e7",
821
- "author": "Adam Froghyar @a-froghyar",
822
- "license": "apache 2.0",
823
- "contact": "[email protected]"
824
- }
825
- },
826
- "vctk": {
827
- "hifigan_v2": {
828
- "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
829
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
830
- "commit": "2f07160",
831
- "author": "Edresson Casanova",
832
- "license": "apache 2.0",
833
- "contact": ""
834
- }
835
- },
836
- "sam": {
837
- "hifigan_v2": {
838
- "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
839
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
840
- "commit": "2f07160",
841
- "author": "Eren Gölge @erogol",
842
- "license": "apache 2.0",
843
- "contact": "[email protected]"
844
- }
845
- }
846
- },
847
- "nl": {
848
- "mai": {
849
- "parallel-wavegan": {
850
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
851
- "author": "@r-dh",
852
- "license": "apache 2.0",
853
- "commit": "unknown"
854
- }
855
- }
856
- },
857
- "de": {
858
- "thorsten": {
859
- "wavegrad": {
860
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
861
- "author": "@thorstenMueller",
862
- "license": "apache 2.0",
863
- "commit": "unknown"
864
- },
865
- "fullband-melgan": {
866
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
867
- "author": "@thorstenMueller",
868
- "license": "apache 2.0",
869
- "commit": "unknown"
870
- },
871
- "hifigan_v1": {
872
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
873
- "description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
874
- "author": "@thorstenMueller",
875
- "license": "apache 2.0",
876
- "commit": "unknown"
877
- }
878
- }
879
- },
880
- "ja": {
881
- "kokoro": {
882
- "hifigan_v1": {
883
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
884
- "description": "HifiGAN model trained for kokoro dataset by @kaiidams",
885
- "author": "@kaiidams",
886
- "license": "apache 2.0",
887
- "commit": "3900448"
888
- }
889
- }
890
- },
891
- "uk": {
892
- "mai": {
893
- "multiband-melgan": {
894
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
895
- "author": "@robinhad",
896
- "commit": "bdab788d",
897
- "license": "MIT",
898
- "contact": ""
899
- }
900
- }
901
- },
902
- "tr": {
903
- "common-voice": {
904
- "hifigan": {
905
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
906
- "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
907
- "author": "Fatih Akademi",
908
- "license": "MIT",
909
- "commit": null
910
- }
911
- }
912
- },
913
- "be": {
914
- "common-voice": {
915
- "hifigan": {
916
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.16.6/vocoder_models--be--common-voice--hifigan.zip",
917
- "description": "Belarusian HiFiGAN model created by @alex73 (Github).",
918
- "author": "@alex73",
919
- "license": "CC-BY-SA 4.0",
920
- "commit": "c0aabb85"
921
- }
922
- }
923
- }
924
- },
925
- "voice_conversion_models": {
926
- "multilingual": {
927
- "vctk": {
928
- "freevc24": {
929
- "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
930
- "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
931
- "author": "Jing-Yi Li @OlaWod",
932
- "license": "MIT",
933
- "commit": null
934
- }
935
- }
936
- }
937
- }
938
- }
 
TTS/VERSION DELETED
@@ -1 +0,0 @@
-0.22.0
 
 
TTS/__init__.py DELETED
@@ -1,6 +0,0 @@
-import os
-
-with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
-    version = f.read().strip()
-
-__version__ = version
 
TTS/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (375 Bytes)
 
TTS/__pycache__/model.cpython-310.pyc DELETED
Binary file (2.6 kB)
 
TTS/api.py DELETED
@@ -1,458 +0,0 @@
1
- import tempfile
2
- import warnings
3
- from pathlib import Path
4
- from typing import Union
5
-
6
- import numpy as np
7
- from torch import nn
8
-
9
- from TTS.utils.audio.numpy_transforms import save_wav
10
- from TTS.utils.manage import ModelManager
11
- from TTS.utils.synthesizer import Synthesizer
12
- from TTS.config import load_config
13
-
14
-
15
- class TTS(nn.Module):
16
- """TODO: Add voice conversion and Capacitron support."""
17
-
18
- def __init__(
19
- self,
20
- model_name: str = "",
21
- model_path: str = None,
22
- config_path: str = None,
23
- vocoder_path: str = None,
24
- vocoder_config_path: str = None,
25
- progress_bar: bool = True,
26
- gpu=False,
27
- ):
28
- """🐸TTS python interface that allows to load and use the released models.
29
-
30
- Example with a multi-speaker model:
31
- >>> from TTS.api import TTS
32
- >>> tts = TTS(TTS.list_models()[0])
33
- >>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
34
- >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
35
-
36
- Example with a single-speaker model:
37
- >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
38
- >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
39
-
40
- Example loading a model from a path:
41
- >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False)
42
- >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
43
-
44
- Example voice cloning with YourTTS in English, French and Portuguese:
45
- >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
46
- >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav")
47
- >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
48
- >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")
49
-
50
- Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html):
51
- >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True)
52
- >>> tts.tts_to_file("This is a test.", file_path="output.wav")
53
-
54
- Args:
55
- model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
56
- model_path (str, optional): Path to the model checkpoint. Defaults to None.
57
- config_path (str, optional): Path to the model config. Defaults to None.
58
- vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
59
- vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
60
- progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True.
61
- gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
62
- """
63
- super().__init__()
64
- self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
65
- self.config = load_config(config_path) if config_path else None
66
- self.synthesizer = None
67
- self.voice_converter = None
68
- self.model_name = ""
69
- if gpu:
70
- warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")
71
-
72
- if model_name is not None and len(model_name) > 0:
73
- if "tts_models" in model_name:
74
- self.load_tts_model_by_name(model_name, gpu)
75
- elif "voice_conversion_models" in model_name:
76
- self.load_vc_model_by_name(model_name, gpu)
77
- else:
78
- self.load_model_by_name(model_name, gpu)
79
-
80
- if model_path:
81
- self.load_tts_model_by_path(
82
- model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
83
- )
84
-
85
- @property
86
- def models(self):
87
- return self.manager.list_tts_models()
88
-
89
- @property
90
- def is_multi_speaker(self):
91
- if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
92
- return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
93
- return False
94
-
95
- @property
96
- def is_multi_lingual(self):
97
- # Not sure what sets this to None, but applied a fix to prevent crashing.
98
- if (
99
- isinstance(self.model_name, str)
100
- and "xtts" in self.model_name
101
- or self.config
102
- and ("xtts" in self.config.model or len(self.config.languages) > 1)
103
- ):
104
- return True
105
- if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
106
- return self.synthesizer.tts_model.language_manager.num_languages > 1
107
- return False
108
-
109
- @property
110
- def speakers(self):
111
- if not self.is_multi_speaker:
112
- return None
113
- return self.synthesizer.tts_model.speaker_manager.speaker_names
114
-
115
- @property
116
- def languages(self):
117
- if not self.is_multi_lingual:
118
- return None
119
- return self.synthesizer.tts_model.language_manager.language_names
120
-
121
- @staticmethod
122
- def get_models_file_path():
123
- return Path(__file__).parent / ".models.json"
124
-
125
- def list_models(self):
126
- return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
127
-
128
- def download_model_by_name(self, model_name: str):
129
- model_path, config_path, model_item = self.manager.download_model(model_name)
130
- if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
131
- # return model directory if there are multiple files
132
- # we assume that the model knows how to load itself
133
- return None, None, None, None, model_path
134
- if model_item.get("default_vocoder") is None:
135
- return model_path, config_path, None, None, None
136
- vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
137
- return model_path, config_path, vocoder_path, vocoder_config_path, None
138
-
139
- def load_model_by_name(self, model_name: str, gpu: bool = False):
140
- """Load one of the 🐸TTS models by name.
141
-
142
- Args:
143
- model_name (str): Model name to load. You can list models by ```tts.models```.
144
- gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
145
- """
146
- self.load_tts_model_by_name(model_name, gpu)
147
-
148
- def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
149
- """Load one of the voice conversion models by name.
150
-
151
- Args:
152
- model_name (str): Model name to load. You can list models by ```tts.models```.
153
- gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
154
- """
155
- self.model_name = model_name
156
- model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
157
- self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)
158
-
159
- def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
160
- """Load one of 🐸TTS models by name.
161
-
162
- Args:
163
- model_name (str): Model name to load. You can list models by ```tts.models```.
164
- gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
165
-
166
- TODO: Add tests
167
- """
168
- self.synthesizer = None
169
- self.model_name = model_name
170
-
171
- model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
172
- model_name
173
- )
174
-
175
- # init synthesizer
176
- # None values are fetch from the model
177
- self.synthesizer = Synthesizer(
178
- tts_checkpoint=model_path,
179
- tts_config_path=config_path,
180
- tts_speakers_file=None,
181
- tts_languages_file=None,
182
- vocoder_checkpoint=vocoder_path,
183
- vocoder_config=vocoder_config_path,
184
- encoder_checkpoint=None,
185
- encoder_config=None,
186
- model_dir=model_dir,
187
- use_cuda=gpu,
188
- )
189
-
190
- def load_tts_model_by_path(
191
- self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
192
- ):
193
- """Load a model from a path.
194
-
195
- Args:
196
- model_path (str): Path to the model checkpoint.
197
- config_path (str): Path to the model config.
198
- vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
199
- vocoder_config (str, optional): Path to the vocoder config. Defaults to None.
200
- gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
201
- """
202
-
203
- self.synthesizer = Synthesizer(
204
- tts_checkpoint=model_path,
205
- tts_config_path=config_path,
206
- tts_speakers_file=None,
207
- tts_languages_file=None,
208
- vocoder_checkpoint=vocoder_path,
209
- vocoder_config=vocoder_config,
210
- encoder_checkpoint=None,
211
- encoder_config=None,
212
- use_cuda=gpu,
213
- )
214
-
215
- def _check_arguments(
216
- self,
217
- speaker: str = None,
218
- language: str = None,
219
- speaker_wav: str = None,
220
- emotion: str = None,
221
- speed: float = None,
222
- **kwargs,
223
- ) -> None:
224
- """Check if the arguments are valid for the model."""
225
- # check for the coqui tts models
226
- if self.is_multi_speaker and (speaker is None and speaker_wav is None):
227
- raise ValueError("Model is multi-speaker but no `speaker` is provided.")
228
- if self.is_multi_lingual and language is None:
229
- raise ValueError("Model is multi-lingual but no `language` is provided.")
230
- if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
231
- raise ValueError("Model is not multi-speaker but `speaker` is provided.")
232
- if not self.is_multi_lingual and language is not None:
233
- raise ValueError("Model is not multi-lingual but `language` is provided.")
234
- if not emotion is None and not speed is None:
235
- raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.")
236
-
237
- def tts(
238
- self,
239
- text: str,
240
- speaker: str = None,
241
- language: str = None,
242
- speaker_wav: str = None,
243
- emotion: str = None,
244
- speed: float = None,
245
- split_sentences: bool = True,
246
- **kwargs,
247
- ):
248
- """Convert text to speech.
249
-
250
- Args:
251
- text (str):
252
- Input text to synthesize.
253
- speaker (str, optional):
254
- Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
255
- `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
256
- language (str): Language of the text. If None, the default language of the speaker is used. Language is only
257
- supported by `XTTS` model.
258
- speaker_wav (str, optional):
259
- Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
260
- Defaults to None.
261
- emotion (str, optional):
262
- Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
263
- speed (float, optional):
264
- Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
265
- Defaults to None.
266
- split_sentences (bool, optional):
267
- Split text into sentences, synthesize them separately and concatenate the file audio.
268
- Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
269
- applicable to the 🐸TTS models. Defaults to True.
270
- kwargs (dict, optional):
271
- Additional arguments for the model.
272
- """
273
- self._check_arguments(
274
- speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
275
- )
276
- wav = self.synthesizer.tts(
277
- text=text,
278
- speaker_name=speaker,
279
- language_name=language,
280
- speaker_wav=speaker_wav,
281
- reference_wav=None,
282
- style_wav=None,
283
- style_text=None,
284
- reference_speaker_name=None,
285
- split_sentences=split_sentences,
286
- **kwargs,
287
- )
288
- return wav
289
-
290
- def tts_to_file(
291
- self,
292
- text: str,
293
- speaker: str = None,
294
- language: str = None,
295
- speaker_wav: str = None,
296
- emotion: str = None,
297
- speed: float = 1.0,
298
- pipe_out=None,
299
- file_path: str = "output.wav",
300
- split_sentences: bool = True,
301
- **kwargs,
302
- ):
303
- """Convert text to speech.
304
-
305
- Args:
306
- text (str):
307
- Input text to synthesize.
308
- speaker (str, optional):
309
- Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
310
- `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
311
- language (str, optional):
312
- Language code for multi-lingual models. You can check whether loaded model is multi-lingual
313
- `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
314
- speaker_wav (str, optional):
315
- Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
316
- Defaults to None.
317
- emotion (str, optional):
318
- Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
319
- speed (float, optional):
320
- Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
321
- pipe_out (BytesIO, optional):
322
- Flag to stdout the generated TTS wav file for shell pipe.
323
- file_path (str, optional):
324
- Output file path. Defaults to "output.wav".
325
- split_sentences (bool, optional):
326
- Split text into sentences, synthesize them separately and concatenate the file audio.
327
- Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
328
- applicable to the 🐸TTS models. Defaults to True.
329
- kwargs (dict, optional):
330
- Additional arguments for the model.
331
- """
332
- self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
333
-
334
- wav = self.tts(
335
- text=text,
336
- speaker=speaker,
337
- language=language,
338
- speaker_wav=speaker_wav,
339
- split_sentences=split_sentences,
340
- **kwargs,
341
- )
342
- self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
343
- return file_path
344
-
345
- def voice_conversion(
346
- self,
347
- source_wav: str,
348
- target_wav: str,
349
- ):
350
- """Voice conversion with FreeVC. Convert source wav to target speaker.
351
-
352
- Args:``
353
- source_wav (str):
354
- Path to the source wav file.
355
- target_wav (str):`
356
- Path to the target wav file.
357
- """
358
- wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)
359
- return wav
360
-
361
- def voice_conversion_to_file(
362
- self,
363
- source_wav: str,
364
- target_wav: str,
365
- file_path: str = "output.wav",
366
- ):
367
- """Voice conversion with FreeVC. Convert source wav to target speaker.
368
-
369
- Args:
370
- source_wav (str):
371
- Path to the source wav file.
372
- target_wav (str):
373
- Path to the target wav file.
374
- file_path (str, optional):
375
- Output file path. Defaults to "output.wav".
376
- """
377
- wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav)
378
- save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
379
- return file_path
380
-
381
- def tts_with_vc(
382
- self,
383
- text: str,
384
- language: str = None,
385
- speaker_wav: str = None,
386
- speaker: str = None,
387
- split_sentences: bool = True,
388
- ):
389
- """Convert text to speech with voice conversion.
390
-
391
- It combines tts with voice conversion to fake voice cloning.
392
-
393
- - Convert text to speech with tts.
394
- - Convert the output wav to target speaker with voice conversion.
395
-
396
- Args:
397
- text (str):
398
- Input text to synthesize.
399
- language (str, optional):
400
- Language code for multi-lingual models. You can check whether loaded model is multi-lingual
401
- `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
402
- speaker_wav (str, optional):
403
- Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
404
- Defaults to None.
405
- speaker (str, optional):
406
- Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
407
- `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
408
- split_sentences (bool, optional):
409
- Split text into sentences, synthesize them separately and concatenate the file audio.
410
- Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
411
- applicable to the 🐸TTS models. Defaults to True.
412
- """
413
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
414
- # Lazy code... save it to a temp file to resample it while reading it for VC
415
- self.tts_to_file(
416
- text=text, speaker=speaker, language=language, file_path=fp.name, split_sentences=split_sentences
417
- )
418
- if self.voice_converter is None:
419
- self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
420
- wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
421
- return wav
422
-
423
- def tts_with_vc_to_file(
424
- self,
425
- text: str,
426
- language: str = None,
427
- speaker_wav: str = None,
428
- file_path: str = "output.wav",
429
- speaker: str = None,
430
- split_sentences: bool = True,
431
- ):
432
- """Convert text to speech with voice conversion and save to file.
433
-
434
- Check `tts_with_vc` for more details.
435
-
436
- Args:
437
- text (str):
438
- Input text to synthesize.
439
- language (str, optional):
440
- Language code for multi-lingual models. You can check whether loaded model is multi-lingual
441
- `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
442
- speaker_wav (str, optional):
443
- Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
444
- Defaults to None.
445
- file_path (str, optional):
446
- Output file path. Defaults to "output.wav".
447
- speaker (str, optional):
448
- Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
449
- `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
450
- split_sentences (bool, optional):
451
- Split the text into sentences, synthesize them separately, and concatenate the resulting audio.
452
- Setting it to False uses more VRAM and may hit model-specific text length or VRAM limits. Only
453
- applicable to the 🐸TTS models. Defaults to True.
454
- """
455
- wav = self.tts_with_vc(
456
- text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences
457
- )
458
- save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
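The removed `voice_conversion*` and `tts_with_vc*` methods above are the high-level entry points for FreeVC-based conversion. A minimal sketch of how they were typically driven through `TTS.api.TTS`, assuming the upstream `TTS` package is installed and `source.wav`/`target.wav` are placeholder files:

```
# Sketch only: assumes the upstream coqui `TTS` package and local placeholder wavs.
from TTS.api import TTS

# Pure voice conversion: keep the speech in source.wav, re-voice it as target.wav.
vc = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24")
vc.voice_conversion_to_file(
    source_wav="source.wav",   # content to preserve
    target_wav="target.wav",   # reference voice to clone
    file_path="converted.wav",
)

# TTS followed by voice conversion, as tts_with_vc_to_file does internally.
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")
tts.tts_with_vc_to_file(
    text="Hello from the converted voice.",
    speaker_wav="target.wav",
    file_path="cloned.wav",
)
```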
TTS/bin/__init__.py DELETED
File without changes
TTS/bin/collect_env_info.py DELETED
@@ -1,48 +0,0 @@
1
- """Get detailed info about the working environment."""
2
- import os
3
- import platform
4
- import sys
5
-
6
- import numpy
7
- import torch
8
-
9
- sys.path += [os.path.abspath(".."), os.path.abspath(".")]
10
- import json
11
-
12
- import TTS
13
-
14
-
15
- def system_info():
16
- return {
17
- "OS": platform.system(),
18
- "architecture": platform.architecture(),
19
- "version": platform.version(),
20
- "processor": platform.processor(),
21
- "python": platform.python_version(),
22
- }
23
-
24
-
25
- def cuda_info():
26
- return {
27
- "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
28
- "available": torch.cuda.is_available(),
29
- "version": torch.version.cuda,
30
- }
31
-
32
-
33
- def package_info():
34
- return {
35
- "numpy": numpy.__version__,
36
- "PyTorch_version": torch.__version__,
37
- "PyTorch_debug": torch.version.debug,
38
- "TTS": TTS.__version__,
39
- }
40
-
41
-
42
- def main():
43
- details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
44
- print(json.dumps(details, indent=4, sort_keys=True))
45
-
46
-
47
- if __name__ == "__main__":
48
- main()
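The script above just prints a JSON report of the OS, CUDA and package versions. A small sketch of calling its helpers programmatically, assuming an older checkout where `TTS/bin/collect_env_info.py` still exists:

```
# Sketch: gather the same environment report without going through the CLI.
from TTS.bin.collect_env_info import cuda_info, package_info, system_info

details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
print(details["CUDA"]["available"])  # True when torch can see a GPU
```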
TTS/bin/compute_attention_masks.py DELETED
@@ -1,165 +0,0 @@
1
- import argparse
2
- import importlib
3
- import os
4
- from argparse import RawTextHelpFormatter
5
-
6
- import numpy as np
7
- import torch
8
- from torch.utils.data import DataLoader
9
- from tqdm import tqdm
10
-
11
- from TTS.config import load_config
12
- from TTS.tts.datasets.TTSDataset import TTSDataset
13
- from TTS.tts.models import setup_model
14
- from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
15
- from TTS.utils.audio import AudioProcessor
16
- from TTS.utils.io import load_checkpoint
17
-
18
- if __name__ == "__main__":
19
- # pylint: disable=bad-option-value
20
- parser = argparse.ArgumentParser(
21
- description="""Extract attention masks from trained Tacotron/Tacotron2 models.
22
- These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
23
- """Each attention mask is written to the same path as the input wav file with ".npy" file extension.
24
- (e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n"""
25
- """
26
- Example run:
27
- CUDA_VISIBLE_DEVICES="0" python TTS/bin/compute_attention_masks.py
28
- --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
29
- --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
30
- --dataset_metafile metadata.csv
31
- --data_path /root/LJSpeech-1.1/
32
- --batch_size 32
33
- --dataset ljspeech
34
- --use_cuda True
35
- """,
36
- formatter_class=RawTextHelpFormatter,
37
- )
38
- parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
39
- parser.add_argument(
40
- "--config_path",
41
- type=str,
42
- required=True,
43
- help="Path to Tacotron/Tacotron2 config file.",
44
- )
45
- parser.add_argument(
46
- "--dataset",
47
- type=str,
48
- default="",
49
- required=True,
50
- help="Target dataset processor name from TTS.tts.dataset.preprocess.",
51
- )
52
-
53
- parser.add_argument(
54
- "--dataset_metafile",
55
- type=str,
56
- default="",
57
- required=True,
58
- help="Dataset metafile inclusing file paths with transcripts.",
59
- )
60
- parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
61
- parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")
62
-
63
- parser.add_argument(
64
- "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
65
- )
66
- args = parser.parse_args()
67
-
68
- C = load_config(args.config_path)
69
- ap = AudioProcessor(**C.audio)
70
-
71
- # if the vocabulary was passed, replace the default
72
- if "characters" in C.keys():
73
- symbols, phonemes = make_symbols(**C.characters)
74
-
75
- # load the model
76
- num_chars = len(phonemes) if C.use_phonemes else len(symbols)
77
- # TODO: handle multi-speaker
78
- model = setup_model(C)
79
- model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)
80
-
81
- # data loader
82
- preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
83
- preprocessor = getattr(preprocessor, args.dataset)
84
- meta_data = preprocessor(args.data_path, args.dataset_metafile)
85
- dataset = TTSDataset(
86
- model.decoder.r,
87
- C.text_cleaner,
88
- compute_linear_spec=False,
89
- ap=ap,
90
- meta_data=meta_data,
91
- characters=C.characters if "characters" in C.keys() else None,
92
- add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
93
- use_phonemes=C.use_phonemes,
94
- phoneme_cache_path=C.phoneme_cache_path,
95
- phoneme_language=C.phoneme_language,
96
- enable_eos_bos=C.enable_eos_bos_chars,
97
- )
98
-
99
- dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
100
- loader = DataLoader(
101
- dataset,
102
- batch_size=args.batch_size,
103
- num_workers=4,
104
- collate_fn=dataset.collate_fn,
105
- shuffle=False,
106
- drop_last=False,
107
- )
108
-
109
- # compute attentions
110
- file_paths = []
111
- with torch.no_grad():
112
- for data in tqdm(loader):
113
- # setup input data
114
- text_input = data[0]
115
- text_lengths = data[1]
116
- linear_input = data[3]
117
- mel_input = data[4]
118
- mel_lengths = data[5]
119
- stop_targets = data[6]
120
- item_idxs = data[7]
121
-
122
- # dispatch data to GPU
123
- if args.use_cuda:
124
- text_input = text_input.cuda()
125
- text_lengths = text_lengths.cuda()
126
- mel_input = mel_input.cuda()
127
- mel_lengths = mel_lengths.cuda()
128
-
129
- model_outputs = model.forward(text_input, text_lengths, mel_input)
130
-
131
- alignments = model_outputs["alignments"].detach()
132
- for idx, alignment in enumerate(alignments):
133
- item_idx = item_idxs[idx]
134
- # interpolate if r > 1
135
- alignment = (
136
- torch.nn.functional.interpolate(
137
- alignment.transpose(0, 1).unsqueeze(0),
138
- size=None,
139
- scale_factor=model.decoder.r,
140
- mode="nearest",
141
- align_corners=None,
142
- recompute_scale_factor=None,
143
- )
144
- .squeeze(0)
145
- .transpose(0, 1)
146
- )
147
- # remove paddings
148
- alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
149
- # set file paths
150
- wav_file_name = os.path.basename(item_idx)
151
- align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
152
- file_path = item_idx.replace(wav_file_name, align_file_name)
153
- # save output
154
- wav_file_abs_path = os.path.abspath(item_idx)
155
- file_abs_path = os.path.abspath(file_path)
156
- file_paths.append([wav_file_abs_path, file_abs_path])
157
- np.save(file_path, alignment)
158
-
159
- # output metafile
160
- metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")
161
-
162
- with open(metafile, "w", encoding="utf-8") as f:
163
- for p in file_paths:
164
- f.write(f"{p[0]}|{p[1]}\n")
165
- print(f" >> Metafile created: {metafile}")
TTS/bin/compute_embeddings.py DELETED
@@ -1,197 +0,0 @@
1
- import argparse
2
- import os
3
- from argparse import RawTextHelpFormatter
4
-
5
- import torch
6
- from tqdm import tqdm
7
-
8
- from TTS.config import load_config
9
- from TTS.config.shared_configs import BaseDatasetConfig
10
- from TTS.tts.datasets import load_tts_samples
11
- from TTS.tts.utils.managers import save_file
12
- from TTS.tts.utils.speakers import SpeakerManager
13
-
14
-
15
- def compute_embeddings(
16
- model_path,
17
- config_path,
18
- output_path,
19
- old_speakers_file=None,
20
- old_append=False,
21
- config_dataset_path=None,
22
- formatter_name=None,
23
- dataset_name=None,
24
- dataset_path=None,
25
- meta_file_train=None,
26
- meta_file_val=None,
27
- disable_cuda=False,
28
- no_eval=False,
29
- ):
30
- use_cuda = torch.cuda.is_available() and not disable_cuda
31
-
32
- if config_dataset_path is not None:
33
- c_dataset = load_config(config_dataset_path)
34
- meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not no_eval)
35
- else:
36
- c_dataset = BaseDatasetConfig()
37
- c_dataset.formatter = formatter_name
38
- c_dataset.dataset_name = dataset_name
39
- c_dataset.path = dataset_path
40
- if meta_file_train is not None:
41
- c_dataset.meta_file_train = meta_file_train
42
- if meta_file_val is not None:
43
- c_dataset.meta_file_val = meta_file_val
44
- meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval)
45
-
46
- if meta_data_eval is None:
47
- samples = meta_data_train
48
- else:
49
- samples = meta_data_train + meta_data_eval
50
-
51
- encoder_manager = SpeakerManager(
52
- encoder_model_path=model_path,
53
- encoder_config_path=config_path,
54
- d_vectors_file_path=old_speakers_file,
55
- use_cuda=use_cuda,
56
- )
57
-
58
- class_name_key = encoder_manager.encoder_config.class_name_key
59
-
60
- # compute speaker embeddings
61
- if old_speakers_file is not None and old_append:
62
- speaker_mapping = encoder_manager.embeddings
63
- else:
64
- speaker_mapping = {}
65
-
66
- for fields in tqdm(samples):
67
- class_name = fields[class_name_key]
68
- audio_file = fields["audio_file"]
69
- embedding_key = fields["audio_unique_name"]
70
-
71
- # Only update the speaker name when the embedding is already in the old file.
72
- if embedding_key in speaker_mapping:
73
- speaker_mapping[embedding_key]["name"] = class_name
74
- continue
75
-
76
- if old_speakers_file is not None and embedding_key in encoder_manager.clip_ids:
77
- # get the embedding from the old file
78
- embedd = encoder_manager.get_embedding_by_clip(embedding_key)
79
- else:
80
- # extract the embedding
81
- embedd = encoder_manager.compute_embedding_from_clip(audio_file)
82
-
83
- # create speaker_mapping if target dataset is defined
84
- speaker_mapping[embedding_key] = {}
85
- speaker_mapping[embedding_key]["name"] = class_name
86
- speaker_mapping[embedding_key]["embedding"] = embedd
87
-
88
- if speaker_mapping:
89
- # save speaker_mapping if target dataset is defined
90
- if os.path.isdir(output_path):
91
- mapping_file_path = os.path.join(output_path, "speakers.pth")
92
- else:
93
- mapping_file_path = output_path
94
-
95
- if os.path.dirname(mapping_file_path) != "":
96
- os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
97
-
98
- save_file(speaker_mapping, mapping_file_path)
99
- print("Speaker embeddings saved at:", mapping_file_path)
100
-
101
-
102
- if __name__ == "__main__":
103
- parser = argparse.ArgumentParser(
104
- description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
105
- """
106
- Example runs:
107
- python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
108
-
109
- python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
110
- """,
111
- formatter_class=RawTextHelpFormatter,
112
- )
113
- parser.add_argument(
114
- "--model_path",
115
- type=str,
116
- help="Path to model checkpoint file. It defaults to the released speaker encoder.",
117
- default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
118
- )
119
- parser.add_argument(
120
- "--config_path",
121
- type=str,
122
- help="Path to model config file. It defaults to the released speaker encoder config.",
123
- default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
124
- )
125
- parser.add_argument(
126
- "--config_dataset_path",
127
- type=str,
128
- help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
129
- default=None,
130
- )
131
- parser.add_argument(
132
- "--output_path",
133
- type=str,
134
- help="Path for output `pth` or `json` file.",
135
- default="speakers.pth",
136
- )
137
- parser.add_argument(
138
- "--old_file",
139
- type=str,
140
- help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.",
141
- default=None,
142
- )
143
- parser.add_argument(
144
- "--old_append",
145
- help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False",
146
- default=False,
147
- action="store_true",
148
- )
149
- parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
150
- parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
151
- parser.add_argument(
152
- "--formatter_name",
153
- type=str,
154
- help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
155
- default=None,
156
- )
157
- parser.add_argument(
158
- "--dataset_name",
159
- type=str,
160
- help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
161
- default=None,
162
- )
163
- parser.add_argument(
164
- "--dataset_path",
165
- type=str,
166
- help="Path to the dataset. You either need to provide this or `config_dataset_path`",
167
- default=None,
168
- )
169
- parser.add_argument(
170
- "--meta_file_train",
171
- type=str,
172
- help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
173
- default=None,
174
- )
175
- parser.add_argument(
176
- "--meta_file_val",
177
- type=str,
178
- help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
179
- default=None,
180
- )
181
- args = parser.parse_args()
182
-
183
- compute_embeddings(
184
- args.model_path,
185
- args.config_path,
186
- args.output_path,
187
- old_speakers_file=args.old_file,
188
- old_append=args.old_append,
189
- config_dataset_path=args.config_dataset_path,
190
- formatter_name=args.formatter_name,
191
- dataset_name=args.dataset_name,
192
- dataset_path=args.dataset_path,
193
- meta_file_train=args.meta_file_train,
194
- meta_file_val=args.meta_file_val,
195
- disable_cuda=args.disable_cuda,
196
- no_eval=args.no_eval,
197
- )
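The resulting file maps each clip's `audio_unique_name` to its speaker `name` and `embedding`. A sketch of inspecting `speakers.pth`, under the assumption that `save_file` serializes `.pth` paths with `torch.save` (the usual behaviour in this codebase):

```
# Sketch: load and inspect the embeddings file produced by the script above.
import torch

speaker_mapping = torch.load("speakers.pth")
clip_id, entry = next(iter(speaker_mapping.items()))
print(clip_id, entry["name"], len(entry["embedding"]))  # clip key, speaker, embedding size
```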
TTS/bin/compute_statistics.py DELETED
@@ -1,96 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- import argparse
5
- import glob
6
- import os
7
-
8
- import numpy as np
9
- from tqdm import tqdm
10
-
11
- # from TTS.utils.io import load_config
12
- from TTS.config import load_config
13
- from TTS.tts.datasets import load_tts_samples
14
- from TTS.utils.audio import AudioProcessor
15
-
16
-
17
- def main():
18
- """Run preprocessing process."""
19
- parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
20
- parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
21
- parser.add_argument("out_path", type=str, help="save path (directory and filename).")
22
- parser.add_argument(
23
- "--data_path",
24
- type=str,
25
- required=False,
26
- help="folder including the target set of wavs overriding dataset config.",
27
- )
28
- args, overrides = parser.parse_known_args()
29
-
30
- CONFIG = load_config(args.config_path)
31
- CONFIG.parse_known_args(overrides, relaxed_parser=True)
32
-
33
- # load config
34
- CONFIG.audio.signal_norm = False # do not apply earlier normalization
35
- CONFIG.audio.stats_path = None # discard pre-defined stats
36
-
37
- # load audio processor
38
- ap = AudioProcessor(**CONFIG.audio.to_dict())
39
-
40
- # load the meta data of target dataset
41
- if args.data_path:
42
- dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
43
- else:
44
- dataset_items = load_tts_samples(CONFIG.datasets)[0] # take only train data
45
- print(f" > There are {len(dataset_items)} files.")
46
-
47
- mel_sum = 0
48
- mel_square_sum = 0
49
- linear_sum = 0
50
- linear_square_sum = 0
51
- N = 0
52
- for item in tqdm(dataset_items):
53
- # compute features
54
- wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
55
- linear = ap.spectrogram(wav)
56
- mel = ap.melspectrogram(wav)
57
-
58
- # compute stats
59
- N += mel.shape[1]
60
- mel_sum += mel.sum(1)
61
- linear_sum += linear.sum(1)
62
- mel_square_sum += (mel**2).sum(axis=1)
63
- linear_square_sum += (linear**2).sum(axis=1)
64
-
65
- mel_mean = mel_sum / N
66
- mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
67
- linear_mean = linear_sum / N
68
- linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)
69
-
70
- output_file_path = args.out_path
71
- stats = {}
72
- stats["mel_mean"] = mel_mean
73
- stats["mel_std"] = mel_scale
74
- stats["linear_mean"] = linear_mean
75
- stats["linear_std"] = linear_scale
76
-
77
- print(f" > Avg mel spec mean: {mel_mean.mean()}")
78
- print(f" > Avg mel spec scale: {mel_scale.mean()}")
79
- print(f" > Avg linear spec mean: {linear_mean.mean()}")
80
- print(f" > Avg linear spec scale: {linear_scale.mean()}")
81
-
82
- # set default config values for mean-var scaling
83
- CONFIG.audio.stats_path = output_file_path
84
- CONFIG.audio.signal_norm = True
85
- # remove redundant values
86
- del CONFIG.audio.max_norm
87
- del CONFIG.audio.min_level_db
88
- del CONFIG.audio.symmetric_norm
89
- del CONFIG.audio.clip_norm
90
- stats["audio_config"] = CONFIG.audio.to_dict()
91
- np.save(output_file_path, stats, allow_pickle=True)
92
- print(f" > stats saved to {output_file_path}")
93
-
94
-
95
- if __name__ == "__main__":
96
- main()
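The saved `.npy` holds per-bin means and standard deviations for the mel and linear spectrograms plus the audio config. A sketch of loading it and applying the mean-variance scaling it is meant for (the stats path is a placeholder; spectrograms are `[n_bins, T]` as produced by `AudioProcessor`):

```
# Sketch: mean-variance scale a mel spectrogram with the stats computed above.
import numpy as np

stats = np.load("scale_stats.npy", allow_pickle=True).item()
mel_mean, mel_std = stats["mel_mean"], stats["mel_std"]

def normalize_mel(mel):
    # broadcast the per-bin statistics across the time axis
    return (mel - mel_mean[:, None]) / mel_std[:, None]

def denormalize_mel(mel_norm):
    return mel_norm * mel_std[:, None] + mel_mean[:, None]
```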
TTS/bin/eval_encoder.py DELETED
@@ -1,88 +0,0 @@
1
- import argparse
2
- from argparse import RawTextHelpFormatter
3
-
4
- import torch
5
- from tqdm import tqdm
6
-
7
- from TTS.config import load_config
8
- from TTS.tts.datasets import load_tts_samples
9
- from TTS.tts.utils.speakers import SpeakerManager
10
-
11
-
12
- def compute_encoder_accuracy(dataset_items, encoder_manager):
13
- class_name_key = encoder_manager.encoder_config.class_name_key
14
- map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)
15
-
16
- class_acc_dict = {}
17
-
18
- # compute embeddings for all wav_files
19
- for item in tqdm(dataset_items):
20
- class_name = item[class_name_key]
21
- wav_file = item["audio_file"]
22
-
23
- # extract the embedding
24
- embedd = encoder_manager.compute_embedding_from_clip(wav_file)
25
- if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
26
- embedding = torch.FloatTensor(embedd).unsqueeze(0)
27
- if encoder_manager.use_cuda:
28
- embedding = embedding.cuda()
29
-
30
- class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
31
- predicted_label = map_classid_to_classname[str(class_id)]
32
- else:
33
- predicted_label = None
34
-
35
- if class_name is not None and predicted_label is not None:
36
- is_equal = int(class_name == predicted_label)
37
- if class_name not in class_acc_dict:
38
- class_acc_dict[class_name] = [is_equal]
39
- else:
40
- class_acc_dict[class_name].append(is_equal)
41
- else:
42
- raise RuntimeError("Error: class_name or/and predicted_label are None")
43
-
44
- acc_avg = 0
45
- for key, values in class_acc_dict.items():
46
- acc = sum(values) / len(values)
47
- print("Class", key, "Accuracy:", acc)
48
- acc_avg += acc
49
-
50
- print("Average Accuracy:", acc_avg / len(class_acc_dict))
51
-
52
-
53
- if __name__ == "__main__":
54
- parser = argparse.ArgumentParser(
55
- description="""Compute the accuracy of the encoder.\n\n"""
56
- """
57
- Example runs:
58
- python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
59
- """,
60
- formatter_class=RawTextHelpFormatter,
61
- )
62
- parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
63
- parser.add_argument(
64
- "config_path",
65
- type=str,
66
- help="Path to model config file.",
67
- )
68
-
69
- parser.add_argument(
70
- "config_dataset_path",
71
- type=str,
72
- help="Path to dataset config file.",
73
- )
74
- parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
75
- parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
76
-
77
- args = parser.parse_args()
78
-
79
- c_dataset = load_config(args.config_dataset_path)
80
-
81
- meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
82
- items = meta_data_train + meta_data_eval
83
-
84
- enc_manager = SpeakerManager(
85
- encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
86
- )
87
-
88
- compute_encoder_accuracy(items, enc_manager)
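Note that the final figure is a macro average: accuracy is computed per class first and then averaged over classes, not over clips. A toy sketch of that aggregation:

```
# Sketch: the same per-class averaging used by compute_encoder_accuracy, on toy data.
class_acc_dict = {"spk_a": [1, 1, 0], "spk_b": [1, 0]}
per_class = {name: sum(v) / len(v) for name, v in class_acc_dict.items()}
print(per_class)                                 # {'spk_a': 0.666..., 'spk_b': 0.5}
print(sum(per_class.values()) / len(per_class))  # macro average, roughly 0.583
```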
TTS/bin/extract_tts_spectrograms.py DELETED
@@ -1,287 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Extract Mel spectrograms with teacher forcing."""
3
-
4
- import argparse
5
- import os
6
-
7
- import numpy as np
8
- import torch
9
- from torch.utils.data import DataLoader
10
- from tqdm import tqdm
11
-
12
- from TTS.config import load_config
13
- from TTS.tts.datasets import TTSDataset, load_tts_samples
14
- from TTS.tts.models import setup_model
15
- from TTS.tts.utils.speakers import SpeakerManager
16
- from TTS.tts.utils.text.tokenizer import TTSTokenizer
17
- from TTS.utils.audio import AudioProcessor
18
- from TTS.utils.audio.numpy_transforms import quantize
19
- from TTS.utils.generic_utils import count_parameters
20
-
21
- use_cuda = torch.cuda.is_available()
22
-
23
-
24
- def setup_loader(ap, r, verbose=False):
25
- tokenizer, _ = TTSTokenizer.init_from_config(c)
26
- dataset = TTSDataset(
27
- outputs_per_step=r,
28
- compute_linear_spec=False,
29
- samples=meta_data,
30
- tokenizer=tokenizer,
31
- ap=ap,
32
- batch_group_size=0,
33
- min_text_len=c.min_text_len,
34
- max_text_len=c.max_text_len,
35
- min_audio_len=c.min_audio_len,
36
- max_audio_len=c.max_audio_len,
37
- phoneme_cache_path=c.phoneme_cache_path,
38
- precompute_num_workers=0,
39
- use_noise_augment=False,
40
- verbose=verbose,
41
- speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
42
- d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
43
- )
44
-
45
- if c.use_phonemes and c.compute_input_seq_cache:
46
- # precompute phonemes to have a better estimate of sequence lengths.
47
- dataset.compute_input_seq(c.num_loader_workers)
48
- dataset.preprocess_samples()
49
-
50
- loader = DataLoader(
51
- dataset,
52
- batch_size=c.batch_size,
53
- shuffle=False,
54
- collate_fn=dataset.collate_fn,
55
- drop_last=False,
56
- sampler=None,
57
- num_workers=c.num_loader_workers,
58
- pin_memory=False,
59
- )
60
- return loader
61
-
62
-
63
- def set_filename(wav_path, out_path):
64
- wav_file = os.path.basename(wav_path)
65
- file_name = wav_file.split(".")[0]
66
- os.makedirs(os.path.join(out_path, "quant"), exist_ok=True)
67
- os.makedirs(os.path.join(out_path, "mel"), exist_ok=True)
68
- os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True)
69
- os.makedirs(os.path.join(out_path, "wav"), exist_ok=True)
70
- wavq_path = os.path.join(out_path, "quant", file_name)
71
- mel_path = os.path.join(out_path, "mel", file_name)
72
- wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
73
- wav_path = os.path.join(out_path, "wav", file_name + ".wav")
74
- return file_name, wavq_path, mel_path, wav_gl_path, wav_path
75
-
76
-
77
- def format_data(data):
78
- # setup input data
79
- text_input = data["token_id"]
80
- text_lengths = data["token_id_lengths"]
81
- mel_input = data["mel"]
82
- mel_lengths = data["mel_lengths"]
83
- item_idx = data["item_idxs"]
84
- d_vectors = data["d_vectors"]
85
- speaker_ids = data["speaker_ids"]
86
- attn_mask = data["attns"]
87
- avg_text_length = torch.mean(text_lengths.float())
88
- avg_spec_length = torch.mean(mel_lengths.float())
89
-
90
- # dispatch data to GPU
91
- if use_cuda:
92
- text_input = text_input.cuda(non_blocking=True)
93
- text_lengths = text_lengths.cuda(non_blocking=True)
94
- mel_input = mel_input.cuda(non_blocking=True)
95
- mel_lengths = mel_lengths.cuda(non_blocking=True)
96
- if speaker_ids is not None:
97
- speaker_ids = speaker_ids.cuda(non_blocking=True)
98
- if d_vectors is not None:
99
- d_vectors = d_vectors.cuda(non_blocking=True)
100
- if attn_mask is not None:
101
- attn_mask = attn_mask.cuda(non_blocking=True)
102
- return (
103
- text_input,
104
- text_lengths,
105
- mel_input,
106
- mel_lengths,
107
- speaker_ids,
108
- d_vectors,
109
- avg_text_length,
110
- avg_spec_length,
111
- attn_mask,
112
- item_idx,
113
- )
114
-
115
-
116
- @torch.no_grad()
117
- def inference(
118
- model_name,
119
- model,
120
- ap,
121
- text_input,
122
- text_lengths,
123
- mel_input,
124
- mel_lengths,
125
- speaker_ids=None,
126
- d_vectors=None,
127
- ):
128
- if model_name == "glow_tts":
129
- speaker_c = None
130
- if speaker_ids is not None:
131
- speaker_c = speaker_ids
132
- elif d_vectors is not None:
133
- speaker_c = d_vectors
134
- outputs = model.inference_with_MAS(
135
- text_input,
136
- text_lengths,
137
- mel_input,
138
- mel_lengths,
139
- aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
140
- )
141
- model_output = outputs["model_outputs"]
142
- model_output = model_output.detach().cpu().numpy()
143
-
144
- elif "tacotron" in model_name:
145
- aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
146
- outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
147
- postnet_outputs = outputs["model_outputs"]
148
- # normalize tacotron output
149
- if model_name == "tacotron":
150
- mel_specs = []
151
- postnet_outputs = postnet_outputs.data.cpu().numpy()
152
- for b in range(postnet_outputs.shape[0]):
153
- postnet_output = postnet_outputs[b]
154
- mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T))
155
- model_output = torch.stack(mel_specs).cpu().numpy()
156
-
157
- elif model_name == "tacotron2":
158
- model_output = postnet_outputs.detach().cpu().numpy()
159
- return model_output
160
-
161
-
162
- def extract_spectrograms(
163
- data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
164
- ):
165
- model.eval()
166
- export_metadata = []
167
- for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
168
- # format data
169
- (
170
- text_input,
171
- text_lengths,
172
- mel_input,
173
- mel_lengths,
174
- speaker_ids,
175
- d_vectors,
176
- _,
177
- _,
178
- _,
179
- item_idx,
180
- ) = format_data(data)
181
-
182
- model_output = inference(
183
- c.model.lower(),
184
- model,
185
- ap,
186
- text_input,
187
- text_lengths,
188
- mel_input,
189
- mel_lengths,
190
- speaker_ids,
191
- d_vectors,
192
- )
193
-
194
- for idx in range(text_input.shape[0]):
195
- wav_file_path = item_idx[idx]
196
- wav = ap.load_wav(wav_file_path)
197
- _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)
198
-
199
- # quantize and save wav
200
- if quantize_bits > 0:
201
- wavq = quantize(wav, quantize_bits)
202
- np.save(wavq_path, wavq)
203
-
204
- # save TTS mel
205
- mel = model_output[idx]
206
- mel_length = mel_lengths[idx]
207
- mel = mel[:mel_length, :].T
208
- np.save(mel_path, mel)
209
-
210
- export_metadata.append([wav_file_path, mel_path])
211
- if save_audio:
212
- ap.save_wav(wav, wav_path)
213
-
214
- if debug:
215
- print("Audio for debug saved at:", wav_gl_path)
216
- wav = ap.inv_melspectrogram(mel)
217
- ap.save_wav(wav, wav_gl_path)
218
-
219
- with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f:
220
- for data in export_metadata:
221
- f.write(f"{data[0]}|{data[1]+'.npy'}\n")
222
-
223
-
224
- def main(args): # pylint: disable=redefined-outer-name
225
- # pylint: disable=global-variable-undefined
226
- global meta_data, speaker_manager
227
-
228
- # Audio processor
229
- ap = AudioProcessor(**c.audio)
230
-
231
- # load data instances
232
- meta_data_train, meta_data_eval = load_tts_samples(
233
- c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
234
- )
235
-
236
- # use eval and training partitions
237
- meta_data = meta_data_train + meta_data_eval
238
-
239
- # init speaker manager
240
- if c.use_speaker_embedding:
241
- speaker_manager = SpeakerManager(data_items=meta_data)
242
- elif c.use_d_vector_file:
243
- speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
244
- else:
245
- speaker_manager = None
246
-
247
- # setup model
248
- model = setup_model(c)
249
-
250
- # restore model
251
- model.load_checkpoint(c, args.checkpoint_path, eval=True)
252
-
253
- if use_cuda:
254
- model.cuda()
255
-
256
- num_params = count_parameters(model)
257
- print("\n > Model has {} parameters".format(num_params), flush=True)
258
- # set r
259
- r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
260
- own_loader = setup_loader(ap, r, verbose=True)
261
-
262
- extract_spectrograms(
263
- own_loader,
264
- model,
265
- ap,
266
- args.output_path,
267
- quantize_bits=args.quantize_bits,
268
- save_audio=args.save_audio,
269
- debug=args.debug,
270
- metada_name="metada.txt",
271
- )
272
-
273
-
274
- if __name__ == "__main__":
275
- parser = argparse.ArgumentParser()
276
- parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
277
- parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
278
- parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
279
- parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
280
- parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
281
- parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
282
- parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
283
- args = parser.parse_args()
284
-
285
- c = load_config(args.config_path)
286
- c.audio.trim_silence = False
287
- main(args)
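The extractor stores ground-truth-aligned mels under `<output_path>/mel/` and lists `wav|mel.npy` pairs in `metada.txt` (the spelling used by the script). A sketch of iterating over those pairs, e.g. to feed a vocoder training set (paths are placeholders):

```
# Sketch: load the teacher-forced mels listed in the metafile written above.
import numpy as np

with open("output/metada.txt", encoding="utf-8") as f:
    pairs = [line.strip().split("|") for line in f if line.strip()]

for wav_path, mel_path in pairs[:3]:
    mel = np.load(mel_path)  # [n_mels, mel_frames], already trimmed to length
    print(wav_path, mel.shape)
```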
TTS/bin/find_unique_chars.py DELETED
@@ -1,45 +0,0 @@
1
- """Find all the unique characters in a dataset"""
2
- import argparse
3
- from argparse import RawTextHelpFormatter
4
-
5
- from TTS.config import load_config
6
- from TTS.tts.datasets import load_tts_samples
7
-
8
-
9
- def main():
10
- # pylint: disable=bad-option-value
11
- parser = argparse.ArgumentParser(
12
- description="""Find all the unique characters or phonemes in a dataset.\n\n"""
13
- """
14
- Example runs:
15
-
16
- python TTS/bin/find_unique_chars.py --config_path config.json
17
- """,
18
- formatter_class=RawTextHelpFormatter,
19
- )
20
- parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
21
- args = parser.parse_args()
22
-
23
- c = load_config(args.config_path)
24
-
25
- # load all datasets
26
- train_items, eval_items = load_tts_samples(
27
- c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
28
- )
29
-
30
- items = train_items + eval_items
31
-
32
- texts = "".join(item["text"] for item in items)
33
- chars = set(texts)
34
- lower_chars = filter(lambda c: c.islower(), chars)
35
- chars_force_lower = [c.lower() for c in chars]
36
- chars_force_lower = set(chars_force_lower)
37
-
38
- print(f" > Number of unique characters: {len(chars)}")
39
- print(f" > Unique characters: {''.join(sorted(chars))}")
40
- print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
41
- print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
42
-
43
-
44
- if __name__ == "__main__":
45
- main()
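The character inventory is just the set union over all transcripts; a toy sketch of the same computation:

```
# Sketch: the character inventory computed by the script above, on toy samples.
items = [{"text": "Bonjour."}, {"text": "Hello!"}]
chars = set("".join(item["text"] for item in items))
print(len(chars), "".join(sorted(chars)))
```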
TTS/bin/find_unique_phonemes.py DELETED
@@ -1,74 +0,0 @@
1
- """Find all the unique characters in a dataset"""
2
- import argparse
3
- import multiprocessing
4
- from argparse import RawTextHelpFormatter
5
-
6
- from tqdm.contrib.concurrent import process_map
7
-
8
- from TTS.config import load_config
9
- from TTS.tts.datasets import load_tts_samples
10
- from TTS.tts.utils.text.phonemizers import Gruut
11
-
12
-
13
- def compute_phonemes(item):
14
- text = item["text"]
15
- ph = phonemizer.phonemize(text).replace("|", "")
16
- return set(list(ph))
17
-
18
-
19
- def main():
20
- # pylint: disable=W0601
21
- global c, phonemizer
22
- # pylint: disable=bad-option-value
23
- parser = argparse.ArgumentParser(
24
- description="""Find all the unique characters or phonemes in a dataset.\n\n"""
25
- """
26
- Example runs:
27
-
28
- python TTS/bin/find_unique_phonemes.py --config_path config.json
29
- """,
30
- formatter_class=RawTextHelpFormatter,
31
- )
32
- parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
33
- args = parser.parse_args()
34
-
35
- c = load_config(args.config_path)
36
-
37
- # load all datasets
38
- train_items, eval_items = load_tts_samples(
39
- c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
40
- )
41
- items = train_items + eval_items
42
- print("Num items:", len(items))
43
-
44
- language_list = [item["language"] for item in items]
45
- is_lang_def = all(language_list)
46
-
47
- if not c.phoneme_language or not is_lang_def:
48
- raise ValueError("Phoneme language must be defined in config.")
49
-
50
- if not language_list.count(language_list[0]) == len(language_list):
51
- raise ValueError(
52
- "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
53
- )
54
-
55
- phonemizer = Gruut(language=language_list[0], keep_puncs=True)
56
-
57
- phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
58
- phones = []
59
- for ph in phonemes:
60
- phones.extend(ph)
61
-
62
- phones = set(phones)
63
- lower_phones = filter(lambda c: c.islower(), phones)
64
- phones_force_lower = [c.lower() for c in phones]
65
- phones_force_lower = set(phones_force_lower)
66
-
67
- print(f" > Number of unique phonemes: {len(phones)}")
68
- print(f" > Unique phonemes: {''.join(sorted(phones))}")
69
- print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
70
- print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
71
-
72
-
73
- if __name__ == "__main__":
74
- main()
TTS/bin/remove_silence_using_vad.py DELETED
@@ -1,124 +0,0 @@
1
- import argparse
2
- import glob
3
- import multiprocessing
4
- import os
5
- import pathlib
6
-
7
- import torch
8
- from tqdm import tqdm
9
-
10
- from TTS.utils.vad import get_vad_model_and_utils, remove_silence
11
-
12
- torch.set_num_threads(1)
13
-
14
-
15
- def adjust_path_and_remove_silence(audio_path):
16
- output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
17
- # ignore if the file exists
18
- if os.path.exists(output_path) and not args.force:
19
- return output_path, False
20
-
21
- # create all directory structure
22
- pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
23
- # remove the silence and save the audio
24
- output_path, is_speech = remove_silence(
25
- model_and_utils,
26
- audio_path,
27
- output_path,
28
- trim_just_beginning_and_end=args.trim_just_beginning_and_end,
29
- use_cuda=args.use_cuda,
30
- )
31
- return output_path, is_speech
32
-
33
-
34
- def preprocess_audios():
35
- files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
36
- print("> Number of files: ", len(files))
37
- if not args.force:
38
- print("> Ignoring files that already exist in the output idrectory.")
39
-
40
- if args.trim_just_beginning_and_end:
41
- print("> Trimming just the beginning and the end with nonspeech parts.")
42
- else:
43
- print("> Trimming all nonspeech parts.")
44
-
45
- filtered_files = []
46
- if files:
47
- # create threads
48
- # num_threads = multiprocessing.cpu_count()
49
- # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
50
-
51
- if args.num_processes > 1:
52
- with multiprocessing.Pool(processes=args.num_processes) as pool:
53
- results = list(
54
- tqdm(
55
- pool.imap_unordered(adjust_path_and_remove_silence, files),
56
- total=len(files),
57
- desc="Processing audio files",
58
- )
59
- )
60
- for output_path, is_speech in results:
61
- if not is_speech:
62
- filtered_files.append(output_path)
63
- else:
64
- for f in tqdm(files):
65
- output_path, is_speech = adjust_path_and_remove_silence(f)
66
- if not is_speech:
67
- filtered_files.append(output_path)
68
-
69
- # write files that do not have speech
70
- with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
71
- for file in filtered_files:
72
- f.write(str(file) + "\n")
73
- else:
74
- print("> No files Found !")
75
-
76
-
77
- if __name__ == "__main__":
78
- parser = argparse.ArgumentParser(
79
- description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
80
- )
81
- parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
82
- parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
83
- parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
84
- parser.add_argument(
85
- "-g",
86
- "--glob",
87
- type=str,
88
- default="**/*.wav",
89
- help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav",
90
- )
91
- parser.add_argument(
92
- "-t",
93
- "--trim_just_beginning_and_end",
94
- type=bool,
95
- default=True,
96
- help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
97
- )
98
- parser.add_argument(
99
- "-c",
100
- "--use_cuda",
101
- type=bool,
102
- default=False,
103
- help="If True use cuda",
104
- )
105
- parser.add_argument(
106
- "--use_onnx",
107
- type=bool,
108
- default=False,
109
- help="If True use onnx",
110
- )
111
- parser.add_argument(
112
- "--num_processes",
113
- type=int,
114
- default=1,
115
- help="Number of processes to use",
116
- )
117
- args = parser.parse_args()
118
-
119
- if args.output_dir == "":
120
- args.output_dir = args.input_dir
121
-
122
- # load the model and utils
123
- model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
124
- preprocess_audios()
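Besides the CLI shown in the argparse description, the two VAD helpers can be called directly for a single file. A sketch, assuming an older checkout where `TTS.utils.vad` still exists and using placeholder paths:

```
# Sketch: trim leading/trailing silence from one file with the same helpers.
from TTS.utils.vad import get_vad_model_and_utils, remove_silence

model_and_utils = get_vad_model_and_utils(use_cuda=False, use_onnx=False)
out_path, is_speech = remove_silence(
    model_and_utils,
    "in.wav",
    "out.wav",
    trim_just_beginning_and_end=True,
    use_cuda=False,
)
print(out_path, "speech found" if is_speech else "no speech detected")
```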
TTS/bin/resample.py DELETED
@@ -1,90 +0,0 @@
1
- import argparse
2
- import glob
3
- import os
4
- from argparse import RawTextHelpFormatter
5
- from multiprocessing import Pool
6
- from shutil import copytree
7
-
8
- import librosa
9
- import soundfile as sf
10
- from tqdm import tqdm
11
-
12
-
13
- def resample_file(func_args):
14
- filename, output_sr = func_args
15
- y, sr = librosa.load(filename, sr=output_sr)
16
- sf.write(filename, y, sr)
17
-
18
-
19
- def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
20
- if output_dir:
21
- print("Recursively copying the input folder...")
22
- copytree(input_dir, output_dir)
23
- input_dir = output_dir
24
-
25
- print("Resampling the audio files...")
26
- audio_files = glob.glob(os.path.join(input_dir, f"**/*.{file_ext}"), recursive=True)
27
- print(f"Found {len(audio_files)} files...")
28
- audio_files = list(zip(audio_files, len(audio_files) * [output_sr]))
29
- with Pool(processes=n_jobs) as p:
30
- with tqdm(total=len(audio_files)) as pbar:
31
- for _, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
32
- pbar.update()
33
-
34
- print("Done !")
35
-
36
-
37
- if __name__ == "__main__":
38
- parser = argparse.ArgumentParser(
39
- description="""Resample a folder recusively with librosa
40
- Can be used in place or create a copy of the folder as an output.\n\n
41
- Example run:
42
- python TTS/bin/resample.py
43
- --input_dir /root/LJSpeech-1.1/
44
- --output_sr 22050
45
- --output_dir /root/resampled_LJSpeech-1.1/
46
- --file_ext wav
47
- --n_jobs 24
48
- """,
49
- formatter_class=RawTextHelpFormatter,
50
- )
51
-
52
- parser.add_argument(
53
- "--input_dir",
54
- type=str,
55
- default=None,
56
- required=True,
57
- help="Path of the folder containing the audio files to resample",
58
- )
59
-
60
- parser.add_argument(
61
- "--output_sr",
62
- type=int,
63
- default=22050,
64
- required=False,
65
- help="Samlple rate to which the audio files should be resampled",
66
- )
67
-
68
- parser.add_argument(
69
- "--output_dir",
70
- type=str,
71
- default=None,
72
- required=False,
73
- help="Path of the destination folder. If not defined, the operation is done in place",
74
- )
75
-
76
- parser.add_argument(
77
- "--file_ext",
78
- type=str,
79
- default="wav",
80
- required=False,
81
- help="Extension of the audio files to resample",
82
- )
83
-
84
- parser.add_argument(
85
- "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores"
86
- )
87
-
88
- args = parser.parse_args()
89
-
90
- resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
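`resample_files` can also be driven programmatically instead of through the CLI. A sketch with placeholder paths, assuming an older checkout where `TTS/bin/resample.py` exists:

```
# Sketch: copy a dataset and resample the copy to 16 kHz using the function above.
from TTS.bin.resample import resample_files

resample_files(
    input_dir="/data/LJSpeech-1.1",
    output_sr=16000,
    output_dir="/data/LJSpeech-1.1-16k",  # omit to resample in place
    file_ext="wav",
    n_jobs=8,
)
```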
TTS/bin/synthesize.py DELETED
@@ -1,494 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- import argparse
5
- import contextlib
6
- import sys
7
- from argparse import RawTextHelpFormatter
8
-
9
- # pylint: disable=redefined-outer-name, unused-argument
10
- from pathlib import Path
11
-
12
- description = """
13
- Synthesize speech on command line.
14
-
15
- You can either use your trained model or choose a model from the provided list.
16
-
17
- If you don't specify any models, it uses the LJSpeech-based English model.
18
-
19
- #### Single Speaker Models
20
-
21
- - List provided models:
22
-
23
- ```
24
- $ tts --list_models
25
- ```
26
-
27
- - Get model info (for both tts_models and vocoder_models):
28
-
29
- - Query by type/name:
30
- The model_info_by_name uses the name as it appears in the output of --list_models.
31
- ```
32
- $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
33
- ```
34
- For example:
35
- ```
36
- $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
37
- $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
38
- ```
39
- - Query by type/idx:
40
- The model_query_idx uses the corresponding idx from --list_models.
41
-
42
- ```
43
- $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
44
- ```
45
-
46
- For example:
47
-
48
- ```
49
- $ tts --model_info_by_idx tts_models/3
50
- ```
51
-
52
- - Query model info by its full name:
53
- ```
54
- $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
55
- ```
56
-
57
- - Run TTS with default models:
58
-
59
- ```
60
- $ tts --text "Text for TTS" --out_path output/path/speech.wav
61
- ```
62
-
63
- - Run TTS and pipe out the generated TTS wav file data:
64
-
65
- ```
66
- $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
67
- ```
68
-
69
- - Run a TTS model with its default vocoder model:
70
-
71
- ```
72
- $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
73
- ```
74
-
75
- For example:
76
-
77
- ```
78
- $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
79
- ```
80
-
81
- - Run with specific TTS and vocoder models from the list:
82
-
83
- ```
84
- $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
85
- ```
86
-
87
- For example:
88
-
89
- ```
90
- $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
91
- ```
92
-
93
- - Run your own TTS model (Using Griffin-Lim Vocoder):
94
-
95
- ```
96
- $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
97
- ```
98
-
99
- - Run your own TTS and Vocoder models:
100
-
101
- ```
102
- $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
103
- --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
104
- ```
105
-
106
- #### Multi-speaker Models
107
-
108
- - List the available speakers and choose a <speaker_id> among them:
109
-
110
- ```
111
- $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
112
- ```
113
-
114
- - Run the multi-speaker TTS model with the target speaker ID:
115
-
116
- ```
117
- $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
118
- ```
119
-
120
- - Run your own multi-speaker TTS model:
121
-
122
- ```
123
- $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
124
- ```
125
-
126
- ### Voice Conversion Models
127
-
128
- ```
129
- $ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
130
- ```
131
- """
132
-
133
-
134
- def str2bool(v):
135
- if isinstance(v, bool):
136
- return v
137
- if v.lower() in ("yes", "true", "t", "y", "1"):
138
- return True
139
- if v.lower() in ("no", "false", "f", "n", "0"):
140
- return False
141
- raise argparse.ArgumentTypeError("Boolean value expected.")
142
-
143
-
144
- def main():
145
- parser = argparse.ArgumentParser(
146
- description=description.replace(" ```\n", ""),
147
- formatter_class=RawTextHelpFormatter,
148
- )
149
-
150
- parser.add_argument(
151
- "--list_models",
152
- type=str2bool,
153
- nargs="?",
154
- const=True,
155
- default=False,
156
- help="list available pre-trained TTS and vocoder models.",
157
- )
158
-
159
- parser.add_argument(
160
- "--model_info_by_idx",
161
- type=str,
162
- default=None,
163
- help="model info using query format: <model_type>/<model_query_idx>",
164
- )
165
-
166
- parser.add_argument(
167
- "--model_info_by_name",
168
- type=str,
169
- default=None,
170
- help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
171
- )
172
-
173
- parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")
174
-
175
- # Args for running pre-trained TTS models.
176
- parser.add_argument(
177
- "--model_name",
178
- type=str,
179
- default="tts_models/en/ljspeech/tacotron2-DDC",
180
- help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
181
- )
182
- parser.add_argument(
183
- "--vocoder_name",
184
- type=str,
185
- default=None,
186
- help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
187
- )
188
-
189
- # Args for running custom models
190
- parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
191
- parser.add_argument(
192
- "--model_path",
193
- type=str,
194
- default=None,
195
- help="Path to model file.",
196
- )
197
- parser.add_argument(
198
- "--out_path",
199
- type=str,
200
- default="tts_output.wav",
201
- help="Output wav file path.",
202
- )
203
- parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
204
- parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
205
- parser.add_argument(
206
- "--vocoder_path",
207
- type=str,
208
- help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
209
- default=None,
210
- )
211
- parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
212
- parser.add_argument(
213
- "--encoder_path",
214
- type=str,
215
- help="Path to speaker encoder model file.",
216
- default=None,
217
- )
218
- parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
219
- parser.add_argument(
220
- "--pipe_out",
221
- help="stdout the generated TTS wav file for shell pipe.",
222
- type=str2bool,
223
- nargs="?",
224
- const=True,
225
- default=False,
226
- )
227
-
228
- # args for multi-speaker synthesis
229
- parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
230
- parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
231
- parser.add_argument(
232
- "--speaker_idx",
233
- type=str,
234
- help="Target speaker ID for a multi-speaker TTS model.",
235
- default=None,
236
- )
237
- parser.add_argument(
238
- "--language_idx",
239
- type=str,
240
- help="Target language ID for a multi-lingual TTS model.",
241
- default=None,
242
- )
243
- parser.add_argument(
244
- "--speaker_wav",
245
- nargs="+",
246
- help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
247
- default=None,
248
- )
249
- parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
250
- parser.add_argument(
251
- "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
252
- )
253
- parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
254
- parser.add_argument(
255
- "--list_speaker_idxs",
256
- help="List available speaker ids for the defined multi-speaker model.",
257
- type=str2bool,
258
- nargs="?",
259
- const=True,
260
- default=False,
261
- )
262
- parser.add_argument(
263
- "--list_language_idxs",
264
- help="List available language ids for the defined multi-lingual model.",
265
- type=str2bool,
266
- nargs="?",
267
- const=True,
268
- default=False,
269
- )
270
- # aux args
271
- parser.add_argument(
272
- "--save_spectogram",
273
- type=bool,
274
- help="If true save raw spectogram for further (vocoder) processing in out_path.",
275
- default=False,
276
- )
277
- parser.add_argument(
278
- "--reference_wav",
279
- type=str,
280
- help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
281
- default=None,
282
- )
283
- parser.add_argument(
284
- "--reference_speaker_idx",
285
- type=str,
286
- help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
287
- default=None,
288
- )
289
- parser.add_argument(
290
- "--progress_bar",
291
- type=str2bool,
292
- help="If true shows a progress bar for the model download. Defaults to True",
293
- default=True,
294
- )
295
-
296
- # voice conversion args
297
- parser.add_argument(
298
- "--source_wav",
299
- type=str,
300
- default=None,
301
- help="Original audio file to convert in the voice of the target_wav",
302
- )
303
- parser.add_argument(
304
- "--target_wav",
305
- type=str,
306
- default=None,
307
- help="Target audio file to convert in the voice of the source_wav",
308
- )
309
-
310
- parser.add_argument(
311
- "--voice_dir",
312
- type=str,
313
- default=None,
314
- help="Voice dir for tortoise model",
315
- )
316
-
317
- args = parser.parse_args()
318
-
319
- # print the description if either text or list_models is not set
320
- check_args = [
321
- args.text,
322
- args.list_models,
323
- args.list_speaker_idxs,
324
- args.list_language_idxs,
325
- args.reference_wav,
326
- args.model_info_by_idx,
327
- args.model_info_by_name,
328
- args.source_wav,
329
- args.target_wav,
330
- ]
331
- if not any(check_args):
332
- parser.parse_args(["-h"])
333
-
334
- pipe_out = sys.stdout if args.pipe_out else None
335
-
336
- with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
337
- # Late-import to make things load faster
338
- from TTS.api import TTS
339
- from TTS.utils.manage import ModelManager
340
- from TTS.utils.synthesizer import Synthesizer
341
-
342
- # load model manager
343
- path = Path(__file__).parent / "../.models.json"
344
- manager = ModelManager(path, progress_bar=args.progress_bar)
345
- api = TTS()
346
-
347
- tts_path = None
348
- tts_config_path = None
349
- speakers_file_path = None
350
- language_ids_file_path = None
351
- vocoder_path = None
352
- vocoder_config_path = None
353
- encoder_path = None
354
- encoder_config_path = None
355
- vc_path = None
356
- vc_config_path = None
357
- model_dir = None
358
-
359
- # CASE1 #list : list pre-trained TTS models
360
- if args.list_models:
361
- manager.list_models()
362
- sys.exit()
363
-
364
- # CASE2 #info : model info for pre-trained TTS models
365
- if args.model_info_by_idx:
366
- model_query = args.model_info_by_idx
367
- manager.model_info_by_idx(model_query)
368
- sys.exit()
369
-
370
- if args.model_info_by_name:
371
- model_query_full_name = args.model_info_by_name
372
- manager.model_info_by_full_name(model_query_full_name)
373
- sys.exit()
374
-
375
- # CASE3: load pre-trained model paths
376
- if args.model_name is not None and not args.model_path:
377
- model_path, config_path, model_item = manager.download_model(args.model_name)
378
- # tts model
379
- if model_item["model_type"] == "tts_models":
380
- tts_path = model_path
381
- tts_config_path = config_path
382
- if "default_vocoder" in model_item:
383
- args.vocoder_name = (
384
- model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
385
- )
386
-
387
- # voice conversion model
388
- if model_item["model_type"] == "voice_conversion_models":
389
- vc_path = model_path
390
- vc_config_path = config_path
391
-
392
- # tts model with multiple files to be loaded from the directory path
393
- if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
394
- model_dir = model_path
395
- tts_path = None
396
- tts_config_path = None
397
- args.vocoder_name = None
398
-
399
- # load vocoder
400
- if args.vocoder_name is not None and not args.vocoder_path:
401
- vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
402
-
403
- # CASE4: set custom model paths
404
- if args.model_path is not None:
405
- tts_path = args.model_path
406
- tts_config_path = args.config_path
407
- speakers_file_path = args.speakers_file_path
408
- language_ids_file_path = args.language_ids_file_path
409
-
410
- if args.vocoder_path is not None:
411
- vocoder_path = args.vocoder_path
412
- vocoder_config_path = args.vocoder_config_path
413
-
414
- if args.encoder_path is not None:
415
- encoder_path = args.encoder_path
416
- encoder_config_path = args.encoder_config_path
417
-
418
- device = args.device
419
- if args.use_cuda:
420
- device = "cuda"
421
-
422
- # load models
423
- synthesizer = Synthesizer(
424
- tts_path,
425
- tts_config_path,
426
- speakers_file_path,
427
- language_ids_file_path,
428
- vocoder_path,
429
- vocoder_config_path,
430
- encoder_path,
431
- encoder_config_path,
432
- vc_path,
433
- vc_config_path,
434
- model_dir,
435
- args.voice_dir,
436
- ).to(device)
437
-
438
- # query speaker ids of a multi-speaker model.
439
- if args.list_speaker_idxs:
440
- print(
441
- " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
442
- )
443
- print(synthesizer.tts_model.speaker_manager.name_to_id)
444
- return
445
-
446
- # query language ids of a multi-lingual model.
447
- if args.list_language_idxs:
448
- print(
449
- " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
450
- )
451
- print(synthesizer.tts_model.language_manager.name_to_id)
452
- return
453
-
454
- # check the arguments against a multi-speaker model.
455
- if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
456
- print(
457
- " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
458
- "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
459
- )
460
- return
461
-
462
- # RUN THE SYNTHESIS
463
- if args.text:
464
- print(" > Text: {}".format(args.text))
465
-
466
- # kick it
467
- if tts_path is not None:
468
- wav = synthesizer.tts(
469
- args.text,
470
- speaker_name=args.speaker_idx,
471
- language_name=args.language_idx,
472
- speaker_wav=args.speaker_wav,
473
- reference_wav=args.reference_wav,
474
- style_wav=args.capacitron_style_wav,
475
- style_text=args.capacitron_style_text,
476
- reference_speaker_name=args.reference_speaker_idx,
477
- )
478
- elif vc_path is not None:
479
- wav = synthesizer.voice_conversion(
480
- source_wav=args.source_wav,
481
- target_wav=args.target_wav,
482
- )
483
- elif model_dir is not None:
484
- wav = synthesizer.tts(
485
- args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
486
- )
487
-
488
- # save the results
489
- print(" > Saving output to {}".format(args.out_path))
490
- synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
491
-
492
-
493
- if __name__ == "__main__":
494
- main()
 
TTS/bin/train_encoder.py DELETED
@@ -1,332 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- import os
5
- import sys
6
- import time
7
- import traceback
8
-
9
- import torch
10
- from torch.utils.data import DataLoader
11
- from trainer.io import copy_model_files, save_best_model, save_checkpoint
12
- from trainer.torch import NoamLR
13
- from trainer.trainer_utils import get_optimizer
14
-
15
- from TTS.encoder.dataset import EncoderDataset
16
- from TTS.encoder.utils.generic_utils import setup_encoder_model
17
- from TTS.encoder.utils.training import init_training
18
- from TTS.encoder.utils.visual import plot_embeddings
19
- from TTS.tts.datasets import load_tts_samples
20
- from TTS.utils.audio import AudioProcessor
21
- from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
22
- from TTS.utils.samplers import PerfectBatchSampler
23
- from TTS.utils.training import check_update
24
-
25
- torch.backends.cudnn.enabled = True
26
- torch.backends.cudnn.benchmark = True
27
- torch.manual_seed(54321)
28
- use_cuda = torch.cuda.is_available()
29
- num_gpus = torch.cuda.device_count()
30
- print(" > Using CUDA: ", use_cuda)
31
- print(" > Number of GPUs: ", num_gpus)
32
-
33
-
34
- def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
35
- num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
36
- num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch
37
-
38
- dataset = EncoderDataset(
39
- c,
40
- ap,
41
- meta_data_eval if is_val else meta_data_train,
42
- voice_len=c.voice_len,
43
- num_utter_per_class=num_utter_per_class,
44
- num_classes_in_batch=num_classes_in_batch,
45
- verbose=verbose,
46
- augmentation_config=c.audio_augmentation if not is_val else None,
47
- use_torch_spec=c.model_params.get("use_torch_spec", False),
48
- )
49
- # get classes list
50
- classes = dataset.get_class_list()
51
-
52
- sampler = PerfectBatchSampler(
53
- dataset.items,
54
- classes,
55
- batch_size=num_classes_in_batch * num_utter_per_class, # total batch size
56
- num_classes_in_batch=num_classes_in_batch,
57
- num_gpus=1,
58
- shuffle=not is_val,
59
- drop_last=True,
60
- )
61
-
62
- if len(classes) < num_classes_in_batch:
63
- if is_val:
64
- raise RuntimeError(
65
- f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
66
- )
67
- raise RuntimeError(
68
- f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
69
- )
70
-
71
- # set the classes to avoid getting a wrong class_id when the numbers of training and eval classes are not equal
72
- if is_val:
73
- dataset.set_classes(train_classes)
74
-
75
- loader = DataLoader(
76
- dataset,
77
- num_workers=c.num_loader_workers,
78
- batch_sampler=sampler,
79
- collate_fn=dataset.collate_fn,
80
- )
81
-
82
- return loader, classes, dataset.get_map_classid_to_classname()
83
-
84
-
85
- def evaluation(model, criterion, data_loader, global_step):
86
- eval_loss = 0
87
- for _, data in enumerate(data_loader):
88
- with torch.no_grad():
89
- # setup input data
90
- inputs, labels = data
91
-
92
- # group samples of each class in the batch. the perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
93
- labels = torch.transpose(
94
- labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
95
- ).reshape(labels.shape)
96
- inputs = torch.transpose(
97
- inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
98
- ).reshape(inputs.shape)
99
-
100
- # dispatch data to GPU
101
- if use_cuda:
102
- inputs = inputs.cuda(non_blocking=True)
103
- labels = labels.cuda(non_blocking=True)
104
-
105
- # forward pass model
106
- outputs = model(inputs)
107
-
108
- # loss computation
109
- loss = criterion(
110
- outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
111
- )
112
-
113
- eval_loss += loss.item()
114
-
115
- eval_avg_loss = eval_loss / len(data_loader)
116
- # save stats
117
- dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
118
- # plot the last batch in the evaluation
119
- figures = {
120
- "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
121
- }
122
- dashboard_logger.eval_figures(global_step, figures)
123
- return eval_avg_loss
124
-
125
-
126
- def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
127
- model.train()
128
- best_loss = {"train_loss": None, "eval_loss": float("inf")}
129
- avg_loader_time = 0
130
- end_time = time.time()
131
- for epoch in range(c.epochs):
132
- tot_loss = 0
133
- epoch_time = 0
134
- for _, data in enumerate(data_loader):
135
- start_time = time.time()
136
-
137
- # setup input data
138
- inputs, labels = data
139
- # group samples of each class in the batch. the perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
140
- labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
141
- labels.shape
142
- )
143
- inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
144
- inputs.shape
145
- )
146
- # ToDo: move it to a unit test
147
- # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
148
- # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
149
- # idx = 0
150
- # for j in range(0, c.num_classes_in_batch, 1):
151
- # for i in range(j, len(labels), c.num_classes_in_batch):
152
- # if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
153
- # print("Invalid")
154
- # print(labels)
155
- # exit()
156
- # idx += 1
157
- # labels = labels_converted
158
- # inputs = inputs_converted
159
-
160
- loader_time = time.time() - end_time
161
- global_step += 1
162
-
163
- # setup lr
164
- if c.lr_decay:
165
- scheduler.step()
166
- optimizer.zero_grad()
167
-
168
- # dispatch data to GPU
169
- if use_cuda:
170
- inputs = inputs.cuda(non_blocking=True)
171
- labels = labels.cuda(non_blocking=True)
172
-
173
- # forward pass model
174
- outputs = model(inputs)
175
-
176
- # loss computation
177
- loss = criterion(
178
- outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
179
- )
180
- loss.backward()
181
- grad_norm, _ = check_update(model, c.grad_clip)
182
- optimizer.step()
183
-
184
- step_time = time.time() - start_time
185
- epoch_time += step_time
186
-
187
- # accumulate the total epoch loss
188
- tot_loss += loss.item()
189
-
190
- # Averaged Loader Time
191
- num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
192
- avg_loader_time = (
193
- 1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
194
- if avg_loader_time != 0
195
- else loader_time
196
- )
197
- current_lr = optimizer.param_groups[0]["lr"]
198
-
199
- if global_step % c.steps_plot_stats == 0:
200
- # Plot Training Epoch Stats
201
- train_stats = {
202
- "loss": loss.item(),
203
- "lr": current_lr,
204
- "grad_norm": grad_norm,
205
- "step_time": step_time,
206
- "avg_loader_time": avg_loader_time,
207
- }
208
- dashboard_logger.train_epoch_stats(global_step, train_stats)
209
- figures = {
210
- "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
211
- }
212
- dashboard_logger.train_figures(global_step, figures)
213
-
214
- if global_step % c.print_step == 0:
215
- print(
216
- " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
217
- "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
218
- global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr
219
- ),
220
- flush=True,
221
- )
222
-
223
- if global_step % c.save_step == 0:
224
- # save model
225
- save_checkpoint(
226
- c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict()
227
- )
228
-
229
- end_time = time.time()
230
-
231
- print("")
232
- print(
233
- ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
234
- "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
235
- epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
236
- ),
237
- flush=True,
238
- )
239
- # evaluation
240
- if c.run_eval:
241
- model.eval()
242
- eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
243
- print("\n\n")
244
- print("--> EVAL PERFORMANCE")
245
- print(
246
- " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
247
- flush=True,
248
- )
249
- # save the best checkpoint
250
- best_loss = save_best_model(
251
- {"train_loss": None, "eval_loss": eval_loss},
252
- best_loss,
253
- c,
254
- model,
255
- optimizer,
256
- None,
257
- global_step,
258
- epoch,
259
- OUT_PATH,
260
- criterion=criterion.state_dict(),
261
- )
262
- model.train()
263
-
264
- return best_loss, global_step
265
-
266
-
267
- def main(args): # pylint: disable=redefined-outer-name
268
- # pylint: disable=global-variable-undefined
269
- global meta_data_train
270
- global meta_data_eval
271
- global train_classes
272
-
273
- ap = AudioProcessor(**c.audio)
274
- model = setup_encoder_model(c)
275
-
276
- optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)
277
-
278
- # pylint: disable=redefined-outer-name
279
- meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)
280
-
281
- train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
282
- if c.run_eval:
283
- eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
284
- else:
285
- eval_data_loader = None
286
-
287
- num_classes = len(train_classes)
288
- criterion = model.get_criterion(c, num_classes)
289
-
290
- if c.loss == "softmaxproto" and c.model != "speaker_encoder":
291
- c.map_classid_to_classname = map_classid_to_classname
292
- copy_model_files(c, OUT_PATH, new_fields={})
293
-
294
- if args.restore_path:
295
- criterion, args.restore_step = model.load_checkpoint(
296
- c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
297
- )
298
- print(" > Model restored from step %d" % args.restore_step, flush=True)
299
- else:
300
- args.restore_step = 0
301
-
302
- if c.lr_decay:
303
- scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
304
- else:
305
- scheduler = None
306
-
307
- num_params = count_parameters(model)
308
- print("\n > Model has {} parameters".format(num_params), flush=True)
309
-
310
- if use_cuda:
311
- model = model.cuda()
312
- criterion.cuda()
313
-
314
- global_step = args.restore_step
315
- _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)
316
-
317
-
318
- if __name__ == "__main__":
319
- args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()
320
-
321
- try:
322
- main(args)
323
- except KeyboardInterrupt:
324
- remove_experiment_folder(OUT_PATH)
325
- try:
326
- sys.exit(0)
327
- except SystemExit:
328
- os._exit(0) # pylint: disable=protected-access
329
- except Exception: # pylint: disable=broad-except
330
- remove_experiment_folder(OUT_PATH)
331
- traceback.print_exc()
332
- sys.exit(1)
 
TTS/bin/train_tts.py DELETED
@@ -1,71 +0,0 @@
1
- import os
2
- from dataclasses import dataclass, field
3
-
4
- from trainer import Trainer, TrainerArgs
5
-
6
- from TTS.config import load_config, register_config
7
- from TTS.tts.datasets import load_tts_samples
8
- from TTS.tts.models import setup_model
9
-
10
-
11
- @dataclass
12
- class TrainTTSArgs(TrainerArgs):
13
- config_path: str = field(default=None, metadata={"help": "Path to the config file."})
14
-
15
-
16
- def main():
17
- """Run `tts` model training directly by a `config.json` file."""
18
- # init trainer args
19
- train_args = TrainTTSArgs()
20
- parser = train_args.init_argparse(arg_prefix="")
21
-
22
- # override trainer args from command-line args
23
- args, config_overrides = parser.parse_known_args()
24
- train_args.parse_args(args)
25
-
26
- # load config.json and register
27
- if args.config_path or args.continue_path:
28
- if args.config_path:
29
- # init from a file
30
- config = load_config(args.config_path)
31
- if len(config_overrides) > 0:
32
- config.parse_known_args(config_overrides, relaxed_parser=True)
33
- elif args.continue_path:
34
- # continue from a prev experiment
35
- config = load_config(os.path.join(args.continue_path, "config.json"))
36
- if len(config_overrides) > 0:
37
- config.parse_known_args(config_overrides, relaxed_parser=True)
38
- else:
39
- # init from console args
40
- from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
41
-
42
- config_base = BaseTrainingConfig()
43
- config_base.parse_known_args(config_overrides)
44
- config = register_config(config_base.model)()
45
-
46
- # load training samples
47
- train_samples, eval_samples = load_tts_samples(
48
- config.datasets,
49
- eval_split=True,
50
- eval_split_max_size=config.eval_split_max_size,
51
- eval_split_size=config.eval_split_size,
52
- )
53
-
54
- # init the model from config
55
- model = setup_model(config, train_samples + eval_samples)
56
-
57
- # init the trainer and 🚀
58
- trainer = Trainer(
59
- train_args,
60
- model.config,
61
- config.output_path,
62
- model=model,
63
- train_samples=train_samples,
64
- eval_samples=eval_samples,
65
- parse_command_line_args=False,
66
- )
67
- trainer.fit()
68
-
69
-
70
- if __name__ == "__main__":
71
- main()
 
TTS/bin/train_vocoder.py DELETED
@@ -1,77 +0,0 @@
1
- import os
2
- from dataclasses import dataclass, field
3
-
4
- from trainer import Trainer, TrainerArgs
5
-
6
- from TTS.config import load_config, register_config
7
- from TTS.utils.audio import AudioProcessor
8
- from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
9
- from TTS.vocoder.models import setup_model
10
-
11
-
12
- @dataclass
13
- class TrainVocoderArgs(TrainerArgs):
14
- config_path: str = field(default=None, metadata={"help": "Path to the config file."})
15
-
16
-
17
- def main():
18
- """Run `tts` model training directly by a `config.json` file."""
19
- # init trainer args
20
- train_args = TrainVocoderArgs()
21
- parser = train_args.init_argparse(arg_prefix="")
22
-
23
- # override trainer args from command-line args
24
- args, config_overrides = parser.parse_known_args()
25
- train_args.parse_args(args)
26
-
27
- # load config.json and register
28
- if args.config_path or args.continue_path:
29
- if args.config_path:
30
- # init from a file
31
- config = load_config(args.config_path)
32
- if len(config_overrides) > 0:
33
- config.parse_known_args(config_overrides, relaxed_parser=True)
34
- elif args.continue_path:
35
- # continue from a prev experiment
36
- config = load_config(os.path.join(args.continue_path, "config.json"))
37
- if len(config_overrides) > 0:
38
- config.parse_known_args(config_overrides, relaxed_parser=True)
39
- else:
40
- # init from console args
41
- from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
42
-
43
- config_base = BaseTrainingConfig()
44
- config_base.parse_known_args(config_overrides)
45
- config = register_config(config_base.model)()
46
-
47
- # load training samples
48
- if "feature_path" in config and config.feature_path:
49
- # load pre-computed features
50
- print(f" > Loading features from: {config.feature_path}")
51
- eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
52
- else:
53
- # load raw wav files
54
- eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
55
-
56
- # setup audio processor
57
- ap = AudioProcessor(**config.audio)
58
-
59
- # init the model from config
60
- model = setup_model(config)
61
-
62
- # init the trainer and 🚀
63
- trainer = Trainer(
64
- train_args,
65
- config,
66
- config.output_path,
67
- model=model,
68
- train_samples=train_samples,
69
- eval_samples=eval_samples,
70
- training_assets={"audio_processor": ap},
71
- parse_command_line_args=False,
72
- )
73
- trainer.fit()
74
-
75
-
76
- if __name__ == "__main__":
77
- main()
 
TTS/bin/tune_wavegrad.py DELETED
@@ -1,103 +0,0 @@
1
- """Search a good noise schedule for WaveGrad for a given number of inference iterations"""
2
- import argparse
3
- from itertools import product as cartesian_product
4
-
5
- import numpy as np
6
- import torch
7
- from torch.utils.data import DataLoader
8
- from tqdm import tqdm
9
-
10
- from TTS.config import load_config
11
- from TTS.utils.audio import AudioProcessor
12
- from TTS.vocoder.datasets.preprocess import load_wav_data
13
- from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
14
- from TTS.vocoder.models import setup_model
15
-
16
- if __name__ == "__main__":
17
- parser = argparse.ArgumentParser()
18
- parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
19
- parser.add_argument("--config_path", type=str, help="Path to model config file.")
20
- parser.add_argument("--data_path", type=str, help="Path to data directory.")
21
- parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
22
- parser.add_argument(
23
- "--num_iter",
24
- type=int,
25
- help="Number of model inference iterations that you like to optimize noise schedule for.",
26
- )
27
- parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.")
28
- parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.")
29
- parser.add_argument(
30
- "--search_depth",
31
- type=int,
32
- default=3,
33
- help="Search granularity. Increasing this increases the run-time exponentially.",
34
- )
35
-
36
- # load config
37
- args = parser.parse_args()
38
- config = load_config(args.config_path)
39
-
40
- # setup audio processor
41
- ap = AudioProcessor(**config.audio)
42
-
43
- # load dataset
44
- _, train_data = load_wav_data(args.data_path, 0)
45
- train_data = train_data[: args.num_samples]
46
- dataset = WaveGradDataset(
47
- ap=ap,
48
- items=train_data,
49
- seq_len=-1,
50
- hop_len=ap.hop_length,
51
- pad_short=config.pad_short,
52
- conv_pad=config.conv_pad,
53
- is_training=True,
54
- return_segments=False,
55
- use_noise_augment=False,
56
- use_cache=False,
57
- verbose=True,
58
- )
59
- loader = DataLoader(
60
- dataset,
61
- batch_size=1,
62
- shuffle=False,
63
- collate_fn=dataset.collate_full_clips,
64
- drop_last=False,
65
- num_workers=config.num_loader_workers,
66
- pin_memory=False,
67
- )
68
-
69
- # setup the model
70
- model = setup_model(config)
71
- if args.use_cuda:
72
- model.cuda()
73
-
74
- # setup optimization parameters
75
- base_values = sorted(10 * np.random.uniform(size=args.search_depth))
76
- print(f" > base values: {base_values}")
77
- exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
78
- best_error = float("inf")
79
- best_schedule = None # pylint: disable=C0103
80
- total_search_iter = len(base_values) ** args.num_iter
81
- for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
82
- beta = exponents * base
83
- model.compute_noise_level(beta)
84
- for data in loader:
85
- mel, audio = data
86
- y_hat = model.inference(mel.cuda() if args.use_cuda else mel)
87
-
88
- if args.use_cuda:
89
- y_hat = y_hat.cpu()
90
- y_hat = y_hat.numpy()
91
-
92
- mel_hat = []
93
- for i in range(y_hat.shape[0]):
94
- m = ap.melspectrogram(y_hat[i, 0])[:, :-1]
95
- mel_hat.append(torch.from_numpy(m))
96
-
97
- mel_hat = torch.stack(mel_hat)
98
- mse = torch.sum((mel - mel_hat) ** 2).mean()
99
- if mse.item() < best_error:
100
- best_error = mse.item()
101
- best_schedule = {"beta": beta}
102
- print(f" > Found a better schedule. - MSE: {mse.item()}")
103
- np.save(args.output_path, best_schedule)
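
For context, the schedule search above scores every candidate of the form beta = base * 10**linspace(-6, -1, num_iter) and keeps the one with the lowest mel-spectrogram MSE. A minimal sketch of how one such candidate schedule is built (the base value below is illustrative, not a script default):

    import numpy as np

    num_iter = 6                                         # number of inference iterations to tune for (illustrative)
    exponents = 10 ** np.linspace(-6, -1, num=num_iter)  # log-spaced exponents, as in the script above
    base = 3.2                                           # one candidate base; the script samples these at random
    beta = exponents * base                              # candidate noise schedule passed to model.compute_noise_level
    print(beta)                                          # increasing betas, from ~3e-6 * base up to ~0.1 * base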
 
TTS/config/__init__.py DELETED
@@ -1,135 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from typing import Dict
5
-
6
- import fsspec
7
- import yaml
8
- from coqpit import Coqpit
9
-
10
- from TTS.config.shared_configs import *
11
- from TTS.utils.generic_utils import find_module
12
-
13
-
14
- def read_json_with_comments(json_path):
15
- """for backward compat."""
16
- # fallback to json
17
- with fsspec.open(json_path, "r", encoding="utf-8") as f:
18
- input_str = f.read()
19
- # handle comments but not urls with //
20
- input_str = re.sub(r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str)
21
- return json.loads(input_str)
22
-
23
- def register_config(model_name: str) -> Coqpit:
24
- """Find the right config for the given model name.
25
-
26
- Args:
27
- model_name (str): Model name.
28
-
29
- Raises:
30
- ModuleNotFoundError: No matching config for the model name.
31
-
32
- Returns:
33
- Coqpit: config class.
34
- """
35
- config_class = None
36
- config_name = model_name + "_config"
37
-
38
- # TODO: fix this
39
- if model_name == "xtts":
40
- from TTS.tts.configs.xtts_config import XttsConfig
41
-
42
- config_class = XttsConfig
43
- paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"]
44
- for path in paths:
45
- try:
46
- config_class = find_module(path, config_name)
47
- except ModuleNotFoundError:
48
- pass
49
- if config_class is None:
50
- raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
51
- return config_class
52
-
53
-
54
- def _process_model_name(config_dict: Dict) -> str:
55
- """Format the model name as expected. It is a band-aid for the old `vocoder` model names.
56
-
57
- Args:
58
- config_dict (Dict): A dictionary including the config fields.
59
-
60
- Returns:
61
- str: Formatted model name.
62
- """
63
- model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"]
64
- model_name = model_name.replace("_generator", "").replace("_discriminator", "")
65
- return model_name
66
-
67
-
68
- def load_config(config_path: str) -> Coqpit:
69
- """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name
70
- to find the corresponding Config class. Then initialize the Config.
71
-
72
- Args:
73
- config_path (str): path to the config file.
74
-
75
- Raises:
76
- TypeError: given config file has an unknown type.
77
-
78
- Returns:
79
- Coqpit: TTS config object.
80
- """
81
- config_dict = {}
82
- ext = os.path.splitext(config_path)[1]
83
- if ext in (".yml", ".yaml"):
84
- with fsspec.open(config_path, "r", encoding="utf-8") as f:
85
- data = yaml.safe_load(f)
86
- elif ext == ".json":
87
- try:
88
- with fsspec.open(config_path, "r", encoding="utf-8") as f:
89
- data = json.load(f)
90
- except json.decoder.JSONDecodeError:
91
- # backwards compat.
92
- data = read_json_with_comments(config_path)
93
- else:
94
- raise TypeError(f" [!] Unknown config file type {ext}")
95
- config_dict.update(data)
96
- model_name = _process_model_name(config_dict)
97
- config_class = register_config(model_name.lower())
98
- config = config_class()
99
- config.from_dict(config_dict)
100
- return config
101
-
102
-
103
- def check_config_and_model_args(config, arg_name, value):
104
- """Check the give argument in `config.model_args` if exist or in `config` for
105
- the given value.
106
-
107
- Return False if the argument does not exist in `config.model_args` or `config`.
108
- This is to patch up the compatibility between models with and without `model_args`.
109
-
110
- TODO: Remove this in the future with a unified approach.
111
- """
112
- if hasattr(config, "model_args"):
113
- if arg_name in config.model_args:
114
- return config.model_args[arg_name] == value
115
- if hasattr(config, arg_name):
116
- return config[arg_name] == value
117
- return False
118
-
119
-
120
- def get_from_config_or_model_args(config, arg_name):
121
- """Get the given argument from `config.model_args` if exist or in `config`."""
122
- if hasattr(config, "model_args"):
123
- if arg_name in config.model_args:
124
- return config.model_args[arg_name]
125
- return config[arg_name]
126
-
127
-
128
- def get_from_config_or_model_args_with_default(config, arg_name, def_val):
129
- """Get the given argument from `config.model_args` if exist or in `config`."""
130
- if hasattr(config, "model_args"):
131
- if arg_name in config.model_args:
132
- return config.model_args[arg_name]
133
- if hasattr(config, arg_name):
134
- return config[arg_name]
135
- return def_val
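
As a usage sketch of the helpers above (assuming the package is installed; the config path is hypothetical), `load_config` resolves the right config class from the `model` field, and `get_from_config_or_model_args_with_default` hides whether a field lives in `config` or `config.model_args`:

    from TTS.config import load_config, get_from_config_or_model_args_with_default

    # hypothetical path; any Coqui TTS config.json or .yaml works here
    config = load_config("run/training/config.json")

    # read a field regardless of whether it sits in `config` or `config.model_args`
    num_speakers = get_from_config_or_model_args_with_default(config, "num_speakers", 1)
    print(type(config).__name__, num_speakers)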
 
TTS/config/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (4.26 kB)
 
TTS/config/__pycache__/shared_configs.cpython-310.pyc DELETED
Binary file (9.53 kB)
 
TTS/config/shared_configs.py DELETED
@@ -1,268 +0,0 @@
1
- from dataclasses import asdict, dataclass
2
- from typing import List
3
-
4
- from coqpit import Coqpit, check_argument
5
- from trainer import TrainerConfig
6
-
7
-
8
- @dataclass
9
- class BaseAudioConfig(Coqpit):
10
- """Base config to definge audio processing parameters. It is used to initialize
11
- ```TTS.utils.audio.AudioProcessor.```
12
-
13
- Args:
14
- fft_size (int):
15
- Number of STFT frequency levels, i.e. the size of the linear spectrogram frame. Defaults to 1024.
16
-
17
- win_length (int):
18
- Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
19
- ```fft_size```. Defaults to 1024.
20
-
21
- hop_length (int):
22
- Number of audio samples between adjacent STFT columns. Defaults to 256.
23
-
24
- frame_shift_ms (int):
25
- Set ```hop_length``` based on milliseconds and sampling rate.
26
-
27
- frame_length_ms (int):
28
- Set ```win_length``` based on milliseconds and sampling rate.
29
-
30
- stft_pad_mode (str):
31
- Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
32
-
33
- sample_rate (int):
34
- Audio sampling rate. Defaults to 22050.
35
-
36
- resample (bool):
37
- Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.
38
-
39
- preemphasis (float):
40
- Preemphasis coefficient. Defaults to 0.0.
41
-
42
- ref_level_db (int): 20
43
- Reference dB level to rebase the audio signal and ignore levels below it. 20 dB is assumed to be the sound of air.
44
- Defaults to 20.
45
-
46
- do_sound_norm (bool):
47
- Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.
48
-
49
- log_func (str):
50
- Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.
51
-
52
- do_trim_silence (bool):
53
- Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.
54
-
55
- do_amp_to_db_linear (bool, optional):
56
- enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
57
-
58
- do_amp_to_db_mel (bool, optional):
59
- enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
60
-
61
- pitch_fmax (float, optional):
62
- Maximum frequency of the F0 frames. Defaults to ```640```.
63
-
64
- pitch_fmin (float, optional):
65
- Minimum frequency of the F0 frames. Defaults to ```1```.
66
-
67
- trim_db (int):
68
- Silence threshold used for silence trimming. Defaults to 45.
69
-
70
- do_rms_norm (bool, optional):
71
- enable/disable RMS volume normalization when loading an audio file. Defaults to False.
72
-
73
- db_level (int, optional):
74
- dB level used for rms normalization. The range is -99 to 0. Defaults to None.
75
-
76
- power (float):
77
- Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce the
78
- artifacts in the synthesized voice. Defaults to 1.5.
79
-
80
- griffin_lim_iters (int):
81
- Number of Griffin-Lim iterations. Defaults to 60.
82
-
83
- num_mels (int):
84
- Number of mel-basis frames that defines the frame lengths of each mel-spectrogram frame. Defaults to 80.
85
-
86
- mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
87
- It needs to be adjusted for a dataset. Defaults to 0.
88
-
89
- mel_fmax (float):
90
- Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
91
-
92
- spec_gain (int):
93
- Gain applied when converting amplitude to DB. Defaults to 20.
94
-
95
- signal_norm (bool):
96
- enable/disable signal normalization. Defaults to True.
97
-
98
- min_level_db (int):
99
- minimum db threshold for the computed melspectrograms. Defaults to -100.
100
-
101
- symmetric_norm (bool):
102
- enable/disable symmetric normalization. If set to True, normalization is performed in the range [-k, k], else
103
- [0, k]. Defaults to True.
104
-
105
- max_norm (float):
106
- ```k``` defining the normalization range. Defaults to 4.0.
107
-
108
- clip_norm (bool):
109
- enable/disable clipping of out-of-range values in the normalized audio signal. Defaults to True.
110
-
111
- stats_path (str):
112
- Path to the computed stats file. Defaults to None.
113
- """
114
-
115
- # stft parameters
116
- fft_size: int = 1024
117
- win_length: int = 1024
118
- hop_length: int = 256
119
- frame_shift_ms: int = None
120
- frame_length_ms: int = None
121
- stft_pad_mode: str = "reflect"
122
- # audio processing parameters
123
- sample_rate: int = 22050
124
- resample: bool = False
125
- preemphasis: float = 0.0
126
- ref_level_db: int = 20
127
- do_sound_norm: bool = False
128
- log_func: str = "np.log10"
129
- # silence trimming
130
- do_trim_silence: bool = True
131
- trim_db: int = 45
132
- # rms volume normalization
133
- do_rms_norm: bool = False
134
- db_level: float = None
135
- # griffin-lim params
136
- power: float = 1.5
137
- griffin_lim_iters: int = 60
138
- # mel-spec params
139
- num_mels: int = 80
140
- mel_fmin: float = 0.0
141
- mel_fmax: float = None
142
- spec_gain: int = 20
143
- do_amp_to_db_linear: bool = True
144
- do_amp_to_db_mel: bool = True
145
- # f0 params
146
- pitch_fmax: float = 640.0
147
- pitch_fmin: float = 1.0
148
- # normalization params
149
- signal_norm: bool = True
150
- min_level_db: int = -100
151
- symmetric_norm: bool = True
152
- max_norm: float = 4.0
153
- clip_norm: bool = True
154
- stats_path: str = None
155
-
156
- def check_values(
157
- self,
158
- ):
159
- """Check config fields"""
160
- c = asdict(self)
161
- check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
162
- check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
163
- check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
164
- check_argument(
165
- "frame_length_ms",
166
- c,
167
- restricted=True,
168
- min_val=10,
169
- max_val=1000,
170
- alternative="win_length",
171
- )
172
- check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
173
- check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
174
- check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
175
- check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
176
- check_argument("power", c, restricted=True, min_val=1, max_val=5)
177
- check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)
178
-
179
- # normalization parameters
180
- check_argument("signal_norm", c, restricted=True)
181
- check_argument("symmetric_norm", c, restricted=True)
182
- check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
183
- check_argument("clip_norm", c, restricted=True)
184
- check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
185
- check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
186
- check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
187
- check_argument("do_trim_silence", c, restricted=True)
188
- check_argument("trim_db", c, restricted=True)
189
-
190
-
191
- @dataclass
192
- class BaseDatasetConfig(Coqpit):
193
- """Base config for TTS datasets.
194
-
195
- Args:
196
- formatter (str):
197
- Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.
198
-
199
- dataset_name (str):
200
- Unique name for the dataset. Defaults to `""`.
201
-
202
- path (str):
203
- Root path to the dataset files. Defaults to `""`.
204
-
205
- meta_file_train (str):
206
- Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
207
- Defaults to `""`.
208
-
209
- ignored_speakers (List):
210
- List of speakers IDs that are not used at the training. Default None.
211
-
212
- language (str):
213
- Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.
214
-
215
- phonemizer (str):
216
- Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`.
217
-
218
- meta_file_val (str):
219
- Name of the dataset meta file that defines the instances used at validation.
220
-
221
- meta_file_attn_mask (str):
222
- Path to the file that lists the attention mask files used with models that require attention masks to
223
- train the duration predictor.
224
- """
225
-
226
- formatter: str = ""
227
- dataset_name: str = ""
228
- path: str = ""
229
- meta_file_train: str = ""
230
- ignored_speakers: List[str] = None
231
- language: str = ""
232
- phonemizer: str = ""
233
- meta_file_val: str = ""
234
- meta_file_attn_mask: str = ""
235
-
236
- def check_values(
237
- self,
238
- ):
239
- """Check config fields"""
240
- c = asdict(self)
241
- check_argument("formatter", c, restricted=True)
242
- check_argument("path", c, restricted=True)
243
- check_argument("meta_file_train", c, restricted=True)
244
- check_argument("meta_file_val", c, restricted=False)
245
- check_argument("meta_file_attn_mask", c, restricted=False)
246
-
247
-
248
- @dataclass
249
- class BaseTrainingConfig(TrainerConfig):
250
- """Base config to define the basic 🐸TTS training parameters that are shared
251
- among all the models. It is based on ```Trainer.TrainingConfig```.
252
-
253
- Args:
254
- model (str):
255
- Name of the model that is used in the training.
256
-
257
- num_loader_workers (int):
258
- Number of workers for training time dataloader.
259
-
260
- num_eval_loader_workers (int):
261
- Number of workers for evaluation time dataloader.
262
- """
263
-
264
- model: str = None
265
- # dataloading
266
- num_loader_workers: int = 0
267
- num_eval_loader_workers: int = 0
268
- use_noise_augment: bool = False
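
A minimal sketch of how these dataclasses are typically instantiated and validated (the field values and paths below are illustrative, not project defaults):

    from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig

    audio_config = BaseAudioConfig(sample_rate=22050, num_mels=80, hop_length=256)
    audio_config.check_values()  # raises if a field falls outside the ranges checked above

    dataset_config = BaseDatasetConfig(
        formatter="ljspeech",            # illustrative formatter name
        path="datasets/LJSpeech-1.1/",   # hypothetical dataset root
        meta_file_train="metadata.csv",
    )
    dataset_config.check_values()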
 
TTS/demos/xtts_ft_demo/requirements.txt DELETED
@@ -1,2 +0,0 @@
1
- faster_whisper==0.9.0
2
- gradio==4.7.1
 
 
 
TTS/demos/xtts_ft_demo/utils/formatter.py DELETED
@@ -1,160 +0,0 @@
1
- import os
2
- import gc
3
- import torchaudio
4
- import pandas
5
- from faster_whisper import WhisperModel
6
- from glob import glob
7
-
8
- from tqdm import tqdm
9
-
10
- import torch
11
- import torchaudio
12
- # torch.set_num_threads(1)
13
-
14
- from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners
15
-
16
- torch.set_num_threads(16)
17
-
18
-
19
- import os
20
-
21
- audio_types = (".wav", ".mp3", ".flac")
22
-
23
-
24
- def list_audios(basePath, contains=None):
25
- # return the set of files that are valid
26
- return list_files(basePath, validExts=audio_types, contains=contains)
27
-
28
- def list_files(basePath, validExts=None, contains=None):
29
- # loop over the directory structure
30
- for (rootDir, dirNames, filenames) in os.walk(basePath):
31
- # loop over the filenames in the current directory
32
- for filename in filenames:
33
- # if the contains string is not none and the filename does not contain
34
- # the supplied string, then ignore the file
35
- if contains is not None and filename.find(contains) == -1:
36
- continue
37
-
38
- # determine the file extension of the current file
39
- ext = filename[filename.rfind("."):].lower()
40
-
41
- # check to see if the file is an audio file and should be processed
42
- if validExts is None or ext.endswith(validExts):
43
- # construct the path to the audio and yield it
44
- audioPath = os.path.join(rootDir, filename)
45
- yield audioPath
46
-
47
- def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):
48
- audio_total_size = 0
49
- # make sure that the output directory exists
50
- os.makedirs(out_path, exist_ok=True)
51
-
52
- # Loading Whisper
53
- device = "cuda" if torch.cuda.is_available() else "cpu"
54
-
55
- print("Loading Whisper Model!")
56
- asr_model = WhisperModel("large-v2", device=device, compute_type="float16")
57
-
58
- metadata = {"audio_file": [], "text": [], "speaker_name": []}
59
-
60
- if gradio_progress is not None:
61
- tqdm_object = gradio_progress.tqdm(audio_files, desc="Formatting...")
62
- else:
63
- tqdm_object = tqdm(audio_files)
64
-
65
- for audio_path in tqdm_object:
66
- wav, sr = torchaudio.load(audio_path)
67
- # stereo to mono if needed
68
- if wav.size(0) != 1:
69
- wav = torch.mean(wav, dim=0, keepdim=True)
70
-
71
- wav = wav.squeeze()
72
- audio_total_size += (wav.size(-1) / sr)
73
-
74
- segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
75
- segments = list(segments)
76
- i = 0
77
- sentence = ""
78
- sentence_start = None
79
- first_word = True
80
- # add all segments' words to a single list
81
- words_list = []
82
- for _, segment in enumerate(segments):
83
- words = list(segment.words)
84
- words_list.extend(words)
85
-
86
- # process each word
87
- for word_idx, word in enumerate(words_list):
88
- if first_word:
89
- sentence_start = word.start
90
- # If it is the first sentence, add a buffer or use the beginning of the file
91
- if word_idx == 0:
92
- sentence_start = max(sentence_start - buffer, 0) # Add buffer to the sentence start
93
- else:
94
- # get previous sentence end
95
- previous_word_end = words_list[word_idx - 1].end
96
- # add a buffer or use the middle of the silence between the previous sentence and the current one
97
- sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start)/2)
98
-
99
- sentence = word.word
100
- first_word = False
101
- else:
102
- sentence += word.word
103
-
104
- if word.word[-1] in ["!", ".", "?"]:
105
- sentence = sentence[1:]
106
- # Expand number and abbreviations plus normalization
107
- sentence = multilingual_cleaners(sentence, target_language)
108
- audio_file_name, _ = os.path.splitext(os.path.basename(audio_path))
109
-
110
- audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"
111
-
112
- # Check for the next word's existence
113
- if word_idx + 1 < len(words_list):
114
- next_word_start = words_list[word_idx + 1].start
115
- else:
116
- # If there are no more words, it is the last sentence, so use the audio length as the next word start
117
- next_word_start = (wav.shape[0] - 1) / sr
118
-
119
- # Average the current word end and next word start
120
- word_end = min((word.end + next_word_start) / 2, word.end + buffer)
121
-
122
- absoulte_path = os.path.join(out_path, audio_file)
123
- os.makedirs(os.path.dirname(absoulte_path), exist_ok=True)
124
- i += 1
125
- first_word = True
126
-
127
- audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0)
128
- # if the audio is too short, ignore it (i.e. < 0.33 seconds)
129
- if audio.size(-1) >= sr/3:
130
- torchaudio.save(absoulte_path,
131
- audio,
132
- sr
133
- )
134
- else:
135
- continue
136
-
137
- metadata["audio_file"].append(audio_file)
138
- metadata["text"].append(sentence)
139
- metadata["speaker_name"].append(speaker_name)
140
-
141
- df = pandas.DataFrame(metadata)
142
- df = df.sample(frac=1)
143
- num_val_samples = int(len(df)*eval_percentage)
144
-
145
- df_eval = df[:num_val_samples]
146
- df_train = df[num_val_samples:]
147
-
148
- df_train = df_train.sort_values('audio_file')
149
- train_metadata_path = os.path.join(out_path, "metadata_train.csv")
150
- df_train.to_csv(train_metadata_path, sep="|", index=False)
151
-
152
- eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
153
- df_eval = df_eval.sort_values('audio_file')
154
- df_eval.to_csv(eval_metadata_path, sep="|", index=False)
155
-
156
- # deallocate VRAM and RAM
157
- del asr_model, df_train, df_eval, df, metadata
158
- gc.collect()
159
-
160
- return train_metadata_path, eval_metadata_path, audio_total_size
 
TTS/demos/xtts_ft_demo/utils/gpt_train.py DELETED
@@ -1,172 +0,0 @@
1
- import os
2
- import gc
3
-
4
- from trainer import Trainer, TrainerArgs
5
-
6
- from TTS.config.shared_configs import BaseDatasetConfig
7
- from TTS.tts.datasets import load_tts_samples
8
- from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
9
- from TTS.utils.manage import ModelManager
10
-
11
-
12
- def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path, max_audio_length=255995):
13
- # Logging parameters
14
- RUN_NAME = "GPT_XTTS_FT"
15
- PROJECT_NAME = "XTTS_trainer"
16
- DASHBOARD_LOGGER = "tensorboard"
17
- LOGGER_URI = None
18
-
19
- # Set here the path where the checkpoints will be saved. Default: ./run/training/
20
- OUT_PATH = os.path.join(output_path, "run", "training")
21
-
22
- # Training Parameters
23
- OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # set this to False for multi-GPU training
24
- START_WITH_EVAL = False # if True it will start with evaluation
25
- BATCH_SIZE = batch_size # set here the batch size
26
- GRAD_ACUMM_STEPS = grad_acumm # set here the grad accumulation steps
27
-
28
-
29
- # Define here the dataset that you want to use for fine-tuning.
30
- config_dataset = BaseDatasetConfig(
31
- formatter="coqui",
32
- dataset_name="ft_dataset",
33
- path=os.path.dirname(train_csv),
34
- meta_file_train=train_csv,
35
- meta_file_val=eval_csv,
36
- language=language,
37
- )
38
-
39
- # Add here the configs of the datasets
40
- DATASETS_CONFIG_LIST = [config_dataset]
41
-
42
- # Define the path where XTTS v2.0.1 files will be downloaded
43
- CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
44
- os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
45
-
46
-
47
- # DVAE files
48
- DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
49
- MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
50
-
51
- # Set the path to the downloaded files
52
- DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
53
- MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
54
-
55
- # download DVAE files if needed
56
- if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
57
- print(" > Downloading DVAE files!")
58
- ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
59
-
60
-
61
- # Download XTTS v2.0 checkpoint if needed
62
- TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
63
- XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
64
- XTTS_CONFIG_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json"
65
-
66
- # XTTS transfer learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
67
- TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK)) # vocab.json file
68
- XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK)) # model.pth file
69
- XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK)) # config.json file
70
-
71
- # download XTTS v2.0 files if needed
72
- if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
73
- print(" > Downloading XTTS v2.0 files!")
74
- ModelManager._download_model_files(
75
- [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK, XTTS_CONFIG_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
76
- )
77
-
78
- # init args and config
79
- model_args = GPTArgs(
80
- max_conditioning_length=132300, # 6 secs
81
- min_conditioning_length=66150, # 3 secs
82
- debug_loading_failures=False,
83
- max_wav_length=max_audio_length, # ~11.6 seconds
84
- max_text_length=200,
85
- mel_norm_file=MEL_NORM_FILE,
86
- dvae_checkpoint=DVAE_CHECKPOINT,
87
- xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune
88
- tokenizer_file=TOKENIZER_FILE,
89
- gpt_num_audio_tokens=1026,
90
- gpt_start_audio_token=1024,
91
- gpt_stop_audio_token=1025,
92
- gpt_use_masking_gt_prompt_approach=True,
93
- gpt_use_perceiver_resampler=True,
94
- )
95
- # define audio config
96
- audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
97
- # training parameters config
98
- config = GPTTrainerConfig(
99
- epochs=num_epochs,
100
- output_path=OUT_PATH,
101
- model_args=model_args,
102
- run_name=RUN_NAME,
103
- project_name=PROJECT_NAME,
104
- run_description="""
105
- GPT XTTS training
106
- """,
107
- dashboard_logger=DASHBOARD_LOGGER,
108
- logger_uri=LOGGER_URI,
109
- audio=audio_config,
110
- batch_size=BATCH_SIZE,
111
- batch_group_size=48,
112
- eval_batch_size=BATCH_SIZE,
113
- num_loader_workers=8,
114
- eval_split_max_size=256,
115
- print_step=50,
116
- plot_step=100,
117
- log_model_step=100,
118
- save_step=1000,
119
- save_n_checkpoints=1,
120
- save_checkpoints=True,
121
- # target_loss="loss",
122
- print_eval=False,
123
- # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
124
- optimizer="AdamW",
125
- optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
126
- optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
127
- lr=5e-06, # learning rate
128
- lr_scheduler="MultiStepLR",
129
- # it was adjusted accordingly for the new step scheme
130
- lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
131
- test_sentences=[],
132
- )
133
-
134
- # init the model from config
135
- model = GPTTrainer.init_from_config(config)
136
-
137
- # load training samples
138
- train_samples, eval_samples = load_tts_samples(
139
- DATASETS_CONFIG_LIST,
140
- eval_split=True,
141
- eval_split_max_size=config.eval_split_max_size,
142
- eval_split_size=config.eval_split_size,
143
- )
144
-
145
- # init the trainer and 🚀
146
- trainer = Trainer(
147
- TrainerArgs(
148
- restore_path=None, # the xtts checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it using the Trainer restore_path parameter
149
- skip_train_epoch=False,
150
- start_with_eval=START_WITH_EVAL,
151
- grad_accum_steps=GRAD_ACUMM_STEPS,
152
- ),
153
- config,
154
- output_path=OUT_PATH,
155
- model=model,
156
- train_samples=train_samples,
157
- eval_samples=eval_samples,
158
- )
159
- trainer.fit()
160
-
161
- # get the longest text audio file to use as speaker reference
162
- samples_len = [len(item["text"].split(" ")) for item in train_samples]
163
- longest_text_idx = samples_len.index(max(samples_len))
164
- speaker_ref = train_samples[longest_text_idx]["audio_file"]
165
-
166
- trainer_out_path = trainer.output_path
167
-
168
- # deallocate VRAM and RAM
169
- del model, trainer, train_samples, eval_samples
170
- gc.collect()
171
-
172
- return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref
 
TTS/demos/xtts_ft_demo/xtts_demo.py DELETED
@@ -1,415 +0,0 @@
1
- import argparse
2
- import os
3
- import sys
4
- import tempfile
5
-
6
- import gradio as gr
7
- import librosa.display
8
- import numpy as np
9
-
10
- import os
11
- import torch
12
- import torchaudio
13
- import traceback
14
- from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
15
- from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
16
-
17
- from TTS.tts.configs.xtts_config import XttsConfig
18
- from TTS.tts.models.xtts import Xtts
19
-
20
-
21
- def clear_gpu_cache():
22
- # clear the GPU cache
23
- if torch.cuda.is_available():
24
- torch.cuda.empty_cache()
25
-
26
- XTTS_MODEL = None
27
- def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
28
- global XTTS_MODEL
29
- clear_gpu_cache()
30
- if not xtts_checkpoint or not xtts_config or not xtts_vocab:
31
- return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"
32
- config = XttsConfig()
33
- config.load_json(xtts_config)
34
- XTTS_MODEL = Xtts.init_from_config(config)
35
- print("Loading XTTS model! ")
36
- XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
37
- if torch.cuda.is_available():
38
- XTTS_MODEL.cuda()
39
-
40
- print("Model Loaded!")
41
- return "Model Loaded!"
42
-
43
- def run_tts(lang, tts_text, speaker_audio_file):
44
- if XTTS_MODEL is None or not speaker_audio_file:
45
- return "You need to run the previous step to load the model !!", None, None
46
-
47
- gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(audio_path=speaker_audio_file, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_length=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs)
48
- out = XTTS_MODEL.inference(
49
- text=tts_text,
50
- language=lang,
51
- gpt_cond_latent=gpt_cond_latent,
52
- speaker_embedding=speaker_embedding,
53
- temperature=XTTS_MODEL.config.temperature, # Add custom parameters here
54
- length_penalty=XTTS_MODEL.config.length_penalty,
55
- repetition_penalty=XTTS_MODEL.config.repetition_penalty,
56
- top_k=XTTS_MODEL.config.top_k,
57
- top_p=XTTS_MODEL.config.top_p,
58
- )
59
-
60
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
61
- out["wav"] = torch.tensor(out["wav"]).unsqueeze(0)
62
- out_path = fp.name
63
- torchaudio.save(out_path, out["wav"], 24000)
64
-
65
- return "Speech generated !", out_path, speaker_audio_file
66
-
67
-
68
-
69
-
70
- # define a logger to redirect
71
- class Logger:
72
- def __init__(self, filename="log.out"):
73
- self.log_file = filename
74
- self.terminal = sys.stdout
75
- self.log = open(self.log_file, "w")
76
-
77
- def write(self, message):
78
- self.terminal.write(message)
79
- self.log.write(message)
80
-
81
- def flush(self):
82
- self.terminal.flush()
83
- self.log.flush()
84
-
85
- def isatty(self):
86
- return False
87
-
88
- # redirect stdout and stderr to a file
89
- sys.stdout = Logger()
90
- sys.stderr = sys.stdout
91
-
92
-
93
- # logging.basicConfig(stream=sys.stdout, level=logging.INFO)
94
- import logging
95
- logging.basicConfig(
96
- level=logging.INFO,
97
- format="%(asctime)s [%(levelname)s] %(message)s",
98
- handlers=[
99
- logging.StreamHandler(sys.stdout)
100
- ]
101
- )
102
-
103
- def read_logs():
104
- sys.stdout.flush()
105
- with open(sys.stdout.log_file, "r") as f:
106
- return f.read()
107
-
108
-
109
- if __name__ == "__main__":
110
-
111
- parser = argparse.ArgumentParser(
112
- description="""XTTS fine-tuning demo\n\n"""
113
- """
114
- Example runs:
115
- python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port
116
- """,
117
- formatter_class=argparse.RawTextHelpFormatter,
118
- )
119
- parser.add_argument(
120
- "--port",
121
- type=int,
122
- help="Port to run the gradio demo. Default: 5003",
123
- default=5003,
124
- )
125
- parser.add_argument(
126
- "--out_path",
127
- type=str,
128
- help="Output path (where data and checkpoints will be saved). Default: /tmp/xtts_ft/",
129
- default="/tmp/xtts_ft/",
130
- )
131
-
132
- parser.add_argument(
133
- "--num_epochs",
134
- type=int,
135
- help="Number of epochs to train. Default: 10",
136
- default=10,
137
- )
138
- parser.add_argument(
139
- "--batch_size",
140
- type=int,
141
- help="Batch size. Default: 4",
142
- default=4,
143
- )
144
- parser.add_argument(
145
- "--grad_acumm",
146
- type=int,
147
- help="Grad accumulation steps. Default: 1",
148
- default=1,
149
- )
150
- parser.add_argument(
151
- "--max_audio_length",
152
- type=int,
153
- help="Max permitted audio size in seconds. Default: 11",
154
- default=11,
155
- )
156
-
157
- args = parser.parse_args()
158
-
159
- with gr.Blocks() as demo:
160
- with gr.Tab("1 - Data processing"):
161
- out_path = gr.Textbox(
162
- label="Output path (where data and checkpoints will be saved):",
163
- value=args.out_path,
164
- )
165
- # upload_file = gr.Audio(
166
- # sources="upload",
167
- # label="Select the audio files that you want to use for XTTS training!",
168
- # type="filepath",
169
- # )
170
- upload_file = gr.File(
171
- file_count="multiple",
172
- label="Select the audio files that you want to use for XTTS training (supported formats: wav, mp3, and flac)",
173
- )
174
- lang = gr.Dropdown(
175
- label="Dataset Language",
176
- value="en",
177
- choices=[
178
- "en",
179
- "es",
180
- "fr",
181
- "de",
182
- "it",
183
- "pt",
184
- "pl",
185
- "tr",
186
- "ru",
187
- "nl",
188
- "cs",
189
- "ar",
190
- "zh",
191
- "hu",
192
- "ko",
193
- "ja"
194
- ],
195
- )
196
- progress_data = gr.Label(
197
- label="Progress:"
198
- )
199
- logs = gr.Textbox(
200
- label="Logs:",
201
- interactive=False,
202
- )
203
- demo.load(read_logs, None, logs, every=1)
204
-
205
- prompt_compute_btn = gr.Button(value="Step 1 - Create dataset")
206
-
207
- def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
208
- clear_gpu_cache()
209
- out_path = os.path.join(out_path, "dataset")
210
- os.makedirs(out_path, exist_ok=True)
211
- if audio_path is None:
212
- return "You should provide one or more audio files! If you did, the file upload has probably not finished yet!", "", ""
213
- else:
214
- try:
215
- train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
216
- except:
217
- traceback.print_exc()
218
- error = traceback.format_exc()
219
- return f"The data processing was interrupted due to an error!! Please check the console for the full error message! \n Error summary: {error}", "", ""
220
-
221
- clear_gpu_cache()
222
-
223
- # if audio total len is less than 2 minutes raise an error
224
- if audio_total_size < 120:
225
- message = "The sum of the duration of the audios that you provided should be at least 2 minutes!"
226
- print(message)
227
- return message, "", ""
228
-
229
- print("Dataset Processed!")
230
- return "Dataset Processed!", train_meta, eval_meta
231
-
232
- with gr.Tab("2 - Fine-tuning XTTS Encoder"):
233
- train_csv = gr.Textbox(
234
- label="Train CSV:",
235
- )
236
- eval_csv = gr.Textbox(
237
- label="Eval CSV:",
238
- )
239
- num_epochs = gr.Slider(
240
- label="Number of epochs:",
241
- minimum=1,
242
- maximum=100,
243
- step=1,
244
- value=args.num_epochs,
245
- )
246
- batch_size = gr.Slider(
247
- label="Batch size:",
248
- minimum=2,
249
- maximum=512,
250
- step=1,
251
- value=args.batch_size,
252
- )
253
- grad_acumm = gr.Slider(
254
- label="Grad accumulation steps:",
255
- minimum=2,
256
- maximum=128,
257
- step=1,
258
- value=args.grad_acumm,
259
- )
260
- max_audio_length = gr.Slider(
261
- label="Max permitted audio size in seconds:",
262
- minimum=2,
263
- maximum=20,
264
- step=1,
265
- value=args.max_audio_length,
266
- )
267
- progress_train = gr.Label(
268
- label="Progress:"
269
- )
270
- logs_tts_train = gr.Textbox(
271
- label="Logs:",
272
- interactive=False,
273
- )
274
- demo.load(read_logs, None, logs_tts_train, every=1)
275
- train_btn = gr.Button(value="Step 2 - Run the training")
276
-
277
- def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length):
278
- clear_gpu_cache()
279
- if not train_csv or not eval_csv:
280
- return "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", "", "", "", ""
281
- try:
282
- # convert seconds to waveform frames
283
- max_audio_length = int(max_audio_length * 22050)
284
- config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path, max_audio_length=max_audio_length)
285
- except:
286
- traceback.print_exc()
287
- error = traceback.format_exc()
288
- return f"The training was interrupted due to an error!! Please check the console for the full error message! \n Error summary: {error}", "", "", "", ""
289
-
290
- # copy the original files to avoid issues caused by parameter changes
291
- os.system(f"cp {config_path} {exp_path}")
292
- os.system(f"cp {vocab_file} {exp_path}")
293
-
294
- ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
295
- print("Model training done!")
296
- clear_gpu_cache()
297
- return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
298
-
299
- with gr.Tab("3 - Inference"):
300
- with gr.Row():
301
- with gr.Column() as col1:
302
- xtts_checkpoint = gr.Textbox(
303
- label="XTTS checkpoint path:",
304
- value="",
305
- )
306
- xtts_config = gr.Textbox(
307
- label="XTTS config path:",
308
- value="",
309
- )
310
-
311
- xtts_vocab = gr.Textbox(
312
- label="XTTS vocab path:",
313
- value="",
314
- )
315
- progress_load = gr.Label(
316
- label="Progress:"
317
- )
318
- load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model")
319
-
320
- with gr.Column() as col2:
321
- speaker_reference_audio = gr.Textbox(
322
- label="Speaker reference audio:",
323
- value="",
324
- )
325
- tts_language = gr.Dropdown(
326
- label="Language",
327
- value="en",
328
- choices=[
329
- "en",
330
- "es",
331
- "fr",
332
- "de",
333
- "it",
334
- "pt",
335
- "pl",
336
- "tr",
337
- "ru",
338
- "nl",
339
- "cs",
340
- "ar",
341
- "zh",
342
- "hu",
343
- "ko",
344
- "ja",
345
- ]
346
- )
347
- tts_text = gr.Textbox(
348
- label="Input Text.",
349
- value="This model sounds really good and above all, it's reasonably fast.",
350
- )
351
- tts_btn = gr.Button(value="Step 4 - Inference")
352
-
353
- with gr.Column() as col3:
354
- progress_gen = gr.Label(
355
- label="Progress:"
356
- )
357
- tts_output_audio = gr.Audio(label="Generated Audio.")
358
- reference_audio = gr.Audio(label="Reference audio used.")
359
-
360
- prompt_compute_btn.click(
361
- fn=preprocess_dataset,
362
- inputs=[
363
- upload_file,
364
- lang,
365
- out_path,
366
- ],
367
- outputs=[
368
- progress_data,
369
- train_csv,
370
- eval_csv,
371
- ],
372
- )
373
-
374
-
375
- train_btn.click(
376
- fn=train_model,
377
- inputs=[
378
- lang,
379
- train_csv,
380
- eval_csv,
381
- num_epochs,
382
- batch_size,
383
- grad_acumm,
384
- out_path,
385
- max_audio_length,
386
- ],
387
- outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
388
- )
389
-
390
- load_btn.click(
391
- fn=load_model,
392
- inputs=[
393
- xtts_checkpoint,
394
- xtts_config,
395
- xtts_vocab
396
- ],
397
- outputs=[progress_load],
398
- )
399
-
400
- tts_btn.click(
401
- fn=run_tts,
402
- inputs=[
403
- tts_language,
404
- tts_text,
405
- speaker_reference_audio,
406
- ],
407
- outputs=[progress_gen, tts_output_audio, reference_audio],
408
- )
409
-
410
- demo.launch(
411
- share=True,
412
- debug=False,
413
- server_port=args.port,
414
- server_name="0.0.0.0"
415
- )
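The same inference path wired up by `load_model` and `run_tts` above can be exercised without the Gradio UI; this is a minimal sketch assuming hypothetical checkpoint, config, vocab, and reference-audio paths:

```python
import torch
import torchaudio

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Hypothetical paths produced by the fine-tuning step.
config = XttsConfig()
config.load_json("/tmp/xtts_ft/run/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path="/tmp/xtts_ft/run/best_model.pth",
    vocab_path="/tmp/xtts_ft/run/vocab.json",
    use_deepspeed=False,
)
if torch.cuda.is_available():
    model.cuda()

# Condition on a reference speaker and synthesize, as run_tts() does.
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
    audio_path="speaker_reference.wav",          # placeholder reference clip
    gpt_cond_len=config.gpt_cond_len,
    max_ref_length=config.max_ref_len,
    sound_norm_refs=config.sound_norm_refs,
)
out = model.inference(
    text="This model sounds really good and above all, it's reasonably fast.",
    language="en",
    gpt_cond_latent=gpt_cond_latent,
    speaker_embedding=speaker_embedding,
    temperature=config.temperature,
)
torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
```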
TTS/encoder/README.md DELETED
@@ -1,18 +0,0 @@
1
- ### Speaker Encoder
2
-
3
- This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
4
-
5
- With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
6
-
7
- Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
8
-
9
- ![](umap.png)
10
-
11
- Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
12
-
13
- To run the code, you need to follow the same flow as in TTS.
14
-
15
- - Define 'config.json' for your needs. Note that the audio parameters should match those of your TTS model.
16
- - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
17
- - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
18
- - Watch training on Tensorboard as in TTS
TTS/encoder/__init__.py DELETED
File without changes
TTS/encoder/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (165 Bytes)
 
TTS/encoder/__pycache__/losses.cpython-310.pyc DELETED
Binary file (7.81 kB)
 
TTS/encoder/configs/base_encoder_config.py DELETED
@@ -1,61 +0,0 @@
1
- from dataclasses import asdict, dataclass, field
2
- from typing import Dict, List
3
-
4
- from coqpit import MISSING
5
-
6
- from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
7
-
8
-
9
- @dataclass
10
- class BaseEncoderConfig(BaseTrainingConfig):
11
- """Defines parameters for a Generic Encoder model."""
12
-
13
- model: str = None
14
- audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
15
- datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
16
- # model params
17
- model_params: Dict = field(
18
- default_factory=lambda: {
19
- "model_name": "lstm",
20
- "input_dim": 80,
21
- "proj_dim": 256,
22
- "lstm_dim": 768,
23
- "num_lstm_layers": 3,
24
- "use_lstm_with_projection": True,
25
- }
26
- )
27
-
28
- audio_augmentation: Dict = field(default_factory=lambda: {})
29
-
30
- # training params
31
- epochs: int = 10000
32
- loss: str = "angleproto"
33
- grad_clip: float = 3.0
34
- lr: float = 0.0001
35
- optimizer: str = "radam"
36
- optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
37
- lr_decay: bool = False
38
- warmup_steps: int = 4000
39
-
40
- # logging params
41
- tb_model_param_stats: bool = False
42
- steps_plot_stats: int = 10
43
- save_step: int = 1000
44
- print_step: int = 20
45
- run_eval: bool = False
46
-
47
- # data loader
48
- num_classes_in_batch: int = MISSING
49
- num_utter_per_class: int = MISSING
50
- eval_num_classes_in_batch: int = None
51
- eval_num_utter_per_class: int = None
52
-
53
- num_loader_workers: int = MISSING
54
- voice_len: float = 1.6
55
-
56
- def check_values(self):
57
- super().check_values()
58
- c = asdict(self)
59
- assert (
60
- c["model_params"]["input_dim"] == self.audio.num_mels
61
- ), " [!] model input dimension must be equal to the melspectrogram dimension."
TTS/encoder/configs/emotion_encoder_config.py DELETED
@@ -1,12 +0,0 @@
1
- from dataclasses import asdict, dataclass
2
-
3
- from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4
-
5
-
6
- @dataclass
7
- class EmotionEncoderConfig(BaseEncoderConfig):
8
- """Defines parameters for Emotion Encoder model."""
9
-
10
- model: str = "emotion_encoder"
11
- map_classid_to_classname: dict = None
12
- class_name_key: str = "emotion_name"
TTS/encoder/configs/speaker_encoder_config.py DELETED
@@ -1,11 +0,0 @@
1
- from dataclasses import asdict, dataclass
2
-
3
- from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4
-
5
-
6
- @dataclass
7
- class SpeakerEncoderConfig(BaseEncoderConfig):
8
- """Defines parameters for Speaker Encoder model."""
9
-
10
- model: str = "speaker_encoder"
11
- class_name_key: str = "speaker_name"
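A minimal sketch of instantiating this config; the batch and loader values are illustrative stand-ins for the fields marked MISSING in `BaseEncoderConfig`:

```python
from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig

# Illustrative values for the required (MISSING) fields.
config = SpeakerEncoderConfig(
    num_classes_in_batch=64,
    num_utter_per_class=10,
    num_loader_workers=4,
)
print(config.model, config.model_params["model_name"], config.loss)
# -> speaker_encoder lstm angleproto
```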
TTS/encoder/dataset.py DELETED
@@ -1,147 +0,0 @@
1
- import random
2
-
3
- import torch
4
- from torch.utils.data import Dataset
5
-
6
- from TTS.encoder.utils.generic_utils import AugmentWAV
7
-
8
-
9
- class EncoderDataset(Dataset):
10
- def __init__(
11
- self,
12
- config,
13
- ap,
14
- meta_data,
15
- voice_len=1.6,
16
- num_classes_in_batch=64,
17
- num_utter_per_class=10,
18
- verbose=False,
19
- augmentation_config=None,
20
- use_torch_spec=None,
21
- ):
22
- """
23
- Args:
24
- ap (TTS.tts.utils.AudioProcessor): audio processor object.
25
- meta_data (list): list of dataset instances.
26
- voice_len (float): voice segment length in seconds.
27
- verbose (bool): print diagnostic information.
28
- """
29
- super().__init__()
30
- self.config = config
31
- self.items = meta_data
32
- self.sample_rate = ap.sample_rate
33
- self.seq_len = int(voice_len * self.sample_rate)
34
- self.num_utter_per_class = num_utter_per_class
35
- self.ap = ap
36
- self.verbose = verbose
37
- self.use_torch_spec = use_torch_spec
38
- self.classes, self.items = self.__parse_items()
39
-
40
- self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
41
-
42
- # Data Augmentation
43
- self.augmentator = None
44
- self.gaussian_augmentation_config = None
45
- if augmentation_config:
46
- self.data_augmentation_p = augmentation_config["p"]
47
- if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
48
- self.augmentator = AugmentWAV(ap, augmentation_config)
49
-
50
- if "gaussian" in augmentation_config.keys():
51
- self.gaussian_augmentation_config = augmentation_config["gaussian"]
52
-
53
- if self.verbose:
54
- print("\n > DataLoader initialization")
55
- print(f" | > Classes per Batch: {num_classes_in_batch}")
56
- print(f" | > Number of instances : {len(self.items)}")
57
- print(f" | > Sequence length: {self.seq_len}")
58
- print(f" | > Num Classes: {len(self.classes)}")
59
- print(f" | > Classes: {self.classes}")
60
-
61
- def load_wav(self, filename):
62
- audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
63
- return audio
64
-
65
- def __parse_items(self):
66
- class_to_utters = {}
67
- for item in self.items:
68
- path_ = item["audio_file"]
69
- class_name = item[self.config.class_name_key]
70
- if class_name in class_to_utters.keys():
71
- class_to_utters[class_name].append(path_)
72
- else:
73
- class_to_utters[class_name] = [
74
- path_,
75
- ]
76
-
77
- # keep only classes with at least self.num_utter_per_class samples
78
- class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}
79
-
80
- classes = list(class_to_utters.keys())
81
- classes.sort()
82
-
83
- new_items = []
84
- for item in self.items:
85
- path_ = item["audio_file"]
86
- class_name = item["emotion_name"] if self.config.model == "emotion_encoder" else item["speaker_name"]
87
- # ignore filtered classes
88
- if class_name not in classes:
89
- continue
90
- # ignore small audios
91
- if self.load_wav(path_).shape[0] - self.seq_len <= 0:
92
- continue
93
-
94
- new_items.append({"wav_file_path": path_, "class_name": class_name})
95
-
96
- return classes, new_items
97
-
98
- def __len__(self):
99
- return len(self.items)
100
-
101
- def get_num_classes(self):
102
- return len(self.classes)
103
-
104
- def get_class_list(self):
105
- return self.classes
106
-
107
- def set_classes(self, classes):
108
- self.classes = classes
109
- self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
110
-
111
- def get_map_classid_to_classname(self):
112
- return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items())
113
-
114
- def __getitem__(self, idx):
115
- return self.items[idx]
116
-
117
- def collate_fn(self, batch):
118
- # get the batch class_ids
119
- labels = []
120
- feats = []
121
- for item in batch:
122
- utter_path = item["wav_file_path"]
123
- class_name = item["class_name"]
124
-
125
- # get classid
126
- class_id = self.classname_to_classid[class_name]
127
- # load wav file
128
- wav = self.load_wav(utter_path)
129
- offset = random.randint(0, wav.shape[0] - self.seq_len)
130
- wav = wav[offset : offset + self.seq_len]
131
-
132
- if self.augmentator is not None and self.data_augmentation_p:
133
- if random.random() < self.data_augmentation_p:
134
- wav = self.augmentator.apply_one(wav)
135
-
136
- if not self.use_torch_spec:
137
- mel = self.ap.melspectrogram(wav)
138
- feats.append(torch.FloatTensor(mel))
139
- else:
140
- feats.append(torch.FloatTensor(wav))
141
-
142
- labels.append(class_id)
143
-
144
- feats = torch.stack(feats)
145
- labels = torch.LongTensor(labels)
146
-
147
- return feats, labels
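A minimal sketch of feeding `EncoderDataset` to a plain PyTorch `DataLoader` with its own `collate_fn`; `config`, `ap`, and `meta_data` are assumed to exist already (an encoder config, an AudioProcessor, and a list of items with audio file and speaker/emotion name keys), and the flat batch size is illustrative since the actual training loop uses a class-balanced sampler:

```python
from torch.utils.data import DataLoader

# `config`, `ap`, and `meta_data` are assumed to be defined elsewhere.
dataset = EncoderDataset(
    config,
    ap,
    meta_data,
    voice_len=1.6,
    num_classes_in_batch=64,
    num_utter_per_class=10,
)
loader = DataLoader(
    dataset,
    batch_size=64 * 10,             # illustrative; training uses a class-balanced sampler
    collate_fn=dataset.collate_fn,  # returns (feats, labels) tensors
    num_workers=config.num_loader_workers,
)
feats, labels = next(iter(loader))  # feats: mel or waveform segments, labels: class ids
```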
TTS/encoder/losses.py DELETED
@@ -1,226 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
-
5
-
6
- # adapted from https://github.com/cvqluu/GE2E-Loss
7
- class GE2ELoss(nn.Module):
8
- def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
9
- """
10
- Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
11
- Accepts an input of size (N, M, D)
12
- where N is the number of speakers in the batch,
13
- M is the number of utterances per speaker,
14
- and D is the dimensionality of the embedding vector (e.g. d-vector)
15
- Args:
16
- - init_w (float): defines the initial value of w in Equation (5) of [1]
17
- - init_b (float): defines the initial value of b in Equation (5) of [1]
18
- """
19
- super().__init__()
20
- # pylint: disable=E1102
21
- self.w = nn.Parameter(torch.tensor(init_w))
22
- # pylint: disable=E1102
23
- self.b = nn.Parameter(torch.tensor(init_b))
24
- self.loss_method = loss_method
25
-
26
- print(" > Initialized Generalized End-to-End loss")
27
-
28
- assert self.loss_method in ["softmax", "contrast"]
29
-
30
- if self.loss_method == "softmax":
31
- self.embed_loss = self.embed_loss_softmax
32
- if self.loss_method == "contrast":
33
- self.embed_loss = self.embed_loss_contrast
34
-
35
- # pylint: disable=R0201
36
- def calc_new_centroids(self, dvecs, centroids, spkr, utt):
37
- """
38
- Calculates the new centroids excluding the reference utterance
39
- """
40
- excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
41
- excl = torch.mean(excl, 0)
42
- new_centroids = []
43
- for i, centroid in enumerate(centroids):
44
- if i == spkr:
45
- new_centroids.append(excl)
46
- else:
47
- new_centroids.append(centroid)
48
- return torch.stack(new_centroids)
49
-
50
- def calc_cosine_sim(self, dvecs, centroids):
51
- """
52
- Make the cosine similarity matrix with dims (N,M,N)
53
- """
54
- cos_sim_matrix = []
55
- for spkr_idx, speaker in enumerate(dvecs):
56
- cs_row = []
57
- for utt_idx, utterance in enumerate(speaker):
58
- new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx)
59
- # vector based cosine similarity for speed
60
- cs_row.append(
61
- torch.clamp(
62
- torch.mm(
63
- utterance.unsqueeze(1).transpose(0, 1),
64
- new_centroids.transpose(0, 1),
65
- )
66
- / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
67
- 1e-6,
68
- )
69
- )
70
- cs_row = torch.cat(cs_row, dim=0)
71
- cos_sim_matrix.append(cs_row)
72
- return torch.stack(cos_sim_matrix)
73
-
74
- # pylint: disable=R0201
75
- def embed_loss_softmax(self, dvecs, cos_sim_matrix):
76
- """
77
- Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
78
- """
79
- N, M, _ = dvecs.shape
80
- L = []
81
- for j in range(N):
82
- L_row = []
83
- for i in range(M):
84
- L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
85
- L_row = torch.stack(L_row)
86
- L.append(L_row)
87
- return torch.stack(L)
88
-
89
- # pylint: disable=R0201
90
- def embed_loss_contrast(self, dvecs, cos_sim_matrix):
91
- """
92
- Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
93
- """
94
- N, M, _ = dvecs.shape
95
- L = []
96
- for j in range(N):
97
- L_row = []
98
- for i in range(M):
99
- centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
100
- excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j + 1 :]))
101
- L_row.append(1.0 - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
102
- L_row = torch.stack(L_row)
103
- L.append(L_row)
104
- return torch.stack(L)
105
-
106
- def forward(self, x, _label=None):
107
- """
108
- Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
109
- """
110
-
111
- assert x.size()[1] >= 2
112
-
113
- centroids = torch.mean(x, 1)
114
- cos_sim_matrix = self.calc_cosine_sim(x, centroids)
115
- torch.clamp(self.w, 1e-6)
116
- cos_sim_matrix = self.w * cos_sim_matrix + self.b
117
- L = self.embed_loss(x, cos_sim_matrix)
118
- return L.mean()
119
-
120
-
121
- # adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
122
- class AngleProtoLoss(nn.Module):
123
- """
124
- Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
125
- Accepts an input of size (N, M, D)
126
- where N is the number of speakers in the batch,
127
- M is the number of utterances per speaker,
128
- and D is the dimensionality of the embedding vector
129
- Args:
130
- - init_w (float): defines the initial value of w
131
- - init_b (float): defines the initial value of b
132
- """
133
-
134
- def __init__(self, init_w=10.0, init_b=-5.0):
135
- super().__init__()
136
- # pylint: disable=E1102
137
- self.w = nn.Parameter(torch.tensor(init_w))
138
- # pylint: disable=E1102
139
- self.b = nn.Parameter(torch.tensor(init_b))
140
- self.criterion = torch.nn.CrossEntropyLoss()
141
-
142
- print(" > Initialized Angular Prototypical loss")
143
-
144
- def forward(self, x, _label=None):
145
- """
146
- Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
147
- """
148
-
149
- assert x.size()[1] >= 2
150
-
151
- out_anchor = torch.mean(x[:, 1:, :], 1)
152
- out_positive = x[:, 0, :]
153
- num_speakers = out_anchor.size()[0]
154
-
155
- cos_sim_matrix = F.cosine_similarity(
156
- out_positive.unsqueeze(-1).expand(-1, -1, num_speakers),
157
- out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2),
158
- )
159
- torch.clamp(self.w, 1e-6)
160
- cos_sim_matrix = cos_sim_matrix * self.w + self.b
161
- label = torch.arange(num_speakers).to(cos_sim_matrix.device)
162
- L = self.criterion(cos_sim_matrix, label)
163
- return L
164
-
165
-
166
- class SoftmaxLoss(nn.Module):
167
- """
168
- Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982
169
- Args:
170
- - embedding_dim (float): speaker embedding dim
171
- - n_speakers (float): number of speakers
172
- """
173
-
174
- def __init__(self, embedding_dim, n_speakers):
175
- super().__init__()
176
-
177
- self.criterion = torch.nn.CrossEntropyLoss()
178
- self.fc = nn.Linear(embedding_dim, n_speakers)
179
-
180
- print("Initialised Softmax Loss")
181
-
182
- def forward(self, x, label=None):
183
- # reshape for compatibility
184
- x = x.reshape(-1, x.size()[-1])
185
- label = label.reshape(-1)
186
-
187
- x = self.fc(x)
188
- L = self.criterion(x, label)
189
-
190
- return L
191
-
192
- def inference(self, embedding):
193
- x = self.fc(embedding)
194
- activations = torch.nn.functional.softmax(x, dim=1).squeeze(0)
195
- class_id = torch.argmax(activations)
196
- return class_id
197
-
198
-
199
- class SoftmaxAngleProtoLoss(nn.Module):
200
- """
201
- Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153
202
- Args:
203
- - embedding_dim (float): speaker embedding dim
204
- - n_speakers (float): number of speakers
205
- - init_w (float): defines the initial value of w
206
- - init_b (float): defines the initial value of b
207
- """
208
-
209
- def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
210
- super().__init__()
211
-
212
- self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
213
- self.angleproto = AngleProtoLoss(init_w, init_b)
214
-
215
- print("Initialised SoftmaxAnglePrototypical Loss")
216
-
217
- def forward(self, x, label=None):
218
- """
219
- Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
220
- """
221
-
222
- Lp = self.angleproto(x)
223
-
224
- Ls = self.softmax(x, label)
225
-
226
- return Ls + Lp
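A minimal sketch of how these losses consume embeddings; shapes follow the (N, M, D) convention documented above, with random tensors standing in for real d-vectors:

```python
import torch

from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss

N, M, D = 4, 5, 256                  # speakers per batch, utterances per speaker, embedding dim
dvecs = torch.randn(N, M, D)         # stand-in for encoder outputs

print(GE2ELoss(loss_method="softmax")(dvecs))   # scalar GE2E loss
print(AngleProtoLoss()(dvecs))                  # scalar angular prototypical loss

labels = torch.arange(N).unsqueeze(1).expand(N, M)   # one class id per utterance
print(SoftmaxAngleProtoLoss(embedding_dim=D, n_speakers=N)(dvecs, labels))
```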
TTS/encoder/models/__pycache__/base_encoder.cpython-310.pyc DELETED
Binary file (4.53 kB)
 
TTS/encoder/models/__pycache__/lstm.cpython-310.pyc DELETED
Binary file (3.61 kB)
 
TTS/encoder/models/__pycache__/resnet.cpython-310.pyc DELETED
Binary file (5.84 kB)
 
TTS/encoder/models/base_encoder.py DELETED
@@ -1,161 +0,0 @@
1
- import numpy as np
2
- import torch
3
- import torchaudio
4
- from coqpit import Coqpit
5
- from torch import nn
6
-
7
- from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
8
- from TTS.utils.generic_utils import set_init_dict
9
- from TTS.utils.io import load_fsspec
10
-
11
-
12
- class PreEmphasis(nn.Module):
13
- def __init__(self, coefficient=0.97):
14
- super().__init__()
15
- self.coefficient = coefficient
16
- self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0))
17
-
18
- def forward(self, x):
19
- assert len(x.size()) == 2
20
-
21
- x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
22
- return torch.nn.functional.conv1d(x, self.filter).squeeze(1)
23
-
24
-
25
- class BaseEncoder(nn.Module):
26
- """Base `encoder` class. Every new `encoder` model must inherit this.
27
-
28
- It defines common `encoder` specific functions.
29
- """
30
-
31
- # pylint: disable=W0102
32
- def __init__(self):
33
- super(BaseEncoder, self).__init__()
34
-
35
- def get_torch_mel_spectrogram_class(self, audio_config):
36
- return torch.nn.Sequential(
37
- PreEmphasis(audio_config["preemphasis"]),
38
- # TorchSTFT(
39
- # n_fft=audio_config["fft_size"],
40
- # hop_length=audio_config["hop_length"],
41
- # win_length=audio_config["win_length"],
42
- # sample_rate=audio_config["sample_rate"],
43
- # window="hamming_window",
44
- # mel_fmin=0.0,
45
- # mel_fmax=None,
46
- # use_htk=True,
47
- # do_amp_to_db=False,
48
- # n_mels=audio_config["num_mels"],
49
- # power=2.0,
50
- # use_mel=True,
51
- # mel_norm=None,
52
- # )
53
- torchaudio.transforms.MelSpectrogram(
54
- sample_rate=audio_config["sample_rate"],
55
- n_fft=audio_config["fft_size"],
56
- win_length=audio_config["win_length"],
57
- hop_length=audio_config["hop_length"],
58
- window_fn=torch.hamming_window,
59
- n_mels=audio_config["num_mels"],
60
- ),
61
- )
62
-
63
- @torch.no_grad()
64
- def inference(self, x, l2_norm=True):
65
- return self.forward(x, l2_norm)
66
-
67
- @torch.no_grad()
68
- def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True):
69
- """
70
- Generate embeddings for a batch of utterances
71
- x: 1xTxD
72
- """
73
- # map to the waveform size
74
- if self.use_torch_spec:
75
- num_frames = num_frames * self.audio_config["hop_length"]
76
-
77
- max_len = x.shape[1]
78
-
79
- if max_len < num_frames:
80
- num_frames = max_len
81
-
82
- offsets = np.linspace(0, max_len - num_frames, num=num_eval)
83
-
84
- frames_batch = []
85
- for offset in offsets:
86
- offset = int(offset)
87
- end_offset = int(offset + num_frames)
88
- frames = x[:, offset:end_offset]
89
- frames_batch.append(frames)
90
-
91
- frames_batch = torch.cat(frames_batch, dim=0)
92
- embeddings = self.inference(frames_batch, l2_norm=l2_norm)
93
-
94
- if return_mean:
95
- embeddings = torch.mean(embeddings, dim=0, keepdim=True)
96
- return embeddings
97
-
98
- def get_criterion(self, c: Coqpit, num_classes=None):
99
- if c.loss == "ge2e":
100
- criterion = GE2ELoss(loss_method="softmax")
101
- elif c.loss == "angleproto":
102
- criterion = AngleProtoLoss()
103
- elif c.loss == "softmaxproto":
104
- criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes)
105
- else:
106
- raise Exception("%s is not a supported loss" % c.loss)
107
- return criterion
108
-
109
- def load_checkpoint(
110
- self,
111
- config: Coqpit,
112
- checkpoint_path: str,
113
- eval: bool = False,
114
- use_cuda: bool = False,
115
- criterion=None,
116
- cache=False,
117
- ):
118
- state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
119
- try:
120
- self.load_state_dict(state["model"])
121
- print(" > Model fully restored. ")
122
- except (KeyError, RuntimeError) as error:
123
- # If eval raise the error
124
- if eval:
125
- raise error
126
-
127
- print(" > Partial model initialization.")
128
- model_dict = self.state_dict()
129
- model_dict = set_init_dict(model_dict, state["model"], config)
130
- self.load_state_dict(model_dict)
131
- del model_dict
132
-
133
- # load the criterion for restore_path
134
- if criterion is not None and "criterion" in state:
135
- try:
136
- criterion.load_state_dict(state["criterion"])
137
- except (KeyError, RuntimeError) as error:
138
- print(" > Criterion load ignored because of:", error)
139
-
140
- # instance and load the criterion for the encoder classifier in inference time
141
- if (
142
- eval
143
- and criterion is None
144
- and "criterion" in state
145
- and getattr(config, "map_classid_to_classname", None) is not None
146
- ):
147
- criterion = self.get_criterion(config, len(config.map_classid_to_classname))
148
- criterion.load_state_dict(state["criterion"])
149
-
150
- if use_cuda:
151
- self.cuda()
152
- if criterion is not None:
153
- criterion = criterion.cuda()
154
-
155
- if eval:
156
- self.eval()
157
- assert not self.training
158
-
159
- if not eval:
160
- return criterion, state["step"]
161
- return criterion
TTS/encoder/models/lstm.py DELETED
@@ -1,99 +0,0 @@
1
- import torch
2
- from torch import nn
3
-
4
- from TTS.encoder.models.base_encoder import BaseEncoder
5
-
6
-
7
- class LSTMWithProjection(nn.Module):
8
- def __init__(self, input_size, hidden_size, proj_size):
9
- super().__init__()
10
- self.input_size = input_size
11
- self.hidden_size = hidden_size
12
- self.proj_size = proj_size
13
- self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
14
- self.linear = nn.Linear(hidden_size, proj_size, bias=False)
15
-
16
- def forward(self, x):
17
- self.lstm.flatten_parameters()
18
- o, (_, _) = self.lstm(x)
19
- return self.linear(o)
20
-
21
-
22
- class LSTMWithoutProjection(nn.Module):
23
- def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
24
- super().__init__()
25
- self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True)
26
- self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
27
- self.relu = nn.ReLU()
28
-
29
- def forward(self, x):
30
- _, (hidden, _) = self.lstm(x)
31
- return self.relu(self.linear(hidden[-1]))
32
-
33
-
34
- class LSTMSpeakerEncoder(BaseEncoder):
35
- def __init__(
36
- self,
37
- input_dim,
38
- proj_dim=256,
39
- lstm_dim=768,
40
- num_lstm_layers=3,
41
- use_lstm_with_projection=True,
42
- use_torch_spec=False,
43
- audio_config=None,
44
- ):
45
- super().__init__()
46
- self.use_lstm_with_projection = use_lstm_with_projection
47
- self.use_torch_spec = use_torch_spec
48
- self.audio_config = audio_config
49
- self.proj_dim = proj_dim
50
-
51
- layers = []
52
- # choose the LSTM layer variant
53
- if use_lstm_with_projection:
54
- layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
55
- for _ in range(num_lstm_layers - 1):
56
- layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
57
- self.layers = nn.Sequential(*layers)
58
- else:
59
- self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)
60
-
61
- self.instancenorm = nn.InstanceNorm1d(input_dim)
62
-
63
- if self.use_torch_spec:
64
- self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
65
- else:
66
- self.torch_spec = None
67
-
68
- self._init_layers()
69
-
70
- def _init_layers(self):
71
- for name, param in self.layers.named_parameters():
72
- if "bias" in name:
73
- nn.init.constant_(param, 0.0)
74
- elif "weight" in name:
75
- nn.init.xavier_normal_(param)
76
-
77
- def forward(self, x, l2_norm=True):
78
- """Forward pass of the model.
79
-
80
- Args:
81
- x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
82
- to compute the spectrogram on-the-fly.
83
- l2_norm (bool): Whether to L2-normalize the outputs.
84
-
85
- Shapes:
86
- - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
87
- """
88
- with torch.no_grad():
89
- with torch.cuda.amp.autocast(enabled=False):
90
- if self.use_torch_spec:
91
- x.squeeze_(1)
92
- x = self.torch_spec(x)
93
- x = self.instancenorm(x).transpose(1, 2)
94
- d = self.layers(x)
95
- if self.use_lstm_with_projection:
96
- d = d[:, -1]
97
- if l2_norm:
98
- d = torch.nn.functional.normalize(d, p=2, dim=1)
99
- return d
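A minimal sketch of running the LSTM speaker encoder on pre-computed mel frames (with `use_torch_spec` left at its default of `False`); the shapes follow the docstring above:

```python
import torch

from TTS.encoder.models.lstm import LSTMSpeakerEncoder

model = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
mels = torch.randn(2, 80, 250)        # (N, D_spec, T) mel-spectrogram frames
d_vectors = model(mels, l2_norm=True)
print(d_vectors.shape)                # torch.Size([2, 256]), L2-normalised d-vectors
```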
TTS/encoder/models/resnet.py DELETED
@@ -1,198 +0,0 @@
1
- import torch
2
- from torch import nn
3
-
4
- # from TTS.utils.audio.torch_transforms import TorchSTFT
5
- from TTS.encoder.models.base_encoder import BaseEncoder
6
-
7
-
8
- class SELayer(nn.Module):
9
- def __init__(self, channel, reduction=8):
10
- super(SELayer, self).__init__()
11
- self.avg_pool = nn.AdaptiveAvgPool2d(1)
12
- self.fc = nn.Sequential(
13
- nn.Linear(channel, channel // reduction),
14
- nn.ReLU(inplace=True),
15
- nn.Linear(channel // reduction, channel),
16
- nn.Sigmoid(),
17
- )
18
-
19
- def forward(self, x):
20
- b, c, _, _ = x.size()
21
- y = self.avg_pool(x).view(b, c)
22
- y = self.fc(y).view(b, c, 1, 1)
23
- return x * y
24
-
25
-
26
- class SEBasicBlock(nn.Module):
27
- expansion = 1
28
-
29
- def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
30
- super(SEBasicBlock, self).__init__()
31
- self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
32
- self.bn1 = nn.BatchNorm2d(planes)
33
- self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
34
- self.bn2 = nn.BatchNorm2d(planes)
35
- self.relu = nn.ReLU(inplace=True)
36
- self.se = SELayer(planes, reduction)
37
- self.downsample = downsample
38
- self.stride = stride
39
-
40
- def forward(self, x):
41
- residual = x
42
-
43
- out = self.conv1(x)
44
- out = self.relu(out)
45
- out = self.bn1(out)
46
-
47
- out = self.conv2(out)
48
- out = self.bn2(out)
49
- out = self.se(out)
50
-
51
- if self.downsample is not None:
52
- residual = self.downsample(x)
53
-
54
- out += residual
55
- out = self.relu(out)
56
- return out
57
-
58
-
59
- class ResNetSpeakerEncoder(BaseEncoder):
60
- """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
61
- Adapted from: https://github.com/clovaai/voxceleb_trainer
62
- """
63
-
64
- # pylint: disable=W0102
65
- def __init__(
66
- self,
67
- input_dim=64,
68
- proj_dim=512,
69
- layers=[3, 4, 6, 3],
70
- num_filters=[32, 64, 128, 256],
71
- encoder_type="ASP",
72
- log_input=False,
73
- use_torch_spec=False,
74
- audio_config=None,
75
- ):
76
- super(ResNetSpeakerEncoder, self).__init__()
77
-
78
- self.encoder_type = encoder_type
79
- self.input_dim = input_dim
80
- self.log_input = log_input
81
- self.use_torch_spec = use_torch_spec
82
- self.audio_config = audio_config
83
- self.proj_dim = proj_dim
84
-
85
- self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1)
86
- self.relu = nn.ReLU(inplace=True)
87
- self.bn1 = nn.BatchNorm2d(num_filters[0])
88
-
89
- self.inplanes = num_filters[0]
90
- self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0])
91
- self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2))
92
- self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2))
93
- self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2))
94
-
95
- self.instancenorm = nn.InstanceNorm1d(input_dim)
96
-
97
- if self.use_torch_spec:
98
- self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
99
- else:
100
- self.torch_spec = None
101
-
102
- outmap_size = int(self.input_dim / 8)
103
-
104
- self.attention = nn.Sequential(
105
- nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1),
106
- nn.ReLU(),
107
- nn.BatchNorm1d(128),
108
- nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1),
109
- nn.Softmax(dim=2),
110
- )
111
-
112
- if self.encoder_type == "SAP":
113
- out_dim = num_filters[3] * outmap_size
114
- elif self.encoder_type == "ASP":
115
- out_dim = num_filters[3] * outmap_size * 2
116
- else:
117
- raise ValueError("Undefined encoder")
118
-
119
- self.fc = nn.Linear(out_dim, proj_dim)
120
-
121
- self._init_layers()
122
-
123
- def _init_layers(self):
124
- for m in self.modules():
125
- if isinstance(m, nn.Conv2d):
126
- nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
127
- elif isinstance(m, nn.BatchNorm2d):
128
- nn.init.constant_(m.weight, 1)
129
- nn.init.constant_(m.bias, 0)
130
-
131
- def create_layer(self, block, planes, blocks, stride=1):
132
- downsample = None
133
- if stride != 1 or self.inplanes != planes * block.expansion:
134
- downsample = nn.Sequential(
135
- nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
136
- nn.BatchNorm2d(planes * block.expansion),
137
- )
138
-
139
- layers = []
140
- layers.append(block(self.inplanes, planes, stride, downsample))
141
- self.inplanes = planes * block.expansion
142
- for _ in range(1, blocks):
143
- layers.append(block(self.inplanes, planes))
144
-
145
- return nn.Sequential(*layers)
146
-
147
- # pylint: disable=R0201
148
- def new_parameter(self, *size):
149
- out = nn.Parameter(torch.FloatTensor(*size))
150
- nn.init.xavier_normal_(out)
151
- return out
152
-
153
- def forward(self, x, l2_norm=False):
154
- """Forward pass of the model.
155
-
156
- Args:
157
- x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
158
- to compute the spectrogram on-the-fly.
159
- l2_norm (bool): Whether to L2-normalize the outputs.
160
-
161
- Shapes:
162
- - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
163
- """
164
- x.squeeze_(1)
165
- # if use_torch_spec is enabled, compute the spectrogram here; otherwise use the mel spec computed by the AudioProcessor
166
- if self.use_torch_spec:
167
- x = self.torch_spec(x)
168
-
169
- if self.log_input:
170
- x = (x + 1e-6).log()
171
- x = self.instancenorm(x).unsqueeze(1)
172
-
173
- x = self.conv1(x)
174
- x = self.relu(x)
175
- x = self.bn1(x)
176
-
177
- x = self.layer1(x)
178
- x = self.layer2(x)
179
- x = self.layer3(x)
180
- x = self.layer4(x)
181
-
182
- x = x.reshape(x.size()[0], -1, x.size()[-1])
183
-
184
- w = self.attention(x)
185
-
186
- if self.encoder_type == "SAP":
187
- x = torch.sum(x * w, dim=2)
188
- elif self.encoder_type == "ASP":
189
- mu = torch.sum(x * w, dim=2)
190
- sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5))
191
- x = torch.cat((mu, sg), 1)
192
-
193
- x = x.view(x.size()[0], -1)
194
- x = self.fc(x)
195
-
196
- if l2_norm:
197
- x = torch.nn.functional.normalize(x, p=2, dim=1)
198
- return x
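A minimal sketch of the ResNet speaker encoder on mel input; `input_dim` must match the number of mel bands, and the values below are illustrative:

```python
import torch

from TTS.encoder.models.resnet import ResNetSpeakerEncoder

model = ResNetSpeakerEncoder(input_dim=64, proj_dim=512)
mels = torch.randn(3, 64, 400)        # (N, D_spec, T) mel-spectrogram frames
embeddings = model(mels, l2_norm=True)
print(embeddings.shape)               # torch.Size([3, 512]) speaker embeddings
```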
TTS/encoder/requirements.txt DELETED
@@ -1,2 +0,0 @@
1
- umap-learn
2
- numpy>=1.17.0
TTS/encoder/utils/__init__.py DELETED
File without changes
TTS/encoder/utils/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (171 Bytes)
 
TTS/encoder/utils/__pycache__/generic_utils.cpython-310.pyc DELETED
Binary file (3.7 kB)