New model
Browse files
best_model.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9deed38616baaf2bdf6024d7d87289e1e641faa459a20f7d53ba7d99093cf891
|
3 |
+
size 997712054
|
config.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
{
|
2 |
"output_path": "/home/aymen/Tamazight-NLP/Speech/coqui_tts",
|
3 |
"logger_uri": null,
|
4 |
-
"run_name": "
|
5 |
"project_name": null,
|
6 |
"run_description": "\ud83d\udc38Coqui trainer run.",
|
7 |
"print_step": 25,
|
@@ -11,14 +11,14 @@
|
|
11 |
"dashboard_logger": "tensorboard",
|
12 |
"save_on_interrupt": true,
|
13 |
"log_model_step": null,
|
14 |
-
"save_step":
|
15 |
"save_n_checkpoints": 5,
|
16 |
"save_checkpoints": true,
|
17 |
-
"save_all_best":
|
18 |
-
"save_best_after":
|
19 |
"target_loss": null,
|
20 |
-
"print_eval":
|
21 |
-
"test_delay_epochs":
|
22 |
"run_eval": true,
|
23 |
"run_eval_steps": null,
|
24 |
"distributed_backend": "nccl",
|
@@ -57,7 +57,7 @@
|
|
57 |
"use_noise_augment": false,
|
58 |
"audio": {
|
59 |
"fft_size": 1024,
|
60 |
-
"sample_rate":
|
61 |
"win_length": 1024,
|
62 |
"hop_length": 256,
|
63 |
"num_mels": 80,
|
@@ -80,18 +80,18 @@
|
|
80 |
"bos": "<BOS>",
|
81 |
"blank": "<BLNK>",
|
82 |
"characters": "\u2d30\u2d31\u2d33\u2d37\u2d39\u2d3b\u2d3c\u2d3d\u2d40\u2d43\u2d44\u2d45\u2d47\u2d49\u2d4a\u2d4d\u2d4e\u2d4f\u2d53\u2d54\u2d55\u2d56\u2d59\u2d5a\u2d5b\u2d5c\u2d5f\u2d61\u2d62\u2d63\u2d65\u2d6f",
|
83 |
-
"punctuations": " ",
|
84 |
"phonemes": null,
|
85 |
"is_unique": true,
|
86 |
"is_sorted": true
|
87 |
},
|
88 |
"add_blank": true,
|
89 |
-
"batch_group_size":
|
90 |
"loss_masking": null,
|
91 |
-
"min_audio_len":
|
92 |
"max_audio_len": Infinity,
|
93 |
"min_text_len": 1,
|
94 |
-
"max_text_len":
|
95 |
"compute_f0": false,
|
96 |
"compute_energy": false,
|
97 |
"compute_linear_spec": true,
|
@@ -103,8 +103,8 @@
|
|
103 |
{
|
104 |
"formatter": "nemo",
|
105 |
"dataset_name": "",
|
106 |
-
"path": "/home/aymen/Tamazight-NLP/Speech/media_ipsapps/shi_tls/
|
107 |
-
"meta_file_train": "
|
108 |
"ignored_speakers": null,
|
109 |
"language": "",
|
110 |
"phonemizer": "",
|
@@ -114,10 +114,16 @@
|
|
114 |
],
|
115 |
"test_sentences": [
|
116 |
[
|
117 |
-
"\u2d30\u2d63\u2d53\u2d4d"
|
118 |
],
|
119 |
[
|
120 |
-
"\
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
]
|
122 |
],
|
123 |
"eval_split_max_size": null,
|
@@ -129,7 +135,7 @@
|
|
129 |
"use_length_weighted_sampler": false,
|
130 |
"length_weighted_sampler_alpha": 1.0,
|
131 |
"model_args": {
|
132 |
-
"num_chars":
|
133 |
"out_channels": 513,
|
134 |
"spec_segment_size": 32,
|
135 |
"hidden_channels": 192,
|
@@ -256,6 +262,6 @@
|
|
256 |
"use_d_vector_file": false,
|
257 |
"d_vector_file": null,
|
258 |
"d_vector_dim": 0,
|
259 |
-
"restore_path": "/home/aymen/Tamazight-NLP/Speech/coqui_tts/
|
260 |
"github_branch": "inside_docker"
|
261 |
}
|
|
|
1 |
{
|
2 |
"output_path": "/home/aymen/Tamazight-NLP/Speech/coqui_tts",
|
3 |
"logger_uri": null,
|
4 |
+
"run_name": "vits_shi_male",
|
5 |
"project_name": null,
|
6 |
"run_description": "\ud83d\udc38Coqui trainer run.",
|
7 |
"print_step": 25,
|
|
|
11 |
"dashboard_logger": "tensorboard",
|
12 |
"save_on_interrupt": true,
|
13 |
"log_model_step": null,
|
14 |
+
"save_step": 5000,
|
15 |
"save_n_checkpoints": 5,
|
16 |
"save_checkpoints": true,
|
17 |
+
"save_all_best": false,
|
18 |
+
"save_best_after": 0,
|
19 |
"target_loss": null,
|
20 |
+
"print_eval": true,
|
21 |
+
"test_delay_epochs": -1,
|
22 |
"run_eval": true,
|
23 |
"run_eval_steps": null,
|
24 |
"distributed_backend": "nccl",
|
|
|
57 |
"use_noise_augment": false,
|
58 |
"audio": {
|
59 |
"fft_size": 1024,
|
60 |
+
"sample_rate": 22050,
|
61 |
"win_length": 1024,
|
62 |
"hop_length": 256,
|
63 |
"num_mels": 80,
|
|
|
80 |
"bos": "<BOS>",
|
81 |
"blank": "<BLNK>",
|
82 |
"characters": "\u2d30\u2d31\u2d33\u2d37\u2d39\u2d3b\u2d3c\u2d3d\u2d40\u2d43\u2d44\u2d45\u2d47\u2d49\u2d4a\u2d4d\u2d4e\u2d4f\u2d53\u2d54\u2d55\u2d56\u2d59\u2d5a\u2d5b\u2d5c\u2d5f\u2d61\u2d62\u2d63\u2d65\u2d6f",
|
83 |
+
"punctuations": " !,.:?",
|
84 |
"phonemes": null,
|
85 |
"is_unique": true,
|
86 |
"is_sorted": true
|
87 |
},
|
88 |
"add_blank": true,
|
89 |
+
"batch_group_size": 5,
|
90 |
"loss_masking": null,
|
91 |
+
"min_audio_len": 1,
|
92 |
"max_audio_len": Infinity,
|
93 |
"min_text_len": 1,
|
94 |
+
"max_text_len": Infinity,
|
95 |
"compute_f0": false,
|
96 |
"compute_energy": false,
|
97 |
"compute_linear_spec": true,
|
|
|
103 |
{
|
104 |
"formatter": "nemo",
|
105 |
"dataset_name": "",
|
106 |
+
"path": "/home/aymen/Tamazight-NLP/Speech/media_ipsapps/shi_tls/cut_22kHz_offset/manifests/",
|
107 |
+
"meta_file_train": "manifest_processed.json",
|
108 |
"ignored_speakers": null,
|
109 |
"language": "",
|
110 |
"phonemizer": "",
|
|
|
114 |
],
|
115 |
"test_sentences": [
|
116 |
[
|
117 |
+
"\u2d30\u2d63\u2d53\u2d4d. \u2d4e\u2d30\u2d4f\u2d63\u2d30\u2d3d\u2d49\u2d4f?"
|
118 |
],
|
119 |
[
|
120 |
+
"\u2d61\u2d30 \u2d5c\u2d30\u2d4e\u2d56\u2d30\u2d54\u2d5c \u2d4e\u2d30 \u2d37 \u2d53\u2d3d\u2d30\u2d4f \u2d5c\u2d59\u2d3d\u2d54\u2d5c?"
|
121 |
+
],
|
122 |
+
[
|
123 |
+
"\u2d33\u2d4f! \u2d30\u2d37 \u2d30\u2d3d \u2d49\u2d59\u2d59\u2d33\u2d4f \u2d55\u2d31\u2d31\u2d49 \u2d49\u2d5c\u2d5c\u2d53 \u2d3d."
|
124 |
+
],
|
125 |
+
[
|
126 |
+
"\u2d30\u2d54\u2d54\u2d30\u2d61 \u2d4f \u2d4d\u2d40\u2d4e\u2d4e \u2d62\u2d53\u2d3d\u2d54 \u2d30\u2d56 \u2d49\u2d40\u2d37\u2d53\u2d4e\u2d4f \u2d4f\u2d4f\u2d56!"
|
127 |
]
|
128 |
],
|
129 |
"eval_split_max_size": null,
|
|
|
135 |
"use_length_weighted_sampler": false,
|
136 |
"length_weighted_sampler_alpha": 1.0,
|
137 |
"model_args": {
|
138 |
+
"num_chars": 40,
|
139 |
"out_channels": 513,
|
140 |
"spec_segment_size": 32,
|
141 |
"hidden_channels": 192,
|
|
|
262 |
"use_d_vector_file": false,
|
263 |
"d_vector_file": null,
|
264 |
"d_vector_dim": 0,
|
265 |
+
"restore_path": "/home/aymen/Tamazight-NLP/Speech/coqui_tts/vits_shi_male-November-22-2024_02+49AM-0000000/checkpoint_23106.pth",
|
266 |
"github_branch": "inside_docker"
|
267 |
}
|
events.out.tfevents.1732240190.DESKTOP-4S0973M.2975.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1c1becd36564b6f039e7ead68827eaa4de95da20454ee1f9a93c90a3a1d33f5f
|
3 |
+
size 1643638
|
events.out.tfevents.1732266340.DESKTOP-4S0973M.857.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:240f28bbd7d51661943c84742eee4a6fe451716276e03a3ded409b5163b815d3
|
3 |
+
size 20549500
|
events.out.tfevents.1732309973.DESKTOP-4S0973M.911.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:06817c8595a3a147808ea9a411347a0db94dc8d710c0accc60e64d0c54ca88f6
|
3 |
+
size 39935360
|
train_vits.py
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, VitsAudioConfig
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

# Run artifacts (checkpoints, tensorboard events) are written next to this script.
RUN_DIR = os.path.dirname(os.path.abspath(__file__))

# Dataset: NeMo-style JSON manifest of Tachelhit speech cut to 22 kHz clips.
MANIFEST_DIR = "/home/aymen/Tamazight-NLP/Speech/media_ipsapps/shi_tls/cut_22kHz_offset/manifests/"

dataset_cfg = BaseDatasetConfig(
    formatter="nemo",
    meta_file_train="manifest_processed.json",
    path=MANIFEST_DIR,
)

# Audio front-end: 22.05 kHz signal, 80 mel bands, 1024/256 window/hop.
audio_cfg = VitsAudioConfig(
    sample_rate=22050,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
)

# Character inventory: Tifinagh letters plus basic punctuation; phonemes unused.
chars_cfg = CharactersConfig(
    characters_class="TTS.tts.models.vits.VitsCharacters",
    characters="ⴰⴱⴳⴷⴹⴻⴼⴽⵀⵃⵄⵅⵇⵉⵊⵍⵎⵏⵓⵔⵕⵖⵙⵚⵛⵜⵟⵡⵢⵣⵥⵯ",
    punctuations=" !,.:?",
    pad="<PAD>",
    eos="<EOS>",
    bos="<BOS>",
    blank="<BLNK>",
)

# Full VITS training configuration; test_sentences are synthesized at eval time.
vits_cfg = VitsConfig(
    audio=audio_cfg,
    characters=chars_cfg,
    run_name="vits_shi_male",
    batch_size=16,
    eval_batch_size=4,
    batch_group_size=5,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    save_step=5000,
    text_cleaner="no_cleaners",
    use_phonemes=False,
    compute_input_seq_cache=True,
    print_step=25,
    print_eval=True,
    mixed_precision=True,
    output_path=RUN_DIR,
    datasets=[dataset_cfg],
    cudnn_benchmark=False,
    test_sentences=[
        ["ⴰⵣⵓⵍ. ⵎⴰⵏⵣⴰⴽⵉⵏ?"],
        ["ⵡⴰ ⵜⴰⵎⵖⴰⵔⵜ ⵎⴰ ⴷ ⵓⴽⴰⵏ ⵜⵙⴽⵔⵜ?"],
        ["ⴳⵏ! ⴰⴷ ⴰⴽ ⵉⵙⵙⴳⵏ ⵕⴱⴱⵉ ⵉⵜⵜⵓ ⴽ."],
        ["ⴰⵔⵔⴰⵡ ⵏ ⵍⵀⵎⵎ ⵢⵓⴽⵔ ⴰⵖ ⵉⵀⴷⵓⵎⵏ ⵏⵏⵖ!"]
    ],
)

# Audio processor: feature extraction and audio I/O for the dataloader and loggers.
ap = AudioProcessor.init_from_config(vits_cfg)

# Tokenizer maps text to token-ID sequences; it may also update the config with
# default character settings, hence the config is reassigned.
tokenizer, vits_cfg = TTSTokenizer.init_from_config(vits_cfg)

# Load samples; each is a [text, audio_file_path, speaker_name] triple, split
# into train/eval using the sizes carried by the config.
train_samples, eval_samples = load_tts_samples(
    dataset_cfg,
    eval_split=True,
    eval_split_max_size=vits_cfg.eval_split_max_size,
    eval_split_size=vits_cfg.eval_split_size,
)

# Single-speaker model: no speaker manager.
model = Vits(vits_cfg, ap, tokenizer, speaker_manager=None)

# Wire everything into the trainer and start the run 🚀
trainer = Trainer(
    TrainerArgs(),
    vits_cfg,
    RUN_DIR,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()
|
trainer_0_log.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|