ayymen committed on
Commit ceb44c9
1 Parent(s): dba95d6
best_model.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e495847684ad5314bb6903e20e16f04ed0084e7efe2133676fad9eb9999ef57c
-size 997700278
+oid sha256:9deed38616baaf2bdf6024d7d87289e1e641faa459a20f7d53ba7d99093cf891
+size 997712054
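The pointer above records only the LFS object's SHA-256 (`oid`) and byte size; the model weights themselves live in LFS storage. A minimal sketch, standard library only, for checking that a downloaded `best_model.pth` matches the new pointer:

```python
# Minimal sketch: verify a downloaded best_model.pth against the Git LFS
# pointer above (the oid is the SHA-256 of the file's contents).
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            h.update(block)
    return h.hexdigest()

expected = "9deed38616baaf2bdf6024d7d87289e1e641faa459a20f7d53ba7d99093cf891"
assert sha256_of("best_model.pth") == expected
```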
config.json CHANGED
@@ -1,7 +1,7 @@
 {
     "output_path": "/home/aymen/Tamazight-NLP/Speech/coqui_tts",
     "logger_uri": null,
-    "run_name": "tashelhit_bible",
+    "run_name": "vits_shi_male",
     "project_name": null,
     "run_description": "\ud83d\udc38Coqui trainer run.",
     "print_step": 25,
@@ -11,14 +11,14 @@
     "dashboard_logger": "tensorboard",
     "save_on_interrupt": true,
     "log_model_step": null,
-    "save_step": 10000,
+    "save_step": 5000,
     "save_n_checkpoints": 5,
     "save_checkpoints": true,
-    "save_all_best": true,
-    "save_best_after": 1000,
+    "save_all_best": false,
+    "save_best_after": 0,
     "target_loss": null,
-    "print_eval": false,
-    "test_delay_epochs": 0,
+    "print_eval": true,
+    "test_delay_epochs": -1,
     "run_eval": true,
     "run_eval_steps": null,
     "distributed_backend": "nccl",
@@ -57,7 +57,7 @@
     "use_noise_augment": false,
     "audio": {
         "fft_size": 1024,
-        "sample_rate": 16000,
+        "sample_rate": 22050,
         "win_length": 1024,
         "hop_length": 256,
         "num_mels": 80,
@@ -80,18 +80,18 @@
         "bos": "<BOS>",
         "blank": "<BLNK>",
         "characters": "\u2d30\u2d31\u2d33\u2d37\u2d39\u2d3b\u2d3c\u2d3d\u2d40\u2d43\u2d44\u2d45\u2d47\u2d49\u2d4a\u2d4d\u2d4e\u2d4f\u2d53\u2d54\u2d55\u2d56\u2d59\u2d5a\u2d5b\u2d5c\u2d5f\u2d61\u2d62\u2d63\u2d65\u2d6f",
-        "punctuations": " ",
+        "punctuations": " !,.:?",
         "phonemes": null,
         "is_unique": true,
         "is_sorted": true
     },
     "add_blank": true,
-    "batch_group_size": 0,
+    "batch_group_size": 5,
     "loss_masking": null,
-    "min_audio_len": 0,
+    "min_audio_len": 1,
     "max_audio_len": Infinity,
     "min_text_len": 1,
-    "max_text_len": 250,
+    "max_text_len": Infinity,
     "compute_f0": false,
     "compute_energy": false,
     "compute_linear_spec": true,
@@ -103,8 +103,8 @@
         {
             "formatter": "nemo",
             "dataset_name": "",
-            "path": "/home/aymen/Tamazight-NLP/Speech/media_ipsapps/shi_tls/hq_segmented/manifests/",
-            "meta_file_train": "manifest.json",
+            "path": "/home/aymen/Tamazight-NLP/Speech/media_ipsapps/shi_tls/cut_22kHz_offset/manifests/",
+            "meta_file_train": "manifest_processed.json",
             "ignored_speakers": null,
             "language": "",
             "phonemizer": "",
@@ -114,10 +114,16 @@
     ],
     "test_sentences": [
         [
-            "\u2d30\u2d63\u2d53\u2d4d"
+            "\u2d30\u2d63\u2d53\u2d4d. \u2d4e\u2d30\u2d4f\u2d63\u2d30\u2d3d\u2d49\u2d4f?"
         ],
         [
-            "\u2d33\u2d4f \u2d30\u2d37 \u2d30\u2d3d \u2d49\u2d59\u2d59\u2d33\u2d4f \u2d55\u2d31\u2d31\u2d49 \u2d49\u2d5c\u2d5c\u2d53 \u2d3d"
+            "\u2d61\u2d30 \u2d5c\u2d30\u2d4e\u2d56\u2d30\u2d54\u2d5c \u2d4e\u2d30 \u2d37 \u2d53\u2d3d\u2d30\u2d4f \u2d5c\u2d59\u2d3d\u2d54\u2d5c?"
+        ],
+        [
+            "\u2d33\u2d4f! \u2d30\u2d37 \u2d30\u2d3d \u2d49\u2d59\u2d59\u2d33\u2d4f \u2d55\u2d31\u2d31\u2d49 \u2d49\u2d5c\u2d5c\u2d53 \u2d3d."
+        ],
+        [
+            "\u2d30\u2d54\u2d54\u2d30\u2d61 \u2d4f \u2d4d\u2d40\u2d4e\u2d4e \u2d62\u2d53\u2d3d\u2d54 \u2d30\u2d56 \u2d49\u2d40\u2d37\u2d53\u2d4e\u2d4f \u2d4f\u2d4f\u2d56!"
         ]
     ],
     "eval_split_max_size": null,
@@ -129,7 +135,7 @@
     "use_length_weighted_sampler": false,
     "length_weighted_sampler_alpha": 1.0,
     "model_args": {
-        "num_chars": 35,
+        "num_chars": 40,
         "out_channels": 513,
         "spec_segment_size": 32,
         "hidden_channels": 192,
@@ -256,6 +262,6 @@
     "use_d_vector_file": false,
     "d_vector_file": null,
     "d_vector_dim": 0,
-    "restore_path": "/home/aymen/Tamazight-NLP/Speech/coqui_tts/tashelhit_bible-November-17-2024_01+37PM-0000000/checkpoint_28947.pth",
+    "restore_path": "/home/aymen/Tamazight-NLP/Speech/coqui_tts/vits_shi_male-November-22-2024_02+49AM-0000000/checkpoint_23106.pth",
     "github_branch": "inside_docker"
 }
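The `num_chars` bump from 35 to 40 tracks the punctuation change: the set grows from `" "` (1 symbol) to `" !,.:?"` (6 symbols), five new tokens. A minimal sketch of the arithmetic, under the assumption that `VitsCharacters` composes its vocabulary as pad + punctuations + characters + blank (an assumption, but it matches both totals):

```python
# Hypothetical recount of "num_chars" for the old and new configs; assumes the
# vocabulary is <PAD> + punctuations + characters + <BLNK>.
characters = "ⴰⴱⴳⴷⴹⴻⴼⴽⵀⵃⵄⵅⵇⵉⵊⵍⵎⵏⵓⵔⵕⵖⵙⵚⵛⵜⵟⵡⵢⵣⵥⵯ"  # 32 Tifinagh letters
for punctuations in (" ", " !,.:?"):
    num_chars = 1 + len(punctuations) + len(characters) + 1
    print(repr(punctuations), num_chars)  # ' ' -> 35, ' !,.:?' -> 40
```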
events.out.tfevents.1732240190.DESKTOP-4S0973M.2975.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c1becd36564b6f039e7ead68827eaa4de95da20454ee1f9a93c90a3a1d33f5f
+size 1643638
events.out.tfevents.1732266340.DESKTOP-4S0973M.857.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:240f28bbd7d51661943c84742eee4a6fe451716276e03a3ded409b5163b815d3
+size 20549500
events.out.tfevents.1732309973.DESKTOP-4S0973M.911.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06817c8595a3a147808ea9a411347a0db94dc8d710c0accc60e64d0c54ca88f6
+size 39935360
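These `events.out.tfevents.*` files are TensorBoard logs produced by the `"dashboard_logger": "tensorboard"` setting; they are stored via LFS, so they need `git lfs pull` before they can be read. A minimal sketch for listing the logged scalar series, assuming the `tensorboard` package is installed:

```python
# Minimal sketch: enumerate the scalar tags recorded in one event file.
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

acc = EventAccumulator("events.out.tfevents.1732309973.DESKTOP-4S0973M.911.0")
acc.Reload()                   # parse the event file
print(acc.Tags()["scalars"])   # tag names logged by the Coqui trainer
```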
train_vits.py ADDED
@@ -0,0 +1,97 @@
+import os
+
+from trainer import Trainer, TrainerArgs
+
+from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig
+from TTS.tts.configs.vits_config import VitsConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.vits import Vits, VitsAudioConfig
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
+from TTS.utils.audio import AudioProcessor
+
+output_path = os.path.dirname(os.path.abspath(__file__))
+# DEFINE DATASET CONFIG
+TRAIN_PATH = "/home/aymen/Tamazight-NLP/Speech/media_ipsapps/shi_tls/cut_22kHz_offset/manifests/"
+dataset_config = BaseDatasetConfig(
+    formatter="nemo", meta_file_train="manifest_processed.json", path=TRAIN_PATH
+)
+
+audio_config = VitsAudioConfig(
+    sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None
+)
+
+character_config = CharactersConfig(
+    characters_class="TTS.tts.models.vits.VitsCharacters",
+    characters="ⴰⴱⴳⴷⴹⴻⴼⴽⵀⵃⵄⵅⵇⵉⵊⵍⵎⵏⵓⵔⵕⵖⵙⵚⵛⵜⵟⵡⵢⵣⵥⵯ",
+    punctuations=" !,.:?",
+    pad="<PAD>",
+    eos="<EOS>",
+    bos="<BOS>",
+    blank="<BLNK>",
+)
+
+config = VitsConfig(
+    audio=audio_config,
+    characters=character_config,
+    run_name="vits_shi_male",
+    batch_size=16,
+    eval_batch_size=4,
+    batch_group_size=5,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    run_eval=True,
+    test_delay_epochs=-1,
+    epochs=1000,
+    save_step=5000,
+    text_cleaner="no_cleaners",
+    use_phonemes=False,
+    compute_input_seq_cache=True,
+    print_step=25,
+    print_eval=True,
+    mixed_precision=True,
+    output_path=output_path,
+    datasets=[dataset_config],
+    cudnn_benchmark=False,
+    test_sentences=[
+        ["ⴰⵣⵓⵍ. ⵎⴰⵏⵣⴰⴽⵉⵏ?"],
+        ["ⵡⴰ ⵜⴰⵎⵖⴰⵔⵜ ⵎⴰ ⴷ ⵓⴽⴰⵏ ⵜⵙⴽⵔⵜ?"],
+        ["ⴳⵏ! ⴰⴷ ⴰⴽ ⵉⵙⵙⴳⵏ ⵕⴱⴱⵉ ⵉⵜⵜⵓ ⴽ."],
+        ["ⴰⵔⵔⴰⵡ ⵏ ⵍⵀⵎⵎ ⵢⵓⴽⵔ ⴰⵖ ⵉⵀⴷⵓⵎⵏ ⵏⵏⵖ!"]
+    ],
+)
+
+# INITIALIZE THE AUDIO PROCESSOR
+# The audio processor handles feature extraction and audio I/O.
+# It mainly serves the dataloader and the training loggers.
+ap = AudioProcessor.init_from_config(config)
+
+# INITIALIZE THE TOKENIZER
+# The tokenizer converts text to sequences of token IDs.
+# The config is updated with the default characters if none are defined in it.
+tokenizer, config = TTSTokenizer.init_from_config(config)
+
+# LOAD DATA SAMPLES
+# Each sample is a list of `[text, audio_file_path, speaker_name]`.
+# You can define a custom sample loader returning the list of samples,
+# or define a custom formatter and pass it to `load_tts_samples`.
+# Check `TTS.tts.datasets.load_tts_samples` for more details.
+train_samples, eval_samples = load_tts_samples(
+    dataset_config,
+    eval_split=True,
+    eval_split_max_size=config.eval_split_max_size,
+    eval_split_size=config.eval_split_size,
+)
+
+# init model
+model = Vits(config, ap, tokenizer, speaker_manager=None)
+
+# init the trainer and 🚀
+trainer = Trainer(
+    TrainerArgs(),
+    config,
+    output_path,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+)
+trainer.fit()
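With `best_model.pth` and `config.json` from this commit downloaded locally, the checkpoint can be exercised on one of the script's own test sentences. A minimal sketch, assuming Coqui TTS's `Synthesizer` API and that both files sit in the working directory:

```python
# Minimal sketch: synthesize a test sentence from the committed checkpoint.
from TTS.utils.synthesizer import Synthesizer

synth = Synthesizer(tts_checkpoint="best_model.pth", tts_config_path="config.json")
wav = synth.tts("ⴰⵣⵓⵍ. ⵎⴰⵏⵣⴰⴽⵉⵏ?")  # first test sentence from train_vits.py
synth.save_wav(wav, "azul.wav")
```

Note that `restore_path` in config.json points at `checkpoint_23106.pth` from an earlier `vits_shi_male` run, which suggests this training was itself resumed, typically via something like `python train_vits.py --restore_path <checkpoint>` with the Coqui trainer's command-line handling.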
trainer_0_log.txt ADDED
The diff for this file is too large to render. See raw diff