Spaces:
Running
Running
Yurii Paniv
committed on
Commit
·
6449e88
1
Parent(s):
eb57397
Remove Coqui
Browse files- README.md +4 -22
- app.py +4 -7
- config.json +0 -319
- config.yaml +371 -0
- requirements.txt +6 -3
- setup.py +4 -4
- tests/test_formatter.py +2 -2
- tests/test_tts.py +2 -2
- ukrainian_tts/formatter.py +1 -0
- ukrainian_tts/tts.py +41 -26
README.md
CHANGED
@@ -4,14 +4,14 @@ emoji: ๐
|
|
4 |
colorFrom: blue
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
-
sdk_version : 3.
|
8 |
python_version: 3.9
|
9 |
app_file: app.py
|
10 |
pinned: false
|
11 |
---
|
12 |
|
13 |
# Ukrainian TTS 📢🤖
|
14 |
-
Ukrainian TTS (text-to-speech) using
|
15 |
|
16 |
![pytest](https://github.com/robinhad/ukrainian-tts/actions/workflows/hf-sync.yml/badge.svg)
|
17 |
[![Open In HF🤗 Space ](https://img.shields.io/badge/Open%20Demo-%F0%9F%A4%97%20Space-yellow)](https://huggingface.co/spaces/robinhad/ukrainian-tts)
|
@@ -65,33 +65,15 @@ pip install git+https://github.com/robinhad/ukrainian-tts.git
|
|
65 |
```python
|
66 |
from ukrainian_tts.tts import TTS, Voices, Stress
|
67 |
|
68 |
-
tts = TTS(
|
69 |
with open("test.wav", mode="wb") as file:
|
70 |
_, text = tts.tts("Привіт", Voices.Dmytro.value, Stress.Model.value, file)
|
71 |
print("Accented text:", text)
|
72 |
```
|
73 |
|
74 |
-
## Run manually:
|
75 |
-
`Caution: this won't use normalizer and autostress like a web demo. `
|
76 |
-
1. `pip install -r requirements.txt`.
|
77 |
-
2. Download `model.pth` and `speakers.pth` from "Releases" tab.
|
78 |
-
3. Launch as one-time command:
|
79 |
-
```
|
80 |
-
tts --text "Text for TTS" \
|
81 |
-
--model_path path/to/model.pth \
|
82 |
-
--config_path path/to/config.json \
|
83 |
-
--speaker_idx dmytro \
|
84 |
-
--out_path folder/to/save/output.wav
|
85 |
-
```
|
86 |
-
or alternatively launch web server using:
|
87 |
-
```
|
88 |
-
tts-server --model_path path/to/model.pth \
|
89 |
-
--config_path path/to/config.json
|
90 |
-
```
|
91 |
|
92 |
# How to train: ๐๏ธ
|
93 |
-
|
94 |
-
2. Instead of provided `config.json` use one from this repo.
|
95 |
|
96 |
|
97 |
# Attribution ๐ค
|
|
|
4 |
colorFrom: blue
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
+
sdk_version : 3.12
|
8 |
python_version: 3.9
|
9 |
app_file: app.py
|
10 |
pinned: false
|
11 |
---
|
12 |
|
13 |
# Ukrainian TTS 📢🤖
|
14 |
+
Ukrainian TTS (text-to-speech) using ESPNET.
|
15 |
|
16 |
![pytest](https://github.com/robinhad/ukrainian-tts/actions/workflows/hf-sync.yml/badge.svg)
|
17 |
[![Open In HF🤗 Space ](https://img.shields.io/badge/Open%20Demo-%F0%9F%A4%97%20Space-yellow)](https://huggingface.co/spaces/robinhad/ukrainian-tts)
|
|
|
65 |
```python
|
66 |
from ukrainian_tts.tts import TTS, Voices, Stress
|
67 |
|
68 |
+
tts = TTS()
|
69 |
with open("test.wav", mode="wb") as file:
|
70 |
_, text = tts.tts("Привіт", Voices.Dmytro.value, Stress.Model.value, file)
|
71 |
print("Accented text:", text)
|
72 |
```
|
73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
# How to train: ๐๏ธ
|
76 |
+
TBD
|
|
|
77 |
|
78 |
|
79 |
# Attribution ๐ค
|
app.py
CHANGED
@@ -53,11 +53,8 @@ class VoiceOption(Enum):
|
|
53 |
|
54 |
print(f"CUDA available? {is_available()}")
|
55 |
|
56 |
-
badge = (
|
57 |
-
"https://visitor-badge-reloaded.herokuapp.com/badge?page_id=robinhad.ukrainian-tts"
|
58 |
-
)
|
59 |
|
60 |
-
ukr_tts = TTS(
|
61 |
|
62 |
|
63 |
def tts(text: str, voice: str, stress: str):
|
@@ -121,9 +118,9 @@ iface = gr.Interface(
|
|
121 |
gr.components.Audio(label="Output"),
|
122 |
gr.components.Textbox(label="ะะฐะณะพะปะพัะตะฝะธะน ัะตะบัั"),
|
123 |
],
|
124 |
-
title="
|
125 |
-
description="ะฃะบัะฐัะฝะพะผะพะฒะฝะธะน๐บ๐ฆ TTS ะทะฐ ะดะพะฟะพะผะพะณะพั
|
126 |
-
article=article
|
127 |
examples=[
|
128 |
[
|
129 |
"ะะฒะตะดััั, ะฑัะดั ะปะฐัะบะฐ, ัะฒะพั ัะตัะตะฝะฝั.",
|
|
|
53 |
|
54 |
print(f"CUDA available? {is_available()}")
|
55 |
|
|
|
|
|
|
|
56 |
|
57 |
+
ukr_tts = TTS()
|
58 |
|
59 |
|
60 |
def tts(text: str, voice: str, stress: str):
|
|
|
118 |
gr.components.Audio(label="Output"),
|
119 |
gr.components.Textbox(label="ะะฐะณะพะปะพัะตะฝะธะน ัะตะบัั"),
|
120 |
],
|
121 |
+
title="๐ค๐ฌ๐บ๐ฆ - ESPNET",
|
122 |
+
description="ะฃะบัะฐัะฝะพะผะพะฒะฝะธะน๐บ๐ฆ TTS ะทะฐ ะดะพะฟะพะผะพะณะพั ESPNET (ัะพะฑ ะฒัััะฝั ะฟะพััะฐะฒะธัะธ ะฝะฐะณะพะปะพั, ะฒะธะบะพัะธััะพะฒัะนัะต + ะฟะตัะตะด ะณะพะปะพัะฝะพั)",
|
123 |
+
article=article,
|
124 |
examples=[
|
125 |
[
|
126 |
"ะะฒะตะดััั, ะฑัะดั ะปะฐัะบะฐ, ัะฒะพั ัะตัะตะฝะฝั.",
|
config.json
DELETED
@@ -1,319 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"output_path": "/home/robinhad/Projects/TTS",
|
3 |
-
"logger_uri": null,
|
4 |
-
"run_name": "vits_mykyta_latest",
|
5 |
-
"project_name": null,
|
6 |
-
"run_description": "\ud83d\udc38Coqui trainer run.",
|
7 |
-
"print_step": 25,
|
8 |
-
"plot_step": 100,
|
9 |
-
"model_param_stats": false,
|
10 |
-
"wandb_entity": null,
|
11 |
-
"dashboard_logger": "tensorboard",
|
12 |
-
"log_model_step": 5000,
|
13 |
-
"save_step": 5000,
|
14 |
-
"save_n_checkpoints": 5,
|
15 |
-
"save_checkpoints": true,
|
16 |
-
"save_all_best": false,
|
17 |
-
"save_best_after": 10000,
|
18 |
-
"target_loss": null,
|
19 |
-
"print_eval": false,
|
20 |
-
"test_delay_epochs": -1,
|
21 |
-
"run_eval": true,
|
22 |
-
"run_eval_steps": null,
|
23 |
-
"distributed_backend": "nccl",
|
24 |
-
"distributed_url": "tcp://localhost:54321",
|
25 |
-
"mixed_precision": true,
|
26 |
-
"epochs": 1500,
|
27 |
-
"batch_size": 64,
|
28 |
-
"eval_batch_size": 16,
|
29 |
-
"grad_clip": [
|
30 |
-
1000,
|
31 |
-
1000
|
32 |
-
],
|
33 |
-
"scheduler_after_epoch": true,
|
34 |
-
"lr": 0.001,
|
35 |
-
"optimizer": "AdamW",
|
36 |
-
"optimizer_params": {
|
37 |
-
"betas": [
|
38 |
-
0.8,
|
39 |
-
0.99
|
40 |
-
],
|
41 |
-
"eps": 1e-09,
|
42 |
-
"weight_decay": 0.01
|
43 |
-
},
|
44 |
-
"lr_scheduler": "",
|
45 |
-
"lr_scheduler_params": {},
|
46 |
-
"use_grad_scaler": false,
|
47 |
-
"cudnn_enable": true,
|
48 |
-
"cudnn_deterministic": false,
|
49 |
-
"cudnn_benchmark": false,
|
50 |
-
"training_seed": 54321,
|
51 |
-
"model": "vits",
|
52 |
-
"num_loader_workers": 12,
|
53 |
-
"num_eval_loader_workers": 12,
|
54 |
-
"use_noise_augment": false,
|
55 |
-
"audio": {
|
56 |
-
"fft_size": 1024,
|
57 |
-
"win_length": 1024,
|
58 |
-
"hop_length": 256,
|
59 |
-
"frame_shift_ms": null,
|
60 |
-
"frame_length_ms": null,
|
61 |
-
"stft_pad_mode": "reflect",
|
62 |
-
"sample_rate": 22050,
|
63 |
-
"resample": false,
|
64 |
-
"preemphasis": 0,
|
65 |
-
"ref_level_db": 35,
|
66 |
-
"do_sound_norm": true,
|
67 |
-
"log_func": "np.log",
|
68 |
-
"do_trim_silence": false,
|
69 |
-
"trim_db": 35,
|
70 |
-
"do_rms_norm": false,
|
71 |
-
"db_level": -24,
|
72 |
-
"power": 1.1,
|
73 |
-
"griffin_lim_iters": 60,
|
74 |
-
"num_mels": 80,
|
75 |
-
"mel_fmin": 0,
|
76 |
-
"mel_fmax": null,
|
77 |
-
"spec_gain": 6.0,
|
78 |
-
"do_amp_to_db_linear": true,
|
79 |
-
"do_amp_to_db_mel": true,
|
80 |
-
"pitch_fmax": 640.0,
|
81 |
-
"pitch_fmin": 0.0,
|
82 |
-
"signal_norm": true,
|
83 |
-
"min_level_db": -100,
|
84 |
-
"symmetric_norm": true,
|
85 |
-
"max_norm": 1.0,
|
86 |
-
"clip_norm": true,
|
87 |
-
"stats_path": null
|
88 |
-
},
|
89 |
-
"use_phonemes": false,
|
90 |
-
"phonemizer": null,
|
91 |
-
"phoneme_language": "uk",
|
92 |
-
"compute_input_seq_cache": false,
|
93 |
-
"text_cleaner": "basic_cleaners",
|
94 |
-
"enable_eos_bos_chars": false,
|
95 |
-
"test_sentences_file": "",
|
96 |
-
"phoneme_cache_path": "/home/robinhad/Projects/TTS/phoneme_cache",
|
97 |
-
"characters": {
|
98 |
-
"characters_class": "TTS.tts.models.vits.VitsCharacters",
|
99 |
-
"vocab_dict": null,
|
100 |
-
"pad": "<PAD>",
|
101 |
-
"eos": "<EOS>",
|
102 |
-
"bos": "<BOS>",
|
103 |
-
"blank": "<BLNK>",
|
104 |
-
"characters": "!\"'(),-/:;.?\u00ab\u00bb+\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0454\u0456\u0457\u0491\u2013\u2014\u2015\u2018\u2019\u201c\u201d\u201e\u2026 ",
|
105 |
-
"punctuations": "!\"'(),-/:;?\u00ab\u00bb+\u2013\u2014\u2015\u2018\u2019\u201c\u201d\u201e\u2026 ",
|
106 |
-
"phonemes": null,
|
107 |
-
"is_unique": true,
|
108 |
-
"is_sorted": true
|
109 |
-
},
|
110 |
-
"add_blank": true,
|
111 |
-
"batch_group_size": 0,
|
112 |
-
"loss_masking": null,
|
113 |
-
"sort_by_audio_len": true,
|
114 |
-
"min_audio_len": 32768,
|
115 |
-
"max_audio_len": 264600,
|
116 |
-
"min_text_len": 1,
|
117 |
-
"max_text_len": Infinity,
|
118 |
-
"compute_f0": false,
|
119 |
-
"compute_linear_spec": true,
|
120 |
-
"precompute_num_workers": 16,
|
121 |
-
"start_by_longest": false,
|
122 |
-
"datasets": [
|
123 |
-
{
|
124 |
-
"name": "mailabs",
|
125 |
-
"path": "/home/robinhad/Data/Audio/ukr-tts-dataset-mai",
|
126 |
-
"meta_file_train": "",
|
127 |
-
"ignored_speakers": null,
|
128 |
-
"language": "",
|
129 |
-
"meta_file_val": "",
|
130 |
-
"meta_file_attn_mask": ""
|
131 |
-
}
|
132 |
-
],
|
133 |
-
"test_sentences": [
|
134 |
-
[
|
135 |
-
"\u0414+\u0435\u0441\u044f\u0442\u044c \u0440\u0430\u0437+\u0456\u0432 \u0432\u0456\u0434\u043c+\u0456\u0440\u044f\u0439, +\u0430 \u0440+\u0430\u0437 - \u0432\u0456\u0434\u0440+\u0456\u0436.",
|
136 |
-
"olena",
|
137 |
-
null,
|
138 |
-
null
|
139 |
-
],
|
140 |
-
[
|
141 |
-
"\u0413\u043e\u0432\u043e\u0440+\u0438, \u043d+\u0456\u0431\u0438 \u0442+\u0438 \u0436\u0438\u0432+\u0438\u0439!"
|
142 |
-
],
|
143 |
-
[
|
144 |
-
"\u041f\u0435\u0440\u0435\u043f\u0440+\u043e\u0448\u0443\u044e, \u0414+\u0435\u0439\u0432\u0435, \u043d+\u0430 \u0436+\u0430\u043b\u044c, +\u044f \u043d+\u0435 \u043c+\u043e\u0436\u0443 \u0437\u0440\u043e\u0431+\u0438\u0442\u0438 \u0446+\u0435.",
|
145 |
-
"lada",
|
146 |
-
null,
|
147 |
-
null
|
148 |
-
],
|
149 |
-
[
|
150 |
-
"\u041f\u0435\u0440\u0435\u043f\u0440+\u043e\u0448\u0443\u044e, \u0414+\u0435\u0439\u0432\u0435, \u043d+\u0430 \u0436+\u0430\u043b\u044c, +\u044f \u043d+\u0435 \u043c+\u043e\u0436\u0443 \u0437\u0440\u043e\u0431+\u0438\u0442\u0438 \u0446+\u0435.",
|
151 |
-
"mykyta",
|
152 |
-
null,
|
153 |
-
null
|
154 |
-
],
|
155 |
-
[
|
156 |
-
"\u0425\u0442+\u043e \u0442+\u0438 \u0442\u0430\u043a+\u0438\u0439 +\u0456 +\u044f\u043a \u0442\u0435\u0431+\u0435 \u0437\u0432+\u0430\u0442\u0438?",
|
157 |
-
"mykyta",
|
158 |
-
null,
|
159 |
-
null
|
160 |
-
],
|
161 |
-
[
|
162 |
-
"\u0425\u0442+\u043e \u0442+\u0438 \u0442\u0430\u043a+\u0438\u0439 +\u0456 +\u044f\u043a \u0442\u0435\u0431+\u0435 \u0437\u0432+\u0430\u0442\u0438?",
|
163 |
-
"dmytro",
|
164 |
-
null,
|
165 |
-
null
|
166 |
-
],
|
167 |
-
[
|
168 |
-
"\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
|
169 |
-
"lada",
|
170 |
-
null,
|
171 |
-
null
|
172 |
-
],
|
173 |
-
[
|
174 |
-
"\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
|
175 |
-
"dmytro",
|
176 |
-
null,
|
177 |
-
null
|
178 |
-
],
|
179 |
-
[
|
180 |
-
"\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
|
181 |
-
"olga",
|
182 |
-
null,
|
183 |
-
null
|
184 |
-
]
|
185 |
-
],
|
186 |
-
"eval_split_max_size": null,
|
187 |
-
"eval_split_size": 0.01,
|
188 |
-
"use_speaker_weighted_sampler": false,
|
189 |
-
"speaker_weighted_sampler_alpha": 1.0,
|
190 |
-
"use_language_weighted_sampler": false,
|
191 |
-
"language_weighted_sampler_alpha": 1.0,
|
192 |
-
"use_length_weighted_sampler": false,
|
193 |
-
"length_weighted_sampler_alpha": 1.0,
|
194 |
-
"model_args": {
|
195 |
-
"num_chars": 84,
|
196 |
-
"out_channels": 513,
|
197 |
-
"spec_segment_size": 32,
|
198 |
-
"hidden_channels": 192,
|
199 |
-
"hidden_channels_ffn_text_encoder": 768,
|
200 |
-
"num_heads_text_encoder": 2,
|
201 |
-
"num_layers_text_encoder": 6,
|
202 |
-
"kernel_size_text_encoder": 3,
|
203 |
-
"dropout_p_text_encoder": 0.1,
|
204 |
-
"dropout_p_duration_predictor": 0.5,
|
205 |
-
"kernel_size_posterior_encoder": 5,
|
206 |
-
"dilation_rate_posterior_encoder": 1,
|
207 |
-
"num_layers_posterior_encoder": 16,
|
208 |
-
"kernel_size_flow": 5,
|
209 |
-
"dilation_rate_flow": 1,
|
210 |
-
"num_layers_flow": 4,
|
211 |
-
"resblock_type_decoder": "1",
|
212 |
-
"resblock_kernel_sizes_decoder": [
|
213 |
-
3,
|
214 |
-
7,
|
215 |
-
11
|
216 |
-
],
|
217 |
-
"resblock_dilation_sizes_decoder": [
|
218 |
-
[
|
219 |
-
1,
|
220 |
-
3,
|
221 |
-
5
|
222 |
-
],
|
223 |
-
[
|
224 |
-
1,
|
225 |
-
3,
|
226 |
-
5
|
227 |
-
],
|
228 |
-
[
|
229 |
-
1,
|
230 |
-
3,
|
231 |
-
5
|
232 |
-
]
|
233 |
-
],
|
234 |
-
"upsample_rates_decoder": [
|
235 |
-
8,
|
236 |
-
8,
|
237 |
-
2,
|
238 |
-
2
|
239 |
-
],
|
240 |
-
"upsample_initial_channel_decoder": 512,
|
241 |
-
"upsample_kernel_sizes_decoder": [
|
242 |
-
16,
|
243 |
-
16,
|
244 |
-
4,
|
245 |
-
4
|
246 |
-
],
|
247 |
-
"periods_multi_period_discriminator": [
|
248 |
-
2,
|
249 |
-
3,
|
250 |
-
5,
|
251 |
-
7,
|
252 |
-
11
|
253 |
-
],
|
254 |
-
"use_sdp": true,
|
255 |
-
"noise_scale": 1.0,
|
256 |
-
"inference_noise_scale": 0.667,
|
257 |
-
"length_scale": 1,
|
258 |
-
"noise_scale_dp": 1.0,
|
259 |
-
"inference_noise_scale_dp": 1.0,
|
260 |
-
"max_inference_len": null,
|
261 |
-
"init_discriminator": true,
|
262 |
-
"use_spectral_norm_disriminator": false,
|
263 |
-
"use_speaker_embedding": true,
|
264 |
-
"num_speakers": 5,
|
265 |
-
"speakers_file": "speakers.pth",
|
266 |
-
"d_vector_file": null,
|
267 |
-
"speaker_embedding_channels": 256,
|
268 |
-
"use_d_vector_file": false,
|
269 |
-
"d_vector_dim": 0,
|
270 |
-
"detach_dp_input": true,
|
271 |
-
"use_language_embedding": false,
|
272 |
-
"embedded_language_dim": 4,
|
273 |
-
"num_languages": 0,
|
274 |
-
"language_ids_file": null,
|
275 |
-
"use_speaker_encoder_as_loss": false,
|
276 |
-
"speaker_encoder_config_path": "",
|
277 |
-
"speaker_encoder_model_path": "",
|
278 |
-
"condition_dp_on_speaker": true,
|
279 |
-
"freeze_encoder": false,
|
280 |
-
"freeze_DP": false,
|
281 |
-
"freeze_PE": false,
|
282 |
-
"freeze_flow_decoder": false,
|
283 |
-
"freeze_waveform_decoder": false,
|
284 |
-
"encoder_sample_rate": null,
|
285 |
-
"interpolate_z": true,
|
286 |
-
"reinit_DP": false,
|
287 |
-
"reinit_text_encoder": false
|
288 |
-
},
|
289 |
-
"lr_gen": 0.0002,
|
290 |
-
"lr_disc": 0.0002,
|
291 |
-
"lr_scheduler_gen": "ExponentialLR",
|
292 |
-
"lr_scheduler_gen_params": {
|
293 |
-
"gamma": 0.999875,
|
294 |
-
"last_epoch": -1
|
295 |
-
},
|
296 |
-
"lr_scheduler_disc": "ExponentialLR",
|
297 |
-
"lr_scheduler_disc_params": {
|
298 |
-
"gamma": 0.999875,
|
299 |
-
"last_epoch": -1
|
300 |
-
},
|
301 |
-
"kl_loss_alpha": 1.0,
|
302 |
-
"disc_loss_alpha": 1.0,
|
303 |
-
"gen_loss_alpha": 1.0,
|
304 |
-
"feat_loss_alpha": 1.0,
|
305 |
-
"mel_loss_alpha": 45.0,
|
306 |
-
"dur_loss_alpha": 1.0,
|
307 |
-
"speaker_encoder_loss_alpha": 1.0,
|
308 |
-
"return_wav": true,
|
309 |
-
"r": 1,
|
310 |
-
"num_speakers": 0,
|
311 |
-
"use_speaker_embedding": true,
|
312 |
-
"speakers_file": "speakers.pth",
|
313 |
-
"speaker_embedding_channels": 256,
|
314 |
-
"language_ids_file": null,
|
315 |
-
"use_language_embedding": false,
|
316 |
-
"use_d_vector_file": false,
|
317 |
-
"d_vector_file": null,
|
318 |
-
"d_vector_dim": 0
|
319 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
config.yaml
ADDED
@@ -0,0 +1,371 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
config: ./conf/tuning/train_vits.yaml
|
2 |
+
print_config: false
|
3 |
+
log_level: INFO
|
4 |
+
dry_run: false
|
5 |
+
iterator_type: sequence
|
6 |
+
output_dir: /mnt/tts-storage/exp/22k/tts_train_vits_raw_char
|
7 |
+
ngpu: 1
|
8 |
+
seed: 3407
|
9 |
+
num_workers: 1
|
10 |
+
num_att_plot: 3
|
11 |
+
dist_backend: nccl
|
12 |
+
dist_init_method: env://
|
13 |
+
dist_world_size: null
|
14 |
+
dist_rank: null
|
15 |
+
local_rank: 0
|
16 |
+
dist_master_addr: null
|
17 |
+
dist_master_port: null
|
18 |
+
dist_launcher: null
|
19 |
+
multiprocessing_distributed: false
|
20 |
+
unused_parameters: true
|
21 |
+
sharded_ddp: false
|
22 |
+
cudnn_enabled: true
|
23 |
+
cudnn_benchmark: false
|
24 |
+
cudnn_deterministic: false
|
25 |
+
collect_stats: false
|
26 |
+
write_collected_feats: false
|
27 |
+
max_epoch: 300
|
28 |
+
patience: null
|
29 |
+
val_scheduler_criterion:
|
30 |
+
- valid
|
31 |
+
- loss
|
32 |
+
early_stopping_criterion:
|
33 |
+
- valid
|
34 |
+
- loss
|
35 |
+
- min
|
36 |
+
best_model_criterion:
|
37 |
+
- - train
|
38 |
+
- total_count
|
39 |
+
- max
|
40 |
+
keep_nbest_models: 10
|
41 |
+
nbest_averaging_interval: 0
|
42 |
+
grad_clip: -1
|
43 |
+
grad_clip_type: 2.0
|
44 |
+
grad_noise: false
|
45 |
+
accum_grad: 1
|
46 |
+
no_forward_run: false
|
47 |
+
resume: true
|
48 |
+
train_dtype: float32
|
49 |
+
use_amp: false
|
50 |
+
log_interval: 50
|
51 |
+
use_matplotlib: true
|
52 |
+
use_tensorboard: true
|
53 |
+
create_graph_in_tensorboard: false
|
54 |
+
use_wandb: false
|
55 |
+
wandb_project: null
|
56 |
+
wandb_id: null
|
57 |
+
wandb_entity: null
|
58 |
+
wandb_name: null
|
59 |
+
wandb_model_log_interval: -1
|
60 |
+
detect_anomaly: false
|
61 |
+
pretrain_path: null
|
62 |
+
init_param: []
|
63 |
+
ignore_init_mismatch: false
|
64 |
+
freeze_param: []
|
65 |
+
num_iters_per_epoch: null
|
66 |
+
batch_size: 20
|
67 |
+
valid_batch_size: null
|
68 |
+
batch_bins: 1900000
|
69 |
+
valid_batch_bins: null
|
70 |
+
train_shape_file:
|
71 |
+
- /mnt/tts-storage/exp/22k/tts_stats_raw_linear_spectrogram_char/train/text_shape.char
|
72 |
+
- /mnt/tts-storage/exp/22k/tts_stats_raw_linear_spectrogram_char/train/speech_shape
|
73 |
+
valid_shape_file:
|
74 |
+
- /mnt/tts-storage/exp/22k/tts_stats_raw_linear_spectrogram_char/valid/text_shape.char
|
75 |
+
- /mnt/tts-storage/exp/22k/tts_stats_raw_linear_spectrogram_char/valid/speech_shape
|
76 |
+
batch_type: numel
|
77 |
+
valid_batch_type: null
|
78 |
+
fold_length:
|
79 |
+
- 150
|
80 |
+
- 204800
|
81 |
+
sort_in_batch: descending
|
82 |
+
sort_batch: descending
|
83 |
+
multiple_iterator: false
|
84 |
+
chunk_length: 500
|
85 |
+
chunk_shift_ratio: 0.5
|
86 |
+
num_cache_chunks: 1024
|
87 |
+
train_data_path_and_name_and_type:
|
88 |
+
- - dump/22k/raw/tr_no_dev/text
|
89 |
+
- text
|
90 |
+
- text
|
91 |
+
- - dump/22k/raw/tr_no_dev/wav.scp
|
92 |
+
- speech
|
93 |
+
- sound
|
94 |
+
- - dump/22k/raw/tr_no_dev/utt2sid
|
95 |
+
- sids
|
96 |
+
- text_int
|
97 |
+
valid_data_path_and_name_and_type:
|
98 |
+
- - dump/22k/raw/dev/text
|
99 |
+
- text
|
100 |
+
- text
|
101 |
+
- - dump/22k/raw/dev/wav.scp
|
102 |
+
- speech
|
103 |
+
- sound
|
104 |
+
- - dump/22k/raw/dev/utt2sid
|
105 |
+
- sids
|
106 |
+
- text_int
|
107 |
+
allow_variable_data_keys: false
|
108 |
+
max_cache_size: 0.0
|
109 |
+
max_cache_fd: 32
|
110 |
+
valid_max_cache_size: null
|
111 |
+
optim: adamw
|
112 |
+
optim_conf:
|
113 |
+
lr: 0.0002
|
114 |
+
betas:
|
115 |
+
- 0.8
|
116 |
+
- 0.99
|
117 |
+
eps: 1.0e-09
|
118 |
+
weight_decay: 0.0
|
119 |
+
scheduler: exponentiallr
|
120 |
+
scheduler_conf:
|
121 |
+
gamma: 0.999875
|
122 |
+
optim2: adamw
|
123 |
+
optim2_conf:
|
124 |
+
lr: 0.0002
|
125 |
+
betas:
|
126 |
+
- 0.8
|
127 |
+
- 0.99
|
128 |
+
eps: 1.0e-09
|
129 |
+
weight_decay: 0.0
|
130 |
+
scheduler2: exponentiallr
|
131 |
+
scheduler2_conf:
|
132 |
+
gamma: 0.999875
|
133 |
+
generator_first: false
|
134 |
+
token_list:
|
135 |
+
- <blank>
|
136 |
+
- <unk>
|
137 |
+
- +
|
138 |
+
- <space>
|
139 |
+
- ะพ
|
140 |
+
- ะฐ
|
141 |
+
- ะธ
|
142 |
+
- ะฝ
|
143 |
+
- ะฒ
|
144 |
+
- ั
|
145 |
+
- ั
|
146 |
+
- ะต
|
147 |
+
- ั
|
148 |
+
- ั
|
149 |
+
- ะป
|
150 |
+
- ั
|
151 |
+
- ะด
|
152 |
+
- ะบ
|
153 |
+
- ะผ
|
154 |
+
- ะฟ
|
155 |
+
- ั
|
156 |
+
- ะท
|
157 |
+
- ','
|
158 |
+
- ะฑ
|
159 |
+
- ั
|
160 |
+
- ะณ
|
161 |
+
- ั
|
162 |
+
- ะน
|
163 |
+
- ั
|
164 |
+
- ะถ
|
165 |
+
- ั
|
166 |
+
- ั
|
167 |
+
- ั
|
168 |
+
- ั
|
169 |
+
- โ
|
170 |
+
- ั
|
171 |
+
- ั
|
172 |
+
- '?'
|
173 |
+
- .
|
174 |
+
- ั
|
175 |
+
- ยซ
|
176 |
+
- ยป
|
177 |
+
- '!'
|
178 |
+
- ''''
|
179 |
+
- ':'
|
180 |
+
- โฆ
|
181 |
+
- '-'
|
182 |
+
- า
|
183 |
+
- โ
|
184 |
+
- โ
|
185 |
+
- '"'
|
186 |
+
- ;
|
187 |
+
- โ
|
188 |
+
- โ
|
189 |
+
- <sos/eos>
|
190 |
+
odim: null
|
191 |
+
model_conf: {}
|
192 |
+
use_preprocessor: true
|
193 |
+
token_type: char
|
194 |
+
bpemodel: null
|
195 |
+
non_linguistic_symbols: null
|
196 |
+
cleaner: null
|
197 |
+
g2p: g2p_en_no_space
|
198 |
+
feats_extract: linear_spectrogram
|
199 |
+
feats_extract_conf:
|
200 |
+
n_fft: 1024
|
201 |
+
hop_length: 256
|
202 |
+
win_length: null
|
203 |
+
normalize: null
|
204 |
+
normalize_conf: {}
|
205 |
+
tts: vits
|
206 |
+
tts_conf:
|
207 |
+
generator_type: vits_generator
|
208 |
+
generator_params:
|
209 |
+
hidden_channels: 192
|
210 |
+
spks: 128
|
211 |
+
global_channels: 256
|
212 |
+
segment_size: 32
|
213 |
+
text_encoder_attention_heads: 2
|
214 |
+
text_encoder_ffn_expand: 4
|
215 |
+
text_encoder_blocks: 6
|
216 |
+
text_encoder_positionwise_layer_type: conv1d
|
217 |
+
text_encoder_positionwise_conv_kernel_size: 3
|
218 |
+
text_encoder_positional_encoding_layer_type: rel_pos
|
219 |
+
text_encoder_self_attention_layer_type: rel_selfattn
|
220 |
+
text_encoder_activation_type: swish
|
221 |
+
text_encoder_normalize_before: true
|
222 |
+
text_encoder_dropout_rate: 0.1
|
223 |
+
text_encoder_positional_dropout_rate: 0.0
|
224 |
+
text_encoder_attention_dropout_rate: 0.1
|
225 |
+
use_macaron_style_in_text_encoder: true
|
226 |
+
use_conformer_conv_in_text_encoder: false
|
227 |
+
text_encoder_conformer_kernel_size: -1
|
228 |
+
decoder_kernel_size: 7
|
229 |
+
decoder_channels: 512
|
230 |
+
decoder_upsample_scales:
|
231 |
+
- 8
|
232 |
+
- 8
|
233 |
+
- 2
|
234 |
+
- 2
|
235 |
+
decoder_upsample_kernel_sizes:
|
236 |
+
- 16
|
237 |
+
- 16
|
238 |
+
- 4
|
239 |
+
- 4
|
240 |
+
decoder_resblock_kernel_sizes:
|
241 |
+
- 3
|
242 |
+
- 7
|
243 |
+
- 11
|
244 |
+
decoder_resblock_dilations:
|
245 |
+
- - 1
|
246 |
+
- 3
|
247 |
+
- 5
|
248 |
+
- - 1
|
249 |
+
- 3
|
250 |
+
- 5
|
251 |
+
- - 1
|
252 |
+
- 3
|
253 |
+
- 5
|
254 |
+
use_weight_norm_in_decoder: true
|
255 |
+
posterior_encoder_kernel_size: 5
|
256 |
+
posterior_encoder_layers: 16
|
257 |
+
posterior_encoder_stacks: 1
|
258 |
+
posterior_encoder_base_dilation: 1
|
259 |
+
posterior_encoder_dropout_rate: 0.0
|
260 |
+
use_weight_norm_in_posterior_encoder: true
|
261 |
+
flow_flows: 4
|
262 |
+
flow_kernel_size: 5
|
263 |
+
flow_base_dilation: 1
|
264 |
+
flow_layers: 4
|
265 |
+
flow_dropout_rate: 0.0
|
266 |
+
use_weight_norm_in_flow: true
|
267 |
+
use_only_mean_in_flow: true
|
268 |
+
stochastic_duration_predictor_kernel_size: 3
|
269 |
+
stochastic_duration_predictor_dropout_rate: 0.5
|
270 |
+
stochastic_duration_predictor_flows: 4
|
271 |
+
stochastic_duration_predictor_dds_conv_layers: 3
|
272 |
+
vocabs: 55
|
273 |
+
aux_channels: 513
|
274 |
+
discriminator_type: hifigan_multi_scale_multi_period_discriminator
|
275 |
+
discriminator_params:
|
276 |
+
scales: 1
|
277 |
+
scale_downsample_pooling: AvgPool1d
|
278 |
+
scale_downsample_pooling_params:
|
279 |
+
kernel_size: 4
|
280 |
+
stride: 2
|
281 |
+
padding: 2
|
282 |
+
scale_discriminator_params:
|
283 |
+
in_channels: 1
|
284 |
+
out_channels: 1
|
285 |
+
kernel_sizes:
|
286 |
+
- 15
|
287 |
+
- 41
|
288 |
+
- 5
|
289 |
+
- 3
|
290 |
+
channels: 128
|
291 |
+
max_downsample_channels: 1024
|
292 |
+
max_groups: 16
|
293 |
+
bias: true
|
294 |
+
downsample_scales:
|
295 |
+
- 2
|
296 |
+
- 2
|
297 |
+
- 4
|
298 |
+
- 4
|
299 |
+
- 1
|
300 |
+
nonlinear_activation: LeakyReLU
|
301 |
+
nonlinear_activation_params:
|
302 |
+
negative_slope: 0.1
|
303 |
+
use_weight_norm: true
|
304 |
+
use_spectral_norm: false
|
305 |
+
follow_official_norm: false
|
306 |
+
periods:
|
307 |
+
- 2
|
308 |
+
- 3
|
309 |
+
- 5
|
310 |
+
- 7
|
311 |
+
- 11
|
312 |
+
period_discriminator_params:
|
313 |
+
in_channels: 1
|
314 |
+
out_channels: 1
|
315 |
+
kernel_sizes:
|
316 |
+
- 5
|
317 |
+
- 3
|
318 |
+
channels: 32
|
319 |
+
downsample_scales:
|
320 |
+
- 3
|
321 |
+
- 3
|
322 |
+
- 3
|
323 |
+
- 3
|
324 |
+
- 1
|
325 |
+
max_downsample_channels: 1024
|
326 |
+
bias: true
|
327 |
+
nonlinear_activation: LeakyReLU
|
328 |
+
nonlinear_activation_params:
|
329 |
+
negative_slope: 0.1
|
330 |
+
use_weight_norm: true
|
331 |
+
use_spectral_norm: false
|
332 |
+
generator_adv_loss_params:
|
333 |
+
average_by_discriminators: false
|
334 |
+
loss_type: mse
|
335 |
+
discriminator_adv_loss_params:
|
336 |
+
average_by_discriminators: false
|
337 |
+
loss_type: mse
|
338 |
+
feat_match_loss_params:
|
339 |
+
average_by_discriminators: false
|
340 |
+
average_by_layers: false
|
341 |
+
include_final_outputs: true
|
342 |
+
mel_loss_params:
|
343 |
+
fs: 22050
|
344 |
+
n_fft: 1024
|
345 |
+
hop_length: 256
|
346 |
+
win_length: null
|
347 |
+
window: hann
|
348 |
+
n_mels: 80
|
349 |
+
fmin: 0
|
350 |
+
fmax: null
|
351 |
+
log_base: null
|
352 |
+
lambda_adv: 1.0
|
353 |
+
lambda_mel: 45.0
|
354 |
+
lambda_feat_match: 2.0
|
355 |
+
lambda_dur: 1.0
|
356 |
+
lambda_kl: 1.0
|
357 |
+
sampling_rate: 22050
|
358 |
+
cache_generator_outputs: true
|
359 |
+
pitch_extract: null
|
360 |
+
pitch_extract_conf: {}
|
361 |
+
pitch_normalize: null
|
362 |
+
pitch_normalize_conf: {}
|
363 |
+
energy_extract: null
|
364 |
+
energy_extract_conf: {}
|
365 |
+
energy_normalize: null
|
366 |
+
energy_normalize_conf: {}
|
367 |
+
required:
|
368 |
+
- output_dir
|
369 |
+
- token_list
|
370 |
+
version: '202209'
|
371 |
+
distributed: false
|
requirements.txt
CHANGED
@@ -1,6 +1,9 @@
|
|
1 |
# requirements for HuggingFace demo. Installs local package.
|
2 |
-
torch
|
3 |
-
|
|
|
|
|
4 |
ukrainian-word-stress==1.0.1
|
5 |
git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c
|
6 |
-
gradio==3.
|
|
|
|
1 |
# requirements for HuggingFace demo. Installs local package.
|
2 |
+
torch
|
3 |
+
--extra-index-url https://download.pytorch.org/whl/cpu
|
4 |
+
espnet==202209
|
5 |
+
num2words==0.5.12
|
6 |
ukrainian-word-stress==1.0.1
|
7 |
git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c
|
8 |
+
gradio==3.12
|
9 |
+
huggingface_hub==0.11.1
|
setup.py
CHANGED
@@ -3,8 +3,8 @@ from setuptools import setup, find_packages
|
|
3 |
|
4 |
setup(
|
5 |
name="ukrainian-tts",
|
6 |
-
version="
|
7 |
-
description="Ukrainian TTS using
|
8 |
author="Yurii Paniv",
|
9 |
author_email="[email protected]",
|
10 |
url="https://github.com/robinhad/ukrainian-tts",
|
@@ -12,8 +12,8 @@ setup(
|
|
12 |
packages=find_packages(),
|
13 |
python_requires=">3.6.0",
|
14 |
install_requires=[
|
15 |
-
"
|
16 |
-
"
|
17 |
"ukrainian-word-stress==1.0.1",
|
18 |
"ukrainian_accentor @ git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c",
|
19 |
],
|
|
|
3 |
|
4 |
setup(
|
5 |
name="ukrainian-tts",
|
6 |
+
version="4.0",
|
7 |
+
description="Ukrainian TTS using ESPNET",
|
8 |
author="Yurii Paniv",
|
9 |
author_email="[email protected]",
|
10 |
url="https://github.com/robinhad/ukrainian-tts",
|
|
|
12 |
packages=find_packages(),
|
13 |
python_requires=">3.6.0",
|
14 |
install_requires=[
|
15 |
+
"espnet==202209",
|
16 |
+
"num2words==0.5.12",
|
17 |
"ukrainian-word-stress==1.0.1",
|
18 |
"ukrainian_accentor @ git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c",
|
19 |
],
|
tests/test_formatter.py
CHANGED
@@ -3,8 +3,8 @@ from ukrainian_tts.formatter import preprocess_text
|
|
3 |
|
4 |
def test_formatter():
|
5 |
examples = [
|
6 |
-
("Quality of life update", "
|
7 |
-
("Він украв 20000000 $", "
|
8 |
(
|
9 |
"111 000 000 000 доларів державного боргу.",
|
10 |
"сто одинадцять мільярдів доларів державного боргу.",
|
|
|
3 |
|
4 |
def test_formatter():
|
5 |
examples = [
|
6 |
+
("Quality of life update", "квюаліті оф ліфе юпдате"),
|
7 |
+
("Він украв 20000000 $", "він украв двадцять мільйонів долар"),
|
8 |
(
|
9 |
"111 000 000 000 доларів державного боргу.",
|
10 |
"сто одинадцять мільярдів доларів державного боргу.",
|
tests/test_tts.py
CHANGED
@@ -3,9 +3,9 @@ from io import BytesIO
|
|
3 |
|
4 |
|
5 |
def test_tts():
|
6 |
-
tts = TTS(
|
7 |
file = BytesIO()
|
8 |
_, text = tts.tts("Привіт", Voices.Dmytro.value, Stress.Dictionary.value, file)
|
9 |
file.seek(0)
|
10 |
-
assert text == "
|
11 |
assert file.getbuffer().nbytes > 1000 # check that file was generated
|
|
|
3 |
|
4 |
|
5 |
def test_tts():
|
6 |
+
tts = TTS()
|
7 |
file = BytesIO()
|
8 |
_, text = tts.tts("Привіт", Voices.Dmytro.value, Stress.Dictionary.value, file)
|
9 |
file.seek(0)
|
10 |
+
assert text == "прив+іт"
|
11 |
assert file.getbuffer().nbytes > 1000 # check that file was generated
|
ukrainian_tts/formatter.py
CHANGED
@@ -76,4 +76,5 @@ def preprocess_text(text, use_autostress_model=False):
|
|
76 |
text = text.replace(english_char.upper(), english[english_char].upper())
|
77 |
text = text.replace(english_char, english[english_char])
|
78 |
|
|
|
79 |
return text
|
|
|
76 |
text = text.replace(english_char.upper(), english[english_char].upper())
|
77 |
text = text.replace(english_char, english[english_char])
|
78 |
|
79 |
+
text = text.lower()
|
80 |
return text
|
ukrainian_tts/tts.py
CHANGED
@@ -1,21 +1,24 @@
|
|
1 |
from io import BytesIO
|
2 |
import requests
|
3 |
from os.path import exists, join
|
4 |
-
from
|
5 |
from enum import Enum
|
6 |
from .formatter import preprocess_text
|
7 |
from .stress import sentence_to_stress, stress_dict, stress_with_model
|
8 |
from torch import no_grad
|
|
|
|
|
|
|
9 |
|
10 |
|
11 |
class Voices(Enum):
|
12 |
"""List of available voices for the model."""
|
13 |
|
14 |
-
Olena =
|
15 |
-
Mykyta =
|
16 |
-
Lada =
|
17 |
-
Dmytro =
|
18 |
-
Olga =
|
19 |
|
20 |
|
21 |
class Stress(Enum):
|
@@ -30,14 +33,15 @@ class Stress(Enum):
|
|
30 |
class TTS:
|
31 |
""" """
|
32 |
|
33 |
-
def __init__(self, cache_folder=None,
|
34 |
"""
|
35 |
Class to setup a text-to-speech engine, from download to model creation. \n
|
36 |
Downloads or uses files from `cache_folder` directory. \n
|
37 |
By default stores in current directory."""
|
38 |
-
self.
|
|
|
39 |
|
40 |
-
def tts(self, text: str, voice:
|
41 |
"""
|
42 |
Run a Text-to-Speech engine and output to `output_fp` BytesIO-like object.
|
43 |
- `text` - your model input text.
|
@@ -63,39 +67,50 @@ class TTS:
|
|
63 |
text = preprocess_text(text, stress)
|
64 |
text = sentence_to_stress(text, stress_with_model if stress else stress_dict)
|
65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
with no_grad():
|
67 |
-
|
68 |
-
self.synthesizer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
|
70 |
output_fp.seek(0)
|
71 |
|
72 |
return output_fp, text
|
73 |
|
74 |
-
def __setup_cache(self, cache_folder=None
|
75 |
"""Downloads models and stores them into `cache_folder`. By default stores in current directory."""
|
76 |
print("downloading uk/mykyta/vits-tts")
|
77 |
-
release_number = "
|
78 |
-
model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model
|
79 |
-
config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.
|
80 |
-
speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/speakers.pth"
|
81 |
|
82 |
if cache_folder is None:
|
83 |
cache_folder = "."
|
84 |
|
85 |
model_path = join(cache_folder, "model.pth")
|
86 |
-
config_path = join(cache_folder, "config.
|
87 |
-
speakers_path = join(cache_folder, "speakers.pth")
|
88 |
|
89 |
self.__download(model_link, model_path)
|
90 |
self.__download(config_link, config_path)
|
91 |
-
self.__download(speakers_link, speakers_path)
|
92 |
-
|
93 |
-
self.synthesizer = Synthesizer(
|
94 |
-
model_path, config_path, speakers_path, None, None, use_cuda=use_cuda
|
95 |
-
)
|
96 |
-
|
97 |
-
if self.synthesizer is None:
|
98 |
-
raise NameError("Model not found")
|
99 |
|
100 |
def __download(self, url, file_name):
|
101 |
"""Downloads file from `url` into local `file_name` file."""
|
|
|
1 |
from io import BytesIO
|
2 |
import requests
|
3 |
from os.path import exists, join
|
4 |
+
from espnet2.bin.tts_inference import Text2Speech
|
5 |
from enum import Enum
|
6 |
from .formatter import preprocess_text
|
7 |
from .stress import sentence_to_stress, stress_dict, stress_with_model
|
8 |
from torch import no_grad
|
9 |
+
import numpy as np
|
10 |
+
import time
|
11 |
+
import soundfile as sf
|
12 |
|
13 |
|
14 |
class Voices(Enum):
|
15 |
"""List of available voices for the model."""
|
16 |
|
17 |
+
Olena = 4
|
18 |
+
Mykyta = 3
|
19 |
+
Lada = 2
|
20 |
+
Dmytro = 1
|
21 |
+
Olga = 5
|
22 |
|
23 |
|
24 |
class Stress(Enum):
|
|
|
33 |
class TTS:
|
34 |
""" """
|
35 |
|
36 |
+
def __init__(self, cache_folder=None, device="cpu") -> None:
|
37 |
"""
|
38 |
Class to setup a text-to-speech engine, from download to model creation. \n
|
39 |
Downloads or uses files from `cache_folder` directory. \n
|
40 |
By default stores in current directory."""
|
41 |
+
self.device = device
|
42 |
+
self.__setup_cache(cache_folder)
|
43 |
|
44 |
+
def tts(self, text: str, voice: int, stress: str, output_fp=BytesIO(), speed=1.0):
|
45 |
"""
|
46 |
Run a Text-to-Speech engine and output to `output_fp` BytesIO-like object.
|
47 |
- `text` - your model input text.
|
|
|
67 |
text = preprocess_text(text, stress)
|
68 |
text = sentence_to_stress(text, stress_with_model if stress else stress_dict)
|
69 |
|
70 |
+
self.synthesizer = Text2Speech(
|
71 |
+
train_config="config.yaml",
|
72 |
+
model_file="model.pth",
|
73 |
+
device=self.device,
|
74 |
+
speed_control_alpha=1 / speed,
|
75 |
+
# Only for VITS
|
76 |
+
noise_scale=0.333,
|
77 |
+
noise_scale_dur=0.333,
|
78 |
+
)
|
79 |
+
# synthesis
|
80 |
with no_grad():
|
81 |
+
start = time.time()
|
82 |
+
wav = self.synthesizer(text, sids=np.array(voice))["wav"]
|
83 |
+
|
84 |
+
rtf = (time.time() - start) / (len(wav) / self.synthesizer.fs)
|
85 |
+
print(f"RTF = {rtf:5f}")
|
86 |
+
|
87 |
+
sf.write(
|
88 |
+
output_fp,
|
89 |
+
wav.view(-1).cpu().numpy(),
|
90 |
+
self.synthesizer.fs,
|
91 |
+
"PCM_16",
|
92 |
+
format="wav",
|
93 |
+
)
|
94 |
|
95 |
output_fp.seek(0)
|
96 |
|
97 |
return output_fp, text
|
98 |
|
99 |
+
def __setup_cache(self, cache_folder=None):
|
100 |
"""Downloads models and stores them into `cache_folder`. By default stores in current directory."""
|
101 |
print("downloading uk/mykyta/vits-tts")
|
102 |
+
release_number = "v4.0.0"
|
103 |
+
model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model.pth"
|
104 |
+
config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.yaml"
|
|
|
105 |
|
106 |
if cache_folder is None:
|
107 |
cache_folder = "."
|
108 |
|
109 |
model_path = join(cache_folder, "model.pth")
|
110 |
+
config_path = join(cache_folder, "config.yaml")
|
|
|
111 |
|
112 |
self.__download(model_link, model_path)
|
113 |
self.__download(config_link, config_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
def __download(self, url, file_name):
|
116 |
"""Downloads file from `url` into local `file_name` file."""
|