Yurii Paniv committed on
Commit
a163565
·
1 Parent(s): 7af6095

Add version 3.0.0

Browse files
Files changed (3) hide show
  1. README.md +6 -3
  2. app.py +21 -15
  3. config.json +20 -2
README.md CHANGED
@@ -18,8 +18,10 @@ Link to source code and models -> [https://github.com/robinhad/ukrainian-tts](ht
18
 
19
  Code is licensed under `MIT License`, models are under `GNU GPL v3 License`.
20
  # Support
21
- If you like my work, please support -> [https://send.monobank.ua/jar/48iHq4xAXm](https://send.monobank.ua/jar/48iHq4xAXm)
22
- # Example
 
 
23
 
24
  `Mykyta (male)`:
25
 
@@ -53,7 +55,8 @@ tts-server --model_path path/to/model.pth \
53
  # Attribution 🤝
54
 
55
  - Model training - [Yurii Paniv @robinhad](https://github.com/robinhad)
56
- - Mykyta, Olena and Lada dataset - [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)
 
57
  - Silence cutting using [HMM-GMM](https://github.com/proger/uk) - [Volodymyr Kyrylov @proger](https://github.com/proger)
58
  - Autostress (with dictionary) using [ukrainian-word-stress](https://github.com/lang-uk/ukrainian-word-stress) - [Oleksiy Syvokon @asivokon](https://github.com/asivokon)
59
  - Autostress (with model) using [ukrainian-accentor](https://github.com/egorsmkv/ukrainian-accentor) - [Bohdan Mykhailenko @NeonBohdan](https://github.com/NeonBohdan) + [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)
 
18
 
19
  Code is licensed under `MIT License`, models are under `GNU GPL v3 License`.
20
  # Support
21
+ If you like my work, please support -> [https://send.monobank.ua/jar/48iHq4xAXm](https://send.monobank.ua/jar/48iHq4xAXm)
22
+ For collaboration and questions, please contact me here: [Telegram https://t.me/robinhad](https://t.me/robinhad) [Twitter https://twitter.com/robinhad](https://twitter.com/robinhad)
23
+ You're welcome to join the UA Speech Recognition and Synthesis community: [Telegram https://t.me/speech_recognition_uk](https://t.me/speech_recognition_uk)
24
+ # Examples
25
 
26
  `Mykyta (male)`:
27
 
 
55
  # Attribution 🤝
56
 
57
  - Model training - [Yurii Paniv @robinhad](https://github.com/robinhad)
58
+ - Mykyta, Olena, Lada, Dmytro, Olha dataset - [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)
59
+ - Dmytro voice - [Dmytro Chaplynskyi @dchaplinsky](https://github.com/dchaplinsky)
60
  - Silence cutting using [HMM-GMM](https://github.com/proger/uk) - [Volodymyr Kyrylov @proger](https://github.com/proger)
61
  - Autostress (with dictionary) using [ukrainian-word-stress](https://github.com/lang-uk/ukrainian-word-stress) - [Oleksiy Syvokon @asivokon](https://github.com/asivokon)
62
  - Autostress (with model) using [ukrainian-accentor](https://github.com/egorsmkv/ukrainian-accentor) - [Bohdan Mykhailenko @NeonBohdan](https://github.com/NeonBohdan) + [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)
app.py CHANGED
@@ -20,6 +20,8 @@ class VoiceOption(Enum):
20
  Olena = "Олена (жіночий) 👩"
21
  Mykyta = "Микита (чоловічий) 👨"
22
  Lada = "Лада (жіночий) 👩"
 
 
23
 
24
 
25
  def download(url, file_name):
@@ -33,7 +35,7 @@ def download(url, file_name):
33
 
34
 
35
  print("downloading uk/mykyta/vits-tts")
36
- release_number = "v3.0.0-alpha"
37
  model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model-inference.pth"
38
  config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.json"
39
  speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/speakers.pth"
@@ -71,11 +73,14 @@ def tts(text: str, voice: str, stress: str):
71
  autostress_with_model = (
72
  True if stress == StressOption.AutomaticStressWithModel.value else False
73
  )
74
- speaker_name = "mykyta"
75
- if voice == VoiceOption.Olena.value:
76
- speaker_name = "olena"
77
- elif voice == VoiceOption.Lada.value:
78
- speaker_name = "lada"
 
 
 
79
  text = preprocess_text(text, autostress_with_model)
80
  text_limit = 7200
81
  text = (
@@ -98,23 +103,24 @@ with open("README.md") as file:
98
  iface = gr.Interface(
99
  fn=tts,
100
  inputs=[
101
- gr.inputs.Textbox(
102
  label="Input",
103
- default="Введіть, будь ласка, своє р+ечення.",
104
  ),
105
- gr.inputs.Radio(
106
  label="Голос",
107
  choices=[option.value for option in VoiceOption],
108
- default=VoiceOption.Olena.value,
109
  ),
110
- gr.inputs.Radio(
111
  label="Наголоси",
112
  choices=[option.value for option in StressOption],
 
113
  ),
114
  ],
115
  outputs=[
116
- gr.outputs.Audio(label="Output"),
117
- gr.outputs.Textbox(label="Наголошений текст"),
118
  ],
119
  title="🐸💬🇺🇦 - Coqui TTS",
120
  description="Україномовний🇺🇦 TTS за допомогою Coqui TTS (щоб вручну поставити наголос, використовуйте + перед голосною)",
@@ -132,12 +138,12 @@ iface = gr.Interface(
132
  ],
133
  [
134
  "Вв+едіть, будь ласка, св+оє реч+ення.",
135
- VoiceOption.Mykyta.value,
136
  StressOption.AutomaticStress.value,
137
  ],
138
  [
139
  "Привіт, як тебе звати?",
140
- VoiceOption.Olena.value,
141
  StressOption.AutomaticStress.value,
142
  ],
143
  [
 
20
  Olena = "Олена (жіночий) 👩"
21
  Mykyta = "Микита (чоловічий) 👨"
22
  Lada = "Лада (жіночий) 👩"
23
+ Dmytro = "Дмитро (чоловічий) 👩"
24
+ Olga = "Ольга (жіночий) 👩"
25
 
26
 
27
  def download(url, file_name):
 
35
 
36
 
37
  print("downloading uk/mykyta/vits-tts")
38
+ release_number = "v3.0.0"
39
  model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model-inference.pth"
40
  config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.json"
41
  speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/speakers.pth"
 
73
  autostress_with_model = (
74
  True if stress == StressOption.AutomaticStressWithModel.value else False
75
  )
76
+ voice_mapping = {
77
+ VoiceOption.Olena.value: "olena",
78
+ VoiceOption.Mykyta.value: "mykyta",
79
+ VoiceOption.Lada.value: "lada",
80
+ VoiceOption.Dmytro.value: "dmytro",
81
+ VoiceOption.Olga.value: "olga",
82
+ }
83
+ speaker_name = voice_mapping[voice]
84
  text = preprocess_text(text, autostress_with_model)
85
  text_limit = 7200
86
  text = (
 
103
  iface = gr.Interface(
104
  fn=tts,
105
  inputs=[
106
+ gr.components.Textbox(
107
  label="Input",
108
+ value="Введіть, будь ласка, своє р+ечення.",
109
  ),
110
+ gr.components.Radio(
111
  label="Голос",
112
  choices=[option.value for option in VoiceOption],
113
+ value=VoiceOption.Olena.value,
114
  ),
115
+ gr.components.Radio(
116
  label="Наголоси",
117
  choices=[option.value for option in StressOption],
118
+ value=StressOption.AutomaticStress.value
119
  ),
120
  ],
121
  outputs=[
122
+ gr.components.Audio(label="Output"),
123
+ gr.components.Textbox(label="Наголошений текст"),
124
  ],
125
  title="🐸💬🇺🇦 - Coqui TTS",
126
  description="Україномовний🇺🇦 TTS за допомогою Coqui TTS (щоб вручну поставити наголос, використовуйте + перед голосною)",
 
138
  ],
139
  [
140
  "Вв+едіть, будь ласка, св+оє реч+ення.",
141
+ VoiceOption.Dmytro.value,
142
  StressOption.AutomaticStress.value,
143
  ],
144
  [
145
  "Привіт, як тебе звати?",
146
+ VoiceOption.Olga.value,
147
  StressOption.AutomaticStress.value,
148
  ],
149
  [
config.json CHANGED
@@ -73,7 +73,7 @@
73
  "griffin_lim_iters": 60,
74
  "num_mels": 80,
75
  "mel_fmin": 0,
76
- "mel_fmax": 8000,
77
  "spec_gain": 6.0,
78
  "do_amp_to_db_linear": true,
79
  "do_amp_to_db_mel": true,
@@ -158,11 +158,29 @@
158
  null,
159
  null
160
  ],
 
 
 
 
 
 
161
  [
162
  "\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
163
  "lada",
164
  null,
165
  null
 
 
 
 
 
 
 
 
 
 
 
 
166
  ]
167
  ],
168
  "eval_split_max_size": null,
@@ -243,7 +261,7 @@
243
  "init_discriminator": true,
244
  "use_spectral_norm_disriminator": false,
245
  "use_speaker_embedding": true,
246
- "num_speakers": 3,
247
  "speakers_file": "speakers.pth",
248
  "d_vector_file": null,
249
  "speaker_embedding_channels": 256,
 
73
  "griffin_lim_iters": 60,
74
  "num_mels": 80,
75
  "mel_fmin": 0,
76
+ "mel_fmax": null,
77
  "spec_gain": 6.0,
78
  "do_amp_to_db_linear": true,
79
  "do_amp_to_db_mel": true,
 
158
  null,
159
  null
160
  ],
161
+ [
162
+ "\u0425\u0442+\u043e \u0442+\u0438 \u0442\u0430\u043a+\u0438\u0439 +\u0456 +\u044f\u043a \u0442\u0435\u0431+\u0435 \u0437\u0432+\u0430\u0442\u0438?",
163
+ "dmytro",
164
+ null,
165
+ null
166
+ ],
167
  [
168
  "\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
169
  "lada",
170
  null,
171
  null
172
+ ],
173
+ [
174
+ "\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
175
+ "dmytro",
176
+ null,
177
+ null
178
+ ],
179
+ [
180
+ "\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
181
+ "olga",
182
+ null,
183
+ null
184
  ]
185
  ],
186
  "eval_split_max_size": null,
 
261
  "init_discriminator": true,
262
  "use_spectral_norm_disriminator": false,
263
  "use_speaker_embedding": true,
264
+ "num_speakers": 5,
265
  "speakers_file": "speakers.pth",
266
  "d_vector_file": null,
267
  "speaker_embedding_channels": 256,