Spaces:

robinhad
/

ukrainian-tts

Running

App Files Community

Yurii Paniv committed on Dec 10, 2022

Commit

6449e88

1 Parent(s): eb57397

Remove Coqui

Browse files

Files changed (10) hide show

README.md +4 -22
app.py +4 -7
config.json +0 -319
config.yaml +371 -0
requirements.txt +6 -3
setup.py +4 -4
tests/test_formatter.py +2 -2
tests/test_tts.py +2 -2
ukrainian_tts/formatter.py +1 -0
ukrainian_tts/tts.py +41 -26

README.md CHANGED Viewed

@@ -4,14 +4,14 @@ emoji: 🐌
 colorFrom: blue
 colorTo: yellow
 sdk: gradio
-sdk_version : 3.3
 python_version: 3.9
 app_file: app.py
 pinned: false
 ---
 # Ukrainian TTS 📢🤖
-Ukrainian TTS (text-to-speech) using Coqui TTS.
 ![pytest](https://github.com/robinhad/ukrainian-tts/actions/workflows/hf-sync.yml/badge.svg)
 [![Open In HF🤗 Space ](https://img.shields.io/badge/Open%20Demo-%F0%9F%A4%97%20Space-yellow)](https://huggingface.co/spaces/robinhad/ukrainian-tts)
@@ -65,33 +65,15 @@ pip install git+https://github.com/robinhad/ukrainian-tts.git
 ```python
 from ukrainian_tts.tts import TTS, Voices, Stress
-tts = TTS(use_cuda=False)
 with open("test.wav", mode="wb") as file:
     _, text = tts.tts("Привіт", Voices.Dmytro.value, Stress.Model.value, file)
 print("Accented text:", text)
 ```
-## Run manually:
-`Caution: this won't use normalizer and autostress like a web demo. `
-1. `pip install -r requirements.txt`.
-2. Download `model.pth` and `speakers.pth` from "Releases" tab.
-3. Launch as one-time command:
-```
-tts --text "Text for TTS" \
-    --model_path path/to/model.pth \
-    --config_path path/to/config.json \
-    --speaker_idx dmytro \
-    --out_path folder/to/save/output.wav
-```
-or alternatively launch web server using:
-```
-tts-server --model_path path/to/model.pth \
-    --config_path path/to/config.json
-```
 # How to train: 🏋️
-1. Refer to ["Nervous beginner guide"](https://tts.readthedocs.io/en/latest/tutorial_for_nervous_beginners.html) in Coqui TTS docs.
-2. Instead of provided `config.json` use one from this repo.
 # Attribution 🤝

 colorFrom: blue
 colorTo: yellow
 sdk: gradio
+sdk_version : 3.12
 python_version: 3.9
 app_file: app.py
 pinned: false
 ---
 # Ukrainian TTS 📢🤖
+Ukrainian TTS (text-to-speech) using ESPNET.
 ![pytest](https://github.com/robinhad/ukrainian-tts/actions/workflows/hf-sync.yml/badge.svg)
 [![Open In HF🤗 Space ](https://img.shields.io/badge/Open%20Demo-%F0%9F%A4%97%20Space-yellow)](https://huggingface.co/spaces/robinhad/ukrainian-tts)
 ```python
 from ukrainian_tts.tts import TTS, Voices, Stress
+tts = TTS()
 with open("test.wav", mode="wb") as file:
     _, text = tts.tts("Привіт", Voices.Dmytro.value, Stress.Model.value, file)
 print("Accented text:", text)
 ```
 # How to train: 🏋️
+TBD
 # Attribution 🤝

app.py CHANGED Viewed

@@ -53,11 +53,8 @@ class VoiceOption(Enum):
 print(f"CUDA available? {is_available()}")
-badge = (
-    "https://visitor-badge-reloaded.herokuapp.com/badge?page_id=robinhad.ukrainian-tts"
-)
-ukr_tts = TTS(use_cuda=is_available())
 def tts(text: str, voice: str, stress: str):
@@ -121,9 +118,9 @@ iface = gr.Interface(
         gr.components.Audio(label="Output"),
         gr.components.Textbox(label="Наголошений текст"),
     ],
-    title="🐸💬🇺🇦 - Coqui TTS",
-    description="Україномовний🇺🇦 TTS за допомогою Coqui TTS (щоб вручну поставити наголос, використовуйте + перед голосною)",
-    article=article + f'\n  <center><img src="{badge}" alt="visitors badge"/></center>',
     examples=[
         [
             "Введіть, будь ласка, своє речення.",

 print(f"CUDA available? {is_available()}")
+ukr_tts = TTS()
 def tts(text: str, voice: str, stress: str):
         gr.components.Audio(label="Output"),
         gr.components.Textbox(label="Наголошений текст"),
     ],
+    title="🤖💬🇺🇦 - ESPNET",
+    description="Україномовний🇺🇦 TTS за допомогою ESPNET (щоб вручну поставити наголос, використовуйте + перед голосною)",
+    article=article,
     examples=[
         [
             "Введіть, будь ласка, своє речення.",

config.json DELETED Viewed

@@ -1,319 +0,0 @@
-{
-    "output_path": "/home/robinhad/Projects/TTS",
-    "logger_uri": null,
-    "run_name": "vits_mykyta_latest",
-    "project_name": null,
-    "run_description": "\ud83d\udc38Coqui trainer run.",
-    "print_step": 25,
-    "plot_step": 100,
-    "model_param_stats": false,
-    "wandb_entity": null,
-    "dashboard_logger": "tensorboard",
-    "log_model_step": 5000,
-    "save_step": 5000,
-    "save_n_checkpoints": 5,
-    "save_checkpoints": true,
-    "save_all_best": false,
-    "save_best_after": 10000,
-    "target_loss": null,
-    "print_eval": false,
-    "test_delay_epochs": -1,
-    "run_eval": true,
-    "run_eval_steps": null,
-    "distributed_backend": "nccl",
-    "distributed_url": "tcp://localhost:54321",
-    "mixed_precision": true,
-    "epochs": 1500,
-    "batch_size": 64,
-    "eval_batch_size": 16,
-    "grad_clip": [
-        1000,
-        1000
-    ],
-    "scheduler_after_epoch": true,
-    "lr": 0.001,
-    "optimizer": "AdamW",
-    "optimizer_params": {
-        "betas": [
-            0.8,
-            0.99
-        ],
-        "eps": 1e-09,
-        "weight_decay": 0.01
-    },
-    "lr_scheduler": "",
-    "lr_scheduler_params": {},
-    "use_grad_scaler": false,
-    "cudnn_enable": true,
-    "cudnn_deterministic": false,
-    "cudnn_benchmark": false,
-    "training_seed": 54321,
-    "model": "vits",
-    "num_loader_workers": 12,
-    "num_eval_loader_workers": 12,
-    "use_noise_augment": false,
-    "audio": {
-        "fft_size": 1024,
-        "win_length": 1024,
-        "hop_length": 256,
-        "frame_shift_ms": null,
-        "frame_length_ms": null,
-        "stft_pad_mode": "reflect",
-        "sample_rate": 22050,
-        "resample": false,
-        "preemphasis": 0,
-        "ref_level_db": 35,
-        "do_sound_norm": true,
-        "log_func": "np.log",
-        "do_trim_silence": false,
-        "trim_db": 35,
-        "do_rms_norm": false,
-        "db_level": -24,
-        "power": 1.1,
-        "griffin_lim_iters": 60,
-        "num_mels": 80,
-        "mel_fmin": 0,
-        "mel_fmax": null,
-        "spec_gain": 6.0,
-        "do_amp_to_db_linear": true,
-        "do_amp_to_db_mel": true,
-        "pitch_fmax": 640.0,
-        "pitch_fmin": 0.0,
-        "signal_norm": true,
-        "min_level_db": -100,
-        "symmetric_norm": true,
-        "max_norm": 1.0,
-        "clip_norm": true,
-        "stats_path": null
-    },
-    "use_phonemes": false,
-    "phonemizer": null,
-    "phoneme_language": "uk",
-    "compute_input_seq_cache": false,
-    "text_cleaner": "basic_cleaners",
-    "enable_eos_bos_chars": false,
-    "test_sentences_file": "",
-    "phoneme_cache_path": "/home/robinhad/Projects/TTS/phoneme_cache",
-    "characters": {
-        "characters_class": "TTS.tts.models.vits.VitsCharacters",
-        "vocab_dict": null,
-        "pad": "<PAD>",
-        "eos": "<EOS>",
-        "bos": "<BOS>",
-        "blank": "<BLNK>",
-        "characters": "!\"'(),-/:;.?\u00ab\u00bb+\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0454\u0456\u0457\u0491\u2013\u2014\u2015\u2018\u2019\u201c\u201d\u201e\u2026 ",
-        "punctuations": "!\"'(),-/:;?\u00ab\u00bb+\u2013\u2014\u2015\u2018\u2019\u201c\u201d\u201e\u2026 ",
-        "phonemes": null,
-        "is_unique": true,
-        "is_sorted": true
-    },
-    "add_blank": true,
-    "batch_group_size": 0,
-    "loss_masking": null,
-    "sort_by_audio_len": true,
-    "min_audio_len": 32768,
-    "max_audio_len": 264600,
-    "min_text_len": 1,
-    "max_text_len": Infinity,
-    "compute_f0": false,
-    "compute_linear_spec": true,
-    "precompute_num_workers": 16,
-    "start_by_longest": false,
-    "datasets": [
-        {
-            "name": "mailabs",
-            "path": "/home/robinhad/Data/Audio/ukr-tts-dataset-mai",
-            "meta_file_train": "",
-            "ignored_speakers": null,
-            "language": "",
-            "meta_file_val": "",
-            "meta_file_attn_mask": ""
-        }
-    ],
-    "test_sentences": [
-        [
-            "\u0414+\u0435\u0441\u044f\u0442\u044c \u0440\u0430\u0437+\u0456\u0432 \u0432\u0456\u0434\u043c+\u0456\u0440\u044f\u0439, +\u0430 \u0440+\u0430\u0437 - \u0432\u0456\u0434\u0440+\u0456\u0436.",
-            "olena",
-            null,
-            null
-        ],
-        [
-            "\u0413\u043e\u0432\u043e\u0440+\u0438, \u043d+\u0456\u0431\u0438 \u0442+\u0438 \u0436\u0438\u0432+\u0438\u0439!"
-        ],
-        [
-            "\u041f\u0435\u0440\u0435\u043f\u0440+\u043e\u0448\u0443\u044e, \u0414+\u0435\u0439\u0432\u0435, \u043d+\u0430 \u0436+\u0430\u043b\u044c, +\u044f \u043d+\u0435 \u043c+\u043e\u0436\u0443 \u0437\u0440\u043e\u0431+\u0438\u0442\u0438 \u0446+\u0435.",
-            "lada",
-            null,
-            null
-        ],
-        [
-            "\u041f\u0435\u0440\u0435\u043f\u0440+\u043e\u0448\u0443\u044e, \u0414+\u0435\u0439\u0432\u0435, \u043d+\u0430 \u0436+\u0430\u043b\u044c, +\u044f \u043d+\u0435 \u043c+\u043e\u0436\u0443 \u0437\u0440\u043e\u0431+\u0438\u0442\u0438 \u0446+\u0435.",
-            "mykyta",
-            null,
-            null
-        ],
-        [
-            "\u0425\u0442+\u043e \u0442+\u0438 \u0442\u0430\u043a+\u0438\u0439 +\u0456 +\u044f\u043a \u0442\u0435\u0431+\u0435 \u0437\u0432+\u0430\u0442\u0438?",
-            "mykyta",
-            null,
-            null
-        ],
-        [
-            "\u0425\u0442+\u043e \u0442+\u0438 \u0442\u0430\u043a+\u0438\u0439 +\u0456 +\u044f\u043a \u0442\u0435\u0431+\u0435 \u0437\u0432+\u0430\u0442\u0438?",
-            "dmytro",
-            null,
-            null
-        ],
-        [
-            "\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
-            "lada",
-            null,
-            null
-        ],
-        [
-            "\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
-            "dmytro",
-            null,
-            null
-        ],
-        [
-            "\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
-            "olga",
-            null,
-            null
-        ]
-    ],
-    "eval_split_max_size": null,
-    "eval_split_size": 0.01,
-    "use_speaker_weighted_sampler": false,
-    "speaker_weighted_sampler_alpha": 1.0,
-    "use_language_weighted_sampler": false,
-    "language_weighted_sampler_alpha": 1.0,
-    "use_length_weighted_sampler": false,
-    "length_weighted_sampler_alpha": 1.0,
-    "model_args": {
-        "num_chars": 84,
-        "out_channels": 513,
-        "spec_segment_size": 32,
-        "hidden_channels": 192,
-        "hidden_channels_ffn_text_encoder": 768,
-        "num_heads_text_encoder": 2,
-        "num_layers_text_encoder": 6,
-        "kernel_size_text_encoder": 3,
-        "dropout_p_text_encoder": 0.1,
-        "dropout_p_duration_predictor": 0.5,
-        "kernel_size_posterior_encoder": 5,
-        "dilation_rate_posterior_encoder": 1,
-        "num_layers_posterior_encoder": 16,
-        "kernel_size_flow": 5,
-        "dilation_rate_flow": 1,
-        "num_layers_flow": 4,
-        "resblock_type_decoder": "1",
-        "resblock_kernel_sizes_decoder": [
-            3,
-            7,
-            11
-        ],
-        "resblock_dilation_sizes_decoder": [
-            [
-                1,
-                3,
-                5
-            ],
-            [
-                1,
-                3,
-                5
-            ],
-            [
-                1,
-                3,
-                5
-            ]
-        ],
-        "upsample_rates_decoder": [
-            8,
-            8,
-            2,
-            2
-        ],
-        "upsample_initial_channel_decoder": 512,
-        "upsample_kernel_sizes_decoder": [
-            16,
-            16,
-            4,
-            4
-        ],
-        "periods_multi_period_discriminator": [
-            2,
-            3,
-            5,
-            7,
-            11
-        ],
-        "use_sdp": true,
-        "noise_scale": 1.0,
-        "inference_noise_scale": 0.667,
-        "length_scale": 1,
-        "noise_scale_dp": 1.0,
-        "inference_noise_scale_dp": 1.0,
-        "max_inference_len": null,
-        "init_discriminator": true,
-        "use_spectral_norm_disriminator": false,
-        "use_speaker_embedding": true,
-        "num_speakers": 5,
-        "speakers_file": "speakers.pth",
-        "d_vector_file": null,
-        "speaker_embedding_channels": 256,
-        "use_d_vector_file": false,
-        "d_vector_dim": 0,
-        "detach_dp_input": true,
-        "use_language_embedding": false,
-        "embedded_language_dim": 4,
-        "num_languages": 0,
-        "language_ids_file": null,
-        "use_speaker_encoder_as_loss": false,
-        "speaker_encoder_config_path": "",
-        "speaker_encoder_model_path": "",
-        "condition_dp_on_speaker": true,
-        "freeze_encoder": false,
-        "freeze_DP": false,
-        "freeze_PE": false,
-        "freeze_flow_decoder": false,
-        "freeze_waveform_decoder": false,
-        "encoder_sample_rate": null,
-        "interpolate_z": true,
-        "reinit_DP": false,
-        "reinit_text_encoder": false
-    },
-    "lr_gen": 0.0002,
-    "lr_disc": 0.0002,
-    "lr_scheduler_gen": "ExponentialLR",
-    "lr_scheduler_gen_params": {
-        "gamma": 0.999875,
-        "last_epoch": -1
-    },
-    "lr_scheduler_disc": "ExponentialLR",
-    "lr_scheduler_disc_params": {
-        "gamma": 0.999875,
-        "last_epoch": -1
-    },
-    "kl_loss_alpha": 1.0,
-    "disc_loss_alpha": 1.0,
-    "gen_loss_alpha": 1.0,
-    "feat_loss_alpha": 1.0,
-    "mel_loss_alpha": 45.0,
-    "dur_loss_alpha": 1.0,
-    "speaker_encoder_loss_alpha": 1.0,
-    "return_wav": true,
-    "r": 1,
-    "num_speakers": 0,
-    "use_speaker_embedding": true,
-    "speakers_file": "speakers.pth",
-    "speaker_embedding_channels": 256,
-    "language_ids_file": null,
-    "use_language_embedding": false,
-    "use_d_vector_file": false,
-    "d_vector_file": null,
-    "d_vector_dim": 0
-}

config.yaml ADDED Viewed

	@@ -0,0 +1,371 @@

+config: ./conf/tuning/train_vits.yaml
+print_config: false
+log_level: INFO
+dry_run: false
+iterator_type: sequence
+output_dir: /mnt/tts-storage/exp/22k/tts_train_vits_raw_char
+ngpu: 1
+seed: 3407
+num_workers: 1
+num_att_plot: 3
+dist_backend: nccl
+dist_init_method: env://
+dist_world_size: null
+dist_rank: null
+local_rank: 0
+dist_master_addr: null
+dist_master_port: null
+dist_launcher: null
+multiprocessing_distributed: false
+unused_parameters: true
+sharded_ddp: false
+cudnn_enabled: true
+cudnn_benchmark: false
+cudnn_deterministic: false
+collect_stats: false
+write_collected_feats: false
+max_epoch: 300
+patience: null
+val_scheduler_criterion:
+- valid
+- loss
+early_stopping_criterion:
+- valid
+- loss
+- min
+best_model_criterion:
+-   - train
+    - total_count
+    - max
+keep_nbest_models: 10
+nbest_averaging_interval: 0
+grad_clip: -1
+grad_clip_type: 2.0
+grad_noise: false
+accum_grad: 1
+no_forward_run: false
+resume: true
+train_dtype: float32
+use_amp: false
+log_interval: 50
+use_matplotlib: true
+use_tensorboard: true
+create_graph_in_tensorboard: false
+use_wandb: false
+wandb_project: null
+wandb_id: null
+wandb_entity: null
+wandb_name: null
+wandb_model_log_interval: -1
+detect_anomaly: false
+pretrain_path: null
+init_param: []
+ignore_init_mismatch: false
+freeze_param: []
+num_iters_per_epoch: null
+batch_size: 20
+valid_batch_size: null
+batch_bins: 1900000
+valid_batch_bins: null
+train_shape_file:
+- /mnt/tts-storage/exp/22k/tts_stats_raw_linear_spectrogram_char/train/text_shape.char
+- /mnt/tts-storage/exp/22k/tts_stats_raw_linear_spectrogram_char/train/speech_shape
+valid_shape_file:
+- /mnt/tts-storage/exp/22k/tts_stats_raw_linear_spectrogram_char/valid/text_shape.char
+- /mnt/tts-storage/exp/22k/tts_stats_raw_linear_spectrogram_char/valid/speech_shape
+batch_type: numel
+valid_batch_type: null
+fold_length:
+- 150
+- 204800
+sort_in_batch: descending
+sort_batch: descending
+multiple_iterator: false
+chunk_length: 500
+chunk_shift_ratio: 0.5
+num_cache_chunks: 1024
+train_data_path_and_name_and_type:
+-   - dump/22k/raw/tr_no_dev/text
+    - text
+    - text
+-   - dump/22k/raw/tr_no_dev/wav.scp
+    - speech
+    - sound
+-   - dump/22k/raw/tr_no_dev/utt2sid
+    - sids
+    - text_int
+valid_data_path_and_name_and_type:
+-   - dump/22k/raw/dev/text
+    - text
+    - text
+-   - dump/22k/raw/dev/wav.scp
+    - speech
+    - sound
+-   - dump/22k/raw/dev/utt2sid
+    - sids
+    - text_int
+allow_variable_data_keys: false
+max_cache_size: 0.0
+max_cache_fd: 32
+valid_max_cache_size: null
+optim: adamw
+optim_conf:
+    lr: 0.0002
+    betas:
+    - 0.8
+    - 0.99
+    eps: 1.0e-09
+    weight_decay: 0.0
+scheduler: exponentiallr
+scheduler_conf:
+    gamma: 0.999875
+optim2: adamw
+optim2_conf:
+    lr: 0.0002
+    betas:
+    - 0.8
+    - 0.99
+    eps: 1.0e-09
+    weight_decay: 0.0
+scheduler2: exponentiallr
+scheduler2_conf:
+    gamma: 0.999875
+generator_first: false
+token_list:
+- <blank>
+- <unk>
+- +
+- <space>
+- о
+- а
+- и
+- н
+- в
+- і
+- т
+- е
+- с
+- р
+- л
+- у
+- д
+- к
+- м
+- п
+- я
+- з
+- ','
+- б
+- ь
+- г
+- ч
+- й
+- х
+- ж
+- ш
+- ю
+- ц
+- щ
+- —
+- є
+- ї
+- '?'
+- .
+- ф
+- «
+- »
+- '!'
+- ''''
+- ':'
+- …
+- '-'
+- ґ
+- ―
+- –
+- '"'
+- ;
+- “
+- ”
+- <sos/eos>
+odim: null
+model_conf: {}
+use_preprocessor: true
+token_type: char
+bpemodel: null
+non_linguistic_symbols: null
+cleaner: null
+g2p: g2p_en_no_space
+feats_extract: linear_spectrogram
+feats_extract_conf:
+    n_fft: 1024
+    hop_length: 256
+    win_length: null
+normalize: null
+normalize_conf: {}
+tts: vits
+tts_conf:
+    generator_type: vits_generator
+    generator_params:
+        hidden_channels: 192
+        spks: 128
+        global_channels: 256
+        segment_size: 32
+        text_encoder_attention_heads: 2
+        text_encoder_ffn_expand: 4
+        text_encoder_blocks: 6
+        text_encoder_positionwise_layer_type: conv1d
+        text_encoder_positionwise_conv_kernel_size: 3
+        text_encoder_positional_encoding_layer_type: rel_pos
+        text_encoder_self_attention_layer_type: rel_selfattn
+        text_encoder_activation_type: swish
+        text_encoder_normalize_before: true
+        text_encoder_dropout_rate: 0.1
+        text_encoder_positional_dropout_rate: 0.0
+        text_encoder_attention_dropout_rate: 0.1
+        use_macaron_style_in_text_encoder: true
+        use_conformer_conv_in_text_encoder: false
+        text_encoder_conformer_kernel_size: -1
+        decoder_kernel_size: 7
+        decoder_channels: 512
+        decoder_upsample_scales:
+        - 8
+        - 8
+        - 2
+        - 2
+        decoder_upsample_kernel_sizes:
+        - 16
+        - 16
+        - 4
+        - 4
+        decoder_resblock_kernel_sizes:
+        - 3
+        - 7
+        - 11
+        decoder_resblock_dilations:
+        -   - 1
+            - 3
+            - 5
+        -   - 1
+            - 3
+            - 5
+        -   - 1
+            - 3
+            - 5
+        use_weight_norm_in_decoder: true
+        posterior_encoder_kernel_size: 5
+        posterior_encoder_layers: 16
+        posterior_encoder_stacks: 1
+        posterior_encoder_base_dilation: 1
+        posterior_encoder_dropout_rate: 0.0
+        use_weight_norm_in_posterior_encoder: true
+        flow_flows: 4
+        flow_kernel_size: 5
+        flow_base_dilation: 1
+        flow_layers: 4
+        flow_dropout_rate: 0.0
+        use_weight_norm_in_flow: true
+        use_only_mean_in_flow: true
+        stochastic_duration_predictor_kernel_size: 3
+        stochastic_duration_predictor_dropout_rate: 0.5
+        stochastic_duration_predictor_flows: 4
+        stochastic_duration_predictor_dds_conv_layers: 3
+        vocabs: 55
+        aux_channels: 513
+    discriminator_type: hifigan_multi_scale_multi_period_discriminator
+    discriminator_params:
+        scales: 1
+        scale_downsample_pooling: AvgPool1d
+        scale_downsample_pooling_params:
+            kernel_size: 4
+            stride: 2
+            padding: 2
+        scale_discriminator_params:
+            in_channels: 1
+            out_channels: 1
+            kernel_sizes:
+            - 15
+            - 41
+            - 5
+            - 3
+            channels: 128
+            max_downsample_channels: 1024
+            max_groups: 16
+            bias: true
+            downsample_scales:
+            - 2
+            - 2
+            - 4
+            - 4
+            - 1
+            nonlinear_activation: LeakyReLU
+            nonlinear_activation_params:
+                negative_slope: 0.1
+            use_weight_norm: true
+            use_spectral_norm: false
+        follow_official_norm: false
+        periods:
+        - 2
+        - 3
+        - 5
+        - 7
+        - 11
+        period_discriminator_params:
+            in_channels: 1
+            out_channels: 1
+            kernel_sizes:
+            - 5
+            - 3
+            channels: 32
+            downsample_scales:
+            - 3
+            - 3
+            - 3
+            - 3
+            - 1
+            max_downsample_channels: 1024
+            bias: true
+            nonlinear_activation: LeakyReLU
+            nonlinear_activation_params:
+                negative_slope: 0.1
+            use_weight_norm: true
+            use_spectral_norm: false
+    generator_adv_loss_params:
+        average_by_discriminators: false
+        loss_type: mse
+    discriminator_adv_loss_params:
+        average_by_discriminators: false
+        loss_type: mse
+    feat_match_loss_params:
+        average_by_discriminators: false
+        average_by_layers: false
+        include_final_outputs: true
+    mel_loss_params:
+        fs: 22050
+        n_fft: 1024
+        hop_length: 256
+        win_length: null
+        window: hann
+        n_mels: 80
+        fmin: 0
+        fmax: null
+        log_base: null
+    lambda_adv: 1.0
+    lambda_mel: 45.0
+    lambda_feat_match: 2.0
+    lambda_dur: 1.0
+    lambda_kl: 1.0
+    sampling_rate: 22050
+    cache_generator_outputs: true
+pitch_extract: null
+pitch_extract_conf: {}
+pitch_normalize: null
+pitch_normalize_conf: {}
+energy_extract: null
+energy_extract_conf: {}
+energy_normalize: null
+energy_normalize_conf: {}
+required:
+- output_dir
+- token_list
+version: '202209'
+distributed: false

requirements.txt CHANGED Viewed

@@ -1,6 +1,9 @@
 # requirements for HuggingFace demo. Installs local package.
-torch>=1.9
-TTS==0.9.0
 ukrainian-word-stress==1.0.1
 git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c
-gradio==3.3

 # requirements for HuggingFace demo. Installs local package.
+torch
+--extra-index-url https://download.pytorch.org/whl/cpu
+espnet==202209
+num2words==0.5.12
 ukrainian-word-stress==1.0.1
 git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c
+gradio==3.12
+huggingface_hub==0.11.1

setup.py CHANGED Viewed

@@ -3,8 +3,8 @@ from setuptools import setup, find_packages
 setup(
     name="ukrainian-tts",
-    version="3.0",
-    description="Ukrainian TTS using Coqui TTS",
     author="Yurii Paniv",
     author_email="[email protected]",
     url="https://github.com/robinhad/ukrainian-tts",
@@ -12,8 +12,8 @@ setup(
     packages=find_packages(),
     python_requires=">3.6.0",
     install_requires=[
-        "torch>=1.9",
-        "TTS==0.9.0",
         "ukrainian-word-stress==1.0.1",
         "ukrainian_accentor @ git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c",
     ],

 setup(
     name="ukrainian-tts",
+    version="4.0",
+    description="Ukrainian TTS using ESPNET",
     author="Yurii Paniv",
     author_email="[email protected]",
     url="https://github.com/robinhad/ukrainian-tts",
     packages=find_packages(),
     python_requires=">3.6.0",
     install_requires=[
+        "espnet==202209",
+        "num2words==0.5.12",
         "ukrainian-word-stress==1.0.1",
         "ukrainian_accentor @ git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c",
     ],

tests/test_formatter.py CHANGED Viewed

@@ -3,8 +3,8 @@ from ukrainian_tts.formatter import preprocess_text
 def test_formatter():
     examples = [
-        ("Quality of life update", "КВюаліті оф ліфе юпдате"),
-        ("Він украв 20000000 $", "Він украв двадцять мільйонів долар"),
         (
             "111 000 000 000 доларів державного боргу.",
             "сто одинадцять мільярдів доларів державного боргу.",

 def test_formatter():
     examples = [
+        ("Quality of life update", "квюаліті оф ліфе юпдате"),
+        ("Він украв 20000000 $", "він украв двадцять мільйонів долар"),
         (
             "111 000 000 000 доларів державного боргу.",
             "сто одинадцять мільярдів доларів державного боргу.",

tests/test_tts.py CHANGED Viewed

@@ -3,9 +3,9 @@ from io import BytesIO
 def test_tts():
-    tts = TTS(use_cuda=False)
     file = BytesIO()
     _, text = tts.tts("Привіт", Voices.Dmytro.value, Stress.Dictionary.value, file)
     file.seek(0)
-    assert text == "Прив+іт"
     assert file.getbuffer().nbytes > 1000  # check that file was generated

 def test_tts():
+    tts = TTS()
     file = BytesIO()
     _, text = tts.tts("Привіт", Voices.Dmytro.value, Stress.Dictionary.value, file)
     file.seek(0)
+    assert text == "прив+іт"
     assert file.getbuffer().nbytes > 1000  # check that file was generated

ukrainian_tts/formatter.py CHANGED Viewed

@@ -76,4 +76,5 @@ def preprocess_text(text, use_autostress_model=False):
         text = text.replace(english_char.upper(), english[english_char].upper())
         text = text.replace(english_char, english[english_char])
     return text

         text = text.replace(english_char.upper(), english[english_char].upper())
         text = text.replace(english_char, english[english_char])
+    text = text.lower()
     return text

ukrainian_tts/tts.py CHANGED Viewed

@@ -1,21 +1,24 @@
 from io import BytesIO
 import requests
 from os.path import exists, join
-from TTS.utils.synthesizer import Synthesizer
 from enum import Enum
 from .formatter import preprocess_text
 from .stress import sentence_to_stress, stress_dict, stress_with_model
 from torch import no_grad
 class Voices(Enum):
     """List of available voices for the model."""
-    Olena = "olena"
-    Mykyta = "mykyta"
-    Lada = "lada"
-    Dmytro = "dmytro"
-    Olga = "olga"
 class Stress(Enum):
@@ -30,14 +33,15 @@ class Stress(Enum):
 class TTS:
     """ """
-    def __init__(self, cache_folder=None, use_cuda=False) -> None:
         """
         Class to setup a text-to-speech engine, from download to model creation.  \n
         Downloads or uses files from `cache_folder` directory.  \n
         By default stores in current directory."""
-        self.__setup_cache(cache_folder, use_cuda=use_cuda)
-    def tts(self, text: str, voice: str, stress: str, output_fp=BytesIO()):
         """
         Run a Text-to-Speech engine and output to `output_fp` BytesIO-like object.
         - `text` - your model input text.
@@ -63,39 +67,50 @@ class TTS:
         text = preprocess_text(text, stress)
         text = sentence_to_stress(text, stress_with_model if stress else stress_dict)
         with no_grad():
-            wavs = self.synthesizer.tts(text, speaker_name=voice)
-            self.synthesizer.save_wav(wavs, output_fp)
         output_fp.seek(0)
         return output_fp, text
-    def __setup_cache(self, cache_folder=None, use_cuda=False):
         """Downloads models and stores them into `cache_folder`. By default stores in current directory."""
         print("downloading uk/mykyta/vits-tts")
-        release_number = "v3.0.0"
-        model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model-inference.pth"
-        config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.json"
-        speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/speakers.pth"
         if cache_folder is None:
             cache_folder = "."
         model_path = join(cache_folder, "model.pth")
-        config_path = join(cache_folder, "config.json")
-        speakers_path = join(cache_folder, "speakers.pth")
         self.__download(model_link, model_path)
         self.__download(config_link, config_path)
-        self.__download(speakers_link, speakers_path)
-        self.synthesizer = Synthesizer(
-            model_path, config_path, speakers_path, None, None, use_cuda=use_cuda
-        )
-        if self.synthesizer is None:
-            raise NameError("Model not found")
     def __download(self, url, file_name):
         """Downloads file from `url` into local `file_name` file."""

 from io import BytesIO
 import requests
 from os.path import exists, join
+from espnet2.bin.tts_inference import Text2Speech
 from enum import Enum
 from .formatter import preprocess_text
 from .stress import sentence_to_stress, stress_dict, stress_with_model
 from torch import no_grad
+import numpy as np
+import time
+import soundfile as sf
 class Voices(Enum):
     """List of available voices for the model."""
+    Olena = 4
+    Mykyta = 3
+    Lada = 2
+    Dmytro = 1
+    Olga = 5
 class Stress(Enum):
 class TTS:
     """ """
+    def __init__(self, cache_folder=None, device="cpu") -> None:
         """
         Class to setup a text-to-speech engine, from download to model creation.  \n
         Downloads or uses files from `cache_folder` directory.  \n
         By default stores in current directory."""
+        self.device = device
+        self.__setup_cache(cache_folder)
+    def tts(self, text: str, voice: int, stress: str, output_fp=BytesIO(), speed=1.0):
         """
         Run a Text-to-Speech engine and output to `output_fp` BytesIO-like object.
         - `text` - your model input text.
         text = preprocess_text(text, stress)
         text = sentence_to_stress(text, stress_with_model if stress else stress_dict)
+        self.synthesizer = Text2Speech(
+            train_config="config.yaml",
+            model_file="model.pth",
+            device=self.device,
+            speed_control_alpha=1 / speed,
+            # Only for VITS
+            noise_scale=0.333,
+            noise_scale_dur=0.333,
+        )
+        # synthesis
         with no_grad():
+            start = time.time()
+            wav = self.synthesizer(text, sids=np.array(voice))["wav"]
+        rtf = (time.time() - start) / (len(wav) / self.synthesizer.fs)
+        print(f"RTF = {rtf:5f}")
+        sf.write(
+            output_fp,
+            wav.view(-1).cpu().numpy(),
+            self.synthesizer.fs,
+            "PCM_16",
+            format="wav",
+        )
         output_fp.seek(0)
         return output_fp, text
+    def __setup_cache(self, cache_folder=None):
         """Downloads models and stores them into `cache_folder`. By default stores in current directory."""
         print("downloading uk/mykyta/vits-tts")
+        release_number = "v4.0.0"
+        model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model.pth"
+        config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.yaml"
         if cache_folder is None:
             cache_folder = "."
         model_path = join(cache_folder, "model.pth")
+        config_path = join(cache_folder, "config.yaml")
         self.__download(model_link, model_path)
         self.__download(config_link, config_path)
     def __download(self, url, file_name):
         """Downloads file from `url` into local `file_name` file."""