File size: 4,255 Bytes
1524fa3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import gradio as gr
import os
# Build the extension module(s) under monotonic_align/ in place. This runs
# every time the script starts, before the project modules below are imported.
# The exit status returned by os.system is not checked, so a failed build does
# not stop execution at this point. The trailing `cd ..` only affects the
# spawned shell, not this process's working directory.
# NOTE(review): presumably `models` needs this extension at import time — confirm.
os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')

import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols as symbols_default 

from scipy.io.wavfile import write
from text import cleaners

# Registry of available models, keyed by name. Each entry gives the checkpoint
# path and the symbol list used to build the symbol <-> id tables.
model_configs = dict(
    Graphemes=dict(
        path="french_model_vits/G_700000.pth",
        symbols=symbols_default,
    ),
)

# Module-level state, rebound by load_model_and_symbols().
net_g = None  # active synthesizer; None until a model has been loaded
symbols = list()
_symbol_to_id = dict()
_id_to_symbol = dict()

def text_to_sequence(text, cleaner_names):
    """Convert ``text`` into a list of symbol ids.

    The text is first run through the cleaners named in ``cleaner_names``
    (see ``_clean_text``); every character of the cleaned text is then looked
    up in the module-level ``_symbol_to_id`` table.

    Raises:
        KeyError: if a cleaned character has no entry in ``_symbol_to_id``.
    """
    cleaned = _clean_text(text, cleaner_names)
    return [_symbol_to_id[ch] for ch in cleaned]

def _clean_text(text, cleaner_names):
    """Run ``text`` through each named cleaner, in order, and return the result.

    Args:
        text: The string to clean.
        cleaner_names: Iterable of attribute names looked up on the
            ``cleaners`` module; each must resolve to a callable that takes
            and returns the text.

    Returns:
        The output of the last cleaner, or ``text`` unchanged when
        ``cleaner_names`` is empty.

    Raises:
        Exception: if a name does not resolve to a cleaner.
    """
    for name in cleaner_names:
        # The explicit default matters: two-argument getattr() raises
        # AttributeError for a missing name, which made the "Unknown cleaner"
        # check below unreachable.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text

def get_text(text, hps):
    """Turn raw text into a ``torch.LongTensor`` of symbol ids.

    The cleaners listed in ``hps.data.text_cleaners`` are applied by
    ``text_to_sequence``. When ``hps.data.add_blank`` is truthy the id list is
    passed through ``commons.intersperse(ids, 0)`` before being wrapped in a
    tensor.
    """
    ids = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        ids = commons.intersperse(ids, 0)
    return torch.LongTensor(ids)

def load_model_and_symbols(tab_name):
    """Make the model and symbol tables registered under ``tab_name`` active.

    Looks ``tab_name`` up in ``model_configs``, builds a ``SynthesizerTrn``
    from the module-level ``hps``, loads the configured checkpoint into it and
    rebinds the module globals ``net_g``, ``symbols``, ``_symbol_to_id`` and
    ``_id_to_symbol``.

    Because ``tts()`` calls this on every request, the build-and-load step is
    skipped when the requested configuration is already the active one; this
    avoids re-reading the checkpoint from disk for each synthesis.

    Args:
        tab_name: Key into ``model_configs``.

    Raises:
        KeyError: if ``tab_name`` is not a key of ``model_configs``.
    """
    global net_g, symbols, _symbol_to_id, _id_to_symbol
    model_config = model_configs[tab_name]

    # The name of the active configuration is remembered on the function
    # object so no extra module-level variable is needed.
    active_tab = getattr(load_model_and_symbols, "_active_tab", None)
    if net_g is not None and active_tab == tab_name:
        return

    new_symbols = model_config["symbols"]
    model = SynthesizerTrn(
        len(new_symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model)
    model.eval()
    utils.load_checkpoint(model_config["path"], model, None)

    # Publish the new state only after the checkpoint loaded, so a failed load
    # never leaves a model without trained weights marked as active.
    symbols = new_symbols
    _symbol_to_id = {s: i for i, s in enumerate(symbols)}
    _id_to_symbol = {i: s for i, s in enumerate(symbols)}
    net_g = model
    load_model_and_symbols._active_tab = tab_name

def tts(text, speaker_id, tab_name):
    """Synthesize ``text`` with the model registered under ``tab_name``.

    Args:
        text: Text to speak.
        speaker_id: Integer speaker index, wrapped in a LongTensor and passed
            to the model as ``sid``.
        tab_name: Key into ``model_configs`` selecting the model to use.

    Returns:
        A tuple ``("Success", (sampling_rate, audio))`` where ``sampling_rate``
        is ``hps.data.sampling_rate`` and ``audio`` is a NumPy array taken
        from the model output.
    """
    load_model_and_symbols(tab_name)
    speaker = torch.LongTensor([speaker_id])  # speaker identity
    token_ids = get_text(text, hps)

    with torch.no_grad():
        batch = token_ids.unsqueeze(0)
        batch_lengths = torch.LongTensor([token_ids.size(0)])
        outputs = net_g.infer(
            batch,
            batch_lengths,
            sid=speaker,
            noise_scale=.667,
            noise_scale_w=0.8,
            length_scale=1,
        )
        # First output, first batch item, first channel.
        waveform = outputs[0][0, 0].data.float().numpy()

    return "Success", (hps.data.sampling_rate, waveform)

def create_tab(tab_name):
    """Add one TTS tab to the Gradio layout currently being built.

    The tab contains a text area, a speaker dropdown (its ``type="index"``
    setting makes the handler receive the position of the chosen entry), a
    Generate button, and message/audio outputs. Clicking the button calls
    ``tts(text, speaker_id, tab_name)``.

    Args:
        tab_name: Tab title; also forwarded to ``tts`` as the model name.
    """
    with gr.TabItem(tab_name):
        gr.Markdown(f"### {tab_name} TTS Model")
        text_box = gr.TextArea(label="Text in french", value="")
        speaker_box = gr.Dropdown(
            label="Speaker",
            choices=["Male", "Female"],
            type="index",
            value="Male",
        )
        generate_btn = gr.Button("Generate", variant="primary")
        message_box = gr.Textbox(label="Message")
        audio_box = gr.Audio(label="Output")
        generate_btn.click(
            fn=lambda text, speaker_id: tts(text, speaker_id, tab_name),
            inputs=[text_box, speaker_box],
            outputs=[message_box, audio_box],
        )

# Hyperparameters shared by every function above (read as the global `hps`).
hps = utils.get_hparams_from_file("configs/vctk_base.json")

# Build the page: intro text, one tab per registered model, example table.
app = gr.Blocks()
with app:
    gr.Markdown(
        """
        # VITS Implementation for French
        
        Based on VITS (https://github.com/jaywalnut310/vits).
        
        ## How to use:
        Write the text on the box below. For faster inference, it is recommended to use short sentences.
        
        ## Hint: Some sample texts are available at the bottom of the web site.
        """
    )
    with gr.Tabs():
        # create_tab() forwards its tab name to tts(), which uses it as a key
        # into model_configs. Creating the tabs from the registry's own keys
        # guarantees that lookup succeeds; a tab name that is not a registry
        # key would raise KeyError on every "Generate" click.
        for tab_name in model_configs:
            create_tab(tab_name)

    gr.Markdown(
        """
        ## Examples
        | Input Text | Speaker |
        |------------|---------|
        | On ne voit bien qu'avec le cœur, l'essentiel est invisible pour les yeux. | Female | 
        | Voilà plusieurs fois, Monsieur, que je vous rencontre sur mon chemin. C’est autant de fois de trop, et j’en ai assez de perdre mon temps à déjouer les pièges que vous me tendez. | Male |
        | Je pense, donc je suis.  | Female | 
        | La vie est un sommeil, l'amour en est le rêve, et vous aurez vécu si vous avez aimé. | Male | 
        """
    )

# Start the web server for the app.
app.launch()