File size: 5,075 Bytes
4b37b7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e1b6e2f
4b37b7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69e221d
4b37b7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52eb47a
 
 
4b37b7e
 
 
 
 
 
 
 
 
 
 
 
52eb47a
4b37b7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52eb47a
 
 
 
4b37b7e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import torch
import librosa
import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence
import numpy as np
from mel_processing import spectrogram_torch
import gradio as gr
from indic_transliteration import sanscript


# Maps the script names offered in the UI to the matching
# indic_transliteration scheme constants.
SCRIPT_DICT = {
    'Devanagari': sanscript.DEVANAGARI,
    'IAST': sanscript.IAST,
    'SLP1': sanscript.SLP1,
    'HK': sanscript.HK,
}

# Sample text in Devanagari: it pre-fills the text box of the TTS tab and is
# the source that default_text() transliterates into the other scripts.
DEFAULT_TEXT='संस्कृतम् जगतः एकतमा अतिप्राचीना समृद्धा शास्त्रीया च भाषासु वर्तते । संस्कृतं भारतस्य जगत: वा भाषासु एकतमा‌ प्राचीनतमा ।'


def get_text(text, hps, cleaned=False):
    """Encode ``text`` as a ``torch.LongTensor`` of symbol ids.

    Args:
        text: String to encode.
        hps: Hyper-parameter object; ``hps.symbols``,
            ``hps.data.text_cleaners`` and ``hps.data.add_blank`` are read.
        cleaned: When true, an empty cleaner list is passed to
            ``text_to_sequence`` instead of ``hps.data.text_cleaners``.

    Returns:
        The id sequence as a ``torch.LongTensor``. When
        ``hps.data.add_blank`` is truthy the sequence is first run through
        ``commons.intersperse(..., 0)``.
    """
    sequence = text_to_sequence(
        text,
        hps.symbols,
        [] if cleaned else hps.data.text_cleaners,
    )
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)


def default_text(script):
    """Return the sample text written in ``script``.

    Args:
        script: A key of ``SCRIPT_DICT``.

    Returns:
        ``DEFAULT_TEXT`` unchanged for ``'Devanagari'``; otherwise
        ``DEFAULT_TEXT`` transliterated from Devanagari into the scheme
        that ``SCRIPT_DICT`` maps ``script`` to.
    """
    if script != 'Devanagari':
        target_scheme = SCRIPT_DICT[script]
        return sanscript.transliterate(DEFAULT_TEXT, sanscript.DEVANAGARI, target_scheme)
    return DEFAULT_TEXT


def speech_synthesize(text, script, speaker_id, length_scale):
    """Synthesize speech for ``text`` with the module-level model.

    Relies on the globals ``hps_ms`` and ``net_g_ms``.

    Args:
        text: Text to speak, written in ``script``.
        script: A key of ``SCRIPT_DICT`` naming the script of ``text``;
            anything other than ``'Devanagari'`` is transliterated to
            Devanagari first.
        speaker_id: Speaker index, passed to the model as ``sid``.
        length_scale: Forwarded to ``net_g_ms.infer`` as ``length_scale``.

    Returns:
        ``(hps_ms.data.sampling_rate, audio)`` where ``audio`` is a float
        NumPy array taken from the model output.
    """
    # Leading/trailing line breaks are dropped, but interior ones become a
    # space: simply deleting them would fuse the last word of one line with
    # the first word of the next.
    text = text.strip('\n').replace('\n', ' ')
    if script != 'Devanagari':
        text = sanscript.transliterate(text, SCRIPT_DICT[script], sanscript.DEVANAGARI)
    print(text)
    stn_tst = get_text(text, hps_ms)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)  # add a leading batch dimension
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([speaker_id])
        output = net_g_ms.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=0.667,
            noise_scale_w=0.8,
            length_scale=length_scale,
        )
        audio = output[0][0, 0].data.cpu().float().numpy()
    return (hps_ms.data.sampling_rate, audio)


def voice_convert(audio, origin_id, target_id):
    """Convert a recording from one speaker's voice to another's.

    Relies on the globals ``hps_ms`` and ``net_g_ms``.

    Args:
        audio: ``(sampling_rate, samples)`` pair, ``samples`` being a NumPy
            array. A 2-D array is transposed and mixed down to mono
            (presumably it is laid out as (samples, channels) -- confirm
            against the audio component in use).
        origin_id: Index of the source speaker, passed as ``sid_src``.
        target_id: Index of the target speaker, passed as ``sid_tgt``.

    Returns:
        ``(hps_ms.data.sampling_rate, audio)`` where ``audio`` is a float
        NumPy array taken from the model output.
    """
    sampling_rate, audio = audio
    if np.issubdtype(audio.dtype, np.integer):
        # Integer PCM: scale by the dtype's largest value.
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    else:
        # np.iinfo() raises ValueError for non-integer dtypes, so float
        # samples are only cast. Assumes they are already normalised to
        # roughly [-1, 1] -- TODO confirm for the upstream audio source.
        audio = audio.astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != hps_ms.data.sampling_rate:
        # The spectrogram below is computed at the model's sampling rate.
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps_ms.data.sampling_rate)

    with torch.no_grad():
        y = torch.FloatTensor(audio).unsqueeze(0)  # add a leading batch dimension
        spec = spectrogram_torch(y, hps_ms.data.filter_length,
            hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length,
            center=False)
        spec_lengths = torch.LongTensor([spec.size(-1)])
        sid_src = torch.LongTensor([origin_id])
        sid_tgt = torch.LongTensor([target_id])
        converted = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)
        audio = converted[0][0, 0].data.cpu().float().numpy()
    return (hps_ms.data.sampling_rate, audio)


if __name__ == '__main__':
    # --- Model setup -------------------------------------------------------
    # hps_ms and net_g_ms are module globals read by speech_synthesize() and
    # voice_convert(), so their names must stay as they are.
    hps_ms = utils.get_hparams_from_file('model/config.json')
    speaker_names = hps_ms.speakers

    net_g_ms = SynthesizerTrn(
        len(hps_ms.symbols),
        hps_ms.data.filter_length // 2 + 1,
        hps_ms.train.segment_size // hps_ms.data.hop_length,
        n_speakers=hps_ms.data.n_speakers,
        **hps_ms.model,
    )
    net_g_ms.eval()
    utils.load_checkpoint('model/model.pth', net_g_ms)

    # --- Static page text --------------------------------------------------
    header_md = (
        '# Sanskrit Text to Speech\n'
        '![visitor badge](https://visitor-badge.glitch.me/badge?page_id=cjangcjengh.sanskrit-tts)'
    )
    footer_md = (
        '## Based on\n'
        '- [VITS](https://github.com/jaywalnut310/vits)\n\n'
        '## Dataset\n'
        '- [Vāksañcayaḥ](https://www.cse.iitb.ac.in/~asr/)'
    )

    # --- UI ----------------------------------------------------------------
    # Components are created in the order they should appear on the page.
    with gr.Blocks() as app:
        gr.Markdown(header_md)

        with gr.Tab('Text to Speech'):
            text_script = gr.Radio(
                ['Devanagari', 'IAST', 'SLP1', 'HK'],
                label='Script',
                interactive=True,
                value='Devanagari',
            )
            text_input = gr.TextArea(
                label='Text',
                placeholder='Type your text here',
                value=DEFAULT_TEXT,
            )
            speaker_id = gr.Dropdown(
                speaker_names,
                label='Speaker',
                type='index',
                interactive=True,
                value=speaker_names[0],
            )
            length_scale = gr.Slider(
                0.5, 2, 1,
                step=0.1,
                label='Speaking Speed',
                interactive=True,
            )
            tts_button = gr.Button('Synthesize')
            audio_output = gr.Audio(label='Speech Synthesized')

            # Switching script swaps the text box to the sample text in
            # that script.
            text_script.change(default_text, [text_script], [text_input])
            tts_button.click(
                speech_synthesize,
                [text_input, text_script, speaker_id, length_scale],
                [audio_output],
            )

        with gr.Tab('Voice Conversion'):
            audio_input = gr.Audio(label='Audio', interactive=True)
            speaker_input = gr.Dropdown(
                speaker_names,
                label='Original Speaker',
                type='index',
                interactive=True,
                value=speaker_names[0],
            )
            speaker_output = gr.Dropdown(
                speaker_names,
                label='Target Speaker',
                type='index',
                interactive=True,
                value=speaker_names[0],
            )
            vc_button = gr.Button('Convert')
            audio_output_vc = gr.Audio(label='Voice Converted')
            vc_button.click(
                voice_convert,
                [audio_input, speaker_input, speaker_output],
                [audio_output_vc],
            )

        gr.Markdown(footer_md)

    app.launch()