File size: 2,920 Bytes
f23c138
 
 
cea6632
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f23c138
 
 
 
cea6632
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os

import gdown
import gradio as gr
import numpy as np
import torch

from InferenceInterfaces.Meta_FastSpeech2 import Meta_FastSpeech2


def float2pcm(sig, dtype='int16'):
    """
    Convert a floating point signal to integer PCM samples.

    The input is multiplied by half of the target type's value range,
    shifted by an offset (zero for signed types, mid-range for unsigned
    types), clipped to the representable range and cast to the target type.
    With the default ``int16`` target, ``1.0`` therefore saturates at
    ``32767`` and ``-1.0`` maps to ``-32768``.

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    :param sig: array-like of floating point samples
    :param dtype: desired integer dtype of the result (anything ``np.dtype`` accepts)
    :return: numpy array of the requested integer dtype
    :raises TypeError: if ``sig`` is not a float array or ``dtype`` is not an integer type
    """
    samples = np.asarray(sig)
    if samples.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")

    target = np.dtype(dtype)
    if target.kind not in ('i', 'u'):
        raise TypeError("'dtype' must be an integer type")

    limits = np.iinfo(target)
    half_range = 2 ** (limits.bits - 1)
    # 0 for signed targets, half_range for unsigned targets
    zero_level = limits.min + half_range

    scaled = samples * half_range + zero_level
    return np.clip(scaled, limits.min, limits.max).astype(target)


class TTS_Interface:
    """
    Wrapper around ``Meta_FastSpeech2`` used as the callback object of the demo UI.

    On construction it makes sure the two checkpoint files exist locally
    (downloading them with ``gdown`` if needed) and then builds the model on
    the GPU when one is available, otherwise on the CPU.
    """

    # Language names accepted by read() mapped to the IDs handed to model.set_language()
    LANGUAGE_IDS = {
        "English"  : "en",
        "German"   : "de",
        "Greek"    : "el",
        "Spanish"  : "es",
        "Finnish"  : "fi",
        "Russian"  : "ru",
        "Hungarian": "hu",
        "Dutch"    : "nl",
        "French"   : "fr"
        }

    # (download URL, local target path) of every checkpoint fetched at start-up
    CHECKPOINTS = (
        ("https://drive.google.com/uc?id=1-AhjmCR6DDI6rtzPIn9ksOxQyHKf6CbG", "Models/FastSpeech2_Meta/best.pt"),
        ("https://drive.google.com/uc?id=1-5sP-0JDUvKTjxhO3hUVJgArSUjuhU6P", "Models/HiFiGAN_combined/best.pt"),
        )

    # First element of the tuple returned by read()
    # NOTE(review): presumably the sample rate in Hz of the synthesized audio -- confirm against the vocoder
    SAMPLE_RATE = 48000

    def __init__(self):
        """
        Fetch missing checkpoints, then instantiate the model.

        The downloads happen BEFORE the model is constructed: if the
        ``Meta_FastSpeech2`` constructor loads its weights from ``Models/``
        (not visible from this file -- TODO confirm), building it first would
        fail on a fresh checkout where the files do not exist yet.
        """
        for url, target_path in self.CHECKPOINTS:
            os.makedirs(os.path.dirname(target_path), exist_ok=True)
            # skip the download when the checkpoint is already on disk, so
            # restarts do not fetch the same files again
            if not os.path.isfile(target_path):
                gdown.download(url=url, output=target_path)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = Meta_FastSpeech2(device=self.device)

    def read(self, prompt, language):
        """
        Synthesize ``prompt`` in the given language.

        :param prompt: the text that is passed to the model
        :param language: one of the keys of ``LANGUAGE_IDS`` (e.g. ``"English"``)
        :return: tuple ``(SAMPLE_RATE, samples)`` where ``samples`` is the model
                 output converted to an int16 numpy array by ``float2pcm``
        :raises KeyError: if ``language`` is not a key of ``LANGUAGE_IDS``
        """
        self.model.set_language(self.LANGUAGE_IDS[language])
        wav = self.model(prompt)
        # the model output is moved to the CPU and turned into a numpy array
        # before the conversion from float to integer PCM
        return self.SAMPLE_RATE, float2pcm(wav.cpu().numpy())


# Build the TTS wrapper once at start-up; its bound read() method is the UI callback.
meta_model = TTS_Interface()

# Free-text prompt that is handed to the callback as its first argument.
_text_input = gr.inputs.Textbox(lines=2,
                                placeholder="write what you want the synthesis to read here...",
                                label=" ")

# Language choice handed to the callback as its second argument. Every entry
# has to be a name that TTS_Interface.read() can map to a language ID.
_language_input = gr.inputs.Dropdown(['English',
                                      'German',
                                      'Greek',
                                      'Spanish',
                                      'Finnish',
                                      'Russian',
                                      'Hungarian',
                                      'Dutch',
                                      'French'],
                                     type="value",
                                     default='English',
                                     label="Language Selection")

# The callback returns a (48000, int16 array) tuple, which is rendered as audio.
_audio_output = gr.outputs.Audio(type="numpy", label=None)

iface = gr.Interface(fn=meta_model.read,
                     inputs=[_text_input, _language_input],
                     outputs=_audio_output,
                     layout="vertical",
                     title="IMS Toucan Multilingual Multispeaker Demo",
                     thumbnail="Utility/toucan.png",
                     theme="default",
                     allow_flagging="never",
                     allow_screenshot=False)
iface.launch()