File size: 3,371 Bytes
f23c138
 
 
cea6632
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f23c138
 
1d606bc
 
ed7f208
 
cea6632
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d606bc
cea6632
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
404e64b
1d606bc
404e64b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os

import gdown
import gradio as gr
import numpy as np
import torch

from InferenceInterfaces.Meta_FastSpeech2 import Meta_FastSpeech2


def float2pcm(sig, dtype='int16'):
    """Convert a floating-point signal in [-1.0, 1.0] to integer PCM samples.

    Adapted from:
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    Args:
        sig: array-like of floats (any shape); values outside [-1, 1] are clipped.
        dtype: target integer dtype, e.g. 'int16' (signed) or 'uint8' (unsigned).

    Returns:
        np.ndarray of the requested integer dtype, same shape as ``sig``.

    Raises:
        TypeError: if ``sig`` is not a float array or ``dtype`` is not an integer type.
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")
    # Map [-1, 1] onto the full integer range: scale by half the value span,
    # then shift so that 0.0 lands on the midpoint of unsigned dtypes.
    info = np.iinfo(dtype)
    half_span = 2 ** (info.bits - 1)
    midpoint = info.min + half_span
    scaled = sig * half_span + midpoint
    return scaled.clip(info.min, info.max).astype(dtype)


class TTS_Interface:
    """Thin wrapper around the multilingual Meta_FastSpeech2 TTS model.

    On construction, downloads the FastSpeech2 and HiFiGAN checkpoints into
    ``Models/`` (skipped when already present) and loads the model on GPU if
    available, else CPU.
    """

    # Display name -> ISO 639-1 code accepted by Meta_FastSpeech2.set_language().
    # Class-level so the table is built once, not on every read() call.
    LANGUAGE_ID_LOOKUP = {
        "English"  : "en",
        "German"   : "de",
        "Greek"    : "el",
        "Spanish"  : "es",
        "Finnish"  : "fi",
        "Russian"  : "ru",
        "Hungarian": "hu",
        "Dutch"    : "nl",
        "French"   : "fr"
        }

    def __init__(self):
        os.makedirs("Models/HiFiGAN_combined", exist_ok=True)
        os.makedirs("Models/FastSpeech2_Meta", exist_ok=True)
        # Only download each checkpoint if it is not already on disk; the
        # original unconditionally re-downloaded both files on every startup.
        if not os.path.exists("Models/FastSpeech2_Meta/best.pt"):
            gdown.download(id="1-AhjmCR6DDI6rtzPIn9ksOxQyHKf6CbG", output="Models/FastSpeech2_Meta/best.pt")
        if not os.path.exists("Models/HiFiGAN_combined/best.pt"):
            gdown.download(id="1-5sP-0JDUvKTjxhO3hUVJgArSUjuhU6P", output="Models/HiFiGAN_combined/best.pt")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = Meta_FastSpeech2(device=self.device)

    def read(self, prompt, language):
        """Synthesize ``prompt`` in ``language``.

        Args:
            prompt: text to be read by the model.
            language: display name, must be a key of ``LANGUAGE_ID_LOOKUP``.

        Returns:
            (sample_rate, pcm) tuple in the format Gradio's numpy audio
            output expects: 48000 Hz and an int16 numpy array.

        Raises:
            KeyError: if ``language`` is not a supported display name.
        """
        self.model.set_language(self.LANGUAGE_ID_LOOKUP[language])
        wav = self.model(prompt)
        # NOTE(review): 48 kHz is asserted here, not read from the model —
        # confirm it matches the vocoder's actual output rate.
        return 48000, float2pcm(wav.cpu().numpy())


# Instantiate once at import time: downloads checkpoints and loads the model,
# so startup cost is paid before the web UI comes up.
meta_model = TTS_Interface()
# HTML blurb rendered below the interface.
article = "<p style='text-align: left'>This is still a work in progress, models will be exchanged for better ones as soon as they are done. All of those languages are spoken by a single model. Speakers can be transferred across languages. More languages will be added soon.</p><p style='text-align: center'><a href='https://github.com/DigitalPhonetics/IMS-Toucan' target='_blank'>Click here to learn more about the IMS Toucan Speech Synthesis Toolkit</a></p>"

# Gradio demo wiring (legacy pre-3.x API: gr.inputs/gr.outputs, allow_screenshot).
# Dropdown choices must match the keys of the language lookup used by read().
iface = gr.Interface(fn=meta_model.read,
                     inputs=[gr.inputs.Textbox(lines=2, placeholder="write what you want the synthesis to read here...", label=" "),
                             gr.inputs.Dropdown(['English',
                                                 'German',
                                                 'Greek',
                                                 'Spanish',
                                                 'Finnish',
                                                 'Russian',
                                                 'Hungarian',
                                                 'Dutch',
                                                 'French'], type="value", default='English', label="Language Selection")],
                     outputs=gr.outputs.Audio(type="numpy", label=None),
                     layout="vertical",
                     title="IMS Toucan Multilingual Multispeaker Demo",
                     thumbnail="Utility/toucan.png",
                     theme="default",
                     allow_flagging="never",
                     allow_screenshot=False,
                     article=article)
# Queueing serializes synthesis requests so concurrent users don't contend for the model.
iface.launch(enable_queue=True)