import sys
import subprocess

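# Install the TorToiSe fork at runtime so that tortoise_tts is importable below.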
subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "--force-reinstall", "git+https://github.com/osanseviero/tortoise-tts.git"])

# entmax could not be installed at the same time as torch, so install it in a separate step
subprocess.check_call([sys.executable, "-m", "pip", "install", "entmax"])

from tortoise_tts.api import TextToSpeech
from tortoise_tts.utils.audio import load_audio, get_voices
import torch
import torchaudio
import gradio as gr

device = "cuda" if torch.cuda.is_available() else "cpu"

# This will download all the models used by Tortoise from the HF hub
# and load them onto the selected device.
tts = TextToSpeech(device=device)

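# Voices exposed in the dropdown; each name must be a key in the mapping
# returned by get_voices() below.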
voices = [
  "angie",
  "daniel",
  "deniro",
  "emma",
  "freeman",
  "geralt",
  "halle",
  "jlaw",
  "lj",
  "snakes",
  "William",
]
voice_paths = get_voices()
print(voice_paths)

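# Generation preset: "fast" trades some output quality for shorter inference time.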
preset = "fast"

def inference(text, voice):
    # Keep prompts short: the demo truncates input to 256 characters.
    text = text[:256]
    # Load the reference clips for the selected voice as conditioning audio.
    cond_paths = voice_paths[voice]
    conds = []
    print(voice_paths, voice, cond_paths)
    for cond_path in cond_paths:
        c = load_audio(cond_path, 22050)
        conds.append(c)
    print(text, conds, preset)
    gen = tts.tts_with_preset(text, conds, preset)
    print("generation finished")
    # Tortoise outputs 24 kHz audio; save it to a wav file that Gradio can serve.
    torchaudio.save("generated.wav", gen.squeeze(0).cpu(), 24000)
    return "generated.wav"

text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"
examples = [
    [text, "angie"],
    [text, "emma"],
    ["how are you doing this day", "freeman"]
]

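# Gradio UI: a text prompt and a voice dropdown as inputs, generated speech as output.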
iface = gr.Interface(
  inference,
  inputs=[
      gr.inputs.Textbox(type="str", default=text, label="Text", lines=3),
      gr.inputs.Dropdown(voices, label="Voice"),
  ],
  outputs="audio",
  title="TorToiSe",
  description="A multi-voice TTS system trained with an emphasis on quality",
  article="This demo shows the ultra fast option in the TorToiSe system. For more info check the <a href='https://github.com/neonbjb/tortoise-tts' target='_blank'>Repository</a>.",
  enable_queue=True,
  examples=examples,
)

iface.launch(cache_examples=True)
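
# Note: launch() blocks and serves the app; when running locally, open the printed URL.
# A CUDA GPU is strongly recommended, since Tortoise generation is very slow on CPU.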