import gradio as gr
import torch
import numpy as np
from synthesize import synthesize


def text_to_speech(text, speaker_id, cfg_scale, num_sampling_steps):
    """Generate speech for `text` with the pretrained duration and acoustic DiT models."""
    audio, sample_rate = synthesize(
        text=text,
        duration_model_config="./train_duration_dit_s.yaml",
        acoustic_model_config="./train_acoustic_dit_b.yaml",
        duration_model_checkpoint="./duration_model_0120000.pt",
        acoustic_model_checkpoint="./acoustic_model_0140000.pt",
        # The dropdown delivers the speaker ID as a string; cast it to an integer index.
        speaker_id=int(speaker_id),
        cfg_scale=cfg_scale,
        num_sampling_steps=num_sampling_steps,
    )
    # gr.Audio accepts a (sample_rate, waveform) tuple as output.
    return (sample_rate, audio)


# Choices exposed in the UI: 100 speaker IDs and the supported diffusion sampling step counts.
speaker_ids = [str(i) for i in range(100)]
sampling_steps = [100, 250, 500, 1000]

demo = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Text", value="Text to Speech with Diffusion Transformer"),
        gr.Dropdown(choices=speaker_ids, label="Speaker ID", value="0"),
        gr.Slider(minimum=0, maximum=10, value=4.0, label="CFG Scale"),
        gr.Dropdown(choices=sampling_steps, label="Sampling Steps", value=100),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Text to Speech with Diffusion Transformer",
    description="Enter text, select a speaker ID (0-99), and adjust the CFG scale and number of sampling steps to generate speech.",
    flagging_options=None,
)

demo.launch()
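
# Note: demo.launch() serves the interface locally. If a temporary public link is
# wanted (e.g. when running in a notebook or on a remote machine), Gradio's standard
# share option can be used instead:
#   demo.launch(share=True)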