Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,320 Bytes
3e75f8e 256e1f6 3e75f8e 256e1f6 a6e518a 256e1f6 3e75f8e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
import gradio as gr
import torch
import numpy as np
from synthesize import synthesize
def text_to_speech(text, speaker_id, cfg_scale, num_sampling_steps):
    """Synthesize speech for *text* with the pretrained diffusion TTS models.

    Args:
        text: Input text to vocalize.
        speaker_id: Speaker identity selected in the UI (string "0".."99").
        cfg_scale: Classifier-free guidance scale for sampling.
        num_sampling_steps: Number of diffusion sampling steps.

    Returns:
        A ``(sample_rate, waveform)`` tuple, the format ``gr.Audio`` expects.
    """
    # Model configs/checkpoints are fixed; only the UI-controlled knobs vary.
    waveform, rate = synthesize(
        text=text,
        duration_model_config="./train_duration_dit_s.yaml",
        acoustic_model_config="./train_acoustic_dit_b.yaml",
        duration_model_checkpoint="./duration_model_0120000.pt",
        acoustic_model_checkpoint="./acoustic_model_0140000.pt",
        speaker_id=speaker_id,
        cfg_scale=cfg_scale,
        num_sampling_steps=num_sampling_steps,
    )
    return rate, waveform
# Dropdown choices: speaker identities "0".."99" and the supported
# diffusion sampling step counts.
speaker_ids = list(map(str, range(100)))
sampling_steps = [100, 250, 500, 1000]
# Assemble the Gradio UI: one text box plus generation controls in,
# synthesized audio out. Control order must match text_to_speech's signature.
_input_widgets = [
    gr.Textbox(label="Text", value="Text to Speech with Diffusion Transformer"),
    gr.Dropdown(choices=speaker_ids, label="Speaker ID", value="0"),
    gr.Slider(minimum=0, maximum=10, value=4.0, label="CFG Scale"),
    gr.Dropdown(choices=sampling_steps, label="Sampling Steps", value=100),
]

demo = gr.Interface(
    fn=text_to_speech,
    inputs=_input_widgets,
    outputs=gr.Audio(label="Generated Speech"),
    title="Text to Speech with Diffusion Transformer",
    description="Enter text, select a speaker ID (0-99), and adjust the CFG scale to generate speech.",
    flagging_options=None,
)

demo.launch()
|