"""Gradio demo: text-to-speech with a Diffusion Transformer."""

import gradio as gr
import numpy as np
import spaces
import torch

from synthesize import synthesize

# Model/checkpoint configuration, shared by the warm-up call and the
# Gradio callback so the two can never drift apart.
DURATION_MODEL_CONFIG = "./train_duration_dit_s.yaml"
ACOUSTIC_MODEL_CONFIG = "./train_acoustic_dit_b.yaml"
DURATION_MODEL_CHECKPOINT = "./duration_model_0120000.pt"
ACOUSTIC_MODEL_CHECKPOINT = "./acoustic_model_0140000.pt"


def _synthesize(text, speaker_id, cfg_scale, num_sampling_steps):
    """Run the TTS pipeline with the fixed model config.

    Returns whatever ``synthesize`` returns: an ``(audio, sample_rate)``
    pair.
    """
    return synthesize(
        text=text,
        duration_model_config=DURATION_MODEL_CONFIG,
        acoustic_model_config=ACOUSTIC_MODEL_CONFIG,
        duration_model_checkpoint=DURATION_MODEL_CHECKPOINT,
        acoustic_model_checkpoint=ACOUSTIC_MODEL_CHECKPOINT,
        speaker_id=speaker_id,
        cfg_scale=cfg_scale,
        num_sampling_steps=num_sampling_steps,
    )


# Warm-up: load the models and run one short synthesis at import time so
# the first user request does not pay the model-loading cost.
audio, sample_rate = _synthesize(
    text="Hello", speaker_id=0, cfg_scale=4.0, num_sampling_steps=100
)


@spaces.GPU
def text_to_speech(text, speaker_id, cfg_scale, num_sampling_steps):
    """Gradio callback: synthesize speech and return ``(sample_rate, audio)``.

    ``speaker_id`` arrives as a *string* because the Speaker ID dropdown's
    choices are strings, so it is coerced to ``int`` before reaching the
    model (the warm-up call shows an ``int`` is expected); the sampling
    steps value is coerced likewise for safety.
    """
    audio, sample_rate = _synthesize(
        text=text,
        speaker_id=int(speaker_id),
        cfg_scale=cfg_scale,
        num_sampling_steps=int(num_sampling_steps),
    )
    return (sample_rate, audio)


speaker_ids = [str(i) for i in range(100)]
sampling_steps = [100, 250, 500, 1000]

demo = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Text", value="Text to Speech with Diffusion Transformer"),
        gr.Dropdown(choices=speaker_ids, label="Speaker ID", value="0"),
        gr.Slider(minimum=0, maximum=10, value=4.0, label="CFG Scale"),
        gr.Dropdown(choices=sampling_steps, label="Sampling Steps", value=100),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Text to Speech with Diffusion Transformer",
    description="Enter text, select a speaker ID (0-99), and adjust the CFG scale to generate speech.",
    flagging_options=None,
)

demo.launch()