---
# Training configuration, restored to block style (one key per line,
# 2-space indentation) from a form in which every key sat on one line.
# NOTE(review): the nesting below is inferred from key order and from which
# keys carry no value of their own; confirm it against the code that loads
# this file.

model:
  name: DiT-S
  input_size: 512
  embedding_vocab_size: 1024
  learn_sigma: true

optimization:
  constant_memory: false
  epochs: 1400
  global_batch_size: 512
  initial_input_size: 32
  learning_rate: 1.0e-4
  min_lr: 1.0e-5
  warmup_iters: 10000
  lr_decay_iters: 100000
  decay_lr: true
  weight_decay: 0.0
  max_grad_norm: 20.0
  betas:
    beta1: 0.9
    beta2: 0.999

# NOTE(review): `loss` is assumed to be a top-level section rather than a
# child of `optimization` - confirm.
loss:
  num_timesteps: 1000

data:
  data_path: duration.npy
  data_dim: 1
  data_std: 0.067776896
  data_mean: 0.08663661
  normalize: true

training:
  enable_compile: true
  use_bfloat16: true
  use_block_mask: false
  seed: 42
  # Written without a digit separator: `10_000` is an integer only under
  # YAML 1.1 resolution; YAML 1.2 core-schema loaders read it as a string.
  ckpt_every: 10000
  log_every: 100
  results_dir: results/duration
  resume_from_ckpt: null

# NOTE(review): `wandb` is assumed to be a top-level section rather than a
# child of `training` - confirm.
wandb:
  enable: true
  project: diffusion-speech