File size: 751 Bytes
256e1f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Model settings. How each key is interpreted is decided by the code that
# loads this file, which is not visible here.
model:
  # Architecture identifier; presumably looked up by name by the loader
  # - TODO confirm.
  name: DiT-B
  # NOTE(review): differs from optimization.initial_input_size (32); confirm
  # how the two sizes relate.
  input_size: 2048
  embedding_vocab_size: 1024
  learn_sigma: true
# Optimizer and schedule settings. The meaning of each key is defined by the
# training code that consumes this file, which is not visible here.
optimization:
  constant_memory: false
  epochs: 1400
  global_batch_size: 32
  initial_input_size: 32
  # Keep the decimal point and the signed exponent: YAML 1.1 loaders such as
  # PyYAML resolve a bare `1e-4` as a string, but resolve this form as a float.
  learning_rate: 1.0e-4
  min_lr: 1.0e-5
  # NOTE(review): warmup_iters / lr_decay_iters look like iteration counts
  # while epochs is a separate limit - confirm which one ends training.
  warmup_iters: 10000
  lr_decay_iters: 100000
  decay_lr: true
  weight_decay: 0.0
  max_grad_norm: 20.0
  # Nested mapping with two coefficients; presumably the optimizer's
  # (beta1, beta2) pair - TODO confirm.
  betas:
    beta1: 0.9
    beta2: 0.999
  loss:
    num_timesteps: 1000
# Dataset settings; interpreted by the data-loading code, which is not
# visible here.
data:
  # Relative path; presumably resolved against the working directory
  # - TODO confirm.
  data_path: acoustic.npy
  data_dim: 100
  # NOTE(review): presumably the statistics applied when normalize is true
  # - confirm against the loader.
  data_std: 2.0
  data_mean: -1.0
  normalize: true
# Run settings: compilation/precision toggles, seeding, checkpoint and log
# cadence, output location, and experiment tracking. The meaning of each key
# is defined by the training code, which is not visible here.
training:
  enable_compile: true
  use_bfloat16: true
  use_block_mask: false
  seed: 42
  # Written as plain digits. The `10_000` spelling is an integer only under
  # YAML 1.1 (e.g. PyYAML); YAML 1.2 core-schema loaders read it as the
  # string "10_000".
  ckpt_every: 10000
  log_every: 100
  results_dir: results/acoustic
  # Explicit null: no checkpoint path is set. Presumably the run then starts
  # from scratch - TODO confirm.
  resume_from_ckpt: null
  wandb:
    enable: true
    project: diffusion-speech