experiment:
  tokenizer_checkpoint: "tokenizer_titok_s128.bin"
  generator_checkpoint: "generator_titok_s128.bin"
  output_dir: "titok_s_128"

model:
  vq_model:
    codebook_size: 4096
    token_size: 12
    use_l2_norm: True
    commitment_cost: 0.25
    # vit arch
    vit_enc_model_size: "small"
    vit_dec_model_size: "small"
    vit_enc_patch_size: 16
    vit_dec_patch_size: 16
    num_latent_tokens: 128
    finetune_decoder: True

  generator:
    model_type: "UViT"
    hidden_size: 1024
    num_hidden_layers: 20
    num_attention_heads: 16
    intermediate_size: 4096
    dropout: 0.1
    attn_drop: 0.1
    num_steps: 64
    class_label_dropout: 0.1
    image_seq_len: ${model.vq_model.num_latent_tokens}
    condition_num_classes: 1000

    # sampling hyper-params
    randomize_temperature: 2.8
    guidance_scale: 6.9
    guidance_decay: "power-cosine"

dataset:
  preprocessing:
    crop_size: 256
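
# A minimal sketch (assuming an OmegaConf-based loader; the file name
# "titok_s128.yaml" is illustrative, not part of this config) of reading this
# file and resolving the ${...} interpolation above:
#
#   from omegaconf import OmegaConf
#
#   config = OmegaConf.load("titok_s128.yaml")
#   # ${model.vq_model.num_latent_tokens} resolves on access, so the generator
#   # sequence length tracks the tokenizer's latent token count.
#   assert config.model.generator.image_seq_len == 128
#   print(config.experiment.output_dir)  # titok_s_128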