# Output location for generated samples.
# NOTE(review): how output_dir and output_name are combined into a final path
# is decided by the consuming script, which is not visible here.
output_dir: "samples/inference"
output_name: "long_video"

# Model sources.
# NOTE(review): the "org/name" form looks like a Hugging Face Hub repo id;
# confirm against the loader.
pretrained_model_path: "TIGER-Lab/ConsistI2V"
# Unset (null); presumably an optional separate UNet checkpoint — TODO confirm.
unet_path: null
# NOTE(review): presumably a key prefix stripped from the UNet checkpoint's
# state dict when unet_path is set — confirm against the loading code.
unet_ckpt_prefix: "module."
# Unset (null); presumably an optional alternative pipeline source —
# TODO confirm.
pipeline_pretrained_path: null

# Per-run sampling settings.
# NOTE(review): the meanings noted below are inferred from key names only;
# verify against the pipeline that consumes this mapping.
sampling_kwargs:
  # Output resolution (presumably in pixels).
  height: 256
  width: 256
  # Presumably the number of frames generated per clip.
  n_frames: 16
  # Presumably the number of denoising steps.
  steps: 50
  ddim_eta: 0.0
  # Separate guidance scales for the text and image conditions (per key names).
  guidance_scale_txt: 7.5
  guidance_scale_img: 1.0
  guidance_rescale: 0.0
  num_videos_per_prompt: 1
  frame_stride: 3
  # NOTE(review): together with output_name "long_video", this suggests clips
  # are chained autoregressively for this many rounds — confirm.
  autoregress_steps: 3

# Extra options for the UNet.
# NOTE(review): semantics are defined by the model code, which is not visible
# here; only the allowed values listed inline are documented in this file.
unet_additional_kwargs:
  variant: null
  # Presumably the number of heads in the temporal attention layers —
  # TODO confirm.
  n_temp_heads: 8
  augment_temporal_attention: true
  temp_pos_embedding: "rotary" # "rotary" or "sinusoidal"
  first_frame_condition_mode: "concat"
  # NOTE(review): presumably pairs with sampling_kwargs.frame_stride — confirm.
  use_frame_stride_condition: true
  noise_sampling_method: "pyoco_mixed" # "vanilla" or "pyoco_mixed" or "pyoco_progressive"
  noise_alpha: 1.0

# Noise scheduler settings.
#
# When using zero terminal SNR, set the last three keys as follows:
#   rescale_betas_zero_snr: true
#   timestep_spacing: "trailing"
#   prediction_type: "v_prediction"
noise_scheduler_kwargs:
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  steps_offset: 1
  clip_sample: false
  rescale_betas_zero_snr: false
  timestep_spacing: "leading"
  prediction_type: "epsilon"


# Frame-initialisation settings (applied only when `enable` is true, per the
# key name).
# NOTE(review): the units/range of noise_level and the exact meaning of
# d_s / d_t (presumably spatial / temporal filter parameters) are defined by
# the consuming code — confirm there.
frameinit_kwargs:
  enable: true
  noise_level: 850
  filter_params:
    method: "gaussian"
    d_s: 0.25
    d_t: 0.25