# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
A minimal training script for DiT using PyTorch DDP.
"""
import argparse
import gc
import logging
import math
import os
import shutil
from copy import deepcopy
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

import numpy as np
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader

import accelerate
import diffusers
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
from diffusers import DDPMScheduler, PNDMScheduler
from diffusers.optimization import get_scheduler
from diffusers.training_utils import EMAModel, compute_snr
from diffusers.utils import check_min_version, is_wandb_available
from einops import rearrange
from huggingface_hub import create_repo
from packaging import version
from tqdm.auto import tqdm
from transformers import AutoTokenizer, HfArgumentParser, TrainingArguments

from opensora.dataset import ae_denorm, getdataset
from opensora.models.ae import ae_channel_config, ae_stride_config, getae, getae_wrapper
from opensora.models.ae.videobase import CausalVAEModelWrapper, CausalVQVAEModelWrapper
from opensora.models.diffusion import Diffusion_models
from opensora.models.diffusion.diffusion import create_diffusion_T as create_diffusion
from opensora.models.diffusion.latte.modeling_latte import LatteT2V
from opensora.models.text_encoder import get_text_enc, get_text_warpper
from opensora.sample.pipeline_videogen import VideoGenPipeline
from opensora.utils.dataset_utils import Collate
from opensora.utils.utils import print_grad_norm
# Will error if the minimal version of diffusers is not installed. Remove at your own risk.
check_min_version("0.24.0")

logger = get_logger(__name__)

def log_validation(args, model, vae, text_encoder, tokenizer, accelerator, weight_dtype, global_step):
    validation_prompt = [
        "A quiet beach at dawn, the waves gently lapping at the shore and the sky painted in pastel hues.",
        "The majestic beauty of a waterfall cascading down a cliff into a serene lake."
    ]
    logger.info("Running validation...\n")
    model = accelerator.unwrap_model(model)
    scheduler = PNDMScheduler()
    videogen_pipeline = VideoGenPipeline(vae=vae,
                                         text_encoder=text_encoder,
                                         tokenizer=tokenizer,
                                         scheduler=scheduler,
                                         transformer=model).to(device=accelerator.device)

    videos = []
    for prompt in validation_prompt:
        logger.info('Processing the ({}) prompt'.format(prompt))
        video = videogen_pipeline(prompt,
                                  num_frames=args.num_frames,
                                  height=args.max_image_size,
                                  width=args.max_image_size,
                                  num_inference_steps=args.num_sampling_steps,
                                  guidance_scale=args.guidance_scale,
                                  enable_temporal_attentions=True,
                                  num_images_per_prompt=1,
                                  mask_feature=True,
                                  ).video
        videos.append(video[0])
    gc.collect()
    torch.cuda.empty_cache()
    videos = torch.stack(videos).numpy()
    videos = rearrange(videos, 'b t h w c -> b t c h w')
    for tracker in accelerator.trackers:
        if tracker.name == "tensorboard":
            np_videos = np.stack([np.asarray(vid) for vid in videos])
            tracker.writer.add_video("validation", np_videos, global_step, fps=24)
        if tracker.name == "wandb":
            import wandb
            tracker.log(
                {
                    "validation": [
                        wandb.Video(video, caption=f"{i}: {prompt}", fps=24)
                        for i, (video, prompt) in enumerate(zip(videos, validation_prompt))
                    ]
                }
            )

    del videogen_pipeline
    gc.collect()
    torch.cuda.empty_cache()

#################################################################################
#                                 Training Loop                                 #
#################################################################################

def main(args):
    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)

    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
        log_with=args.report_to,
        project_config=accelerator_project_config,
    )

    if args.report_to == "wandb":
        if not is_wandb_available():
            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state, main_process_only=False)
    if accelerator.is_local_main_process:
        transformers.utils.logging.set_verbosity_warning()
        diffusers.utils.logging.set_verbosity_info()
    else:
        transformers.utils.logging.set_verbosity_error()
        diffusers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
        # if args.push_to_hub:
        #     repo_id = create_repo(
        #         repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
        #     ).repo_id

    # For mixed precision training we cast all non-trainable weights to half-precision,
    # as these weights are only used for inference; keeping them in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    # Create model:
    diffusion = create_diffusion(timestep_respacing="")  # default: 1000 steps, linear noise schedule
    kwargs = {}
    ae = getae_wrapper(args.ae)(args.ae_path, cache_dir=args.cache_dir, **kwargs).eval()
    if args.enable_tiling:
        ae.vae.enable_tiling()
        ae.vae.tile_overlap_factor = args.tile_overlap_factor
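        # Tiled encoding/decoding trades compute for memory by processing the sample
        # in overlapping spatial tiles; tile_overlap_factor sets how much neighboring
        # tiles overlap (our reading of the videobase VAE wrapper's tiling support).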
    kwargs = {'load_in_8bit': args.enable_8bit_t5, 'torch_dtype': weight_dtype, 'low_cpu_mem_usage': True}
    text_enc = get_text_warpper(args.text_encoder_name)(args, **kwargs).eval()

    ae_stride_t, ae_stride_h, ae_stride_w = ae_stride_config[args.ae]
    ae.vae_scale_factor = ae_stride_config[args.ae]
    assert ae_stride_h == ae_stride_w, f"Support only ae_stride_h == ae_stride_w now, but found ae_stride_h ({ae_stride_h}), ae_stride_w ({ae_stride_w})"
    args.ae_stride_t, args.ae_stride_h, args.ae_stride_w = ae_stride_t, ae_stride_h, ae_stride_w
    args.ae_stride = args.ae_stride_h
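    # The last three characters of the model name encode the patch size; for example,
    # the default "Latte-XL/122" gives patch_size_t=1 and patch_size_h = patch_size_w = 2.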
    patch_size = args.model[-3:]
    patch_size_t, patch_size_h, patch_size_w = int(patch_size[0]), int(patch_size[1]), int(patch_size[2])
    args.patch_size = patch_size_h
    args.patch_size_t, args.patch_size_h, args.patch_size_w = patch_size_t, patch_size_h, patch_size_w
    assert patch_size_h == patch_size_w, f"Support only patch_size_h == patch_size_w now, but found patch_size_h ({patch_size_h}), patch_size_w ({patch_size_w})"
    # assert args.num_frames % ae_stride_t == 0, f"Num_frames must be divisible by ae_stride_t, but found num_frames ({args.num_frames}), ae_stride_t ({ae_stride_t})."
    assert args.max_image_size % ae_stride_h == 0, f"Image size must be divisible by ae_stride_h, but found max_image_size ({args.max_image_size}), ae_stride_h ({ae_stride_h})."

    args.stride_t = ae_stride_t * patch_size_t
    args.stride = ae_stride_h * patch_size_h
    latent_size = (args.max_image_size // ae_stride_h, args.max_image_size // ae_stride_w)
    ae.latent_size = latent_size
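    # Causal autoencoders encode the first frame on its own and therefore yield one
    # extra latent frame, hence the +1 below (our reading of the wrapper's
    # temporal-stride convention).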
    if getae_wrapper(args.ae) == CausalVQVAEModelWrapper or getae_wrapper(args.ae) == CausalVAEModelWrapper:
        args.video_length = video_length = args.num_frames // ae_stride_t + 1
    else:
        video_length = args.num_frames // ae_stride_t
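    # out_channels is twice in_channels because create_diffusion defaults to a
    # learned-variance objective (as in DiT): the model predicts both the noise and
    # the per-channel variance of the reverse process.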
    model = Diffusion_models[args.model](
        in_channels=ae_channel_config[args.ae],
        out_channels=ae_channel_config[args.ae] * 2,
        # caption_channels=4096,
        # cross_attention_dim=1152,
        attention_bias=True,
        sample_size=latent_size,
        num_vector_embeds=None,
        activation_fn="gelu-approximate",
        num_embeds_ada_norm=1000,
        use_linear_projection=False,
        only_cross_attention=False,
        double_self_attention=False,
        upcast_attention=False,
        # norm_type="ada_norm_single",
        norm_elementwise_affine=False,
        norm_eps=1e-6,
        attention_type='default',
        video_length=video_length,
        attention_mode=args.attention_mode,
        compress_kv_factor=args.compress_kv_factor,
        use_rope=args.use_rope,
        model_max_length=args.model_max_length,
    )
    model.gradient_checkpointing = args.gradient_checkpointing

    # Load a pretrained checkpoint, if one was given.
    if args.pretrained:
        if 'safetensors' in args.pretrained:
            from safetensors.torch import load_file as safe_load
            checkpoint = safe_load(args.pretrained, device="cpu")
        else:
            checkpoint = torch.load(args.pretrained, map_location='cpu')['model']
        model_state_dict = model.state_dict()
        missing_keys, unexpected_keys = model.load_state_dict(checkpoint, strict=False)
        logger.info(f'missing_keys {len(missing_keys)} {missing_keys}, unexpected_keys {len(unexpected_keys)}')
        logger.info(f'Successfully loaded {len(model_state_dict) - len(missing_keys)}/{len(model_state_dict)} keys from {args.pretrained}!')

    # Freeze vae and text encoders.
    ae.requires_grad_(False)
    text_enc.requires_grad_(False)
    # Set model as trainable.
    model.train()

    # Move ae and text_encoder to device and cast to weight_dtype.
    # ae.to(accelerator.device, dtype=torch.float32)  # keep the VAE in float32 to avoid NaN losses
    ae.to(accelerator.device, dtype=weight_dtype)
    text_enc.to(accelerator.device, dtype=weight_dtype)

    # Create EMA for the model.
    if args.use_ema:
        ema_model = deepcopy(model)
        ema_model = EMAModel(ema_model.parameters(), model_cls=LatteT2V, model_config=ema_model.config)
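        # EMAModel clones the parameters into its own shadow copy, so the deepcopy
        # above only seeds the initial EMA values and is free to be garbage-collected.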
    # `accelerate` 0.16.0 will have better support for customized saving
    if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
        # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
        def save_model_hook(models, weights, output_dir):
            if accelerator.is_main_process:
                if args.use_ema:
                    ema_model.save_pretrained(os.path.join(output_dir, "model_ema"))

                for i, model in enumerate(models):
                    model.save_pretrained(os.path.join(output_dir, "model"))
                    if weights:  # Don't pop if empty
                        # make sure to pop weight so that corresponding model is not saved again
                        weights.pop()

        def load_model_hook(models, input_dir):
            if args.use_ema:
                load_model = EMAModel.from_pretrained(os.path.join(input_dir, "model_ema"), LatteT2V)
                ema_model.load_state_dict(load_model.state_dict())
                ema_model.to(accelerator.device)
                del load_model

            for i in range(len(models)):
                # pop models so that they are not loaded again
                model = models.pop()

                # load diffusers style into model
                load_model = LatteT2V.from_pretrained(input_dir, subfolder="model")
                model.register_to_config(**load_model.config)

                model.load_state_dict(load_model.state_dict())
                del load_model

        accelerator.register_save_state_pre_hook(save_model_hook)
        accelerator.register_load_state_pre_hook(load_model_hook)

    # Enable TF32 for faster training on Ampere GPUs,
    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
    if args.allow_tf32:
        torch.backends.cuda.matmul.allow_tf32 = True

    if args.scale_lr:
        args.learning_rate = (
            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
        )
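    # e.g. with learning_rate=1e-4, train_batch_size=16, 8 processes and no
    # accumulation, the scaled rate would be 1e-4 * 1 * 16 * 8 = 1.28e-2
    # (illustrative numbers, not a recommendation).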
    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
    if args.use_8bit_adam:
        try:
            import bitsandbytes as bnb
        except ImportError:
            raise ImportError(
                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
            )

        optimizer_class = bnb.optim.AdamW8bit
    else:
        optimizer_class = torch.optim.AdamW

    # Optimizer creation
    params_to_optimize = model.parameters()
    optimizer = optimizer_class(
        params_to_optimize,
        lr=args.learning_rate,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
    )

    # Setup data:
    train_dataset = getdataset(args)
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=Collate(args),
        batch_size=args.train_batch_size,
        num_workers=args.dataloader_num_workers,
    )

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

    lr_scheduler = get_scheduler(
        args.lr_scheduler,
        optimizer=optimizer,
        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
    )
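    # Following the diffusers convention, the warmup/training counts are multiplied
    # by gradient_accumulation_steps because lr_scheduler.step() is called on every
    # micro-batch in the loop below, not once per optimizer update.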
    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initialize automatically on the main process.
    if accelerator.is_main_process:
        accelerator.init_trackers(args.output_dir, config=vars(args))

    # Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
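    # e.g. train_batch_size=16 on 8 processes with gradient_accumulation_steps=1
    # gives an effective batch of 16 * 8 * 1 = 128 samples per optimizer update
    # (illustrative numbers).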
logger.info("***** Running training *****") | |
logger.info(f" Num examples = {len(train_dataset)}") | |
logger.info(f" Num Epochs = {args.num_train_epochs}") | |
logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") | |
logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") | |
logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") | |
logger.info(f" Total optimization steps = {args.max_train_steps}") | |
global_step = 0 | |
first_epoch = 0 | |
# Potentially load in the weights and states from a previous save | |
if args.resume_from_checkpoint: | |
if args.resume_from_checkpoint != "latest": | |
path = os.path.basename(args.resume_from_checkpoint) | |
else: | |
# Get the most recent checkpoint | |
dirs = os.listdir(args.output_dir) | |
dirs = [d for d in dirs if d.startswith("checkpoint")] | |
dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) | |
path = dirs[-1] if len(dirs) > 0 else None | |
if path is None: | |
accelerator.print( | |
f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." | |
) | |
args.resume_from_checkpoint = None | |
initial_global_step = 0 | |
else: | |
accelerator.print(f"Resuming from checkpoint {path}") | |
accelerator.load_state(os.path.join(args.output_dir, path)) | |
global_step = int(path.split("-")[1]) | |
initial_global_step = global_step | |
first_epoch = global_step // num_update_steps_per_epoch | |
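            # e.g. a directory named "checkpoint-3000" resumes with global_step=3000;
            # the step count is parsed from the directory name.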
    else:
        initial_global_step = 0

    progress_bar = tqdm(
        range(0, args.max_train_steps),
        initial=initial_global_step,
        desc="Steps",
        # Only show the progress bar once on each machine.
        disable=not accelerator.is_local_main_process,
    )

    for epoch in range(first_epoch, args.num_train_epochs):
        train_loss = 0.0
        for step, (x, attn_mask, input_ids, cond_mask) in enumerate(train_dataloader):
            with accelerator.accumulate(model):
                if not args.multi_scale:
                    assert torch.all(attn_mask)
                x = x.to(accelerator.device, dtype=weight_dtype)  # B C T+num_images H W
                attn_mask = attn_mask.to(accelerator.device)  # B L or B 1+num_images L
                input_ids = input_ids.to(accelerator.device)  # B L or B 1+num_images L
                cond_mask = cond_mask.to(accelerator.device)  # B L or B 1+num_images L

                with torch.no_grad():
                    # Encode one sample at a time to avoid OOM, because T5 is too huge...
                    B, _, _ = input_ids.shape  # B 1+num_images L
                    cond = torch.stack([text_enc(input_ids[i], cond_mask[i]) for i in range(B)])  # B 1+num_images L D

                    # Map input images to latent space + normalize latents
                    if args.use_image_num == 0:
                        x = ae.encode(x)  # B C T H W
                    else:
                        videos, images = x[:, :, :-args.use_image_num], x[:, :, -args.use_image_num:]
                        videos = ae.encode(videos)  # B C T H W
                        images = rearrange(images, 'b c t h w -> (b t) c 1 h w')
                        images = ae.encode(images)
                        images = rearrange(images, '(b t) c 1 h w -> b c t h w', t=args.use_image_num)
                        x = torch.cat([videos, images], dim=2)  # B C T+num_images H W (latent)
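                        # e.g. with num_frames=17, ae_stride_t=4 (causal VAE) and
                        # use_image_num=4, dim 2 holds 17 // 4 + 1 + 4 = 9 latent
                        # frames (illustrative numbers).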
                model_kwargs = dict(encoder_hidden_states=cond, attention_mask=attn_mask,
                                    encoder_attention_mask=cond_mask, use_image_num=args.use_image_num)
                t = torch.randint(0, diffusion.num_timesteps, (x.shape[0],), device=accelerator.device)
                loss_dict = diffusion.training_losses(model, x, t, model_kwargs)
                loss = loss_dict["loss"].mean()

                # Gather the losses across all processes for logging (if we use distributed training).
                avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
                train_loss += avg_loss.item() / args.gradient_accumulation_steps

                # Backpropagate
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    params_to_clip = model.parameters()
                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                if args.use_ema:
                    # Keep the EMA shadow weights in sync with the online weights;
                    # without this step the EMA copy would never move.
                    ema_model.step(model.parameters())
                progress_bar.update(1)
                global_step += 1
                accelerator.log({"train_loss": train_loss}, step=global_step)
                train_loss = 0.0
                if args.use_deepspeed or accelerator.is_main_process:
                    if global_step % args.checkpointing_steps == 0:
                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
                        if args.checkpoints_total_limit is not None:
                            checkpoints = os.listdir(args.output_dir)
                            checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
                            checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))

                            # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
                            if len(checkpoints) >= args.checkpoints_total_limit:
                                num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
                                removing_checkpoints = checkpoints[0:num_to_remove]

                                logger.info(
                                    f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
                                )
                                logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")

                                for removing_checkpoint in removing_checkpoints:
                                    removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
                                    shutil.rmtree(removing_checkpoint)

                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
                        accelerator.save_state(save_path)
                        logger.info(f"Saved state to {save_path}")
logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} | |
progress_bar.set_postfix(**logs) | |
if global_step >= args.max_train_steps: | |
break | |
        if accelerator.is_main_process:
            if global_step % args.checkpointing_steps == 0:
                if args.use_ema:
                    # Store the online parameters temporarily and load the EMA parameters to perform inference.
                    ema_model.store(model.parameters())
                    ema_model.copy_to(model.parameters())
                if args.enable_tracker:
                    log_validation(args, model, ae, text_enc.text_enc, train_dataset.tokenizer,
                                   accelerator, weight_dtype, global_step)
                if args.use_ema:
                    # Switch back to the online parameters after validation.
                    ema_model.restore(model.parameters())

    accelerator.wait_for_everyone()
    accelerator.end_training()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", type=str, required=True)
parser.add_argument("--video_data", type=str, required='') | |
parser.add_argument("--image_data", type=str, default='') | |
parser.add_argument("--sample_rate", type=int, default=1) | |
parser.add_argument("--num_frames", type=int, default=17) | |
parser.add_argument("--max_image_size", type=int, default=512) | |
parser.add_argument("--use_img_from_vid", action="store_true") | |
parser.add_argument("--use_image_num", type=int, default=0) | |
parser.add_argument("--model_max_length", type=int, default=300) | |
parser.add_argument('--enable_8bit_t5', action='store_true') | |
parser.add_argument('--tile_overlap_factor', type=float, default=0.25) | |
parser.add_argument('--enable_tiling', action='store_true') | |
parser.add_argument("--compress_kv", action="store_true") | |
parser.add_argument("--attention_mode", type=str, choices=['xformers', 'math', 'flash'], default="xformers") | |
parser.add_argument('--use_rope', action='store_true') | |
parser.add_argument('--compress_kv_factor', type=int, default=1) | |
parser.add_argument("--model", type=str, choices=list(Diffusion_models.keys()), default="Latte-XL/122") | |
parser.add_argument("--pretrained", type=str, default=None) | |
parser.add_argument("--ae", type=str, default="stabilityai/sd-vae-ft-mse") | |
parser.add_argument("--ae_path", type=str, default="stabilityai/sd-vae-ft-mse") | |
parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl') | |
parser.add_argument("--cache_dir", type=str, default='./cache_dir') | |
parser.add_argument("--num_sampling_steps", type=int, default=50) | |
parser.add_argument('--guidance_scale', type=float, default=5.5) | |
parser.add_argument("--multi_scale", action="store_true") | |
parser.add_argument("--enable_tracker", action="store_true") | |
parser.add_argument("--use_deepspeed", action="store_true") | |
parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") | |
parser.add_argument( | |
"--output_dir", | |
type=str, | |
default=None, | |
help="The output directory where the model predictions and checkpoints will be written.", | |
) | |
parser.add_argument( | |
"--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." | |
) | |
parser.add_argument("--num_train_epochs", type=int, default=100) | |
parser.add_argument( | |
"--max_train_steps", | |
type=int, | |
default=None, | |
help="Total number of training steps to perform. If provided, overrides num_train_epochs.", | |
) | |
parser.add_argument( | |
"--checkpointing_steps", | |
type=int, | |
default=500, | |
help=( | |
"Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" | |
" checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" | |
" training using `--resume_from_checkpoint`." | |
), | |
) | |
parser.add_argument( | |
"--checkpoints_total_limit", | |
type=int, | |
default=None, | |
help=("Max number of checkpoints to store."), | |
) | |
parser.add_argument( | |
"--resume_from_checkpoint", | |
type=str, | |
default=None, | |
help=( | |
"Whether training should be resumed from a previous checkpoint. Use a path saved by" | |
' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' | |
), | |
) | |
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--gradient_checkpointing",
        action="store_true",
        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=1e-4,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--scale_lr",
        action="store_true",
        default=False,
        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
    )
    parser.add_argument(
        "--lr_scheduler",
        type=str,
        default="constant",
        help=(
            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
            ' "constant", "constant_with_warmup"]'
        ),
    )
    parser.add_argument(
        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
    )
    parser.add_argument(
        "--timestep_bias_strategy",
        type=str,
        default="none",
        choices=["earlier", "later", "range", "none"],
        help=(
            "The timestep bias strategy, which may help direct the model toward learning low or high frequency details."
            " Choices: ['earlier', 'later', 'range', 'none']."
            " The default is 'none', which means no bias is applied, and training proceeds normally."
            " The value of 'later' will increase the frequency of the model's final training timesteps."
        ),
    )
    parser.add_argument(
        "--timestep_bias_multiplier",
        type=float,
        default=1.0,
        help=(
            "The multiplier for the bias. Defaults to 1.0, which means no bias is applied."
            " A value of 2.0 will double the weight of the bias, and a value of 0.5 will halve it."
        ),
    )
    parser.add_argument(
        "--timestep_bias_begin",
        type=int,
        default=0,
        help=(
            "When using `--timestep_bias_strategy=range`, the beginning (inclusive) timestep to bias."
            " Defaults to zero, which equates to having no specific bias."
        ),
    )
    parser.add_argument(
        "--timestep_bias_end",
        type=int,
        default=1000,
        help=(
            "When using `--timestep_bias_strategy=range`, the final timestep (inclusive) to bias."
            " Defaults to 1000, which is the number of timesteps that Stable Diffusion is trained on."
        ),
    )
    parser.add_argument(
        "--timestep_bias_portion",
        type=float,
        default=0.25,
        help=(
            "The portion of timesteps to bias. Defaults to 0.25, meaning a quarter of the timesteps will be biased."
            " A value of 0.5 will bias one half of the timesteps. The value provided for `--timestep_bias_strategy` determines"
            " whether the biased portions are in the earlier or later timesteps."
        ),
    )
    parser.add_argument(
        "--snr_gamma",
        type=float,
        default=None,
        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
        "More details here: https://arxiv.org/abs/2303.09556.",
    )
    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
    parser.add_argument(
        "--allow_tf32",
        action="store_true",
        help=(
            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
        ),
    )
    parser.add_argument(
        "--dataloader_num_workers",
        type=int,
        default=10,
        help=(
            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
        ),
    )
    parser.add_argument(
        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
    )
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
    parser.add_argument(
        "--prediction_type",
        type=str,
        default=None,
        help="The prediction_type to use for training. Choose between 'epsilon' or 'v_prediction', or leave `None`. If left to `None`, the default prediction type of the scheduler (`noise_scheduler.config.prediction_type`) is used.",
    )
    parser.add_argument(
        "--hub_model_id",
        type=str,
        default=None,
        help="The name of the repository to keep in sync with the local `output_dir`.",
    )
    parser.add_argument(
        "--logging_dir",
        type=str,
        default="logs",
        help=(
            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
        ),
    )
    parser.add_argument(
        "--report_to",
        type=str,
        default="tensorboard",
        help=(
            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
        ),
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16"],
        help=(
            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
            " 1.10 and an Nvidia Ampere GPU. Defaults to the value of the accelerate config of the current system or the"
            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
        ),
    )
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") | |
parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") | |
args = parser.parse_args() | |
main(args) | |