|
import spaces |
|
|
|
import gradio as gr |
|
|
|
import torch |
|
import os |
|
from glob import glob |
|
from pathlib import Path |
|
from typing import Optional |
|
|
|
from PIL import Image |
|
from diffusers.utils import load_image, export_to_video |
|
from pipeline import StableVideoDiffusionPipeline |
|
|
|
import random |
|
from safetensors import safe_open |
|
from lcm_scheduler import AnimateLCMSVDStochasticIterativeScheduler |
|
|
|
|
|
def get_safetensors_files(): |
|
models_dir = "./safetensors" |
|
safetensors_files = [ |
|
f for f in os.listdir(models_dir) if f.endswith(".safetensors") |
|
] |
|
return safetensors_files |
|
|
|
|
|
def model_select(selected_file): |
|
print("load model weights", selected_file) |
|
pipe.unet.cpu() |
|
file_path = os.path.join("./safetensors", selected_file) |
|
state_dict = {} |
|
with safe_open(file_path, framework="pt", device="cpu") as f: |
|
for key in f.keys(): |
|
state_dict[key] = f.get_tensor(key) |
|
missing, unexpected = pipe.unet.load_state_dict(state_dict, strict=True) |
|
pipe.unet.cuda() |
|
del state_dict |
|
return |
|
|
|
|
|
noise_scheduler = AnimateLCMSVDStochasticIterativeScheduler( |
|
num_train_timesteps=40, |
|
sigma_min=0.002, |
|
sigma_max=700.0, |
|
sigma_data=1.0, |
|
s_noise=1.0, |
|
rho=7, |
|
clip_denoised=False, |
|
) |
|
pipe = StableVideoDiffusionPipeline.from_pretrained( |
|
"stabilityai/stable-video-diffusion-img2vid-xt", |
|
scheduler=noise_scheduler, |
|
torch_dtype=torch.float16, |
|
variant="fp16", |
|
) |
|
pipe.to("cuda") |
|
pipe.enable_model_cpu_offload() |
|
model_select("AnimateLCM-SVD-xt-1.1.safetensors") |
|
|
|
|
|
|
|
max_64_bit_int = 2**63 - 1 |
|
|
|
@spaces.GPU |
|
def sample( |
|
image: Image, |
|
seed: Optional[int] = 42, |
|
randomize_seed: bool = False, |
|
motion_bucket_id: int = 80, |
|
fps_id: int = 8, |
|
max_guidance_scale: float = 1.2, |
|
min_guidance_scale: float = 1, |
|
width: int = 1024, |
|
height: int = 576, |
|
num_inference_steps: int = 4, |
|
decoding_t: int = 4, |
|
output_folder: str = "outputs_gradio", |
|
): |
|
if image.mode == "RGBA": |
|
image = image.convert("RGB") |
|
|
|
if randomize_seed: |
|
seed = random.randint(0, max_64_bit_int) |
|
generator = torch.manual_seed(seed) |
|
|
|
os.makedirs(output_folder, exist_ok=True) |
|
base_count = len(glob(os.path.join(output_folder, "*.mp4"))) |
|
video_path = os.path.join(output_folder, f"{base_count:06d}.mp4") |
|
|
|
with torch.autocast("cuda"): |
|
frames = pipe( |
|
image, |
|
decode_chunk_size=decoding_t, |
|
generator=generator, |
|
motion_bucket_id=motion_bucket_id, |
|
height=height, |
|
width=width, |
|
num_inference_steps=num_inference_steps, |
|
min_guidance_scale=min_guidance_scale, |
|
max_guidance_scale=max_guidance_scale, |
|
).frames[0] |
|
export_to_video(frames, video_path, fps=fps_id) |
|
torch.manual_seed(seed) |
|
|
|
return video_path, seed |
|
|
|
|
|
def resize_image(image, output_size=(1024, 576)): |
|
|
|
target_aspect = output_size[0] / output_size[1] |
|
image_aspect = image.width / image.height |
|
|
|
|
|
if image_aspect > target_aspect: |
|
|
|
new_height = output_size[1] |
|
new_width = int(new_height * image_aspect) |
|
resized_image = image.resize((new_width, new_height), Image.LANCZOS) |
|
|
|
left = (new_width - output_size[0]) / 2 |
|
top = 0 |
|
right = (new_width + output_size[0]) / 2 |
|
bottom = output_size[1] |
|
else: |
|
|
|
new_width = output_size[0] |
|
new_height = int(new_width / image_aspect) |
|
resized_image = image.resize((new_width, new_height), Image.LANCZOS) |
|
|
|
left = 0 |
|
top = (new_height - output_size[1]) / 2 |
|
right = output_size[0] |
|
bottom = (new_height + output_size[1]) / 2 |
|
|
|
|
|
cropped_image = resized_image.crop((left, top, right, bottom)) |
|
return cropped_image |
|
|
|
|
|
with gr.Blocks() as demo: |
|
gr.Markdown( |
|
""" |
|
# [AnimateLCM: Accelerating the Animation of Personalized Diffusion Models and Adapters with Decoupled Consistency Learning](https://arxiv.org/abs/2402.00769) |
|
Fu-Yun Wang, Zhaoyang Huang (*Corresponding Author), Xiaoyu Shi, Weikang Bian, Guanglu Song, Yu Liu, Hongsheng Li (*Corresponding Author)<br> |
|
|
|
[arXiv Report](https://arxiv.org/abs/2402.00769) | [Project Page](https://animatelcm.github.io/) | [Github](https://github.com/G-U-N/AnimateLCM) | [Civitai](https://civitai.com/models/290375/animatelcm-fast-video-generation) | [Replicate](https://replicate.com/camenduru/animate-lcm) |
|
|
|
Related Models: |
|
[AnimateLCM-t2v](https://huggingface.co/wangfuyun/AnimateLCM): Personalized Text-to-Video Generation |
|
[AnimateLCM-SVD-xt](https://huggingface.co/wangfuyun/AnimateLCM-SVD-xt): General Image-to-Video Generation |
|
[AnimateLCM-i2v](https://huggingface.co/wangfuyun/AnimateLCM-I2V): Personalized Image-to-Video Generation |
|
""" |
|
) |
|
with gr.Row(): |
|
with gr.Column(): |
|
image = gr.Image(label="Upload your image", type="pil") |
|
generate_btn = gr.Button("Generate") |
|
video = gr.Video() |
|
with gr.Accordion("Advanced options", open=False): |
|
safetensors_dropdown = gr.Dropdown( |
|
label="Choose Safetensors", choices=get_safetensors_files() |
|
) |
|
seed = gr.Slider( |
|
label="Seed", |
|
value=42, |
|
randomize=False, |
|
minimum=0, |
|
maximum=max_64_bit_int, |
|
step=1, |
|
) |
|
randomize_seed = gr.Checkbox(label="Randomize seed", value=False) |
|
motion_bucket_id = gr.Slider( |
|
label="Motion bucket id", |
|
info="Controls how much motion to add/remove from the image", |
|
value=80, |
|
minimum=1, |
|
maximum=255, |
|
) |
|
fps_id = gr.Slider( |
|
label="Frames per second", |
|
info="The length of your video in seconds will be 25/fps", |
|
value=8, |
|
minimum=5, |
|
maximum=30, |
|
) |
|
width = gr.Slider( |
|
label="Width of input image", |
|
info="It should be divisible by 64", |
|
value=1024, |
|
minimum=576, |
|
maximum=2048, |
|
) |
|
height = gr.Slider( |
|
label="Height of input image", |
|
info="It should be divisible by 64", |
|
value=576, |
|
minimum=320, |
|
maximum=1152, |
|
) |
|
max_guidance_scale = gr.Slider( |
|
label="Max guidance scale", |
|
info="classifier-free guidance strength", |
|
value=1.2, |
|
minimum=1, |
|
maximum=2, |
|
) |
|
min_guidance_scale = gr.Slider( |
|
label="Min guidance scale", |
|
info="classifier-free guidance strength", |
|
value=1, |
|
minimum=1, |
|
maximum=1.5, |
|
) |
|
num_inference_steps = gr.Slider( |
|
label="Num inference steps", |
|
info="steps for inference", |
|
value=4, |
|
minimum=1, |
|
maximum=20, |
|
step=1, |
|
) |
|
|
|
image.upload(fn=resize_image, inputs=image, outputs=image, queue=False) |
|
generate_btn.click( |
|
fn=sample, |
|
inputs=[ |
|
image, |
|
seed, |
|
randomize_seed, |
|
motion_bucket_id, |
|
fps_id, |
|
max_guidance_scale, |
|
min_guidance_scale, |
|
width, |
|
height, |
|
num_inference_steps, |
|
], |
|
outputs=[video, seed], |
|
api_name="video", |
|
) |
|
safetensors_dropdown.change(fn=model_select, inputs=safetensors_dropdown) |
|
|
|
gr.Examples( |
|
examples=[ |
|
["test_imgs/ai-generated-8496135_1280.jpg"], |
|
["test_imgs/dog-7396912_1280.jpg"], |
|
["test_imgs/ship-7833921_1280.jpg"], |
|
["test_imgs/girl-4898696_1280.jpg"], |
|
["test_imgs/power-station-6579092_1280.jpg"] |
|
], |
|
inputs=[image], |
|
outputs=[video, seed], |
|
fn=sample, |
|
cache_examples=True, |
|
) |
|
|
|
if __name__ == "__main__": |
|
demo.queue(max_size=20, api_open=False) |
|
demo.launch(share=True, show_api=False) |
|
|