Spaces:

JFoz
/

CoherentControl

Runtime error

App Files Community

foz committed on May 1, 2023

Commit

aada7c5

•

1 Parent(s): 046b08b

Fix requirements

Browse files

Files changed (5) hide show

app.py +7 -14
app_pose.py +0 -2
model.py +68 -96
requirements.txt +0 -1
utils.py +4 -6

app.py CHANGED Viewed

@@ -1,17 +1,14 @@
 import gradio as gr
 import torch
-from model import Model, ModelType
 from app_pose import create_demo as create_demo_pose
 import argparse
 import os
-on_huggingspace = os.environ.get("SPACE_AUTHOR_NAME") == "PAIR"
-model = Model(device='cuda', dtype=torch.float16)
-parser = argparse.ArgumentParser()
-parser.add_argument('--public_access', action='store_true',
-                    help="if enabled, the app can be access from a public url", default=False)
-args = parser.parse_args()
 with gr.Blocks(css='style.css') as demo:
@@ -22,10 +19,6 @@ with gr.Blocks(css='style.css') as demo:
     '''
-if on_huggingspace:
-    demo.queue(max_size=20)
-    demo.launch(debug=True)
-else:
-    _, _, link = demo.queue(api_open=False).launch(
-        file_directories=['temporal'], share=args.public_access)
-    print(link)

 import gradio as gr
 import torch
+from model import Model
 from app_pose import create_demo as create_demo_pose
 import argparse
 import os
+model = Model()
 with gr.Blocks(css='style.css') as demo:
     '''
+demo.launch(debug=True)

app_pose.py CHANGED Viewed

@@ -1,7 +1,5 @@
 from model import Model
 import gradio as gr
-import os
-on_huggingspace = os.environ.get("SPACE_AUTHOR_NAME") == "PAIR"
 examples = [
     ['Motion 1', "An astronaut dancing in the outer space"],

 from model import Model
 import gradio as gr
 examples = [
     ['Motion 1', "An astronaut dancing in the outer space"],

model.py CHANGED Viewed

@@ -4,111 +4,95 @@ import numpy as np
 import torch
-from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
-from diffusers import StableDiffusionInstructPix2PixPipeline, StableDiffusionControlNetPipeline, ControlNetModel, UNet2DConditionModel
-from diffusers.schedulers import EulerAncestralDiscreteScheduler, DDIMScheduler
-from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor
 import utils
 import gradio_utils
 import os
-on_huggingspace = os.environ.get("SPACE_AUTHOR_NAME") == "PAIR"
 from einops import rearrange
-class ModelType(Enum):
-    ControlNetPose = 5,
 class Model:
-    def __init__(self, device, dtype, **kwargs):
-        self.device = device
-        self.dtype = dtype
-        self.generator = torch.Generator(device=device)
-        self.pipe_dict = {
-            ModelType.ControlNetPose: StableDiffusionControlNetPipeline,
-        }
-        self.pipe = None
-        self.model_type = None
-        self.states = {}
-        self.model_name = ""
-    def set_model(self, model_type: ModelType, model_id: str, **kwargs):
-        if hasattr(self, "pipe") and self.pipe is not None:
-            del self.pipe
-        torch.cuda.empty_cache()
-        gc.collect()
-        print('kwargs', kwargs)
-        print('device', self.device)
-        safety_checker = kwargs.pop('safety_checker', None)
-        controlnet = kwargs.pop('controlnet', None)
-        self.pipe = self.pipe_dict[model_type].from_pretrained(
-            model_id, safety_checker=safety_checker, controlnet=controlnet, torch_dtype=torch.float16).to(self.device)#, torch_dtype=torch.float16).to(self.device)
-        self.pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
-        self.pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
-        self.model_type = model_type
-        self.model_name = model_id
-    def inference_chunk(self, frame_ids, **kwargs):
-        if not hasattr(self, "pipe") or self.pipe is None:
-            return
-        prompt = np.array(kwargs.pop('prompt'))
-        negative_prompt = np.array(kwargs.pop('negative_prompt', ''))
-        latents = None
-        if 'latents' in kwargs:
-            latents = kwargs.pop('latents')[frame_ids]
-        if 'image' in kwargs:
-            kwargs['image'] = kwargs['image'][frame_ids]
-        if 'video_length' in kwargs:
-            kwargs['video_length'] = len(frame_ids)
-        return self.pipe(prompt=prompt[frame_ids].tolist(),
-                         negative_prompt=negative_prompt[frame_ids].tolist(),
-                         latents=latents,
-                         generator=self.generator,
-                         **kwargs)
     def inference(self, **kwargs):
-        if not hasattr(self, "pipe") or self.pipe is None:
-            return
         seed = kwargs.pop('seed', 0)
-        if seed < 0:
-            seed = self.generator.seed()
-        kwargs.pop('generator', '')
-        if 'image' in kwargs:
-            f = kwargs['image'].shape[0]
-        else:
-            f = kwargs['video_length']
         assert 'prompt' in kwargs
         prompt = [kwargs.pop('prompt')] * f
         negative_prompt = [kwargs.pop('negative_prompt', '')] * f
         frames_counter = 0
-        # Processing frame_by_frame
         result = []
-        for i in range(f):
-            frame_ids = [0] + [i]
-            self.generator.manual_seed(seed)
             print(f'Processing frame {i + 1} / {f}')
-            result.append(self.inference_chunk(frame_ids=frame_ids,
                                                    prompt=prompt,
                                                    negative_prompt=negative_prompt,
-                                                   **kwargs).images[1:])
             frames_counter += 1
-            if on_huggingspace and frames_counter >= 80:
-                break
-        result = np.concatenate(result)
         return result
     def process_controlnet_pose(self,
@@ -120,33 +104,22 @@ class Model:
                                 seed=42,
                                 eta=0.0,
                                 resolution=512,
-                                use_cf_attn=True,
                                 save_path=None):
         print("Module Pose")
         video_path = gradio_utils.motion_to_video_path(video_path)
-        if self.model_type != ModelType.ControlNetPose:
-            controlnet = ControlNetModel.from_pretrained(
-                "fusing/stable-diffusion-v1-5-controlnet-openpose", torch_dtype=torch.float16)
-            self.set_model(ModelType.ControlNetPose,
-                           model_id="runwayml/stable-diffusion-v1-5", controlnet=controlnet)
-            self.pipe.scheduler = DDIMScheduler.from_config(
-                self.pipe.scheduler.config)
-        video_path = gradio_utils.motion_to_video_path(
-            video_path) if 'Motion' in video_path else video_path
         added_prompt = 'best quality, extremely detailed, HD, ultra-realistic, 8K, HQ, masterpiece, trending on artstation, art, smooth'
         negative_prompts = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer difits, cropped, worst quality, low quality, deformed body, bloated, ugly, unrealistic'
         video, fps = utils.prepare_video(
-            video_path, resolution, self.device, self.dtype, False, output_fps=4)
         control = utils.pre_process_pose(
-            video, apply_pose_detect=False).to(self.device).to(self.dtype)
         f, _, h, w = video.shape
-        self.generator.manual_seed(seed)
-        latents = torch.randn((1, 4, h//8, w//8), dtype=self.dtype,
-                              device=self.device, generator=self.generator)
-        latents = latents.repeat(f, 1, 1, 1)
         result = self.inference(image=control,
                                 prompt=prompt + ', ' + added_prompt,
                                 height=h,
@@ -156,9 +129,8 @@ class Model:
                                 guidance_scale=guidance_scale,
                                 controlnet_conditioning_scale=controlnet_conditioning_scale,
                                 eta=eta,
-                                latents=latents,
                                 seed=seed,
                                 output_type='numpy',
                                 )
-        return utils.create_gif(result, fps, path=save_path)

 import torch
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.jax_utils import replicate
+from flax.training.common_utils import shard
+from PIL import Image
+from diffusers import FlaxStableDiffusionControlNetPipeline, FlaxControlNetModel
 import utils
 import gradio_utils
 import os
 from einops import rearrange
+import matplotlib.pyplot as plt
+def create_key(seed=0):
+    return jax.random.PRNGKey(seed)
 class Model:
+    def __init__(self, **kwargs):
+        self.base_controlnet, self.base_controlnet_params = FlaxControlNetModel.from_pretrained(
+       #"JFoz/dog-cat-pose", dtype=jnp.bfloat16
+        "lllyasviel/control_v11p_sd15_openpose", dtype=jnp.bfloat16, from_pt=True
+        )
+        self.pipe, self.params = FlaxStableDiffusionControlNetPipeline.from_pretrained(
+        "runwayml/stable-diffusion-v1-5", controlnet=self.base_controlnet, revision="flax", dtype=jnp.bfloat16,# from_pt=True,
+        )
+    def infer_frame(self, frame_id, prompt, negative_prompt, rng, **kwargs):
+        print(prompt, frame_id)
+        num_samples = 1
+        prompt_ids = self.pipe.prepare_text_inputs([prompt[frame_id]]*num_samples)
+        negative_prompt_ids = self.pipe.prepare_text_inputs([negative_prompt[frame_id]] * num_samples)
+        processed_image = self.pipe.prepare_image_inputs([kwargs['image'][frame_id]]*num_samples)
+        self.params["controlnet"] = self.base_controlnet_params
+        p_params = replicate(self.params)
+        prompt_ids = shard(prompt_ids)
+        negative_prompt_ids = shard(negative_prompt_ids)
+        processed_image = shard(processed_image)
+        output = self.pipe(
+            prompt_ids=prompt_ids,
+            image=processed_image,
+            params=p_params,
+            prng_seed=rng,
+            num_inference_steps=50,
+            neg_prompt_ids=negative_prompt_ids,
+            jit=True,
+        ).images
+        output_images = np.asarray(output.reshape((num_samples,) + output.shape[-3:]))
+        return output_images
     def inference(self, **kwargs):
         seed = kwargs.pop('seed', 0)
+        rng = create_key(0)
+        rng = jax.random.split(rng, jax.device_count())
+        f = len(kwargs['image'])
+        print('frames', f)
         assert 'prompt' in kwargs
         prompt = [kwargs.pop('prompt')] * f
         negative_prompt = [kwargs.pop('negative_prompt', '')] * f
         frames_counter = 0
         result = []
+        for i in range(0, f):
             print(f'Processing frame {i + 1} / {f}')
+            result.append(self.infer_frame(frame_id=i,
                                                    prompt=prompt,
                                                    negative_prompt=negative_prompt,
+                                                   rng = rng,
+                                                   **kwargs))
             frames_counter += 1
+        result = np.stack(result, axis=0)
         return result
     def process_controlnet_pose(self,
                                 seed=42,
                                 eta=0.0,
                                 resolution=512,
                                 save_path=None):
         print("Module Pose")
         video_path = gradio_utils.motion_to_video_path(video_path)
         added_prompt = 'best quality, extremely detailed, HD, ultra-realistic, 8K, HQ, masterpiece, trending on artstation, art, smooth'
         negative_prompts = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer difits, cropped, worst quality, low quality, deformed body, bloated, ugly, unrealistic'
         video, fps = utils.prepare_video(
+            video_path, resolution, False, output_fps=4)
         control = utils.pre_process_pose(
+            video, apply_pose_detect=False)
+        print('N frames', len(control))
         f, _, h, w = video.shape
         result = self.inference(image=control,
                                 prompt=prompt + ', ' + added_prompt,
                                 height=h,
                                 guidance_scale=guidance_scale,
                                 controlnet_conditioning_scale=controlnet_conditioning_scale,
                                 eta=eta,
                                 seed=seed,
                                 output_type='numpy',
                                 )
+        return utils.create_gif(result.astype(jnp.float16), fps, path=save_path)

requirements.txt CHANGED Viewed

@@ -7,7 +7,6 @@ git+https://github.com/huggingface/diffusers@main
 torch
 accelerate
 decord==0.6.0
-diffusers==0.16.1
 einops
 gradio
 imageio

 torch
 accelerate
 decord==0.6.0
 einops
 gradio
 imageio

utils.py CHANGED Viewed

@@ -15,7 +15,7 @@ from controlnet_aux import OpenposeDetector
 apply_openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
-def prepare_video(video_path:str, resolution:int, device, dtype, normalize=True, start_t:float=0, end_t:float=-1, output_fps:int=-1):
     vr = decord.VideoReader(video_path)
     initial_fps = vr.get_avg_fps()
     if output_fps == -1:
@@ -37,7 +37,7 @@ def prepare_video(video_path:str, resolution:int, device, dtype, normalize=True,
         video = video.asnumpy()
     _, h, w, _ = video.shape
     video = rearrange(video, "f h w c -> f c h w")
-    video = torch.Tensor(video).to(device).to(dtype)
     # Use max if you want the larger side to be equal to resolution (e.g. 512)
     # k = float(resolution) / min(h, w)
@@ -63,10 +63,8 @@ def pre_process_pose(input_video, apply_pose_detect: bool = True):
             detected_map = img
         H, W, C = img.shape
         detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
-        detected_maps.append(detected_map[None])
-    detected_maps = np.concatenate(detected_maps)
-    control = torch.from_numpy(detected_maps.copy()).float() / 255.0
-    return rearrange(control, 'f h w c -> f c h w')

 apply_openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
+def prepare_video(video_path:str, resolution:int, normalize=True, start_t:float=0, end_t:float=-1, output_fps:int=-1):
     vr = decord.VideoReader(video_path)
     initial_fps = vr.get_avg_fps()
     if output_fps == -1:
         video = video.asnumpy()
     _, h, w, _ = video.shape
     video = rearrange(video, "f h w c -> f c h w")
+    video = torch.Tensor(video)
     # Use max if you want the larger side to be equal to resolution (e.g. 512)
     # k = float(resolution) / min(h, w)
             detected_map = img
         H, W, C = img.shape
         detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
+        detected_maps.append(Image.fromarray(detected_map))
+    return detected_maps