callum-canavan committed · Commit b3ee019 · Parent(s): 381e596

Fix pipeline

Files changed:
- .gitignore +3 -1
- bapp.py +26 -13
- requirements.txt +6 -0
- test.py +12 -0
- test_video.py +11 -0
- animate.py → visual_anagrams/animate.py +15 -6
- visual_anagrams/samplers.py +21 -10
.gitignore CHANGED
@@ -1,3 +1,5 @@
 env/
 __pycache__/
-assets/
+assets/
+*.png
+*.mp4
bapp.py CHANGED
@@ -4,10 +4,12 @@ from pathlib import Path
 import gradio as gr
 import torch
 from diffusers import DiffusionPipeline
+from icecream import ic
 
 from visual_anagrams.views import get_views, VIEW_MAP_NAMES
 from visual_anagrams.samplers import sample_stage_1, sample_stage_2
 from visual_anagrams.utils import add_args, save_illusion, save_metadata
+from visual_anagrams.animate import animate_two_view
 
 stage_1 = DiffusionPipeline.from_pretrained(
     "DeepFloyd/IF-I-M-v1.0",
@@ -31,23 +33,26 @@ def generate_content(
     num_inference_steps,
     seed
 ):
-    prompts = [prompt_for_original, prompt_for_transformed]
-    prompt_embeds = [stage_1.encode_prompt(
+    prompts = [f'{style} {p}'.strip() for p in [prompt_for_original, prompt_for_transformed]]
+    prompt_embeds = [stage_1.encode_prompt(p) for p in prompts]
     prompt_embeds, negative_prompt_embeds = zip(*prompt_embeds)
     prompt_embeds = torch.cat(prompt_embeds)
     negative_prompt_embeds = torch.cat(negative_prompt_embeds)
 
-    views = ['identity', transformation]
+    views = ['identity', VIEW_MAP_NAMES[transformation]]
     views = get_views(views)
 
     generator = torch.manual_seed(seed)
+
+    print("Sample stage 1")
     image = sample_stage_1(stage_1,
-
-
-
-
-
+                           prompt_embeds,
+                           negative_prompt_embeds,
+                           views,
+                           num_inference_steps=num_inference_steps,
+                           generator=generator)
 
+    print("Sample stage 2")
     image = sample_stage_2(stage_2,
                            image,
                            prompt_embeds,
@@ -55,8 +60,16 @@ def generate_content(
                            views,
                            num_inference_steps=num_inference_steps,
                            generator=generator)
+    save_illusion(image, views, Path(""))
 
-
+    size = image.shape[-1]
+    animate_two_view(
+        f"sample_{size}.png",
+        views[1],
+        prompts[0],
+        prompts[1],
+    )
+    return 'tmp.mp4', f"sample_{size}.png", f"sample_{size}.views.png"
 
 
 choices = list(VIEW_MAP_NAMES.keys())
@@ -64,13 +77,13 @@ gradio_app = gr.Interface(
     fn=generate_content,
     inputs=[
        gr.Textbox(label="Style", placeholder="an oil painting of"),
-        gr.Textbox(label="Prompt for original view", placeholder="a
-        gr.Textbox(label="Prompt for transformed view", placeholder="
+        gr.Textbox(label="Prompt for original view", placeholder="a dress"),
+        gr.Textbox(label="Prompt for transformed view", placeholder="an old man"),
        gr.Dropdown(label="View transformation", choices=choices, value=choices[0]),
-        gr.Number(label="Number of diffusion steps", value=
+        gr.Number(label="Number of diffusion steps", value=100, step=1, minimum=1, maximum=300),
        gr.Number(label="Random seed", value=0, step=1, minimum=0, maximum=100000)
     ],
-    outputs=[gr.
+    outputs=[gr.Video(label="Illusion"), gr.Image(label="Original"), gr.Image(label="Transformed")],
 )
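With these changes, generate_content embeds both prompts, runs the two DeepFloyd IF stages over the identity and transformed views, saves the illusion, renders the flip animation, and returns a video plus the two stills to Gradio. A minimal sketch (not part of the commit) of launching the interface locally, assuming bapp.py is importable and the module-level gradio_app from the diff above:

    from bapp import gradio_app

    if __name__ == "__main__":
        # Importing bapp loads both DeepFloyd IF stages, so this needs the model weights and a GPU.
        gradio_app.launch()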
requirements.txt CHANGED
@@ -2,9 +2,15 @@ accelerate
 diffusers
 einops
 gradio
+icecream
+imageio
+imageio[ffmpeg]
+imageio[pyav]
+opencv-python
 safetensors
 sentencepiece
 transformers
 torch
 torchvision
+tqdm
 xformers
test.py ADDED
@@ -0,0 +1,12 @@
+from bapp import generate_content
+
+if __name__ == "__main__":
+    print(generate_content(
+        "a painting of",
+        "vases",
+        "a sloth",
+        "Flip",
+        1,
+        0
+    ))
+
test_video.py ADDED
@@ -0,0 +1,11 @@
+from visual_anagrams.animate import animate_two_view
+from visual_anagrams.views import get_views
+
+if __name__ == "__main__":
+    animate_two_view(
+        "sample_256.png",
+        get_views(["identity", "flip"])[1],
+        "a painting of vases",
+        "a painting of a sloth",
+        save_video_path="tmp3.mp4",
+    )
animate.py → visual_anagrams/animate.py RENAMED
@@ -1,3 +1,4 @@
+import cv2
 from tqdm import tqdm
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
@@ -13,12 +14,11 @@ def draw_text(image, text, fill=(0,0,0), frame_size=384, im_size=256):
     image = image.copy()
 
     # Font info
-    font_path = get_courier_font_path()
     font_size = 16
 
     # Make PIL objects
     draw = ImageDraw.Draw(image)
-    font = ImageFont.
+    font = ImageFont.load_default()
 
     # Center text horizontally, and vertically between
     # illusion bottom and frame bottom
@@ -117,10 +117,19 @@ def animate_two_view(
 
     # Convert PIL images to numpy arrays
     image_array = [imageio.core.asarray(frame) for frame in frames]
-
-
-    print(
-
+    f = image_array[0]
+    print(f.dtype)
+    print(f.shape)
+    print(frame_size)
+    print(np.min(f), np.max(f))
+    print(len(image_array))
+
+    # Save as video using opencv
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    video = cv2.VideoWriter(save_video_path, fourcc, 30, (frame_size, frame_size))
+    for frame in image_array:
+        video.write(frame)
+    video.release()
 
 
 
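The commit switches the video export from imageio to OpenCV. One detail worth sketching (an illustration, not code from the commit): cv2.VideoWriter.write expects uint8 frames in BGR channel order, while frames coming from PIL/imageio are RGB, so a conversion is usually needed to keep colours correct. The helper name below is hypothetical.

    import cv2

    def write_rgb_frames_to_mp4(frames, path, fps=30, frame_size=384):
        # `frames` is assumed to be a list of HxWx3 uint8 RGB arrays, as produced above.
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        video = cv2.VideoWriter(path, fourcc, fps, (frame_size, frame_size))
        for frame in frames:
            # OpenCV works in BGR, so convert each RGB frame before writing.
            video.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        video.release()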
visual_anagrams/samplers.py CHANGED
@@ -1,4 +1,5 @@
 from tqdm import tqdm
+from icecream import ic
 
 import torch
 import torch.nn.functional as F
@@ -42,8 +43,9 @@ def sample_stage_1(model,
         device,
         generator,
     )
+    # ic(noisy_images.shape)
 
-    for i, t in enumerate(
+    for i, t in tqdm(enumerate(timesteps)):
         # Apply views to noisy_image
         viewed_noisy_images = []
         for view_fn in views:
@@ -56,6 +58,7 @@ def sample_stage_1(model,
         model_input = model.scheduler.scale_model_input(model_input, t)
 
         # Predict noise estimate
+        # print("Predicting noise estimate")
         noise_pred = model.unet(
             model_input,
             t,
@@ -63,9 +66,11 @@ def sample_stage_1(model,
             cross_attention_kwargs=None,
             return_dict=False,
         )[0]
+        # ic(noise_pred.shape)
 
         # Extract uncond (neg) and cond noise estimates
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+        # ic(noise_pred_uncond.shape)
 
         # Invert the unconditional (negative) estimates
         inverted_preds = []
@@ -73,6 +78,7 @@ def sample_stage_1(model,
             inverted_pred = view.inverse_view(pred)
             inverted_preds.append(inverted_pred)
         noise_pred_uncond = torch.stack(inverted_preds)
+        # ic(noise_pred_uncond.shape)
 
         # Invert the conditional estimates
         inverted_preds = []
@@ -80,11 +86,13 @@ def sample_stage_1(model,
             inverted_pred = view.inverse_view(pred)
             inverted_preds.append(inverted_pred)
         noise_pred_text = torch.stack(inverted_preds)
+        # ic(noise_pred_text.shape)
 
         # Split into noise estimate and variance estimates
         noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1)
         noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1)
         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+        # ic(noise_pred.shape)
 
         # Reduce predicted noise and variances
         noise_pred = noise_pred.view(-1,num_prompts,3,64,64)
@@ -98,11 +106,14 @@ def sample_stage_1(model,
         else:
             raise ValueError('Reduction must be either `mean` or `alternate`')
         noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+        # ic(noise_pred.shape)
 
+        # ic(t.shape)
         # compute the previous noisy sample x_t -> x_t-1
         noisy_images = model.scheduler.step(
-            noise_pred, t, noisy_images, generator=generator, return_dict=False
+            noise_pred.to('cuda'), t, noisy_images.to('cuda'), generator=generator, return_dict=False
         )[0]
+        # ic(noisy_images.shape)
 
     # Return denoised images
     return noisy_images
@@ -149,34 +160,34 @@ def sample_stage_2(model,
         prompt_embeds.dtype,
         device,
         generator,
-    )
+    ).to('cuda')
 
     # Prepare upscaled image and noise level
     image = model.preprocess_image(image, num_images_per_prompt, device)
-    upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True)
+    upscaled = F.interpolate(image.to('cuda'), (height, width), mode="bilinear", align_corners=True).to('cuda')
 
     noise_level = torch.tensor([noise_level] * upscaled.shape[0], device=upscaled.device)
     noise = randn_tensor(upscaled.shape, generator=generator, device=upscaled.device, dtype=upscaled.dtype)
-    upscaled = model.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level)
+    upscaled = model.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level).to('cuda')
 
     # Condition on noise level, for each model input
-    noise_level = torch.cat([noise_level] * num_prompts * 2)
+    noise_level = torch.cat([noise_level] * num_prompts * 2).to('cuda')
 
     # Denoising Loop
     for i, t in enumerate(tqdm(timesteps)):
         # Cat noisy image with upscaled conditioning image
-        model_input = torch.cat([noisy_images, upscaled], dim=1)
+        model_input = torch.cat([noisy_images, upscaled], dim=1).to('cuda')
 
         # Apply views to noisy_image
         viewed_inputs = []
         for view_fn in views:
             viewed_inputs.append(view_fn.view(model_input[0]))
-        viewed_inputs = torch.stack(viewed_inputs)
+        viewed_inputs = torch.stack(viewed_inputs).to('cuda')
 
         # Duplicate inputs for CFG
         # Model input is: [ neg_0, neg_1, ..., pos_0, pos_1, ... ]
-        model_input = torch.cat([viewed_inputs] * 2)
-        model_input = model.scheduler.scale_model_input(model_input, t)
+        model_input = torch.cat([viewed_inputs] * 2).to('cuda')
+        model_input = model.scheduler.scale_model_input(model_input, t).to('cuda')
 
         # predict the noise residual
         noise_pred = model.unet(
|