callum-canavan committed
Commit b3ee019 · 1 Parent(s): 381e596

Fix pipeline

.gitignore CHANGED
@@ -1,3 +1,5 @@
 env/
 __pycache__/
-assets/
+assets/
+*.png
+*.mp4
bapp.py CHANGED
@@ -4,10 +4,12 @@ from pathlib import Path
 import gradio as gr
 import torch
 from diffusers import DiffusionPipeline
+from icecream import ic

 from visual_anagrams.views import get_views, VIEW_MAP_NAMES
 from visual_anagrams.samplers import sample_stage_1, sample_stage_2
 from visual_anagrams.utils import add_args, save_illusion, save_metadata
+from visual_anagrams.animate import animate_two_view

 stage_1 = DiffusionPipeline.from_pretrained(
     "DeepFloyd/IF-I-M-v1.0",
@@ -31,23 +33,26 @@ def generate_content(
     num_inference_steps,
     seed
 ):
-    prompts = [prompt_for_original, prompt_for_transformed]
-    prompt_embeds = [stage_1.encode_prompt(f'{style} {p}'.strip()) for p in [prompts]]
+    prompts = [f'{style} {p}'.strip() for p in [prompt_for_original, prompt_for_transformed]]
+    prompt_embeds = [stage_1.encode_prompt(p) for p in prompts]
     prompt_embeds, negative_prompt_embeds = zip(*prompt_embeds)
     prompt_embeds = torch.cat(prompt_embeds)
     negative_prompt_embeds = torch.cat(negative_prompt_embeds)

-    views = ['identity', transformation]
+    views = ['identity', VIEW_MAP_NAMES[transformation]]
     views = get_views(views)

     generator = torch.manual_seed(seed)
+
+    print("Sample stage 1")
     image = sample_stage_1(stage_1,
-                           prompt_embeds,
-                           negative_prompt_embeds,
-                           views,
-                           num_inference_steps=num_inference_steps,
-                           generator=generator)
+                           prompt_embeds,
+                           negative_prompt_embeds,
+                           views,
+                           num_inference_steps=num_inference_steps,
+                           generator=generator)

+    print("Sample stage 2")
     image = sample_stage_2(stage_2,
                            image,
                            prompt_embeds,
@@ -55,8 +60,16 @@ def generate_content(
                            views,
                            num_inference_steps=num_inference_steps,
                            generator=generator)
+    save_illusion(image, views, Path(""))

-    return image, image_transformed, transformation_gif
+    size = image.shape[-1]
+    animate_two_view(
+        f"sample_{size}.png",
+        views[1],
+        prompts[0],
+        prompts[1],
+    )
+    return 'tmp.mp4', f"sample_{size}.png", f"sample_{size}.views.png"


 choices = list(VIEW_MAP_NAMES.keys())
@@ -64,13 +77,13 @@ gradio_app = gr.Interface(
     fn=generate_content,
     inputs=[
         gr.Textbox(label="Style", placeholder="an oil painting of"),
-        gr.Textbox(label="Prompt for original view", placeholder="a penguin"),
-        gr.Textbox(label="Prompt for transformed view", placeholder="a giraffe"),
+        gr.Textbox(label="Prompt for original view", placeholder="a dress"),
+        gr.Textbox(label="Prompt for transformed view", placeholder="an old man"),
         gr.Dropdown(label="View transformation", choices=choices, value=choices[0]),
-        gr.Number(label="Number of diffusion steps", value=30, step=1, minimum=1, maximum=100),
+        gr.Number(label="Number of diffusion steps", value=100, step=1, minimum=1, maximum=300),
         gr.Number(label="Random seed", value=0, step=1, minimum=0, maximum=100000)
     ],
-    outputs=[gr.Image(label="Illusion"), gr.Image(label="Original"), gr.Image(label="Transformed")],
+    outputs=[gr.Video(label="Illusion"), gr.Image(label="Original"), gr.Image(label="Transformed")],
 )

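For context, gradio_app above is a plain gr.Interface; a minimal way to serve it (a sketch, assuming a launch call at the bottom of bapp.py that these hunks do not show) is:

if __name__ == "__main__":
    # Start the Gradio server for the illusion demo defined above
    gradio_app.launch()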
requirements.txt CHANGED
@@ -2,9 +2,15 @@ accelerate
 diffusers
 einops
 gradio
+icecream
+imageio
+imageio[ffmpeg]
+imageio[pyav]
+opencv-python
 safetensors
 sentencepiece
 transformers
 torch
 torchvision
+tqdm
 xformers
test.py ADDED
@@ -0,0 +1,12 @@
+from bapp import generate_content
+
+if __name__ == "__main__":
+    print(generate_content(
+        "a painting of",
+        "vases",
+        "a sloth",
+        "Flip",
+        1,
+        0
+    ))
+
test_video.py ADDED
@@ -0,0 +1,11 @@
+from visual_anagrams.animate import animate_two_view
+from visual_anagrams.views import get_views
+
+if __name__ == "__main__":
+    animate_two_view(
+        "sample_256.png",
+        get_views(["identity", "flip"])[1],
+        "a painting of vases",
+        "a painting of a sloth",
+        save_video_path="tmp3.mp4",
+    )
animate.py → visual_anagrams/animate.py RENAMED
@@ -1,3 +1,4 @@
+import cv2
 from tqdm import tqdm
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
@@ -13,12 +14,11 @@ def draw_text(image, text, fill=(0,0,0), frame_size=384, im_size=256):
     image = image.copy()

     # Font info
-    font_path = get_courier_font_path()
     font_size = 16

     # Make PIL objects
     draw = ImageDraw.Draw(image)
-    font = ImageFont.truetype(font_path, font_size)
+    font = ImageFont.load_default()

     # Center text horizontally, and vertically between
     # illusion bottom and frame bottom
@@ -117,10 +117,19 @@ def animate_two_view(

     # Convert PIL images to numpy arrays
     image_array = [imageio.core.asarray(frame) for frame in frames]
-
-    # Save as video
-    print('Making video...')
-    imageio.mimsave(save_video_path, image_array, fps=30)
+    f = image_array[0]
+    print(f.dtype)
+    print(f.shape)
+    print(frame_size)
+    print(np.min(f), np.max(f))
+    print(len(image_array))
+
+    # Save as video using opencv
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    video = cv2.VideoWriter(save_video_path, fourcc, 30, (frame_size, frame_size))
+    for frame in image_array:
+        video.write(frame)
+    video.release()


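A note on the new OpenCV writer above: cv2.VideoWriter.write expects uint8 frames in BGR channel order at exactly the (width, height) passed to the writer, while frames built from PIL/imageio are RGB. A hedged sketch of the conversion that may be needed (not part of this commit; write_rgb_frames is a hypothetical helper):

import cv2

def write_rgb_frames(frames, path, fps=30):
    # frames: list of HxWx3 uint8 RGB arrays, all the same size
    h, w = frames[0].shape[:2]
    writer = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
    for frame in frames:
        # Convert RGB (PIL/imageio order) to BGR (OpenCV order) before writing
        writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    writer.release()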
visual_anagrams/samplers.py CHANGED
@@ -1,4 +1,5 @@
 from tqdm import tqdm
+from icecream import ic

 import torch
 import torch.nn.functional as F
@@ -42,8 +43,9 @@ def sample_stage_1(model,
         device,
         generator,
     )
+    # ic(noisy_images.shape)

-    for i, t in enumerate(tqdm(timesteps)):
+    for i, t in tqdm(enumerate(timesteps)):
         # Apply views to noisy_image
         viewed_noisy_images = []
         for view_fn in views:
@@ -56,6 +58,7 @@ def sample_stage_1(model,
         model_input = model.scheduler.scale_model_input(model_input, t)

         # Predict noise estimate
+        # print("Predicting noise estimate")
         noise_pred = model.unet(
             model_input,
             t,
@@ -63,9 +66,11 @@ def sample_stage_1(model,
             cross_attention_kwargs=None,
             return_dict=False,
         )[0]
+        # ic(noise_pred.shape)

         # Extract uncond (neg) and cond noise estimates
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+        # ic(noise_pred_uncond.shape)

         # Invert the unconditional (negative) estimates
         inverted_preds = []
@@ -73,6 +78,7 @@ def sample_stage_1(model,
             inverted_pred = view.inverse_view(pred)
             inverted_preds.append(inverted_pred)
         noise_pred_uncond = torch.stack(inverted_preds)
+        # ic(noise_pred_uncond.shape)

         # Invert the conditional estimates
         inverted_preds = []
@@ -80,11 +86,13 @@ def sample_stage_1(model,
             inverted_pred = view.inverse_view(pred)
             inverted_preds.append(inverted_pred)
         noise_pred_text = torch.stack(inverted_preds)
+        # ic(noise_pred_text.shape)

         # Split into noise estimate and variance estimates
         noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1)
         noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1)
         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+        # ic(noise_pred.shape)

         # Reduce predicted noise and variances
         noise_pred = noise_pred.view(-1,num_prompts,3,64,64)
@@ -98,11 +106,14 @@ def sample_stage_1(model,
         else:
             raise ValueError('Reduction must be either `mean` or `alternate`')
         noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)
+        # ic(noise_pred.shape)

+        # ic(t.shape)
         # compute the previous noisy sample x_t -> x_t-1
         noisy_images = model.scheduler.step(
-            noise_pred, t, noisy_images, generator=generator, return_dict=False
+            noise_pred.to('cuda'), t, noisy_images.to('cuda'), generator=generator, return_dict=False
         )[0]
+        # ic(noisy_images.shape)

     # Return denoised images
     return noisy_images
@@ -149,34 +160,34 @@ def sample_stage_2(model,
         prompt_embeds.dtype,
         device,
         generator,
-    )
+    ).to('cuda')

     # Prepare upscaled image and noise level
     image = model.preprocess_image(image, num_images_per_prompt, device)
-    upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True)
+    upscaled = F.interpolate(image.to('cuda'), (height, width), mode="bilinear", align_corners=True).to('cuda')

     noise_level = torch.tensor([noise_level] * upscaled.shape[0], device=upscaled.device)
     noise = randn_tensor(upscaled.shape, generator=generator, device=upscaled.device, dtype=upscaled.dtype)
-    upscaled = model.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level)
+    upscaled = model.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level).to('cuda')

     # Condition on noise level, for each model input
-    noise_level = torch.cat([noise_level] * num_prompts * 2)
+    noise_level = torch.cat([noise_level] * num_prompts * 2).to('cuda')

     # Denoising Loop
     for i, t in enumerate(tqdm(timesteps)):
         # Cat noisy image with upscaled conditioning image
-        model_input = torch.cat([noisy_images, upscaled], dim=1)
+        model_input = torch.cat([noisy_images, upscaled], dim=1).to('cuda')

         # Apply views to noisy_image
         viewed_inputs = []
         for view_fn in views:
             viewed_inputs.append(view_fn.view(model_input[0]))
-        viewed_inputs = torch.stack(viewed_inputs)
+        viewed_inputs = torch.stack(viewed_inputs).to('cuda')

         # Duplicate inputs for CFG
         # Model input is: [ neg_0, neg_1, ..., pos_0, pos_1, ... ]
-        model_input = torch.cat([viewed_inputs] * 2)
-        model_input = model.scheduler.scale_model_input(model_input, t)
+        model_input = torch.cat([viewed_inputs] * 2).to('cuda')
+        model_input = model.scheduler.scale_model_input(model_input, t).to('cuda')

         # predict the noise residual
         noise_pred = model.unet(
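
The stage-2 hunks above pin every intermediate tensor to 'cuda'. A device-agnostic alternative is to read the device once from the pipeline instead of hard-coding it; a minimal sketch, assuming model.unet is an nn.Module as in the DeepFloyd IF pipelines (to_model_device is a hypothetical helper):

import torch

def to_model_device(model, *tensors):
    # Move tensors to whatever device the pipeline's UNet lives on,
    # rather than hard-coding 'cuda'
    device = next(model.unet.parameters()).device
    return tuple(t.to(device) for t in tensors)

# e.g. noisy_images, upscaled = to_model_device(model, noisy_images, upscaled)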