tolgacangoz committed · Commit 82eed0f · verified · 1 Parent(s): 5e1ad73

Upload matryoshka.py

Files changed (1)
  1. unet/matryoshka.py +27 -68
unet/matryoshka.py CHANGED
@@ -420,6 +420,7 @@ class MatryoshkaDDIMScheduler(SchedulerMixin, ConfigMixin):
         self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
 
         self.scales = None
+        self.schedule_shifted_power = 1.0
 
     def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
         """
@@ -532,6 +533,7 @@ class MatryoshkaDDIMScheduler(SchedulerMixin, ConfigMixin):
 
     def get_schedule_shifted(self, alpha_prod, scale_factor=None):
         if (scale_factor is not None) and (scale_factor > 1):  # rescale noise schedule
+            scale_factor = scale_factor ** self.schedule_shifted_power
             snr = alpha_prod / (1 - alpha_prod)
             scaled_snr = snr / scale_factor
             alpha_prod = 1 / (1 + 1 / scaled_snr)
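The new `schedule_shifted_power` attribute controls how strongly the noise schedule is shifted for the nested (lower-resolution) latents: the scale factor is raised to this power before the SNR is rescaled. A minimal standalone sketch of the math in this hunk, written as a hypothetical free function (the real code is a scheduler method, and the hunk does not show its return statement):

```python
import torch

def get_schedule_shifted(alpha_prod, scale_factor=None, schedule_shifted_power=1.0):
    # Mirrors MatryoshkaDDIMScheduler.get_schedule_shifted after this commit.
    if (scale_factor is not None) and (scale_factor > 1):  # rescale noise schedule
        scale_factor = scale_factor**schedule_shifted_power
        snr = alpha_prod / (1 - alpha_prod)
        scaled_snr = snr / scale_factor
        alpha_prod = 1 / (1 + 1 / scaled_snr)
    return alpha_prod

alpha_prod = torch.tensor(0.9)
print(get_schedule_shifted(alpha_prod, scale_factor=4, schedule_shifted_power=1.0))  # tensor(0.6923)
print(get_schedule_shifted(alpha_prod, scale_factor=4, schedule_shifted_power=2.0))  # tensor(0.3600)
```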
@@ -640,16 +642,16 @@ class MatryoshkaDDIMScheduler(SchedulerMixin, ConfigMixin):
         if self.config.thresholding:
             if len(model_output) > 1:
                 pred_original_sample = [
-                    self._threshold_sample(p_o_s * scale) / scale
-                    for p_o_s, scale in zip(pred_original_sample, self.scales)
+                    self._threshold_sample(p_o_s)
+                    for p_o_s in pred_original_sample
                 ]
             else:
                 pred_original_sample = self._threshold_sample(pred_original_sample)
         elif self.config.clip_sample:
             if len(model_output) > 1:
                 pred_original_sample = [
-                    (p_o_s * scale).clamp(-self.config.clip_sample_range, self.config.clip_sample_range) / scale
-                    for p_o_s, scale in zip(pred_original_sample, self.scales)
+                    p_o_s.clamp(-self.config.clip_sample_range, self.config.clip_sample_range)
+                    for p_o_s in pred_original_sample
                 ]
             else:
                 pred_original_sample = pred_original_sample.clamp(
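With the per-level `self.scales` factored out, thresholding and clipping now act directly on each predicted sample. This is not a pure refactor: `(x * scale).clamp(-r, r) / scale` effectively clips at ±r/scale, whereas `x.clamp(-r, r)` clips at ±r. A toy comparison with arbitrary stand-in values:

```python
import torch

x = torch.tensor([-3.0, -0.8, 0.2, 2.5])  # stand-in for one pred_original_sample
scale, r = 2.0, 1.0                        # stand-ins for a self.scales entry and clip_sample_range

old = (x * scale).clamp(-r, r) / scale     # previous behaviour: effective range is ±r/scale
new = x.clamp(-r, r)                       # behaviour after this commit: range is ±r

print(old)  # tensor([-0.5000, -0.5000,  0.2000,  0.5000])
print(new)  # tensor([-1.0000, -0.8000,  0.2000,  1.0000])
```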
@@ -1440,7 +1442,7 @@ class MatryoshkaTransformerBlock(nn.Module):
             bias=True,
             upcast_attention=upcast_attention,
             pre_only=True,
-            processor=MatryoshkaFusedAttnProcessor1_0_or_2_0(),
+            processor=MatryoshkaFusedAttnProcessor2_0(),
         )
         self.attn1.fuse_projections()
         del self.attn1.to_q
@@ -1458,7 +1460,7 @@ class MatryoshkaTransformerBlock(nn.Module):
             bias=True,
             upcast_attention=upcast_attention,
             pre_only=True,
-            processor=MatryoshkaFusedAttnProcessor1_0_or_2_0(),
+            processor=MatryoshkaFusedAttnProcessor2_0(),
         )
         self.attn2.fuse_projections()
         del self.attn2.to_q
@@ -1517,7 +1519,6 @@ class MatryoshkaTransformerBlock(nn.Module):
                 # **cross_attention_kwargs,
             )
 
-            # attn_output_cond = attn_output_cond.permute(0, 2, 1).contiguous()
             attn_output_cond = self.proj_out(attn_output_cond)
             attn_output_cond = attn_output_cond.permute(0, 2, 1).reshape(batch_size, channels, *spatial_dims)
             hidden_states = hidden_states + attn_output_cond
@@ -1535,7 +1536,7 @@ class MatryoshkaTransformerBlock(nn.Module):
         return hidden_states
 
 
-class MatryoshkaFusedAttnProcessor1_0_or_2_0:
+class MatryoshkaFusedAttnProcessor2_0:
     r"""
     Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). It uses
     fused projection layers. For self-attention modules, all projection matrices (i.e., query, key, value) are fused.
@@ -1548,28 +1549,11 @@ class MatryoshkaFusedAttnProcessor1_0_or_2_0:
     </Tip>
     """
 
-    # def __init__(self):
-    #     if not hasattr(F, "scaled_dot_product_attention"):
-    #         raise ImportError(
-    #             "MatryoshkaFusedAttnProcessor2_0 requires PyTorch 2.x, to use it. Please upgrade PyTorch to > 2.x."
-    #         )
-
-    # TODO: They seem to give different results; but nevertheless can I replace this with torch.nn.functional.scaled_dot_product_attention()?
-    def attention(self, q, k, v, num_heads, mask=None):
-        bs, width, length = q.shape
-        ch = width // num_heads
-        scale = 1 / torch.sqrt(torch.sqrt(torch.tensor(ch)))
-        weight = torch.einsum(
-            "bct,bcs->bts",
-            (q * scale).reshape(bs * num_heads, ch, length),
-            (k * scale).reshape(bs * num_heads, ch, -1),
-        )  # More stable with f16 than dividing afterwards
-        if mask is not None:
-            mask = mask.view(mask.size(0), 1, 1, mask.size(-1)).repeat(1, num_heads, 1, 1).flatten(0, 1)
-            weight = weight.masked_fill(mask == 0, float("-inf"))
-        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
-        a = torch.einsum("bts,bcs->bct", weight, v.reshape(bs * num_heads, ch, -1))
-        return a.reshape(bs, -1, length)
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "MatryoshkaFusedAttnProcessor2_0 requires PyTorch 2.x, to use it. Please upgrade PyTorch to > 2.x."
+            )
 
     def __call__(
         self,
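The deleted `attention` helper is the hand-rolled einsum path that the removed TODO asked about; the processor now relies solely on `F.scaled_dot_product_attention`, and the previously commented-out PyTorch 2.x check becomes a real `__init__` guard. A standalone sanity check (not part of the file) suggesting the two paths agree when no mask is used; note the old helper expects a channel-first (batch, heads * head_dim, seq) layout, so the inputs have to be transposed for SDPA:

```python
import torch
import torch.nn.functional as F

def legacy_attention(q, k, v, num_heads):
    # Copy of the einsum attention removed in this commit, minus the mask branch.
    bs, width, length = q.shape
    ch = width // num_heads
    scale = 1 / torch.sqrt(torch.sqrt(torch.tensor(ch, dtype=q.dtype)))
    weight = torch.einsum(
        "bct,bcs->bts",
        (q * scale).reshape(bs * num_heads, ch, length),
        (k * scale).reshape(bs * num_heads, ch, -1),
    )
    weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
    a = torch.einsum("bts,bcs->bct", weight, v.reshape(bs * num_heads, ch, -1))
    return a.reshape(bs, -1, length)

bs, heads, ch, length = 2, 4, 8, 16
q, k, v = (torch.randn(bs, heads * ch, length) for _ in range(3))

# Same computation via SDPA: reshape to (batch * heads, seq, head_dim) first.
q_t, k_t, v_t = (t.reshape(bs * heads, ch, length).transpose(1, 2) for t in (q, k, v))
sdpa = F.scaled_dot_product_attention(q_t, k_t, v_t)  # default scale = 1 / sqrt(head_dim)
sdpa = sdpa.transpose(1, 2).reshape(bs, heads * ch, length)

print(torch.allclose(legacy_attention(q, k, v, heads), sdpa, atol=1e-5))  # True, up to float error
```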
@@ -1593,26 +1577,12 @@ class MatryoshkaFusedAttnProcessor1_0_or_2_0:
 
         input_ndim = hidden_states.ndim
 
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            # hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
-            # batch_size, sequence_length, _ = (
-            #     hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-            # )
-
-            # if attention_mask is not None:
-            #     attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-            #     # scaled_dot_product_attention expects attention_mask shape to be
-            #     # (batch, heads, source_length, target_length)
-            #     attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
-
         if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states)  # .transpose(1, 2)).transpose(1, 2)
+            hidden_states = attn.group_norm(hidden_states)
 
-        # Reshape hidden_states to 2D tensor
-        hidden_states = hidden_states.view(batch_size, channel, height * width).permute(0, 2, 1).contiguous()
-        # Now hidden_states.shape is [batch_size, height * width, channels]
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2).contiguous()
 
         if encoder_hidden_states is None:
             qkv = attn.to_qkv(hidden_states)
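After this reordering, `group_norm` is applied to the 4D feature map first, and the flatten to a token sequence (now `transpose(1, 2)` instead of `permute(0, 2, 1)`) happens right before the fused QKV projection. A toy shape walk-through with made-up dimensions:

```python
import torch
from torch import nn

B, C, H, W = 2, 8, 4, 4                                   # arbitrary example sizes
hidden_states = torch.randn(B, C, H, W)
group_norm = nn.GroupNorm(num_groups=4, num_channels=C)   # stand-in for attn.group_norm

hidden_states = group_norm(hidden_states)                                      # (B, C, H, W)
hidden_states = hidden_states.view(B, C, H * W).transpose(1, 2).contiguous()   # (B, H*W, C)
print(hidden_states.shape)  # torch.Size([2, 16, 8])
```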
@@ -1630,11 +1600,6 @@ class MatryoshkaFusedAttnProcessor1_0_or_2_0:
             split_size = kv.shape[-1] // 2
             key, value = torch.split(kv, split_size, dim=-1)
 
-        # if self_attention_output is None:
-        #     query = query.permute(0, 2, 1)
-        #     key = key.permute(0, 2, 1)
-        #     value = value.permute(0, 2, 1)
-
         if attn.norm_q is not None:
             query = attn.norm_q(query)
         if attn.norm_k is not None:
@@ -1659,16 +1624,6 @@ class MatryoshkaFusedAttnProcessor1_0_or_2_0:
             query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
         )
 
-        # the output of sdp = (batch, num_heads, seq_len, head_dim)
-        # TODO: add support for attn.scale when we move to Torch 2.1 if F.scaled_dot_product_attention() is available
-        # hidden_states = self.attention(
-        #     query,
-        #     key,
-        #     value,
-        #     mask=attention_mask,
-        #     num_heads=attn.heads,
-        # )
-
         hidden_states = hidden_states.to(query.dtype)
 
         if self_attention_output is not None:
@@ -1956,7 +1911,7 @@ class MatryoshkaCombinedTimestepTextEmbedding(nn.Module):
             # if self.cond_emb is not None and not added_cond_kwargs.get("from_nested", False):
             return temb_micro_conditioning, conditioning_mask, cond_emb
 
-        return cond_emb, conditioning_mask, cond_emb
+        return None, conditioning_mask, cond_emb
 
 
 @dataclass
@@ -3184,7 +3139,7 @@ class MatryoshkaUNet2DConditionModel(
                     encoder_hidden_states=encoder_hidden_states,
                     attention_mask=attention_mask,
                     cross_attention_kwargs=cross_attention_kwargs,
-                    encoder_attention_mask=encoder_attention_mask,  # cond_mask?
+                    encoder_attention_mask=encoder_attention_mask,
                     **additional_residuals,
                 )
             else:
@@ -3214,7 +3169,7 @@ class MatryoshkaUNet2DConditionModel(
                     encoder_hidden_states=encoder_hidden_states,
                     attention_mask=attention_mask,
                     cross_attention_kwargs=cross_attention_kwargs,
-                    encoder_attention_mask=encoder_attention_mask,  # cond_mask?
+                    encoder_attention_mask=encoder_attention_mask,
                 )
             else:
                 sample = self.mid_block(sample, emb)
@@ -3251,7 +3206,7 @@ class MatryoshkaUNet2DConditionModel(
                     cross_attention_kwargs=cross_attention_kwargs,
                     upsample_size=upsample_size,
                     attention_mask=attention_mask,
-                    encoder_attention_mask=encoder_attention_mask,  # cond_mask?
+                    encoder_attention_mask=encoder_attention_mask,
                 )
             else:
                 sample = upsample_block(
@@ -3699,7 +3654,7 @@ class NestedUNet2DConditionModel(MatryoshkaUNet2DConditionModel):
                     cross_attention_kwargs=cross_attention_kwargs,
                     upsample_size=upsample_size,
                     attention_mask=attention_mask,
-                    encoder_attention_mask=cond_mask[:bh] if cond_mask is not None else cond_mask,  # cond_mask?
+                    encoder_attention_mask=cond_mask[:bh] if cond_mask is not None else cond_mask,
                 )
             else:
                 sample = upsample_block(
@@ -3863,6 +3818,8 @@ class MatryoshkaPipeline(
 
         if hasattr(unet, "nest_ratio"):
             scheduler.scales = unet.nest_ratio + [1]
+            if nesting_level == 2:
+                scheduler.schedule_shifted_power = 2.0
 
         self.register_modules(
             text_encoder=text_encoder,
@@ -3889,12 +3846,14 @@ class MatryoshkaPipeline(
             ).to(self.device)
             self.config.nesting_level = 1
             self.scheduler.scales = self.unet.nest_ratio + [1]
+            self.scheduler.schedule_shifted_power = 1.0
         elif nesting_level == 2:
             self.unet = NestedUNet2DConditionModel.from_pretrained(
                 "tolgacangoz/matryoshka-diffusion-models", subfolder="unet/nesting_level_2"
            ).to(self.device)
             self.config.nesting_level = 2
             self.scheduler.scales = self.unet.nest_ratio + [1]
+            self.scheduler.schedule_shifted_power = 2.0
         else:
             raise ValueError("Currently, nesting levels 0, 1, and 2 are supported.")
 
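Together with the constructor change above, the scheduler's shift power now tracks the nesting level: 2.0 when nesting level 2 is active, 1.0 otherwise. A hypothetical helper (not in the file) that condenses the rule these two hunks implement:

```python
def schedule_shifted_power_for(nesting_level: int) -> float:
    # Mirrors the MatryoshkaPipeline wiring: only nesting level 2 squares the schedule shift.
    return 2.0 if nesting_level == 2 else 1.0

assert schedule_shifted_power_for(1) == 1.0
assert schedule_shifted_power_for(2) == 2.0
```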