origordon committed · Commit 07ddecf · 1 Parent(s): c4b2a35

Decoder: add AttentionResBlocks block

1. Support an attention block after each residual block in the decoder.
2. Add TPU flash attention support.

xora/models/autoencoders/causal_video_autoencoder.py CHANGED
 
@@ -9,10 +9,12 @@ import numpy as np
 from einops import rearrange
 from torch import nn
 from diffusers.utils import logging
+import torch.nn.functional as F

 from xora.models.autoencoders.conv_nd_factory import make_conv_nd, make_linear_nd
 from xora.models.autoencoders.pixel_norm import PixelNorm
 from xora.models.autoencoders.vae import AutoencoderKLWrapper
+from xora.models.transformers.attention import Attention

 logger = logging.get_logger(__name__) # pylint: disable=invalid-name
 
@@ -212,6 +214,12 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
         last_layer = self.decoder.layers[-1]
         return last_layer

+    def set_use_tpu_flash_attention(self):
+        for block in self.decoder.up_blocks:
+            if isinstance(block, AttentionResBlocks):
+                for attention_block in block.attention_blocks:
+                    attention_block.set_use_tpu_flash_attention()
+

 class Encoder(nn.Module):
     r"""
 
@@ -483,6 +491,16 @@ class Decoder(nn.Module):
                     norm_layer=norm_layer,
                     inject_noise=block_params.get("inject_noise", False),
                 )
+            elif block_name == "attn_res_x":
+                block = AttentionResBlocks(
+                    dims=dims,
+                    in_channels=input_channel,
+                    num_layers=block_params["num_layers"],
+                    resnet_groups=norm_num_groups,
+                    norm_layer=norm_layer,
+                    attention_head_dim=block_params["attention_head_dim"],
+                    inject_noise=block_params.get("inject_noise", False),
+                )
             elif block_name == "res_x_y":
                 output_channel = output_channel // block_params.get("multiplier", 2)
                 block = ResnetBlock3D(
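For context, the "attn_res_x" branch is selected from the decoder's block list in the same way as the existing block types. A hypothetical block spec is sketched below; the (block_name, block_params) tuple layout is inferred from the surrounding Decoder.__init__ code, and every concrete value is illustrative only:

# Illustrative only: "num_layers", "attention_head_dim" and "inject_noise" are the
# keys this hunk reads; the rest of the spec is an assumption.
blocks = [
    ("attn_res_x", {"num_layers": 2, "attention_head_dim": 64, "inject_noise": False}),
    ("res_x_y", {"multiplier": 2}),
]
# A Decoder configured with blocks=blocks would then build an AttentionResBlocks
# for the first entry and a ResnetBlock3D for the second.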
 
@@ -558,6 +576,129 @@ class Decoder(nn.Module):
         return sample


+class AttentionResBlocks(nn.Module):
+    """
+    A 3D convolution residual block followed by a self-attention residual block.
+
+    Args:
+        dims (`int` or `Tuple[int, int]`): The number of dimensions to use in convolutions.
+        in_channels (`int`): The number of input channels.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
+        num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
+        resnet_eps (`float`, *optional*, defaults to 1e-6): The epsilon value for the resnet blocks.
+        resnet_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use in the group normalization layers of the resnet blocks.
+        norm_layer (`str`, *optional*, defaults to `group_norm`): The normalization layer to use.
+        attention_head_dim (`int`, *optional*, defaults to 64): The dimension of the attention heads.
+        inject_noise (`bool`, *optional*, defaults to `False`): Whether to inject noise between convolution layers.
+
+    Returns:
+        `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
+        in_channels, frames, height, width)`.
+    """
+
+    def __init__(
+        self,
+        dims: Union[int, Tuple[int, int]],
+        in_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_groups: int = 32,
+        norm_layer: str = "group_norm",
+        attention_head_dim: int = 64,
+        inject_noise: bool = False,
+    ):
+        super().__init__()
+
+        if attention_head_dim > in_channels:
+            raise ValueError(
+                "attention_head_dim must be less than or equal to in_channels"
+            )
+
+        resnet_groups = (
+            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        )
+
+        self.res_blocks = []
+        self.attention_blocks = []
+        for i in range(num_layers):
+            self.res_blocks.append(
+                ResnetBlock3D(
+                    dims=dims,
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    norm_layer=norm_layer,
+                    inject_noise=inject_noise,
+                )
+            )
+            self.attention_blocks.append(
+                Attention(
+                    query_dim=in_channels,
+                    heads=in_channels // attention_head_dim,
+                    dim_head=attention_head_dim,
+                    bias=True,
+                    out_bias=True,
+                    qk_norm="rms_norm",
+                    residual_connection=True,
+                )
+            )
+
+        self.res_blocks = nn.ModuleList(self.res_blocks)
+        self.attention_blocks = nn.ModuleList(self.attention_blocks)
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, causal: bool = True
+    ) -> torch.FloatTensor:
+        for resnet, attention in zip(self.res_blocks, self.attention_blocks):
+            hidden_states = resnet(hidden_states, causal=causal)
+
+            # Reshape the hidden states to be (batch_size, frames * height * width, channel)
+            batch_size, channel, frames, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(
+                batch_size, channel, frames * height * width
+            ).transpose(1, 2)
+
+            if attention.use_tpu_flash_attention:
+                # Pad the second dimension to be divisible by block_k_major (the block size used by flash attention)
+                seq_len = hidden_states.shape[1]
+                block_k_major = 512
+                pad_len = (block_k_major - seq_len % block_k_major) % block_k_major
+                if pad_len > 0:
+                    hidden_states = F.pad(
+                        hidden_states, (0, 0, 0, pad_len), "constant", 0
+                    )
+
+                # Create a mask with ones for the original sequence length and zeros for the padded indexes
+                mask = torch.ones(
+                    (hidden_states.shape[0], seq_len),
+                    device=hidden_states.device,
+                    dtype=hidden_states.dtype,
+                )
+                if pad_len > 0:
+                    mask = F.pad(mask, (0, pad_len), "constant", 0)
+
+            hidden_states = attention(
+                hidden_states,
+                attention_mask=None if not attention.use_tpu_flash_attention else mask,
+            )
+
+            if attention.use_tpu_flash_attention:
+                # Remove the padding
+                if pad_len > 0:
+                    hidden_states = hidden_states[:, :-pad_len, :]
+
+            # Reshape the hidden states back to (batch_size, channel, frames, height, width)
+            hidden_states = hidden_states.transpose(-1, -2).reshape(
+                batch_size, channel, frames, height, width
+            )
+        return hidden_states
+
+
 class UNetMidBlock3D(nn.Module):
     """
     A 3D UNet mid-block [`UNetMidBlock3D`] with multiple residual blocks.
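To make the data flow concrete, here is a small, self-contained sketch of the new block on a dummy latent, assuming the xora package is importable and ResnetBlock3D and Attention behave as shown in this diff; the channel and frame sizes are arbitrary:

import torch
from xora.models.autoencoders.causal_video_autoencoder import AttentionResBlocks

# Two residual blocks, each followed by self-attention with 128 // 64 = 2 heads.
# dims=3 (plain 3D convolutions) is an assumed valid setting for make_conv_nd.
block = AttentionResBlocks(dims=3, in_channels=128, num_layers=2, attention_head_dim=64)

latent = torch.randn(1, 128, 4, 16, 16)  # (batch, channels, frames, height, width)
out = block(latent, causal=True)         # attention runs over 4 * 16 * 16 = 1024 tokens
print(out.shape)                         # shape is preserved: torch.Size([1, 128, 4, 16, 16])

With TPU flash attention enabled, the 1024-token sequence is already a multiple of block_k_major = 512, so pad_len is 0; a smaller latent such as 4 x 10 x 10 (400 tokens) would be padded to 512, the padded positions masked out during attention, and the padding cropped afterwards.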