LaVie / vsr /models /diffusers_attention.py

thanks to Vchitect ❤

2f760fe about 1 year ago

43.1 kB

	# Copyright 2022 The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	import math
	from dataclasses import dataclass
	from typing import Optional

	import torch
	import torch.nn.functional as F
	from torch import nn

	from diffusers.configuration_utils import ConfigMixin, register_to_config
	from diffusers.models.modeling_utils import ModelMixin
	from diffusers.models.embeddings import ImagePositionalEmbeddings
	from diffusers.utils import BaseOutput
	from diffusers.utils.import_utils import is_xformers_available
	from diffusers.models.attention import FeedForward, AdaLayerNorm
	from diffusers.models.attention_processor import Attention


	@dataclass
	class Transformer2DModelOutput(BaseOutput):
	"""
	Args:
	sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
	Hidden states conditioned on `encoder_hidden_states` input. If discrete, returns probability distributions
	for the unnoised latent pixels.
	"""

	sample: torch.FloatTensor


	if is_xformers_available():
	import xformers
	import xformers.ops
	else:
	xformers = None


	class Transformer2DModel(ModelMixin, ConfigMixin):
	"""
	Transformer model for image-like data. Takes either discrete (classes of vector embeddings) or continuous (actual
	embeddings) inputs.

	When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard
	transformer action. Finally, reshape to image.

	When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional
	embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict
	classes of unnoised image.

	Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised
	image do not contain a prediction for the masked pixel as the unnoised image cannot be masked.

	Parameters:
	num_attention_heads (`int`, optional, defaults to 16): The number of heads to use for multi-head attention.
	attention_head_dim (`int`, optional, defaults to 88): The number of channels in each head.
	in_channels (`int`, optional):
	Pass if the input is continuous. The number of channels in the input and output.
	num_layers (`int`, optional, defaults to 1): The number of layers of Transformer blocks to use.
	dropout (`float`, optional, defaults to 0.0): The dropout probability to use.
	cross_attention_dim (`int`, optional): The number of encoder_hidden_states dimensions to use.
	sample_size (`int`, optional): Pass if the input is discrete. The width of the latent images.
	Note that this is fixed at training time as it is used for learning a number of position embeddings. See
	`ImagePositionalEmbeddings`.
	num_vector_embeds (`int`, optional):
	Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
	Includes the class for the masked latent pixel.
	activation_fn (`str`, optional, defaults to `"geglu"`): Activation function to be used in feed-forward.
	num_embeds_ada_norm ( `int`, optional): Pass if at least one of the norm_layers is `AdaLayerNorm`.
	The number of diffusion steps used during training. Note that this is fixed at training time as it is used
	to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
	up to but not more than steps than `num_embeds_ada_norm`.
	attention_bias (`bool`, optional):
	Configure if the TransformerBlocks' attention should contain a bias parameter.
	"""

	@register_to_config
	def __init__(
	self,
	num_attention_heads: int = 16,
	attention_head_dim: int = 88,
	in_channels: Optional[int] = None,
	num_layers: int = 1,
	dropout: float = 0.0,
	norm_num_groups: int = 32,
	cross_attention_dim: Optional[int] = None,
	attention_bias: bool = False,
	sample_size: Optional[int] = None,
	num_vector_embeds: Optional[int] = None,
	activation_fn: str = "geglu",
	num_embeds_ada_norm: Optional[int] = None,
	use_linear_projection: bool = False,
	only_cross_attention: bool = False,
	upcast_attention: bool = False,
	):
	super().__init__()
	self.use_linear_projection = use_linear_projection
	self.num_attention_heads = num_attention_heads
	self.attention_head_dim = attention_head_dim
	inner_dim = num_attention_heads * attention_head_dim

	# 1. Transformer2DModel can process both standard continous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
	# Define whether input is continuous or discrete depending on configuration
	self.is_input_continuous = in_channels is not None
	self.is_input_vectorized = num_vector_embeds is not None

	if self.is_input_continuous and self.is_input_vectorized:
	raise ValueError(
	f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
	" sure that either `in_channels` or `num_vector_embeds` is None."
	)
	elif not self.is_input_continuous and not self.is_input_vectorized:
	raise ValueError(
	f"Has to define either `in_channels`: {in_channels} or `num_vector_embeds`: {num_vector_embeds}. Make"
	" sure that either `in_channels` or `num_vector_embeds` is not None."
	)

	# 2. Define input layers
	if self.is_input_continuous:
	self.in_channels = in_channels

	self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
	if use_linear_projection:
	self.proj_in = nn.Linear(in_channels, inner_dim)
	else:
	self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
	elif self.is_input_vectorized:
	assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
	assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"

	self.height = sample_size
	self.width = sample_size
	self.num_vector_embeds = num_vector_embeds
	self.num_latent_pixels = self.height * self.width

	self.latent_image_embedding = ImagePositionalEmbeddings(
	num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
	)

	# 3. Define transformers blocks
	self.transformer_blocks = nn.ModuleList(
	[
	BasicTransformerBlock(
	inner_dim,
	num_attention_heads,
	attention_head_dim,
	dropout=dropout,
	cross_attention_dim=cross_attention_dim,
	activation_fn=activation_fn,
	num_embeds_ada_norm=num_embeds_ada_norm,
	attention_bias=attention_bias,
	only_cross_attention=only_cross_attention,
	upcast_attention=upcast_attention,
	)
	for d in range(num_layers)
	]
	)

	# 4. Define output layers
	if self.is_input_continuous:
	if use_linear_projection:
	self.proj_out = nn.Linear(in_channels, inner_dim)
	else:
	self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
	elif self.is_input_vectorized:
	self.norm_out = nn.LayerNorm(inner_dim)
	self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)

	def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, return_dict: bool = True):
	"""
	Args:
	hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
	When continous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
	hidden_states
	encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, optional):
	Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
	self-attention.
	timestep ( `torch.long`, optional):
	Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
	return_dict (`bool`, optional, defaults to `True`):
	Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.

	Returns:
	[`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`]
	if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample
	tensor.
	"""
	# 1. Input
	if self.is_input_continuous:
	batch, channel, height, weight = hidden_states.shape
	residual = hidden_states

	hidden_states = self.norm(hidden_states)
	if not self.use_linear_projection:
	hidden_states = self.proj_in(hidden_states)
	inner_dim = hidden_states.shape[1]
	hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
	else:
	inner_dim = hidden_states.shape[1]
	hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
	hidden_states = self.proj_in(hidden_states)
	elif self.is_input_vectorized:
	hidden_states = self.latent_image_embedding(hidden_states)

	# 2. Blocks
	for block in self.transformer_blocks:
	hidden_states = block(hidden_states, encoder_hidden_states=encoder_hidden_states, timestep=timestep)

	# 3. Output
	if self.is_input_continuous:
	if not self.use_linear_projection:
	hidden_states = (
	hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
	)
	hidden_states = self.proj_out(hidden_states)
	else:
	hidden_states = self.proj_out(hidden_states)
	hidden_states = (
	hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
	)

	output = hidden_states + residual
	elif self.is_input_vectorized:
	hidden_states = self.norm_out(hidden_states)
	logits = self.out(hidden_states)
	# (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
	logits = logits.permute(0, 2, 1)

	# log(p(x_0))
	output = F.log_softmax(logits.double(), dim=1).float()

	if not return_dict:
	return (output,)

	return Transformer2DModelOutput(sample=output)


	class AttentionBlock(nn.Module):
	"""
	An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
	to the N-d case.
	https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
	Uses three q, k, v linear layers to compute attention.

	Parameters:
	channels (`int`): The number of channels in the input and output.
	num_head_channels (`int`, optional):
	The number of channels in each head. If None, then `num_heads` = 1.
	norm_num_groups (`int`, optional, defaults to 32): The number of groups to use for group norm.
	rescale_output_factor (`float`, optional, defaults to 1.0): The factor to rescale the output by.
	eps (`float`, optional, defaults to 1e-5): The epsilon value to use for group norm.
	"""

	# IMPORTANT;TODO(Patrick, William) - this class will be deprecated soon. Do not use it anymore

	def __init__(
	self,
	channels: int,
	num_head_channels: Optional[int] = None,
	norm_num_groups: int = 32,
	rescale_output_factor: float = 1.0,
	eps: float = 1e-5,
	):
	super().__init__()
	self.channels = channels

	self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
	self.num_head_size = num_head_channels
	self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=norm_num_groups, eps=eps, affine=True)

	# define q,k,v as linear layers
	self.query = nn.Linear(channels, channels)
	self.key = nn.Linear(channels, channels)
	self.value = nn.Linear(channels, channels)

	self.rescale_output_factor = rescale_output_factor
	self.proj_attn = nn.Linear(channels, channels, 1)

	self._use_memory_efficient_attention_xformers = False

	def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool, attention_op: None):
	if not is_xformers_available():
	raise ModuleNotFoundError(
	"Refer to https://github.com/facebookresearch/xformers for more information on how to install"
	" xformers",
	name="xformers",
	)
	elif not torch.cuda.is_available():
	raise ValueError(
	"torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only"
	" available for GPU "
	)
	else:
	try:
	# Make sure we can run the memory efficient attention
	_ = xformers.ops.memory_efficient_attention(
	torch.randn((1, 2, 40), device="cuda"),
	torch.randn((1, 2, 40), device="cuda"),
	torch.randn((1, 2, 40), device="cuda"),
	)
	except Exception as e:
	raise e
	self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers

	def reshape_heads_to_batch_dim(self, tensor):
	batch_size, seq_len, dim = tensor.shape
	head_size = self.num_heads
	tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
	tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)
	return tensor

	def reshape_batch_dim_to_heads(self, tensor):
	batch_size, seq_len, dim = tensor.shape
	head_size = self.num_heads
	tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
	tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
	return tensor

	def forward(self, hidden_states):
	residual = hidden_states
	batch, channel, height, width = hidden_states.shape

	# norm
	hidden_states = self.group_norm(hidden_states)

	hidden_states = hidden_states.view(batch, channel, height * width).transpose(1, 2)

	# proj to q, k, v
	query_proj = self.query(hidden_states)
	key_proj = self.key(hidden_states)
	value_proj = self.value(hidden_states)

	scale = 1 / math.sqrt(self.channels / self.num_heads)

	query_proj = self.reshape_heads_to_batch_dim(query_proj)
	key_proj = self.reshape_heads_to_batch_dim(key_proj)
	value_proj = self.reshape_heads_to_batch_dim(value_proj)

	if self._use_memory_efficient_attention_xformers:
	# Memory efficient attention
	hidden_states = xformers.ops.memory_efficient_attention(query_proj, key_proj, value_proj, attn_bias=None)
	hidden_states = hidden_states.to(query_proj.dtype)
	else:
	attention_scores = torch.baddbmm(
	torch.empty(
	query_proj.shape[0],
	query_proj.shape[1],
	key_proj.shape[1],
	dtype=query_proj.dtype,
	device=query_proj.device,
	),
	query_proj,
	key_proj.transpose(-1, -2),
	beta=0,
	alpha=scale,
	)
	attention_probs = torch.softmax(attention_scores.float(), dim=-1).type(attention_scores.dtype)
	hidden_states = torch.bmm(attention_probs, value_proj)

	# reshape hidden_states
	hidden_states = self.reshape_batch_dim_to_heads(hidden_states)

	# compute next hidden_states
	hidden_states = self.proj_attn(hidden_states)

	hidden_states = hidden_states.transpose(-1, -2).reshape(batch, channel, height, width)

	# res connect and rescale
	hidden_states = (hidden_states + residual) / self.rescale_output_factor
	return hidden_states


	class BasicTransformerBlock(nn.Module):
	r"""
	A basic Transformer block.

	Parameters:
	dim (`int`): The number of channels in the input and output.
	num_attention_heads (`int`): The number of heads to use for multi-head attention.
	attention_head_dim (`int`): The number of channels in each head.
	dropout (`float`, optional, defaults to 0.0): The dropout probability to use.
	cross_attention_dim (`int`, optional): The size of the encoder_hidden_states vector for cross attention.
	activation_fn (`str`, optional, defaults to `"geglu"`): Activation function to be used in feed-forward.
	num_embeds_ada_norm (:
	obj: `int`, optional): The number of diffusion steps used during training. See `Transformer2DModel`.
	attention_bias (:
	obj: `bool`, optional, defaults to `False`): Configure if the attentions should contain a bias parameter.
	"""

	def __init__(
	self,
	dim: int,
	num_attention_heads: int,
	attention_head_dim: int,
	dropout=0.0,
	cross_attention_dim: Optional[int] = None,
	activation_fn: str = "geglu",
	num_embeds_ada_norm: Optional[int] = None,
	attention_bias: bool = False,
	only_cross_attention: bool = False,
	upcast_attention: bool = False,
	):
	super().__init__()
	self.only_cross_attention = only_cross_attention
	self.use_ada_layer_norm = num_embeds_ada_norm is not None

	# 1. Self-Attn
	self.attn1 = CrossAttention(
	query_dim=dim,
	heads=num_attention_heads,
	dim_head=attention_head_dim,
	dropout=dropout,
	bias=attention_bias,
	cross_attention_dim=cross_attention_dim if only_cross_attention else None,
	upcast_attention=upcast_attention,
	) # is a self-attention
	self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)

	# 2. Cross-Attn
	if cross_attention_dim is not None:
	self.attn2 = CrossAttention(
	query_dim=dim,
	cross_attention_dim=cross_attention_dim,
	heads=num_attention_heads,
	dim_head=attention_head_dim,
	dropout=dropout,
	bias=attention_bias,
	upcast_attention=upcast_attention,
	) # is self-attn if encoder_hidden_states is none
	else:
	self.attn2 = None

	self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)

	if cross_attention_dim is not None:
	self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
	else:
	self.norm2 = None

	# 3. Feed-forward
	self.norm3 = nn.LayerNorm(dim)

	def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool, attention_op: None):
	if not is_xformers_available():
	print("Here is how to install it")
	raise ModuleNotFoundError(
	"Refer to https://github.com/facebookresearch/xformers for more information on how to install"
	" xformers",
	name="xformers",
	)
	elif not torch.cuda.is_available():
	raise ValueError(
	"torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only"
	" available for GPU "
	)
	else:
	try:
	# Make sure we can run the memory efficient attention
	_ = xformers.ops.memory_efficient_attention(
	torch.randn((1, 2, 40), device="cuda"),
	torch.randn((1, 2, 40), device="cuda"),
	torch.randn((1, 2, 40), device="cuda"),
	)
	except Exception as e:
	raise e
	self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
	if self.attn2 is not None:
	self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers

	def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, attention_mask=None):
	# 1. Self-Attention
	norm_hidden_states = (
	self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)
	)

	if self.only_cross_attention:
	hidden_states = (
	self.attn1(norm_hidden_states, encoder_hidden_states, attention_mask=attention_mask) + hidden_states
	)
	else:
	hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask) + hidden_states

	if self.attn2 is not None:
	# 2. Cross-Attention
	norm_hidden_states = (
	self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
	)
	hidden_states = (
	self.attn2(
	norm_hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask
	)
	+ hidden_states
	)

	# 3. Feed-forward
	hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states

	return hidden_states


	class CrossAttention(nn.Module):
	r"""
	A cross attention layer.

	Parameters:
	query_dim (`int`): The number of channels in the query.
	cross_attention_dim (`int`, optional):
	The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
	heads (`int`, optional, defaults to 8): The number of heads to use for multi-head attention.
	dim_head (`int`, optional, defaults to 64): The number of channels in each head.
	dropout (`float`, optional, defaults to 0.0): The dropout probability to use.
	bias (`bool`, optional, defaults to False):
	Set to `True` for the query, key, and value linear layers to contain a bias parameter.
	"""

	def __init__(
	self,
	query_dim: int,
	cross_attention_dim: Optional[int] = None,
	heads: int = 8,
	dim_head: int = 64,
	dropout: float = 0.0,
	bias=False,
	upcast_attention: bool = False,
	upcast_softmax: bool = False,
	added_kv_proj_dim: Optional[int] = None,
	norm_num_groups: Optional[int] = None,
	):
	super().__init__()
	inner_dim = dim_head * heads
	cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
	self.upcast_attention = upcast_attention
	self.upcast_softmax = upcast_softmax

	self.scale = dim_head**-0.5

	self.heads = heads
	# for slice_size > 0 the attention score computation
	# is split across the batch axis to save memory
	# You can set slice_size with `set_attention_slice`
	self.sliceable_head_dim = heads
	self._slice_size = None
	self._use_memory_efficient_attention_xformers = False
	self.added_kv_proj_dim = added_kv_proj_dim

	if norm_num_groups is not None:
	self.group_norm = nn.GroupNorm(num_channels=inner_dim, num_groups=norm_num_groups, eps=1e-5, affine=True)
	else:
	self.group_norm = None

	self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
	self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
	self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)

	if self.added_kv_proj_dim is not None:
	self.add_k_proj = nn.Linear(added_kv_proj_dim, cross_attention_dim)
	self.add_v_proj = nn.Linear(added_kv_proj_dim, cross_attention_dim)

	self.to_out = nn.ModuleList([])
	self.to_out.append(nn.Linear(inner_dim, query_dim))
	self.to_out.append(nn.Dropout(dropout))

	def reshape_heads_to_batch_dim(self, tensor):
	batch_size, seq_len, dim = tensor.shape
	head_size = self.heads
	tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
	tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)
	return tensor

	def reshape_batch_dim_to_heads(self, tensor):
	batch_size, seq_len, dim = tensor.shape
	head_size = self.heads
	tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
	tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
	return tensor

	def set_attention_slice(self, slice_size):
	if slice_size is not None and slice_size > self.sliceable_head_dim:
	raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")

	self._slice_size = slice_size

	def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
	batch_size, sequence_length, _ = hidden_states.shape

	encoder_hidden_states = encoder_hidden_states

	if self.group_norm is not None:
	hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

	query = self.to_q(hidden_states)
	dim = query.shape[-1]
	query = self.reshape_heads_to_batch_dim(query)

	if self.added_kv_proj_dim is not None:
	key = self.to_k(hidden_states)
	value = self.to_v(hidden_states)
	encoder_hidden_states_key_proj = self.add_k_proj(encoder_hidden_states)
	encoder_hidden_states_value_proj = self.add_v_proj(encoder_hidden_states)

	key = self.reshape_heads_to_batch_dim(key)
	value = self.reshape_heads_to_batch_dim(value)
	encoder_hidden_states_key_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_key_proj)
	encoder_hidden_states_value_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_value_proj)

	key = torch.concat([encoder_hidden_states_key_proj, key], dim=1)
	value = torch.concat([encoder_hidden_states_value_proj, value], dim=1)
	else:
	encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
	key = self.to_k(encoder_hidden_states)
	value = self.to_v(encoder_hidden_states)

	key = self.reshape_heads_to_batch_dim(key)
	value = self.reshape_heads_to_batch_dim(value)

	if attention_mask is not None:
	if attention_mask.shape[-1] != query.shape[1]:
	target_length = query.shape[1]
	attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
	attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)

	# attention, what we cannot get enough of
	if self._use_memory_efficient_attention_xformers:
	hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
	# Some versions of xformers return output in fp32, cast it back to the dtype of the input
	hidden_states = hidden_states.to(query.dtype)
	else:
	if self._slice_size is None or query.shape[0] // self._slice_size == 1:
	hidden_states = self._attention(query, key, value, attention_mask)
	else:
	hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)

	# linear proj
	hidden_states = self.to_out[0](hidden_states)

	# dropout
	hidden_states = self.to_out[1](hidden_states)
	return hidden_states

	def _attention(self, query, key, value, attention_mask=None):
	if self.upcast_attention:
	query = query.float()
	key = key.float()

	attention_scores = torch.baddbmm(
	torch.empty(query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device),
	query,
	key.transpose(-1, -2),
	beta=0,
	alpha=self.scale,
	)

	if attention_mask is not None:
	attention_scores = attention_scores + attention_mask

	if self.upcast_softmax:
	attention_scores = attention_scores.float()

	attention_probs = attention_scores.softmax(dim=-1)

	# cast back to the original dtype
	attention_probs = attention_probs.to(value.dtype)

	# compute attention output
	hidden_states = torch.bmm(attention_probs, value)

	# reshape hidden_states
	hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
	return hidden_states

	def _sliced_attention(self, query, key, value, sequence_length, dim, attention_mask):
	batch_size_attention = query.shape[0]
	hidden_states = torch.zeros(
	(batch_size_attention, sequence_length, dim // self.heads), device=query.device, dtype=query.dtype
	)
	slice_size = self._slice_size if self._slice_size is not None else hidden_states.shape[0]
	for i in range(hidden_states.shape[0] // slice_size):
	start_idx = i * slice_size
	end_idx = (i + 1) * slice_size

	query_slice = query[start_idx:end_idx]
	key_slice = key[start_idx:end_idx]

	if self.upcast_attention:
	query_slice = query_slice.float()
	key_slice = key_slice.float()

	attn_slice = torch.baddbmm(
	torch.empty(slice_size, query.shape[1], key.shape[1], dtype=query_slice.dtype, device=query.device),
	query_slice,
	key_slice.transpose(-1, -2),
	beta=0,
	alpha=self.scale,
	)

	if attention_mask is not None:
	attn_slice = attn_slice + attention_mask[start_idx:end_idx]

	if self.upcast_softmax:
	attn_slice = attn_slice.float()

	attn_slice = attn_slice.softmax(dim=-1)

	# cast back to the original dtype
	attn_slice = attn_slice.to(value.dtype)
	attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])

	hidden_states[start_idx:end_idx] = attn_slice

	# reshape hidden_states
	hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
	return hidden_states

	def _memory_efficient_attention_xformers(self, query, key, value, attention_mask):
	query = query.contiguous()
	key = key.contiguous()
	value = value.contiguous()
	hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
	hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
	return hidden_states


	class FeedForward(nn.Module):
	r"""
	A feed-forward layer.

	Parameters:
	dim (`int`): The number of channels in the input.
	dim_out (`int`, optional): The number of channels in the output. If not given, defaults to `dim`.
	mult (`int`, optional, defaults to 4): The multiplier to use for the hidden dimension.
	dropout (`float`, optional, defaults to 0.0): The dropout probability to use.
	activation_fn (`str`, optional, defaults to `"geglu"`): Activation function to be used in feed-forward.
	"""

	def __init__(
	self,
	dim: int,
	dim_out: Optional[int] = None,
	mult: int = 4,
	dropout: float = 0.0,
	activation_fn: str = "geglu",
	):
	super().__init__()
	inner_dim = int(dim * mult)
	dim_out = dim_out if dim_out is not None else dim

	if activation_fn == "gelu":
	act_fn = GELU(dim, inner_dim)
	elif activation_fn == "geglu":
	act_fn = GEGLU(dim, inner_dim)
	elif activation_fn == "geglu-approximate":
	act_fn = ApproximateGELU(dim, inner_dim)

	self.net = nn.ModuleList([])
	# project in
	self.net.append(act_fn)
	# project dropout
	self.net.append(nn.Dropout(dropout))
	# project out
	self.net.append(nn.Linear(inner_dim, dim_out))

	def forward(self, hidden_states):
	for module in self.net:
	hidden_states = module(hidden_states)
	return hidden_states


	class GELU(nn.Module):
	r"""
	GELU activation function
	"""

	def __init__(self, dim_in: int, dim_out: int):
	super().__init__()
	self.proj = nn.Linear(dim_in, dim_out)

	def gelu(self, gate):
	if gate.device.type != "mps":
	return F.gelu(gate)
	# mps: gelu is not implemented for float16
	return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)

	def forward(self, hidden_states):
	hidden_states = self.proj(hidden_states)
	hidden_states = self.gelu(hidden_states)
	return hidden_states


	# feedforward
	class GEGLU(nn.Module):
	r"""
	A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.

	Parameters:
	dim_in (`int`): The number of channels in the input.
	dim_out (`int`): The number of channels in the output.
	"""

	def __init__(self, dim_in: int, dim_out: int):
	super().__init__()
	self.proj = nn.Linear(dim_in, dim_out * 2)

	def gelu(self, gate):
	if gate.device.type != "mps":
	return F.gelu(gate)
	# mps: gelu is not implemented for float16
	return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)

	def forward(self, hidden_states):
	hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
	return hidden_states * self.gelu(gate)


	class ApproximateGELU(nn.Module):
	"""
	The approximate form of Gaussian Error Linear Unit (GELU)

	For more details, see section 2: https://arxiv.org/abs/1606.08415
	"""

	def __init__(self, dim_in: int, dim_out: int):
	super().__init__()
	self.proj = nn.Linear(dim_in, dim_out)

	def forward(self, x):
	x = self.proj(x)
	return x * torch.sigmoid(1.702 * x)


	class AdaLayerNorm(nn.Module):
	"""
	Norm layer modified to incorporate timestep embeddings.
	"""

	def __init__(self, embedding_dim, num_embeddings):
	super().__init__()
	self.emb = nn.Embedding(num_embeddings, embedding_dim)
	self.silu = nn.SiLU()
	self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
	self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False)

	def forward(self, x, timestep):
	emb = self.linear(self.silu(self.emb(timestep)))
	scale, shift = torch.chunk(emb, 2)
	x = self.norm(x) * (1 + scale) + shift
	return x


	class DualTransformer2DModel(nn.Module):
	"""
	Dual transformer wrapper that combines two `Transformer2DModel`s for mixed inference.

	Parameters:
	num_attention_heads (`int`, optional, defaults to 16): The number of heads to use for multi-head attention.
	attention_head_dim (`int`, optional, defaults to 88): The number of channels in each head.
	in_channels (`int`, optional):
	Pass if the input is continuous. The number of channels in the input and output.
	num_layers (`int`, optional, defaults to 1): The number of layers of Transformer blocks to use.
	dropout (`float`, optional, defaults to 0.1): The dropout probability to use.
	cross_attention_dim (`int`, optional): The number of encoder_hidden_states dimensions to use.
	sample_size (`int`, optional): Pass if the input is discrete. The width of the latent images.
	Note that this is fixed at training time as it is used for learning a number of position embeddings. See
	`ImagePositionalEmbeddings`.
	num_vector_embeds (`int`, optional):
	Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
	Includes the class for the masked latent pixel.
	activation_fn (`str`, optional, defaults to `"geglu"`): Activation function to be used in feed-forward.
	num_embeds_ada_norm ( `int`, optional): Pass if at least one of the norm_layers is `AdaLayerNorm`.
	The number of diffusion steps used during training. Note that this is fixed at training time as it is used
	to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
	up to but not more than steps than `num_embeds_ada_norm`.
	attention_bias (`bool`, optional):
	Configure if the TransformerBlocks' attention should contain a bias parameter.
	"""

	def __init__(
	self,
	num_attention_heads: int = 16,
	attention_head_dim: int = 88,
	in_channels: Optional[int] = None,
	num_layers: int = 1,
	dropout: float = 0.0,
	norm_num_groups: int = 32,
	cross_attention_dim: Optional[int] = None,
	attention_bias: bool = False,
	sample_size: Optional[int] = None,
	num_vector_embeds: Optional[int] = None,
	activation_fn: str = "geglu",
	num_embeds_ada_norm: Optional[int] = None,
	):
	super().__init__()
	self.transformers = nn.ModuleList(
	[
	Transformer2DModel(
	num_attention_heads=num_attention_heads,
	attention_head_dim=attention_head_dim,
	in_channels=in_channels,
	num_layers=num_layers,
	dropout=dropout,
	norm_num_groups=norm_num_groups,
	cross_attention_dim=cross_attention_dim,
	attention_bias=attention_bias,
	sample_size=sample_size,
	num_vector_embeds=num_vector_embeds,
	activation_fn=activation_fn,
	num_embeds_ada_norm=num_embeds_ada_norm,
	)
	for _ in range(2)
	]
	)

	# Variables that can be set by a pipeline:

	# The ratio of transformer1 to transformer2's output states to be combined during inference
	self.mix_ratio = 0.5

	# The shape of `encoder_hidden_states` is expected to be
	# `(batch_size, condition_lengths[0]+condition_lengths[1], num_features)`
	self.condition_lengths = [77, 257]

	# Which transformer to use to encode which condition.
	# E.g. `(1, 0)` means that we'll use `transformers[1](conditions[0])` and `transformers[0](conditions[1])`
	self.transformer_index_for_condition = [1, 0]

	def forward(
	self, hidden_states, encoder_hidden_states, timestep=None, attention_mask=None, return_dict: bool = True
	):
	"""
	Args:
	hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
	When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
	hidden_states
	encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, optional):
	Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
	self-attention.
	timestep ( `torch.long`, optional):
	Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
	attention_mask (`torch.FloatTensor`, optional):
	Optional attention mask to be applied in CrossAttention
	return_dict (`bool`, optional, defaults to `True`):
	Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.

	Returns:
	[`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`]
	if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample
	tensor.
	"""
	input_states = hidden_states

	encoded_states = []
	tokens_start = 0
	# attention_mask is not used yet
	for i in range(2):
	# for each of the two transformers, pass the corresponding condition tokens
	condition_state = encoder_hidden_states[:, tokens_start : tokens_start + self.condition_lengths[i]]
	transformer_index = self.transformer_index_for_condition[i]
	encoded_state = self.transformers[transformer_index](
	input_states,
	encoder_hidden_states=condition_state,
	timestep=timestep,
	return_dict=False,
	)[0]
	encoded_states.append(encoded_state - input_states)
	tokens_start += self.condition_lengths[i]

	output_states = encoded_states[0] * self.mix_ratio + encoded_states[1] * (1 - self.mix_ratio)
	output_states = output_states + input_states

	if not return_dict:
	return (output_states,)

	return Transformer2DModelOutput(sample=output_states)