Spaces:
Freiburg-AI-Research
committed
Commit · a0c720e
1 Parent(s): 0990b67
Upload 6 files
- glide_text2im/clip/__init__.py +0 -0
- glide_text2im/clip/attention.py +179 -0
- glide_text2im/clip/config.yaml +18 -0
- glide_text2im/clip/encoders.py +497 -0
- glide_text2im/clip/model_creation.py +117 -0
- glide_text2im/clip/utils.py +97 -0
glide_text2im/clip/__init__.py
ADDED
File without changes
glide_text2im/clip/attention.py
ADDED
@@ -0,0 +1,179 @@
import math
from abc import ABC, abstractmethod
from itertools import product
from typing import Any, Optional

import attr
import numpy as np
import torch


@attr.s
class AttentionMask(ABC):
    query_context_size: int = attr.ib(validator=lambda i, a, x: x >= 1)  # type: ignore
    key_context_size: int = attr.ib(validator=lambda i, a, x: x >= 1)  # type: ignore
    block_size: int = attr.ib(validator=lambda i, a, x: x >= 1)  # type: ignore
    n_head: int = attr.ib(validator=lambda i, a, x: x >= 1)  # type: ignore
    is_head_specific: bool = attr.ib(default=False)
    n_query_pad: int = attr.ib(default=0)
    n_key_pad: int = attr.ib(default=0)

    def __attrs_post_init__(self) -> None:
        if self.query_context_size % self.block_size != 0:
            raise ValueError()
        if self.key_context_size % self.block_size != 0:
            raise ValueError()
        if self.n_query_pad >= self.query_context_size:
            raise ValueError()
        if self.n_key_pad >= self.key_context_size:
            raise ValueError()

        self.n_query_block = self.query_context_size // self.block_size
        self.n_key_block = self.key_context_size // self.block_size
        self.first_pad_query_block_idx = self.n_query_block - int(
            math.ceil(self.n_query_pad / self.block_size)
        )
        self.first_pad_key_block_idx = self.n_key_block - int(
            math.ceil(self.n_key_pad / self.block_size)
        )

    def _make_global_layout(self) -> None:
        if not self.is_head_specific:
            m = np.ones([self.n_query_block, self.n_key_block], dtype=np.bool)
            r = product(*[range(n) for n in m.shape])

            for qb, kb in r:
                m[qb, kb] = np.any(self.block_layout(None, 0, qb, kb, 0))
        else:
            m = np.ones([self.n_head, self.n_query_block, self.n_key_block], dtype=np.bool)
            r = product(*[range(n) for n in m.shape])

            for h, qb, kb in r:
                m[h, qb, kb] = np.any(self.block_layout(None, h, qb, kb, 0))

        self.global_layout = m

    @abstractmethod
    def _block_layout(
        self, blk_shape: Any, head_idx: int, query_idx: int, key_idx: int, blk_idx: int
    ) -> np.ndarray:
        raise NotImplementedError()

    def block_layout(
        self, blk_shape: Any, head_idx: int, query_idx: int, key_idx: int, blk_idx: int
    ) -> np.ndarray:
        """
        `query_idx`, `key_idx` are block-level, zero-based indices.
        """

        m = np.ones([self.block_size, self.block_size], dtype=np.bool)

        if query_idx >= self.first_pad_query_block_idx:
            n_pad = min(
                self.block_size,
                (query_idx + 1) * self.block_size - (self.query_context_size - self.n_query_pad),
            )
            assert n_pad > 0
            m[self.block_size - n_pad :] = False
        if key_idx >= self.first_pad_key_block_idx:
            n_pad = min(
                self.block_size,
                (key_idx + 1) * self.block_size - (self.key_context_size - self.n_key_pad),
            )
            assert n_pad > 0
            m[:, self.block_size - n_pad :] = False

        return m & self._block_layout(blk_shape, head_idx, query_idx, key_idx, blk_idx)


@attr.s
class DenseAttentionMask(AttentionMask):
    def __attrs_post_init__(self) -> None:
        super().__attrs_post_init__()

        self.global_layout = np.ones([self.n_query_block, self.n_key_block], dtype=np.bool)
        n_zero_query_blocks = self.n_query_pad // self.block_size
        n_zero_key_blocks = self.n_key_pad // self.block_size
        self.global_layout[self.n_query_block - n_zero_query_blocks :] = False
        self.global_layout[:, self.n_key_block - n_zero_key_blocks :] = False

    def _block_layout(
        self, blk_shape: Any, head_idx: int, query_idx: int, key_idx: int, blk_idx: int
    ) -> np.ndarray:
        return np.ones([self.block_size, self.block_size], dtype=np.bool)


@attr.s
class DenseCausalAttentionMask(AttentionMask):
    def __attrs_post_init__(self) -> None:
        super().__attrs_post_init__()

        self.global_layout = np.tril(np.ones([self.n_query_block, self.n_key_block], dtype=np.bool))
        n_zero_query_blocks = self.n_query_pad // self.block_size
        n_zero_key_blocks = self.n_key_pad // self.block_size
        self.global_layout[self.n_query_block - n_zero_query_blocks :] = False
        self.global_layout[:, self.n_key_block - n_zero_key_blocks :] = False

    def _block_layout(
        self, blk_shape: Any, head_idx: int, query_idx: int, key_idx: int, blk_idx: int
    ) -> np.ndarray:
        if query_idx > key_idx:
            return np.ones(2 * [self.block_size], dtype=np.bool)
        elif query_idx < key_idx:
            return np.zeros(2 * [self.block_size], dtype=np.bool)
        else:
            return np.tril(np.ones(2 * [self.block_size], dtype=np.bool))


@attr.s(eq=False, repr=False)
class AttentionInfo:
    n_heads: int = attr.ib()
    ctx_blks_q: int = attr.ib()
    ctx_blks_k: int = attr.ib()
    block_size: int = attr.ib()
    pytorch_attn_bias: Optional[torch.Tensor] = attr.ib()


def to_attention_info(d: AttentionMask) -> AttentionInfo:
    return AttentionInfo(
        n_heads=d.n_head,
        ctx_blks_q=d.n_query_block,
        ctx_blks_k=d.n_key_block,
        block_size=d.block_size,
        pytorch_attn_bias=None,
    )


def make_full_layout(d: AttentionMask) -> np.ndarray:
    """
    Returns the `context_size x context_size` layout matrix described by `d`. If the layout is dependent on the index of
    the attention head, a `attention_head x context_size x context_size` layout matrix is returned instead.
    """

    if not d.is_head_specific:
        u = np.reshape(d.global_layout, [d.n_query_block, d.n_key_block, 1, 1])
        r = product(range(d.n_query_block), range(d.n_key_block))
        v = np.array([d.block_layout(None, 0, i, j, 0) for i, j in r])
        v = np.reshape(v, [d.n_query_block, d.n_key_block, d.block_size, d.block_size])

        w = u * v
        w = np.transpose(w, [0, 2, 1, 3])
        w = np.reshape(w, [d.query_context_size, d.key_context_size])
        return w
    else:
        if len(d.global_layout.shape) == 2:
            u = np.reshape(d.global_layout, [1, d.n_query_block, d.n_key_block, 1, 1])
            u = np.tile(u, [d.n_head, 1, 1, 1, 1])
        elif len(d.global_layout.shape) == 3:
            u = np.reshape(d.global_layout, [d.n_head, d.n_query_block, d.n_key_block, 1, 1])
        else:
            raise RuntimeError()

        s = product(range(d.n_head), range(d.n_query_block), range(d.n_key_block))
        v = np.array([d.block_layout(None, i, j, k, 0) for i, j, k in s])
        v = np.reshape(v, [d.n_head, d.n_query_block, d.n_key_block, d.block_size, d.block_size])

        w = u * v
        w = np.transpose(w, [0, 1, 3, 2, 4])
        w = np.reshape(w, [d.n_head, d.query_context_size, d.key_context_size])
        return w
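
Example (not part of the uploaded file): a minimal sketch of how these classes are meant to be combined. The sizes below are illustrative, and the module uses the legacy np.bool alias, so a NumPy version that still provides it is assumed. A block-level mask is built, converted to an AttentionInfo, and its dense layout turned into an additive pre-softmax bias:

import numpy as np
import torch

from glide_text2im.clip.attention import (
    DenseCausalAttentionMask,
    make_full_layout,
    to_attention_info,
)

# Illustrative sizes: a 64-token causal context split into two 32-token blocks.
mask = DenseCausalAttentionMask(
    query_context_size=64,
    key_context_size=64,
    block_size=32,
    n_head=8,
)
attn_info = to_attention_info(mask)

# make_full_layout expands the block-level layout to a full 64 x 64 matrix;
# positions that must not attend receive a large negative additive bias.
layout = make_full_layout(mask).astype(np.float32)
bias = (1.0 - layout) * -1e10
attn_info.pytorch_attn_bias = torch.from_numpy(bias)
print(attn_info.pytorch_attn_bias.shape)  # torch.Size([64, 64])
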
glide_text2im/clip/config.yaml
ADDED
@@ -0,0 +1,18 @@
logit_scale: 100.0

# Diffusion settings
beta_schedule: "squaredcos_cap_v2"
n_timesteps: 1000

# Architecture settings
image_size: 64
patch_size: 4
n_vocab: 65536
max_text_len: 77
n_embd: 512
n_head_state_text: 64
n_head_text: 8
n_xf_blocks_text: 12
n_head_state_image: 64
n_head_image: 12
n_xf_blocks_image: 12
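
Example (not part of the uploaded file): a short sketch of loading this config with PyYAML and deriving the quantities the encoders compute from it; the relative path is an assumption. The per-encoder hidden width is n_head * n_head_state, and the image encoder sees 1 + (image_size / patch_size)^2 tokens.

import yaml

# Assumed path, relative to the repository root.
with open("glide_text2im/clip/config.yaml") as f:
    cfg = yaml.safe_load(f)

text_width = cfg["n_head_text"] * cfg["n_head_state_text"]       # 8 * 64 = 512
image_width = cfg["n_head_image"] * cfg["n_head_state_image"]    # 12 * 64 = 768
image_tokens = 1 + (cfg["image_size"] // cfg["patch_size"]) ** 2  # 1 + 16 * 16 = 257
print(text_width, image_width, image_tokens)  # 512 768 257
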
glide_text2im/clip/encoders.py
ADDED
@@ -0,0 +1,497 @@
import math
from collections import OrderedDict
from typing import List, Optional, Tuple, cast

import attr
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from .attention import (
    AttentionInfo,
    DenseAttentionMask,
    DenseCausalAttentionMask,
    make_full_layout,
    to_attention_info,
)
from .utils import Affine, LayerNorm, zero_key_bias_grad

# Constants used in the original CLIP implementation.
image_channel_means = [122.77093945, 116.74601272, 104.09373519]
image_channel_stds = [68.50053285, 66.63215831, 70.32316309]


@attr.s(eq=False, repr=False)
class TextEmbedding(nn.Module):
    n_vocab: int = attr.ib()
    n_context: int = attr.ib()
    n_state: int = attr.ib()
    device: torch.device = attr.ib(default=torch.device("cuda"))

    def __attrs_post_init__(self) -> None:
        super().__init__()

        w_voc = torch.empty((self.n_vocab, self.n_state), dtype=torch.float32, device=self.device)
        w_pos = torch.empty((self.n_context, self.n_state), dtype=torch.float32, device=self.device)

        with torch.no_grad():
            w_voc.normal_(std=0.02)
            w_pos.normal_(std=0.01)

        self.w_voc = nn.Parameter(w_voc)
        self.w_pos = nn.Parameter(w_pos)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if len(x.shape) != 2:
            raise ValueError()

        return F.embedding(x, self.w_voc) + self.w_pos[None, :, :]


@attr.s(eq=False, repr=False)
class ImageEmbedding(nn.Module):
    image_size: int = attr.ib()
    patch_size: int = attr.ib()
    n_state: int = attr.ib()
    n_timestep: int = attr.ib(default=0)
    device: torch.device = attr.ib(default=torch.device("cuda"))

    def __attrs_post_init__(self) -> None:
        super().__init__()

        if self.image_size % self.patch_size != 0:
            raise ValueError()

        n_patch = self.image_size // self.patch_size
        patch_proj = torch.empty(
            (self.n_state, 3) + 2 * (self.patch_size,), dtype=torch.float32, device=self.device
        )
        w_pos = torch.empty(
            (1 + n_patch ** 2, self.n_state), dtype=torch.float32, device=self.device
        )

        with torch.no_grad():
            if self.n_timestep == 0:
                pred_state = torch.empty((self.n_state,), dtype=torch.float32, device=self.device)
                pred_state.normal_(std=1 / np.sqrt(self.n_state))
                self.pred_state = nn.Parameter(pred_state)
            else:
                w_t = torch.empty(
                    (self.n_timestep, self.n_state), dtype=torch.float32, device=self.device
                )
                w_t.normal_(std=1 / np.sqrt(self.n_state))
                self.w_t = nn.Parameter(w_t)

            patch_proj.normal_(std=np.sqrt(2 / (self.n_state * self.patch_size ** 2)))
            w_pos.normal_(std=1 / np.sqrt(self.n_state))

        self.patch_proj = nn.Parameter(patch_proj)
        self.w_pos = nn.Parameter(w_pos)

        self.channel_means = torch.tensor(
            image_channel_means, dtype=torch.float32, device=self.device
        )[None, :, None, None]
        self.channel_stds = torch.tensor(
            image_channel_stds, dtype=torch.float32, device=self.device
        )[None, :, None, None]
        self.ln = LayerNorm(self.n_state, eps=1e-5, device=self.device)

    def forward(self, x: torch.Tensor, t: Optional[torch.Tensor] = None) -> torch.Tensor:
        if len(x.shape) != 4:
            raise ValueError("input should be 4d")
        if x.shape[1] != 3:
            raise ValueError("input should have 3 channels")
        if not (x.shape[2] == self.image_size and x.shape[3] == self.image_size):
            raise ValueError(f"input is not {self.image_size} x {self.image_size}")

        if (self.n_timestep == 0 and t is not None) or (self.n_timestep != 0 and t is None):
            raise ValueError()
        if self.n_timestep != 0:
            assert t is not None
            if len(t.shape) != 1:
                raise ValueError()
            if t.shape[0] != x.shape[0]:
                raise ValueError()

        x = (x - self.channel_means) / self.channel_stds
        x = F.conv2d(x, self.patch_proj, stride=self.patch_size)
        x = x.reshape(x.shape[0], self.n_state, (self.image_size // self.patch_size) ** 2).permute(
            0, 2, 1
        )

        sot = (
            self.pred_state[None, None].expand(x.shape[0], -1, -1)
            if self.n_timestep == 0
            else F.embedding(cast(torch.Tensor, t), self.w_t)[:, None]
        )
        x = torch.cat((sot, x), dim=1) + self.w_pos[None]
        return self.ln(x)


@attr.s(eq=False, repr=False)
class AttentionResblock(nn.Module):
    n_state: int = attr.ib()
    n_resblocks: int = attr.ib()
    attn_fn: AttentionInfo = attr.ib()
    device: torch.device = attr.ib(default=torch.device("cuda"))

    def __attrs_post_init__(self) -> None:
        super().__init__()

        self.n_head_state = self.n_state // self.attn_fn.n_heads
        self.qk_scale = 1 / np.sqrt(self.n_head_state)

        self.ln = LayerNorm(self.n_state, eps=1e-5, device=self.device)
        self.f_q = Affine(
            self.n_state,
            self.n_state,
            std=1 / math.sqrt(self.n_state),
            use_bias=True,
            bias_filter_fn=zero_key_bias_grad,
            device=self.device,
        )
        self.f_k = Affine(
            self.n_state,
            self.n_state,
            std=1 / math.sqrt(self.n_state),
            use_bias=False,
            bias_filter_fn=zero_key_bias_grad,
            device=self.device,
        )
        self.f_v = Affine(
            self.n_state,
            self.n_state,
            std=1 / math.sqrt(self.n_state),
            use_bias=True,
            bias_filter_fn=zero_key_bias_grad,
            device=self.device,
        )
        self.f_c = Affine(
            self.n_state,
            self.n_state,
            use_bias=True,
            std=1 / np.sqrt(self.n_state * self.n_resblocks ** 2),
            device=self.device,
        )  # XXX

    def forward(self, m: torch.Tensor) -> torch.Tensor:
        n_context = m.shape[1]
        n_query_pad = self.attn_fn.ctx_blks_q * self.attn_fn.block_size - n_context
        n_key_pad = self.attn_fn.ctx_blks_k * self.attn_fn.block_size - n_context
        assert n_query_pad >= 0
        assert n_key_pad >= 0

        r = m
        r = self.ln(r)
        q, k, v = self.f_q(r), self.f_k(r), self.f_v(r)

        if n_query_pad != 0:
            q = F.pad(q, (0, 0, 0, n_query_pad))

        if n_key_pad != 0:
            k = F.pad(k, (0, 0, 0, n_key_pad))
            v = F.pad(v, (0, 0, 0, n_key_pad))

        q = q.view([q.shape[0], -1, self.attn_fn.n_heads, self.n_head_state]).permute((0, 2, 1, 3))
        k = k.view([k.shape[0], -1, self.attn_fn.n_heads, self.n_head_state]).permute((0, 2, 1, 3))
        v = v.view([v.shape[0], -1, self.attn_fn.n_heads, self.n_head_state]).permute((0, 2, 1, 3))
        w = torch.einsum(
            "bhcd,bhkd->bhck", q * math.sqrt(self.qk_scale), k * math.sqrt(self.qk_scale)
        )

        if hasattr(self.attn_fn, "pytorch_attn_bias"):
            bias = self.attn_fn.pytorch_attn_bias
            assert len(bias.shape) in {2, 3}

            if len(bias.shape) == 2:
                w = torch.softmax(w + self.attn_fn.pytorch_attn_bias[None, None], dim=-1)
            elif len(bias.shape) == 3:
                w = torch.softmax(w + self.attn_fn.pytorch_attn_bias[None], dim=-1)
        else:
            w = torch.softmax(w, dim=-1)

        r = torch.einsum("bhck,bhkd->bhcd", w, v)
        r = r.permute((0, 2, 1, 3)).reshape((r.shape[0], -1, self.n_state))

        if n_query_pad != 0:
            r = r[:, :-n_query_pad]

        assert r.shape[1] == n_context

        r = self.f_c(r)
        return m + r


@attr.s(eq=False, repr=False)
class FullyConnectedResblock(nn.Module):
    """
    Not imported from other files because we retain Alec's original inits.
    """

    n_state: int = attr.ib()
    n_resblocks: int = attr.ib()
    device: torch.device = attr.ib(default=torch.device("cuda"))

    def __attrs_post_init__(self) -> None:
        super().__init__()

        self.ln = LayerNorm(self.n_state, eps=1e-5, device=self.device)
        self.f_1 = Affine(
            self.n_state,
            4 * self.n_state,
            use_bias=True,
            std=np.sqrt(2 / (4 * self.n_state)),
            device=self.device,
        )
        self.f_2 = Affine(
            4 * self.n_state,
            self.n_state,
            use_bias=True,
            std=1 / np.sqrt(self.n_state * self.n_resblocks ** 2),
            device=self.device,
        )  # XXX

    def forward(self, m: torch.Tensor) -> torch.Tensor:
        r = m
        r = self.ln(r)

        r = self.f_2(F.gelu(self.f_1(r)))
        return m + r


@attr.s(eq=False, repr=False)
class TransformerBlock(nn.Module):
    n_state: int = attr.ib()
    n_resblocks: int = attr.ib()
    attn_fn: AttentionInfo = attr.ib()
    device: torch.device = attr.ib(default=torch.device("cuda"))

    def __attrs_post_init__(self) -> None:
        super().__init__()

        self.f_attn = AttentionResblock(
            self.n_state,
            self.n_resblocks,
            self.attn_fn,
            self.device,
        )
        self.f_mlp = FullyConnectedResblock(self.n_state, self.n_resblocks, self.device)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.f_mlp(self.f_attn(x))


@attr.s(eq=False, repr=False)
class TextFeatureExtractor(nn.Module):
    n_state: int = attr.ib()
    n_embd: int = attr.ib()
    device: torch.device = attr.ib(default=torch.device("cuda"))

    def __attrs_post_init__(self) -> None:
        super().__init__()

        self.ln = LayerNorm(self.n_state, eps=1e-5, device=self.device)
        self.f = Affine(self.n_state, self.n_embd, use_bias=False, device=self.device)

    def forward(
        self, text: torch.Tensor, text_len: torch.Tensor, return_probe_features: bool = False
    ) -> torch.Tensor:
        if len(text.shape) != 3:
            raise ValueError("expected text to be 3d")
        if len(text_len.shape) != 1:
            raise ValueError("expected text length to be 1d")
        if text.shape[0] != text_len.shape[0]:
            raise ValueError("text and text_len have inconsistent batch dimensions")

        index = (text_len - 1)[:, None, None].expand(-1, 1, text.shape[2])
        x = torch.gather(text, dim=1, index=index)
        assert list(x.shape) == [text.shape[0], 1, text.shape[2]]

        if return_probe_features:
            return x[:, 0]

        x = self.ln(x)
        return self.f(x[:, 0])


@attr.s(eq=False, repr=False)
class ImageFeatureExtractor(nn.Module):
    n_state: int = attr.ib()
    n_embd: int = attr.ib()
    device: torch.device = attr.ib(default=torch.device("cuda"))

    def __attrs_post_init__(self) -> None:
        super().__init__()

        self.ln = LayerNorm(self.n_state, eps=1e-5, device=self.device)
        self.f = Affine(self.n_state, self.n_embd, use_bias=False, device=self.device)

    def forward(self, x: torch.Tensor, return_probe_features: bool = False) -> torch.Tensor:
        if return_probe_features:
            return x[:, 0]

        x = self.ln(x[:, :1])
        return self.f(x[:, 0])


@attr.s(eq=False, repr=False)
class TextEncoder(nn.Module):
    n_bpe_vocab: int = attr.ib()
    max_text_len: int = attr.ib()
    n_embd: int = attr.ib()
    n_head: int = attr.ib()
    n_xf_blocks: int = attr.ib()
    n_head_state: int = attr.ib(default=64)
    device: torch.device = attr.ib(default=torch.device("cuda"))
    block_size: int = attr.ib(init=False, default=32)

    def __attrs_post_init__(self) -> None:
        super().__init__()

        self.n_state = self.n_head * self.n_head_state
        n_rounded_context = self.block_size * int(math.ceil(self.max_text_len / self.block_size))
        n_pad = n_rounded_context - self.max_text_len

        args = (
            n_rounded_context,
            n_rounded_context,
            self.block_size,
            self.n_head,
            False,
            n_pad,
            n_pad,
        )
        mask = DenseCausalAttentionMask(*args)
        attn_fn = to_attention_info(mask)

        m = 1 - make_full_layout(mask).astype(np.float32)
        m[m == 1] = -1e10
        attn_fn.pytorch_attn_bias = torch.from_numpy(m).to(self.device)

        blocks: List[Tuple[str, nn.Module]] = [
            (
                "input",
                TextEmbedding(
                    self.n_bpe_vocab, self.max_text_len, self.n_state, device=self.device
                ),
            )
        ]

        for i in range(self.n_xf_blocks):
            blocks.append(
                (
                    f"block_{i}",
                    TransformerBlock(self.n_state, 2 * self.n_xf_blocks, attn_fn, self.device),
                )
            )

        blocks.append(
            ("output", TextFeatureExtractor(self.n_state, self.n_embd, device=self.device))
        )

        self.blocks = nn.ModuleDict(OrderedDict(blocks))

    def forward(
        self,
        text: torch.Tensor,
        text_len: torch.Tensor,
        return_probe_features: bool = False,
    ) -> torch.Tensor:

        n_batch = text.shape[0]
        h = self.blocks["input"](text)

        for i in range(self.n_xf_blocks):
            h = self.blocks[f"block_{i}"](h)

        h = self.blocks["output"](h, text_len, return_probe_features=return_probe_features)

        assert list(h.shape) == [
            n_batch,
            self.n_embd if not return_probe_features else self.n_state,
        ]
        return h


@attr.s(eq=False, repr=False)
class ImageEncoder(nn.Module):
    image_size: int = attr.ib()
    patch_size: int = attr.ib()
    n_embd: int = attr.ib()
    n_head: int = attr.ib()
    n_xf_blocks: int = attr.ib()
    n_head_state: int = attr.ib(default=64)
    n_timestep: int = attr.ib(default=0)
    device: torch.device = attr.ib(default=torch.device("cuda"))
    block_size: int = attr.ib(init=False, default=32)

    def __attrs_post_init__(self) -> None:
        super().__init__()

        self.n_state = self.n_head * self.n_head_state
        self.n_context = 1 + (self.image_size // self.patch_size) ** 2
        n_rounded_context = self.block_size * int(math.ceil(self.n_context / self.block_size))
        n_pad = n_rounded_context - self.n_context

        args = (
            n_rounded_context,
            n_rounded_context,
            self.block_size,
            self.n_head,
            False,
            n_pad,
            n_pad,
        )
        mask = DenseAttentionMask(*args)
        attn_fn = to_attention_info(mask)

        m = 1 - make_full_layout(mask).astype(np.float32)
        m[m == 1] = -1e10
        attn_fn.pytorch_attn_bias = torch.from_numpy(m).to(self.device)

        blocks: List[Tuple[str, nn.Module]] = [
            (
                "input",
                ImageEmbedding(
                    self.image_size,
                    self.patch_size,
                    self.n_state,
                    n_timestep=self.n_timestep,
                    device=self.device,
                ),
            )
        ]

        for i in range(self.n_xf_blocks):
            blocks.append(
                (
                    f"block_{i}",
                    TransformerBlock(self.n_state, 2 * self.n_xf_blocks, attn_fn, self.device),
                )
            )

        blocks.append(("output", ImageFeatureExtractor(self.n_state, self.n_embd, self.device)))

        self.blocks = nn.ModuleDict(OrderedDict(blocks))

    def forward(
        self,
        image: torch.Tensor,
        timesteps: Optional[torch.Tensor] = None,
        return_probe_features: bool = False,
    ) -> torch.Tensor:
        n_batch = image.shape[0]
        h = self.blocks["input"](image, t=timesteps)

        for i in range(self.n_xf_blocks):
            h = self.blocks[f"block_{i}"](h)

        h = self.blocks["output"](h, return_probe_features=return_probe_features)

        assert list(h.shape) == [
            n_batch,
            self.n_embd if not return_probe_features else self.n_state,
        ]

        return h
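
Example (not part of the uploaded file): a minimal sketch of instantiating the text encoder on CPU with the sizes from config.yaml and pushing dummy tokens through it. It assumes a NumPy version that still provides the np.bool alias used in attention.py; with untrained weights the output values are meaningless, only the shapes are of interest.

import torch

from glide_text2im.clip.encoders import TextEncoder

enc = TextEncoder(
    n_bpe_vocab=65536,
    max_text_len=77,
    n_embd=512,
    n_head=8,
    n_xf_blocks=12,
    n_head_state=64,
    device=torch.device("cpu"),
)

tokens = torch.zeros((2, 77), dtype=torch.long)  # dummy padded BPE tokens
lens = torch.tensor([5, 9], dtype=torch.long)    # true prompt lengths
with torch.no_grad():
    z_t = enc(tokens, lens)
print(z_t.shape)  # torch.Size([2, 512])
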
glide_text2im/clip/model_creation.py
ADDED
@@ -0,0 +1,117 @@
import os
from functools import lru_cache
from typing import Any, Callable, Dict, List, Optional, Tuple

import attr
import numpy as np
import torch
import torch.nn as nn
import yaml
from glide_text2im.tokenizer.simple_tokenizer import SimpleTokenizer

from .encoders import ImageEncoder, TextEncoder


@lru_cache()
def default_config_path() -> str:
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.yaml")


@attr.s
class CLIPModel:
    config: Dict[str, Any] = attr.ib()
    text_encoder: nn.Module = attr.ib()
    image_encoder: nn.Module = attr.ib()
    logit_scale: torch.Tensor = attr.ib()
    device: torch.device = attr.ib()
    tokenizer: SimpleTokenizer = attr.ib()

    def encode_prompts(self, prompts: List[str]) -> Tuple[torch.Tensor, torch.Tensor]:
        tokens = []
        lens = []
        for prompt in prompts:
            sub_tokens, sub_len = self.tokenizer.padded_tokens_and_len(
                self.tokenizer.encode(prompt), self.text_encoder.max_text_len
            )
            tokens.append(sub_tokens)
            lens.append(sub_len)
        return (
            torch.tensor(tokens).to(dtype=torch.long, device=self.device),
            torch.tensor(lens).to(dtype=torch.long, device=self.device),
        )

    def text_embeddings(self, prompts: List[str]) -> torch.Tensor:
        tokens, lens = self.encode_prompts(prompts)
        z_t = self.text_encoder(tokens, lens)
        return z_t / (torch.linalg.norm(z_t, dim=-1, keepdim=True) + 1e-12)

    def image_embeddings(self, images: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
        z_i = self.image_encoder((images + 1) * 127.5, t)
        return z_i / (torch.linalg.norm(z_i, dim=-1, keepdim=True) + 1e-12)

    def cond_fn(self, prompts: List[str], grad_scale: float) -> Callable[..., torch.Tensor]:
        with torch.no_grad():
            z_t = self.text_embeddings(prompts)

        def cond_fn(x, t, grad_scale=grad_scale, **kwargs):
            with torch.enable_grad():
                x_var = x.detach().requires_grad_(True)
                z_i = self.image_embeddings(x_var, t)
                loss = torch.exp(self.logit_scale) * (z_t * z_i).sum()
                grad = torch.autograd.grad(loss, x_var)[0].detach()
            return grad * grad_scale

        return cond_fn


def create_clip_model(
    config_path: Optional[str] = None,
    device: Optional[torch.device] = None,
    tokenizer: Optional[SimpleTokenizer] = None,
) -> CLIPModel:
    if config_path is None:
        config_path = default_config_path()
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if tokenizer is None:
        tokenizer = SimpleTokenizer()

    with open(config_path, "r") as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)

    text_encoder = TextEncoder(
        n_bpe_vocab=config["n_vocab"],
        max_text_len=config["max_text_len"],
        n_embd=config["n_embd"],
        n_head=config["n_head_text"],
        n_xf_blocks=config["n_xf_blocks_text"],
        n_head_state=config["n_head_state_text"],
        device=device,
    )

    image_encoder = ImageEncoder(
        image_size=config["image_size"],
        patch_size=config["patch_size"],
        n_embd=config["n_embd"],
        n_head=config["n_head_image"],
        n_xf_blocks=config["n_xf_blocks_image"],
        n_head_state=config["n_head_state_image"],
        n_timestep=config["n_timesteps"],
        device=device,
    )

    logit_scale = torch.tensor(
        np.log(config["logit_scale"]),
        dtype=torch.float32,
        device=device,
        requires_grad=False,
    )

    return CLIPModel(
        config=config,
        text_encoder=text_encoder,
        image_encoder=image_encoder,
        logit_scale=logit_scale,
        device=device,
        tokenizer=tokenizer,
    )
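
Example (not part of the uploaded file): a sketch of the intended call pattern for CLIP-guided sampling. It assumes the rest of the glide_text2im package (notably the tokenizer) is importable and that NumPy still provides np.bool; loading the released encoder weights is omitted, so the numbers are meaningless and only the shapes and the cond_fn signature are shown.

import torch

from glide_text2im.clip.model_creation import create_clip_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
clip_model = create_clip_model(device=device)
# In real use, state dicts for clip_model.text_encoder and
# clip_model.image_encoder would be loaded here (omitted).

prompts = ["an oil painting of a corgi"]
z_t = clip_model.text_embeddings(prompts)  # unit-norm text embeddings, shape (1, 512)

# cond_fn returns the gradient of the CLIP score with respect to the noisy
# image x_t, scaled by grad_scale; a diffusion sampler adds it to the model mean.
cond_fn = clip_model.cond_fn(prompts, grad_scale=3.0)
x = torch.zeros((1, 3, 64, 64), device=device)          # dummy x_t in [-1, 1]
t = torch.zeros((1,), dtype=torch.long, device=device)  # dummy timestep
grad = cond_fn(x, t)
print(grad.shape)  # torch.Size([1, 3, 64, 64])
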
glide_text2im/clip/utils.py
ADDED
@@ -0,0 +1,97 @@
import math
from typing import Callable, Optional

import attr
import torch
import torch.nn as nn
import torch.nn.functional as F

FilterFn = Callable[[torch.Tensor], torch.Tensor]


class ZeroKeyBiasGrad(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        return x

    @staticmethod
    def backward(ctx, output_grad):
        output_grad = output_grad.clone()
        output_grad.chunk(3)[1].zero_()
        return output_grad


def zero_key_bias_grad(x: torch.Tensor) -> torch.Tensor:
    return ZeroKeyBiasGrad.apply(x)


@attr.s(eq=False, repr=False)
class LayerNorm(nn.Module):
    n_state: int = attr.ib()
    eps: float = attr.ib(default=1e-6)
    device: torch.device = attr.ib(default=torch.device("cuda"))

    def __attrs_post_init__(self) -> None:
        super().__init__()
        self.g = nn.Parameter(torch.ones((self.n_state,), dtype=torch.float32, device=self.device))
        self.b = nn.Parameter(torch.zeros((self.n_state,), dtype=torch.float32, device=self.device))
        self.g.weight_decay_level = "disable"  # type: ignore
        self.b.weight_decay_level = "disable"  # type: ignore

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return F.layer_norm(
            x.type(torch.float32), torch.Size((self.n_state,)), self.g, self.b, self.eps
        )


@attr.s(eq=False, repr=False)
class Affine(nn.Module):
    n_in: int = attr.ib()
    n_out: int = attr.ib()
    use_bias: bool = attr.ib(default=True)
    use_admnet_init: bool = attr.ib(default=False)
    std: Optional[float] = attr.ib(default=None)
    extra_init_scale: Optional[float] = attr.ib(default=None)
    bias_filter_fn: FilterFn = attr.ib(default=lambda x: x)
    device: torch.device = attr.ib(default=torch.device("cuda"))

    def __attrs_post_init__(self) -> None:
        super().__init__()

        if not self.use_admnet_init:
            self.std = self.std if self.std is not None else math.sqrt(2 / (self.n_in + self.n_out))
            self.std = (
                self.std if self.extra_init_scale is None else self.std * self.extra_init_scale
            )

            w = torch.empty((self.n_out, self.n_in), dtype=torch.float32, device=self.device)
            self.w = nn.Parameter(w)

            if self.use_bias:
                self.b = nn.Parameter(
                    torch.zeros((self.n_out,), dtype=torch.float32, device=self.device)
                )
                self.b.weight_decay_level = "disable"  # type: ignore
        else:
            if self.extra_init_scale is not None:
                raise ValueError("extra_init_scale incompatible with admnet init")

            w = torch.empty((self.n_out, self.n_in), dtype=torch.float32, device=self.device)

            if self.use_bias:
                b = torch.empty((self.n_out,), dtype=torch.float32, device=self.device)

            self.w = nn.Parameter(w)

            if self.use_bias:
                self.b = nn.Parameter(b)
                self.b.weight_decay_level = "disable"  # type: ignore

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        w = self.w if self.w.dtype == x.dtype else self.w.to(x.dtype)
        b = (
            self.bias_filter_fn(self.b if self.b.dtype == x.dtype else self.b.to(x.dtype))
            if self.use_bias
            else None
        )
        return F.linear(x, w, b)
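
Example (not part of the uploaded file): a small sketch of the helpers in isolation, overriding the CUDA device default with CPU. Affine allocates its weight with torch.empty, so it is given values by hand here; zero_key_bias_grad is an identity in the forward pass that zeroes the gradient flowing to the middle third of a tensor in the backward pass, which is how the key bias is kept frozen upstream.

import torch

from glide_text2im.clip.utils import Affine, LayerNorm, zero_key_bias_grad

device = torch.device("cpu")
ln = LayerNorm(8, eps=1e-5, device=device)
aff = Affine(8, 16, use_bias=True, std=0.02, device=device)
with torch.no_grad():
    aff.w.normal_(std=0.02)  # w comes from torch.empty; initialize it for the demo

x = torch.randn(2, 4, 8)
print(aff(ln(x)).shape)  # torch.Size([2, 4, 16])

# Identity forward, but the gradient of the middle chunk (of three) is zeroed.
b = torch.randn(6, requires_grad=True)
zero_key_bias_grad(b).sum().backward()
print(b.grad)  # tensor([1., 1., 0., 0., 1., 1.])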