Spaces:

turhancan97
/

MAE-Demo

Sleeping

App Files Files Community

turhancan97 committed on Aug 20, 2024

Commit

929f451

1 Parent(s): b7d4bcf

app file created

Browse files

Files changed (6) hide show

app.py +101 -0
images/cat.jpg +0 -0
images/dog.jpg +0 -0
model.py +220 -0
requirements.txt +16 -0
vit-t-mae-pretrain.pt +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import gradio as gr
+import torch
+import torchvision
+from PIL import Image
+import numpy as np
+import random
+from einops import rearrange
+import matplotlib.pyplot as plt
+from torchvision.transforms import v2
+from model import MAE_ViT, MAE_Encoder, MAE_Decoder, MAE_Encoder_FeatureExtractor
# Paths of the two bundled sample images, one single-element row per image.
# NOTE(review): not referenced elsewhere in this file -- presumably meant to be
# used as Gradio examples; confirm.
path = [['images/cat.jpg'], ['images/dog.jpg']]
model_name = "vit-t-mae-pretrain.pt"
# The loaded object is used directly as a module below (model.eval(),
# model(img)), i.e. the checkpoint holds a whole pickled model rather than a
# state_dict. PyTorch >= 2.6 defaults torch.load to weights_only=True, which
# refuses to unpickle such objects, so the flag is set explicitly.
# NOTE(security): full unpickling can execute arbitrary code; only load
# checkpoints that come from a trusted source.
model = torch.load(model_name, map_location='cpu', weights_only=False)
model.eval()
device = torch.device("cpu")
model.to(device)
# Preprocessing applied to every input image: resize to 32x32, convert to a
# tensor, then normalise each channel with mean 0.5 and std 0.5.
transform = v2.Compose([
    v2.Resize((32, 32)),
    v2.ToTensor(),
    v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])
+# Load and Preprocess the Image
def load_image(image_path, transform):
    """Open an image file and return it as a one-image batch.

    Args:
        image_path: Location of the image, handed to ``Image.open``.
        transform: Callable applied to the RGB PIL image; its result must
            support ``unsqueeze``.

    Returns:
        The transformed image with a new leading dimension of size 1.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    transformed = transform(rgb_image)
    # Insert the batch axis in front of the existing dimensions.
    return transformed.unsqueeze(0)
def show_image(img, title):
    """Draw one image tensor on the current matplotlib axes.

    Args:
        img: Tensor laid out as (channels, height, width). Values are mapped
            with ``(x + 1) / 2``, which takes the range [-1, 1] to [0, 1].
        title: Text shown above the image.
    """
    # matplotlib wants the channel axis last.
    channels_last = rearrange(img, "c h w -> h w c")
    pixels = channels_last.cpu().detach().numpy()
    pixels = (pixels + 1) / 2
    plt.imshow(pixels)
    plt.axis('off')
    plt.title(title)
+# Visualize a Single Image
def visualize_single_image(image_path, image_name, model, device):
    """Run the model on one image and plot the four-panel comparison.

    The panels are, left to right: the preprocessed input, the input with
    masked pixels zeroed, the raw model prediction, and the prediction pasted
    into the masked pixels of the input.

    Args:
        image_path: Path of the image to load with the module-level
            ``transform``.
        image_name: Not used by this function.
        model: Callable returning ``(predicted_img, mask)`` for an image
            batch; ``mask`` is treated as 1 where the prediction replaces the
            input and 0 where the input is kept.
        device: Device the image batch is moved to before inference.

    Returns:
        The ``matplotlib.pyplot`` module, whose current figure holds the plot.
    """
    img = load_image(image_path, transform).to(device)

    # Inference only: evaluation mode and no gradient tracking.
    model.eval()
    with torch.no_grad():
        predicted_img, mask = model(img)

    visible = 1 - mask
    im_masked = img * visible
    # Visible pixels come from the input, masked ones from the prediction.
    im_paste = im_masked + predicted_img * mask

    panels = (
        (img, "original"),
        (im_masked, "masked"),
        (predicted_img, "reconstruction"),
        (im_paste, "reconstruction + visible"),
    )
    # A wide figure so the panels sit side by side.
    plt.figure(figsize=(12, 4))
    for slot, (batch, caption) in enumerate(panels, start=1):
        plt.subplot(1, len(panels), slot)
        show_image(batch[0], caption)
    plt.tight_layout()
    return plt
# Example usage: render the bundled dog picture once when the module is run.
image_path = 'images/dog.jpg'  # Replace with the actual path to your image
# File name without its directory and without anything after its first '.'.
image_name = image_path.rsplit('/', 1)[-1].partition('.')[0]
visualize_single_image(image_path, image_name, model, device)
def _reconstruct_from_upload(uploaded_path):
    """Gradio callback: plot the reconstruction for one uploaded image.

    The interface supplies a single value (the file path of the upload), while
    ``visualize_single_image`` takes four positional arguments, so this
    adapter fills in the module-level ``model`` and ``device``.

    Args:
        uploaded_path: File path provided by the image input, or ``None`` when
            the form is submitted without an image.

    Returns:
        Whatever ``visualize_single_image`` returns for that image.

    Raises:
        gr.Error: If no image was provided.
    """
    if uploaded_path is None:
        raise gr.Error("Please upload an image first.")
    # Same name derivation as the example above: strip directory and extension.
    stem = uploaded_path.split('/')[-1].split('.')[0]
    return visualize_single_image(uploaded_path, stem, model, device)


inputs_image = [
    gr.components.Image(type="filepath", label="Input Image"),
]
# gr.Plot replaces the removed gr.outputs.Image(type="plot") output.
outputs_image = [
    gr.Plot(label="Output Image"),
]
gr.Interface(
    fn=_reconstruct_from_upload,
    inputs=inputs_image,
    outputs=outputs_image,
    title="MAE-ViT Image Reconstruction",
    description="This is a demo of the MAE-ViT model for image reconstruction.",
    # Flagging stays disabled; the string form is the supported spelling.
    allow_flagging="never",
).launch()

images/cat.jpg ADDED Viewed

images/dog.jpg ADDED Viewed

model.py ADDED Viewed

	@@ -0,0 +1,220 @@

+# --------------------------------------------------------
+# References:
+# MAE: https://github.com/IcarusWizard/MAE
+# --------------------------------------------------------
+import torch
+import timm
+import numpy as np
+from einops import repeat, rearrange
+from einops.layers.torch import Rearrange
+from timm.models.layers import trunc_normal_
+from timm.models.vision_transformer import Block
def random_indexes(size : int):
    """Draw a random permutation of ``range(size)`` together with its inverse.

    The permutation is produced with NumPy's global random state.

    Returns:
        A pair ``(forward, backward)`` of index arrays of length ``size`` such
        that ``forward[backward]`` is ``0, 1, ..., size - 1``.
    """
    shuffled = np.arange(size)
    np.random.shuffle(shuffled)  # shuffles in place
    # argsort of a permutation yields the permutation that undoes it.
    restore = np.argsort(shuffled)
    return shuffled, restore
def take_indexes(sequences, indexes):
    """Reorder ``sequences`` along its first axis, independently per column.

    Args:
        sequences: Tensor with three dimensions; the first is reordered.
        indexes: Integer tensor with two dimensions; ``indexes[t, b]`` selects
            which first-axis entry of column ``b`` ends up at position ``t``.

    Returns:
        Tensor ``out`` with ``out[t, b, :] == sequences[indexes[t, b], b, :]``.
    """
    feature_dim = sequences.shape[-1]
    # Repeat every index across the last axis so gather moves whole vectors.
    gather_index = indexes.unsqueeze(-1).expand(-1, -1, feature_dim)
    return torch.gather(sequences, 0, gather_index)
class PatchShuffle(torch.nn.Module):
    """Shuffle patches independently per sample and keep only the first ones.

    ``ratio`` is the fraction of patches that is dropped after shuffling.
    """

    def __init__(self, ratio) -> None:
        super().__init__()
        self.ratio = ratio

    def forward(self, patches : torch.Tensor):
        """Shuffle ``patches`` and truncate them.

        Args:
            patches: Tensor unpacked as (T, B, C); the T axis is shuffled
                separately for each of the B columns.

        Returns:
            ``(kept, forward_indexes, backward_indexes)``: the first
            ``int(T * (1 - ratio))`` shuffled patches, and the (T, B) index
            tensors of the shuffle and of its inverse.
        """
        num_patches, batch_size, _ = patches.shape
        num_kept = int(num_patches * (1 - self.ratio))

        # One independent permutation (and its inverse) per batch column.
        shuffle_cols = []
        restore_cols = []
        for _ in range(batch_size):
            shuffle_order, restore_order = random_indexes(num_patches)
            shuffle_cols.append(shuffle_order)
            restore_cols.append(restore_order)

        def as_index_tensor(columns):
            # Stack per-sample index vectors as columns of a (T, B) tensor.
            stacked = np.stack(columns, axis=-1)
            return torch.as_tensor(stacked, dtype=torch.long).to(patches.device)

        forward_indexes = as_index_tensor(shuffle_cols)
        backward_indexes = as_index_tensor(restore_cols)

        shuffled = take_indexes(patches, forward_indexes)
        return shuffled[:num_kept], forward_indexes, backward_indexes
class MAE_Encoder(torch.nn.Module):
    """Transformer encoder that runs on a random subset of image patches.

    The image is cut into patches by a strided convolution, positional
    embeddings are added, ``PatchShuffle`` drops a ``mask_ratio`` fraction of
    the patches, and a cls token is prepended before the transformer blocks.
    """

    def __init__(self,
                 image_size=32,
                 patch_size=2,
                 emb_dim=192,
                 num_layer=12,
                 num_head=3,
                 mask_ratio=0.75,
                 ) -> None:
        super().__init__()

        patch_count = (image_size // patch_size) ** 2
        self.cls_token = torch.nn.Parameter(torch.zeros(1, 1, emb_dim))
        self.pos_embedding = torch.nn.Parameter(torch.zeros(patch_count, 1, emb_dim))
        self.shuffle = PatchShuffle(mask_ratio)
        # Kernel size == stride == patch_size: one output position per patch.
        self.patchify = torch.nn.Conv2d(3, emb_dim, patch_size, patch_size)
        blocks = [Block(emb_dim, num_head) for _ in range(num_layer)]
        self.transformer = torch.nn.Sequential(*blocks)
        self.layer_norm = torch.nn.LayerNorm(emb_dim)

        self.init_weight()

    def init_weight(self):
        """Initialise the cls token and positional embedding (std 0.02)."""
        for param in (self.cls_token, self.pos_embedding):
            trunc_normal_(param, std=.02)

    def forward(self, img):
        """Encode the patches of ``img`` that survive the random shuffle.

        Args:
            img: Image batch laid out as (b, c, h, w) after patchifying.

        Returns:
            ``(features, backward_indexes)``: features in (t, b, c) layout
            with the cls token at position 0, and the indexes that undo the
            patch shuffle.
        """
        tokens = self.patchify(img)
        tokens = rearrange(tokens, 'b c h w -> (h w) b c')
        tokens = tokens + self.pos_embedding

        tokens, _, backward_indexes = self.shuffle(tokens)

        # Prepend one cls token per sample.
        cls = self.cls_token.expand(-1, tokens.shape[1], -1)
        tokens = torch.cat([cls, tokens], dim=0)

        # The transformer blocks are fed batch-first input.
        tokens = rearrange(tokens, 't b c -> b t c')
        encoded = self.transformer(tokens)
        features = self.layer_norm(encoded)
        return rearrange(features, 'b t c -> t b c'), backward_indexes
class MAE_Decoder(torch.nn.Module):
    """Transformer decoder that predicts pixel patches for the full image.

    Missing positions are filled with a learned mask token, the sequence is
    put back into its original patch order, and a linear head maps every
    patch feature to ``3 * patch_size ** 2`` values.
    """

    def __init__(self,
                 image_size=32,
                 patch_size=2,
                 emb_dim=192,
                 num_layer=4,
                 num_head=3,
                 ) -> None:
        super().__init__()

        grid = image_size // patch_size
        self.mask_token = torch.nn.Parameter(torch.zeros(1, 1, emb_dim))
        # One extra position for the leading global (cls) feature.
        self.pos_embedding = torch.nn.Parameter(torch.zeros(grid ** 2 + 1, 1, emb_dim))
        blocks = [Block(emb_dim, num_head) for _ in range(num_layer)]
        self.transformer = torch.nn.Sequential(*blocks)
        self.head = torch.nn.Linear(emb_dim, 3 * patch_size ** 2)
        self.patch2img = Rearrange('(h w) b (c p1 p2) -> b c (h p1) (w p2)', p1=patch_size, p2=patch_size, h=image_size//patch_size)

        self.init_weight()

    def init_weight(self):
        """Initialise the mask token and positional embedding (std 0.02)."""
        for param in (self.mask_token, self.pos_embedding):
            trunc_normal_(param, std=.02)

    def forward(self, features, backward_indexes):
        """Reconstruct an image and the matching mask from encoder output.

        Args:
            features: Encoder features in (t, b, c) layout, global feature
                first.
            backward_indexes: Indexes that undo the patch shuffle, without an
                entry for the global feature.

        Returns:
            ``(img, mask)`` in (b, c, h, w) layout; ``mask`` is 1 on patches
            that were not among the encoder's input features and 0 elsewhere.
        """
        encoded_len = features.shape[0]  # global feature + visible patches

        # Make room for the global feature: it stays at index 0 and every
        # patch index moves up by one.
        global_slot = torch.zeros(1, backward_indexes.shape[1]).to(backward_indexes)
        backward_indexes = torch.cat([global_slot, backward_indexes + 1], dim=0)

        # Pad with mask tokens up to the full length, then restore the order.
        missing = backward_indexes.shape[0] - features.shape[0]
        mask_tokens = self.mask_token.expand(missing, features.shape[1], -1)
        features = torch.cat([features, mask_tokens], dim=0)
        features = take_indexes(features, backward_indexes)
        features = features + self.pos_embedding

        features = rearrange(features, 't b c -> b t c')
        features = self.transformer(features)
        features = rearrange(features, 'b t c -> t b c')

        # Drop the global feature before predicting pixels.
        patches = self.head(features[1:])

        # In shuffled order the first encoded_len - 1 patches were visible;
        # mark the rest, then move the marks to the original patch order.
        mask = torch.zeros_like(patches)
        mask[encoded_len - 1:] = 1
        mask = take_indexes(mask, backward_indexes[1:] - 1)

        return self.patch2img(patches), self.patch2img(mask)
class MAE_ViT(torch.nn.Module):
    """Masked autoencoder: an ``MAE_Encoder`` followed by an ``MAE_Decoder``.

    Both halves share ``image_size``, ``patch_size`` and ``emb_dim``; depth
    and head count are configured separately for each.
    """

    def __init__(self,
                 image_size=32,
                 patch_size=2,
                 emb_dim=192,
                 encoder_layer=12,
                 encoder_head=3,
                 decoder_layer=4,
                 decoder_head=3,
                 mask_ratio=0.75,
                 ) -> None:
        super().__init__()

        self.encoder = MAE_Encoder(
            image_size, patch_size, emb_dim, encoder_layer, encoder_head, mask_ratio,
        )
        self.decoder = MAE_Decoder(
            image_size, patch_size, emb_dim, decoder_layer, decoder_head,
        )

    def forward(self, img):
        """Return ``(predicted_img, mask)`` produced by the decoder for ``img``."""
        latent, restore_order = self.encoder(img)
        predicted_img, mask = self.decoder(latent, restore_order)
        return predicted_img, mask
class ViT_Classifier(torch.nn.Module):
    '''
    Image classifier assembled from the layers of an MAE_Encoder.
    The cls token, positional embedding, patchify convolution, transformer and
    layer norm are the encoder's own objects (shared, not copied). A dropout
    layer and a linear classification head are added on top.
    Unlike MAE_Encoder.forward, no patches are shuffled or dropped here, so
    every patch of the image reaches the transformer.
    '''

    def __init__(self, encoder : MAE_Encoder, dropout_p, num_classes=10) -> None:
        super().__init__()
        self.dropout_p = dropout_p

        # Layers taken over from the given encoder.
        self.cls_token = encoder.cls_token
        self.pos_embedding = encoder.pos_embedding
        self.patchify = encoder.patchify
        self.transformer = encoder.transformer
        self.layer_norm = encoder.layer_norm

        # New layers for the classification task.
        self.dropout = torch.nn.Dropout(dropout_p)
        embedding_width = self.pos_embedding.shape[-1]
        self.head = torch.nn.Linear(embedding_width, num_classes)

    def forward(self, img):
        '''Return class logits computed from the cls token feature of img.'''
        tokens = self.patchify(img)
        tokens = rearrange(tokens, 'b c h w -> (h w) b c')
        tokens = tokens + self.pos_embedding

        cls = self.cls_token.expand(-1, tokens.shape[1], -1)
        tokens = torch.cat([cls, tokens], dim=0)

        tokens = rearrange(tokens, 't b c -> b t c')
        features = self.layer_norm(self.transformer(tokens))
        # Back to (t, b, c): t counts the cls token plus all patches.
        features = rearrange(features, 'b t c -> t b c')

        # Dropout is skipped entirely when its probability is not positive.
        if self.dropout_p > 0:
            features = self.dropout(features)

        # Position 0 holds the cls token; only it feeds the head.
        cls_feature = features[0]
        return self.head(cls_feature)
class MAE_Encoder_FeatureExtractor(torch.nn.Module):
    '''
    Feature extractor that reuses the layers of an MAE_Encoder.
    The encoder's cls token, positional embedding, patchify convolution,
    transformer and layer norm are shared (not copied). No patches are
    shuffled or dropped, so features are produced for every patch.
    '''

    def __init__(self, encoder : MAE_Encoder) -> None:
        super().__init__()
        self.cls_token = encoder.cls_token
        self.pos_embedding = encoder.pos_embedding
        self.patchify = encoder.patchify
        self.transformer = encoder.transformer
        self.layer_norm = encoder.layer_norm

    def forward(self, img):
        '''Return features in (t, b, c) layout with the cls token first.'''
        tokens = self.patchify(img)
        tokens = rearrange(tokens, 'b c h w -> (h w) b c')
        tokens = tokens + self.pos_embedding

        cls = self.cls_token.expand(-1, tokens.shape[1], -1)
        tokens = torch.cat([cls, tokens], dim=0)

        tokens = rearrange(tokens, 't b c -> b t c')
        encoded = self.transformer(tokens)
        normalised = self.layer_norm(encoded)
        # t counts the cls token plus all patches, b is the batch size and
        # c the feature width.
        return rearrange(normalised, 'b t c -> t b c')
if __name__ == '__main__':
    # Smoke test 1: shuffle a small random sequence and show what is kept.
    shuffle = PatchShuffle(0.75)
    sample = torch.rand(16, 2, 10)
    kept, forward_indexes, backward_indexes = shuffle(sample)
    print(kept.shape)

    # Smoke test 2: push a random image batch through encoder and decoder.
    img = torch.rand(2, 3, 32, 32)
    encoder = MAE_Encoder()
    decoder = MAE_Decoder()
    features, backward_indexes = encoder(img)
    # forward_indexes still refers to the result of smoke test 1.
    print(forward_indexes.shape)
    predicted_img, mask = decoder(features, backward_indexes)
    print(predicted_img.shape)

    # Squared error counted on masked pixels only, divided by 0.75.
    squared_error = (predicted_img - img) ** 2
    loss = torch.mean(squared_error * mask / 0.75)
    print(loss)

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+# python=3.8
+torch
+torchvision
+tensorboard
+scikit-learn
+matplotlib
+numpy
+einops
+timm==0.4.12
+tqdm
+omega
+pyyaml
+opencv-python
+wandb
+icecream
+torchinfo

vit-t-mae-pretrain.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:852a6a0806c42a8c725b0de82cd0e7b59d7d79ad21f8e012bc599eedcce15375
+size 28972154