Spaces:

haor
/

clip_aes

Sleeping

App Files Community

haor committed on Apr 8, 2024

Commit

b5fd63d

verified ·

1 Parent(s): d98182a

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -23

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import gradio as gr
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 import clip
 import pandas as pd
 import hashlib
@@ -10,15 +9,12 @@ import cv2
 import time
 from PIL import Image
-# if you changed the MLP architecture during training, change it also here:
 class MLP(nn.Module):
-    def __init__(self, input_size, xcol="emb", ycol="avg_rating"):
         super().__init__()
-        self.input_size = input_size
-        self.xcol = xcol
-        self.ycol = ycol
         self.layers = nn.Sequential(
-            nn.Linear(self.input_size, 1024),
             nn.Dropout(0.2),
             nn.Linear(1024, 128),
             nn.Dropout(0.2),
@@ -31,12 +27,14 @@ class MLP(nn.Module):
     def forward(self, x):
         return self.layers(x)
-def _binary_array_to_hex(arr):
     bit_string = ''.join(str(b) for b in 1 * arr.flatten())
     width = int(np.ceil(len(bit_string) / 4))
     return '{:0>{width}x}'.format(int(bit_string, 2), width=width)
-def phashstr(image, hash_size=8, highfreq_factor=4):
     if hash_size < 2:
         raise ValueError('Hash size must be greater than or equal to 2')
@@ -48,8 +46,9 @@ def phashstr(image, hash_size=8, highfreq_factor=4):
     dctlowfreq = dct[:hash_size, :hash_size]
     med = np.median(dctlowfreq)
     diff = dctlowfreq > med
-    return _binary_array_to_hex(diff.flatten())
 def convert_numpy_types(data):
     if isinstance(data, dict):
         return {key: convert_numpy_types(value) for key, value in data.items()}
@@ -62,19 +61,13 @@ def convert_numpy_types(data):
     else:
         return data
-def normalized_np(a, axis=-1, order=2):
-    import numpy as np  # pylint: disable=import-outside-toplevel
-    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
-    l2[l2 == 0] = 1
-    return a / np.expand_dims(l2, axis)
-def normalized(a, axis=-1, order=2):
     l2 = torch.linalg.norm(a, dim=axis, ord=order, keepdim=True)
     l2[l2 == 0] = 1
     return a / l2
 model = MLP(768)  # CLIP embedding dim is 768 for CLIP ViT L 14
 pthpath = "https://huggingface.co/haor/aesthetics/resolve/main/sac%2Blogos%2Bava1-l14-linearMSE.pth"
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -82,41 +75,47 @@ model.load_state_dict(torch.hub.load_state_dict_from_url(pthpath, map_location=d
 model.to(device).eval()
 model2, preprocess = clip.load("ViT-L/14", device=device)
 def predict(image):
     image = Image.fromarray(image)
     image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
     laplacian_variance = cv2.Laplacian(image_np, cv2.CV_64F).var()
-    phash = phashstr(image)
     md5 = hashlib.md5(image.tobytes()).hexdigest()
     sha1 = hashlib.sha1(image.tobytes()).hexdigest()
     inputs = preprocess(image).unsqueeze(0).to(device)
     with torch.no_grad():
         start_time = time.time()
         img_emb = model2.encode_image(inputs)
         end_time = time.time()
         print(f"Encoding image took {end_time - start_time} seconds")
         start_time = time.time()
-        img_emb = normalized(img_emb).float()
         end_time = time.time()
         print(f"Normalizing image took {end_time - start_time} seconds")
         start_time = time.time()
         prediction = model(img_emb).item()
         end_time = time.time()
         print(f"Making prediction took {end_time - start_time} seconds")
     result = {
         "clip_aesthetic": prediction,
-        "phash": phash,
         "md5": md5,
         "sha1": sha1,
         "laplacian_variance": laplacian_variance
     }
     return convert_numpy_types(result)
 title = "CLIP Aesthetic Score"
 description = "Upload an image to predict its aesthetic score using the CLIP model and calculate other image metrics."

 import gradio as gr
 import torch
 import torch.nn as nn
 import clip
 import pandas as pd
 import hashlib
 import time
 from PIL import Image
+# MLP model definition
 class MLP(nn.Module):
+    def __init__(self, input_size):
         super().__init__()
         self.layers = nn.Sequential(
+            nn.Linear(input_size, 1024),
             nn.Dropout(0.2),
             nn.Linear(1024, 128),
             nn.Dropout(0.2),
     def forward(self, x):
         return self.layers(x)
+# Convert binary array to hexadecimal string
+def binary_array_to_hex(arr):
     bit_string = ''.join(str(b) for b in 1 * arr.flatten())
     width = int(np.ceil(len(bit_string) / 4))
     return '{:0>{width}x}'.format(int(bit_string, 2), width=width)
+# Calculate perceptual hash of an image
+def phash(image, hash_size=8, highfreq_factor=4):
     if hash_size < 2:
         raise ValueError('Hash size must be greater than or equal to 2')
     dctlowfreq = dct[:hash_size, :hash_size]
     med = np.median(dctlowfreq)
     diff = dctlowfreq > med
+    return binary_array_to_hex(diff)
+# Convert NumPy types to Python built-in types
 def convert_numpy_types(data):
     if isinstance(data, dict):
         return {key: convert_numpy_types(value) for key, value in data.items()}
     else:
         return data
+# Normalize tensor
+def normalize(a, axis=-1, order=2):
     l2 = torch.linalg.norm(a, dim=axis, ord=order, keepdim=True)
     l2[l2 == 0] = 1
     return a / l2
+# Load pre-trained MLP model and CLIP model
 model = MLP(768)  # CLIP embedding dim is 768 for CLIP ViT L 14
 pthpath = "https://huggingface.co/haor/aesthetics/resolve/main/sac%2Blogos%2Bava1-l14-linearMSE.pth"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device).eval()
 model2, preprocess = clip.load("ViT-L/14", device=device)
+# Predict aesthetic score and other metrics of an image
 def predict(image):
+    # Preprocess image
     image = Image.fromarray(image)
     image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
     laplacian_variance = cv2.Laplacian(image_np, cv2.CV_64F).var()
+    phash_value = phash(image)
     md5 = hashlib.md5(image.tobytes()).hexdigest()
     sha1 = hashlib.sha1(image.tobytes()).hexdigest()
     inputs = preprocess(image).unsqueeze(0).to(device)
     with torch.no_grad():
+        # Extract image features using CLIP model
         start_time = time.time()
         img_emb = model2.encode_image(inputs)
         end_time = time.time()
         print(f"Encoding image took {end_time - start_time} seconds")
+        # Normalize image features
         start_time = time.time()
+        img_emb = normalize(img_emb).float()
         end_time = time.time()
         print(f"Normalizing image took {end_time - start_time} seconds")
+        # Predict aesthetic score using MLP model
         start_time = time.time()
         prediction = model(img_emb).item()
         end_time = time.time()
         print(f"Making prediction took {end_time - start_time} seconds")
+    # Return prediction results
     result = {
         "clip_aesthetic": prediction,
+        "phash": phash_value,
         "md5": md5,
         "sha1": sha1,
         "laplacian_variance": laplacian_variance
     }
     return convert_numpy_types(result)
+# Create web interface using Gradio
 title = "CLIP Aesthetic Score"
 description = "Upload an image to predict its aesthetic score using the CLIP model and calculate other image metrics."