haor committed on
Commit b5fd63d · verified · 1 Parent(s): d98182a

Update app.py

Files changed (1)
  1. app.py +22 -23
app.py CHANGED
@@ -1,7 +1,6 @@
 import gradio as gr
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 import clip
 import pandas as pd
 import hashlib
@@ -10,15 +9,12 @@ import cv2
 import time
 from PIL import Image
 
-# if you changed the MLP architecture during training, change it also here:
+# MLP model definition
 class MLP(nn.Module):
-    def __init__(self, input_size, xcol="emb", ycol="avg_rating"):
+    def __init__(self, input_size):
         super().__init__()
-        self.input_size = input_size
-        self.xcol = xcol
-        self.ycol = ycol
         self.layers = nn.Sequential(
-            nn.Linear(self.input_size, 1024),
+            nn.Linear(input_size, 1024),
             nn.Dropout(0.2),
             nn.Linear(1024, 128),
             nn.Dropout(0.2),
@@ -31,12 +27,14 @@ class MLP(nn.Module):
     def forward(self, x):
         return self.layers(x)
 
-def _binary_array_to_hex(arr):
+# Convert binary array to hexadecimal string
+def binary_array_to_hex(arr):
     bit_string = ''.join(str(b) for b in 1 * arr.flatten())
     width = int(np.ceil(len(bit_string) / 4))
     return '{:0>{width}x}'.format(int(bit_string, 2), width=width)
 
-def phashstr(image, hash_size=8, highfreq_factor=4):
+# Calculate perceptual hash of an image
+def phash(image, hash_size=8, highfreq_factor=4):
     if hash_size < 2:
         raise ValueError('Hash size must be greater than or equal to 2')
 
@@ -48,8 +46,9 @@ def phashstr(image, hash_size=8, highfreq_factor=4):
     dctlowfreq = dct[:hash_size, :hash_size]
     med = np.median(dctlowfreq)
     diff = dctlowfreq > med
-    return _binary_array_to_hex(diff.flatten())
+    return binary_array_to_hex(diff)
 
+# Convert NumPy types to Python built-in types
 def convert_numpy_types(data):
     if isinstance(data, dict):
         return {key: convert_numpy_types(value) for key, value in data.items()}
@@ -62,19 +61,13 @@ def convert_numpy_types(data):
     else:
         return data
 
-def normalized_np(a, axis=-1, order=2):
-    import numpy as np  # pylint: disable=import-outside-toplevel
-
-    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
-    l2[l2 == 0] = 1
-    return a / np.expand_dims(l2, axis)
-
-def normalized(a, axis=-1, order=2):
+# Normalize tensor
+def normalize(a, axis=-1, order=2):
     l2 = torch.linalg.norm(a, dim=axis, ord=order, keepdim=True)
     l2[l2 == 0] = 1
     return a / l2
 
-
+# Load pre-trained MLP model and CLIP model
 model = MLP(768)  # CLIP embedding dim is 768 for CLIP ViT L 14
 pthpath = "https://huggingface.co/haor/aesthetics/resolve/main/sac%2Blogos%2Bava1-l14-linearMSE.pth"
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -82,41 +75,47 @@ model.load_state_dict(torch.hub.load_state_dict_from_url(pthpath, map_location=d
 model.to(device).eval()
 model2, preprocess = clip.load("ViT-L/14", device=device)
 
+# Predict aesthetic score and other metrics of an image
 def predict(image):
-
+    # Preprocess image
     image = Image.fromarray(image)
     image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
     laplacian_variance = cv2.Laplacian(image_np, cv2.CV_64F).var()
-    phash = phashstr(image)
+    phash_value = phash(image)
     md5 = hashlib.md5(image.tobytes()).hexdigest()
     sha1 = hashlib.sha1(image.tobytes()).hexdigest()
     inputs = preprocess(image).unsqueeze(0).to(device)
 
     with torch.no_grad():
+        # Extract image features using CLIP model
        start_time = time.time()
         img_emb = model2.encode_image(inputs)
         end_time = time.time()
         print(f"Encoding image took {end_time - start_time} seconds")
 
+        # Normalize image features
         start_time = time.time()
-        img_emb = normalized(img_emb).float()
+        img_emb = normalize(img_emb).float()
         end_time = time.time()
         print(f"Normalizing image took {end_time - start_time} seconds")
 
+        # Predict aesthetic score using MLP model
         start_time = time.time()
         prediction = model(img_emb).item()
         end_time = time.time()
         print(f"Making prediction took {end_time - start_time} seconds")
 
+    # Return prediction results
     result = {
         "clip_aesthetic": prediction,
-        "phash": phash,
+        "phash": phash_value,
         "md5": md5,
         "sha1": sha1,
         "laplacian_variance": laplacian_variance
     }
     return convert_numpy_types(result)
 
+# Create web interface using Gradio
 title = "CLIP Aesthetic Score"
 description = "Upload an image to predict its aesthetic score using the CLIP model and calculate other image metrics."
 
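The diff ends at the title and description; the Gradio wiring that consumes them lies outside the changed hunks. Below is a minimal sketch of how predict, title, and description would typically be hooked up, assuming a gr.Interface with a NumPy image input and JSON output. The component choices and the demo variable are assumptions for illustration, not code taken from app.py.

# Hypothetical wiring -- the actual app.py may construct the interface differently.
demo = gr.Interface(
    fn=predict,                     # predict returns a dict of metrics (see the diff above)
    inputs=gr.Image(type="numpy"),  # predict expects a NumPy array (Image.fromarray)
    outputs=gr.JSON(),              # convert_numpy_types makes the result JSON-serializable
    title=title,
    description=description,
)

if __name__ == "__main__":
    demo.launch()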