import gradio as gr
import torch
import torch.nn as nn
import torch.nn.functional as F
import clip
import pandas as pd
import hashlib
import numpy as np
import cv2
from PIL import Image


class MLP(nn.Module):
    """Score head that maps a CLIP image embedding to a single aesthetic value."""

    def __init__(self, input_size, xcol="emb", ycol="avg_rating"):
        super().__init__()
        self.input_size = input_size
        self.xcol = xcol
        self.ycol = ycol
        self.layers = nn.Sequential(
            nn.Linear(self.input_size, 1024),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.Dropout(0.1),
            nn.Linear(64, 16),
            nn.Linear(16, 1),
        )

    def forward(self, x):
        return self.layers(x)


def _binary_array_to_hex(arr):
    """Pack a boolean array into a hexadecimal string."""
    bit_string = ''.join(str(b) for b in 1 * arr.flatten())
    width = int(np.ceil(len(bit_string) / 4))
    return '{:0>{width}x}'.format(int(bit_string, 2), width=width)


def phashstr(image, hash_size=8, highfreq_factor=4):
    """Return the perceptual hash (pHash) of a PIL image as a hex string."""
    if hash_size < 2:
        raise ValueError('Hash size must be greater than or equal to 2')

    import scipy.fftpack

    img_size = hash_size * highfreq_factor
    image = image.convert('L').resize((img_size, img_size), Image.Resampling.LANCZOS)
    pixels = np.asarray(image)
    # Keep only the low-frequency corner of the 2D DCT and threshold at the median.
    dct = scipy.fftpack.dct(scipy.fftpack.dct(pixels, axis=0), axis=1)
    dctlowfreq = dct[:hash_size, :hash_size]
    med = np.median(dctlowfreq)
    diff = dctlowfreq > med
    return _binary_array_to_hex(diff.flatten())


def normalized(a, axis=-1, order=2):
    """Normalize `a` along `axis` (L2 by default), guarding against zero-norm rows."""
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)


def predict(image):
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Aesthetic score head (sac+logos+ava1-l14-linearMSE) over 768-dim CLIP ViT-L/14 embeddings.
    # torch.hub caches the downloaded weights, although the models are rebuilt on every call.
    model = MLP(768)
    pthpath = "https://huggingface.co/haor/aesthetics/resolve/main/sac%2Blogos%2Bava1-l14-linearMSE.pth"
    model.load_state_dict(torch.hub.load_state_dict_from_url(pthpath, map_location=device))
    model.to(device).eval()

    # CLIP backbone used to embed the input image.
    model2, preprocess = clip.load("ViT-L/14", device=device)

    # Gradio passes a NumPy array; convert to PIL for hashing and CLIP preprocessing.
    image = Image.fromarray(image)
    image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    laplacian_variance = cv2.Laplacian(image_np, cv2.CV_64F).var()  # simple sharpness measure
    phash = phashstr(image)
    md5 = hashlib.md5(image.tobytes()).hexdigest()
    sha1 = hashlib.sha1(image.tobytes()).hexdigest()

    inputs = preprocess(image).unsqueeze(0).to(device)

    with torch.no_grad():
        img_emb = model2.encode_image(inputs)
        img_emb = normalized(img_emb.cpu().numpy())
        # .float() (instead of torch.cuda.FloatTensor) keeps this working on CPU-only machines.
        prediction = model(torch.from_numpy(img_emb).to(device).float()).item()

    result = {
        "clip_aesthetic": prediction,
        "phash": phash,
        "md5": md5,
        "sha1": sha1,
        "laplacian_variance": laplacian_variance,
    }
    return result


title = "CLIP Aesthetic Score"
description = (
    "Upload an image to predict its aesthetic score from a CLIP ViT-L/14 embedding and to "
    "compute additional metrics: perceptual hash, MD5/SHA-1 digests, and Laplacian variance (sharpness)."
)

gr.Interface(
    fn=predict,
    inputs=gr.Image(type="numpy"),
    outputs=gr.JSON(label="Result"),
    title=title,
    description=description,
    examples=[["example1.jpg"], ["example2.jpg"]],
).launch()
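# Quick local check (a sketch, not part of the app): assuming the weights download succeeds
# and an "example1.jpg" file sits next to this script, predict() can be exercised without
# the Gradio UI:
#
#     import numpy as np
#     from PIL import Image
#     scores = predict(np.array(Image.open("example1.jpg").convert("RGB")))
#     print(scores["clip_aesthetic"], scores["laplacian_variance"])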