"""Gradio demo that scores an avatar image with CLIP features and AutoGluon.

At import time the script loads a saved AutoGluon ``TabularPredictor`` and a
CLIP model on the CPU, builds a Gradio interface around ``predict_fn`` and
launches it.
"""

import json
import logging
import os

import clip
import gradio as gr
import pandas as pd
import torch
from autogluon.tabular import TabularPredictor
from PIL import Image

# Location of the saved AutoGluon predictor (relative to the working directory).
PREDICTOR_PATH = "ag-20240618_230402"
CLIP_MODEL_NAME = "ViT-B/32"

predictor = TabularPredictor.load(PREDICTOR_PATH)

# set logging level
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger("AQ")

clip_model, preprocess = clip.load(CLIP_MODEL_NAME, device="cpu")


def predict_fn(input_img):
    """Return a JSON string holding the predicted quality score for an image.

    The image is encoded with the CLIP image encoder, the resulting feature
    vector is placed in a single-row DataFrame, and the AutoGluon predictor's
    first prediction is reported as ``quality_score``.

    Args:
        input_img: Array-like image as delivered by the Gradio ``"image"``
            input (it must support ``.astype``); ``None`` when the user
            submits without providing an image.

    Returns:
        A JSON document of the form ``{"quality_score": <float>}`` as ``str``.

    Raises:
        gr.Error: If no image was provided.
    """
    if input_img is None:
        # Gradio passes None for an empty image input; fail with a readable
        # message instead of an AttributeError on ``.astype``.
        raise gr.Error("Please provide an image to score.")

    # Let Pillow infer the mode from the array shape and normalise to RGB
    # afterwards, rather than forcing "RGB" onto arrays that may be grayscale
    # or RGBA. For an HxWx3 uint8 array the result is the same as before.
    pil_image = Image.fromarray(input_img.astype("uint8")).convert("RGB")
    image = preprocess(pil_image).unsqueeze(0)  # add the batch dimension

    with torch.no_grad():
        image_features = clip_model.encode_image(image).numpy()

    # One row, one column per feature value.
    input_df = pd.DataFrame(image_features[0].reshape(1, -1))
    quality_score = float(predictor.predict(input_df).iloc[0])
    logger.info("decision: %s", quality_score)

    # Return str, not bytes: the "text" output stringifies its value, so a
    # bytes object would be displayed as a Python literal (b'...') instead of
    # plain JSON.
    decision_json = json.dumps({"quality_score": quality_score})
    logger.info("decision_json: %s", decision_json)
    return decision_json


iface = gr.Interface(
    fn=predict_fn,
    inputs="image",
    outputs="text",
    description=(
        " The model returns quality score for an avatar based on visual"
        " appeal and humanoid appearance. "
    ),
    allow_flagging="manual",
)

# Launched at import time, exactly as the original script did.
iface.launch()