import logging

import gradio as gr
import pandas as pd
import tensorflow_hub as hub
import torch
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity

from lavis.models import load_model_and_preprocess
from transformers import AutoModelForCausalLM, AutoProcessor

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load model and preprocessors for Image-Text Matching (LAVIS)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_itm, vis_processors, text_processors = load_model_and_preprocess("blip2_image_text_matching", "pretrain", device=device, is_eval=True)

# Load processor and model for Image Captioning (GIT large, fine-tuned on TextCaps)
git_processor_large_textcaps = AutoProcessor.from_pretrained("microsoft/git-large-r-textcaps")
git_model_large_textcaps = AutoModelForCausalLM.from_pretrained("microsoft/git-large-r-textcaps").to(device)

# Load Universal Sentence Encoder model for textual similarity calculation
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
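# Note: USE v4 maps each input string to a 512-dimensional embedding; `embed`
# takes a list of strings and returns a matching batch of vectors.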

# Define a function to compute textual similarity between caption and statement
def compute_textual_similarity(caption, statement):
    # Convert caption and statement into sentence embeddings
    caption_embedding = embed([caption])[0].numpy()
    statement_embedding = embed([statement])[0].numpy()

    # Calculate cosine similarity between sentence embeddings
    similarity_score = cosine_similarity([caption_embedding], [statement_embedding])[0][0]
    return similarity_score
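
# Illustrative (hypothetical) call:
#   compute_textual_similarity("a cartoon bear on a beer label",
#                              "contains or features a cartoon, figurine, or toy")
# returns a cosine similarity in roughly [-1, 1]; values closer to 1 indicate
# greater semantic overlap between the caption and the statement.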

# List of statements for Image-Text Matching
statements = [
    "contains or features a cartoon, figurine, or toy",
    "appears to be for children",
    "includes children",
    "sexual",
    "nudity",
    "depicts a child or portrays objects, images, or cartoon figures that primarily appeal to persons below the legal purchase age",
    "uses the name of or depicts Santa Claus",
    'promotes alcohol use as a "rite of passage" to adulthood',
    "uses brand identification—including logos, trademarks, or names—on clothing, toys, games, game equipment, or other items intended for use primarily by persons below the legal purchase age",
    "portrays persons in a state of intoxication or in any way suggests that intoxication is socially acceptable conduct",
    "makes curative or therapeutic claims, except as permitted by law",
    "makes claims or representations that individuals can attain social, professional, educational, or athletic success or status due to beverage alcohol consumption",
    "degrades the image, form, or status of women, men, or of any ethnic group, minority, sexual orientation, religious affiliation, or other such group?",
    "uses lewd or indecent images or language",
    "employs religion or religious themes?",
    "relies upon sexual prowess or sexual success as a selling point for the brand",
    "uses graphic or gratuitous nudity, overt sexual activity, promiscuity, or sexually lewd or indecent images or language",
    "associates with anti-social or dangerous behavior",
    "depicts illegal activity",
    'uses the term "spring break" or sponsors events or activities that use the term "spring break," unless those events or activities are located at a licensed retail establishment',
]

# Function to compute ITM scores for the image-statement pair
def compute_itm_score(image, statement):
    logging.info('Starting compute_itm_score')
    # Gradio provides the image as a NumPy array; convert it to a PIL RGB image
    pil_image = Image.fromarray(image.astype('uint8'), 'RGB')
    img = vis_processors["eval"](pil_image).unsqueeze(0).to(device)
    # Pass the statement text directly to model_itm
    itm_output = model_itm({"image": img, "text_input": statement}, match_head="itm")
    itm_scores = torch.nn.functional.softmax(itm_output, dim=1)
    score = itm_scores[:, 1].item()
    logging.info('Finished compute_itm_score')
    return score
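
# The ITM head yields two logits (no-match / match); after softmax,
# itm_scores[:, 1] is the model's probability that the image and statement
# match, so the returned score is a float in [0, 1].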

def generate_caption(processor, model, image):
    logging.info('Starting generate_caption')
    inputs = processor(images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=50)
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    logging.info('Finished generate_caption')
    return generated_caption
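
# Illustrative (hypothetical) call:
#   generate_caption(git_processor_large_textcaps, git_model_large_textcaps, image)
# returns a single caption string; max_length=50 bounds the generated sequence.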

# Main function to perform image captioning and image-text matching
def process_images_and_statements(image):
    logging.info('Starting process_images_and_statements')

    # Generate image caption for the uploaded image using git-large-r-textcaps
    caption = generate_caption(git_processor_large_textcaps, git_model_large_textcaps, image)

    # Define weights for combining textual similarity score and image-statement ITM score (adjust as needed)
    weight_textual_similarity = 0.5
    weight_statement = 0.5

    # Collect one result row per predefined statement
    rows = []

    # Loop through each predefined statement
    for statement in statements:
        # Compute textual similarity between caption and statement
        textual_similarity_score = compute_textual_similarity(caption, statement)

        # Compute ITM score for the image-statement pair
        itm_score_statement = compute_itm_score(image, statement)

        # Combine the two scores using a weighted average, scaled to a 0-100 range
        final_score = ((weight_textual_similarity * textual_similarity_score) +
                       (weight_statement * itm_score_statement)) * 100

        rows.append({
            'Statement': statement,
            'Textual Similarity Score': textual_similarity_score * 100,  # scaled to 0-100
            'ITM Score': itm_score_statement * 100,  # scaled to 0-100
            'Final Combined Score': final_score
        })

    # Build the DataFrame in one pass (DataFrame.append was removed in pandas 2.0)
    results_df = pd.DataFrame(rows, columns=['Statement', 'Textual Similarity Score', 'ITM Score', 'Final Combined Score'])

    logging.info('Finished process_images_and_statements')

    # Return the DataFrame directly; Gradio renders pandas DataFrames natively
    return results_df
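
# Hypothetical smoke test outside the Gradio UI (assumes a local "example.jpg"):
#   import numpy as np
#   test_image = np.array(Image.open("example.jpg").convert("RGB"))
#   df = process_images_and_statements(test_image)
#   print(df.sort_values('Final Combined Score', ascending=False).head())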

# Gradio interface (gr.inputs/gr.outputs are deprecated; use the top-level components)
image_input = gr.Image()
output = gr.Dataframe(type="pandas", label="Results")

iface = gr.Interface(
    fn=process_images_and_statements,
    inputs=image_input,
    outputs=output,
    title="Image Captioning and Image-Text Matching",
    theme='sudeepshouche/minimalist',
    css=".output { flex-direction: column; } .output .outputs { width: 100%; }"  # Custom CSS
)

iface.launch()