File size: 6,335 Bytes
4bd62d7
d933c69
4bd62d7
d933c69
0116945
660ea27
4bd62d7
 
0116945
4bd62d7
 
 
 
d933c69
4bd62d7
 
 
d933c69
 
 
4bd62d7
159fca3
4bd62d7
 
 
660ea27
e27c656
d933c69
 
 
4bd62d7
 
0fcf1c4
4bd62d7
 
d933c69
 
 
 
 
4bd62d7
 
 
 
 
 
 
 
 
 
 
 
 
 
0116945
4bd62d7
 
 
 
 
0116945
 
4bd62d7
 
 
0116945
 
 
 
 
 
 
 
 
d933c69
0116945
 
d933c69
0116945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4bd62d7
 
 
0116945
4bd62d7
d933c69
0116945
 
 
d933c69
0116945
 
 
 
 
 
 
 
 
 
 
 
 
 
d933c69
0116945
4bd62d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0116945
 
 
 
 
 
4bd62d7
0116945
 
 
4bd62d7
0116945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d933c69
0116945
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import os
import requests
from PIL import Image, UnidentifiedImageError
import numpy as np
import gradio as gr
from encoder import FashionCLIPEncoder
from pinecone import Pinecone
from dotenv import load_dotenv
import json

# Load environment variables
# Runs before the os.getenv() calls below so that values supplied through a
# .env file are visible to them.
load_dotenv()

# Constants
# Pinecone settings are read from the environment; os.getenv() yields None for
# any variable that is not set.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_NAMESPACE = os.getenv("PINECONE_NAMESPACE")
# Headers sent with every image download (see download_image_as_pil); the
# User-Agent is a desktop Chrome string.
REQUESTS_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Number of downloaded images batch_process_images collects before handing
# them to process_batch in one call.
BATCH_SIZE = 30

# Initialize services
# These objects are created at import time and shared by the functions below:
# `index` is used by upload_vector_to_pinecone, `encoder` by process_batch.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX_NAME)
encoder = FashionCLIPEncoder()

def download_image_as_pil(url: str, timeout: int = 10) -> "Image.Image | None":
    """Download the image at ``url`` and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The decoded image converted to RGB, or ``None`` when the request
        fails, the response is not a 200 with an image Content-Type, or the
        payload cannot be identified as an image. Failures are printed, never
        raised.
    """
    try:
        # stream=True keeps the connection open until the body is consumed or
        # the response is closed; the context manager guarantees the latter
        # on every exit path (including the early returns below).
        with requests.get(url, stream=True, headers=REQUESTS_HEADERS, timeout=timeout) as response:
            content_type = response.headers.get('Content-Type', '')
            if response.status_code != 200 or 'image' not in content_type:
                return None

            # response.raw bypasses requests' automatic decoding, so ask
            # urllib3 to undo any gzip/deflate Content-Encoding itself.
            response.raw.decode_content = True
            try:
                # convert() forces the pixel data to be read while the
                # connection is still open.
                return Image.open(response.raw).convert("RGB")
            except UnidentifiedImageError:
                print(f"Unidentified image file from URL: {url}")
                return None
    except Exception as e:
        # Best-effort download: callers treat None as "could not fetch".
        print(f"Error downloading image: {e}")
        return None

def process_batch(batch_products, batch_images, results):
    """Embed one batch of images and append one result entry per product.

    Args:
        batch_products: Product dicts (``product_id`` and ``url`` keys),
            aligned index-for-index with ``batch_images``.
        batch_images: Images passed to ``encoder.encode_images``.
        results: List that receives the entries; it is extended in place.

    Each product yields exactly one entry: a success entry carrying the
    L2-normalised embedding, or an error entry. If anything fails for the
    batch as a whole, every product in the batch gets an error entry and no
    partial success entries are left behind.
    """
    try:
        # Generate embeddings
        # NOTE(review): assumes encode_images yields one embedding per image,
        # in input order - the length check below guards that assumption.
        embeddings = encoder.encode_images(batch_images)

        batch_results = []
        for product, embedding in zip(batch_products, embeddings):
            vector = np.asarray(embedding)
            norm = np.linalg.norm(vector)

            # Dividing by a zero/non-finite norm would emit NaN/inf values,
            # which are neither valid JSON nor usable as vectors.
            if not np.isfinite(norm) or norm == 0:
                batch_results.append({
                    "product_id": product["product_id"],
                    "image_url": product["url"],
                    "error": "Embedding has zero or non-finite norm"
                })
                continue

            # Normalize embedding
            embedding_normalized = vector / norm

            batch_results.append({
                "product_id": product["product_id"],
                "image_url": product["url"],
                "embedding": embedding_normalized.tolist(),
                "embedding_preview": embedding_normalized[:5].tolist(),
                "success": True
            })

        # zip() truncates silently; make a short encoder output an error
        # instead of dropping products from the report.
        if len(batch_results) != len(batch_products):
            raise ValueError(
                f"Encoder returned {len(batch_results)} embeddings "
                f"for {len(batch_products)} images"
            )
    except Exception as e:
        # Replace (not extend) whatever was collected so no product is
        # reported twice; .get() keeps the handler itself from raising on a
        # malformed product dict.
        batch_results = [
            {
                "product_id": product.get("product_id"),
                "image_url": product.get("url"),
                "error": str(e)
            }
            for product in batch_products
        ]

    results.extend(batch_results)

def _product_error(product, message):
    """Build an error entry for ``product`` without assuming it is well-formed."""
    is_mapping = isinstance(product, dict)
    return {
        "product_id": product.get("product_id") if is_mapping else None,
        "image_url": product.get("url") if is_mapping else None,
        "error": message
    }


def batch_process_images(json_input: str):
    """Download, embed and optionally upload the products described in JSON.

    Args:
        json_input: JSON text of the form
            ``{"products": [{"product_id": ..., "url": ...}, ...],
            "upload_to_pinecone": bool}``.

    Returns:
        ``{"processing_results": [...]}`` with one entry per product (plus a
        ``"pinecone_upload"`` summary when an upload was requested), or
        ``{"error": message}`` when the payload as a whole is unusable.
        Problems with an individual product are reported in that product's
        entry and do not abort the run.
    """
    try:
        # Parse JSON input
        data = json.loads(json_input)
        if not isinstance(data, dict):
            return {"error": "JSON input must be an object with a 'products' list."}

        products = data.get("products", [])
        upload_to_pinecone = data.get("upload_to_pinecone", False)

        if not products:
            return {"error": "No products provided in JSON input."}
        if not isinstance(products, list):
            return {"error": "'products' must be a list of product objects."}

        results = []
        batch_products, batch_images = [], []

        for product in products:
            # Reject malformed entries up front so one bad product cannot
            # raise inside the error handling and discard every result.
            if (
                not isinstance(product, dict)
                or "product_id" not in product
                or not product.get("url")
            ):
                results.append(_product_error(
                    product, "Product entry must be an object with 'product_id' and 'url'"
                ))
                continue

            try:
                # Download image
                image = download_image_as_pil(product["url"])
                if image is None:
                    results.append(_product_error(product, "Failed to download image"))
                    continue

                batch_products.append(product)
                batch_images.append(image)

                # Process batch when reaching batch size
                if len(batch_images) == BATCH_SIZE:
                    process_batch(batch_products, batch_images, results)
                    batch_products, batch_images = [], []

            except Exception as e:
                results.append(_product_error(product, str(e)))

        # Process remaining images in the last batch
        if batch_images:
            process_batch(batch_products, batch_images, results)

        # Upload to Pinecone if requested
        if upload_to_pinecone:
            upload_result = upload_vector_to_pinecone(results)
            return {
                "processing_results": results,
                "pinecone_upload": upload_result
            }

        return {"processing_results": results}

    except json.JSONDecodeError:
        return {"error": "Invalid JSON format"}
    except Exception as e:
        return {"error": f"Unexpected error: {str(e)}"}

def upload_vector_to_pinecone(processed_results, batch_size=100):
    """Upload the successful embeddings in ``processed_results`` to Pinecone.

    Args:
        processed_results: Result dicts as produced by ``process_batch``.
            Entries that carry an ``'error'`` key or lack an ``'embedding'``
            are skipped.
        batch_size: Maximum number of vectors sent per upsert request.

    Returns:
        ``{"uploaded_count": n}`` where ``n`` is the number of vectors sent.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    vectors_to_upsert = [
        {
            # Pinecone record IDs are strings, while IDs parsed from JSON
            # may be numbers.
            'id': str(result['product_id']),
            'values': result['embedding'],
            'metadata': {
                'image_url': result['image_url']
            }
        }
        for result in processed_results
        if 'error' not in result and 'embedding' in result
    ]

    # Send in chunks: a single request holding every vector can exceed
    # Pinecone's per-request size limits on large inputs.
    for start in range(0, len(vectors_to_upsert), batch_size):
        index.upsert(
            vectors=vectors_to_upsert[start:start + batch_size],
            namespace=PINECONE_NAMESPACE
        )

    return {"uploaded_count": len(vectors_to_upsert)}

# Example JSON input
# Sample payload in the shape batch_process_images parses: a "products" list
# of {"product_id", "url"} objects plus the "upload_to_pinecone" flag. It is
# serialized with json.dumps to pre-fill the input editor of the interface.
EXAMPLE_INPUT = {
    "products": [
        {
            "product_id": "1",
            "url": "https://cdn.shopify.com/s/files/1/0522/2239/4534/files/CT21355-22_1024x1024.webp"
        }
    ],
    "upload_to_pinecone": False
}

# Gradio Interface
# Wires batch_process_images to a JSON code editor (input) and a JSON viewer
# (output). The `article` text documents the expected payload for users.
iface = gr.Interface(
    fn=batch_process_images,
    inputs=gr.Code(
        label="Input JSON",
        language="json",
        value=json.dumps(EXAMPLE_INPUT, indent=4)  # initial editor content: the example payload
    ),
    outputs=gr.JSON(label="Processing Results"),
    title="Fashion CLIP Embedding Generator",
    description="Provide JSON input with product IDs, URLs, and Pinecone upload preference to generate embeddings.",
    article="""
    ### Input JSON Format:
    ```json
    {
        "products": [
            {
                "product_id": "string",
                "url": "string"
            }
        ],
        "upload_to_pinecone": boolean
    }
    ```
    
    ### Features:
    - Batch processing of multiple images
    - Custom product ID support
    - Embedding generation using Fashion CLIP
    - Optional Pinecone database integration
    - Error handling and detailed results
    
    Make sure to set up your environment variables in a .env file:
    - PINECONE_API_KEY
    - PINECONE_INDEX_NAME
    - PINECONE_NAMESPACE
    """
)

# Launch Gradio App
# Only start the web server when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    iface.launch()