Spaces:
Running
Running
File size: 6,335 Bytes
4bd62d7 d933c69 4bd62d7 d933c69 0116945 660ea27 4bd62d7 0116945 4bd62d7 d933c69 4bd62d7 d933c69 4bd62d7 159fca3 4bd62d7 660ea27 e27c656 d933c69 4bd62d7 0fcf1c4 4bd62d7 d933c69 4bd62d7 0116945 4bd62d7 0116945 4bd62d7 0116945 d933c69 0116945 d933c69 0116945 4bd62d7 0116945 4bd62d7 d933c69 0116945 d933c69 0116945 d933c69 0116945 4bd62d7 0116945 4bd62d7 0116945 4bd62d7 0116945 d933c69 0116945 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 |
import json
import os
from io import BytesIO
from typing import Optional

import gradio as gr
import numpy as np
import requests
from dotenv import load_dotenv
from PIL import Image, UnidentifiedImageError
from pinecone import Pinecone

from encoder import FashionCLIPEncoder
# Load environment variables before any os.getenv() call below reads them
# (the app's help text expects them to be defined in a .env file).
load_dotenv()

# Constants
# Pinecone connection settings; os.getenv() yields None for an unset variable.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_NAMESPACE = os.getenv("PINECONE_NAMESPACE")
# Headers sent with every image download; the User-Agent mimics a desktop
# browser (presumably because some image hosts reject default client agents).
REQUESTS_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Number of downloaded images accumulated before they are encoded together.
BATCH_SIZE = 30

# Initialize services
# These run at import time: a Pinecone client, a handle to the target index
# (used for upserts), and the image encoder that produces the embeddings.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX_NAME)
encoder = FashionCLIPEncoder()
def download_image_as_pil(url: str, timeout: int = 10) -> Optional[Image.Image]:
    """Download an image over HTTP and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The decoded image converted to RGB, or ``None`` when the request
        fails, the status is not 200, the Content-Type is not an image, or
        the payload cannot be identified as an image. Failures are printed,
        never raised.
    """
    try:
        # stream=True lets the status and Content-Type be checked before the
        # body is fetched; the context manager guarantees the connection is
        # released on every path (the streamed response is otherwise leaked).
        with requests.get(
            url, stream=True, headers=REQUESTS_HEADERS, timeout=timeout
        ) as response:
            content_type = response.headers.get('Content-Type', '')
            if response.status_code != 200 or 'image' not in content_type:
                print(
                    f"Skipping URL {url}: status {response.status_code}, "
                    f"Content-Type {content_type!r}"
                )
                return None
            try:
                # response.content applies any Content-Encoding (gzip/deflate),
                # which reading response.raw directly does not. convert()
                # forces the full decode while the bytes are still in memory.
                return Image.open(BytesIO(response.content)).convert("RGB")
            except UnidentifiedImageError:
                print(f"Unidentified image file from URL: {url}")
                return None
    except Exception as e:
        # Deliberate best-effort: one bad URL must not abort the whole run.
        print(f"Error downloading image: {e}")
        return None
def process_batch(batch_products, batch_images, results):
    """Encode one batch of images and append one result entry per product.

    Args:
        batch_products: Product dicts (each with "product_id" and "url"),
            aligned index-for-index with ``batch_images``.
        batch_images: Images passed to ``encoder.encode_images``.
        results: Output list, mutated in place. Each product contributes
            exactly one entry: a success entry holding the L2-normalised
            embedding and a 5-value preview, or an error entry.

    Returns:
        None. Encoder failures are recorded as error entries, not raised.
    """
    try:
        # Generate embeddings
        embeddings = list(encoder.encode_images(batch_images))
        if len(embeddings) != len(batch_products):
            # zip() would silently drop products on a count mismatch.
            raise ValueError(
                f"Encoder returned {len(embeddings)} embeddings "
                f"for {len(batch_products)} images"
            )

        # Collect into a local list first so that a failure part-way through
        # cannot leave a product with both a success and an error entry.
        batch_results = []
        for product, embedding in zip(batch_products, embeddings):
            norm = np.linalg.norm(embedding)
            if not np.isfinite(norm) or norm == 0:
                # Dividing by such a norm would yield NaN/inf values.
                batch_results.append({
                    "product_id": product["product_id"],
                    "image_url": product["url"],
                    "error": "Embedding has zero or non-finite norm"
                })
                continue

            # Normalize embedding
            embedding_normalized = embedding / norm
            batch_results.append({
                "product_id": product["product_id"],
                "image_url": product["url"],
                "embedding": embedding_normalized.tolist(),
                "embedding_preview": embedding_normalized[:5].tolist(),
                "success": True
            })
    except Exception as e:
        # The whole batch failed: report the same error for every product.
        batch_results = [
            {
                "product_id": product["product_id"],
                "image_url": product["url"],
                "error": str(e)
            }
            for product in batch_products
        ]
    results.extend(batch_results)
def batch_process_images(json_input: str) -> dict:
    """Generate embeddings for the products described by a JSON request.

    The request is a JSON object with a "products" list (objects carrying
    "product_id" and "url") and an optional boolean "upload_to_pinecone".
    Images are downloaded one by one and encoded in batches of BATCH_SIZE.

    Args:
        json_input: The JSON request as text.

    Returns:
        ``{"processing_results": [...]}`` with one entry per product, plus a
        "pinecone_upload" key when an upload was requested; or
        ``{"error": message}`` when the request as a whole is unusable.
        Nothing is raised to the caller.
    """
    try:
        # Parse JSON input
        data = json.loads(json_input)
        if not isinstance(data, dict):
            return {"error": "JSON input must be an object with a 'products' list."}

        products = data.get("products", [])
        upload_to_pinecone = data.get("upload_to_pinecone", False)
        if not products:
            return {"error": "No products provided in JSON input."}
        if not isinstance(products, list):
            return {"error": "'products' must be a list of objects."}

        results = []
        batch_products, batch_images = [], []
        for product in products:
            # Validate the entry up front: a malformed product is reported on
            # its own instead of raising inside an error handler and aborting
            # the whole request.
            if not isinstance(product, dict):
                results.append({
                    "product_id": None,
                    "image_url": None,
                    "error": "Product entry must be a JSON object"
                })
                continue
            product_id = product.get("product_id")
            url = product.get("url")
            missing = [
                key for key in ("product_id", "url") if product.get(key) is None
            ]
            if missing:
                results.append({
                    "product_id": product_id,
                    "image_url": url,
                    "error": f"Missing required field(s): {', '.join(missing)}"
                })
                continue

            try:
                # Download image
                image = download_image_as_pil(url)
                if image is None:
                    results.append({
                        "product_id": product_id,
                        "image_url": url,
                        "error": "Failed to download image"
                    })
                    continue

                batch_products.append(product)
                batch_images.append(image)

                # Process batch when reaching batch size
                if len(batch_images) >= BATCH_SIZE:
                    process_batch(batch_products, batch_images, results)
                    batch_products, batch_images = [], []
            except Exception as e:
                results.append({
                    "product_id": product_id,
                    "image_url": url,
                    "error": str(e)
                })

        # Process remaining images in the last batch
        if batch_images:
            process_batch(batch_products, batch_images, results)

        # Upload to Pinecone if requested
        if upload_to_pinecone:
            upload_result = upload_vector_to_pinecone(results)
            return {
                "processing_results": results,
                "pinecone_upload": upload_result
            }
        return {"processing_results": results}
    except json.JSONDecodeError:
        return {"error": "Invalid JSON format"}
    except Exception as e:
        # Top-level boundary for the UI: report the failure, never raise.
        return {"error": f"Unexpected error: {e}"}
def upload_vector_to_pinecone(processed_results, batch_size=100):
    """Upload the successfully generated embeddings to Pinecone.

    Args:
        processed_results: Result entries as produced by the batch
            processing step. Entries containing an "error" key or lacking
            an "embedding" key are skipped.
        batch_size: Maximum number of vectors sent per upsert request, so a
            large run is not sent as one oversized request.

    Returns:
        ``{"uploaded_count": n}`` where n is the number of vectors upserted
        into the PINECONE_NAMESPACE namespace.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Whatever ``index.upsert`` raises is propagated unchanged.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    vectors_to_upsert = [
        {
            # Pinecone record ids are strings; ids parsed from JSON may be
            # numbers, so normalise them here.
            'id': str(result['product_id']),
            'values': result['embedding'],
            'metadata': {
                'image_url': result['image_url']
            }
        }
        for result in processed_results
        if 'error' not in result and 'embedding' in result
    ]

    # Send in chunks; nothing is sent when there is nothing to upload.
    for start in range(0, len(vectors_to_upsert), batch_size):
        index.upsert(
            vectors=vectors_to_upsert[start:start + batch_size],
            namespace=PINECONE_NAMESPACE
        )
    return {"uploaded_count": len(vectors_to_upsert)}
# Example JSON input
# Serialized with json.dumps() and used as the initial contents of the
# "Input JSON" editor: one product, with the Pinecone upload switched off.
EXAMPLE_INPUT = {
    "products": [
        {
            "product_id": "1",
            "url": "https://cdn.shopify.com/s/files/1/0522/2239/4534/files/CT21355-22_1024x1024.webp"
        }
    ],
    "upload_to_pinecone": False
}
# Gradio Interface
# Wires batch_process_images to a JSON code editor (input) and a JSON viewer
# (output). The `article` text below is Markdown shown with the interface, so
# its content is runtime text and is kept flush-left exactly as written.
iface = gr.Interface(
    fn=batch_process_images,
    inputs=gr.Code(
        label="Input JSON",
        language="json",
        value=json.dumps(EXAMPLE_INPUT, indent=4)  # pre-fill the editor with the example request (passed as `value`, not `default`)
    ),
    outputs=gr.JSON(label="Processing Results"),
    title="Fashion CLIP Embedding Generator",
    description="Provide JSON input with product IDs, URLs, and Pinecone upload preference to generate embeddings.",
    article="""
### Input JSON Format:
```json
{
"products": [
{
"product_id": "string",
"url": "string"
}
],
"upload_to_pinecone": boolean
}
```
### Features:
- Batch processing of multiple images
- Custom product ID support
- Embedding generation using Fashion CLIP
- Optional Pinecone database integration
- Error handling and detailed results
Make sure to set up your environment variables in a .env file:
- PINECONE_API_KEY
- PINECONE_INDEX_NAME
- PINECONE_NAMESPACE
"""
)
# Launch Gradio App
# Start the web UI only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()