# Spaces: Running  (hosting-page status banner captured with the source; not part of the program)
import os | |
import requests | |
from PIL import Image, UnidentifiedImageError | |
import numpy as np | |
import gradio as gr | |
from encoder import FashionCLIPEncoder | |
from pinecone import Pinecone | |
from dotenv import load_dotenv | |
import json | |
# Read settings from a .env file (if any) into the process environment.
load_dotenv()

# Pinecone connection settings; each is None when the variable is unset.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")
PINECONE_NAMESPACE = os.environ.get("PINECONE_NAMESPACE")

# Browser-like User-Agent sent with every image download request.
REQUESTS_HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

# Number of images handed to the encoder in a single call.
BATCH_SIZE = 30

# Shared service handles, created once at import time.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX_NAME)
encoder = FashionCLIPEncoder()
def download_image_as_pil(url: str, timeout: int = 10) -> "Image.Image | None":
    """Download *url* and return it as an RGB PIL image, or ``None`` on failure.

    Args:
        url: Address of the image to fetch.
        timeout: Value passed to ``requests.get`` as its ``timeout``.

    Returns:
        The decoded image converted to RGB, or ``None`` when the request
        raises, the status code is not 200, the ``Content-Type`` header does
        not contain ``"image"``, or the payload is not a recognisable image.
        This function never raises; failures are printed and reported as
        ``None``.
    """
    from io import BytesIO  # local import keeps this block self-contained

    try:
        # The context manager closes the streamed response on every path, so
        # the underlying connection is always released.
        with requests.get(
            url, stream=True, headers=REQUESTS_HEADERS, timeout=timeout
        ) as response:
            content_type = response.headers.get('Content-Type', '')
            if response.status_code != 200 or 'image' not in content_type:
                # stream=True means the body has not been fetched yet, so a
                # rejected response costs only the headers.
                print(
                    f"Skipping non-image response from URL: {url} "
                    f"(status {response.status_code}, content type {content_type!r})"
                )
                return None
            # response.content applies Content-Encoding decoding (gzip/deflate);
            # reading response.raw directly hands PIL the still-encoded bytes.
            payload = response.content

        try:
            return Image.open(BytesIO(payload)).convert("RGB")
        except UnidentifiedImageError:
            print(f"Unidentified image file from URL: {url}")
            return None
    except Exception as e:
        # Best-effort download: any network or decoding error becomes None.
        print(f"Error downloading image: {e}")
        return None
def _batch_error_entry(product, message):
    """Build an error result for *product* without assuming its shape."""
    is_mapping = isinstance(product, dict)
    return {
        "product_id": product.get("product_id") if is_mapping else None,
        "image_url": product.get("url") if is_mapping else None,
        "error": message,
    }


def process_batch(batch_products, batch_images, results):
    """Encode one batch of images and append one result entry per product.

    Args:
        batch_products: Product dicts aligned with ``batch_images``; the
            ``"product_id"`` and ``"url"`` keys are read from each.
        batch_images: Images passed to ``encoder.encode_images`` in one call.
        results: List that is extended in place. Successful entries carry
            ``"embedding"`` (the L2-normalised vector as a list),
            ``"embedding_preview"`` (its first five values) and
            ``"success": True``; failed entries carry ``"error"`` instead.

    Each product yields exactly one entry. If encoding the batch fails, every
    product in it gets an error entry. If only one product fails afterwards,
    only that product is marked as failed.
    """
    try:
        # Materialise inside the try so a lazy or failing encoder is caught
        # here, before any entry has been appended.
        embeddings = list(encoder.encode_images(batch_images))
        if len(embeddings) != len(batch_products):
            # zip() would silently drop the unmatched products otherwise.
            raise ValueError(
                f"Encoder returned {len(embeddings)} embeddings "
                f"for {len(batch_products)} images"
            )
    except Exception as e:
        results.extend(_batch_error_entry(product, str(e)) for product in batch_products)
        return

    for product, embedding in zip(batch_products, embeddings):
        try:
            norm = np.linalg.norm(embedding)
            if not np.isfinite(norm) or norm == 0:
                # Dividing by such a norm would emit NaN/inf vectors.
                raise ValueError("Embedding has zero or non-finite norm")
            embedding_normalized = embedding / norm
            entry = {
                "product_id": product["product_id"],
                "image_url": product["url"],
                "embedding": embedding_normalized.tolist(),
                "embedding_preview": embedding_normalized[:5].tolist(),
                "success": True,
            }
        except Exception as e:
            entry = _batch_error_entry(product, str(e))
        results.append(entry)
def batch_process_images(json_input: str):
    """Generate embeddings for every product listed in a JSON request.

    Args:
        json_input: JSON text of the form
            ``{"products": [{"product_id": ..., "url": ...}],
            "upload_to_pinecone": bool}``.

    Returns:
        ``{"processing_results": [...]}`` with one entry per product, plus
        ``"pinecone_upload"`` when an upload was requested. When the request
        itself is unusable, ``{"error": message}`` is returned instead.
        This function does not raise; every failure is reported in the
        returned dict.
    """
    try:
        data = json.loads(json_input)
        if not isinstance(data, dict):
            return {"error": "JSON input must be an object with a 'products' list."}

        products = data.get("products", [])
        upload_to_pinecone = data.get("upload_to_pinecone", False)
        if not products:
            return {"error": "No products provided in JSON input."}
        if not isinstance(products, list):
            return {"error": "'products' must be a list of product objects."}

        results = []
        batch_products, batch_images = [], []
        for product in products:
            # Validate up front so one malformed entry becomes a per-product
            # error instead of aborting the whole request.
            is_mapping = isinstance(product, dict)
            if not is_mapping or "product_id" not in product or "url" not in product:
                results.append({
                    "product_id": product.get("product_id") if is_mapping else None,
                    "image_url": product.get("url") if is_mapping else None,
                    "error": "Product entry must be an object with 'product_id' and 'url'",
                })
                continue

            product_id, url = product["product_id"], product["url"]
            try:
                image = download_image_as_pil(url)
                if image is None:
                    results.append({
                        "product_id": product_id,
                        "image_url": url,
                        "error": "Failed to download image",
                    })
                    continue

                batch_products.append(product)
                batch_images.append(image)

                # Flush as soon as a full batch has accumulated.
                if len(batch_images) == BATCH_SIZE:
                    process_batch(batch_products, batch_images, results)
                    batch_products, batch_images = [], []
            except Exception as e:
                results.append({
                    "product_id": product_id,
                    "image_url": url,
                    "error": str(e),
                })

        # Flush the final, partially filled batch.
        if batch_images:
            process_batch(batch_products, batch_images, results)

        if upload_to_pinecone:
            try:
                upload_result = upload_vector_to_pinecone(results)
            except Exception as e:
                # Keep the computed embeddings even if the upload fails.
                upload_result = {"error": str(e)}
            return {
                "processing_results": results,
                "pinecone_upload": upload_result,
            }

        return {"processing_results": results}
    except json.JSONDecodeError:
        return {"error": "Invalid JSON format"}
    except Exception as e:
        return {"error": f"Unexpected error: {str(e)}"}
def upload_vector_to_pinecone(processed_results, batch_size: int = 100):
    """Upload embeddings to Pinecone.

    Args:
        processed_results: Result dicts as produced by the batch processing
            step. Entries that contain ``"error"`` or lack ``"embedding"``
            are skipped.
        batch_size: Maximum number of vectors sent per ``index.upsert`` call.

    Returns:
        ``{"uploaded_count": n}`` where ``n`` is the number of vectors sent.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    vectors_to_upsert = [
        {
            # JSON input may carry numeric ids; send them as strings.
            'id': str(result['product_id']),
            'values': result['embedding'],
            'metadata': {'image_url': result['image_url']},
        }
        for result in processed_results
        if 'error' not in result and 'embedding' in result
    ]

    # Send bounded chunks rather than one arbitrarily large request.
    for start in range(0, len(vectors_to_upsert), batch_size):
        index.upsert(
            vectors=vectors_to_upsert[start:start + batch_size],
            namespace=PINECONE_NAMESPACE,
        )

    return {"uploaded_count": len(vectors_to_upsert)}
# Sample request payload; serialised as the default content of the input editor.
EXAMPLE_INPUT = dict(
    products=[
        dict(
            product_id="1",
            url="https://cdn.shopify.com/s/files/1/0522/2239/4534/files/CT21355-22_1024x1024.webp",
        ),
    ],
    upload_to_pinecone=False,
)
# Markdown rendered with the interface: request schema, features, env setup.
_INTERFACE_ARTICLE = """
### Input JSON Format:
```json
{
"products": [
{
"product_id": "string",
"url": "string"
}
],
"upload_to_pinecone": boolean
}
```
### Features:
- Batch processing of multiple images
- Custom product ID support
- Embedding generation using Fashion CLIP
- Optional Pinecone database integration
- Error handling and detailed results
Make sure to set up your environment variables in a .env file:
- PINECONE_API_KEY
- PINECONE_INDEX_NAME
- PINECONE_NAMESPACE
"""

# JSON editor pre-filled with the example request (passed through `value`).
_json_request_editor = gr.Code(
    label="Input JSON",
    language="json",
    value=json.dumps(EXAMPLE_INPUT, indent=4),
)

# Viewer for the dict returned by batch_process_images.
_json_result_view = gr.JSON(label="Processing Results")

# Gradio UI wiring: JSON text in, processing report out.
iface = gr.Interface(
    fn=batch_process_images,
    inputs=_json_request_editor,
    outputs=_json_result_view,
    title="Fashion CLIP Embedding Generator",
    description="Provide JSON input with product IDs, URLs, and Pinecone upload preference to generate embeddings.",
    article=_INTERFACE_ARTICLE,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()