amaye15 committed
Commit 64262c3 · 1 Parent(s): dbabaf1

Logging added

Files changed (1)
  1. handler.py +150 -72
handler.py CHANGED
@@ -7,13 +7,13 @@
 
 # class EndpointHandler:
 #     """
-#     A handler class for processing image data, generating embeddings using a specified model and processor.
+#     A handler class for processing image and text data, generating embeddings using a specified model and processor.
 
 #     Attributes:
 #         model: The pre-trained model used for generating embeddings.
-#         processor: The pre-trained processor used to process images before model inference.
+#         processor: The pre-trained processor used to process images and text before model inference.
 #         device: The device (CPU or CUDA) used to run model inference.
-#         default_batch_size: The default batch size for processing images in batches.
+#         default_batch_size: The default batch size for processing images and text in batches.
 #     """
 
 #     def __init__(self, path: str = "", default_batch_size: int = 4):
@@ -22,13 +22,16 @@
 
 #         Args:
 #             path (str): Path to the pre-trained model and processor.
-#             default_batch_size (int): Default batch size for image processing.
+#             default_batch_size (int): Default batch size for processing images and text data.
 #         """
 #         from colpali_engine.models import ColQwen2, ColQwen2Processor
 
 #         self.model = ColQwen2.from_pretrained(
 #             path,
 #             torch_dtype=torch.bfloat16,
+#             device_map=(
+#                 "cuda:0" if torch.cuda.is_available() else "cpu"
+#             ),  # Set device map based on availability
 #         ).eval()
 #         self.processor = ColQwen2Processor.from_pretrained(path)
 
@@ -36,7 +39,7 @@
 #         self.model.to(self.device)
 #         self.default_batch_size = default_batch_size
 
-#     def _process_batch(self, images: List[Image.Image]) -> List[List[float]]:
+#     def _process_image_batch(self, images: List[Image.Image]) -> List[List[float]]:
 #         """
 #         Processes a batch of images and generates embeddings.
 
@@ -46,55 +49,97 @@
 #         Returns:
 #             List[List[float]]: List of embeddings for each image.
 #         """
-#         batch_images = self.processor.process_images(images)
-#         batch_images = {k: v.to(self.device) for k, v in batch_images.items()}
+#         batch_images = self.processor.process_images(images).to(self.device)
 
 #         with torch.no_grad():
 #             image_embeddings = self.model(**batch_images)
 
 #         return image_embeddings.cpu().tolist()
 
+#     def _process_text_batch(self, texts: List[str]) -> List[List[float]]:
+#         """
+#         Processes a batch of text queries and generates embeddings.
+
+#         Args:
+#             texts (List[str]): List of text queries to process.
+
+#         Returns:
+#             List[List[float]]: List of embeddings for each text query.
+#         """
+#         batch_queries = self.processor.process_queries(texts).to(self.device)
+
+#         with torch.no_grad():
+#             query_embeddings = self.model(**batch_queries)
+
+#         return query_embeddings.cpu().tolist()
+
 #     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
 #         """
-#         Processes input data containing base64-encoded images, decodes them, and generates embeddings.
+#         Processes input data containing base64-encoded images and text queries, decodes them, and generates embeddings.
 
 #         Args:
-#             data (Dict[str, Any]): Dictionary containing input images and optional batch size.
+#             data (Dict[str, Any]): Dictionary containing input images, text queries, and optional batch size.
 
 #         Returns:
-#             Dict[str, Any]: Dictionary containing generated embeddings or error messages.
+#             Dict[str, Any]: Dictionary containing generated embeddings for images and text or error messages.
 #         """
-#         images_data = data.get("inputs", [])
+#         images_data = data.get("image", [])
+#         text_data = data.get("text", [])
 #         batch_size = data.get("batch_size", self.default_batch_size)
 
-#         if not images_data:
-#             return {"error": "No images provided in 'inputs'."}
-
+#         # Decode and process images
 #         images = []
-#         for img_data in images_data:
-#             if isinstance(img_data, str):
-#                 try:
-#                     image_bytes = base64.b64decode(img_data)
-#                     image = Image.open(BytesIO(image_bytes)).convert("RGB")
-#                     images.append(image)
-#                 except Exception as e:
-#                     return {"error": f"Invalid image data: {e}"}
-#             else:
-#                 return {"error": "Images should be base64-encoded strings."}
-
-#         embeddings = []
+#         if images_data:
+#             for img_data in images_data:
+#                 if isinstance(img_data, str):
+#                     try:
+#                         image_bytes = base64.b64decode(img_data)
+#                         image = Image.open(BytesIO(image_bytes)).convert("RGB")
+#                         images.append(image)
+#                     except Exception as e:
+#                         return {"error": f"Invalid image data: {e}"}
+#                 else:
+#                     return {"error": "Images should be base64-encoded strings."}
+
+#         image_embeddings = []
 #         for i in range(0, len(images), batch_size):
 #             batch_images = images[i : i + batch_size]
-#             batch_embeddings = self._process_batch(batch_images)
-#             embeddings.extend(batch_embeddings)
+#             batch_embeddings = self._process_image_batch(batch_images)
+#             image_embeddings.extend(batch_embeddings)
+
+#         # Process text data
+#         text_embeddings = []
+#         if text_data:
+#             for i in range(0, len(text_data), batch_size):
+#                 batch_texts = text_data[i : i + batch_size]
+#                 batch_text_embeddings = self._process_text_batch(batch_texts)
+#                 text_embeddings.extend(batch_text_embeddings)
+
+#         # Compute similarity scores if both image and text embeddings are available
+#         scores = []
+#         if image_embeddings and text_embeddings:
+#             # Convert embeddings to tensors for scoring
+#             image_embeddings_tensor = torch.tensor(image_embeddings).to(self.device)
+#             text_embeddings_tensor = torch.tensor(text_embeddings).to(self.device)
+
+#             with torch.no_grad():
+#                 scores = (
+#                     self.processor.score_multi_vector(
+#                         text_embeddings_tensor, image_embeddings_tensor
+#                     )
+#                     .cpu()
+#                     .tolist()
+#                 )
+
+#         return {"image": image_embeddings, "text": text_embeddings, "scores": scores}
 
-#         return {"embeddings": embeddings}
 
 import torch
 from typing import Dict, Any, List
 from PIL import Image
 import base64
 from io import BytesIO
+import logging
 
 
 class EndpointHandler:
@@ -116,20 +161,27 @@ class EndpointHandler:
             path (str): Path to the pre-trained model and processor.
             default_batch_size (int): Default batch size for processing images and text data.
         """
-        from colpali_engine.models import ColQwen2, ColQwen2Processor
+        # Initialize logging
+        logging.basicConfig(level=logging.INFO)
+        self.logger = logging.getLogger(__name__)
 
-        self.model = ColQwen2.from_pretrained(
-            path,
-            torch_dtype=torch.bfloat16,
-            device_map=(
-                "cuda:0" if torch.cuda.is_available() else "cpu"
-            ),  # Set device map based on availability
-        ).eval()
-        self.processor = ColQwen2Processor.from_pretrained(path)
+        from colpali_engine.models import ColQwen2, ColQwen2Processor
 
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model.to(self.device)
-        self.default_batch_size = default_batch_size
+        self.logger.info("Initializing model and processor.")
+        try:
+            self.model = ColQwen2.from_pretrained(
+                path,
+                torch_dtype=torch.bfloat16,
+                device_map=("cuda:0" if torch.cuda.is_available() else "cpu"),
+            ).eval()
+            self.processor = ColQwen2Processor.from_pretrained(path)
+            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            self.model.to(self.device)
+            self.default_batch_size = default_batch_size
+            self.logger.info("Initialization complete.")
+        except Exception as e:
+            self.logger.error(f"Failed to initialize model or processor: {e}")
+            raise
 
     def _process_image_batch(self, images: List[Image.Image]) -> List[List[float]]:
         """
@@ -141,12 +193,16 @@ class EndpointHandler:
         Returns:
            List[List[float]]: List of embeddings for each image.
         """
-        batch_images = self.processor.process_images(images).to(self.device)
-
-        with torch.no_grad():
-            image_embeddings = self.model(**batch_images)
-
-        return image_embeddings.cpu().tolist()
+        self.logger.debug(f"Processing batch of {len(images)} images.")
+        try:
+            batch_images = self.processor.process_images(images).to(self.device)
+            with torch.no_grad():
+                image_embeddings = self.model(**batch_images)
+            self.logger.debug("Image batch processing complete.")
+            return image_embeddings.cpu().tolist()
+        except Exception as e:
+            self.logger.error(f"Error processing image batch: {e}")
+            raise
 
    def _process_text_batch(self, texts: List[str]) -> List[List[float]]:
        """
@@ -158,12 +214,16 @@ class EndpointHandler:
        Returns:
            List[List[float]]: List of embeddings for each text query.
        """
-        batch_queries = self.processor.process_queries(texts).to(self.device)
-
-        with torch.no_grad():
-            query_embeddings = self.model(**batch_queries)
-
-        return query_embeddings.cpu().tolist()
+        self.logger.debug(f"Processing batch of {len(texts)} text queries.")
+        try:
+            batch_queries = self.processor.process_queries(texts).to(self.device)
+            with torch.no_grad():
+                query_embeddings = self.model(**batch_queries)
+            self.logger.debug("Text batch processing complete.")
+            return query_embeddings.cpu().tolist()
+        except Exception as e:
+            self.logger.error(f"Error processing text batch: {e}")
+            raise
 
    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
@@ -182,6 +242,7 @@ class EndpointHandler:
        # Decode and process images
        images = []
        if images_data:
+            self.logger.info("Decoding images from base64.")
            for img_data in images_data:
                if isinstance(img_data, str):
                    try:
@@ -189,38 +250,55 @@ class EndpointHandler:
                        image = Image.open(BytesIO(image_bytes)).convert("RGB")
                        images.append(image)
                    except Exception as e:
+                        self.logger.error(f"Invalid image data: {e}")
                        return {"error": f"Invalid image data: {e}"}
                else:
+                    self.logger.error("Images should be base64-encoded strings.")
                    return {"error": "Images should be base64-encoded strings."}
 
        image_embeddings = []
-        for i in range(0, len(images), batch_size):
-            batch_images = images[i : i + batch_size]
-            batch_embeddings = self._process_image_batch(batch_images)
-            image_embeddings.extend(batch_embeddings)
+        if images:
+            self.logger.info("Processing image embeddings.")
+            try:
+                for i in range(0, len(images), batch_size):
+                    batch_images = images[i : i + batch_size]
+                    batch_embeddings = self._process_image_batch(batch_images)
+                    image_embeddings.extend(batch_embeddings)
+            except Exception as e:
+                self.logger.error(f"Error generating image embeddings: {e}")
+                return {"error": f"Error generating image embeddings: {e}"}
 
        # Process text data
        text_embeddings = []
        if text_data:
-            for i in range(0, len(text_data), batch_size):
-                batch_texts = text_data[i : i + batch_size]
-                batch_text_embeddings = self._process_text_batch(batch_texts)
-                text_embeddings.extend(batch_text_embeddings)
+            self.logger.info("Processing text embeddings.")
+            try:
+                for i in range(0, len(text_data), batch_size):
+                    batch_texts = text_data[i : i + batch_size]
+                    batch_text_embeddings = self._process_text_batch(batch_texts)
+                    text_embeddings.extend(batch_text_embeddings)
+            except Exception as e:
+                self.logger.error(f"Error generating text embeddings: {e}")
+                return {"error": f"Error generating text embeddings: {e}"}
 
        # Compute similarity scores if both image and text embeddings are available
        scores = []
        if image_embeddings and text_embeddings:
-            # Convert embeddings to tensors for scoring
-            image_embeddings_tensor = torch.tensor(image_embeddings).to(self.device)
-            text_embeddings_tensor = torch.tensor(text_embeddings).to(self.device)
-
-            with torch.no_grad():
-                scores = (
-                    self.processor.score_multi_vector(
-                        text_embeddings_tensor, image_embeddings_tensor
+            self.logger.info("Computing similarity scores.")
+            try:
+                image_embeddings_tensor = torch.tensor(image_embeddings).to(self.device)
+                text_embeddings_tensor = torch.tensor(text_embeddings).to(self.device)
+                with torch.no_grad():
+                    scores = (
+                        self.processor.score_multi_vector(
+                            text_embeddings_tensor, image_embeddings_tensor
+                        )
+                        .cpu()
+                        .tolist()
                    )
-                    .cpu()
-                    .tolist()
-                )
+                self.logger.info("Similarity scoring complete.")
+            except Exception as e:
+                self.logger.error(f"Error computing similarity scores: {e}")
+                return {"error": f"Error computing similarity scores: {e}"}
 
        return {"image": image_embeddings, "text": text_embeddings, "scores": scores}