# blip2-image-to-text / handler.py
# Page header captured with this file: thoth-AI, commit "Updated handler.py" (f49e3ec), 1.85 kB.
from typing import Dict, List, Any
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
from io import BytesIO
import torch
import os
# Chosen once at import time: the CUDA device when PyTorch reports one is
# available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
class EndpointHandler:
    """Custom inference handler that captions images with BLIP-2.

    The processor and model are loaded once in ``__init__``; each call to the
    instance serves one captioning request.
    """

    # Checkpoint used for both the processor and the model.
    MODEL_ID = "Salesforce/blip2-opt-2.7b"

    def __init__(self, path=""):
        """Load the BLIP-2 processor and model.

        Args:
            path: Accepted for interface compatibility but not used; the
                checkpoint is always loaded from ``MODEL_ID``.
        """
        self.processor = Blip2Processor.from_pretrained(self.MODEL_ID)
        # device_map="auto" leaves weight placement to the loader, so the
        # model is not moved again afterwards: a hard-coded .to("cuda")
        # raises on hosts without a GPU.
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            self.MODEL_ID, device_map="auto"
        )
        self.model.eval()

    def __call__(self, data: Any) -> Dict[str, Any]:
        """Generate a caption for the image(s) in the request.

        Args:
            data: Request payload. ``data["inputs"]`` holds the image(s)
                handed to the processor; when that key is missing the whole
                payload is used as the input. ``data["parameters"]`` is an
                optional dict of keyword arguments forwarded to
                ``model.generate``. Both keys are popped from ``data``.

        Returns:
            A dict ``{"captions": <str>}`` whose value is the decoded text of
            the first generated sequence, e.g.
            ``{"captions": "A hugging face at the office"}``.
        """
        images = data.pop("inputs", data)
        # A missing key or an explicit null both mean "no extra options".
        generate_kwargs = data.pop("parameters", None) or {}

        # Preprocess once and put the tensors on the device the model lives
        # on, instead of assuming a GPU is present.
        model_inputs = self.processor(images=images, return_tensors="pt").to(
            self.model.device
        )
        generated = self.model.generate(**model_inputs, **generate_kwargs)

        # Only the first sequence is decoded, so the value is a single string.
        caption = self.processor.decode(generated[0], skip_special_tokens=True)
        return {"captions": caption}