"""Custom Hugging Face Inference Endpoints handler for BLIP-2 image captioning."""

import base64
from io import BytesIO
from typing import Any, Dict

import torch
from PIL import Image
from transformers import Blip2ForConditionalGeneration, Blip2Processor


class EndpointHandler:
    def __init__(self, path: str = ""):
        # Load the BLIP-2 processor and model once at startup. fp16 weights
        # plus device_map="auto" place the model on the available GPU(s).
        print("Loading BLIP-2 model...")
        self.processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b",
            torch_dtype=torch.float16,
            device_map="auto",
        )
        print("BLIP-2 model loaded.")

    def __call__(self, data: Any) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`):
                The input data and the parameters for the inference, e.g.
                {"inputs": <base64-encoded image or PIL.Image>, "prompt": "..."}.
        Return:
            A :obj:`dict` like {"captions": "A hugging face at the office"},
            where "captions" is the generated caption string.
        """
        img_data = data.pop("inputs", data)
        prompt = data.pop("prompt", "")

        # Accept either a PIL image directly or a base64-encoded image payload.
        if isinstance(img_data, Image.Image):
            raw_image = img_data
        else:
            raw_image = Image.open(BytesIO(base64.b64decode(img_data)))

        # Preprocess the image (and optional prompt), then generate in fp16 on
        # the device the model was placed on.
        inputs = self.processor(raw_image, prompt, return_tensors="pt").to(
            self.model.device, torch.float16
        )
        out = self.model.generate(**inputs)
        captions = self.processor.decode(out[0], skip_special_tokens=True)

        return {"captions": captions}
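

# A minimal local smoke test, as a hedged sketch: it base64-encodes an image
# from disk and invokes the handler the way the endpoint runtime would. The
# file name "test.jpg" is a placeholder assumption, not part of the handler.
if __name__ == "__main__":
    handler = EndpointHandler()
    with open("test.jpg", "rb") as f:  # hypothetical local test image
        encoded = base64.b64encode(f.read()).decode("utf-8")
    result = handler({"inputs": encoded, "prompt": "a photo of"})
    print(result)  # e.g. {"captions": "..."}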