from typing import Any, Dict

import base64
from io import BytesIO

import torch
from PIL import Image
from transformers import Blip2ForConditionalGeneration, Blip2Processor

class EndpointHandler:
    def __init__(self, path=""):
        # Earlier revisions loaded Salesforce/blip-image-captioning-base with
        # BlipProcessor/BlipForConditionalGeneration; this version loads BLIP-2 (OPT-2.7B).
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.dtype = torch.float16 if self.device == "cuda" else torch.float32

        self.processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b",
            torch_dtype=self.dtype,
            device_map="auto",
        )
        self.model.eval()

    def __call__(self, data: Any) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`):
                Includes the input data and the parameters for the inference.
                "inputs" is either a PIL image or a base64-encoded image string;
                "prompt" is an optional text prompt for conditional captioning.
        Return:
            A :obj:`dict` containing one list, e.g. {"captions": ["A hugging face at the office"]},
            where each entry is a string corresponding to a generated caption.
        """
        img_data = data.pop("inputs", data) if isinstance(data, dict) else data
        prompt = data.pop("prompt", "") if isinstance(data, dict) else ""

        if isinstance(img_data, Image.Image):
            raw_image = img_data
        else:
            # Accept a single base64-encoded image, or a one-element list of them.
            if isinstance(img_data, (list, tuple)):
                img_data = img_data[0]
            raw_image = Image.open(BytesIO(base64.b64decode(img_data))).convert("RGB")

        # Preprocess on the model's device/dtype, generate, and decode the caption.
        inputs = self.processor(raw_image, prompt, return_tensors="pt").to(self.model.device, self.dtype)
        with torch.no_grad():
            out = self.model.generate(**inputs)
        caption = self.processor.decode(out[0], skip_special_tokens=True).strip()
        return {"captions": [caption]}
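

# Minimal local smoke test (a sketch, not part of the Inference Endpoints contract):
# it assumes a local image file "demo.jpg" exists and calls the handler the same way
# the endpoint runtime would, i.e. with a dict carrying a base64-encoded image.
if __name__ == "__main__":
    handler = EndpointHandler()
    with open("demo.jpg", "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    payload = {"inputs": encoded, "prompt": "a photo of"}
    print(handler(payload))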