from typing import Any, Dict

import base64
from io import BytesIO

import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

class EndpointHandler:
    def __init__(self, path: str = ""):
        # Load the BLIP-2 processor and model. The weights are loaded in float16
        # and placed on the available GPU(s) via device_map="auto".
        print("Loading Salesforce/blip2-opt-2.7b ...")
        self.processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16, device_map="auto"
        )
        self.model.eval()
        print("Model loaded.")

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`):
                Includes the input data and the parameters for the inference.
                Expected keys:
                - "inputs": a PIL image or a base64-encoded image string.
                - "prompt": an optional text prompt used to condition the caption.
        Return:
            A :obj:`dict` like {"captions": "A hugging face at the office"} containing:
                - "captions": the generated caption string.
        """
        img_data = data.pop("inputs", data)
        prompt = data.pop("prompt", "")

        # Accept either a PIL image or a base64-encoded image string.
        if isinstance(img_data, Image.Image):
            raw_image = img_data
        else:
            raw_image = Image.open(BytesIO(base64.b64decode(img_data))).convert("RGB")

        # Preprocess the image (and optional prompt) and move the tensors to the GPU
        # in float16 to match the model weights.
        inputs = self.processor(raw_image, prompt, return_tensors="pt").to("cuda", torch.float16)

        with torch.no_grad():
            out = self.model.generate(**inputs)
        caption = self.processor.decode(out[0], skip_special_tokens=True)

        return {"captions": caption}
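

# A minimal local usage sketch, not part of the endpoint contract: it assumes a
# JPEG file named "example.jpg" in the working directory and a CUDA GPU, since the
# model is loaded in float16 and the handler moves inputs to "cuda".
if __name__ == "__main__":
    with open("example.jpg", "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")

    handler = EndpointHandler()
    result = handler({"inputs": encoded, "prompt": "a photo of"})
    print(result["captions"])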