from typing import Dict, List, Any

import sys
sys.path.append('./')

from videollama2 import model_init, mm_infer
from videollama2.utils import disable_torch_init


class EndpointHandler:
    def __init__(self, path: str = ""):
        """
        Initialize the handler by loading the model and any other necessary components.

        Args:
            path (str): The path to the model or other necessary files.
        """
        disable_torch_init()
        self.model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B'
        self.model, self.processor, self.tokenizer = model_init(self.model_path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Handle inference requests.

        Args:
            data (Dict[str, Any]): The input data for inference. Expected keys:
                - 'modal' (str): 'video' or 'image'
                - 'modal_path' (str): Path to the video or image file
                - 'instruct' (str): The instruction/query to process

        Returns:
            List[Dict[str, Any]]: The output of the inference.
        """
        modal = data.get("modal", "video")
        modal_path = data.get("modal_path", "")
        instruct = data.get("instruct", "")

        if not modal_path or not instruct:
            raise ValueError("Both 'modal_path' and 'instruct' must be provided in the input data.")

        # Preprocess the video/image with the modality-specific processor, then run inference
        output = mm_infer(
            self.processor[modal](modal_path),
            instruct,
            model=self.model,
            tokenizer=self.tokenizer,
            do_sample=False,
            modal=modal
        )

        return [{"output": output}]


# Earlier pipeline-based variant, kept for reference:
#
# from transformers import pipeline
#
# class EndpointHandler:
#     def __init__(self, path: str = ""):
#         """
#         Initialize the handler by setting up the environment and loading the model.
#         """
#         # Use a pipeline as a high-level helper to download and load the model
#         self.pipe = pipeline("visual-question-answering", model="DAMO-NLP-SG/VideoLLaMA2-8x7B")
#         print("Model downloaded and pipeline created successfully.")
#
#     def __call__(self, data):
#         """
#         Handle inference requests.
#
#         Args:
#             data (dict): Input data containing 'image' and 'question'.
#
#         Returns:
#             dict: The output from the model.
#         """
#         image = data.get("image")
#         question = data.get("question")
#
#         if not image or not question:
#             raise ValueError("Both 'image' and 'question' must be provided in the input data.")
#
#         # Use the pipeline to perform visual question answering
#         output = self.pipe(image=image, question=question)
#
#         return output
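

# Minimal local smoke test for EndpointHandler above -- a sketch only. The serving
# runtime normally instantiates the handler and passes request payloads itself;
# this block just illustrates the expected input dict ('modal', 'modal_path',
# 'instruct'). The video path below is a hypothetical placeholder: point it at a
# real clip before running.
if __name__ == "__main__":
    handler = EndpointHandler()
    example_request = {
        "modal": "video",
        "modal_path": "assets/sample_video.mp4",  # hypothetical path, replace with a real file
        "instruct": "Describe what happens in this video.",
    }
    result = handler(example_request)
    print(result[0]["output"])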