morthens
/

qwen2-vl-2b-infer

Image-Text-to-Text

text-generation-inference

Inference Endpoints

Model card Files Files and versions Community

qwen2-vl-2b-infer / handler.py

morthens's picture

Create handler.py

9deea94 verified about 1 month ago

history blame contribute delete

2.82 kB

	from typing import Dict, Any
	from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
	import torch
	from PIL import Image
	import requests
	from io import BytesIO
	import json

	# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


	class EndpointHandler:
	    """Inference handler that answers an image + text prompt with Qwen2-VL.

	    The handler downloads the image named in the request, runs one
	    generation pass, and tries to parse the model's reply as JSON.
	    """

	    # Seconds to wait on the remote image host; without a timeout a stalled
	    # server would block the worker indefinitely.
	    IMAGE_FETCH_TIMEOUT = 10
	    # Upper bound on the number of tokens generated per request.
	    MAX_NEW_TOKENS = 128

	    def __init__(self, path: str = ""):
	        """
	        Load the processor and the model.

	        Args:
	            path (str): Location passed to both `from_pretrained` calls.
	        """
	        self.processor = AutoProcessor.from_pretrained(path)
	        # `device_map="auto"` already places the weights, so the model is not
	        # moved again with `.to(...)`: that is redundant and can conflict with
	        # a model whose modules were dispatched or offloaded at load time.
	        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
	            path,
	            torch_dtype="auto",
	            device_map="auto",
	        )

	    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
	        """
	        Run the model on one request and return its parsed reply.

	        Args:
	            data (Dict[str, Any]): Request payload. Reads `image_url` (the
	                image to download) and `text` (the user prompt); both
	                default to an empty string when absent.

	        Returns:
	            Dict[str, Any]: `{"prediction": ...}` when the reply is valid
	            JSON; `{"error": ...}` when the image cannot be loaded; or
	            `{"error": ..., "raw_output": ...}` when the reply is not JSON.
	        """
	        image_url = data.get("image_url", "")
	        text = data.get("text", "")

	        # Reject a missing/blank URL up front instead of relying on the HTTP
	        # client to fail; the error prefix matches the fetch-failure message.
	        if not isinstance(image_url, str) or not image_url.strip():
	            return {"error": "Failed to fetch or process image: no image_url provided"}

	        # Request boundary: any download/decode failure is reported to the
	        # caller as an error payload rather than raised.
	        try:
	            image = self._fetch_image(image_url)
	        except Exception as e:
	            return {"error": f"Failed to fetch or process image: {str(e)}"}

	        # The message must carry an image part as well as the text part so the
	        # chat template emits the placeholder that the processor pairs with
	        # the entry in `images=`; a text-only message leaves the image unused.
	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    {"type": "image"},
	                    {"type": "text", "text": text},
	                ],
	            }
	        ]
	        text_prompt = self.processor.apply_chat_template(
	            messages,
	            add_generation_prompt=True,
	        )

	        inputs = self.processor(
	            text=[text_prompt],
	            images=[image],
	            padding=True,
	            return_tensors="pt",
	        )

	        # Send the tensors to wherever the model was actually placed.
	        target_device = self.model.device
	        inputs = {key: value.to(target_device) for key, value in inputs.items()}

	        output_ids = self.model.generate(**inputs, max_new_tokens=self.MAX_NEW_TOKENS)

	        # `generate` returns prompt + completion; drop the prompt tokens so
	        # only the model's reply is decoded (otherwise the echoed prompt
	        # makes the JSON parse below fail).
	        prompt_length = inputs["input_ids"].shape[1]
	        reply_ids = output_ids[:, prompt_length:]

	        output_text = self.processor.batch_decode(
	            reply_ids,
	            skip_special_tokens=True,
	            clean_up_tokenization_spaces=True,
	        )[0]

	        return self._parse_prediction(output_text)

	    def _fetch_image(self, image_url: str) -> "Image.Image":
	        """Download `image_url` and return it as an RGB PIL image."""
	        # NOTE(review): the URL comes straight from the request, so this is a
	        # server-side fetch of a caller-chosen address; consider restricting
	        # allowed hosts if the endpoint is exposed to untrusted clients.
	        response = requests.get(image_url, timeout=self.IMAGE_FETCH_TIMEOUT)
	        response.raise_for_status()
	        # Normalise palette/RGBA/greyscale inputs to three channels.
	        return Image.open(BytesIO(response.content)).convert("RGB")

	    @staticmethod
	    def _parse_prediction(output_text: str) -> Dict[str, Any]:
	        """Strip Markdown code fences from `output_text` and parse it as JSON."""
	        cleaned_data = output_text.replace("```json\n", "").replace("```", "").strip()
	        try:
	            prediction = json.loads(cleaned_data)
	        except json.JSONDecodeError as e:
	            return {"error": f"Failed to parse JSON output: {str(e)}", "raw_output": cleaned_data}

	        return {"prediction": prediction}