Spaces:

AskUI
/

DeepSeek-Vl-UI

Running on Zero

App Files Files Community

DeepSeek-Vl-UI / app.py

programmnix-askui

Enable small models

5b056c1 26 days ago

raw

history blame contribute delete

6.38 kB

	import gradio as gr
	import spaces
	import torch
	import base64
	from PIL import Image, ImageDraw
	from io import BytesIO
	import re

	from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
	from deepseek_vl2.utils.io import load_pil_images


	from transformers import AutoModelForCausalLM



	models = {
	"deepseek-ai/deepseek-vl2-tiny": AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-vl2-tiny", trust_remote_code=True),
	#"deepseek-ai/deepseek-vl2-small": AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-vl2-small", trust_remote_code=True),
	#"deepseek-ai/deepseek-vl2": AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-vl2", trust_remote_code=True)
	}

	processors = {
	"deepseek-ai/deepseek-vl2-tiny": DeepseekVLV2Processor.from_pretrained("deepseek-ai/deepseek-vl2-tiny",),
	#"deepseek-ai/deepseek-vl2-small": DeepseekVLV2Processor.from_pretrained("deepseek-ai/deepseek-vl2-small",),
	#"deepseek-ai/deepseek-vl2": DeepseekVLV2Processor.from_pretrained("deepseek-ai/deepseek-vl2",),
	}


	def image_to_base64(image):
	buffered = BytesIO()
	image.save(buffered, format="PNG")
	img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
	return img_str


	def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2):
	draw = ImageDraw.Draw(image)
	for box in bounding_boxes:
	xmin, ymin, xmax, ymax = box
	draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
	return image


	def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
	x_scale = original_width / scaled_width
	y_scale = original_height / scaled_height
	rescaled_boxes = []
	for box in bounding_boxes:
	xmin, ymin, xmax, ymax = box
	rescaled_box = [
	xmin * x_scale,
	ymin * y_scale,
	xmax * x_scale,
	ymax * y_scale
	]
	rescaled_boxes.append(rescaled_box)
	return rescaled_boxes


	def deepseek(image, text_input, model_id):
	# specify the path to the model
	vl_chat_processor: DeepseekVLV2Processor = processors[model_id]
	tokenizer = vl_chat_processor.tokenizer

	vl_gpt: DeepseekVLV2ForCausalLM = models[model_id]
	vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

	## single image conversation example
	conversation = [
	{
	"role": "<\|User\|>",
	"content": f"<image><\|ref\|>{text_input}<\|/ref\|>.",
	"images": ["./images/visual_grounding_1.jpeg"],
	},
	{"role": "<\|Assistant\|>", "content": ""},
	]

	# load images and prepare for inputs
	#pil_images = load_pil_images(conversation)
	prepare_inputs = vl_chat_processor(
	conversations=conversation,
	images=[image],
	force_batchify=True,
	system_prompt=""
	).to(vl_gpt.device)


	with torch.no_grad():

	# run image encoder to get the image embeddings
	inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

	inputs_embeds, past_key_values = vl_gpt.incremental_prefilling(
	input_ids=prepare_inputs.input_ids,
	images=prepare_inputs.images,
	images_seq_mask=prepare_inputs.images_seq_mask,
	images_spatial_crop=prepare_inputs.images_spatial_crop,
	attention_mask=prepare_inputs.attention_mask,
	chunk_size=512 # prefilling size
	)

	# run the model to get the response
	outputs = vl_gpt.generate(
	inputs_embeds=inputs_embeds,
	input_ids=prepare_inputs.input_ids,
	images=prepare_inputs.images,
	images_seq_mask=prepare_inputs.images_seq_mask,
	images_spatial_crop=prepare_inputs.images_spatial_crop,
	attention_mask=prepare_inputs.attention_mask,
	past_key_values=past_key_values,
	pad_token_id=tokenizer.eos_token_id,
	bos_token_id=tokenizer.bos_token_id,
	eos_token_id=tokenizer.eos_token_id,
	max_new_tokens=512,
	do_sample=False,
	use_cache=True,
	)

	answer = tokenizer.decode(outputs[0][len(prepare_inputs.input_ids[0]):].cpu().tolist(), skip_special_tokens=False)
	print(f"{prepare_inputs['sft_format'][0]}", answer)
	det_pattern = r"<\\|det\\|>\[\[(.+)]]<\\|\/det\\|>"

	det_match = re.search(det_pattern, answer)
	if det_match is None:
	return text_input, [], image

	det_content = det_match.group(1)
	bbox = [int(v.strip()) for v in det_content.split(",")]

	scaled_boxes = rescale_bounding_boxes([bbox], image.width, image.height)
	return answer, scaled_boxes, draw_bounding_boxes(image, scaled_boxes)


	@spaces.GPU
	def run_example(image, text_input, model_id="deepseek-ai/deepseek-vl2-tiny"):
	return deepseek(image, text_input, model_id)

	css = """
	#output {
	height: 500px;
	overflow: auto;
	border: 1px solid #ccc;
	}
	"""
	with gr.Blocks(css=css) as demo:
	gr.Markdown(
	"""
	# Demo for Deepseek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding
	""")
	with gr.Row():
	with gr.Column():
	input_img = gr.Image(label="Input Image", type="pil")
	model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="deepseek-ai/deepseek-vl2-tiny")
	text_input = gr.Textbox(label="User Prompt")
	submit_btn = gr.Button(value="Submit")
	with gr.Column():
	model_output_text = gr.Textbox(label="Model Output Text")
	model_output_box = gr.Textbox(label="Model Output Box")
	annotated_image = gr.Image(label="Annotated Image")

	gr.Examples(
	examples=[
	["assets/web_6f93090a-81f6-489e-bb35-1a2838b18c01.png", "select search textfield"],
	["assets/web_6f93090a-81f6-489e-bb35-1a2838b18c01.png", "switch to discussions"],
	],
	inputs=[input_img, text_input],
	outputs=[model_output_text, model_output_box, annotated_image],
	fn=run_example,
	cache_examples=True,
	label="Try examples"
	)

	submit_btn.click(run_example, [input_img, text_input, model_selector], [model_output_text, model_output_box, annotated_image])

	demo.launch(debug=True)