# pandora-s's picture
# Update app.py
# 2a9695b verified
## Workaround: installing exllamav2 directly from its dev branch currently hits a small
## bug, so the CUDA paths have to be provided explicitly.
## NOTE(review): presumably this must run before exllamav2 is imported further down,
## which is why it sits ahead of every other import -- confirm before moving it.
import cuda_bug
cuda_bug.install_cuda_toolkit_requirements()
##
import gradio as gr
from gradio.data_classes import FileData
from huggingface_hub import snapshot_download
from pathlib import Path
import base64
import spaces
import os
import sys, os
import torch
from exllamav2 import (
ExLlamaV2,
ExLlamaV2Config,
ExLlamaV2Cache,
ExLlamaV2Tokenizer,
ExLlamaV2VisionTower,
)
from exllamav2.generator import (
ExLlamaV2DynamicGenerator,
ExLlamaV2Sampler,
)
from PIL import Image
import requests
from huggingface_hub import snapshot_download
from tqdm import tqdm
# Fallback settings applied by run_inference when the UI supplies no value.
default_max_context = 16384  # context window, in tokens
default_max_output = 512     # maximum number of newly generated tokens
default_bpw = "4.0bpw"       # quantization used by default

# Quantization levels offered in the UI. Each name is also passed as the
# `revision` when the model snapshots are downloaded.
available_models = [
    f"{bits}bpw"
    for bits in ("2.5", "3.0", "3.5", "4.0", "4.5", "5.0", "6.0", "8.0")
]
# Maps each quantization name to the local directory of its downloaded snapshot.
# Every quant is fetched at start-up, so run_inference only has to look the path up.
dirs = {}
for model in tqdm(available_models):
    # The quant name is used as the repository revision to download.
    dirs[model] = snapshot_download(
        repo_id="turboderp/pixtral-12b-exl2",
        revision=model,
    )
def _open_chat_image(file_ref):
    """Open one chat attachment, given either a path or a dict with a "path" key."""
    return Image.open(file_ref["path"] if isinstance(file_ref, dict) else file_ref)


def _embed_images(vision_model, model, tokenizer, files, first_index):
    """Embed `files` and return `(embeddings, alias_text)`.

    Aliases are numbered consecutively starting at `first_index`
    (``{{IMAGE_<n>}}``); `alias_text` is the concatenation of those aliases in
    the same order, ready to be placed inside the prompt.
    """
    embeddings = []
    aliases = []
    for offset, file_ref in enumerate(files):
        alias = "{{IMAGE_" + str(first_index + offset) + "}}"
        embeddings.append(
            vision_model.get_image_embeddings(
                model=model,
                tokenizer=tokenizer,
                image=_open_chat_image(file_ref),
                text_alias=alias,
            )
        )
        aliases.append(alias)
    return embeddings, "".join(aliases)


@spaces.GPU(duration=45)
def run_inference(message, history, model_picked, context_size, max_output):
    """Answer one chat turn with Pixtral, replaying the earlier conversation.

    Args:
        message: The new user turn. Either a plain string or a dict with a
            "text" entry and a "files" list (paths, or dicts with a "path" key).
        history: Earlier turns as `[user, assistant]` pairs. A pair whose user
            part is a tuple/list holds image files belonging to the next text
            turn; its assistant part is ignored.
        model_picked: Quantization name (key of `dirs`); falls back to
            `default_bpw` when empty.
        context_size: Maximum sequence length for the config and the cache;
            falls back to `default_max_context` when empty.
        max_output: Maximum number of new tokens; falls back to
            `default_max_output` when empty.

    Returns:
        The generated text that follows the last "[/INST]" marker.

    Raises:
        KeyError: If `model_picked` is not a key of `dirs`.
    """
    model_picked = model_picked or default_bpw
    context_size = context_size or default_max_context
    max_output = max_output or default_max_output
    local_dir = dirs[model_picked]

    # Loading only once GPU available: everything below is built on each call,
    # inside the GPU-decorated function.
    config = ExLlamaV2Config(local_dir)
    config.max_seq_len = context_size
    vision_model = ExLlamaV2VisionTower(config)
    vision_model.load(progress=True)
    model = ExLlamaV2(config)
    cache = ExLlamaV2Cache(model, lazy=True, max_seq_len=context_size)
    model.load_autosplit(cache, progress=True)
    tokenizer = ExLlamaV2Tokenizer(config)
    generator = ExLlamaV2DynamicGenerator(
        model=model,
        cache=cache,
        tokenizer=tokenizer,
    )

    # Making Prompt Template
    prompt_parts = []
    images_embeddings = []
    # Aliases of images seen so far that have not yet been attached to a user
    # turn. They accumulate across consecutive file entries (so no image of a
    # multi-file turn is lost) and are cleared as soon as a turn consumes them
    # (so they are not repeated in later turns).
    pending_aliases = ""

    for turn in history:
        user_part, bot_part = turn[0], turn[1]
        if isinstance(user_part, (tuple, list)):
            new_embeddings, alias_text = _embed_images(
                vision_model, model, tokenizer, user_part, len(images_embeddings) + 1
            )
            images_embeddings += new_embeddings
            pending_aliases += alias_text
            continue
        if not user_part and not pending_aliases:
            # Nothing from the user in this pair: skip it, as before.
            continue
        # An image-only turn has no text but must still be replayed together
        # with its answer, otherwise the conversation loses that exchange.
        prompt_parts.append("[INST]" + pending_aliases + (user_part or "") + "[/INST]")
        prompt_parts.append((bot_part or "") + "</s>")
        pending_aliases = ""

    if isinstance(message, dict):
        new_embeddings, alias_text = _embed_images(
            vision_model,
            model,
            tokenizer,
            message.get("files") or [],
            len(images_embeddings) + 1,
        )
        images_embeddings += new_embeddings
        pending_aliases += alias_text
        text = message.get("text") or ""
    else:
        text = message
    prompt_parts.append("[INST]" + pending_aliases + text + "[/INST]")

    prompt = "".join(prompt_parts)
    print(prompt)

    # Generating Response
    output = generator.generate(
        prompt=prompt,
        max_new_tokens=max_output,
        add_bos=True,
        encode_special_tokens=True,
        decode_special_tokens=True,
        stop_conditions=[tokenizer.eos_token_id],
        gen_settings=ExLlamaV2Sampler.Settings.greedy(),
        embeddings=images_embeddings,
    )

    # Keep only what follows the final instruction marker of the returned text.
    result = output.split("[/INST]")[-1]
    print(result)
    return result
# Text handed to the chat interface as its description.
description = """A demo chat interface with Pixtral 12B EXL2 Quants, deployed using **ExllamaV2**!
The model will be loaded once the GPU is available. This space specifically will load by default Pixtral at 4bpw from the following repository: [turboderp/pixtral-12b-exl2](https://huggingface.co/turboderp/pixtral-12b-exl2). Other quantization options are available.
The current version of ExllamaV2 running is the dev branch, not the master branch: [ExllamaV2](https://github.com/turboderp/exllamav2/tree/dev).
The model at **4bpw and 16k context size fits in less than 12GB of VRAM**, and at **2.5bpw and short context can potentially fit in 8GB of VRAM**!
The current default settings are:
- Model Quant: 4.0bpw
- Context Size: 16k tokens
- Max Output: 512 tokens
You can select other quants and experiment!
Thanks, turboderp!"""

# Example rows for the chat interface: a single row whose only cell is a
# multimodal message pairing a question with two image files.
examples = [
    [
        {
            "text": "What are the similarities and differences between these two experiments?",
            "files": ["test_image_1.jpg", "test_image_2.jpg"],
        },
    ],
]
# Extra controls passed to run_inference after (message, history), in this order:
# quantization, context size, maximum output length.
drop = gr.Dropdown(available_models, label="EXL2 Quant", value=default_bpw)

# The chosen value becomes the cache's max_seq_len.
# NOTE(review): ExLlamaV2's dynamic generator appears to page its cache in
# 256-token blocks and to reject lengths that are not a multiple of 256, so the
# slider moves in steps of 256 (minimum, maximum and default already are
# multiples) -- confirm against the ExLlamaV2 dynamic generator documentation.
context_size_gradio = gr.Slider(
    minimum=256,
    maximum=32768,
    label="Context Size",
    value=default_max_context,
    step=256,
)
output_length_gradio = gr.Slider(
    minimum=1,
    maximum=4096,
    label="Max Output Length",
    value=default_max_output,
    step=1,
)

demo = gr.ChatInterface(
    fn=run_inference,
    examples=examples,
    title="Pixtral 12B EXL2",
    multimodal=True,
    description=description,
    additional_inputs=[drop, context_size_gradio, output_length_gradio],
)
demo.queue().launch()