Spaces:
Running
on
Zero
Running
on
Zero
## Workaround: installing exllamav2 directly from its dev branch has a small bug, so the CUDA toolkit paths have to be set up first.
import cuda_bug | |
cuda_bug.install_cuda_toolkit_requirements() | |
## | |
import gradio as gr | |
from gradio.data_classes import FileData | |
from huggingface_hub import snapshot_download | |
from pathlib import Path | |
import base64 | |
import spaces | |
import os | |
import sys, os | |
import torch | |
from exllamav2 import ( | |
ExLlamaV2, | |
ExLlamaV2Config, | |
ExLlamaV2Cache, | |
ExLlamaV2Tokenizer, | |
ExLlamaV2VisionTower, | |
) | |
from exllamav2.generator import ( | |
ExLlamaV2DynamicGenerator, | |
ExLlamaV2Sampler, | |
) | |
from PIL import Image | |
import requests | |
from huggingface_hub import snapshot_download | |
from tqdm import tqdm | |
# Fallback settings, applied by run_inference when the UI passes an empty value.
default_max_context = 16384
default_max_output = 512
default_bpw = "4.0bpw"

# Quantization levels offered in the UI; each name is also used as the
# revision when the matching snapshot is downloaded below.
available_models = [
    "2.5bpw",
    "3.0bpw",
    "3.5bpw",
    "4.0bpw",
    "4.5bpw",
    "5.0bpw",
    "6.0bpw",
    "8.0bpw",
]

# Download every quant at import time and remember where each snapshot lives,
# keyed by its quantization name.
dirs = {}
for model in tqdm(available_models):
    dirs[model] = snapshot_download(
        repo_id="turboderp/pixtral-12b-exl2",
        revision=model,
    )
def _embed_images(files, first_index, vision_model, model, tokenizer):
    """Embed image files and return ``(embeddings, aliases)``.

    Every image is given the placeholder text ``{{IMAGE_<n>}}``, numbered
    consecutively starting at *first_index*. The placeholder is passed to the
    vision tower as ``text_alias`` and is the text the prompt uses to refer
    to that image.

    Args:
        files: Iterable of image files; an entry is either a path or a dict
            carrying the path under the ``"path"`` key.
        first_index: Number used for the first image's placeholder.
        vision_model: Loaded ``ExLlamaV2VisionTower``.
        model: The ``ExLlamaV2`` model the embeddings are produced for.
        tokenizer: The ``ExLlamaV2Tokenizer`` belonging to *model*.
    """
    embeddings = []
    aliases = []
    for offset, entry in enumerate(files):
        path = entry["path"] if isinstance(entry, dict) else entry
        alias = "{{IMAGE_" + str(first_index + offset) + "}}"
        embeddings.append(
            vision_model.get_image_embeddings(
                model=model,
                tokenizer=tokenizer,
                image=Image.open(path),
                text_alias=alias,
            )
        )
        aliases.append(alias)
    return embeddings, aliases


def run_inference(message, history, model_picked, context_size, max_output):
    """Generate the assistant reply for one chat turn.

    The vision tower, model, cache and tokenizer are loaded on every call,
    the whole conversation is rebuilt as an ``[INST]...[/INST]`` prompt with
    image placeholders, and the reply is generated greedily.

    Args:
        message: Current user turn: either a dict with ``"text"`` and
            ``"files"`` keys, or a plain string.
        history: Earlier turns as ``[user, assistant]`` pairs. A pair whose
            user part is a tuple (or list) holds image file path(s); the
            placeholders of such pairs are attached to the next turn that
            carries text or a reply.
        model_picked: Key into ``dirs``; ``default_bpw`` is used when empty.
        context_size: Maximum sequence length for the config and cache;
            ``default_max_context`` is used when empty.
        max_output: Maximum number of new tokens; ``default_max_output`` is
            used when empty.

    Returns:
        The text that follows the last ``[/INST]`` in the generator output.

    Raises:
        KeyError: If *model_picked* is not a key of ``dirs``.
    """
    if not model_picked:
        model_picked = default_bpw
    if not context_size:
        context_size = default_max_context
    if not max_output:
        max_output = default_max_output
    local_dir = dirs[model_picked]

    # Loading only once GPU available
    # NOTE(review): `spaces` is imported by this file but this handler is not
    # wrapped in `@spaces.GPU` - confirm whether that is required on ZeroGPU.
    config = ExLlamaV2Config(local_dir)
    config.max_seq_len = context_size
    vision_model = ExLlamaV2VisionTower(config)
    vision_model.load(progress=True)
    model = ExLlamaV2(config)
    cache = ExLlamaV2Cache(model, lazy=True, max_seq_len=context_size)
    model.load_autosplit(cache, progress=True)
    tokenizer = ExLlamaV2Tokenizer(config)
    generator = ExLlamaV2DynamicGenerator(
        model=model,
        cache=cache,
        tokenizer=tokenizer,
    )

    # Making Prompt Template
    prompt_parts = []
    images_embeddings = []
    # Placeholders of images seen since the last emitted turn. They are
    # accumulated across consecutive image entries (one entry per uploaded
    # file) and cleared once a turn has used them, so an image is referenced
    # exactly once, in the turn it was sent with.
    pending_aliases = []

    for couple in history:
        user_part, reply = couple[0], couple[1]
        if isinstance(user_part, (tuple, list)):
            new_embeddings, new_aliases = _embed_images(
                user_part, len(images_embeddings) + 1, vision_model, model, tokenizer
            )
            images_embeddings += new_embeddings
            pending_aliases += new_aliases
        elif user_part or reply:
            # A turn without text but with a reply is an image-only message;
            # it is still emitted so its reply and images stay in place.
            prompt_parts.append(
                "[INST]" + "".join(pending_aliases) + (user_part or "") + "[/INST]"
            )
            prompt_parts.append((reply or "") + "</s>")
            pending_aliases = []

    if isinstance(message, dict):
        new_embeddings, new_aliases = _embed_images(
            message["files"], len(images_embeddings) + 1, vision_model, model, tokenizer
        )
        images_embeddings += new_embeddings
        pending_aliases += new_aliases
        text = message["text"] or ""
    else:
        text = message
    prompt_parts.append("[INST]" + "".join(pending_aliases) + text + "[/INST]")

    prompt = "".join(prompt_parts)
    print(prompt)

    # Generating Response
    output = generator.generate(
        prompt=prompt,
        max_new_tokens=max_output,
        add_bos=True,
        encode_special_tokens=True,
        decode_special_tokens=True,
        stop_conditions=[tokenizer.eos_token_id],
        gen_settings=ExLlamaV2Sampler.Settings.greedy(),
        embeddings=images_embeddings,
    )
    # The output is split on the instruction terminator and only the text
    # after its last occurrence is returned as the reply.
    result = output.split("[/INST]")[-1]
    print(result)
    return result
# Markdown text passed to the chat interface as its description.
description = """A demo chat interface with Pixtral 12B EXL2 Quants, deployed using **ExllamaV2**!
The model will be loaded once the GPU is available. This space specifically will load by default Pixtral at 4bpw from the following repository: [turboderp/pixtral-12b-exl2](https://huggingface.co/turboderp/pixtral-12b-exl2). Other quantization options are available.
The current version of ExllamaV2 running is the dev branch, not the master branch: [ExllamaV2](https://github.com/turboderp/exllamav2/tree/dev).
The model at **4bpw and 16k context size fits in less than 12GB of VRAM**, and at **2.5bpw and short context can potentially fit in 8GB of VRAM**!
The current default settings are:
- Model Quant: 4.0bpw
- Context Size: 16k tokens
- Max Output: 512 tokens
You can select other quants and experiment!
Thanks, turboderp!"""

# One pre-filled multimodal example: a question about two bundled images.
examples = [
    [
        {
            "text": "What are the similarities and differences between these two experiments?",
            "files": ["test_image_1.jpg", "test_image_2.jpg"],
        },
    ]
]

# Extra controls; their values reach run_inference as model_picked,
# context_size and max_output, in this order.
drop = gr.Dropdown(available_models, label="EXL2 Quant", value=default_bpw)
context_size_gradio = gr.Slider(
    minimum=256,
    maximum=32768,
    label="Context Size",
    value=default_max_context,
    step=1,
)
output_length_gradio = gr.Slider(
    minimum=1,
    maximum=4096,
    label="Max Output Length",  # label typo fixed (was "Max Ouput Length")
    value=default_max_output,
    step=1,
)

demo = gr.ChatInterface(
    fn=run_inference,
    examples=examples,
    title="Pixtral 12B EXL2",
    multimodal=True,
    description=description,
    additional_inputs=[drop, context_size_gradio, output_length_gradio],
)
demo.queue().launch()