import os
import torch
import gradio as gr
import ipywidgets as widgets
from pathlib import Path
from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
from typing import List, Tuple
from threading import Event, Thread
from gradio_helper import make_demo  # Helper that builds the Gradio demo
from llm_config import SUPPORTED_LLM_MODELS  # Model configuration
from notebook_utils import device_widget  # Device selection utility
import openvino.properties as props
import openvino.properties.hint as hints
import openvino.properties.streams as streams
# Define the weight-compression function (same as in the notebook)
def convert_to_int4(model_id, model_configuration, enable_awq=False):
    # Per-model compression settings; fall back to "default" when unlisted
    compression_configs = {
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }
    model_compression_params = compression_configs.get(model_id, compression_configs["default"])

    # Reuse an existing conversion if one is already on disk
    int4_model_dir = Path(model_id) / "INT4_compressed_weights"
    if (int4_model_dir / "openvino_model.xml").exists():
        return int4_model_dir

    # Assemble the optimum-cli export command
    remote_code = model_configuration.get("remote_code", False)
    export_command_base = (
        f"optimum-cli export openvino --model {model_configuration['model_id']} "
        "--task text-generation-with-past --weight-format int4"
    )
    int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    if enable_awq:
        int4_compression_args += " --awq --dataset wikitext2 --num-samples 128"
    export_command_base += int4_compression_args
    if remote_code:
        export_command_base += " --trust-remote-code"
    export_command = export_command_base + f" {str(int4_model_dir)}"

    # Execute the export as a shell command
    os.system(export_command)
    return int4_model_dir
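# For reference, with the "default" settings above the assembled command looks
# like this (illustrative; the model id comes from the model configuration):
#   optimum-cli export openvino --model <hf_model_id> --task text-generation-with-past \
#       --weight-format int4 --group-size 128 --ratio 0.8 <model_id>/INT4_compressed_weights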
# Model and tokenizer loading
def load_model(model_dir, device):
    # Load the model with OpenVINO, tuned for low-latency single-stream inference
    ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""}
    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    ov_model = OVModelForCausalLM.from_pretrained(
        model_dir,
        device=device,
        ov_config=ov_config,
        config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
        trust_remote_code=True,
    )
    return ov_model, tok
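# --- Helper sketches ---------------------------------------------------------
# The notebook ships its own convert_history_to_token and text_processor; the
# minimal stand-ins below are assumptions so this script is self-contained.
# They assume `history` is a list of [user, assistant] pairs (Gradio "tuples"
# format) and that the tokenizer provides a chat template.
def convert_history_to_token(history: List[List[str]]):
    # Flatten the chat history into the model's chat template and tokenize it
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

def text_processor(partial_text: str, new_text: str) -> str:
    # Default post-processing: append the newly streamed chunk to the running text
    return partial_text + new_text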
# Define the bot function that drives the Gradio UI
def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    input_ids = convert_history_to_token(history)
    if input_ids.shape[1] > 2000:
        history = [history[-1]]  # Keep only the last turn to limit input size
        input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )
    stream_complete = Event()

    # Generate the response in a separate thread and signal completion
    def generate_and_signal_complete():
        ov_model.generate(**generate_kwargs)
        stream_complete.set()

    t1 = Thread(target=generate_and_signal_complete)
    t1.start()

    # Stream partial text back to the UI as it is produced
    partial_text = ""
    for new_text in streamer:
        partial_text = text_processor(partial_text, new_text)
        history[-1][1] = partial_text
        yield history
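# Cancellation hook for the Gradio "Stop" button. This assumes optimum-intel
# exposes the underlying OpenVINO inference request as `ov_model.request`, as
# in the original notebook; adjust if your optimum-intel version differs.
def request_cancel():
    ov_model.request.cancel()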
# Gradio interface setup
def create_gradio_interface():
    global ov_model, tok  # Used by bot() and request_cancel() above
    # Pick a model language, then a model within that language
    model_languages = list(SUPPORTED_LLM_MODELS.keys())
    model_language = widgets.Dropdown(options=model_languages, value=model_languages[0], description="Model Language:")
    model_ids = list(SUPPORTED_LLM_MODELS[model_language.value].keys())
    model_id = widgets.Dropdown(options=model_ids, value=model_ids[0], description="Model:")
    model_configuration = SUPPORTED_LLM_MODELS[model_language.value][model_id.value]
    # Prepare the model (convert weights to INT4)
    int4_model_dir = convert_to_int4(model_id.value, model_configuration)
    # Load model and tokenizer on the selected device
    device = device_widget("CPU")
    ov_model, tok = load_model(int4_model_dir, device.value)
    # Create the Gradio app
    demo = make_demo(run_fn=bot, stop_fn=request_cancel, title="OpenVINO Chatbot", language=model_language.value)
    return demo
# Run the Gradio app
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch(debug=True, share=True)  # share=True exposes a public link