import os
from pathlib import Path
from threading import Event, Thread

import torch
import gradio as gr
import ipywidgets as widgets
import openvino as ov
import openvino.properties as props
import openvino.properties.hint as hints
import openvino.properties.streams as streams
from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM

from gradio_helper import make_demo  # Your helper function for the Gradio demo
from llm_config import SUPPORTED_LLM_MODELS  # Model configuration
from notebook_utils import device_widget  # Device selection utility


# Model conversion function (same as in your notebook)
def convert_to_int4(model_id, model_configuration, enable_awq=False):
    compression_configs = {
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }
    model_compression_params = compression_configs.get(model_id, compression_configs["default"])

    # Skip conversion if compressed weights already exist
    int4_model_dir = Path(model_id) / "INT4_compressed_weights"
    if (int4_model_dir / "openvino_model.xml").exists():
        return int4_model_dir

    remote_code = model_configuration.get("remote_code", False)
    export_command_base = (
        f"optimum-cli export openvino --model {model_configuration['model_id']} "
        "--task text-generation-with-past --weight-format int4"
    )
    int4_compression_args = (
        f" --group-size {model_compression_params['group_size']}"
        f" --ratio {model_compression_params['ratio']}"
    )
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    if enable_awq:
        int4_compression_args += " --awq --dataset wikitext2 --num-samples 128"
    export_command_base += int4_compression_args
    if remote_code:
        export_command_base += " --trust-remote-code"
    export_command = export_command_base + f" {str(int4_model_dir)}"

    # Execute the export as a shell command
    os.system(export_command)
    return int4_model_dir


# Model and tokenizer loading
def load_model(model_dir, device):
    # Load the model with OpenVINO, tuned for low-latency single-stream inference
    ov_config = {
        hints.performance_mode(): hints.PerformanceMode.LATENCY,
        streams.num(): "1",
        props.cache_dir(): "",
    }
    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    ov_model = OVModelForCausalLM.from_pretrained(
        model_dir,
        device=device,
        ov_config=ov_config,
        config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
        trust_remote_code=True,
    )
    return ov_model, tok


# Chatbot callback wired into the Gradio UI
def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    input_ids = convert_history_to_token(history)
    if input_ids.shape[1] > 2000:
        # Keep only the latest exchange when the prompt grows too long
        history = [history[-1]]
        input_ids = convert_history_to_token(history)

    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    # Generate the response in a separate thread so tokens can be streamed
    def generate_and_signal_complete():
        ov_model.generate(**generate_kwargs)
        stream_complete.set()

    t1 = Thread(target=generate_and_signal_complete)
    t1.start()

    # Stream partial text back to the UI as it is produced
    partial_text = ""
    for new_text in streamer:
        partial_text = text_processor(partial_text, new_text)
        history[-1][1] = partial_text
        yield history
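

# --- Helpers referenced above but not defined in this snippet ---
# Minimal sketches based on the original OpenVINO notebook; the exact
# implementations there (e.g. per-model prompt templates) may differ.

# Signals that generation has finished (set by bot() above)
stream_complete = Event()


def convert_history_to_token(history):
    # Sketch: build input ids from [(user, assistant), ...] pairs using the
    # tokenizer's chat template; the notebook may use model-specific templates.
    messages = []
    for user_msg, model_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if model_msg:
            messages.append({"role": "assistant", "content": model_msg})
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")


def text_processor(partial_text: str, new_text: str) -> str:
    # Default incremental accumulator; the notebook allows
    # model_configuration["partial_text_processor"] to override this per model.
    return partial_text + new_text


def request_cancel():
    # Stop-button callback: cancel the in-flight OpenVINO inference request
    ov_model.request.cancel()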


# Gradio interface setup
def create_gradio_interface():
    global ov_model, tok  # bot() and request_cancel() read these at runtime

    # Choose a model language, then a model within that language
    model_languages = list(SUPPORTED_LLM_MODELS.keys())
    model_language = widgets.Dropdown(
        options=model_languages,
        value=model_languages[0],
        description="Model Language:",
    )
    model_ids = list(SUPPORTED_LLM_MODELS[model_language.value].keys())
    model_id = widgets.Dropdown(options=model_ids, value=model_ids[0], description="Model:")
    model_configuration = SUPPORTED_LLM_MODELS[model_language.value][model_id.value]

    # Prepare the model (convert to INT4 if needed)
    int4_model_dir = convert_to_int4(model_id.value, model_configuration)

    # Load model and tokenizer on the selected device
    device = device_widget("CPU")
    ov_model, tok = load_model(int4_model_dir, device.value)

    # Create the Gradio app
    demo = make_demo(
        run_fn=bot,
        stop_fn=request_cancel,
        title="OpenVINO Chatbot",
        language=model_language.value,
    )
    return demo


# Run the Gradio app
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch(debug=True, share=True)  # share=True exposes a public link
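

# Usage notes (assumptions beyond the snippet above):
# - Pass enable_awq=True to convert_to_int4() to apply AWQ during weight
#   compression; this requires the nncf package and downloads the wikitext2
#   calibration dataset.
# - device_widget("CPU") defaults to CPU; selecting "GPU" targets an Intel GPU
#   through OpenVINO with no other code changes.
# - Set share=False in app.launch() to keep the demo local-only.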