import os
import torch
import gradio as gr
import ipywidgets as widgets
from pathlib import Path
from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
from typing import List, Tuple
from threading import Event, Thread
from gradio_helper import make_demo  # Helper that builds the Gradio demo
from llm_config import SUPPORTED_LLM_MODELS  # Model configuration
from notebook_utils import device_widget  # Device selection utility
import openvino.properties as props
import openvino.properties.hint as hints
import openvino.properties.streams as streams
# Define the weight-compression function (same as in the notebook)
def convert_to_int4(model_id, model_configuration, enable_awq=False):
    # Per-model compression settings; fall back to "default" when unlisted
    compression_configs = {
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }
    model_compression_params = compression_configs.get(model_id, compression_configs["default"])

    # Reuse an existing conversion if one is already on disk
    int4_model_dir = Path(model_id) / "INT4_compressed_weights"
    if (int4_model_dir / "openvino_model.xml").exists():
        return int4_model_dir

    # Assemble the optimum-cli export command
    remote_code = model_configuration.get("remote_code", False)
    export_command_base = (
        f"optimum-cli export openvino --model {model_configuration['model_id']} "
        "--task text-generation-with-past --weight-format int4"
    )
    int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    if enable_awq:
        int4_compression_args += " --awq --dataset wikitext2 --num-samples 128"
    export_command_base += int4_compression_args
    if remote_code:
        export_command_base += " --trust-remote-code"
    export_command = export_command_base + f" {str(int4_model_dir)}"

    # Execute the export as a shell command
    os.system(export_command)
    return int4_model_dir
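# For reference, with the "default" settings above the assembled command looks
# like this (illustrative; the model id comes from the model configuration):
#   optimum-cli export openvino --model <hf_model_id> --task text-generation-with-past \
#       --weight-format int4 --group-size 128 --ratio 0.8 <model_id>/INT4_compressed_weights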
# Model and tokenizer loading
def load_model(model_dir, device):
    # Load the model with OpenVINO, tuned for low-latency single-stream inference
    ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""}
    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    ov_model = OVModelForCausalLM.from_pretrained(
        model_dir,
        device=device,
        ov_config=ov_config,
        config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
        trust_remote_code=True,
    )
    return ov_model, tok
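# --- Helper sketches ---------------------------------------------------------
# The notebook ships its own convert_history_to_token and text_processor; the
# minimal stand-ins below are assumptions so this script is self-contained.
# They assume `history` is a list of [user, assistant] pairs (Gradio "tuples"
# format) and that the tokenizer provides a chat template.
def convert_history_to_token(history: List[List[str]]):
    # Flatten the chat history into the model's chat template and tokenize it
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

def text_processor(partial_text: str, new_text: str) -> str:
    # Default post-processing: append the newly streamed chunk to the running text
    return partial_text + new_text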
# Define the bot function that drives the Gradio UI
def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    input_ids = convert_history_to_token(history)
    if input_ids.shape[1] > 2000:
        history = [history[-1]]  # Keep only the last turn to limit input size
        input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )
    stream_complete = Event()

    # Generate the response in a separate thread and signal completion
    def generate_and_signal_complete():
        ov_model.generate(**generate_kwargs)
        stream_complete.set()

    t1 = Thread(target=generate_and_signal_complete)
    t1.start()

    # Stream partial text back to the UI as it is produced
    partial_text = ""
    for new_text in streamer:
        partial_text = text_processor(partial_text, new_text)
        history[-1][1] = partial_text
        yield history
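# Cancellation hook for the Gradio "Stop" button. This assumes optimum-intel
# exposes the underlying OpenVINO inference request as `ov_model.request`, as
# in the original notebook; adjust if your optimum-intel version differs.
def request_cancel():
    ov_model.request.cancel()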
# Gradio interface setup
def create_gradio_interface():
    global ov_model, tok  # Used by bot() and request_cancel() above
    # Pick a model language, then a model within that language
    model_languages = list(SUPPORTED_LLM_MODELS.keys())
    model_language = widgets.Dropdown(options=model_languages, value=model_languages[0], description="Model Language:")
    model_ids = list(SUPPORTED_LLM_MODELS[model_language.value].keys())
    model_id = widgets.Dropdown(options=model_ids, value=model_ids[0], description="Model:")
    model_configuration = SUPPORTED_LLM_MODELS[model_language.value][model_id.value]
    # Prepare the model (convert weights to INT4)
    int4_model_dir = convert_to_int4(model_id.value, model_configuration)
    # Load model and tokenizer on the selected device
    device = device_widget("CPU")
    ov_model, tok = load_model(int4_model_dir, device.value)
    # Create the Gradio app
    demo = make_demo(run_fn=bot, stop_fn=request_cancel, title="OpenVINO Chatbot", language=model_language.value)
    return demo
# Run the Gradio app
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch(debug=True, share=True)  # share=True exposes a public link