Update app.py
app.py CHANGED
@@ -1,144 +1,148 @@
 import os
 from pathlib import Path
 import torch
-from
 from optimum.intel.openvino import OVModelForCausalLM
 import openvino as ov
 import openvino.properties as props
 import openvino.properties.hint as hints
 import openvino.properties.streams as streams
-import gradio as gr

 from llm_config import SUPPORTED_LLM_MODELS

-#
-[…]
-int4_model_dir
-[…]
     input_ids = convert_history_to_token(history)
     generate_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens=256,
         temperature=temperature,
         top_p=top_p,
         top_k=top_k,
-        repetition_penalty=repetition_penalty
-    )
-
-    # Stream response to textbox
-    response = ""
-    for new_text in ov_model.generate(**generate_kwargs):
-        response += new_text
-        history[-1][1] = response
-        yield history
-
-# Define Gradio interface within a Blocks context
-with gr.Blocks() as iface:
-    # Dropdown for model language selection
-    model_language = gr.Dropdown(
-        choices=model_languages,
-        value=model_languages[0],
-        label="Model Language"
-    )
-
-    # Dropdown for model ID, dynamically populated
-    model_id = gr.Dropdown(
-        choices=[],  # will be populated dynamically
-        label="Model",
-        value=None
     )

-[…]
-        return gr.Dropdown.update(value=model_ids[0], choices=model_ids)

-[…]
-    )
-
-    # Checkbox for enabling AWQ (shown conditionally)
-    enable_awq = gr.Checkbox(
-        value=False,
-        label="Enable AWQ",
-        visible=False  # visibility can be controlled in the UI logic
-    )
-
-    # Dropdown for device selection
-    device = gr.Dropdown(
-        choices=["CPU", "GPU"],
-        value="CPU",
-        label="Device"
-    )
-
-    # Sliders for model generation parameters
-    temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, label="Temperature")
-    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top P")
-    top_k = gr.Slider(minimum=0, maximum=50, value=50, label="Top K")
-    repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, label="Repetition Penalty")
-
-    # Conversation history state
-    history = gr.State([])
-
-    # Textbox for conversation history
-    conversation_output = gr.Textbox(label="Conversation History")
-
-    # Button to trigger response generation
-    generate_button = gr.Button("Generate Response")

-    generate_button.click(
-        generate_response,
-        inputs=[history, temperature, top_p, top_k, repetition_penalty, model_language, model_id, device],
-        outputs=[conversation_output, history]
-    )

-#
 if __name__ == "__main__":
-[…]

+# app.py
 import os
 from pathlib import Path
 import torch
+from threading import Event, Thread
+from typing import List, Tuple
+
+# Importing necessary packages
+from transformers import AutoConfig, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
 from optimum.intel.openvino import OVModelForCausalLM
 import openvino as ov
 import openvino.properties as props
 import openvino.properties.hint as hints
 import openvino.properties.streams as streams

+from gradio_helper import make_demo  # UI logic import
 from llm_config import SUPPORTED_LLM_MODELS

+# Model configuration setup
+model_language_value = "English"
+model_id_value = "qwen2.5-0.5b-instruct"
+prepare_int4_model_value = True
+enable_awq_value = False
+device_value = "CPU"
+model_to_run_value = "INT4"
+model_configuration = SUPPORTED_LLM_MODELS[model_language_value][model_id_value]
+pt_model_id = model_configuration["model_id"]
+pt_model_name = model_id_value.split("-")[0]
+int4_model_dir = Path(model_id_value) / "INT4_compressed_weights"
+int4_weights = int4_model_dir / "openvino_model.bin"
+
+# Prompt-template fields consumed by convert_history_to_token below; these
+# keys are assumed to come from llm_config's per-model configuration.
+start_message = model_configuration["start_message"]
+history_template = model_configuration.get("history_template")
+current_message_template = model_configuration.get("current_message_template")
+tokenizer_kwargs = model_configuration.get("tokenizer_kwargs", {})
+
+# Model loading
+core = ov.Core()
+ov_config = {
+    hints.performance_mode(): hints.PerformanceMode.LATENCY,
+    streams.num(): "1",
+    props.cache_dir(): ""
+}
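+# The LATENCY hint plus a single inference stream favors fast per-request
+# responses over batch throughput; the empty cache_dir disables model caching.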
+tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)
+# Assumed definition: detect whether the tokenizer ships a chat template
+# (referenced by convert_history_to_token below).
+has_chat_template = tok.chat_template is not None
+ov_model = OVModelForCausalLM.from_pretrained(
+    int4_model_dir,
+    device=device_value,
+    ov_config=ov_config,
+    config=AutoConfig.from_pretrained(int4_model_dir, trust_remote_code=True),
+    trust_remote_code=True,
+)
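+# Assumes the INT4 OpenVINO IR (openvino_model.xml/.bin) was already exported
+# to int4_model_dir beforehand (e.g. via optimum-cli); this app only loads it.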
+
+# Stopping criteria for token generation
+class StopOnTokens(StoppingCriteria):
+    def __init__(self, token_ids):
+        self.token_ids = token_ids
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        return any(input_ids[0][-1] == stop_id for stop_id in self.token_ids)
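+# Note: StopOnTokens compares only the most recently generated token against
+# its stop ids; it is defined for models that need explicit stop tokens, but
+# it is not attached to generate() in this version of the app.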
+
+# Functions for chatbot logic
+def convert_history_to_token(history: List[Tuple[str, str]]):
+    """
+    Convert the history, stored as a list of (user, assistant) message pairs,
+    to tokens following the model's expected conversation template.
+    Params:
+      history: dialogue history
+    Returns:
+      history in token format
+    """
+    if pt_model_name == "baichuan2":
+        # Baichuan2 uses fixed special token ids: 195 opens a user turn,
+        # 196 opens an assistant turn.
+        system_tokens = tok.encode(start_message)
+        history_tokens = []
+        for old_query, response in history[:-1]:
+            round_tokens = []
+            round_tokens.append(195)
+            round_tokens.extend(tok.encode(old_query))
+            round_tokens.append(196)
+            round_tokens.extend(tok.encode(response))
+            history_tokens = round_tokens + history_tokens
+        input_tokens = system_tokens + history_tokens
+        input_tokens.append(195)
+        input_tokens.extend(tok.encode(history[-1][0]))
+        input_tokens.append(196)
+        input_token = torch.LongTensor([input_tokens])
+    elif history_template is None or has_chat_template:
+        messages = [{"role": "system", "content": start_message}]
+        for idx, (user_msg, model_msg) in enumerate(history):
+            if idx == len(history) - 1 and not model_msg:
+                messages.append({"role": "user", "content": user_msg})
+                break
+            if user_msg:
+                messages.append({"role": "user", "content": user_msg})
+            if model_msg:
+                messages.append({"role": "assistant", "content": model_msg})
+
+        input_token = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_tensors="pt")
+    else:
+        # Fall back to the string templates from the model configuration.
+        text = start_message + "".join(
+            ["".join([history_template.format(num=round, user=item[0], assistant=item[1])]) for round, item in enumerate(history[:-1])]
+        )
+        text += "".join(
+            [
+                "".join(
+                    [
+                        current_message_template.format(
+                            num=len(history) + 1,
+                            user=history[-1][0],
+                            assistant=history[-1][1],
+                        )
+                    ]
+                )
+            ]
+        )
+        input_token = tok(text, return_tensors="pt", **tokenizer_kwargs).input_ids
+    return input_token
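+# Example of the history structure consumed above (illustrative):
+#   [["Hello", "Hi! How can I help?"], ["What is OpenVINO?", ""]]
+# The final pair's empty assistant slot is the turn currently being generated.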
+
+def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
+    # Callback for running the chatbot when the user submits a turn
     input_ids = convert_history_to_token(history)
+    if input_ids.shape[1] > 2000:
+        # The prompt grew too long: keep only the latest turn and re-tokenize.
+        history = [history[-1]]
+        input_ids = convert_history_to_token(history)
+
+    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens=256,
         temperature=temperature,
+        do_sample=temperature > 0.0,
         top_p=top_p,
         top_k=top_k,
+        repetition_penalty=repetition_penalty,
+        streamer=streamer,
     )
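+    # Possible extension (assumption, not part of this commit): wire in the
+    # StopOnTokens class defined above via
+    #   stopping_criteria=StoppingCriteriaList([StopOnTokens([tok.eos_token_id])])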
+    stream_complete = Event()

+    def generate_and_signal_complete():
+        ov_model.generate(**generate_kwargs)
+        stream_complete.set()

+    Thread(target=generate_and_signal_complete).start()
+    partial_text = ""
+    for new_text in streamer:
+        partial_text += new_text
+        history[-1][1] = partial_text
+        yield history
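+    # The loop above is the consumer half of a producer/consumer pair: the
+    # worker thread produces tokens via generate(), TextIteratorStreamer hands
+    # the decoded fragments to this generator, and each yield refreshes the UI.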

+def request_cancel():
+    ov_model.request.cancel()
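+    # Cancels the in-flight OpenVINO inference request, which aborts the
+    # running generation; passed to make_demo below as its stop_fn.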

+# Gradio setup and launch
+demo = make_demo(run_fn=bot, stop_fn=request_cancel, title=f"OpenVINO {model_id_value} Chatbot", language=model_language_value)
 if __name__ == "__main__":
+    demo.launch(debug=True, share=True, server_name="0.0.0.0", server_port=7860)
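
For reference, a minimal sketch of what gradio_helper.make_demo could look like for the run_fn/stop_fn signature used above. The real helper ships with the Space and is not shown in this diff; every widget name and wiring choice below is an assumption for illustration, not the Space's actual UI code.

# gradio_helper.py -- illustrative sketch only (assumed implementation).
import gradio as gr

def make_demo(run_fn, stop_fn, title="Chatbot", language="English"):
    with gr.Blocks(title=title) as demo:
        gr.Markdown(f"# {title}")
        chatbot = gr.Chatbot()  # pairs-format history: [[user, assistant], ...]
        msg = gr.Textbox(label="Message")
        with gr.Row():
            submit = gr.Button("Submit")
            stop = gr.Button("Stop")
        temperature = gr.Slider(0.0, 1.0, value=0.7, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.9, label="Top P")
        top_k = gr.Slider(0, 50, value=50, step=1, label="Top K")
        repetition_penalty = gr.Slider(1.0, 2.0, value=1.1, label="Repetition Penalty")
        conversation_id = gr.State("")

        def add_user_message(message, history):
            # Append the new user turn with an empty assistant slot for bot() to fill.
            return "", history + [[message, ""]]

        # Queue the user turn, then stream bot()'s yielded histories into the chatbot.
        msg.submit(add_user_message, [msg, chatbot], [msg, chatbot], queue=False).then(
            run_fn, [chatbot, temperature, top_p, top_k, repetition_penalty, conversation_id], chatbot
        )
        submit.click(add_user_message, [msg, chatbot], [msg, chatbot], queue=False).then(
            run_fn, [chatbot, temperature, top_p, top_k, repetition_penalty, conversation_id], chatbot
        )
        stop.click(stop_fn, queue=False)
    # Older Gradio versions may need demo.queue() here for generator streaming.
    return demo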