lightmate committed on
Commit 1cd9f06 · verified · 1 Parent(s): f0d2584

Update app.py

Files changed (1)
  1. app.py +124 -120
app.py CHANGED
@@ -1,144 +1,148 @@
 
  import os
  from pathlib import Path
  import torch
- from transformers import AutoConfig, AutoTokenizer
  from optimum.intel.openvino import OVModelForCausalLM
  import openvino as ov
  import openvino.properties as props
  import openvino.properties.hint as hints
  import openvino.properties.streams as streams
- import gradio as gr

  from llm_config import SUPPORTED_LLM_MODELS

- # Initialize model language options
- model_languages = list(SUPPORTED_LLM_MODELS)
-
- # Helper function to retrieve model configuration and path
- def get_model_path(model_language_value, model_id_value):
-     model_configuration = SUPPORTED_LLM_MODELS[model_language_value][model_id_value]
-     pt_model_name = model_id_value.split("-")[0]
-     int4_model_dir = Path(model_id_value) / "INT4_compressed_weights"
-     return model_configuration, int4_model_dir, pt_model_name
-
- # Download the model if not already present
- def download_model_if_needed(model_language_value, model_id_value):
-     model_configuration, int4_model_dir, pt_model_name = get_model_path(model_language_value, model_id_value)
-     int4_weights = int4_model_dir / "openvino_model.bin"
-     if not int4_weights.exists():
-         print(f"Downloading model {model_id_value}...")
-         # Download logic (e.g., requests.get(model_configuration["model_url"])) can go here
-     return int4_model_dir
-
- # Load the model based on selected options
- def load_model(model_language_value, model_id_value, device):
-     int4_model_dir = download_model_if_needed(model_language_value, model_id_value)
-     ov_config = {
-         hints.performance_mode(): hints.PerformanceMode.LATENCY,
-         streams.num(): "1",
-         props.cache_dir(): ""
-     }
-     core = ov.Core()
-     tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)
-     ov_model = OVModelForCausalLM.from_pretrained(
-         int4_model_dir,
-         device=device,
-         ov_config=ov_config,
-         config=AutoConfig.from_pretrained(int4_model_dir, trust_remote_code=True),
-         trust_remote_code=True
-     )
-     return tok, ov_model
-
- # Define the function to generate responses
- def generate_response(history, temperature, top_p, top_k, repetition_penalty, model_language_value, model_id_value, device):
-     tok, ov_model = load_model(model_language_value, model_id_value, device)
-
-     def convert_history_to_token(history):
-         input_tokens = tok(" ".join([msg[0] for msg in history]), return_tensors="pt").input_ids
-         return input_tokens
-
      input_ids = convert_history_to_token(history)
      generate_kwargs = dict(
          input_ids=input_ids,
          max_new_tokens=256,
          temperature=temperature,
          top_p=top_p,
          top_k=top_k,
-         repetition_penalty=repetition_penalty
-     )
-
-     # Stream response to textbox
-     response = ""
-     for new_text in ov_model.generate(**generate_kwargs):
-         response += new_text
-         history[-1][1] = response
-         yield history
-
- # Define Gradio interface within a Blocks context
- with gr.Blocks() as iface:
-     # Dropdown for model language selection
-     model_language = gr.Dropdown(
-         choices=model_languages,
-         value=model_languages[0],
-         label="Model Language"
-     )
-
-     # Dropdown for model ID, dynamically populated
-     model_id = gr.Dropdown(
-         choices=[],  # will be populated dynamically
-         label="Model",
-         value=None
      )

-     # Update model_id choices when model_language changes
-     def update_model_id(model_language_value):
-         model_ids = list(SUPPORTED_LLM_MODELS[model_language_value])
-         return gr.Dropdown.update(value=model_ids[0], choices=model_ids)

-     model_language.change(update_model_id, inputs=model_language, outputs=model_id)
-
-     # Checkbox for INT4 model preparation
-     prepare_int4_model = gr.Checkbox(
-         value=True,
-         label="Prepare INT4 Model"
-     )
-
-     # Checkbox for enabling AWQ (shown conditionally)
-     enable_awq = gr.Checkbox(
-         value=False,
-         label="Enable AWQ",
-         visible=False  # visibility can be controlled in the UI logic
-     )
-
-     # Dropdown for device selection
-     device = gr.Dropdown(
-         choices=["CPU", "GPU"],
-         value="CPU",
-         label="Device"
-     )
-
-     # Sliders for model generation parameters
-     temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, label="Temperature")
-     top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top P")
-     top_k = gr.Slider(minimum=0, maximum=50, value=50, label="Top K")
-     repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, label="Repetition Penalty")
-
-     # Conversation history state
-     history = gr.State([])
-
-     # Textbox for conversation history
-     conversation_output = gr.Textbox(label="Conversation History")
-
-     # Button to trigger response generation
-     generate_button = gr.Button("Generate Response")

-     # Define action when button is clicked
-     generate_button.click(
-         generate_response,
-         inputs=[history, temperature, top_p, top_k, repetition_penalty, model_language, model_id, device],
-         outputs=[conversation_output, history]
-     )

- # Launch the Gradio app
  if __name__ == "__main__":
-     iface.launch(debug=True, server_name="0.0.0.0", server_port=7860)
 
+ # app.py
  import os
  from pathlib import Path
  import torch
+ from threading import Event, Thread
+ from typing import List, Tuple
+
+ # Importing necessary packages
+ from transformers import AutoConfig, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
  from optimum.intel.openvino import OVModelForCausalLM
  import openvino as ov
  import openvino.properties as props
  import openvino.properties.hint as hints
  import openvino.properties.streams as streams

+ from gradio_helper import make_demo  # UI logic import
  from llm_config import SUPPORTED_LLM_MODELS

+ # Model configuration setup
+ model_language_value = "English"
+ model_id_value = 'qwen2.5-0.5b-instruct'
+ prepare_int4_model_value = True
+ enable_awq_value = False
+ device_value = 'CPU'
+ model_to_run_value = 'INT4'
+ pt_model_id = SUPPORTED_LLM_MODELS[model_language_value][model_id_value]["model_id"]
+ pt_model_name = model_id_value.split("-")[0]
+ int4_model_dir = Path(model_id_value) / "INT4_compressed_weights"
+ int4_weights = int4_model_dir / "openvino_model.bin"
+
+ # Model loading
+ core = ov.Core()
+ ov_config = {
+     hints.performance_mode(): hints.PerformanceMode.LATENCY,
+     streams.num(): "1",
+     props.cache_dir(): ""
+ }
+ tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)
+ ov_model = OVModelForCausalLM.from_pretrained(
+     int4_model_dir,
+     device=device_value,
+     ov_config=ov_config,
+     config=AutoConfig.from_pretrained(int4_model_dir, trust_remote_code=True),
+     trust_remote_code=True,
+ )
+
+ # Stopping criteria for token generation
+ class StopOnTokens(StoppingCriteria):
+     def __init__(self, token_ids):
+         self.token_ids = token_ids
+
+     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+         return any(input_ids[0][-1] == stop_id for stop_id in self.token_ids)
+
+ # Functions for chatbot logic
+ def convert_history_to_token(history: List[Tuple[str, str]]):
+     """
+     Convert a history stored as a list of (user, assistant) message pairs into tokens,
+     following the conversation template expected by the model.
+     Params:
+         history: dialogue history
+     Returns:
+         history in token format
+     """
+     if pt_model_name == "baichuan2":
+         system_tokens = tok.encode(start_message)
+         history_tokens = []
+         for old_query, response in history[:-1]:
+             round_tokens = []
+             round_tokens.append(195)
+             round_tokens.extend(tok.encode(old_query))
+             round_tokens.append(196)
+             round_tokens.extend(tok.encode(response))
+             history_tokens = round_tokens + history_tokens
+         input_tokens = system_tokens + history_tokens
+         input_tokens.append(195)
+         input_tokens.extend(tok.encode(history[-1][0]))
+         input_tokens.append(196)
+         input_token = torch.LongTensor([input_tokens])
+     elif history_template is None or has_chat_template:
+         messages = [{"role": "system", "content": start_message}]
+         for idx, (user_msg, model_msg) in enumerate(history):
+             if idx == len(history) - 1 and not model_msg:
+                 messages.append({"role": "user", "content": user_msg})
+                 break
+             if user_msg:
+                 messages.append({"role": "user", "content": user_msg})
+             if model_msg:
+                 messages.append({"role": "assistant", "content": model_msg})
+
+         input_token = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_tensors="pt")
+     else:
+         text = start_message + "".join(
+             ["".join([history_template.format(num=round, user=item[0], assistant=item[1])]) for round, item in enumerate(history[:-1])]
+         )
+         text += "".join(
+             [
+                 "".join(
+                     [
+                         current_message_template.format(
+                             num=len(history) + 1,
+                             user=history[-1][0],
+                             assistant=history[-1][1],
+                         )
+                     ]
+                 )
+             ]
+         )
+         input_token = tok(text, return_tensors="pt", **tokenizer_kwargs).input_ids
+     return input_token
+
+ def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
+     # Callback function for running chatbot on submit button click
      input_ids = convert_history_to_token(history)
+     if input_ids.shape[1] > 2000:
+         history = [history[-1]]
+         input_ids = convert_history_to_token(history)
+
+     streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
      generate_kwargs = dict(
          input_ids=input_ids,
          max_new_tokens=256,
          temperature=temperature,
+         do_sample=temperature > 0.0,
          top_p=top_p,
          top_k=top_k,
+         repetition_penalty=repetition_penalty,
+         streamer=streamer,
      )
+     stream_complete = Event()

+     def generate_and_signal_complete():
+         ov_model.generate(**generate_kwargs)
+         stream_complete.set()

+     Thread(target=generate_and_signal_complete).start()
+     partial_text = ""
+     for new_text in streamer:
+         partial_text += new_text
+         history[-1][1] = partial_text
+         yield history

+ def request_cancel():
+     ov_model.request.cancel()

+ # Gradio setup and launch
+ demo = make_demo(run_fn=bot, stop_fn=request_cancel, title=f"OpenVINO {model_id_value} Chatbot", language=model_language_value)
  if __name__ == "__main__":
+     demo.launch(debug=True, share=True, server_name="0.0.0.0", server_port=7860)
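
Note on convert_history_to_token: as committed here, the function refers to start_message, history_template, has_chat_template, current_message_template, and tokenizer_kwargs, none of which are defined in the lines of this hunk; they presumably come from the model configuration in llm_config. A minimal sketch of the setup that would make the function self-contained follows; the configuration key names are assumptions for illustration, not values confirmed by this diff.

# Sketch only: pull prompt-template settings out of SUPPORTED_LLM_MODELS.
# The key names ("start_message", "history_template", ...) are assumed, not taken from this commit.
model_configuration = SUPPORTED_LLM_MODELS[model_language_value][model_id_value]
start_message = model_configuration["start_message"]                             # assumed key
history_template = model_configuration.get("history_template")                   # assumed key
current_message_template = model_configuration.get("current_message_template")   # assumed key
tokenizer_kwargs = model_configuration.get("tokenizer_kwargs", {})               # assumed key
has_chat_template = tok.chat_template is not None  # fall back to the tokenizer's built-in chat template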
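
The new file imports StoppingCriteriaList and defines StopOnTokens, but the hunk above never attaches them to generation. A hedged sketch of how the class could be wired into generate_kwargs inside bot, assuming the only stop token is the tokenizer's end-of-sequence id (model-specific stop ids are not shown in this diff):

# Sketch only: stop generation as soon as a stop token id is produced.
stop_token_ids = [tok.eos_token_id]  # assumption: stop on end-of-sequence
stopping_criteria = StoppingCriteriaList([StopOnTokens(stop_token_ids)])

generate_kwargs = dict(
    input_ids=input_ids,
    max_new_tokens=256,
    streamer=streamer,
    stopping_criteria=stopping_criteria,  # StopOnTokens halts generate() on any id in stop_token_ids
)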
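
make_demo comes from gradio_helper, which is not part of this commit. The sketch below is a hypothetical stand-in that illustrates the contract the call site assumes: run_fn receives the chat history plus the sampling parameters and yields the updated history while tokens stream in, and stop_fn cancels the in-flight request. Component names, layout, and default slider values are illustrative only (the defaults mirror the old app.py).

# Hypothetical stand-in for gradio_helper.make_demo; not the actual helper shipped with the Space.
import gradio as gr

def make_demo(run_fn, stop_fn, title="Chatbot", language="English"):
    with gr.Blocks(title=title) as demo:
        gr.Markdown(f"# {title}")
        chatbot = gr.Chatbot()
        msg = gr.Textbox(label="Message")
        with gr.Row():
            submit = gr.Button("Submit")
            stop = gr.Button("Stop")
        temperature = gr.Slider(0.0, 1.0, value=0.7, label="Temperature")
        top_p = gr.Slider(0.0, 1.0, value=0.9, label="Top P")
        top_k = gr.Slider(0, 50, value=50, label="Top K")
        repetition_penalty = gr.Slider(1.0, 2.0, value=1.1, label="Repetition Penalty")
        conversation_id = gr.State("")

        def add_user_message(message, history):
            # Append the user turn with an empty assistant slot that run_fn streams into.
            return "", history + [[message, ""]]

        submit.click(add_user_message, [msg, chatbot], [msg, chatbot], queue=False).then(
            run_fn,
            [chatbot, temperature, top_p, top_k, repetition_penalty, conversation_id],
            chatbot,
        )
        stop.click(stop_fn, None, None, queue=False)
    return demo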