File size: 5,426 Bytes
1be87ac
 
fc98e77
21a478e
1be87ac
 
21a478e
8037c4b
1be87ac
 
 
 
 
 
 
 
5d7db46
1be87ac
4ca2388
b597dd2
4132916
 
 
 
4ca2388
 
1be87ac
 
 
4ca2388
 
1be87ac
 
 
4ca2388
1be87ac
 
 
4ca2388
 
 
80eed0f
1be87ac
fc98e77
1be87ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ca2388
1be87ac
 
 
 
 
 
 
 
 
 
 
 
4ca2388
4facf91
c54909c
202881e
 
181f5cd
4ca2388
 
 
fc98e77
 
40b508f
4ca2388
40b508f
4ca2388
 
 
 
 
 
700ffae
 
b597dd2
700ffae
 
 
 
40b508f
700ffae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4132916
 
700ffae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40b508f
1be87ac
 
 
 
fc98e77
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# Refer to the Hugging Face llama-recipes inference API notebook for more info: https://github.com/huggingface/huggingface-llama-recipes/blob/main/inference-api.ipynb
# huggingface-llama-recipes repository: https://github.com/huggingface/huggingface-llama-recipes/tree/main

import gradio as gr
from openai import OpenAI
import os

# Hugging Face access token, read from the "myHFtoken" environment variable.
# os.getenv returns None when the variable is not set.
ACCESS_TOKEN = os.getenv("myHFtoken")

# Only report success when a token was actually found, so that a missing
# secret shows up in the logs instead of being reported as loaded.
if ACCESS_TOKEN:
    print("Access token loaded.")
else:
    print('Warning: environment variable "myHFtoken" is not set or empty.')

# OpenAI-compatible client pointed at the Hugging Face Inference API endpoint.
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)

print("Client initialized.")

# Preset system prompts, keyed by the label used to select them.
# Long prompts are split across adjacent string literals; Python joins them
# at compile time, so each value is a single string.
SYSTEM_PROMPTS = {
    "zh-HK": (
        "用香港的廣東話(Cantonese)對話. "
        "No chatty. Answer in simple but accurate way."
    ),
    "zh-TW": (
        "Chat by Traditional Chinese language of Taiwan (zh-TW). "
        "No chatty. Answer in simple but accurate way."
    ),
    "EN: General Assistant": (
        "You are a helpful, respectful and honest assistant. "
        "Always provide accurate information and admit when you're not sure "
        "about something."
    ),
    "EN: Code Helper": (
        "You are a programming assistant. "
        "Help users with coding questions, debugging, and best practices. "
        "Provide clear explanations and code examples when appropriate."
    ),
    "EN: Creative Writer": (
        "You are a creative writing assistant. "
        "Help users with storytelling, character development, and creative "
        "writing techniques. "
        "Be imaginative and encouraging."
    ),
}

def respond(
    message,
    history: list[tuple[str, str]],
    preset_prompt,
    custom_prompt,
    max_tokens,
    temperature,
    top_p,
    model_name,
):
    """Stream a chat completion for ``message``, yielding the reply as it grows.

    Builds a chat message list (system prompt, prior turns, then the new user
    message), sends it through the module-level ``client`` with
    ``stream=True``, and yields the accumulated response text after every
    chunk that carries text.

    Args:
        message: The new user message to answer.
        history: Prior turns as (user_text, assistant_text) pairs. A falsy
            element of a pair (``None`` or ``""``) is left out of the context.
        preset_prompt: Key into ``SYSTEM_PROMPTS``; used only when
            ``custom_prompt`` is blank.
        custom_prompt: Free-form system prompt. When it contains any
            non-whitespace text it overrides the preset.
        max_tokens: Forwarded as ``max_tokens`` to the completion request.
        temperature: Forwarded as ``temperature`` to the completion request.
        top_p: Forwarded as ``top_p`` to the completion request.
        model_name: Forwarded as ``model`` to the completion request.

    Yields:
        str: The response text accumulated so far.

    Raises:
        KeyError: If ``custom_prompt`` is blank and ``preset_prompt`` is not a
            key of ``SYSTEM_PROMPTS``.
    """
    print(f"Received message: {message}")
    print(f"History: {history}")

    # Treat a missing custom prompt (None) the same as a blank one.
    custom_text = custom_prompt or ""
    system_message = custom_text if custom_text.strip() else SYSTEM_PROMPTS[preset_prompt]

    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Selected model: {model_name}")

    messages = [{"role": "system", "content": system_message}]

    for user_text, assistant_text in history:
        if user_text:
            messages.append({"role": "user", "content": user_text})
            print(f"Added user message to context: {user_text}")
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
            print(f"Added assistant message to context: {assistant_text}")

    messages.append({"role": "user", "content": message})

    response = ""
    print("Sending request to OpenAI API.")

    stream = client.chat.completions.create(
        model=model_name,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        messages=messages,
    )

    # The loop variable is named `chunk` so it does not shadow the `message`
    # parameter.
    for chunk in stream:
        # A streamed chunk may arrive without any choices; there is no text
        # to append in that case.
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        # `delta.content` is optional and can be None; concatenating None
        # onto a str would raise TypeError.
        if token is None:
            continue
        print(f"Received token: {token}")
        response += token
        yield response

    print("Completed response generation.")

# Model ids offered in the "Select Model:" dropdown. The first entry is the
# dropdown's initial value; the selected id is passed on as the `model` of the
# chat completion request.
models = [
    "ngxson/MiniThinky-v2-1B-Llama-3.2",
    "meta-llama/Llama-3.2-3B-Instruct",
    "PowerInfer/SmallThinker-3B-Preview",
    "NovaSky-AI/Sky-T1-32B-Preview",
    "Qwen/QwQ-32B-Preview",
    "Qwen/Qwen2.5-Coder-32B-Instruct",
    "microsoft/Phi-3-mini-128k-instruct",
]

# Build the Gradio UI: a model picker, the chat window with its input box and
# clear button, a collapsed panel of generation settings, and the event wiring.
with gr.Blocks() as demo:
    gr.Markdown("# LLM Test")

    with gr.Row():
        model_dropdown = gr.Dropdown(choices=models, value=models[0], label="Select Model:")

    # Chat widgets are created one by one so they can be wired up manually.
    chatbot = gr.Chatbot(height=500)
    msg = gr.Textbox(show_label=False, placeholder="Enter text and press enter", container=False)
    clear = gr.Button("Clear")

    # Generation settings, collapsed by default.
    with gr.Accordion("Configuration", open=False):
        preset_prompt = gr.Dropdown(
            choices=list(SYSTEM_PROMPTS),
            value=list(SYSTEM_PROMPTS)[0],
            label="Select System Prompt:",
        )
        custom_prompt = gr.Textbox(
            value="",
            label="Custom System Prompt (leaves blank to use preset):",
            lines=2,
        )
        max_tokens = gr.Slider(minimum=1, maximum=8192, value=2048, step=1, label="Max new tokens:")
        temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, label="Temperature:")
        top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P:")

    def user(user_message, history):
        """Append the submitted text as a new unanswered turn.

        Returns an empty string (written back to the textbox) and a new
        history list ending in ``[user_message, None]``.
        """
        extended_history = history + [[user_message, None]]
        return "", extended_history

    def bot(history, preset_prompt, custom_prompt, max_tokens, temperature, top_p, model_name):
        """Fill in the assistant half of the last turn as the reply streams in.

        Yields the whole history after each partial reply produced by
        ``respond`` so the chat window can refresh progressively.
        """
        pending_turn = history[-1]
        pending_turn[1] = ""
        # Everything before the pending turn is the context for this request.
        partial_replies = respond(
            pending_turn[0],
            history[:-1],
            preset_prompt,
            custom_prompt,
            max_tokens,
            temperature,
            top_p,
            model_name,
        )
        for partial_reply in partial_replies:
            pending_turn[1] = partial_reply
            yield history

    # On submit: first record the user's turn, then stream the bot's reply.
    msg.submit(
        fn=user,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=False,
    ).then(
        fn=bot,
        inputs=[chatbot, preset_prompt, custom_prompt, max_tokens, temperature, top_p, model_dropdown],
        outputs=chatbot,
    )

    # The clear button resets the chat window by writing None to it.
    clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)

# Runs at module load, once the statements before it have executed.
print("Gradio interface initialized.")

# Launch the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    print("Launching the demo application.")
    demo.launch()