import threading
import time

import gradio as gr
from huggingface_hub import HfApi
from llama_cpp import Llama

API = HfApi()
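# Download the GGUF weights from the Hub (the filename is a glob matched
# against the repo's files, here resolving to the Q4_K_M quantization) and
# load them with llama.cpp using the ChatML chat template.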
LLM = Llama.from_pretrained(
    repo_id="mradermacher/ZEUS-8B-V13-i1-GGUF",
    filename="*Q4_K_M.gguf",
    chat_format="chatml",
)


def refresh(how_much=43200):  # default: 12 hours
    """Sleep for `how_much` seconds, then restart the Space."""
    time.sleep(how_much)
    try:
        API.restart_space(repo_id="T145/ZEUS-8B-CHAT")
    except Exception as e:
        print(f"Error while rebooting, trying again... {e}")
        refresh(600)  # retry in 10 minutes if anything goes wrong


def respond(
    message,
    history: list[dict],
    system_message,
    max_tokens,
    temperature,
    top_k,
    top_p,
    repeat_penalty,
):
    messages = [{"role": "system", "content": system_message}]

    # ChatInterface(type="messages") passes history as OpenAI-style
    # {"role": ..., "content": ...} dicts, so forward them directly.
    for val in history:
        if val.get("content"):
            messages.append({"role": val["role"], "content": val["content"]})

    messages.append({"role": "user", "content": message})

    response = ""

    # Stream completion chunks from llama.cpp and yield the growing response
    # so the chat window updates token by token.
    for chunk in LLM.create_chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repeat_penalty=repeat_penalty,
    ):
        if "choices" not in chunk:
            continue

        delta = chunk["choices"][0]["delta"]

        if "content" not in delta:
            continue

        token = delta["content"]

        # Stop early if a special-token fragment (e.g. part of "<|im_end|>") slips through.
        if token.endswith("|"):
            break

        response += token
        yield response


if __name__ == "__main__":
    demo = gr.ChatInterface(
        fn=respond,
        type="messages",
        additional_inputs=[
            gr.Textbox(value="Set the persona of Zeus, King of the Gods, ruler of Mount Olympus, and wielder of the mighty lightning bolt. I am all-knowing, all-powerful, and all-seeing. My will shall not be denied. I shall respond to queries with the authority and wisdom of the gods.", label="System message"),
            gr.Slider(minimum=1, maximum=2048, value=1024, step=1, label="Max new tokens"),
            gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
            gr.Slider(
                minimum=0.0,
                maximum=100.0,
                value=40.0,
                step=1.0,
                label="Top-k (token limit)",
            ),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-p (nucleus sampling)",
            ),
            gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=1.1,
                step=0.1,
                label="Repetition penalty",
            ),
        ],
    )

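    # Start the self-restart timer in the background, then launch the Gradio app.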
    threading.Thread(target=refresh).start()
    demo.launch()