# NOTE: this file was captured from a web file viewer. The lines originally
# here were viewer residue rather than program text: the Space status
# ("Sleeping"), "File size: 2,870 Bytes", a per-line commit-hash gutter and a
# 1-107 line-number gutter.
import threading
import time
import gradio as gr
from huggingface_hub import HfApi
from llama_cpp import Llama
# Hub coordinates of the chat model and the chat template name handed to
# llama-cpp. The filename contains a wildcard; presumably it selects the
# Q4_K_M quantization file in the repo -- confirm against the loader docs.
_MODEL_REPO_ID = "mradermacher/ZEUS-8B-V13-i1-GGUF"
_MODEL_FILENAME = "*Q4_K_M.gguf"
_MODEL_CHAT_FORMAT = "chatml"

# Hub client; refresh() uses it to restart the Space.
API = HfApi()

# Model handle used by respond() for every chat request; created once when
# the module is loaded.
LLM = Llama.from_pretrained(
    repo_id=_MODEL_REPO_ID,
    filename=_MODEL_FILENAME,
    chat_format=_MODEL_CHAT_FORMAT,
)
def refresh(how_much=43200):  # default to 12 hours
    """Restart this Space after a delay, retrying until the request succeeds.

    Sleeps ``how_much`` seconds and then asks the Hub to restart the
    ``T145/ZEUS-8B-CHAT`` Space. If that call raises, the error is printed
    and the attempt is repeated after a 10-minute wait. The function returns
    once a restart request goes through without raising.

    Retries run in a loop rather than through a recursive call, so a long
    run of failures cannot deepen the call stack until ``RecursionError``.

    Args:
        how_much: Seconds to wait before the first restart attempt.
    """
    delay = how_much
    while True:
        time.sleep(delay)
        try:
            API.restart_space(repo_id="T145/ZEUS-8B-CHAT")
        except Exception as e:  # best effort: any failure just schedules a retry
            print(f"Error while rebooting, trying again... {e}")
            delay = 600  # 10 minutes if any error happens
        else:
            return
def respond(
    message,
    history: list,
    system_message,
    max_tokens,
    temperature,
    top_k,
    top_p,
    repeat_penalty,
):
    """Stream the model's reply to ``message``, yielding the text so far.

    Builds the prompt as: the system message, the prior turns from
    ``history``, then the new user ``message``; and streams a chat
    completion from the module-level ``LLM``.

    ``history`` may use either of two shapes:

    * dict entries with ``"role"`` and ``"content"`` keys (what a chat UI
      configured with ``type="messages"`` passes), or
    * two-item ``(user_text, assistant_text)`` pairs.

    Indexing a dict entry as a pair (``entry[0]``) raises ``KeyError``, so
    both shapes are handled explicitly.

    Args:
        message: The new user message.
        history: Prior conversation turns in one of the shapes above.
        system_message: Text for the leading system turn.
        max_tokens: Forwarded to the completion call as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        top_k: Forwarded as ``top_k``.
        top_p: Forwarded as ``top_p``.
        repeat_penalty: Forwarded as ``repeat_penalty``.

    Yields:
        The accumulated response text after each streamed piece.
    """
    messages = [{"role": "system", "content": system_message}]
    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            content = entry.get("content")
            # Copy only role/content so any extra keys on the entry are not
            # forwarded to the model; skip empty turns, as for pairs below.
            if role and content:
                messages.append({"role": role, "content": content})
            continue
        user_text, assistant_text = entry[0], entry[1]
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})

    response = ""
    stream = LLM.create_chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repeat_penalty=repeat_penalty,
    )
    # The loop variable is named ``chunk`` so it does not shadow ``message``.
    for chunk in stream:
        if "choices" not in chunk:
            continue
        delta = chunk["choices"][0]["delta"]
        if "content" not in delta:
            continue
        token = delta["content"]
        # Generation is cut off at the first piece ending in "|" and that
        # piece is dropped. NOTE(review): presumably this keeps leaked
        # template markers such as "<|" out of the reply -- confirm.
        if token.endswith("|"):
            break
        response += token
        yield response
if __name__ == "__main__":
    # Extra controls shown with the chat box. Their values are passed to
    # respond() after (message, history), in this order: system_message,
    # max_tokens, temperature, top_k, top_p, repeat_penalty.
    demo = gr.ChatInterface(
        fn=respond,
        type="messages",
        additional_inputs=[
            gr.Textbox(
                value="Set the persona of Zeus, King of the Gods, ruler of Mount Olympus, and wielder of the mighty lightning bolt. I am all-knowing, all-powerful, and all-seeing. My will shall not be denied. I shall respond to queries with the authority and wisdom of the gods.",
                label="System message",
            ),
            gr.Slider(minimum=1, maximum=2048, value=1024, step=1, label="Max new tokens"),
            gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
            gr.Slider(
                minimum=0.0,
                maximum=100.0,
                value=40.0,
                step=1.0,
                label="Top-k (token limit)",
            ),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-p (nucleus sampling)",
            ),
            gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=1.1,
                step=0.1,
                label="Repetition penalty",
            ),
        ],
    )
    # Run the periodic-restart timer as a daemon thread: it spends its life
    # in a long sleep and must not keep the interpreter alive once the main
    # thread is finished (for example after the server is stopped).
    threading.Thread(target=refresh, daemon=True).start()
    demo.launch()
# (A stray "|" left over from the web file viewer was removed here.)