Spaces:

T145
/

ZEUS-8B-CHAT

Sleeping

ZEUS-8B-CHAT / app.py

T145

Model update to V13

1b5a630 9 days ago

2.87 kB

	import threading
	import time

	import gradio as gr
	from huggingface_hub import HfApi
	from llama_cpp import Llama

	# Hub client; refresh() uses it to restart this Space.
	API = HfApi()

	# Model handle shared by every chat request. It is created once at import
	# time, so starting the app blocks until the model is available.
	LLM = Llama.from_pretrained(
	    chat_format="chatml",
	    filename="*Q4_K_M.gguf",
	    repo_id="mradermacher/ZEUS-8B-V13-i1-GGUF",
	)


	def refresh(how_much=43200, retry_after=600):
	    """Restart the Space after a delay, retrying until the request succeeds.

	    Sleeps ``how_much`` seconds (default 43200, i.e. 12 hours) and then asks
	    the Hub to restart the ``T145/ZEUS-8B-CHAT`` Space. If the restart call
	    raises, the error is printed and the call is retried every
	    ``retry_after`` seconds (default 600, i.e. 10 minutes).

	    The retry is a loop rather than a recursive call so that a long run of
	    failures cannot grow the call stack until ``RecursionError``.

	    Args:
	        how_much: Seconds to wait before the first restart attempt.
	        retry_after: Seconds to wait before each retry after a failure.
	    """
	    delay = how_much
	    while True:
	        time.sleep(delay)
	        try:
	            API.restart_space(repo_id="T145/ZEUS-8B-CHAT")
	        except Exception as e:
	            # Best effort: report the failure and try again later.
	            print(f"Error while rebooting, trying again... {e}")
	            delay = retry_after
	        else:
	            return


	def _history_to_messages(history):
	    """Convert chat history into a list of role/content message dicts.

	    Two history layouts are accepted:

	    * role/content dicts, which is what the chat interface passes when it
	      is created with ``type="messages"``;
	    * ``(user, assistant)`` pairs, the layout the original annotation
	      described.

	    Entries with empty text are skipped, as before.
	    """
	    converted = []
	    for entry in history or []:
	        if isinstance(entry, dict):
	            role = entry.get("role")
	            content = entry.get("content")
	            # Forward only plain-text turns, and only the two keys the model
	            # needs. NOTE(review): non-string content (e.g. file attachments)
	            # is dropped here -- confirm that is acceptable.
	            if role and isinstance(content, str) and content:
	                converted.append({"role": role, "content": content})
	            continue

	        user_text, assistant_text = entry[0], entry[1]
	        if user_text:
	            converted.append({"role": "user", "content": user_text})
	        if assistant_text:
	            converted.append({"role": "assistant", "content": assistant_text})
	    return converted


	def respond(
	    message,
	    history: list,
	    system_message,
	    max_tokens,
	    temperature,
	    top_k,
	    top_p,
	    repeat_penalty,
	):
	    """Stream the model's reply to ``message``, yielding the text so far.

	    Args:
	        message: The new user message.
	        history: Earlier turns, either as role/content dicts or as
	            ``(user, assistant)`` pairs.
	        system_message: Text sent as the system prompt.
	        max_tokens: Upper bound on generated tokens.
	        temperature: Sampling temperature.
	        top_k: Top-k sampling limit.
	        top_p: Nucleus sampling threshold.
	        repeat_penalty: Repetition penalty.

	    Yields:
	        The accumulated response text after each streamed token.
	    """
	    messages = [{"role": "system", "content": system_message}]
	    messages.extend(_history_to_messages(history))
	    messages.append({"role": "user", "content": message})

	    # Slider values may arrive as floats; the token and top-k counts are
	    # passed on as integers.
	    stream = LLM.create_chat_completion(
	        messages,
	        max_tokens=int(max_tokens),
	        stream=True,
	        temperature=temperature,
	        top_k=int(top_k),
	        top_p=top_p,
	        repeat_penalty=repeat_penalty,
	    )

	    response = ""
	    for chunk in stream:
	        if "choices" not in chunk:
	            continue

	        # Some chunks carry no text at all; skip those.
	        token = chunk["choices"][0]["delta"].get("content")
	        if token is None:
	            continue

	        # Stop marker: a token ending in a backslash followed by "|".
	        # Written with an escaped backslash; same value as the original
	        # literal, without the invalid-escape warning.
	        if token.endswith("\\|"):
	            break

	        response += token
	        yield response


	if __name__ == "__main__":
	    # Default system prompt shown in the "System message" textbox.
	    default_system_message = (
	        "Set the persona of Zeus, King of the Gods, ruler of Mount Olympus, "
	        "and wielder of the mighty lightning bolt. I am all-knowing, "
	        "all-powerful, and all-seeing. My will shall not be denied. "
	        "I shall respond to queries with the authority and wisdom of the gods."
	    )

	    # The additional inputs are passed to respond() positionally after
	    # (message, history), so their order must match its signature.
	    demo = gr.ChatInterface(
	        fn=respond,
	        type="messages",
	        additional_inputs=[
	            gr.Textbox(value=default_system_message, label="System message"),
	            gr.Slider(minimum=1, maximum=2048, value=1024, step=1, label="Max new tokens"),
	            gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
	            gr.Slider(
	                minimum=0.0,
	                maximum=100.0,
	                value=40.0,
	                step=1.0,
	                label="Top-k (token limit)",
	            ),
	            gr.Slider(
	                minimum=0.1,
	                maximum=1.0,
	                value=0.95,
	                step=0.05,
	                label="Top-p (nucleus sampling)",
	            ),
	            gr.Slider(
	                minimum=0.0,
	                maximum=2.0,
	                value=1.1,
	                step=0.1,
	                label="Repetition penalty",
	            ),
	        ],
	    )

	    # Run the periodic-restart timer as a daemon thread so that its long
	    # sleep cannot keep the process alive once the app itself has stopped.
	    threading.Thread(target=refresh, daemon=True).start()
	    demo.launch()