from kani import Kani
from kani.engines.ctransformers.llama2 import LlamaCTransformersEngine
import gradio as gr

title = """

kani (カニ) demo

""" description = """

This is the kani chat demo with llama v2 ggml (cpu only!)

""" article = """
""" ai = None def user(message, history): history = history or [] # Append the user's message to the conversation history history.append([message, ""]) return "", history async def chat(history, limit: int = 1024, temp: float = 0.8, top_k: int = 40, top_p: float = 0.9, repeat_penalty: float = 1.1): history = history or [] message = history[-1][0] history[-1][1] = "" global ai if ai is None: model_id = "TheBloke/Llama-2-7B-Chat-GGML" model_file = "llama-2-7b-chat.ggmlv3.q3_K_M.bin" engine = LlamaCTransformersEngine(model_id, model_file, max_new_tokens = int(limit), temperature = float(temp), top_k = int(top_k), top_p = float(top_p), repetition_penalty = float(repeat_penalty), batch_size = 512, ) ai = Kani( engine, system_prompt=( "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. " " Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content." " Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not" " make any sense, or is not factually coherent, explain why instead of answering something not correct. If you" " don't know the answer to a question, please don't share false information." ), ) async for output in ai.full_round_str( message ): answer = output history[-1][1] += answer # stream the response yield history, history def clear_state(history, chat_message): history = [] global ai ai = None return history, gr.update(placeholder='Chat here') def start(): with gr.Blocks() as demo: gr.Markdown(title) gr.Markdown(description) gr.Markdown(article) with gr.Row(): with gr.Column(scale=0.5): max_tokens = gr.Slider(1, 1024, label="Max Tokens", step=1, value=512) temperature = gr.Slider(0.0, 1.0, label="Temperature", step=0.05, value=0.8) top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.05, value=0.95) top_k = gr.Slider(0, 100, label="Top K", step=1, value=40) repeat_penalty = gr.Slider(0.0, 2.0, label="Repetition Penalty", step=0.1, value=1.1) with gr.Column(): chatbot = gr.Chatbot(label='Llama v2') message = gr.Textbox(label='User', placeholder='Chat here') history = gr.State() with gr.Row(): submit = gr.Button(value="Send message", variant="secondary").style(full_width=True) clear = gr.Button(value="Reset", variant="secondary").style(full_width=False) clear.click(clear_state, inputs=[history, message], outputs=[history, message], queue=False) submit_click_event = submit.click( fn=user, inputs=[message, history], outputs=[message, history], queue=True ).then( fn=chat, inputs=[history, max_tokens, temperature, top_p, top_k, repeat_penalty], outputs=[chatbot, history], queue=True ) message_submit_event = message.submit( fn=user, inputs=[message, history], outputs=[message, history], queue=True ).then( fn=chat, inputs=[history, max_tokens, temperature, top_p, top_k, repeat_penalty], outputs=[chatbot, history], queue=True ) demo.launch(enable_queue=True) start()