"""Gradio text-generation demo backed by a local llama.cpp Pygmalion model.

On first run, downloads the GGML model file; then loads it with
llama-cpp-python and serves a two-input (prompt, stop-sequence) -> text
Gradio interface.
"""

from llama_cpp import Llama
import gradio as gr
import random
import requests
import os

MODEL_PATH = "ggml-model-q4_0.bin"
# NOTE(review): the URL serves a q5_1 Pygmalion-7B file, yet it is saved under
# a q4_0 filename -- looks like a leftover; confirm before renaming the file.
MODEL_URL = (
    "https://huggingface.co/birdup/pygmalion-7b-q5_1-ggml-v5/resolve/main/"
    "pygmalion-7b-q5_1-ggml-v5.bin"
)

if not os.path.exists(MODEL_PATH):
    # Stream the multi-GB download to disk instead of buffering the whole
    # response in memory, close the file handle deterministically, and fail
    # loudly on HTTP errors rather than saving an error page as the "model".
    with requests.get(MODEL_URL, stream=True) as resp:
        resp.raise_for_status()
        with open(MODEL_PATH, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1 << 20):
                f.write(chunk)
else:
    print("Model already exists, skipping redownload")

print("Loading model...")
llm = Llama(
    model_path=MODEL_PATH,
    seed=random.randint(1, 9999999),  # fresh seed per launch for varied output
    n_ctx=2048,
    n_threads=3,
)
print("Model loaded.")


def generate(prompt, stop):
    """Run one completion against the loaded model.

    Args:
        prompt: Raw prompt text. Backslash escapes typed in the UI (e.g. a
            literal ``\\n``) are interpreted via ``unicode_escape`` so users
            can enter newlines from a plain textbox.
        stop: Optional stop sequence with the same escape handling; an empty
            string means "no stop sequence".

    Returns:
        str: The generated completion text.
    """
    output = llm(
        bytes(prompt, "utf-8").decode("unicode_escape"),
        max_tokens=64,
        temperature=0.75,
        top_p=0.7,
        # Bug fix: the original tested `len(stop) > 1`, which silently
        # ignored single-character stop sequences; any non-empty stop now
        # applies.
        stop=[bytes(stop, "utf-8").decode("unicode_escape")] if stop else None,
    )
    print(output)
    return output["choices"][0]["text"]


app = gr.Interface(fn=generate, inputs=["text", "text"], outputs="text")
app.launch()