Spaces:
Runtime error
Runtime error
File size: 3,280 Bytes
1975e5f 2a078b6 a408e8f d3776a2 385e0d5 d3776a2 1975e5f a408e8f 1975e5f a408e8f e823295 a408e8f e823295 a408e8f 22d22f7 d3776a2 bf0fcb3 d3776a2 385e0d5 d3776a2 385e0d5 9e0025f 1975e5f bf0fcb3 84981a1 2a078b6 84981a1 d3776a2 a408e8f d3776a2 a408e8f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, TextStreamer
import gradio as gr
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# client = InferenceClient("EITD/lora_model", token=os.getenv("HF_TOKEN"))
# Hub repository id shared by the PEFT model and its tokenizer, kept in one
# place so the two loads below cannot drift apart.
_MODEL_REPO = "EITD/lora_model_1"  # YOUR MODEL YOU USED FOR TRAINING

# Loaded once at import time and reused by every chat request.
model = AutoPeftModelForCausalLM.from_pretrained(_MODEL_REPO, load_in_4bit=False)
tokenizer = AutoTokenizer.from_pretrained(_MODEL_REPO)
# messages = [{"role": "user", "content": "Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8,"},]
# inputs = tokenizer.apply_chat_template(
# messages,
# tokenize = True,
# add_generation_prompt = True, # Must add for generation
# return_tensors = "pt",
# )
# outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
# temperature = 1.5, min_p = 0.1)
# print(tokenizer.batch_decode(outputs))
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Generate the assistant's reply to ``message`` with the local model.

    Args:
        message: The newest user message.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs;
            empty/falsy entries inside a pair are skipped.
        system_message: Text placed in the leading ``system`` chat message.
        max_tokens: Upper bound on newly generated tokens (coerced to int,
            since UI sliders may deliver a float).
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The decoded reply text, with the prompt and special tokens removed.
    """
    # Rebuild the whole conversation in chat-template message format.
    messages = [{"role": "system", "content": system_message}]
    for turn in history:
        user_text, assistant_text = turn[0], turn[1]
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    )

    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=int(max_tokens),
        use_cache=True,
        # Sampling must be switched on explicitly, otherwise temperature and
        # top_p can be ignored in favour of greedy decoding.
        do_sample=True,
        temperature=temperature,
        # The UI control is a top-p slider; feeding it to `min_p` (as before)
        # gives the value a completely different meaning.
        top_p=top_p,
    )

    # `generate` returns prompt + continuation for causal LMs. Decode only the
    # continuation instead of splitting the text on the word "assistant",
    # which would truncate any reply that itself contains that word.
    prompt_length = input_ids.shape[-1]
    response = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()

    print(response)  # keep the reply visible in the application logs
    # The chat UI displays whatever this function returns; previously the
    # reply was only printed, so the UI received None.
    return response
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
# Extra controls for the chat UI. They are listed in the same order as the
# trailing parameters of `respond` (system_message, max_tokens, temperature,
# top_p).
_system_message_box = gr.Textbox(
    value="You are a friendly Chatbot.",
    label="System message",
)
_max_tokens_slider = gr.Slider(
    minimum=1,
    maximum=2048,
    value=512,
    step=1,
    label="Max new tokens",
)
_temperature_slider = gr.Slider(
    minimum=0.1,
    maximum=4.0,
    value=0.7,
    step=0.1,
    label="Temperature",
)
_top_p_slider = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.95,
    step=0.05,
    label="Top-p (nucleus sampling)",
)

# Chat application object; `respond` produces each reply.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        _system_message_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
    ],
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|