from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, TextStreamer
import gradio as gr
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# client = InferenceClient("EITD/lora_model", token=os.getenv("HF_TOKEN"))
class CustomTextStreamer(TextStreamer):
    """TextStreamer subclass that accumulates the decoded output so the full
    response can be read back after generation. `on_finalized_text` is the
    hook TextStreamer calls with each decoded chunk of text."""

    def __init__(self, tokenizer):
        # skip_prompt=True keeps the chat prompt out of the captured text
        super().__init__(tokenizer, skip_prompt=True, skip_special_tokens=True)
        self.generated_text = ""

    def on_finalized_text(self, text, stream_end=False):
        super().on_finalized_text(text, stream_end=stream_end)
        self.generated_text += text
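# AutoPeftModelForCausalLM loads the base model referenced in the adapter
# config and applies the LoRA weights from the adapter repo on top of it.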
model = AutoPeftModelForCausalLM.from_pretrained(
    "EITD/lora_model_1",  # YOUR MODEL YOU USED FOR TRAINING
    load_in_4bit=False,
)
tokenizer = AutoTokenizer.from_pretrained("EITD/lora_model_1")
# messages = [{"role": "user", "content": "Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8,"},]
# inputs = tokenizer.apply_chat_template(
# messages,
# tokenize = True,
# add_generation_prompt = True, # Must add for generation
# return_tensors = "pt",
# )
# outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
# temperature = 1.5, min_p = 0.1)
# print(tokenizer.batch_decode(outputs))
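# The commented block above is a quick local sanity check of the fine-tuned
# adapter outside the Gradio app; respond() below uses the same
# chat-template + generate flow.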
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Rebuild the conversation in the chat-message format expected by the
    # tokenizer's chat template.
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    response = ""

    # for message in client.chat_completion(
    #     messages,
    #     max_tokens=max_tokens,
    #     stream=True,
    #     temperature=temperature,
    #     top_p=top_p,
    # ):
    #     token = message.choices[0].delta.content
    #     response += token
    #     yield response

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # must be added for generation
        return_tensors="pt",
    )
    custom_streamer = CustomTextStreamer(tokenizer)
    model.generate(
        input_ids=inputs,
        streamer=custom_streamer,
        max_new_tokens=max_tokens,
        use_cache=True,
        temperature=temperature,
        top_p=top_p,
    )
    # generate() blocks until it finishes; stream the captured text back to
    # the UI character by character once it is available.
    for token in custom_streamer.generated_text:
        response += token
        yield response
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
if __name__ == "__main__":
    demo.launch()