from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, TextStreamer
import gradio as gr
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# client = InferenceClient("EITD/lora_model", token=os.getenv("HF_TOKEN"))

model = AutoPeftModelForCausalLM.from_pretrained(
    "EITD/lora_model_1",  # the LoRA adapter produced during training
    load_in_4bit=False,
)
tokenizer = AutoTokenizer.from_pretrained("EITD/lora_model_1")
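# Optional: if GPU memory is tight, the adapter could instead be loaded in 4-bit.
# This is only a sketch, assuming a CUDA-capable GPU and the bitsandbytes package
# are available; the app above deliberately loads in full precision.
#
# import torch
# from transformers import BitsAndBytesConfig
#
# model = AutoPeftModelForCausalLM.from_pretrained(
#     "EITD/lora_model_1",
#     quantization_config=BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_compute_dtype=torch.float16,
#     ),
#     device_map="auto",
# )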

# messages = [{"role": "user", "content": "Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8,"},]

# inputs = tokenizer.apply_chat_template(
#     messages,
#     tokenize = True,
#     add_generation_prompt = True, # Must add for generation
#     return_tensors = "pt",
# )

# outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
#                         temperature = 1.5, min_p = 0.1)

# print(tokenizer.batch_decode(outputs))

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    # response = ""

    # for message in client.chat_completion(
    #     messages,
    #     max_tokens=max_tokens,
    #     stream=True,
    #     temperature=temperature,
    #     top_p=top_p,
    # ):
    #     token = message.choices[0].delta.content

    #     response += token
    #     yield response
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # must be set so the model generates a reply
        return_tensors="pt",
    )

    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=max_tokens,
        use_cache=True,
        do_sample=True,          # sampling must be enabled for temperature/top_p to take effect
        temperature=temperature,
        top_p=top_p,             # the UI slider is top-p, so pass it as top_p rather than min_p
    )
    
    # text_streamer = TextStreamer(tokenizer, skip_prompt = True)
    # model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = max_tokens,
    #                 use_cache = True, temperature = temperature, min_p = top_p)
    
    # generate() returns the prompt followed by the completion; keep only the new tokens
    # and yield a single string (ChatInterface expects text, not a list)
    response = tokenizer.batch_decode(
        outputs[:, inputs.shape[-1]:], skip_special_tokens=True
    )[0]
    yield response

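# Note: the yield above emits the whole reply at once. For token-by-token streaming,
# one option (a sketch, not wired into the interface above) is transformers'
# TextIteratorStreamer driven by a background thread; `respond_streaming` below is a
# hypothetical drop-in replacement for `respond`.
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
#     messages = [{"role": "system", "content": system_message}]
#     for user_msg, bot_msg in history:
#         if user_msg:
#             messages.append({"role": "user", "content": user_msg})
#         if bot_msg:
#             messages.append({"role": "assistant", "content": bot_msg})
#     messages.append({"role": "user", "content": message})
#     inputs = tokenizer.apply_chat_template(
#         messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
#     )
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     thread = Thread(target=model.generate, kwargs=dict(
#         input_ids=inputs, streamer=streamer, max_new_tokens=max_tokens,
#         do_sample=True, temperature=temperature, top_p=top_p,
#     ))
#     thread.start()
#     response = ""
#     for new_text in streamer:
#         response += new_text
#         yield response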

"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
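# The ChatInterface can be customized further; as a sketch based on the gradio docs
# linked above, a title and starter examples could be added (the strings here are
# illustrative, not part of the original app):
#
# demo = gr.ChatInterface(
#     respond,
#     title="EITD/lora_model_1 demo",
#     examples=["Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8,"],
#     additional_inputs=[...],  # same system-message box and sliders as above
# )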


if __name__ == "__main__":
    demo.launch()