from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, TextStreamer
import gradio as gr
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# client = InferenceClient("EITD/lora_model", token=os.getenv("HF_TOKEN"))

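# CustomTextStreamer hooks into generation to collect the decoded text so the
# respond() generator below can yield it back to the Gradio chat UI.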
class CustomTextStreamer(TextStreamer):
    def __init__(self, tokenizer):
        # skip_prompt=True keeps the prompt (system/user turns) out of the
        # collected text so only the newly generated reply is returned.
        super().__init__(tokenizer, skip_prompt=True)
        self.generated_text = ""

    def on_finalized_text(self, text, stream_end=False):
        # TextStreamer invokes this with each chunk of decoded text; accumulate it.
        super().on_finalized_text(text, stream_end=stream_end)
        self.generated_text += text

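# Load the fine-tuned LoRA adapter together with its base model; the tokenizer
# comes from the same repository so the chat template matches training.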
model = AutoPeftModelForCausalLM.from_pretrained(
    "EITD/lora_model_1",  # the model you used for training
    load_in_4bit = False,
)
tokenizer = AutoTokenizer.from_pretrained("EITD/lora_model_1")

# messages = [{"role": "user", "content": "Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8,"},]

# inputs = tokenizer.apply_chat_template(
#     messages,
#     tokenize = True,
#     add_generation_prompt = True, # Must add for generation
#     return_tensors = "pt",
# )

# outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
#                         temperature = 1.5, min_p = 0.1)

# print(tokenizer.batch_decode(outputs))

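# Chat callback for gr.ChatInterface: rebuilds the conversation as chat
# messages, applies the model's chat template, generates locally, and yields
# the reply incrementally so the chat window updates as text arrives.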
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # for message in client.chat_completion(
    #     messages,
    #     max_tokens=max_tokens,
    #     stream=True,
    #     temperature=temperature,
    #     top_p=top_p,
    # ):
    #     token = message.choices[0].delta.content

    #     response += token
    #     yield response
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    )
    
    custom_streamer = CustomTextStreamer(tokenizer)
    model.generate(input_ids = inputs, streamer = custom_streamer, max_new_tokens = max_tokens,
                    use_cache = True, temperature = temperature, top_p = top_p)

    # generate() returns once the streamer has collected the full reply; yield it
    # character by character so the chat window still fills in gradually.
    for ch in custom_streamer.generated_text:
        response += ch
        yield response


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)


if __name__ == "__main__":
    demo.launch()