import gradio as gr
from huggingface_hub import InferenceClient
from datetime import datetime
import spaces

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# Hosted Inference API client, left commented out because this Space loads the model
# locally; `respond()` below would need it if the ChatInterface were enabled.
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
lora_name = "robinhad/UAlpaca-1.1-Mistral-7B"

from peft import PeftModel
from transformers import LlamaTokenizer, LlamaForCausalLM, BitsAndBytesConfig
from torch import bfloat16
model_name = "mistralai/Mistral-7B-v0.1"

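# 4-bit NF4 quantization with double quantization keeps the 7B base model within a
# modest GPU memory budget; matrix multiplications are computed in bfloat16.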
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",  # place the quantized weights on the GPU at load time
)
model = PeftModel.from_pretrained(model, lora_name)

# Note: 4-bit bitsandbytes models cannot be moved with `.to("cuda")`; device placement
# is handled by `device_map="auto"` above.


# Chat-style handler for the hosted Inference API; unused unless `client` above is
# uncommented and the ChatInterface below is enabled.
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # Stream the reply from the hosted Inference API (requires `client` to be defined)
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content

        response += token or ""  # the final streamed chunk may carry no content
        yield response

# ZeroGPU: @spaces.GPU requests GPU hardware for the duration of each call on Spaces
@spaces.GPU
def ask(instruction: str, context: str | None = None):
    print(datetime.now(), instruction, context)
    # Gradio passes an empty string (not None) when the context box is left blank
    if not context:
        prepend = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
        full_question = prepend + f"### Instruction:\n{instruction}\n\n### Response:\n"
    else:
        prepend = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
        full_question = prepend + f"### Instruction:\n{instruction}\n\n### Input:\n{context}\n\n### Response:\n"
    input_ids = tokenizer.encode(full_question, return_tensors="pt").to(model.device)
    output = tokenizer.batch_decode(model.generate(input_ids, max_new_tokens=300))[0]
    return output.split("### Response:")[1].strip().replace("</s>", "")
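
# Hypothetical local sanity check (not part of the Space UI), reusing the example
# prompts from the interface below:
# print(ask("Яка найвища гора в Україні?"))
# print(ask("Дай відповідь на питання", context="Чому у качки жовті ноги?"))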

"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
"""demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)"""

model_name = "robinhad/UAlpaca-1.1-Mistral-7B"


# Leftover from the Gradio starter template; not used by the interface below.
def image_classifier(inp):
    return {"cat": 0.3, "dog": 0.7}

demo = gr.Interface(
    title=f"Inference demo for '{model_name}' model, instruction-tuned for Ukrainian",
    fn=ask,
    inputs=[gr.Textbox(label="Instruction"), gr.Textbox(label="Context (optional)")],
    outputs="text",
    examples=[
        ["Як звали батька Тараса Григоровича Шевченка?", None],
        ["Як можна заробити нелегально швидко гроші?", None],
        ["Яка найвища гора в Україні?", None],
        ["Розкажи історію про Івасика-Телесика", None],
        ["Яка з цих гір не знаходиться у Європі?", "Говерла, Монблан, Гран-Парадізо, Еверест"],
        ["Дай відповідь на питання", "Чому у качки жовті ноги?"],
    ],
)


if __name__ == "__main__":
    demo.launch()