File size: 4,117 Bytes
4b226f0
 
c121866
4b226f0
 
 
 
 
 
 
 
 
7daf8ff
c121866
4b226f0
 
 
 
 
 
 
 
 
 
 
 
c121866
4b226f0
 
 
 
c121866
4b226f0
c121866
4b226f0
 
 
 
c121866
4b226f0
 
 
 
 
 
 
c121866
4b226f0
 
 
 
c121866
4b226f0
c121866
4b226f0
 
 
 
 
 
 
c121866
4b226f0
 
 
c121866
4b226f0
 
 
 
 
 
 
 
 
c121866
4b226f0
 
 
7daf8ff
4b226f0
 
 
 
c121866
4b226f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c121866
4b226f0
 
 
 
c121866
 
4b226f0
 
 
c121866
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138

import spaces  # must be imported at the very top, before the other imports
import gradio as gr
import os

# Hugging Face access token, read from the environment (None when unset).
hf_token = os.environ.get("HF_API_TOKEN")

# Hub repository name of the base model.
base_model_name = "larry1129/WooWoof_AI_Vision_merged_16bit_3b"

# Hub repository name of the adapter model.
# NOTE(review): this is the same repository as base_model_name — confirm that
# it really contains adapter weights in addition to the merged model.
adapter_model_name = "larry1129/WooWoof_AI_Vision_merged_16bit_3b"

# Module-level cache for the model and tokenizer; both stay None until the
# first request loads them.
model = tokenizer = None

# Prompt builder.
def generate_prompt(instruction, input_text=""):
    """Return the prompt text for *instruction* and optional *input_text*.

    The prompt is a sequence of lines: an ``### Instruction:`` header followed
    by the instruction, an ``### Input:`` header followed by the input (only
    when *input_text* is truthy), and a final ``### Response:`` header. The
    result always ends with a newline.
    """
    sections = ["### Instruction:", f"{instruction}"]
    if input_text:
        sections.extend(["### Input:", f"{input_text}"])
    sections.append("### Response:")
    return "\n".join(sections) + "\n"

# Load the tokenizer and the 4-bit quantized base model with its adapter.
def _load_model_and_tokenizer():
    """Load and return ``(model, tokenizer)`` ready for generation.

    Installs ``bitsandbytes`` on demand when it cannot be found, loads the
    tokenizer and the 4-bit (NF4, double-quantized, float16 compute) base
    model named by ``base_model_name``, attaches the adapter named by
    ``adapter_model_name``, uses the EOS token as the pad token and puts the
    model in evaluation mode.

    Nothing is stored globally here; the caller caches the result only after
    every step has succeeded.
    """
    import importlib
    import importlib.util
    import subprocess
    import sys

    # Check whether bitsandbytes is installed; install it if it is missing.
    if importlib.util.find_spec("bitsandbytes") is None:
        # Run pip through the current interpreter so the package is installed
        # into the environment this process actually imports from.
        subprocess.call(
            [sys.executable, "-m", "pip", "install", "--upgrade", "bitsandbytes"]
        )
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()

    # Libraries that need the GPU are imported inside the function.
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

    from peft import PeftModel

    # Quantization configuration.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    # Load the tokenizer.
    loaded_tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_auth_token=hf_token)
    print("分词器加载成功。")

    # Load the base model.
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        device_map="auto",
        use_auth_token=hf_token,
        trust_remote_code=True
    )
    print("基础模型加载成功。")

    # Load the adapter on top of the base model.
    loaded_model = PeftModel.from_pretrained(
        base_model,
        adapter_model_name,
        torch_dtype=torch.float16,
        use_auth_token=hf_token
    )
    print("适配器模型加载成功。")

    # Use the EOS token as the pad token.
    loaded_tokenizer.pad_token = loaded_tokenizer.eos_token
    loaded_model.config.pad_token_id = loaded_tokenizer.pad_token_id

    # Switch to evaluation mode.
    loaded_model.eval()
    print("模型已切换到评估模式。")

    return loaded_model, loaded_tokenizer


# Response generation, decorated with @spaces.GPU.
@spaces.GPU(duration=40)  # suggestion from the original author: raise duration to 120
def generate_response(instruction, input_text=""):
    """Generate a text answer for *instruction* with the cached model.

    The model and tokenizer are loaded on the first call and cached in the
    module-level ``model`` / ``tokenizer`` globals for later calls.

    Args:
        instruction: The instruction placed in the prompt.
        input_text: Optional extra input for the prompt. Defaults to an empty
            string so the function can be called with an instruction only.

    Returns:
        The decoded output after the last ``### Response:`` marker, with
        surrounding whitespace stripped.

    Raises:
        Exception: Any error raised while loading the model is printed and
            re-raised unchanged.
        ValueError: If the model or tokenizer is still missing after loading.
    """
    global model, tokenizer

    # Imported inside the function, like the other GPU-dependent libraries.
    import torch

    if model is None:
        print("开始加载模型...")
        try:
            # Assign both globals together, and only after the whole load
            # succeeded, so a failed load never leaves a half-configured
            # model cached for later calls.
            model, tokenizer = _load_model_and_tokenizer()
        except Exception as e:
            print("加载模型时出错:", e)
            raise

    # Make sure both the model and the tokenizer are available.
    if model is None or tokenizer is None:
        print("模型或分词器未正确加载。")
        raise ValueError("模型或分词器未正确加载。")

    # Build the prompt and move the encoded inputs to the model's device.
    prompt = generate_prompt(instruction, input_text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=128,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The decoded text is split on the response marker and only the part
    # after its last occurrence is returned.
    response = response.split("### Response:")[-1].strip()
    return response

# Build the Gradio interface.
iface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=2, placeholder="Instruction", label="Instruction"),
        # generate_response accepts (instruction, input_text); expose the
        # second argument too. Leaving it empty yields a prompt without an
        # "### Input:" section.
        gr.Textbox(lines=2, placeholder="Input (optional)", label="Input"),
    ],
    outputs="text",
    title="WooWoof AI",
    description="Based on LLAMA 3.1 for pet related",
    allow_flagging="never"
)

# Launch the Gradio interface.
iface.launch(share=True)