import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig


class InferenceFineTuning:
    def __init__(self, model_path):
        peft_model_id = f"hyang0503/{model_path}"
        config = PeftConfig.from_pretrained(peft_model_id)
        # Load the base model in 4-bit NF4 with double quantization (QLoRA-style loading).
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            config.base_model_name_or_path,
            quantization_config=bnb_config,
            device_map="auto",
        )
        # Attach the fine-tuned LoRA adapter to the quantized base model.
        self.model = PeftModel.from_pretrained(self.model, peft_model_id)
        # self.tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
        self.tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model.eval()

    def generate(self, q):
        # Differs from the practice notebook.
        # Build the KoAlpaca-style prompt ("### 질문:" = question, "### 답변:" = answer).
        inputs = self.tokenizer(
            f"### 질문: {q}\n\n### 답변:",
            return_tensors="pt",
            return_token_type_ids=False,
        ).to("cuda")
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            eos_token_id=2,
        )
        # Return (not just print) the decoded text so Gradio can display it.
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)


ifg = InferenceFineTuning("qlora-koalpaca")
iface = gr.Interface(fn=ifg.generate, inputs="text", outputs="text")
iface.launch()
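
# --- Optional sketch (not wired into the Interface above) ---
# generate() returns the full decoded sequence, prompt included. The helper
# below is a minimal sketch of an answer-only variant; the name
# generate_answer_only is hypothetical. If used, define it before gr.Interface
# and pass fn=generate_answer_only instead of fn=ifg.generate.
def generate_answer_only(q):
    inputs = ifg.tokenizer(
        f"### 질문: {q}\n\n### 답변:",
        return_tensors="pt",
        return_token_type_ids=False,
    ).to("cuda")
    outputs = ifg.model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        eos_token_id=2,
    )
    # Slice off the prompt tokens so only the generated answer is decoded.
    prompt_len = inputs["input_ids"].shape[1]
    return ifg.tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)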