Training procedure
Framework versions
Loading Model + Adapters
import torch
from peft import PeftModel, PeftModelForCausalLM, LoraConfig
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
# Base checkpoint that the LoRA adapter below is applied to.
model_id = "meta-llama/Llama-2-7b-hf"

# The prediction helper further down calls a module-level `tokenizer`, but the
# snippet never created one; load the tokenizer that belongs to the base model.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)

# Load the base weights in bfloat16 and let `device_map="auto"` decide where
# the layers are placed. `use_auth_token=True` reuses the locally stored
# Hugging Face credentials.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    use_auth_token=True,
    torch_dtype=torch.bfloat16,
    use_cache=True,
    device_map="auto",
)

# NOTE(review): per the Hugging Face LlamaConfig documentation, values > 1
# switch on an experimental tensor-parallel code path; 1 is the default.
model.config.pretraining_tp = 1

# Wrap the base model with the fine-tuned adapter weights.
model = PeftModel.from_pretrained(model, 'elhindih/lora-checkpoint-2224')
Faster Inference
from datasets import Dataset
# Shared opening sentence for every assistant-style system prompt.
BASE = "You are a helpful AI assistant. "

# System prompt for answering a question with step-by-step reasoning.
prompt_answer = (
    f"{BASE}Given the context and the question at the end, your task is to "
    "provide most relevant answer while explaining your thoughts step by step."
)

# System prompt asking for a direct answer.
prompt_followup0 = (
    f"{BASE}Given the context and a question your task is to provide relevant answer."
)

# System prompt asking for a follow-up *question* only.
prompt_followup1 = (
    f"{BASE}Given the context of a conversation and a followup question, "
    "your task is to generate most relevant question about the context. "
    "Only generate question and not the answer."
)

# System prompt for summarisation; it does not start with BASE.
prompt_summary = (
    "You are a helpful AI assistant who is expert at summarizing an input "
    "paragraph. You provide most relevant summary to the paragraph while "
    "keeping in view the guidelines at the end of the context."
)

# Maps a sample's task name to the system prompt used for it.
# NOTE(review): "followup1" points at prompt_followup0 while "followup0"
# points at prompt_followup1. This may mirror the training data, so it is
# kept as-is; confirm before changing.
sys_prompt = {
    "followup1": prompt_followup0,
    "followup0": prompt_followup1,
    "followup2": prompt_followup1,
    "answer1": prompt_answer,
    "answer0": prompt_answer,
    "answer2": prompt_answer,
    "followup3": prompt_followup1,
    "augment": prompt_summary,
    "answer3": prompt_answer,
}
def format_instruction(sample):
    """Build the instruction/input/response prompt text for one sample.

    Args:
        sample: Mapping with a ``'name'`` entry (used as a key into
            ``sys_prompt``) and a ``'prompt'`` entry (inserted as the input).

    Returns:
        The prompt string, ending right after the ``### Response:`` line.

    Raises:
        KeyError: If ``sample['name']`` is not a key of ``sys_prompt``, or if
            either entry is missing from a dict ``sample``.
    """
    instruction_text = sys_prompt[sample['name']]
    input_text = sample['prompt']
    return f"""### Instruction:
{instruction_text}
### Input:
{input_text}
### Response:
"""
# Start from the generation settings attached to the model and override
# the decoding options used for inference.
generation_config = model.generation_config
generation_config.max_new_tokens = 300
# temperature / top_p are sampling parameters: they only take effect when
# sampling is enabled, so turn it on explicitly instead of relying on whatever
# default the loaded checkpoint happens to carry.
generation_config.do_sample = True
generation_config.temperature = 0.7
generation_config.top_p = 0.95
generation_config.num_return_sequences = 1
from time import perf_counter
# Device the tokenized inputs are moved to before generation.
device = "cuda:0"


def predict_tokens(item):
    """Generate a completion for ``item`` and report how long it took.

    Relies on the module-level ``tokenizer``, ``model``,
    ``generation_config`` and ``device``.

    Args:
        item: Prompt text passed to the tokenizer.

    Returns:
        A dict ``{"pred_completion": text}`` where ``text`` is the first
        generated sequence decoded with special tokens kept.
    """
    st = perf_counter()
    tokenized = tokenizer(item, return_tensors="pt")
    with torch.inference_mode():
        outputs = model.generate(
            # return_tensors="pt" already yields integer tensors, so they are
            # moved to the device directly rather than being rebuilt through
            # the legacy torch.Tensor constructor and cast back to int64.
            input_ids=tokenized["input_ids"].to(device),
            attention_mask=tokenized["attention_mask"].to(device),
            generation_config=generation_config,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )
    completion = tokenizer.decode(outputs[0], skip_special_tokens=False)
    print(f"Time taken to generate: {perf_counter() - st:.2f}")
    return {"pred_completion": completion}