Llama-2-Qlora

This model is fine-tuned with LLaMA-2 with 8 Nvidia A100-80G GPUs using 3,000,000 groups of conversations in the context of mathematics by students and facilitators on Algebra Nation (https://www.mathnation.com/). Llama-2-Qlora consists of 32 layers and over 7 billion parameters, consuming up to 13.5 gigabytes of disk space. Researchers can experiment with and finetune the model to help construct dedicated LLMs for downstream tasks (e.g., classification) related to K-12 math learning.

Here is how to use it with texts in HuggingFace

import torch
import transformers
from transformers import LlamaTokenizer, LlamaForCausalLM
tokenizer = LlamaTokenizer.from_pretrained("uf-aice-lab/Llama-2-QLoRA")
mdoel = LlamaForCausalLM.from_pretrained(
        "uf-aice-lab/Llama-2-QLoRA",
        load_in_8bit=False,
        torch_dtype=torch.float16,
        device_map="auto",
    )
def generate_prompt(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input}
### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:"""

def evaluate(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    **kwargs,
):
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    return output.split("### Response:")[1].strip()
instruction = 'write your instruction here'
inputs = 'write your inputs here'
output= evaluate(instruction,
                 input=inputs,
                 temperature=0.1,#change the parameters by yourself
                 top_p=0.75,
                 top_k=40,
                 num_beams=4,
                  max_new_tokens=128,)
Downloads last month
16
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.