vicuna-7b / README.md
helloollel's picture
Update README.md
da46067

vicuna-7b

This README provides a step-by-step guide to set up and run the FastChat application with the required dependencies and model.

Prerequisites

Before you proceed, ensure that you have git installed on your system.

Installation

Follow the steps below to install the required packages and set up the environment.

  1. Upgrade pip:
python3 -m pip install --upgrade pip
  2. Install accelerate:
python3 -m pip install accelerate
  3. Install bitsandbytes using one of the following two options:

3.1 Install with pip:

python3 -m pip install bitsandbytes

3.2 Alternatively, clone the bitsandbytes repository and build it from source:

git clone https://github.com/TimDettmers/bitsandbytes.git
cd bitsandbytes
CUDA_VERSION=118 make cuda11x
python3 -m pip install .
cd ..

Use the following command to find your CUDA version and set CUDA_VERSION accordingly (for example, CUDA 11.8 corresponds to CUDA_VERSION=118):

nvcc --version
  4. Clone the FastChat repository and install it:
git clone https://github.com/lm-sys/FastChat.git
cd FastChat
python3 -m pip install -e .
cd ..
  5. Install git-lfs:
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
sudo apt-get install git-lfs
git lfs install
  6. Clone the vicuna-7b model:
git clone https://huggingface.co/helloollel/vicuna-7b

Running FastChat

After completing the installation, you can run FastChat with the following command:

python3 -m fastchat.serve.cli --model-path ./vicuna-7b

This will start the FastChat command-line chat interface using the vicuna-7b model.

Running in a Notebook

import argparse
import time

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer

from fastchat.conversation import conv_templates, SeparatorStyle
from fastchat.serve.monkey_patch_non_inplace import replace_llama_attn_with_non_inplace_operations


def load_model(model_name, device, num_gpus, load_8bit=False):
    """Load a causal language model and its tokenizer for ``device``.

    Args:
        model_name: Model identifier or local path handed to
            ``from_pretrained``.
        device: One of ``"cpu"``, ``"cuda"`` or ``"mps"``.
        num_gpus: GPU count (int or numeric string) or the string
            ``"auto"``. Only consulted when ``device`` is ``"cuda"``.
        load_8bit: When true on CUDA, request 8-bit weights.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``device`` is not one of the supported values.
    """
    if device not in ("cpu", "cuda", "mps"):
        raise ValueError(f"Invalid device: {device}")

    # Extra keyword arguments for ``from_pretrained``; CPU needs none.
    load_kwargs = {}

    if device == "mps":
        # Avoid bugs in mps backend by not using in-place operations.
        load_kwargs["torch_dtype"] = torch.float16
        replace_llama_attn_with_non_inplace_operations()
    elif device == "cuda":
        load_kwargs["torch_dtype"] = torch.float16
        if load_8bit:
            if num_gpus != "auto" and int(num_gpus) != 1:
                print("8-bit weights are not supported on multiple GPUs. Revert to use one GPU.")
            load_kwargs["load_in_8bit"] = True
            load_kwargs["device_map"] = "auto"
        elif num_gpus == "auto":
            load_kwargs["device_map"] = "auto"
        else:
            num_gpus = int(num_gpus)
            if num_gpus != 1:
                # Spread the model over several GPUs with a per-GPU cap.
                load_kwargs["device_map"] = "auto"
                load_kwargs["max_memory"] = {
                    gpu: "13GiB" for gpu in range(num_gpus)
                }

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, low_cpu_mem_usage=True, **load_kwargs)

    # The model is only moved explicitly for mps and for the single-GPU,
    # non-8-bit CUDA case (moving 8-bit weights messes them up).
    if device == "mps":
        model.to("mps")
    elif device == "cuda" and not load_8bit and num_gpus == 1:
        model.to("cuda")

    return model, tokenizer


@torch.inference_mode()
def generate_stream(tokenizer, model, params, device,
                    context_len=2048, stream_interval=2):
    """Adapted from fastchat/serve/model_worker.py::generate_stream

    Generate tokens one at a time and periodically yield the decoded text.

    Args:
        tokenizer: Callable returning an object with ``input_ids`` for a
            prompt string; must also provide ``decode`` and
            ``eos_token_id``.
        model: Callable returning an object with ``logits`` and
            ``past_key_values``.
        params: Dict with a required ``"prompt"`` key and optional
            ``"temperature"`` (default 1.0), ``"max_new_tokens"``
            (default 256) and ``"stop"`` (stop string; ``None`` or empty
            disables stop-string matching).
        device: Device on which input tensors are created.
        context_len: Context window used to truncate the prompt tokens.
        stream_interval: Yield every this many generated tokens.

    Yields:
        The decoded text of the prompt tokens plus everything generated so
        far, cut just before the stop string when one is found after the
        prompt.
    """

    prompt = params["prompt"]
    l_prompt = len(prompt)
    temperature = float(params.get("temperature", 1.0))
    max_new_tokens = int(params.get("max_new_tokens", 256))
    stop_str = params.get("stop", None)

    input_ids = tokenizer(prompt).input_ids
    output_ids = list(input_ids)

    # Keep only the most recent prompt tokens so that prompt + generation
    # fits in the context window (8 tokens of slack).
    # NOTE(review): when max_src_len <= 0 this slice no longer truncates as
    # intended -- confirm callers keep max_new_tokens < context_len - 8.
    max_src_len = context_len - max_new_tokens - 8
    input_ids = input_ids[-max_src_len:]

    # Bound up front so the cleanup below is safe even when the loop body
    # never runs (max_new_tokens == 0).
    past_key_values = None
    token = None

    for i in range(max_new_tokens):
        if i == 0:
            # First step: feed the whole (truncated) prompt.
            out = model(
                torch.as_tensor([input_ids], device=device), use_cache=True)
        else:
            # Later steps: feed only the last token and reuse the cache.
            attention_mask = torch.ones(
                1, past_key_values[0][0].shape[-2] + 1, device=device)
            out = model(input_ids=torch.as_tensor([[token]], device=device),
                        use_cache=True,
                        attention_mask=attention_mask,
                        past_key_values=past_key_values)
        logits = out.logits
        past_key_values = out.past_key_values

        last_token_logits = logits[0][-1]

        if device == "mps":
            # Switch to CPU to avoid some bugs in the mps backend.
            last_token_logits = last_token_logits.float().to("cpu")

        if temperature < 1e-4:
            # Near-zero temperature: greedy decoding.
            token = int(torch.argmax(last_token_logits))
        else:
            probs = torch.softmax(last_token_logits / temperature, dim=-1)
            token = int(torch.multinomial(probs, num_samples=1))

        output_ids.append(token)

        stopped = token == tokenizer.eos_token_id

        if i % stream_interval == 0 or i == max_new_tokens - 1 or stopped:
            output = tokenizer.decode(output_ids, skip_special_tokens=True)
            # Only search for a stop string when one was supplied;
            # str.rfind(None, ...) would raise TypeError.
            if stop_str:
                pos = output.rfind(stop_str, l_prompt)
                if pos != -1:
                    output = output[:pos]
                    stopped = True
            yield output

        if stopped:
            break

    del past_key_values

# Notebook configuration, exposed as attributes on a namespace object.
args = argparse.Namespace(
    model_name="./vicuna-7b",
    device="cuda",
    num_gpus="1",
    load_8bit=True,
    conv_template="vicuna_v1.1",
    temperature=0.7,
    max_new_tokens=512,
    debug=False,
)

model_name = args.model_name

# Model
model, tokenizer = load_model(
    args.model_name, args.device, args.num_gpus, args.load_8bit)

# Chat: work on a private copy of the selected conversation template.
conv = conv_templates[args.conv_template].copy()

def chat(inp):
    """Send one user message and stream the reply to stdout.

    Appends ``inp`` and an empty reply slot to the module-level ``conv``,
    prints the reply word by word as ``generate_stream`` produces it, and
    stores the final reply text back into the conversation.

    Args:
        inp: The user's message text.
    """
    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    if conv.sep_style == SeparatorStyle.SINGLE:
        stop = conv.sep
    else:
        stop = conv.sep2

    params = {
        "model": model_name,
        "prompt": prompt,
        "temperature": args.temperature,
        "max_new_tokens": args.max_new_tokens,
        "stop": stop,
    }

    print(f"{conv.roles[1]}: ", end="", flush=True)

    # Each streamed text starts with the prompt; drop it (plus one
    # character) before splitting into words.
    reply_start = len(prompt) + 1
    printed = 0
    for text in generate_stream(tokenizer, model, params, args.device):
        words = text[reply_start:].strip().split(" ")
        # The last word may still be growing, so hold it back.
        complete = len(words) - 1
        if complete > printed:
            print(" ".join(words[printed:complete]), end=" ", flush=True)
            printed = complete
    # Flush whatever was held back, including the final word.
    print(" ".join(words[printed:]), flush=True)

    conv.messages[-1][-1] = " ".join(words)


# Example: ask a first question; the reply is printed as it is generated.
chat("what's the meaning of life?")