vicuna-7b
This README provides a step-by-step guide to set up and run the FastChat application with the required dependencies and model.
Prerequisites
Before you proceed, ensure that you have `git` installed on your system.
Installation
Follow the steps below to install the required packages and set up the environment.
- Upgrade `pip`:
python3 -m pip install --upgrade pip
- Install `accelerate`:
python3 -m pip install accelerate
- Install `bitsandbytes`:
3.1 Install it with pip:
python3 -m pip install bitsandbytes
3.2 Or clone the `bitsandbytes` repository and install it from source:
git clone https://github.com/TimDettmers/bitsandbytes.git
cd bitsandbytes
CUDA_VERSION=118 make cuda11x
python3 -m pip install .
cd ..
Use the following command to find your CUDA version, then set `CUDA_VERSION` accordingly (for example, `118` for CUDA 11.8):
nvcc --version
- Clone the `FastChat` repository and install it:
git clone https://github.com/lm-sys/FastChat.git
cd FastChat
python3 -m pip install -e .
cd ..
- Install `git-lfs`:
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
sudo apt-get install git-lfs
git lfs install
- Clone the `vicuna-7b` model:
git clone https://huggingface.co/helloollel/vicuna-7b
Running FastChat
After completing the installation, you can run FastChat with the following command:
python3 -m fastchat.serve.cli --model-path ./vicuna-7b
This will start an interactive FastChat command-line chat session using the `vicuna-7b` model.
Running in Notebook
import argparse
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer
from fastchat.conversation import conv_templates, SeparatorStyle
from fastchat.serve.monkey_patch_non_inplace import replace_llama_attn_with_non_inplace_operations
def load_model(model_name, device, num_gpus, load_8bit=False):
    """Load a causal language model and its tokenizer for the given device.

    Args:
        model_name: Path or identifier handed to ``from_pretrained``.
        device: One of ``"cpu"``, ``"cuda"`` or ``"mps"``.
        num_gpus: ``"auto"`` or a value convertible with ``int()``; only
            consulted when ``device`` is ``"cuda"``.
        load_8bit: When true on ``"cuda"``, request 8-bit weights.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``device`` is not one of the supported names.
    """
    if device not in ("cpu", "cuda", "mps"):
        raise ValueError(f"Invalid device: {device}")

    load_kwargs = {}
    if device == "cuda":
        load_kwargs["torch_dtype"] = torch.float16
        if load_8bit:
            wants_multi_gpu = num_gpus != "auto" and int(num_gpus) != 1
            if wants_multi_gpu:
                print("8-bit weights are not supported on multiple GPUs. Revert to use one GPU.")
            load_kwargs["load_in_8bit"] = True
            load_kwargs["device_map"] = "auto"
        elif num_gpus == "auto":
            load_kwargs["device_map"] = "auto"
        else:
            num_gpus = int(num_gpus)
            if num_gpus != 1:
                load_kwargs["device_map"] = "auto"
                load_kwargs["max_memory"] = {
                    gpu_index: "13GiB" for gpu_index in range(num_gpus)
                }
    elif device == "mps":
        # The mps backend misbehaves with in-place operations, so patch them out.
        load_kwargs["torch_dtype"] = torch.float16
        replace_llama_attn_with_non_inplace_operations()

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, low_cpu_mem_usage=True, **load_kwargs)

    # Moving an 8-bit model with .cuda()/.to() corrupts its weights, so only
    # the plain single-GPU case is moved explicitly.
    move_to_single_gpu = device == "cuda" and num_gpus == 1 and not load_8bit
    if move_to_single_gpu:
        model.to("cuda")
    elif device == "mps":
        model.to("mps")

    return model, tokenizer
@torch.inference_mode()
def generate_stream(tokenizer, model, params, device,
                    context_len=2048, stream_interval=2):
    """Adapted from fastchat/serve/model_worker.py::generate_stream

    Generate tokens one at a time and yield the decoded text so far.

    Args:
        tokenizer: Callable returning an object with ``input_ids``; must also
            provide ``decode`` and ``eos_token_id``.
        model: Callable returning an object with ``logits`` and
            ``past_key_values``.
        params: Dict with a required ``"prompt"`` and optional
            ``"temperature"`` (default 1.0), ``"max_new_tokens"`` (default
            256) and ``"stop"`` (default ``None``, meaning no stop string).
        device: Device the input tensors are created on.
        context_len: Context budget used to truncate the prompt tokens.
        stream_interval: Decode and yield every this many generated tokens.

    Yields:
        The decoded prompt-plus-completion text, cut just before the stop
        string when one is configured and found after the prompt.
    """
    prompt = params["prompt"]
    l_prompt = len(prompt)
    temperature = float(params.get("temperature", 1.0))
    max_new_tokens = int(params.get("max_new_tokens", 256))
    stop_str = params.get("stop", None)

    input_ids = tokenizer(prompt).input_ids
    output_ids = list(input_ids)

    # Keep only the most recent prompt tokens so prompt + generation fits.
    max_src_len = context_len - max_new_tokens - 8
    input_ids = input_ids[-max_src_len:]

    # Bound up front so the final ``del`` is valid even if the loop never runs.
    past_key_values = None

    for i in range(max_new_tokens):
        if i == 0:
            # First step: feed the whole (truncated) prompt.
            out = model(
                torch.as_tensor([input_ids], device=device), use_cache=True)
        else:
            # Later steps: feed only the newest token and reuse the cache.
            attention_mask = torch.ones(
                1, past_key_values[0][0].shape[-2] + 1, device=device)
            out = model(input_ids=torch.as_tensor([[token]], device=device),
                        use_cache=True,
                        attention_mask=attention_mask,
                        past_key_values=past_key_values)
        logits = out.logits
        past_key_values = out.past_key_values

        last_token_logits = logits[0][-1]

        if device == "mps":
            # Switch to CPU to avoid some bugs in the mps backend.
            last_token_logits = last_token_logits.float().to("cpu")

        if temperature < 1e-4:
            # Effectively zero temperature: greedy decoding.
            token = int(torch.argmax(last_token_logits))
        else:
            probs = torch.softmax(last_token_logits / temperature, dim=-1)
            token = int(torch.multinomial(probs, num_samples=1))

        output_ids.append(token)

        stopped = token == tokenizer.eos_token_id

        if i % stream_interval == 0 or i == max_new_tokens - 1 or stopped:
            output = tokenizer.decode(output_ids, skip_special_tokens=True)
            # Only look for a stop string when one is configured: ``None``
            # would make rfind raise TypeError, and "" would match at once.
            if stop_str:
                pos = output.rfind(stop_str, l_prompt)
                if pos != -1:
                    output = output[:pos]
                    stopped = True
            yield output

        if stopped:
            break

    del past_key_values
# Notebook configuration, kept in a Namespace so it mirrors CLI-style access.
args = argparse.Namespace(
    model_name='./vicuna-7b',
    device='cuda',
    num_gpus='1',
    load_8bit=True,
    conv_template='vicuna_v1.1',
    temperature=0.7,
    max_new_tokens=512,
    debug=False,
)
model_name = args.model_name

# Model
model, tokenizer = load_model(
    args.model_name,
    args.device,
    args.num_gpus,
    args.load_8bit,
)

# Chat: work on a copy so the shared template is not modified.
conv = conv_templates[args.conv_template].copy()
def chat(inp):
    """Send one user message to the model and print the reply as it streams.

    Appends ``inp`` and an empty assistant slot to the module-level ``conv``,
    streams the completion through ``generate_stream``, prints each word once
    it is complete, and finally stores the full reply in the conversation.
    """
    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    if conv.sep_style == SeparatorStyle.SINGLE:
        stop_str = conv.sep
    else:
        stop_str = conv.sep2

    params = {
        "model": model_name,
        "prompt": prompt,
        "temperature": args.temperature,
        "max_new_tokens": args.max_new_tokens,
        "stop": stop_str,
    }

    print(f"{conv.roles[1]}: ", end="", flush=True)

    reply_start = len(prompt) + 1
    printed = 0
    for text in generate_stream(tokenizer, model, params, args.device):
        words = text[reply_start:].strip().split(" ")
        # The last word may still be growing, so hold it back for now.
        complete = len(words) - 1
        if complete > printed:
            print(" ".join(words[printed:complete]), end=" ", flush=True)
            printed = complete

    # Flush whatever was held back, then record the reply in the history.
    print(" ".join(words[printed:]), flush=True)
    conv.messages[-1][-1] = " ".join(words)
# Example: ask a single question; chat() prints the reply while it streams.
chat("what's the meaning of life?")