# vicuna-7b This README provides a step-by-step guide to set up and run the FastChat application with the required dependencies and model. ## Prerequisites Before you proceed, ensure that you have `git` installed on your system. ## Installation Follow the steps below to install the required packages and set up the environment. 1. Upgrade `pip`: ```bash python3 -m pip install --upgrade pip ``` 2. Install `accelerate`: ```bash python3 -m pip install accelerate ``` 3. Install `bitsandbytes` 3.1 install by pip ```bash python3 -m pip install bitsandbytes ``` 3.2 Clone the `bitsandbytes` repository and install it: ```bash git clone https://github.com/TimDettmers/bitsandbytes.git cd bitsandbytes CUDA_VERSION=118 make cuda11x python3 -m pip install . cd .. ``` use the following command to find `CUDA_VERSION`: ```bash nvcc --version ``` 4. Clone the `FastChat` repository and install it: ```bash git clone https://github.com/lm-sys/FastChat.git cd FastChat python3 -m pip install -e . cd .. ``` 5. Install `git-lfs`: ```bash curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash sudo apt-get install git-lfs git lfs install ``` 6. Clone the `vicuna-7b` model: ```bash git clone https://huggingface.co/helloollel/vicuna-7b ``` ## Running FastChat After completing the installation, you can run FastChat with the following command: ```bash python3 -m fastchat.serve.cli --model-path ./vicuna-7b ``` This will start the FastChat server using the `vicuna-7b` model. ## Running in Notebook ```python import argparse import time import torch from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer from fastchat.conversation import conv_templates, SeparatorStyle from fastchat.serve.monkey_patch_non_inplace import replace_llama_attn_with_non_inplace_operations def load_model(model_name, device, num_gpus, load_8bit=False): if device == "cpu": kwargs = {} elif device == "cuda": kwargs = {"torch_dtype": torch.float16} if load_8bit: if num_gpus != "auto" and int(num_gpus) != 1: print("8-bit weights are not supported on multiple GPUs. Revert to use one GPU.") kwargs.update({"load_in_8bit": True, "device_map": "auto"}) else: if num_gpus == "auto": kwargs["device_map"] = "auto" else: num_gpus = int(num_gpus) if num_gpus != 1: kwargs.update({ "device_map": "auto", "max_memory": {i: "13GiB" for i in range(num_gpus)}, }) elif device == "mps": # Avoid bugs in mps backend by not using in-place operations. kwargs = {"torch_dtype": torch.float16} replace_llama_attn_with_non_inplace_operations() else: raise ValueError(f"Invalid device: {device}") tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) model = AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True, **kwargs) # calling model.cuda() mess up weights if loading 8-bit weights if device == "cuda" and num_gpus == 1 and not load_8bit: model.to("cuda") elif device == "mps": model.to("mps") return model, tokenizer @torch.inference_mode() def generate_stream(tokenizer, model, params, device, context_len=2048, stream_interval=2): """Adapted from fastchat/serve/model_worker.py::generate_stream""" prompt = params["prompt"] l_prompt = len(prompt) temperature = float(params.get("temperature", 1.0)) max_new_tokens = int(params.get("max_new_tokens", 256)) stop_str = params.get("stop", None) input_ids = tokenizer(prompt).input_ids output_ids = list(input_ids) max_src_len = context_len - max_new_tokens - 8 input_ids = input_ids[-max_src_len:] for i in range(max_new_tokens): if i == 0: out = model( torch.as_tensor([input_ids], device=device), use_cache=True) logits = out.logits past_key_values = out.past_key_values else: attention_mask = torch.ones( 1, past_key_values[0][0].shape[-2] + 1, device=device) out = model(input_ids=torch.as_tensor([[token]], device=device), use_cache=True, attention_mask=attention_mask, past_key_values=past_key_values) logits = out.logits past_key_values = out.past_key_values last_token_logits = logits[0][-1] if device == "mps": # Switch to CPU by avoiding some bugs in mps backend. last_token_logits = last_token_logits.float().to("cpu") if temperature < 1e-4: token = int(torch.argmax(last_token_logits)) else: probs = torch.softmax(last_token_logits / temperature, dim=-1) token = int(torch.multinomial(probs, num_samples=1)) output_ids.append(token) if token == tokenizer.eos_token_id: stopped = True else: stopped = False if i % stream_interval == 0 or i == max_new_tokens - 1 or stopped: output = tokenizer.decode(output_ids, skip_special_tokens=True) pos = output.rfind(stop_str, l_prompt) if pos != -1: output = output[:pos] stopped = True yield output if stopped: break del past_key_values args = dict( model_name='./vicuna-7b', device='cuda', num_gpus='1', load_8bit=True, conv_template='vicuna_v1.1', temperature=0.7, max_new_tokens=512, debug=False ) args = argparse.Namespace(**args) model_name = args.model_name # Model model, tokenizer = load_model(args.model_name, args.device, args.num_gpus, args.load_8bit) # Chat conv = conv_templates[args.conv_template].copy() def chat(inp): conv.append_message(conv.roles[0], inp) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() params = { "model": model_name, "prompt": prompt, "temperature": args.temperature, "max_new_tokens": args.max_new_tokens, "stop": conv.sep if conv.sep_style == SeparatorStyle.SINGLE else conv.sep2, } print(f"{conv.roles[1]}: ", end="", flush=True) pre = 0 for outputs in generate_stream(tokenizer, model, params, args.device): outputs = outputs[len(prompt) + 1:].strip() outputs = outputs.split(" ") now = len(outputs) if now - 1 > pre: print(" ".join(outputs[pre:now-1]), end=" ", flush=True) pre = now - 1 print(" ".join(outputs[pre:]), flush=True) conv.messages[-1][-1] = " ".join(outputs) ``` ```python chat("what's the meaning of life?") ```