badayvedat liuhaotian committed on
Commit
c6dfdac
·
1 Parent(s): 255cd6e

Load 13B model with 8-bit/4-bit quantization to support more hardwares (#2)

Browse files

- Load 13B model with 8-bit/4-bit quantization to support more hardwares (2043a67569994113ef5f4a8d0c58df57f6c2ec66)
- Update requirements.txt (45e69a6796b68457d9e0f2e7bf82cc5f7a38b2b1)
- Update app.py (4e058355a3b5dcf3470e3a49b891eb91455f030b)
- Update app.py (4ad10fb0867be1212b7746919900c9fd16014f69)


Co-authored-by: Haotian Liu <[email protected]>

Files changed (2) hide show
  1. app.py +20 -3
  2. requirements.txt +2 -2
app.py CHANGED
@@ -325,6 +325,14 @@ title_markdown = """
325
  [[Project Page]](https://llava-vl.github.io) [[Paper]](https://arxiv.org/abs/2304.08485) [[Code]](https://github.com/haotian-liu/LLaVA) [[Model]](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)
326
 
327
  ONLY WORKS WITH GPU!
 
 
 
 
 
 
 
 
328
  """
329
 
330
  tos_markdown = """
@@ -522,8 +530,12 @@ def start_controller():
522
  return subprocess.Popen(controller_command)
523
 
524
 
525
- def start_worker(model_path: str):
526
  logger.info(f"Starting the model worker for the model {model_path}")
 
 
 
 
527
  worker_command = [
528
  "python",
529
  "-m",
@@ -534,7 +546,11 @@ def start_worker(model_path: str):
534
  "http://localhost:10000",
535
  "--model-path",
536
  model_path,
 
 
537
  ]
 
 
538
  return subprocess.Popen(worker_command)
539
 
540
 
@@ -582,12 +598,13 @@ if __name__ == "__main__":
582
  args = get_args()
583
  logger.info(f"args: {args}")
584
 
585
- model_path = "liuhaotian/llava-v1.5-7b"
 
586
 
587
  preload_models(model_path)
588
 
589
  controller_proc = start_controller()
590
- worker_proc = start_worker(model_path)
591
 
592
  # Wait for worker and controller to start
593
  time.sleep(10)
 
325
  [[Project Page]](https://llava-vl.github.io) [[Paper]](https://arxiv.org/abs/2304.08485) [[Code]](https://github.com/haotian-liu/LLaVA) [[Model]](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)
326
 
327
  ONLY WORKS WITH GPU!
328
+
329
+ You can load the model with 8-bit or 4-bit quantization to make it fit in smaller hardwares. Setting the environment variable `bits` to control the quantization.
330
+
331
+ Recommended configurations:
332
+ | Hardware | A10G-Large (24G) | T4-Medium (15G) | A100-Large (40G) |
333
+ |-------------------|------------------|-----------------|------------------|
334
+ | **Bits** | 8 (default) | 4 | 16 |
335
+
336
  """
337
 
338
  tos_markdown = """
 
530
  return subprocess.Popen(controller_command)
531
 
532
 
533
+ def start_worker(model_path: str, bits=16):
534
  logger.info(f"Starting the model worker for the model {model_path}")
535
+ model_name = model_path.strip('/').split('/')[-1]
536
+ assert bits in [4, 8, 16], "It can be only loaded with 16-bit, 8-bit, and 4-bit."
537
+ if bits != 16:
538
+ model_name += f'-{bits}bit'
539
  worker_command = [
540
  "python",
541
  "-m",
 
546
  "http://localhost:10000",
547
  "--model-path",
548
  model_path,
549
+ "--model-name",
550
+ model_name,
551
  ]
552
+ if bits != 16:
553
+ worker_command += [f'--load-{bits}bit']
554
  return subprocess.Popen(worker_command)
555
 
556
 
 
598
  args = get_args()
599
  logger.info(f"args: {args}")
600
 
601
+ model_path = "liuhaotian/llava-v1.5-13b"
602
+ bits = int(os.getenv("bits", 8))
603
 
604
  preload_models(model_path)
605
 
606
  controller_proc = start_controller()
607
+ worker_proc = start_worker(model_path, bits=bits)
608
 
609
  # Wait for worker and controller to start
610
  time.sleep(10)
requirements.txt CHANGED
@@ -8,8 +8,8 @@ numpy
8
  requests
9
  sentencepiece
10
  tokenizers>=0.12.1
11
- torch
12
- torchvision
13
  uvicorn
14
  wandb
15
  shortuuid
 
8
  requests
9
  sentencepiece
10
  tokenizers>=0.12.1
11
+ torch==2.0.1
12
+ torchvision==0.15.2
13
  uvicorn
14
  wandb
15
  shortuuid