---
base_model:
- microsoft/Phi-3.5-vision-instruct
---

## Eval

```
vllm serve nm-testing/Phi-3.5-vision-instruct-W8A8-Dynamic-Per-Token --trust-remote-code --max-model-len 100000
```
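
Before running the eval, the server can be sanity-checked through vLLM's OpenAI-compatible endpoint. A minimal sketch, assuming the `vllm serve` command above is running locally on port 8000 (the prompt and `max_tokens` are arbitrary):

```python
from openai import OpenAI

# vLLM's OpenAI-compatible server does not require a real API key by default.
client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="nm-testing/Phi-3.5-vision-instruct-W8A8-Dynamic-Per-Token",
    messages=[{"role": "user", "content": "Describe a bar chart in one sentence."}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```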

```
python -m eval.run eval_vllm --model_name nm-testing/Phi-3.5-vision-instruct-W8A8-Dynamic-Per-Token --url http://0.0.0.0:8000 --output_dir output/ --eval_name "chartqa"
...
================================================================================
Metrics:
{
    "explicit_prompt_relaxed_correctness": 0.6472,
    "anywhere_in_answer_relaxed_correctness": 0.6616
}
================================================================================
```
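
For reference, ChartQA's relaxed-correctness metric typically counts a numeric answer as correct when it falls within 5% of the ground-truth value.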

## Creation

```python
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

from llmcompressor.modifiers.quantization import GPTQModifier
# from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot, wrap_hf_model_class

# Select model and load it.
MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
model_class = wrap_hf_model_class(AutoModelForCausalLM)
model = model_class.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)
processor = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))


def preprocess(example):
    return {
        "text": processor.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return processor(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)
print(ds)

# Configure the quantization algorithm. In this case, we:
#   * quantize the weights to int8 with GPTQ (static per channel)
#   * quantize the activations to int8 (dynamic per token)
# SmoothQuant (commented out below) can optionally be applied first to make
# the activations easier to quantize.
# Note: set sequential_update: true in the recipe to reduce memory usage.
# The lm_head and the vision tower are excluded from quantization.
ignore = ["re:.*lm_head", "re:model.vision_embed_tokens.*"]
recipe = [
    # SmoothQuantModifier(smoothing_strength=0.8, ignore=ignore),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=ignore),
]

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = processor("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(processor.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
```
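
After saving, one quick way to smoke-test the compressed checkpoint is vLLM's offline API. A minimal sketch, assuming vLLM is installed and the script is run from the directory containing `SAVE_DIR` (the `max_model_len` and sampling values are arbitrary):

```python
from vllm import LLM, SamplingParams

# Load the locally saved compressed checkpoint.
llm = LLM(
    model="Phi-3.5-vision-instruct-W8A8-Dynamic-Per-Token",
    trust_remote_code=True,
    max_model_len=4096,
)
outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
```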