---
tags:
- fp8
---

Quantized using AutoFP8 with this script:

```python
from transformers import AutoTokenizer

import auto_fp8
from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig

pretrained_model_dir = "ibm-granite/granite-20b-code-base"
quantized_model_dir = "granite-20b-code-base-FP8"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)

# Use the source of auto_fp8's own quantize.py as a small code sample
# to calibrate the static activation scales.
tmp = auto_fp8.__file__.split('/')[:-1]
tmp.append('quantize.py')
seed_text_file = '/'.join(tmp)

with open(seed_text_file, "r") as f:
    text = f.read()

examples = [text]
examples = tokenizer(examples, return_tensors="pt").to("cuda")

# FP8 weight and activation quantization with static (calibrated)
# activation scales; the lm_head is left unquantized.
quantize_config = BaseQuantizeConfig(
    quant_method="fp8",
    activation_scheme="static",
    ignore_patterns=["re:.*lm_head"],
)

model = AutoFP8ForCausalLM.from_pretrained(
    pretrained_model_dir, quantize_config=quantize_config
)

model.quantize(examples)
model.save_quantized(quantized_model_dir)
```
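
The saved checkpoint can be served with an FP8-capable runtime such as vLLM. A minimal sketch, assuming a recent vLLM build with FP8 support and that the quantized model was saved locally to `granite-20b-code-base-FP8`:

```python
# Sketch only: load the FP8 checkpoint with vLLM and run a short completion.
from vllm import LLM, SamplingParams

# Path (or hub id) of the quantized model produced by the script above.
llm = LLM(model="granite-20b-code-base-FP8")
params = SamplingParams(temperature=0.0, max_tokens=64)

outputs = llm.generate(["def fibonacci(n):"], params)
print(outputs[0].outputs[0].text)
```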