internlm2-7b-llama-8.0bpw-h8-exl2 / convert_weights.py
LoneStriker's picture
Upload folder using huggingface_hub
3b95ca2 verified
#!/usr/bin/env python3
# 1/17/2024
# Charles O. Goddard
"""Convert internlm2 weights to Llama format."""
import json
import os
import einops
import tqdm
from mergekit.io import LazyTensorLoader, TensorWriter
from mergekit.common import ModelReference
from transformers import LlamaTokenizer
# Model reference string handed to mergekit's ModelReference.parse and, at the
# end of the script, to LlamaTokenizer.from_pretrained.
# NOTE(review): hard-coded to the 20b checkpoint; edit MODEL_IN and OUT_PATH
# together when converting a different model size.
MODEL_IN = "internlm/internlm2-20b"
# Output directory: receives the converted tensors, config.json and the
# tokenizer files.
OUT_PATH = "./internlm2-20b-llama"
model_ref = ModelReference.parse(MODEL_IN)
# The config is loaded with trust_remote_code=True (presumably because
# InternLM2 ships its own config class alongside the checkpoint -- confirm).
cfg = model_ref.config(trust_remote_code=True)
# Width of a single attention head.
head_dim = cfg.hidden_size // cfg.num_attention_heads
# How many query heads there are per key/value head.
num_key_value_groups = cfg.num_attention_heads // cfg.num_key_value_heads
# Source of the input tensors and sink for the converted ones.
loader = LazyTensorLoader(model_ref.tensor_index(), lazy_unpickle=True)
writer = TensorWriter(OUT_PATH)
# Substring renames that turn an InternLM2 tensor name into its Llama
# counterpart. Each key is searched for inside the tensor name and replaced
# by its value; the fused attention.wqkv weight is not listed here because
# it has to be split into separate tensors rather than renamed.
SIMPLE_REPLACEMENTS = {
    # Feed-forward (MLP) projections.
    "feed_forward.w1": "mlp.gate_proj",
    "feed_forward.w2": "mlp.down_proj",
    "feed_forward.w3": "mlp.up_proj",
    # Attention output projection.
    "attention.wo": "self_attn.o_proj",
    # Per-layer norms.
    "ffn_norm": "post_attention_layernorm",
    "attention_norm": "input_layernorm",
    # Token embeddings and output head.
    "tok_embeddings": "embed_tokens",
    "output.weight": "lm_head.weight",
}
# Stream every tensor of the source checkpoint into the writer, renaming it
# to the Llama layout. The fused attention.wqkv weight is the one special
# case: it is split into separate q/k/v projection tensors.
for tensor_name in tqdm.tqdm(loader.index.tensor_paths):
    tensor = loader.get_tensor(tensor_name)

    if "attention.wqkv" not in tensor_name:
        # Plain rename: apply every substring substitution in turn
        # (str.replace leaves the name untouched when the pattern is absent).
        renamed = tensor_name
        for old, new in SIMPLE_REPLACEMENTS.items():
            renamed = renamed.replace(old, new)
        writer.save_tensor(renamed, tensor)
        continue

    # Per the original author's shape notes, wqkv has shape
    #   ((num_attention_heads + 2 * num_key_value_heads) * head_dim, hidden_size)
    # and the InternLM2 forward pass views its output as
    #   "b q (h gs d) -> b q h gs d" with gs = 2 + num_key_value_groups.
    # The same grouping is applied here to the weight's row axis, so that
    # within each group h the first num_key_value_groups slots are query
    # heads and the last two slots are the key head and the value head.
    grouped = einops.rearrange(
        tensor,
        "(h gs d) z -> h gs d z",
        gs=2 + num_key_value_groups,
        d=head_dim,
    )
    q_proj = grouped[:, :num_key_value_groups, ...]
    k_proj = grouped[:, -2, ...]
    v_proj = grouped[:, -1, ...]

    # Flatten each slice back into a 2-D (rows, hidden_size) matrix.
    q_proj = q_proj.reshape(-1, cfg.hidden_size).contiguous()
    k_proj = k_proj.reshape(-1, cfg.hidden_size).contiguous()
    v_proj = v_proj.reshape(-1, cfg.hidden_size).contiguous()
    assert k_proj.shape == v_proj.shape

    # Emit the three projections under their Llama names, in q, k, v order.
    for llama_name, projection in (
        ("self_attn.q_proj", q_proj),
        ("self_attn.k_proj", k_proj),
        ("self_attn.v_proj", v_proj),
    ):
        writer.save_tensor(
            tensor_name.replace("attention.wqkv", llama_name),
            projection,
            clone=True,
        )

writer.finalize()
# Rewrite the source config so the output directory declares itself as a
# Llama model, then save it as config.json next to the converted weights.
cfg_dict = json.loads(cfg.to_json_string())

# Drop the auto_map entry so the converted config no longer carries it
# (presumably it points at InternLM2's custom classes -- confirm). pop()
# with a default keeps the script working when the config has no such entry.
cfg_dict.pop("auto_map", None)
cfg_dict["architectures"] = ["LlamaForCausalLM"]
cfg_dict["model_type"] = "llama"

# A rope_scaling entry with factor 1.0 is removed from the output config.
# The entry may also be null or lack a "factor" key; in those cases it is
# left exactly as it is instead of raising.
rope_scaling = cfg_dict.get("rope_scaling")
if isinstance(rope_scaling, dict) and rope_scaling.get("factor") == 1.0:
    del cfg_dict["rope_scaling"]

with open(os.path.join(OUT_PATH, "config.json"), "w", encoding="utf-8") as fp:
    json.dump(cfg_dict, fp, indent=2)
# Export the tokenizer as a stock LlamaTokenizer (no remote code) into
# OUT_PATH alongside the converted weights.
#
# InternLMTokenizer differences, per the original author:
# 1. clean_up_tokenization() hardcoded to always be called
# 2. might prepend a space to some tokens that LlamaTokenizer doesn't if they're the first token
# Difference 1 is addressed below by turning on clean_up_tokenization_spaces;
# difference 2 is deliberately left unhandled ("not important").
tok = LlamaTokenizer.from_pretrained(MODEL_IN, trust_remote_code=False, legacy=True)
tok.clean_up_tokenization_spaces = True
tok.save_pretrained(OUT_PATH)