---
license: apache-2.0
---
# maywell/EXAONE-3.0-7.8B-Instruct-Llamafied

๋™์ผ ๋ผ์ด์„ผ์Šค ์žฌ๋ฐฐํฌ์กฐ์ฐจ ๊ธˆ์ง€๋˜์–ด์žˆ๋Š” ๊ด€๊ณ„๋กœ Llamafied ๋ชจ๋ธ์„ ๊ณต์œ ํ•  ์ˆ˜ ์—†๊ฒŒ ๋˜์—ˆ์Šต๋‹ˆ๋‹ค. vLLM, ์ถ”๋ก  ๋ฐ ๊ธฐํƒ€ ํ™œ์šฉ์œผ๋กœ Llamafied ๋ชจ๋ธ์ด ํ•„์š”ํ•˜๋‹ค๋ฉด ์•„๋ž˜ ์Šคํฌ๋ฆฝํŠธ๋ฅผ ์‹คํ–‰ํ•ด์„œ ์‚ฌ์šฉํ•ด์ฃผ์‹œ๋ฉด ๊ฐ์‚ฌํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค.

```python
import torch
import gc

from transformers import LlamaConfig, LlamaForCausalLM, AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm


def unload_model(model):
    """Clear memory by deleting a model and calling the garbage collector."""
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def create_llama_config(exaone_config):
    """Create and return a Llama configuration based on EXAONE config."""
    return LlamaConfig(
        vocab_size=exaone_config.vocab_size,
        hidden_size=exaone_config.hidden_size,
        intermediate_size=exaone_config.intermediate_size,
        num_hidden_layers=exaone_config.num_layers,
        num_attention_heads=exaone_config.num_attention_heads,
        max_position_embeddings=exaone_config.max_position_embeddings,
        rms_norm_eps=exaone_config.layer_norm_epsilon,
        num_key_value_heads=exaone_config.num_key_value_heads,
        rope_theta=exaone_config.rope_theta,
        bos_token_id=exaone_config.bos_token_id,
        eos_token_id=exaone_config.eos_token_id,
        pad_token_id=exaone_config.pad_token_id,
        attention_bias=False,
    )

def copy_embedding_weights(llama_model, exaone_model):
    """Copy embedding weights from EXAONE to Llama model."""
    llama_model.model.embed_tokens.weight.data = exaone_model.transformer.wte.weight.data.to(llama_model.device)

def copy_layer_weights(llama_layer, exaone_layer, device):
    """Copy weights for a single layer from EXAONE to Llama model."""
    # Self-attention
    llama_layer.self_attn.q_proj.weight.data = exaone_layer.attn.attention.q_proj.weight.data.to(device)
    llama_layer.self_attn.k_proj.weight.data = exaone_layer.attn.attention.k_proj.weight.data.to(device)
    llama_layer.self_attn.v_proj.weight.data = exaone_layer.attn.attention.v_proj.weight.data.to(device)
    llama_layer.self_attn.o_proj.weight.data = exaone_layer.attn.attention.out_proj.weight.data.to(device)
    # MLP
    llama_layer.mlp.gate_proj.weight.data = exaone_layer.mlp.c_fc_0.weight.data.to(device)
    llama_layer.mlp.up_proj.weight.data = exaone_layer.mlp.c_fc_1.weight.data.to(device)
    llama_layer.mlp.down_proj.weight.data = exaone_layer.mlp.c_proj.weight.data.to(device)
    # Layer Norms
    llama_layer.input_layernorm.weight.data = exaone_layer.ln_1.weight.data.to(device)
    llama_layer.post_attention_layernorm.weight.data = exaone_layer.ln_2.weight.data.to(device)

def copy_final_weights(llama_model, exaone_model):
    """Copy final layer norm and LM head weights from EXAONE to Llama model."""
    llama_model.model.norm.weight.data = exaone_model.transformer.ln_f.weight.data.to(llama_model.device)
    llama_model.lm_head.weight.data = exaone_model.lm_head.weight.data.to(llama_model.device)

def port_exaone_to_llama(exaone_model_path, llama_model_path):
    print("Loading EXAONE model and tokenizer...")
    exaone_model = AutoModelForCausalLM.from_pretrained(exaone_model_path, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True)
    exaone_tokenizer = AutoTokenizer.from_pretrained(exaone_model_path, trust_remote_code=True)
    exaone_config = exaone_model.config

    print("Creating Llama configuration...")
    llama_config = create_llama_config(exaone_config)

    print("Initializing Llama model...")
    llama_model = LlamaForCausalLM(llama_config)
    llama_model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    print("Copying weights...")
    copy_embedding_weights(llama_model, exaone_model)

    for i in tqdm(range(exaone_config.num_layers), desc="Copying layers"):
        copy_layer_weights(llama_model.model.layers[i], exaone_model.transformer.h[i], llama_model.device)

    copy_final_weights(llama_model, exaone_model)

    print("Unloading EXAONE model to free memory...")
    unload_model(exaone_model)

    print(f"Saving ported Llama model and tokenizer to {llama_model_path}")
    llama_model.save_pretrained(llama_model_path, safe_serialization=True, max_shard_size="5GB")
    exaone_tokenizer.save_pretrained(llama_model_path)

    print("Unloading Llama model...")
    unload_model(llama_model)

    print(f"EXAONE model successfully ported to Llama format and saved at {llama_model_path}")

if __name__ == "__main__":
    exaone_model_path = "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
    llama_model_path = "./exa_llamafied"
    port_exaone_to_llama(exaone_model_path, llama_model_path)
```
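
Once the script finishes, the converted checkpoint is a standard Llama architecture, so it can be loaded without `trust_remote_code` and served by vLLM or any other Llama-compatible stack. Below is a minimal, illustrative sketch of loading it with `transformers` for inference; the `./exa_llamafied` path is the output directory from the script above, and the prompt is just an example.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Output directory produced by port_exaone_to_llama() above.
model_path = "./exa_llamafied"

# The weights are now plain Llama, so no trust_remote_code is needed for
# the model. If the saved tokenizer still references a custom class,
# add trust_remote_code=True to the tokenizer load as well.
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.bfloat16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Example prompt (illustrative only); the chat template is carried over
# from the original EXAONE tokenizer saved by the script.
messages = [{"role": "user", "content": "Hello! Please introduce yourself."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```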

๋ชจ๋ธ์„ ๊ณต๊ฐœํ•ด์ฃผ์‹  `LG AI Research`๋ถ„๋“ค๊ป˜ ๊ฐ์‚ฌ์˜ ๋ง์”€ ๋“œ๋ฆฝ๋‹ˆ๋‹ค.
[Original Repository](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)