Update Llamafy code
README.md CHANGED
@@ -40,46 +40,47 @@ def create_llama_config(exaone_config):
 
 def copy_embedding_weights(llama_model, exaone_model):
     """Copy embedding weights from EXAONE to LLaMA model."""
     llama_model.model.embed_tokens.weight.data = exaone_model.transformer.wte.weight.data
 
-def copy_layer_weights(llama_layer, exaone_layer
+def copy_layer_weights(llama_layer, exaone_layer):
     """Copy weights for a single layer from EXAONE to LLaMA model."""
     # Self-attention
     llama_layer.self_attn.q_proj.weight.data = exaone_layer.attn.attention.q_proj.weight.data
     llama_layer.self_attn.k_proj.weight.data = exaone_layer.attn.attention.k_proj.weight.data
     llama_layer.self_attn.v_proj.weight.data = exaone_layer.attn.attention.v_proj.weight.data
     llama_layer.self_attn.o_proj.weight.data = exaone_layer.attn.attention.out_proj.weight.data
     # MLP
     llama_layer.mlp.gate_proj.weight.data = exaone_layer.mlp.c_fc_0.weight.data
     llama_layer.mlp.up_proj.weight.data = exaone_layer.mlp.c_fc_1.weight.data
     llama_layer.mlp.down_proj.weight.data = exaone_layer.mlp.c_proj.weight.data
     # Layer Norms
     llama_layer.input_layernorm.weight.data = exaone_layer.ln_1.weight.data
     llama_layer.post_attention_layernorm.weight.data = exaone_layer.ln_2.weight.data
 
 def copy_final_weights(llama_model, exaone_model):
     """Copy final layer norm and LM head weights from EXAONE to LLaMA model."""
     llama_model.model.norm.weight.data = exaone_model.transformer.ln_f.weight.data
     llama_model.lm_head.weight.data = exaone_model.lm_head.weight.data
 
 def port_exaone_to_llama(exaone_model_path, llama_model_path):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
     print("Loading EXAONE model...")
-    exaone_model = load_model(exaone_model_path, ExaoneForCausalLM)
+    exaone_model = load_model(exaone_model_path, ExaoneForCausalLM).to(device)
     exaone_config = exaone_model.config
 
     print("Creating LLaMA configuration...")
     llama_config = create_llama_config(exaone_config)
 
     print("Initializing LLaMA model...")
-    llama_model = LlamaForCausalLM(llama_config)
-    llama_model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+    llama_model = LlamaForCausalLM(llama_config).to(device)
 
     print("Copying weights...")
     copy_embedding_weights(llama_model, exaone_model)
 
     for i in range(exaone_config.num_layers):
         print(f"Copying weights for layer {i+1}/{exaone_config.num_layers}")
-        copy_layer_weights(llama_model.model.layers[i], exaone_model.transformer.h[i]
+        copy_layer_weights(llama_model.model.layers[i], exaone_model.transformer.h[i])
 
     copy_final_weights(llama_model, exaone_model)
 
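The hunk ends after the per-layer copy loop, so nothing in this change verifies the port itself. One quick sanity check is to run the same prompt through the source EXAONE model and the freshly built LLaMA model and compare their logits. The sketch below is illustrative only: the helper name `check_port`, the tokenizer path argument, `trust_remote_code=True`, and the example prompt are assumptions on top of this diff, not part of the repository's code.

```python
import torch
from transformers import AutoTokenizer

def check_port(exaone_model, llama_model, tokenizer_path, prompt="Hello, world"):
    """Compare next-token logits of the source and ported models on one prompt.

    Assumes both models are already in memory (e.g. the `exaone_model` and
    `llama_model` built inside `port_exaone_to_llama`) and live on the same device.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(exaone_model.device)

    exaone_model.eval()
    llama_model.eval()
    with torch.no_grad():
        ref = exaone_model(**inputs).logits
        out = llama_model(**inputs).logits

    # Small numerical drift is expected; a large gap usually means a mis-mapped
    # projection (e.g. swapped gate/up) or a config mismatch (head counts,
    # rope_theta, rms_norm_eps).
    max_diff = (ref - out).abs().max().item()
    print(f"max |logit diff| = {max_diff:.6f}")
    return max_diff
```

With the two models from `port_exaone_to_llama` in scope, `check_port(exaone_model, llama_model, exaone_model_path)` would print the maximum absolute logit difference between them.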