|
[INFO|parser.py:325] 2024-09-19 12:17:17,636 >> Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.bfloat16
[INFO|tokenization_utils_base.py:2287] 2024-09-19 12:17:17,638 >> loading file tokenizer.json
[INFO|tokenization_utils_base.py:2287] 2024-09-19 12:17:17,638 >> loading file added_tokens.json
[INFO|tokenization_utils_base.py:2287] 2024-09-19 12:17:17,638 >> loading file special_tokens_map.json
[INFO|tokenization_utils_base.py:2287] 2024-09-19 12:17:17,638 >> loading file tokenizer_config.json
[INFO|tokenization_utils_base.py:2533] 2024-09-19 12:17:17,897 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|template.py:270] 2024-09-19 12:17:17,897 >> Replace eos token: <|eot_id|>
[INFO|template.py:372] 2024-09-19 12:17:17,897 >> Add pad token: <|eot_id|>
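The two template.py lines above make <|eot_id|> both the effective EOS and the pad token. Llama 3 tokenizers ship without a pad token, so reusing the end-of-turn token for padding is the usual workaround; a minimal sketch of the equivalent manual setup (not LLaMA-Factory's internal code):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
tok.eos_token = "<|eot_id|>"       # end-of-turn token used by the chat template
if tok.pad_token is None:
    tok.pad_token = tok.eos_token  # reuse <|eot_id|> for padding
```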
|
|
|
[INFO|loader.py:50] 2024-09-19 12:17:17,897 >> Loading dataset Judge.json...
[INFO|configuration_utils.py:731] 2024-09-19 12:17:21,971 >> loading configuration file /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693/config.json
[INFO|configuration_utils.py:800] 2024-09-19 12:17:21,972 >> Model config LlamaConfig {
  "_name_or_path": "/home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 128256
}
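The dump above is the stock Llama 3.1 8B Instruct configuration (grouped-query attention with 32 query heads over 8 KV heads, 128k context via llama3-type RoPE scaling). It can be inspected without loading any weights; a minimal sketch:

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
print(cfg.num_attention_heads, cfg.num_key_value_heads)  # 32 8  (grouped-query attention)
print(cfg.rope_scaling)  # {'factor': 8.0, ..., 'rope_type': 'llama3'}
```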
|
|
|
|
|
[WARNING|rope.py:57] 2024-09-19 12:17:21,972 >> Input length is smaller than max length. Consider increasing the input length.
[INFO|rope.py:63] 2024-09-19 12:17:21,972 >> Using linear scaling strategy and setting scaling factor to 1.0
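Linear RoPE scaling divides each position index by the scaling factor before the rotary angles are computed, so the factor of 1.0 chosen here (the requested input length does not exceed the pretrained maximum) is a no-op. A sketch of the idea, not LLaMA-Factory's actual implementation:

```python
import torch

def rope_angles(positions: torch.Tensor, head_dim: int,
                base: float = 500000.0, factor: float = 1.0) -> torch.Tensor:
    """Rotary embedding angles with linear position scaling (factor=1.0 is identity)."""
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    return torch.outer(positions.float() / factor, inv_freq)
```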
|
|
|
[INFO|configuration_utils.py:731] 2024-09-19 12:17:23,421 >> loading configuration file /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693/config.json
[INFO|configuration_utils.py:800] 2024-09-19 12:17:23,422 >> Model config LlamaConfig {
  "_name_or_path": "/home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 128256
}
|
|
|
|
|
[INFO|configuration_utils.py:733] 2024-09-19 12:17:24,156 >> loading configuration file config.json from cache at /home/marl/.cache/huggingface/hub/models--unslothai--other/snapshots/43d9e0f2f19a5d7836895f648dc0e762816acf77/config.json
[INFO|configuration_utils.py:800] 2024-09-19 12:17:24,157 >> Model config LlamaConfig {
  "_name_or_path": "unslothai/other",
  "architectures": [
    "LlamaModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 0,
  "initializer_range": 0.02,
  "intermediate_size": 0,
  "max_position_embeddings": 0,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 0,
  "num_hidden_layers": 0,
  "num_key_value_heads": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 0
}

[INFO|modeling_utils.py:3634] 2024-09-19 12:17:24,659 >> loading weights file model.safetensors from cache at /home/marl/.cache/huggingface/hub/models--unslothai--other/snapshots/43d9e0f2f19a5d7836895f648dc0e762816acf77/model.safetensors
[INFO|configuration_utils.py:1038] 2024-09-19 12:17:24,670 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}
|
|
|
|
|
[INFO|configuration_utils.py:733] 2024-09-19 12:17:24,820 >> loading configuration file config.json from cache at /home/marl/.cache/huggingface/hub/models--unslothai--repeat/snapshots/7c48478c02f84ed89f149b0815cc0216ee831fb0/config.json
[INFO|configuration_utils.py:800] 2024-09-19 12:17:24,821 >> Model config LlamaConfig {
  "_name_or_path": "unslothai/repeat",
  "architectures": [
    "LlamaModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 0,
  "initializer_range": 0.02,
  "intermediate_size": 0,
  "max_position_embeddings": 0,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 0,
  "num_hidden_layers": 0,
  "num_key_value_heads": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 0
}

[INFO|modeling_utils.py:3634] 2024-09-19 12:17:24,823 >> loading weights file model.safetensors from cache at /home/marl/.cache/huggingface/hub/models--unslothai--repeat/snapshots/7c48478c02f84ed89f149b0815cc0216ee831fb0/model.safetensors
[INFO|configuration_utils.py:1038] 2024-09-19 12:17:24,824 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}
|
|
|
|
|
[INFO|configuration_utils.py:733] 2024-09-19 12:17:25,466 >> loading configuration file config.json from cache at /home/marl/.cache/huggingface/hub/models--unslothai--vram-24/snapshots/61324ceeacd75b2b31f7a789a9c9d82058e6118c/config.json
[INFO|configuration_utils.py:800] 2024-09-19 12:17:25,467 >> Model config LlamaConfig {
  "_name_or_path": "unslothai/vram-24",
  "architectures": [
    "LlamaModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 0,
  "initializer_range": 0.02,
  "intermediate_size": 0,
  "max_position_embeddings": 0,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 0,
  "num_hidden_layers": 0,
  "num_key_value_heads": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 0
}

[INFO|modeling_utils.py:3634] 2024-09-19 12:17:25,853 >> loading weights file model.safetensors from cache at /home/marl/.cache/huggingface/hub/models--unslothai--vram-24/snapshots/61324ceeacd75b2b31f7a789a9c9d82058e6118c/model.safetensors
[INFO|configuration_utils.py:1038] 2024-09-19 12:17:25,854 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}
|
|
|
|
|
[INFO|configuration_utils.py:733] 2024-09-19 12:17:26,452 >> loading configuration file config.json from cache at /home/marl/.cache/huggingface/hub/models--unslothai--1/snapshots/7ec782b7604cd9ea0781c23a4270f031650f5617/config.json
[INFO|configuration_utils.py:800] 2024-09-19 12:17:26,453 >> Model config LlamaConfig {
  "_name_or_path": "unslothai/1",
  "architectures": [
    "LlamaModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 0,
  "initializer_range": 0.02,
  "intermediate_size": 0,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 0,
  "num_hidden_layers": 0,
  "num_key_value_heads": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 0
}

[INFO|modeling_utils.py:3634] 2024-09-19 12:17:26,819 >> loading weights file model.safetensors from cache at /home/marl/.cache/huggingface/hub/models--unslothai--1/snapshots/7ec782b7604cd9ea0781c23a4270f031650f5617/model.safetensors
[INFO|configuration_utils.py:1038] 2024-09-19 12:17:26,821 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}
|
|
|
|
|
[INFO|configuration_utils.py:731] 2024-09-19 12:17:26,821 >> loading configuration file /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693/config.json
[INFO|configuration_utils.py:800] 2024-09-19 12:17:26,822 >> Model config LlamaConfig {
  "_name_or_path": "/home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 128256
}
|
|
|
|
|
[INFO|configuration_utils.py:731] 2024-09-19 12:17:26,836 >> loading configuration file /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693/config.json
[INFO|configuration_utils.py:800] 2024-09-19 12:17:26,836 >> Model config LlamaConfig {
  "_name_or_path": "/home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 128256
}
|
|
|
|
|
[INFO|modeling_utils.py:3631] 2024-09-19 12:17:26,837 >> loading weights file /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693/model.safetensors.index.json
[INFO|modeling_utils.py:1572] 2024-09-19 12:17:26,838 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
[INFO|configuration_utils.py:1038] 2024-09-19 12:17:26,839 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ]
}
|
|
|
|
|
[INFO|modeling_utils.py:4463] 2024-09-19 12:17:30,294 >> All model checkpoint weights were used when initializing LlamaForCausalLM.

[INFO|modeling_utils.py:4471] 2024-09-19 12:17:30,294 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
[INFO|configuration_utils.py:991] 2024-09-19 12:17:30,297 >> loading configuration file /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693/generation_config.json
[INFO|configuration_utils.py:1038] 2024-09-19 12:17:30,297 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "temperature": 0.6,
  "top_p": 0.9
}
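These sampling defaults (temperature 0.6, top-p 0.9, three possible stop tokens) come from the model's generation_config.json and apply to any model.generate() call that does not override them; a minimal usage sketch:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto")
inputs = tok("Hello", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=32)  # picks up do_sample/temperature/top_p defaults
print(tok.decode(out[0], skip_special_tokens=True))
```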
|
|
|
|
|
[WARNING|logging.py:328] 2024-09-19 12:17:30,609 >> Unsloth: We successfully patched the tokenizer to add a {% if add_generation_prompt %} to the chat_template.
This is not a bug, but please notify the Unsloth maintainers - thanks!
[WARNING|logging.py:328] 2024-09-19 12:17:30,609 >> /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693 does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.
[INFO|checkpointing.py:103] 2024-09-19 12:17:31,099 >> Gradient checkpointing enabled.
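Gradient checkpointing trades extra forward compute for memory by recomputing activations during the backward pass instead of storing them. On a transformers model it is a single call; a sketch (the second line is commonly needed when a frozen base model feeds trainable adapters, as in this run):

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
model.gradient_checkpointing_enable()  # recompute activations on backward, saving VRAM
model.enable_input_require_grads()     # keep gradients flowing into checkpointed blocks
```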
|
|
|
[INFO|adapter.py:302] 2024-09-19 12:17:31,099 >> Upcasting trainable params to float32.
[INFO|adapter.py:158] 2024-09-19 12:17:31,099 >> Fine-tuning method: DoRA
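DoRA (Weight-Decomposed Low-Rank Adaptation) splits each pretrained weight into a magnitude vector and a direction matrix and applies the low-rank update to the direction component only. In PEFT it is LoRA plus one flag; a sketch with assumed rank and alpha (this run's actual hyperparameters are not shown in the log):

```python
from peft import LoraConfig, get_peft_model

dora_cfg = LoraConfig(
    r=16, lora_alpha=32,  # assumed values for illustration
    use_dora=True,        # enable the magnitude/direction decomposition
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],  # matches the modules found below
    task_type="CAUSAL_LM",
)
# model = get_peft_model(base_model, dora_cfg)
```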
|
|
|
[INFO|misc.py:51] 2024-09-19 12:17:31,100 >> Found linear modules: v_proj,k_proj,o_proj,down_proj,q_proj,gate_proj,up_proj
[WARNING|logging.py:328] 2024-09-19 12:17:32,743 >> Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters are not enabled or a bias term (like in Qwen) is used.
[WARNING|logging.py:328] 2024-09-19 12:17:32,744 >> Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters are not enabled or a bias term (like in Qwen) is used.
[WARNING|logging.py:328] 2024-09-19 12:17:32,744 >> Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters are not enabled or a bias term (like in Qwen) is used.
[WARNING|logging.py:328] 2024-09-19 12:17:32,745 >> Unsloth 2024.9 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.
[INFO|loader.py:196] 2024-09-19 12:17:33,762 >> trainable params: 43,319,296 || all params: 8,073,580,544 || trainable%: 0.5366
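The reported percentage is consistent with the counts: 43,319,296 / 8,073,580,544 ≈ 0.005366, i.e. the 0.5366% of parameters that the adapter makes trainable.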
|
|
|
[INFO|trainer.py:648] 2024-09-19 12:17:33,779 >> Using auto half precision backend
[WARNING|<string>:213] 2024-09-19 12:17:34,063 >> ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 30 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 10
 "-____-"     Number of trainable parameters = 43,319,296
|
|
|
[INFO|callbacks.py:137] 2024-09-19 12:17:34,578 >> Initial PiSSA adapter will be saved at: saves/LLaMA3-8B-Chat/lora/JudgeDredd/pissa_init.
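PiSSA initializes the adapter from the principal singular components of each target weight (via SVD) rather than the usual random-A/zero-B scheme, which is why the callback saves an initial adapter before training and a converted, LoRA-compatible one afterwards. In PEFT this corresponds to an init flag; a sketch with assumed settings:

```python
from peft import LoraConfig

pissa_cfg = LoraConfig(
    r=16, lora_alpha=16,                  # assumed for illustration
    init_lora_weights="pissa",            # SVD-based PiSSA initialization
    target_modules=["q_proj", "v_proj"],  # assumed target set
)
```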
|
|
|
[INFO|callbacks.py:310] 2024-09-19 12:18:40,414 >> {'loss': 1.2688, 'learning_rate': 2.5000e-05, 'epoch': 2.67, 'throughput': 228.45}
[INFO|callbacks.py:310] 2024-09-19 12:19:44,899 >> {'loss': 0.3328, 'learning_rate': 0.0000e+00, 'epoch': 5.33, 'throughput': 230.69}
[INFO|trainer.py:3503] 2024-09-19 12:19:44,900 >> Saving model checkpoint to saves/LLaMA3-8B-Chat/lora/JudgeDredd/checkpoint-10
[INFO|<string>:478] 2024-09-19 12:19:45,579 >>

Training completed. Do not forget to share your model on huggingface.co/models =)

[INFO|callbacks.py:153] 2024-09-19 12:19:45,581 >> Converted PiSSA adapter will be saved at: saves/LLaMA3-8B-Chat/lora/JudgeDredd/pissa_converted.
[INFO|trainer.py:3503] 2024-09-19 12:19:47,418 >> Saving model checkpoint to saves/LLaMA3-8B-Chat/lora/JudgeDredd
[WARNING|ploting.py:89] 2024-09-19 12:19:47,819 >> No metric eval_loss to plot.
[WARNING|ploting.py:89] 2024-09-19 12:19:47,819 >> No metric eval_accuracy to plot.
[INFO|modelcard.py:449] 2024-09-19 12:19:47,820 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
|
|
|
|