Crystalcareai
/

Quiet-Star-Custom

Text Generation

Transformers

Safetensors

quiet

custom_code

Model card Files Files and versions Community

Crystalcareai committed on Apr 2, 2024

Commit

2892566

verified ·

1 Parent(s): 05c4b5c

Update train-h100-sharegpt-sft.py

Browse files

Files changed (1) hide show

train-h100-sharegpt-sft.py +59 -49

train-h100-sharegpt-sft.py CHANGED Viewed

@@ -1,27 +1,26 @@
 import torch
 torch.backends.cuda.matmul.allow_tf32 = True
 import random
-from transformers import AutoTokenizer, AutoModelForCausalLM
 from datasets import load_dataset
 from transformers import TrainingArguments
 from trl import SFTTrainer
 from peft import LoraConfig
 import time
 random_seed = 42
 torch.manual_seed(random_seed)
 random.seed(random_seed)
-dataset = load_dataset("Crystalcareai/Self-Discover-MM-Instruct-openai", split="train_sft")
-n_ahead_talk_global = 4
-n_passes_global = 2
-n_ahead_global = 2
 n_examples = 0
 def model_init(params):
     original = False
     if params is None:
@@ -44,11 +43,10 @@ def model_init(params):
     model_id = "Crystalcareai/Quiet-Star-Custom"
     tokenizer_id = model_id
     print("Loading model")
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
         max_thoughts=n_ahead + n_ahead_talk + 1,
         merged_talk_heads=merged_talk_heads,
         merged_lm_and_talk_heads=False,
@@ -59,14 +57,12 @@ def model_init(params):
         use_complex_think_head=False,
         use_complex_talk_head=True,
         use_weighted_talk_head=True,
-        trust_remote_code=True,
-        # device_map="auto",
-        # load_in_4bit=True,
-        # attn_implementation="flash_attention_2",
     )
     print("Loaded model")
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id,truncation=True,padding_side="right")
     tokenizer.pad_token_id = tokenizer.eos_token_id
     special_tokens_to_add = []
@@ -76,12 +72,11 @@ def model_init(params):
         special_tokens_to_add.append("<|endthought|>")
     if special_tokens_to_add:
         tokenizer.add_special_tokens({"additional_special_tokens": special_tokens_to_add})
-        model.resize_token_embeddings(len(tokenizer))
     model.tokenizer = tokenizer
     for name, module in model.named_modules():
         if "embed" in name:
             print(module, flush=True)
     model.gumbel_detach = gumbel_detach
     model.include_policy_loss = include_policy_loss
     model.use_end_thought_token = use_end_thought_token
@@ -98,55 +93,70 @@ def model_init(params):
     model.train()
     return model
-max_seq_length = 1024
 run_id = int(time.time())
 training_args = TrainingArguments(
     output_dir="./out",
     num_train_epochs=3,
     per_device_train_batch_size=1,
     gradient_checkpointing=False,
-    gradient_accumulation_steps=6,
-    optim="lion_32bit",
     logging_steps=1,
     save_strategy="steps",
-    save_steps=25,
     bf16=True,
-    # fp16=True,
-    tf32=False,
-    # epsilson=1e-05,
-    # beta1=0.9,
-    # beta2=0.95,
-    # auto_find_batch_size=True
-    learning_rate=6e-05,
-    max_grad_norm=0.3,  # Gradient clipping with a maximum gradient norm of 0.3
-    warmup_ratio=0.06,
-    lr_scheduler_type="cosine",
     push_to_hub=False,
     report_to="wandb"
 )
-peft_config = LoraConfig(
-          r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
-    target_modules = ["q_proj", "k_proj"],
-    lora_alpha = 16,
-    lora_dropout = 0, # Supports any, but = 0 is optimized
-    bias = "none", # Enable Dora method
-    use_dora=True,
-)
 torch.autograd.set_detect_anomaly(True)
 model = model_init(None)  # Initialize the model
-tokenizer = model.tokenizer
-trainer = SFTTrainer(
     args=training_args,
     train_dataset=dataset,
     model=model,
-    peft_config=peft_config,
-    tokenizer=tokenizer,
     max_seq_length=max_seq_length,
 )
-trainer.train()

 import torch
 torch.backends.cuda.matmul.allow_tf32 = True
 import random
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline, AutoConfig, BitsAndBytesConfig
 from datasets import load_dataset
 from transformers import TrainingArguments
 from trl import SFTTrainer
 from peft import LoraConfig
+from torch.nn import CrossEntropyLoss
 import time
+import gc
 random_seed = 42
 torch.manual_seed(random_seed)
 random.seed(random_seed)
+dataset = load_dataset("HuggingFaceH4/orca-math-word-problems-200k", split="train_sft").select(range(1500))
+n_ahead_talk_global = 1
+n_passes_global = 1
+n_ahead_global = 8
 n_examples = 0
 def model_init(params):
     original = False
     if params is None:
     model_id = "Crystalcareai/Quiet-Star-Custom"
     tokenizer_id = model_id
     print("Loading model")
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
+        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
         max_thoughts=n_ahead + n_ahead_talk + 1,
         merged_talk_heads=merged_talk_heads,
         merged_lm_and_talk_heads=False,
         use_complex_think_head=False,
         use_complex_talk_head=True,
         use_weighted_talk_head=True,
+        trust_remote_code=True,
+        device_map="auto",
     )
     print("Loaded model")
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, truncation=True, padding_side="right")
     tokenizer.pad_token_id = tokenizer.eos_token_id
     special_tokens_to_add = []
         special_tokens_to_add.append("<|endthought|>")
     if special_tokens_to_add:
         tokenizer.add_special_tokens({"additional_special_tokens": special_tokens_to_add})
     model.tokenizer = tokenizer
     for name, module in model.named_modules():
         if "embed" in name:
             print(module, flush=True)
     model.gumbel_detach = gumbel_detach
     model.include_policy_loss = include_policy_loss
     model.use_end_thought_token = use_end_thought_token
     model.train()
     return model
+def clear_gpu_cache():
+    """Collect unreferenced Python objects, then release cached CUDA memory.
+
+    Garbage collection runs first so that tensors kept alive only by
+    reference cycles are freed before ``torch.cuda.empty_cache()`` releases
+    the allocator's unused cached blocks; in the opposite order that memory
+    would stay cached until the next call.
+    """
+    gc.collect()
+    torch.cuda.empty_cache()
+class CustomSFTTrainer(SFTTrainer):
+    """SFTTrainer that clears the GPU cache every ``cache_clear_step`` steps.
+
+    ``on_step_end`` is an event of the ``TrainerCallback`` interface, not a
+    method the Trainer invokes on itself, so merely defining it on a Trainer
+    subclass never runs it. The constructor therefore registers a small
+    callback that forwards the event to this instance.
+    """
+
+    def __init__(self, *args, **kwargs):
+        # Imported locally so the class brings its own dependency into scope.
+        from transformers import TrainerCallback
+
+        super().__init__(*args, **kwargs)
+        self.cache_clear_step = 6  # Clear the cache every 6 steps
+
+        trainer = self
+
+        class _StepEndForwarder(TrainerCallback):
+            """Relay the callback handler's step-end event to the trainer."""
+
+            def on_step_end(self, args, state, control, **kwargs):
+                return trainer.on_step_end(args, state, control, **kwargs)
+
+        self.add_callback(_StepEndForwarder())
+
+    def on_step_end(self, args, state, control, **kwargs):
+        """Clear the GPU cache when ``state.global_step`` is a multiple of ``cache_clear_step``.
+
+        Returns ``control`` unchanged so the training loop's flow is not altered.
+        """
+        if state.global_step % self.cache_clear_step == 0:
+            clear_gpu_cache()
+        return control
+max_seq_length = 8092  # NOTE(review): possibly meant 8192 — confirm
 run_id = int(time.time())  # Unix timestamp; not referenced again in the lines shown here
 training_args = TrainingArguments(
     output_dir="./out",
     num_train_epochs=3,
     per_device_train_batch_size=1,
     gradient_checkpointing=False,
+    gradient_accumulation_steps=16,  # with batch size 1: 16 micro-batches per optimizer step
+    optim="galore_adamw",
+    optim_target_modules=[ r".*mlp.*"],  # regex selecting the modules the optimizer setting targets
+    # optim="adamw_torch_fused",
     logging_steps=1,
     save_strategy="steps",
+    save_steps=1000,  # NOTE(review): 1500 examples / 16 accumulation x 3 epochs is roughly 280 steps on one device, so this may never trigger — confirm
+    max_steps=-1,
     bf16=True,
+    tf32=True,
+    learning_rate=2e-10,  # NOTE(review): extremely small learning rate; confirm this is intended
+    max_grad_norm=1.0,
+    warmup_steps=20,  # NOTE(review): the plain "constant" schedule below appears not to apply warmup ("constant_with_warmup" does) — confirm
+    lr_scheduler_type="constant",
     push_to_hub=False,
     report_to="wandb"
 )
+# peft_config = LoraConfig(
+#     r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+#     target_modules =["q_proj", "v_proj"],
+#     lora_alpha = 32,
+#     lora_dropout = 0, # Supports any, but = 0 is optimized
+#     bias = "none",
+#     use_dora=True,
+# )
 torch.autograd.set_detect_anomaly(True)  # NOTE(review): autograd anomaly detection is a debugging aid and slows training — confirm it should stay on
+# Pick CUDA when available, otherwise CPU. NOTE(review): `device` is not referenced in the lines shown after this point.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = model_init(None)  # Initialize the model
+tokenizer = model.tokenizer
+trainer = CustomSFTTrainer(
     args=training_args,
     train_dataset=dataset,
     model=model,
+    tokenizer=tokenizer,
     max_seq_length=max_seq_length,
+    # peft_config=peft_config,
 )
+trainer.train()