Crystalcareai
committed on
Update train-h100-sharegpt-sft.py
train-h100-sharegpt-sft.py CHANGED (+75 -51)
@@ -4,6 +4,7 @@ import random
 from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline, AutoConfig, BitsAndBytesConfig
 from datasets import load_dataset
 from transformers import TrainingArguments
+from accelerate import infer_auto_device_map, init_empty_weights, dispatch_model
 from trl import SFTTrainer
 from peft import LoraConfig
 from torch.nn import CrossEntropyLoss
@@ -14,12 +15,14 @@ random_seed = 42
 torch.manual_seed(random_seed)
 random.seed(random_seed)
 
-dataset = load_dataset("HuggingFaceH4/orca-math-word-problems-200k", split="train_sft").select(range(
+dataset = load_dataset("HuggingFaceH4/orca-math-word-problems-200k", split="train_sft").select(range(3000))
 
-
+
+n_ahead_talk_global = 2
 n_passes_global = 1
-n_ahead_global =
-n_examples =
+n_ahead_global = 8
+# n_examples = 1000
+# full_batch_size = 8
 
 def model_init(params):
     original = False
@@ -93,77 +96,98 @@ def model_init(params):
     model.train()
     return model
 
-
-# torch.cuda.empty_cache()
-# gc.collect()
-
-# class CustomSFTTrainer(SFTTrainer):
-#     def __init__(self, *args, **kwargs):
-#         super().__init__(*args, **kwargs)
-#         self.cache_clear_step = 6
-#         self.gradient_scale = 0.1 # Scaling factor for gradients
-
-#     def on_step_end(self, args, state, control, **kwargs):
-#         if state.global_step % self.cache_clear_step == 0:
-#             clear_gpu_cache()
-#         return super().on_step_end(args, state, control, **kwargs)
-
-#     def compute_loss(self, model, inputs, return_outputs=False):
-#         loss = super().compute_loss(model, inputs, return_outputs=return_outputs)
-#         scaled_loss = loss * self.gradient_scale
-#         return (scaled_loss, loss.detach()) if return_outputs else scaled_loss
-
-max_seq_length = 4092
+max_seq_length = 8192
 run_id = int(time.time())
 training_args = TrainingArguments(
     output_dir="./out",
-    num_train_epochs=
+    num_train_epochs=1,
     per_device_train_batch_size=1,
     gradient_checkpointing=False,
-    gradient_accumulation_steps=
-
-    # optim_target_modules=[r".*attn.*", r".*mlp.*"],
-    optim="adamw_torch_fused",
+    gradient_accumulation_steps=1,
+    optim="lion_32bit",
     logging_steps=1,
     save_strategy="steps",
-    save_steps=
+    save_steps=100,
     max_steps=-1,
+    # auto_find_batch_size=True,
+    weight_decay=0.001,
     bf16=True,
+
     tf32=True,
-    learning_rate=
-    max_grad_norm=0
+    learning_rate=1e-07,
+    max_grad_norm=0,
     warmup_steps=20,
-    lr_scheduler_type="
+    lr_scheduler_type="constant",
     push_to_hub=False,
     report_to="wandb"
 )
 
-
-
-
-
-
-
-
-
+peft_config = LoraConfig(
+    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+    target_modules =["up_proj", "down_proj", "gate_proj"],
+    lora_alpha = 32,
+    lora_dropout = 0, # Supports any, but = 0 is optimized
+    bias = "none",
+    use_dora=False,
+    task_type="CAUSAL_LM"
+)
 
 torch.autograd.set_detect_anomaly(True)
 
-
+class CustomSFTTrainer(SFTTrainer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.beta = 0.9 # momentum factor
+        self.clip_factor = 1.0 # clipping factor
+        self.moving_avg = 0.0
+
+    def training_step(self, model, inputs):
+        model.train()
+        inputs = self._prepare_inputs(inputs)
+
+        outputs = model(**inputs)
+        loss = outputs.loss if isinstance(outputs, dict) else outputs[0]
+
+        if self.args.gradient_accumulation_steps > 1:
+            loss = loss / self.args.gradient_accumulation_steps
+
+        loss.backward()
+
+        # Compute gradients and their norm
+        grad_norm = torch.sqrt(sum(p.grad.data.norm().to(model.device)**2 for p in model.parameters() if p.grad is not None))
+
+        # Update moving average and apply gradient clipping
+        if self.state.global_step == 0:
+            self.moving_avg = grad_norm
+        else:
+            self.moving_avg = self.beta * self.moving_avg + (1 - self.beta) * grad_norm
+
+        if grad_norm > self.clip_factor * self.moving_avg:
+            clip_coef = (self.clip_factor * self.moving_avg / grad_norm).item()
+            for param in model.parameters():
+                if param.grad is not None:
+                    param.grad.data.mul_(clip_coef)
+
+        if (self.state.global_step + 1) % self.args.gradient_accumulation_steps == 0:
+            self.optimizer.step()
+            self.lr_scheduler.step()
+            model.zero_grad()
+            self.state.global_step += 1
+
+        # Return the loss as a Tensor
+        return loss
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-model = model_init(None)
-
-tokenizer = model.tokenizer
+model = model_init(None)
 
-trainer =
+trainer = CustomSFTTrainer(
+    model=model,
     args=training_args,
     train_dataset=dataset,
-
-    tokenizer=tokenizer,
+    tokenizer=model.tokenizer,
     max_seq_length=max_seq_length,
-
-    # peft_config=peft_config,
+    peft_config=peft_config,
 )
 
 trainer.train()
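For reference, the clipping rule added in CustomSFTTrainer.training_step scales gradients against an exponential moving average of the global gradient norm instead of a fixed threshold (the config sets max_grad_norm=0). Below is a minimal standalone sketch of that rule; the helper name and the toy model are illustrative, not part of the commit.

import torch
from torch import nn

def clip_grads_to_moving_avg_(model, moving_avg, beta=0.9, clip_factor=1.0, step=0):
    """Scale gradients in place when their global L2 norm exceeds
    clip_factor times an exponential moving average of past norms.
    Returns the updated moving average (same logic as the training_step above)."""
    # Global L2 norm over all parameter gradients.
    grad_norm = torch.sqrt(sum(
        p.grad.detach().norm() ** 2
        for p in model.parameters() if p.grad is not None
    ))

    # Seed the running average on the first step, then update it.
    if step == 0:
        moving_avg = grad_norm
    else:
        moving_avg = beta * moving_avg + (1 - beta) * grad_norm

    # Rescale gradients if the current norm is above the adaptive threshold.
    if grad_norm > clip_factor * moving_avg:
        clip_coef = (clip_factor * moving_avg / grad_norm).item()
        for p in model.parameters():
            if p.grad is not None:
                p.grad.mul_(clip_coef)

    return moving_avg

if __name__ == "__main__":
    # Tiny toy model, just to exercise the helper.
    model = nn.Linear(4, 1)
    moving_avg = 0.0
    for step in range(3):
        loss = model(torch.randn(8, 4)).pow(2).mean()
        loss.backward()
        moving_avg = clip_grads_to_moving_avg_(model, moving_avg, step=step)
        model.zero_grad()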