Crystalcareai committed (verified)
Commit ce8a2da · 1 Parent(s): 1833729

Update train-h100-sharegpt-sft.py

Files changed (1):
  1. train-h100-sharegpt-sft.py (+75 -51)
train-h100-sharegpt-sft.py CHANGED
@@ -4,6 +4,7 @@ import random
  from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline, AutoConfig, BitsAndBytesConfig
  from datasets import load_dataset
  from transformers import TrainingArguments
+ from accelerate import infer_auto_device_map, init_empty_weights, dispatch_model
  from trl import SFTTrainer
  from peft import LoraConfig
  from torch.nn import CrossEntropyLoss
@@ -14,12 +15,14 @@ random_seed = 42
  torch.manual_seed(random_seed)
  random.seed(random_seed)

- dataset = load_dataset("HuggingFaceH4/orca-math-word-problems-200k", split="train_sft").select(range(1500))
+ dataset = load_dataset("HuggingFaceH4/orca-math-word-problems-200k", split="train_sft").select(range(3000))

- n_ahead_talk_global = 4
+
+ n_ahead_talk_global = 2
  n_passes_global = 1
- n_ahead_global = 4
- n_examples = 0
+ n_ahead_global = 8
+ # n_examples = 1000
+ # full_batch_size = 8

  def model_init(params):
      original = False
@@ -93,77 +96,98 @@ def model_init(params):
      model.train()
      return model

- # def clear_gpu_cache():
- #     torch.cuda.empty_cache()
- #     gc.collect()
-
- # class CustomSFTTrainer(SFTTrainer):
- #     def __init__(self, *args, **kwargs):
- #         super().__init__(*args, **kwargs)
- #         self.cache_clear_step = 6
- #         self.gradient_scale = 0.1 # Scaling factor for gradients
-
- #     def on_step_end(self, args, state, control, **kwargs):
- #         if state.global_step % self.cache_clear_step == 0:
- #             clear_gpu_cache()
- #         return super().on_step_end(args, state, control, **kwargs)
-
- #     def compute_loss(self, model, inputs, return_outputs=False):
- #         loss = super().compute_loss(model, inputs, return_outputs=return_outputs)
- #         scaled_loss = loss * self.gradient_scale
- #         return (scaled_loss, loss.detach()) if return_outputs else scaled_loss
-
- max_seq_length = 4092
+ max_seq_length = 8192
  run_id = int(time.time())
  training_args = TrainingArguments(
      output_dir="./out",
-     num_train_epochs=3,
+     num_train_epochs=1,
      per_device_train_batch_size=1,
      gradient_checkpointing=False,
-     gradient_accumulation_steps=4,
-     # optim="galore_adamw",
-     # optim_target_modules=[r".*attn.*", r".*mlp.*"],
-     optim="adamw_torch_fused",
+     gradient_accumulation_steps=1,
+     optim="lion_32bit",
      logging_steps=1,
      save_strategy="steps",
-     save_steps=1000,
+     save_steps=100,
      max_steps=-1,
+     # auto_find_batch_size=True,
+     weight_decay=0.001,
      bf16=True,
+
      tf32=True,
-     learning_rate=2e-10,
-     max_grad_norm=0.1,
+     learning_rate=1e-07,
+     max_grad_norm=0,
      warmup_steps=20,
-     lr_scheduler_type="cosine",
+     lr_scheduler_type="constant",
      push_to_hub=False,
      report_to="wandb"
  )

- # peft_config = LoraConfig(
- #     r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
- #     target_modules =["q_proj", "v_proj", "o_proj", "k_proj"],
- #     lora_alpha = 32,
- #     lora_dropout = 0, # Supports any, but = 0 is optimized
- #     bias = "none",
- #     use_dora=True,
- # )
+ peft_config = LoraConfig(
+     r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+     target_modules =["up_proj", "down_proj", "gate_proj"],
+     lora_alpha = 32,
+     lora_dropout = 0, # Supports any, but = 0 is optimized
+     bias = "none",
+     use_dora=False,
+     task_type="CAUSAL_LM"
+ )

  torch.autograd.set_detect_anomaly(True)

- # Set the device for each process
+ class CustomSFTTrainer(SFTTrainer):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.beta = 0.9 # momentum factor
+         self.clip_factor = 1.0 # clipping factor
+         self.moving_avg = 0.0
+
+     def training_step(self, model, inputs):
+         model.train()
+         inputs = self._prepare_inputs(inputs)
+
+         outputs = model(**inputs)
+         loss = outputs.loss if isinstance(outputs, dict) else outputs[0]
+
+         if self.args.gradient_accumulation_steps > 1:
+             loss = loss / self.args.gradient_accumulation_steps
+
+         loss.backward()
+
+         # Compute gradients and their norm
+         grad_norm = torch.sqrt(sum(p.grad.data.norm().to(model.device)**2 for p in model.parameters() if p.grad is not None))
+
+         # Update moving average and apply gradient clipping
+         if self.state.global_step == 0:
+             self.moving_avg = grad_norm
+         else:
+             self.moving_avg = self.beta * self.moving_avg + (1 - self.beta) * grad_norm
+
+         if grad_norm > self.clip_factor * self.moving_avg:
+             clip_coef = (self.clip_factor * self.moving_avg / grad_norm).item()
+             for param in model.parameters():
+                 if param.grad is not None:
+                     param.grad.data.mul_(clip_coef)
+
+         if (self.state.global_step + 1) % self.args.gradient_accumulation_steps == 0:
+             self.optimizer.step()
+             self.lr_scheduler.step()
+             model.zero_grad()
+         self.state.global_step += 1
+
+         # Return the loss as a Tensor
+         return loss
+
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

- model = model_init(None) # Initialize the model
-
- tokenizer = model.tokenizer
+ model = model_init(None)

- trainer = SFTTrainer(
+ trainer = CustomSFTTrainer(
+     model=model,
      args=training_args,
      train_dataset=dataset,
-     model=model,
-     tokenizer=tokenizer,
+     tokenizer=model.tokenizer,
      max_seq_length=max_seq_length,
-     # neftune_noise_alpha=5,
-     # peft_config=peft_config,
+     peft_config=peft_config,
  )

  trainer.train()
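
The main algorithmic change in this commit is the new CustomSFTTrainer.training_step, which clips gradients against an exponential moving average (EMA) of the global gradient norm instead of a fixed threshold. The sketch below is a minimal, self-contained re-statement of that clipping rule on a toy torch model; the clip_to_ema_ helper and the toy linear model are hypothetical names used only for illustration and are not part of the committed script.

import torch

def clip_to_ema_(parameters, moving_avg, beta=0.9, clip_factor=1.0, step=0):
    """Scale gradients so their global L2 norm stays near an EMA of past norms.

    Mirrors the beta / clip_factor / moving_avg fields used in the committed
    CustomSFTTrainer; this is a simplified sketch, not the exact implementation.
    Returns the updated moving average.
    """
    params = [p for p in parameters if p.grad is not None]
    # Global L2 norm over all gradients, as in the committed training_step.
    grad_norm = torch.sqrt(sum(p.grad.detach().norm() ** 2 for p in params))

    # Seed the EMA with the first observed norm, then decay it afterwards.
    if step == 0:
        moving_avg = grad_norm
    else:
        moving_avg = beta * moving_avg + (1 - beta) * grad_norm

    # Rescale only when the current norm exceeds clip_factor times the EMA.
    if grad_norm > clip_factor * moving_avg:
        clip_coef = (clip_factor * moving_avg / grad_norm).item()
        for p in params:
            p.grad.mul_(clip_coef)
    return moving_avg


if __name__ == "__main__":
    model = torch.nn.Linear(4, 1)            # toy stand-in for the causal LM
    opt = torch.optim.SGD(model.parameters(), lr=1e-3)
    moving_avg = 0.0
    for step in range(5):
        x = torch.randn(8, 4)
        loss = model(x).pow(2).mean()         # dummy loss
        loss.backward()
        moving_avg = clip_to_ema_(model.parameters(), moving_avg,
                                  beta=0.9, clip_factor=1.0, step=step)
        opt.step()
        opt.zero_grad()

Since the updated TrainingArguments also set max_grad_norm=0, this EMA-based rule appears intended to replace the Trainer's built-in fixed-threshold clipping for this run.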