Crystalcareai
committed on
Update train-h100-sharegpt-sft.py
train-h100-sharegpt-sft.py CHANGED (+75 -51)
@@ -4,6 +4,7 @@ import random
 from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline, AutoConfig, BitsAndBytesConfig
 from datasets import load_dataset
 from transformers import TrainingArguments
+from accelerate import infer_auto_device_map, init_empty_weights, dispatch_model
 from trl import SFTTrainer
 from peft import LoraConfig
 from torch.nn import CrossEntropyLoss
@@ -14,12 +15,14 @@ random_seed = 42
 torch.manual_seed(random_seed)
 random.seed(random_seed)
 
-dataset = load_dataset("HuggingFaceH4/orca-math-word-problems-200k", split="train_sft").select(range(
+dataset = load_dataset("HuggingFaceH4/orca-math-word-problems-200k", split="train_sft").select(range(3000))
 
-
+
+n_ahead_talk_global = 2
 n_passes_global = 1
-n_ahead_global =
-n_examples =
+n_ahead_global = 8
+# n_examples = 1000
+# full_batch_size = 8
 
 def model_init(params):
     original = False
@@ -93,77 +96,98 @@ def model_init(params):
     model.train()
     return model
 
-
-# torch.cuda.empty_cache()
-# gc.collect()
-
-# class CustomSFTTrainer(SFTTrainer):
-#     def __init__(self, *args, **kwargs):
-#         super().__init__(*args, **kwargs)
-#         self.cache_clear_step = 6
-#         self.gradient_scale = 0.1 # Scaling factor for gradients
-
-#     def on_step_end(self, args, state, control, **kwargs):
-#         if state.global_step % self.cache_clear_step == 0:
-#             clear_gpu_cache()
-#         return super().on_step_end(args, state, control, **kwargs)
-
-#     def compute_loss(self, model, inputs, return_outputs=False):
-#         loss = super().compute_loss(model, inputs, return_outputs=return_outputs)
-#         scaled_loss = loss * self.gradient_scale
-#         return (scaled_loss, loss.detach()) if return_outputs else scaled_loss
-
-max_seq_length = 4092
+max_seq_length = 8192
 run_id = int(time.time())
 training_args = TrainingArguments(
     output_dir="./out",
-    num_train_epochs=
+    num_train_epochs=1,
     per_device_train_batch_size=1,
     gradient_checkpointing=False,
-    gradient_accumulation_steps=
-
-    # optim_target_modules=[r".*attn.*", r".*mlp.*"],
-    optim="adamw_torch_fused",
+    gradient_accumulation_steps=1,
+    optim="lion_32bit",
     logging_steps=1,
     save_strategy="steps",
-    save_steps=
+    save_steps=100,
     max_steps=-1,
+    # auto_find_batch_size=True,
+    weight_decay=0.001,
     bf16=True,
+
     tf32=True,
-    learning_rate=
-    max_grad_norm=0
+    learning_rate=1e-07,
+    max_grad_norm=0,
     warmup_steps=20,
-    lr_scheduler_type="
+    lr_scheduler_type="constant",
     push_to_hub=False,
     report_to="wandb"
 )
 
-
-
-
-
-
-
-
-
+peft_config = LoraConfig(
+    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+    target_modules =["up_proj", "down_proj", "gate_proj"],
+    lora_alpha = 32,
+    lora_dropout = 0, # Supports any, but = 0 is optimized
+    bias = "none",
+    use_dora=False,
+    task_type="CAUSAL_LM"
+)
 
 torch.autograd.set_detect_anomaly(True)
 
-
+class CustomSFTTrainer(SFTTrainer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.beta = 0.9 # momentum factor
+        self.clip_factor = 1.0 # clipping factor
+        self.moving_avg = 0.0
+
+    def training_step(self, model, inputs):
+        model.train()
+        inputs = self._prepare_inputs(inputs)
+
+        outputs = model(**inputs)
+        loss = outputs.loss if isinstance(outputs, dict) else outputs[0]
+
+        if self.args.gradient_accumulation_steps > 1:
+            loss = loss / self.args.gradient_accumulation_steps
+
+        loss.backward()
+
+        # Compute gradients and their norm
+        grad_norm = torch.sqrt(sum(p.grad.data.norm().to(model.device)**2 for p in model.parameters() if p.grad is not None))
+
+        # Update moving average and apply gradient clipping
+        if self.state.global_step == 0:
+            self.moving_avg = grad_norm
+        else:
+            self.moving_avg = self.beta * self.moving_avg + (1 - self.beta) * grad_norm
+
+        if grad_norm > self.clip_factor * self.moving_avg:
+            clip_coef = (self.clip_factor * self.moving_avg / grad_norm).item()
+            for param in model.parameters():
+                if param.grad is not None:
+                    param.grad.data.mul_(clip_coef)
+
+        if (self.state.global_step + 1) % self.args.gradient_accumulation_steps == 0:
+            self.optimizer.step()
+            self.lr_scheduler.step()
+            model.zero_grad()
+            self.state.global_step += 1
+
+        # Return the loss as a Tensor
+        return loss
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-model = model_init(None)
-
-tokenizer = model.tokenizer
+model = model_init(None)
 
-trainer =
+trainer = CustomSFTTrainer(
+    model=model,
     args=training_args,
     train_dataset=dataset,
-
-    tokenizer=tokenizer,
+    tokenizer=model.tokenizer,
     max_seq_length=max_seq_length,
-
-    # peft_config=peft_config,
+    peft_config=peft_config,
 )
 
 trainer.train()
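For reference, the clipping rule added in CustomSFTTrainer.training_step scales gradients against an exponential moving average of the global gradient norm instead of a fixed threshold (the config sets max_grad_norm=0). Below is a minimal standalone sketch of that rule; the helper name and the toy model are illustrative, not part of the commit.

import torch
from torch import nn

def clip_grads_to_moving_avg_(model, moving_avg, beta=0.9, clip_factor=1.0, step=0):
    """Scale gradients in place when their global L2 norm exceeds
    clip_factor times an exponential moving average of past norms.
    Returns the updated moving average (same logic as the training_step above)."""
    # Global L2 norm over all parameter gradients.
    grad_norm = torch.sqrt(sum(
        p.grad.detach().norm() ** 2
        for p in model.parameters() if p.grad is not None
    ))

    # Seed the running average on the first step, then update it.
    if step == 0:
        moving_avg = grad_norm
    else:
        moving_avg = beta * moving_avg + (1 - beta) * grad_norm

    # Rescale gradients if the current norm is above the adaptive threshold.
    if grad_norm > clip_factor * moving_avg:
        clip_coef = (clip_factor * moving_avg / grad_norm).item()
        for p in model.parameters():
            if p.grad is not None:
                p.grad.mul_(clip_coef)

    return moving_avg

if __name__ == "__main__":
    # Tiny toy model, just to exercise the helper.
    model = nn.Linear(4, 1)
    moving_avg = 0.0
    for step in range(3):
        loss = model(torch.randn(8, 4)).pow(2).mean()
        loss.backward()
        moving_avg = clip_grads_to_moving_avg_(model, moving_avg, step=step)
        model.zero_grad()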