Crystalcareai committed
Commit 2892566 · verified · 1 Parent(s): 05c4b5c

Update train-h100-sharegpt-sft.py

Files changed (1)
  1. train-h100-sharegpt-sft.py +59 -49
train-h100-sharegpt-sft.py CHANGED
@@ -1,27 +1,26 @@
 import torch
 torch.backends.cuda.matmul.allow_tf32 = True
 import random
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline, AutoConfig, BitsAndBytesConfig
 from datasets import load_dataset
 from transformers import TrainingArguments
 from trl import SFTTrainer
 from peft import LoraConfig
-
-
+from torch.nn import CrossEntropyLoss
 import time
+import gc
+
 random_seed = 42
 torch.manual_seed(random_seed)
 random.seed(random_seed)
 
-dataset = load_dataset("Crystalcareai/Self-Discover-MM-Instruct-openai", split="train_sft")
+dataset = load_dataset("HuggingFaceH4/orca-math-word-problems-200k", split="train_sft").select(range(1500))
 
-n_ahead_talk_global = 4
-n_passes_global = 2
-n_ahead_global = 2
+n_ahead_talk_global = 1
+n_passes_global = 1
+n_ahead_global = 8
 n_examples = 0
 
-
-
 def model_init(params):
     original = False
     if params is None:
@@ -44,11 +43,10 @@ def model_init(params):
     model_id = "Crystalcareai/Quiet-Star-Custom"
     tokenizer_id = model_id
     print("Loading model")
-
-
+
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
         max_thoughts=n_ahead + n_ahead_talk + 1,
         merged_talk_heads=merged_talk_heads,
         merged_lm_and_talk_heads=False,
@@ -59,14 +57,12 @@ def model_init(params):
         use_complex_think_head=False,
         use_complex_talk_head=True,
         use_weighted_talk_head=True,
-        trust_remote_code=True,
-        # device_map="auto",
-        # load_in_4bit=True,
-        # attn_implementation="flash_attention_2",
+        trust_remote_code=True,
+        device_map="auto",
     )
     print("Loaded model")
-
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, truncation=True, padding_side="right")
+
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, truncation=True, padding_side="right")
     tokenizer.pad_token_id = tokenizer.eos_token_id
 
     special_tokens_to_add = []
@@ -76,12 +72,11 @@ def model_init(params):
         special_tokens_to_add.append("<|endthought|>")
     if special_tokens_to_add:
         tokenizer.add_special_tokens({"additional_special_tokens": special_tokens_to_add})
-    model.resize_token_embeddings(len(tokenizer))
     model.tokenizer = tokenizer
     for name, module in model.named_modules():
         if "embed" in name:
             print(module, flush=True)
-
+
     model.gumbel_detach = gumbel_detach
     model.include_policy_loss = include_policy_loss
     model.use_end_thought_token = use_end_thought_token
@@ -98,55 +93,70 @@ def model_init(params):
     model.train()
     return model
 
-max_seq_length = 1024
+def clear_gpu_cache():
+    torch.cuda.empty_cache()
+    gc.collect()
+
+class CustomSFTTrainer(SFTTrainer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.cache_clear_step = 6  # Clear the GPU cache every 6 steps
+
+    def on_step_end(self, args, state, control, **kwargs):
+        if state.global_step % self.cache_clear_step == 0:
+            clear_gpu_cache()
+        return super().on_step_end(args, state, control, **kwargs)
+
+max_seq_length = 8092
 run_id = int(time.time())
 training_args = TrainingArguments(
     output_dir="./out",
     num_train_epochs=3,
     per_device_train_batch_size=1,
     gradient_checkpointing=False,
-    gradient_accumulation_steps=6,
-    optim="lion_32bit",
+    gradient_accumulation_steps=16,
+    optim="galore_adamw",
+    optim_target_modules=[r".*mlp.*"],
+    # optim="adamw_torch_fused",
     logging_steps=1,
     save_strategy="steps",
-    save_steps=25,
+    save_steps=1000,
+    max_steps=-1,
     bf16=True,
-    # fp16=True,
-    tf32=False,
-    # epsilson=1e-05,
-    # beta1=0.9,
-    # beta2=0.95,
-    # auto_find_batch_size=True
-    learning_rate=6e-05,
-    max_grad_norm=0.3,  # Gradient clipping with a maximum gradient norm of 0.3
-    warmup_ratio=0.06,
-    lr_scheduler_type="cosine",
+    tf32=True,
+    learning_rate=2e-10,
+    max_grad_norm=1.0,
+    warmup_steps=20,
+    lr_scheduler_type="constant",
     push_to_hub=False,
     report_to="wandb"
-
 )
 
-peft_config = LoraConfig(
-    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
-    target_modules = ["q_proj", "k_proj"],
-    lora_alpha = 16,
-    lora_dropout = 0, # Supports any, but = 0 is optimized
-    bias = "none", # Enable Dora method
-    use_dora=True,
-)
+# peft_config = LoraConfig(
+#     r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+#     target_modules = ["q_proj", "v_proj"],
+#     lora_alpha = 32,
+#     lora_dropout = 0, # Supports any, but = 0 is optimized
+#     bias = "none",
+#     use_dora=True,
+# )
 
 torch.autograd.set_detect_anomaly(True)
+
+# Set the device for each process
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
 model = model_init(None)  # Initialize the model
 
-tokenizer = model.tokenizer
-
-trainer = SFTTrainer(
+tokenizer = model.tokenizer
+
+trainer = CustomSFTTrainer(
     args=training_args,
     train_dataset=dataset,
     model=model,
-    peft_config=peft_config,
-    tokenizer=tokenizer,
+    tokenizer=tokenizer,
     max_seq_length=max_seq_length,
+    # peft_config=peft_config,
 )
 
-trainer.train()
+trainer.train()
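
The updated script frees GPU memory by overriding on_step_end on the SFTTrainer subclass; in transformers, that hook is defined on TrainerCallback objects rather than on the trainer itself, so the training loop does not invoke the override as written. Below is a minimal sketch of the same cache-clearing behaviour expressed as a callback; ClearCacheCallback and every_n_steps are illustrative names, not part of the commit.

# Minimal sketch (assumption: not part of the commit) of the cache clearing
# as a transformers TrainerCallback, the hook the Trainer actually calls.
import gc

import torch
from transformers import TrainerCallback


class ClearCacheCallback(TrainerCallback):
    """Illustrative callback: free cached GPU memory every N optimizer steps."""

    def __init__(self, every_n_steps: int = 6):  # 6 mirrors cache_clear_step above
        self.every_n_steps = every_n_steps

    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % self.every_n_steps == 0:
            gc.collect()
            torch.cuda.empty_cache()
        return control


# Hypothetical usage with the trainer built in the script:
# trainer = SFTTrainer(..., callbacks=[ClearCacheCallback(every_n_steps=6)])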
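
The new TrainingArguments switch the optimizer to GaLore (optim="galore_adamw") and target modules with the regex r".*mlp.*". The sketch below is a hypothetical sanity check, assumed to run after model = model_init(None), that previews which module names the pattern would pick up; transformers does its own matching internally, so this is only a coverage preview.

# Illustrative only: list module names matched by the GaLore target regex,
# so the intended MLP projections are known to be covered before training.
import re

galore_pattern = re.compile(r".*mlp.*")
matched = [name for name, _ in model.named_modules() if galore_pattern.fullmatch(name)]
print(f"{len(matched)} modules matched by optim_target_modules:")
for name in matched[:20]:  # print only the first few for readability
    print(" ", name)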