Crystalcareai committed (verified)
Commit 21a96f6 · 1 Parent(s): fe73328

Rename train.py to train-h100-sharegpt-sft.py

train.py → train-h100-sharegpt-sft.py RENAMED
@@ -44,9 +44,11 @@ def model_init(params):
     model_id = "Crystalcareai/Quiet-Star-Custom"
     tokenizer_id = model_id
     print("Loading model")
+
+
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
-        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
         max_thoughts=n_ahead + n_ahead_talk + 1,
         merged_talk_heads=merged_talk_heads,
         merged_lm_and_talk_heads=False,
@@ -58,14 +60,13 @@ def model_init(params):
         use_complex_talk_head=True,
         use_weighted_talk_head=True,
         trust_remote_code=True,
-        device_map="auto",
+        # device_map="auto",
         # load_in_4bit=True,
         # attn_implementation="flash_attention_2",
     )
     print("Loaded model")
 
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
-    tokenizer.padding_side = 'left' # Adjust padding side to 'left' to avoid batch generation issues with Flash Attention
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id, truncation=True, padding_side="right")
     tokenizer.pad_token_id = tokenizer.eos_token_id
 
     special_tokens_to_add = []
@@ -97,42 +98,42 @@ def model_init(params):
     model.train()
     return model
 
-max_seq_length = 2048
+max_seq_length = 1024
 run_id = int(time.time())
 training_args = TrainingArguments(
     output_dir="./out",
     num_train_epochs=3,
     per_device_train_batch_size=1,
     gradient_checkpointing=False,
-    gradient_accumulation_steps=16,
-    optim="adamw_torch_fused",
+    gradient_accumulation_steps=6,
+    optim="lion_32bit",
     logging_steps=1,
     save_strategy="steps",
-    save_steps=300,
+    save_steps=25,
     bf16=True,
+    # fp16=True,
     tf32=False,
     # epsilson=1e-05,
     # beta1=0.9,
     # beta2=0.95,
     # auto_find_batch_size=True
-    learning_rate=2e-07,
-    max_grad_norm=1.0, # Gradient clipping with a maximum gradient norm of 0.3
-    warmup_steps=10,
+    learning_rate=6e-05,
+    max_grad_norm=0.3, # Gradient clipping with a maximum gradient norm of 0.3
+    warmup_ratio=0.06,
     lr_scheduler_type="cosine",
     push_to_hub=False,
     report_to="wandb"
 
 )
 
-# peft_config = LoraConfig(
-#     r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
-#     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
-#                       "gate_proj", "up_proj", "down_proj",],
-#     lora_alpha = 16,
-#     lora_dropout = 0, # Supports any, but = 0 is optimized
-#     bias = "none", # Enable Dora method
-#     use_dora=True,
-# )
+peft_config = LoraConfig(
+    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+    target_modules = ["q_proj", "k_proj"],
+    lora_alpha = 16,
+    lora_dropout = 0, # Supports any, but = 0 is optimized
+    bias = "none", # Enable Dora method
+    use_dora=True,
+)
 
 torch.autograd.set_detect_anomaly(True)
 model = model_init(None) # Initialize the model
@@ -143,7 +144,7 @@ trainer = SFTTrainer(
     args=training_args,
     train_dataset=dataset,
     model=model,
-    # peft_config=peft_config,
+    peft_config=peft_config,
     tokenizer=tokenizer,
     max_seq_length=max_seq_length,
 )
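
For context, the sketch below shows how the pieces this commit switches to could fit together on their own: a float16 base model, a right-padded tokenizer, the 32-bit Lion optimizer, and a DoRA-enabled LoRA adapter handed to trl's SFTTrainer. It is illustrative only, not the full train-h100-sharegpt-sft.py: the dataset and dataset_text_field are placeholders standing in for the script's ShareGPT-style SFT data, optim="lion_32bit" assumes bitsandbytes is installed, and the tokenizer=/max_seq_length= arguments assume a trl version with the same SFTTrainer signature the script uses.

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer

model_id = "Crystalcareai/Quiet-Star-Custom"

# Load the base model in float16 on GPU (float32 on CPU), as in the updated script.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True,
)

# Right-side padding and EOS as the pad token, matching the new tokenizer setup.
tokenizer = AutoTokenizer.from_pretrained(model_id, truncation=True, padding_side="right")
tokenizer.pad_token_id = tokenizer.eos_token_id

# DoRA-enabled LoRA restricted to the attention query/key projections.
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=["q_proj", "k_proj"],
    use_dora=True,
)

# Training arguments using the values this commit switches to.
training_args = TrainingArguments(
    output_dir="./out",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=6,
    optim="lion_32bit",      # 32-bit Lion, provided via bitsandbytes
    learning_rate=6e-05,
    max_grad_norm=0.3,
    warmup_ratio=0.06,
    lr_scheduler_type="cosine",
    bf16=True,
    save_strategy="steps",
    save_steps=25,
    logging_steps=1,
    report_to="none",        # the script itself reports to wandb
)

# Placeholder text dataset; the actual script trains on a ShareGPT-style SFT set.
dataset = load_dataset("timdettmers/openassistant-guanaco", split="train")

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",   # placeholder field for the placeholder dataset
    peft_config=peft_config,
    tokenizer=tokenizer,
    max_seq_length=1024,
)
trainer.train()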