Crystalcareai committed on
Commit
6524016
·
verified ·
1 Parent(s): 8b9bd5a

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +10 -9
train.py CHANGED
@@ -15,7 +15,7 @@ random.seed(random_seed)
15
 
16
  dataset = load_dataset("Crystalcareai/Self-Discover-MM-Instruct-openai", split="train_sft")
17
 
18
- n_ahead_talk_global = 3
19
  n_passes_global = 2
20
  n_ahead_global = 2
21
  n_examples = 0
@@ -64,7 +64,8 @@ def model_init(params):
64
  )
65
  print("Loaded model")
66
 
67
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_id,truncation=True,padding="left")
 
68
  tokenizer.pad_token_id = tokenizer.eos_token_id
69
 
70
  special_tokens_to_add = []
@@ -96,15 +97,15 @@ def model_init(params):
96
  model.train()
97
  return model
98
 
99
- max_seq_length = 1024
100
  run_id = int(time.time())
101
  training_args = TrainingArguments(
102
  output_dir="./out",
103
- num_train_epochs=1.5,
104
  per_device_train_batch_size=1,
105
  gradient_checkpointing=False,
106
- gradient_accumulation_steps=8,
107
- optim="lion_32bit",
108
  logging_steps=1,
109
  save_strategy="steps",
110
  save_steps=300,
@@ -114,8 +115,8 @@ training_args = TrainingArguments(
114
  # beta1=0.9,
115
  # beta2=0.95,
116
  # auto_find_batch_size=True
117
- learning_rate=3e-07,
118
- max_grad_norm=0.3, # Gradient clipping with a maximum gradient norm of 0.3
119
  warmup_steps=10,
120
  lr_scheduler_type="cosine",
121
  push_to_hub=False,
@@ -147,4 +148,4 @@ trainer = SFTTrainer(
147
  max_seq_length=max_seq_length,
148
  )
149
 
150
- trainer.train()
 
15
 
16
  dataset = load_dataset("Crystalcareai/Self-Discover-MM-Instruct-openai", split="train_sft")
17
 
18
+ n_ahead_talk_global = 4
19
  n_passes_global = 2
20
  n_ahead_global = 2
21
  n_examples = 0
 
64
  )
65
  print("Loaded model")
66
 
67
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
68
+ tokenizer.padding_side = 'left' # Adjust padding side to 'left' to avoid batch generation issues with Flash Attention
69
  tokenizer.pad_token_id = tokenizer.eos_token_id
70
 
71
  special_tokens_to_add = []
 
97
  model.train()
98
  return model
99
 
100
+ max_seq_length = 2048
101
  run_id = int(time.time())
102
  training_args = TrainingArguments(
103
  output_dir="./out",
104
+ num_train_epochs=3,
105
  per_device_train_batch_size=1,
106
  gradient_checkpointing=False,
107
+ gradient_accumulation_steps=16,
108
+ optim="adamw_torch_fused",
109
  logging_steps=1,
110
  save_strategy="steps",
111
  save_steps=300,
 
115
  # beta1=0.9,
116
  # beta2=0.95,
117
  # auto_find_batch_size=True
118
+ learning_rate=2e-07,
119
+ max_grad_norm=1.0, # Gradient clipping with a maximum gradient norm of 1.0
120
  warmup_steps=10,
121
  lr_scheduler_type="cosine",
122
  push_to_hub=False,
 
148
  max_seq_length=max_seq_length,
149
  )
150
 
151
+ trainer.train()