MasaakiKotera committed
Commit 1ea7410 · Parent: 508087f

git add configs/

configs/example_finetuning.py ADDED
@@ -0,0 +1,49 @@
+ # -----------------------------------------------------------------------------
+ # I/O
+
+ # training data directory; train.bin and val.bin are expected. Prepare them with tokenize.py
+ data_dir = 'directory_containing_train.bin/val.bin'
+ out_dir = 'output_directory' # output directory
+ log_dir = os.path.join(out_dir, 'logs') # logs will be written to out_dir/logs
+
+ # -----------------------------------------------------------------------------
+ # model parameters
+ meta_vocab_size = 1024
+ block_size = 256
+ n_layer = 24
+ n_head = 16
+ n_embd = 1024 # ~350M parameters (medium)
+ bias = False # do we use bias inside LayerNorm and Linear layers?
+
+ # -----------------------------------------------------------------------------
+ # learning parameters
+ max_iters = 1000000 # total number of training iterations
+ eval_interval = 5000
+ log_interval = 1
+ eval_iters = 100
+ eval_only = False # if True, script exits right after the first eval
+ always_save_checkpoint = True # if True, always save a checkpoint after each eval
+ init_from = 'resume' # 'scratch' or 'resume' or 'gpt2*'
+ ckpt_path = 'model.pt'
+ gradient_accumulation_steps = 16 # used to simulate larger batch sizes; should be a multiple of the number of GPUs
+ batch_size = 16
+
+ # adamw optimizer
+ learning_rate = 1e-4 # max learning rate
+ dropout = 0.1
+ weight_decay = 0.1
+ beta1 = 0.9
+ beta2 = 0.95
+ grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
+
+ # learning rate decay settings
+ decay_lr = False
+ warmup_iters = 2000
+ lr_decay_iters = 1000000
+ min_lr = 1e-4 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
+ # DDP settings
+ backend = 'nccl' # 'nccl', 'gloo', etc.
+ # system
+ device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1'
+ dtype = 'float32' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16'; float16 will automatically use a GradScaler
+ compile = True # use PyTorch 2.0 to compile the model to be faster
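
Note that the config references os and torch without importing them; in nanoGPT-style projects the config file is exec()'d inside the training script's namespace, where those modules are already imported. Below is a minimal sketch of that pattern, not part of this commit; the invocation path and the default values shown are assumptions for illustration.

import os
import torch  # needed because the config's dtype line calls torch.cuda

# defaults that the config file may override
out_dir = 'out'
batch_size = 16
gradient_accumulation_steps = 16
block_size = 256
dtype = 'float32'

config_file = 'configs/example_finetuning.py'  # hypothetical invocation
exec(open(config_file).read())  # pulls the config's variables into this namespace

# effective tokens per optimizer step with the values in this config:
# batch_size * gradient_accumulation_steps * block_size = 16 * 16 * 256 = 65,536
tokens_per_iter = batch_size * gradient_accumulation_steps * block_size
print(f"tokens per iteration: {tokens_per_iter:,}")

os.makedirs(os.path.join(out_dir, 'logs'), exist_ok=True)  # matches log_dir above
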
configs/example_pretraining.py ADDED
@@ -0,0 +1,48 @@
+ # -----------------------------------------------------------------------------
+ # I/O
+
+ # training data directory; train.bin and val.bin are expected. Prepare them with tokenize.py
+ data_dir = 'directory_containing_train.bin/val.bin'
+ out_dir = 'output_directory' # output directory
+ log_dir = os.path.join(out_dir, 'logs') # logs will be written to out_dir/logs
+
+ # -----------------------------------------------------------------------------
+ # model parameters
+ meta_vocab_size = 1024
+ block_size = 256
+ n_layer = 24
+ n_head = 16
+ n_embd = 1024 # ~350M parameters (medium)
+ bias = False # do we use bias inside LayerNorm and Linear layers?
+
+ # -----------------------------------------------------------------------------
+ # learning parameters
+ max_iters = 1000000 # total number of training iterations
+ eval_interval = 100000
+ log_interval = 1
+ eval_iters = 100
+ eval_only = False # if True, script exits right after the first eval
+ always_save_checkpoint = True # if True, always save a checkpoint after each eval
+ init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*'
+ gradient_accumulation_steps = 16 # used to simulate larger batch sizes; should be a multiple of the number of GPUs
+ batch_size = 16
+
+ # adamw optimizer
+ learning_rate = 1e-3 # max learning rate
+ dropout = 0.0
+ weight_decay = 0
+ beta1 = 0.9
+ beta2 = 0.999
+ grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
+
+ # learning rate decay settings
+ decay_lr = True # whether to decay the learning rate
+ warmup_iters = 5000 # how many steps to warm up for
+ lr_decay_iters = 1000000 # should be ~= max_iters per Chinchilla
+ min_lr = 1e-4 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
+ # DDP settings
+ backend = 'nccl' # 'nccl', 'gloo', etc.
+ # system
+ device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', or try 'mps' on MacBooks
+ dtype = 'float32' # 'float32', 'bfloat16', or 'float16'; float16 will automatically use a GradScaler
+ compile = True # use PyTorch 2.0 to compile the model to be faster
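
The decay settings above (decay_lr, warmup_iters, lr_decay_iters, min_lr) describe a linear-warmup-then-cosine-decay schedule of the kind used in nanoGPT-style trainers. The sketch below illustrates that schedule with the pretraining values; whether this repository's training loop implements it exactly this way is an assumption.

import math

learning_rate = 1e-3   # max learning rate from the pretraining config
min_lr = 1e-4          # floor the schedule decays to
warmup_iters = 5000
lr_decay_iters = 1000000

def get_lr(it):
    # 1) linear warmup for the first warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) after lr_decay_iters, hold at min_lr
    if it > lr_decay_iters:
        return min_lr
    # 3) cosine decay from learning_rate down to min_lr in between
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # goes 1 -> 0
    return min_lr + coeff * (learning_rate - min_lr)

# with decay_lr = False (as in the finetuning config) the trainer would
# skip this schedule and use the constant learning_rate instead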