[NeMo W 2024-03-18 05:25:14 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default.
    See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
      ret = run_job(
    
[NeMo I 2024-03-18 05:25:14 train_gpt_sft:118] 
    
    ************** Experiment configuration ***********
[NeMo I 2024-03-18 05:25:14 train_gpt_sft:119] 
    name: gemma-7b-sql-nemo
    trainer:
      num_nodes: 1
      devices: 8
      accelerator: gpu
      precision: bf16
      sft:
        max_epochs: 1
        max_steps: -1
        val_check_interval: 1000
        save_interval: ${.val_check_interval}
        limit_val_batches: 40
        gradient_clip_val: 1.0
      logger: false
      enable_checkpointing: false
      use_distributed_sampler: false
      max_time: null
      max_epochs: ${.sft.max_epochs}
      max_steps: ${.sft.max_steps}
    exp_manager:
      explicit_log_dir: models/gemma-7b-sql-nemo
      exp_dir: null
      name: ${name}
      create_wandb_logger: false
      wandb_logger_kwargs:
        project: null
        name: null
      resume_if_exists: true
      resume_ignore_no_checkpoint: true
      create_checkpoint_callback: true
      checkpoint_callback_params:
        monitor: validation_loss
        save_top_k: 5
        mode: min
        save_nemo_on_train_end: true
        filename: megatron_gpt_sft--{${.monitor}:.3f}-{step}-{consumed_samples}-{epoch}
        model_parallel_size: ${model.tensor_model_parallel_size}
        save_best_model: false
    model:
      seed: 1234
      tensor_model_parallel_size: 4
      pipeline_model_parallel_size: 1
      restore_from_path: /workspace/models/pytorch-7b-pt.nemo
      resume_from_checkpoint: null
      save_nemo_on_validation_end: true
      sync_batch_comm: false
      megatron_amp_O2: true
      encoder_seq_length: 4096
      sequence_parallel: false
      activations_checkpoint_granularity: null
      activations_checkpoint_method: null
      activations_checkpoint_num_layers: null
      activations_checkpoint_layers_per_pipeline: null
      answer_only_loss: true
      gradient_as_bucket_view: false
      seq_len_interpolation_factor: null
      use_flash_attention: null
      hidden_dropout: 0.0
      attention_dropout: 0.0
      ffn_dropout: 0.0
      peft:
        peft_scheme: none
        restore_from_path: null
        lora_tuning:
          target_modules:
          - attention_qkv
          adapter_dim: 32
          adapter_dropout: 0.0
          column_init_method: xavier
          row_init_method: zero
          layer_selection: null
          weight_tying: false
          position_embedding_strategy: null
      data:
        chat: false
        chat_prompt_tokens:
          system_turn_start: "\0"
          turn_start: "\x11"
          label_start: "\x12"
          end_of_turn: '
    
            '
          end_of_name: '
    
            '
        sample: false
        num_workers: 0
        dataloader_type: single
        train_ds:
          file_path: nsql.jsonl
          global_batch_size: 128
          micro_batch_size: 1
          shuffle: true
          memmap_workers: null
          max_seq_length: 8192
          min_seq_length: 1
          drop_last: true
          label_key: output
          add_eos: true
          add_sep: false
          add_bos: false
          truncation_field: input
          index_mapping_dir: null
          prompt_template: '{input} {output}'
          hf_dataset: false
          truncation_method: right
        validation_ds:
          file_path: nsql.jsonl
          global_batch_size: 128
          micro_batch_size: 1
          shuffle: false
          memmap_workers: ${model.data.train_ds.memmap_workers}
          max_seq_length: ${model.data.train_ds.max_seq_length}
          min_seq_length: 1
          drop_last: true
          label_key: ${model.data.train_ds.label_key}
          add_eos: ${model.data.train_ds.add_eos}
          add_sep: ${model.data.train_ds.add_sep}
          add_bos: ${model.data.train_ds.add_bos}
          truncation_field: ${model.data.train_ds.truncation_field}
          index_mapping_dir: null
          prompt_template: ${model.data.train_ds.prompt_template}
          hf_dataset: false
          truncation_method: right
          output_original_text: true
      optim:
        name: distributed_fused_adam
        lr: 5.0e-06
        weight_decay: 0.01
        betas:
        - 0.9
        - 0.98
        sched:
          name: CosineAnnealing
          warmup_steps: 10
          constant_steps: 1000
          min_lr: 9.0e-07
      bias_activation_fusion: true
    
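The configuration above fixes both the parallel layout and the batch sizes. As a rough illustration (not part of the original log), the data-parallel size and gradient-accumulation factor they imply can be worked out as follows:

    # Parallelism / batch arithmetic implied by the config above (illustrative sketch).
    devices, num_nodes = 8, 1
    tp, pp = 4, 1                       # tensor_model_parallel_size, pipeline_model_parallel_size
    global_batch, micro_batch = 128, 1

    world_size = devices * num_nodes                    # 8 ranks
    dp = world_size // (tp * pp)                        # 2 data-parallel replicas
    grad_accum = global_batch // (micro_batch * dp)     # 64 micro-batches per optimizer step
    print(world_size, dp, grad_accum)                   # 8 2 64
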
[NeMo W 2024-03-18 05:25:14 exp_manager:630] There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir :models/gemma-7b-sql-nemo/checkpoints. Training from scratch.
[NeMo I 2024-03-18 05:25:14 exp_manager:396] Experiments will be logged at models/gemma-7b-sql-nemo
[NeMo I 2024-03-18 05:25:14 exp_manager:856] TensorboardLogger has been set up
[NeMo W 2024-03-18 05:25:55 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:55 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:55 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: tp_comm_overlap in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:55 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: tp_comm_split_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:55 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: tp_comm_split_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:55 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: tp_comm_bulk_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:55 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: tp_comm_bulk_dgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:55 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:55 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: barrier_with_L1_time in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo I 2024-03-18 05:25:56 megatron_init:241] Rank 1 has data parallel group : [1, 5]
[NeMo I 2024-03-18 05:25:56 megatron_init:247] Rank 1 has combined group of data parallel and context parallel : [1, 5]
[NeMo I 2024-03-18 05:25:56 megatron_init:252] All data parallel group ranks with context parallel combined: [[0, 4], [1, 5], [2, 6], [3, 7]]
[NeMo I 2024-03-18 05:25:56 megatron_init:255] Ranks 1 has data parallel rank: 0
[NeMo I 2024-03-18 05:25:56 megatron_init:272] Rank 1 has context parallel group: [1]
[NeMo I 2024-03-18 05:25:56 megatron_init:275] All context parallel group ranks: [[0], [1], [2], [3], [4], [5], [6], [7]]
[NeMo I 2024-03-18 05:25:56 megatron_init:276] Ranks 1 has context parallel rank: 0
[NeMo I 2024-03-18 05:25:56 megatron_init:287] Rank 1 has model parallel group: [0, 1, 2, 3]
[NeMo I 2024-03-18 05:25:56 megatron_init:288] All model parallel group ranks: [[0, 1, 2, 3], [4, 5, 6, 7]]
[NeMo I 2024-03-18 05:25:56 megatron_init:298] Rank 1 has tensor model parallel group: [0, 1, 2, 3]
[NeMo I 2024-03-18 05:25:56 megatron_init:302] All tensor model parallel group ranks: [[0, 1, 2, 3], [4, 5, 6, 7]]
[NeMo I 2024-03-18 05:25:56 megatron_init:303] Rank 1 has tensor model parallel rank: 1
[NeMo I 2024-03-18 05:25:56 megatron_init:317] Rank 1 has pipeline model parallel group: [1]
[NeMo I 2024-03-18 05:25:56 megatron_init:329] Rank 1 has embedding group: [1]
[NeMo I 2024-03-18 05:25:56 megatron_init:335] All pipeline model parallel group ranks: [[0], [1], [2], [3], [4], [5], [6], [7]]
[NeMo I 2024-03-18 05:25:56 megatron_init:336] Rank 1 has pipeline model parallel rank 0
[NeMo I 2024-03-18 05:25:56 megatron_init:337] All embedding group ranks: [[0], [1], [2], [3], [4], [5], [6], [7]]
[NeMo I 2024-03-18 05:25:56 megatron_init:338] Rank 1 has embedding rank: 0
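The group assignments printed above for rank 1 follow from the 8-GPU layout with tensor_model_parallel_size=4 and pipeline_model_parallel_size=1: consecutive blocks of 4 ranks form the tensor-parallel groups, and ranks occupying the same position in different blocks pair up as data-parallel groups. The sketch below (illustrative only, not the Megatron-Core implementation) reproduces the printed groups for this particular layout:

    # Reconstruct the printed groups for world_size=8, TP=4, PP=1 (illustrative only).
    world_size, tp = 8, 4
    tensor_parallel_groups = [list(range(i, i + tp)) for i in range(0, world_size, tp)]
    data_parallel_groups = [[i, i + tp] for i in range(tp)]
    print(tensor_parallel_groups)   # [[0, 1, 2, 3], [4, 5, 6, 7]]
    print(data_parallel_groups)     # [[0, 4], [1, 5], [2, 6], [3, 7]]
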
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: tp_comm_overlap in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: tp_comm_split_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: tp_comm_split_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: tp_comm_bulk_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: tp_comm_bulk_dgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: barrier_with_L1_time in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo I 2024-03-18 05:25:56 tokenizer_utils:191] Getting SentencePiece with model: /tmp/tmpqymm0qxt/c1f49ba929c24b7e95b7219ca958f881_tokenizer-final.model
[NeMo I 2024-03-18 05:25:56 megatron_base_model:520] Padded vocab_size: 256000, original vocab_size: 256000, dummy tokens: 0.
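No dummy tokens are needed here because 256,000 is already a multiple of the padding granularity. The check below is an illustrative reconstruction; it assumes the default make_vocab_size_divisible_by=128, which is not shown in this config:

    # Why zero dummy tokens were added (illustrative; assumes the default
    # make_vocab_size_divisible_by=128, which does not appear in the config above).
    vocab, divisible_by, tp = 256000, 128, 4
    multiple = divisible_by * tp                          # 512
    padded = ((vocab + multiple - 1) // multiple) * multiple
    print(padded, padded - vocab)                         # 256000 0
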
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: tp_comm_overlap in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: tp_comm_split_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: tp_comm_split_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: tp_comm_bulk_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: tp_comm_bulk_dgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:1078] The model: GPTSFTModel() does not have field.name: barrier_with_L1_time in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:492] The model: GPTSFTModel() does not have field.name: num_moe_experts in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:492] The model: GPTSFTModel() does not have field.name: bias_gelu_fusion in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:492] The model: GPTSFTModel() does not have field.name: fp8_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 megatron_base_model:492] The model: GPTSFTModel() does not have field.name: clone_scatter_output_in_embedding in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-03-18 05:25:56 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/transformer_engine/pytorch/module/base.py:611: UserWarning: To guarantee overlapping TP and SP collectives with the backward GEMMs, set environment variable CUDA_DEVICE_MAX_CONNECTIONS = 1
      warnings.warn(
    
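The Transformer Engine warning above names the CUDA_DEVICE_MAX_CONNECTIONS environment variable. One way to address it (an assumption about the launch setup, not something this run did) is to export the variable before any CUDA work starts in each training process:

    # Hypothetical pre-launch setting for the Transformer Engine warning above; it must
    # take effect before CUDA is initialized in the training processes.
    import os
    os.environ.setdefault("CUDA_DEVICE_MAX_CONNECTIONS", "1")
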
[NeMo I 2024-03-18 05:27:30 nlp_overrides:1100] Model GPTSFTModel was successfully restored from /workspace/models/pytorch-7b-pt.nemo.
[NeMo I 2024-03-18 05:27:30 train_script_utils:169] Running full finetuning since no peft scheme is given.
      | Name  | Type          | Params
    ----------------------------------------
    0 | model | Float16Module | 2.1 B 
    ----------------------------------------
    2.1 B     Trainable params
    0         Non-trainable params
    2.1 B     Total params
    8,538.206 Total estimated model params size (MB)
[NeMo I 2024-03-18 05:27:30 text_memmap_dataset:116] Building data files
[NeMo I 2024-03-18 05:27:31 text_memmap_dataset:158] Loading data files
[NeMo I 2024-03-18 05:27:31 text_memmap_dataset:249] Loading nsql.jsonl
[NeMo I 2024-03-18 05:27:31 text_memmap_dataset:161] Time loading 1 mem-mapped files: 0:00:00.000896
[NeMo I 2024-03-18 05:27:31 text_memmap_dataset:165] Computing global indices
[NeMo I 2024-03-18 05:27:31 text_memmap_dataset:116] Building data files
[NeMo I 2024-03-18 05:27:34 text_memmap_dataset:158] Loading data files
[NeMo I 2024-03-18 05:27:34 text_memmap_dataset:249] Loading nsql.jsonl
[NeMo I 2024-03-18 05:27:34 text_memmap_dataset:161] Time loading 1 mem-mapped files: 0:00:00.000631
[NeMo I 2024-03-18 05:27:34 text_memmap_dataset:165] Computing global indices
[NeMo I 2024-03-18 05:27:34 builders:327] Building dataloader with consumed samples: 0
[NeMo W 2024-03-18 05:27:34 experimental:26] `<class 'nemo.collections.nlp.data.language_modeling.megatron.megatron_batch_samplers.MegatronPretrainingRandomBatchSampler'>` is experimental and not ready for production yet. Use at your own risk.
[NeMo I 2024-03-18 05:27:34 builders:327] Building dataloader with consumed samples: 0
[NeMo W 2024-03-18 05:27:34 experimental:26] `<class 'nemo.collections.nlp.data.language_modeling.megatron.megatron_batch_samplers.MegatronPretrainingRandomBatchSampler'>` is experimental and not ready for production yet. Use at your own risk.
[NeMo I 2024-03-18 05:27:40 megatron_gpt_model:1296] Pipeline model parallel rank: 0, Tensor model parallel rank: 1, Number of model parameters on device: 2.13e+09. Total number of model parameters: 8.54e+09.
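The per-device and total parameter counts above are consistent with the tensor-parallel layout: each of the 4 tensor-parallel ranks holds roughly a quarter of the weights, and the earlier 8,538 MB estimate appears to correspond to the on-device shard at 4 bytes per parameter. A rough consistency check (illustrative arithmetic, not from the log):

    # Rough check of the reported parameter counts (illustrative only).
    params_per_rank = 2.13e9                 # "Number of model parameters on device"
    tp = 4                                   # tensor_model_parallel_size
    total_params = params_per_rank * tp      # ~8.5e9, matching the reported 8.54e+09
    approx_mb = params_per_rank * 4 / 1e6    # 4 bytes/param -> ~8,520 MB, close to 8,538.206
    print(f"{total_params:.2e} total params, ~{approx_mb:,.0f} MB per shard")
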
[NeMo I 2024-03-18 05:27:40 modelPT:723] Optimizer config = MegatronDistributedFusedAdam (
    Parameter Group 0
        betas: [0.9, 0.98]
        bias_correction: True
        eps: 1e-08
        lr: 5e-06
        weight_decay: 0.01
    
    Parameter Group 1
        betas: [0.9, 0.98]
        bias_correction: True
        eps: 1e-08
        lr: 5e-06
        weight_decay: 0.0
    )
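The two parameter groups above differ only in weight decay (0.01 vs 0.0), which matches the common convention of exempting biases and norm-type parameters from decay. The exact split NeMo applies in this run is not shown in the log; the sketch below only illustrates that convention:

    import torch.nn as nn

    # Illustrative grouping that would yield two optimizer parameter groups like the
    # ones above: multi-dimensional weights get weight_decay=0.01, while 1-D parameters
    # (biases, norms) get 0.0. The exact split used in this run is not shown in the log.
    def split_param_groups(model: nn.Module, weight_decay: float = 0.01):
        decay = [p for p in model.parameters() if p.ndim > 1]
        no_decay = [p for p in model.parameters() if p.ndim <= 1]
        return [
            {"params": decay, "weight_decay": weight_decay},
            {"params": no_decay, "weight_decay": 0.0},
        ]

    groups = split_param_groups(nn.Linear(8, 8))
    print([len(g["params"]) for g in groups])   # [1, 1]: one weight matrix, one bias
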
[NeMo I 2024-03-18 05:27:40 lr_scheduler:915] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7f723cf11270>" 
    will be used during training (effective maximum steps = 613) - 
    Parameters : 
    (warmup_steps: 10
    constant_steps: 1000
    min_lr: 9.0e-07
    max_steps: 613
    )
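The effective maximum of 613 steps presumably corresponds to one epoch over the training file at a global batch size of 128 (613 × 128 ≈ 78k examples). The schedule itself is a short linear warmup followed by a cosine decay toward min_lr. The sketch below approximates that shape from the printed parameters; it is not the exact nemo.core.optim.lr_scheduler.CosineAnnealing implementation and it ignores constant_steps:

    import math

    # Approximate warmup + cosine-decay curve from the printed scheduler parameters
    # (illustrative; not NeMo's CosineAnnealing, and constant_steps is ignored).
    max_lr, min_lr = 5.0e-06, 9.0e-07
    warmup_steps, max_steps = 10, 613

    def lr_at(step: int) -> float:
        if step < warmup_steps:                        # linear warmup to max_lr
            return max_lr * (step + 1) / warmup_steps
        progress = min(1.0, (step - warmup_steps) / max(1, max_steps - warmup_steps))
        return min_lr + 0.5 * (max_lr - min_lr) * (1.0 + math.cos(math.pi * progress))

    print(lr_at(0), lr_at(306), lr_at(612))            # warmup start, mid-decay, near min_lr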