File size: 3,463 Bytes
a540f7d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Parallelism layout. Every *_parallel_size below is 1, the virtual pipeline
# size is null, and sequence_parallel / moe_extended_tp are false.
# NOTE(review): presumably this describes a single-device (unsharded) layout;
# confirm against the consumer of this config.
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
virtual_pipeline_model_parallel_size: null
sequence_parallel: false
context_parallel_size: 1
expert_model_parallel_size: 1
moe_extended_tp: false
# Initialization and numeric precision. fp16 and bf16 are both false, and
# params_dtype / autocast_dtype are both float32 (autocast itself is disabled).
perform_initialization: true
use_cpu_initialization: false
fp16: false
bf16: false
params_dtype: float32
# Hook-style entries (timers and the *_func keys) are all null in this file.
# NOTE(review): presumably these are callables injected at runtime rather than
# values meant to be set in YAML; confirm.
timers: null
finalize_model_grads_func: null
grad_scale_func: null
no_sync_func: null
grad_sync_func: null
param_sync_func: null
deterministic_mode: false
enable_autocast: false
autocast_dtype: float32
num_microbatches_with_partial_activation_checkpoints: null
gradient_accumulation_fusion: false
async_tensor_model_parallel_allreduce: false
use_te_rng_tracker: false
# tp_comm_* group. The top-level switch tp_comm_overlap is false while several
# of the finer-grained tp_comm_* flags below are true.
# NOTE(review): presumably the sub-flags only take effect when tp_comm_overlap
# is enabled; confirm before relying on any of them.
tp_comm_overlap: false
tp_comm_bulk_wgrad: true
tp_comm_bulk_dgrad: true
tp_comm_overlap_ag: true
tp_comm_overlap_rs: true
tp_comm_overlap_rs_dgrad: false
tp_comm_split_ag: true
tp_comm_atomic_ag: false
tp_comm_split_rs: true
tp_comm_atomic_rs: false
# Pipeline and point-to-point (p2p) communication options. pipeline_dtype and
# pipeline_model_parallel_split_rank are null; batch_p2p_comm and
# batch_p2p_sync are the only flags set to true in this group.
pipeline_dtype: null
variable_seq_lengths: false
overlap_p2p_comm: false
batch_p2p_comm: true
batch_p2p_sync: true
use_ring_exchange_p2p: false
deallocate_pipeline_outputs: false
defer_embedding_wgrad_compute: false
pipeline_model_parallel_split_rank: null
# cpu_offloading group. The master flag cpu_offloading is false and
# cpu_offloading_num_layers is 0, while the activations/weights flags are true.
# NOTE(review): presumably the activations/weights flags are inert while
# cpu_offloading is false; confirm.
cpu_offloading: false
cpu_offloading_num_layers: 0
# Leading underscore in the key name is part of the key; value is null here.
_cpu_offloading_context: null
cpu_offloading_activations: true
cpu_offloading_weights: true
# Key is spelled with a capital "L1"; keep the exact casing when editing.
barrier_with_L1_time: true
# Output head, vocabulary padding, position embedding and sequence length.
fp16_lm_cross_entropy: false
parallel_output: true
share_embeddings_and_output_weights: false
make_vocab_size_divisible_by: 128
# position_embedding_type is learned_absolute, yet rotary_* keys are present.
# NOTE(review): presumably rotary_base / rotary_percent are unused with this
# embedding type; confirm.
position_embedding_type: learned_absolute
rotary_base: 10000
rotary_percent: 1.0
seq_len_interpolation_factor: null
seq_length: 2048
# The only nested mapping in this file: optimizer name with a null schedule.
optim:
  name: fused_adam
  sched: null
optimizer_fn: null
tokenizer_filepath: null
# Transformer dimensions. Note the relationships between the values below:
#   - num_query_groups equals num_attention_heads (both 4)
#   - kv_channels (64) equals hidden_size / num_attention_heads (256 / 4)
#   - ffn_hidden_size equals hidden_size (both 256)
num_layers: 4
hidden_size: 256
num_attention_heads: 4
num_query_groups: 4
ffn_hidden_size: 256
kv_channels: 64
hidden_dropout: 0.1
attention_dropout: 0.1
fp32_residual_connection: false
apply_residual_connection_post_layernorm: false
# Written with a decimal point and a signed exponent so that YAML 1.1 loaders
# also resolve it as a float rather than a string.
layernorm_epsilon: 1.0e-05
layernorm_zero_centered_gamma: false
add_bias_linear: true
add_qkv_bias: false
gated_linear_unit: false
# NOTE(review): a bare name, not a callable; presumably resolved to a function
# by the loader. Confirm how this value is mapped.
activation_func: gelu
activation_func_fp8_input_store: false
# null here; the moe_* keys elsewhere in this file are still populated.
num_moe_experts: null
rotary_interleaved: false
window_size: null
normalization: LayerNorm
qk_layernorm: false
test_mode: false
calculate_per_token_loss: false
# Weight-initialization entries.
# NOTE(review): both *_init_method values are the literal string "init_",
# which looks like a truncated serialized function name. Confirm the loader
# can resolve it, or that these are recomputed from init_method_std on load.
init_method: init_
output_layer_init_method: init_
init_method_std: 0.02
apply_query_key_layer_scaling: false
attention_softmax_in_fp32: true
# Fusion / fused-kernel style flags: every one of them is false in this file.
bias_activation_fusion: false
masked_softmax_fusion: false
persist_layer_norm: false
memory_efficient_layer_norm: false
bias_dropout_fusion: false
apply_rope_fusion: false
# Activation recompute settings: all four keys are null.
# NOTE(review): presumably null means recomputation is disabled; confirm.
recompute_granularity: null
recompute_method: null
recompute_num_layers: null
distribute_saved_activations: null
# fp8 group. The top-level fp8 key is null while the other fp8_* keys carry
# concrete values.
# NOTE(review): presumably the remaining fp8_* keys are ignored while fp8 is
# null; confirm.
fp8: null
fp8_margin: 0
fp8_interval: 1
fp8_amax_history_len: 1
fp8_amax_compute_algo: most_recent
fp8_wgrad: true
fp8_dot_product_attention: false
fp8_multi_head_attention: false
# moe_* group (router, token dispatch, capacity and dropping options).
# NOTE(review): num_moe_experts is null in this file, so these values are
# presumably unused; confirm before tuning any of them.
moe_router_load_balancing_type: aux_loss
moe_router_topk: 2
moe_grouped_gemm: false
# Load-balancing type is aux_loss but its coefficient is 0.0.
moe_aux_loss_coeff: 0.0
moe_z_loss_coeff: null
moe_input_jitter_eps: null
moe_token_dropping: false
moe_token_dispatcher_type: allgather
moe_per_layer_logging: false
moe_expert_capacity_factor: null
moe_pad_expert_input_to_capacity: false
moe_token_drop_policy: probs
moe_layer_recompute: false
# Remaining standalone flags.
clone_scatter_output_in_embedding: true
disable_parameter_transpose_cache: false
enable_cuda_graph: false
# NOTE(review): dotted path that looks like the import path of the class this
# config is meant to instantiate; confirm with the loader.
target: nemo.collections.llm.gpt.model.base_v2.GPTModelV2
# Not a valid YAML number (contains "rc1"), so it loads as a string unquoted.
nemo_version: 2.0.0rc1