|
tensor_model_parallel_size: 1 |
|
pipeline_model_parallel_size: 1 |
|
virtual_pipeline_model_parallel_size: null |
|
sequence_parallel: false |
|
context_parallel_size: 1 |
|
expert_model_parallel_size: 1 |
|
moe_extended_tp: false |
|
perform_initialization: true |
|
use_cpu_initialization: false |
|
fp16: false |
|
bf16: false |
|
params_dtype: float32 |
|
timers: null |
|
finalize_model_grads_func: null |
|
grad_scale_func: null |
|
no_sync_func: null |
|
grad_sync_func: null |
|
param_sync_func: null |
|
deterministic_mode: false |
|
enable_autocast: false |
|
autocast_dtype: float32 |
|
num_microbatches_with_partial_activation_checkpoints: null |
|
gradient_accumulation_fusion: false |
|
async_tensor_model_parallel_allreduce: false |
|
use_te_rng_tracker: false |
|
tp_comm_overlap: false |
|
tp_comm_bulk_wgrad: true |
|
tp_comm_bulk_dgrad: true |
|
tp_comm_overlap_ag: true |
|
tp_comm_overlap_rs: true |
|
tp_comm_overlap_rs_dgrad: false |
|
tp_comm_split_ag: true |
|
tp_comm_atomic_ag: false |
|
tp_comm_split_rs: true |
|
tp_comm_atomic_rs: false |
|
pipeline_dtype: null |
|
variable_seq_lengths: false |
|
overlap_p2p_comm: false |
|
batch_p2p_comm: true |
|
batch_p2p_sync: true |
|
use_ring_exchange_p2p: false |
|
deallocate_pipeline_outputs: false |
|
defer_embedding_wgrad_compute: false |
|
pipeline_model_parallel_split_rank: null |
|
cpu_offloading: false |
|
cpu_offloading_num_layers: 0 |
|
_cpu_offloading_context: null |
|
cpu_offloading_activations: true |
|
cpu_offloading_weights: true |
|
barrier_with_L1_time: true |
|
fp16_lm_cross_entropy: false |
|
parallel_output: true |
|
share_embeddings_and_output_weights: false |
|
make_vocab_size_divisible_by: 128 |
|
position_embedding_type: learned_absolute |
|
rotary_base: 10000 |
|
rotary_percent: 1.0 |
|
seq_len_interpolation_factor: null |
|
seq_length: 2048 |
|
optim: |
|
name: fused_adam |
|
sched: null |
|
optimizer_fn: null |
|
tokenizer_filepath: null |
|
num_layers: 4 |
|
hidden_size: 256 |
|
num_attention_heads: 4 |
|
num_query_groups: 4 |
|
ffn_hidden_size: 256 |
|
kv_channels: 64 |
|
hidden_dropout: 0.1 |
|
attention_dropout: 0.1 |
|
fp32_residual_connection: false |
|
apply_residual_connection_post_layernorm: false |
|
layernorm_epsilon: 1.0e-05 |
|
layernorm_zero_centered_gamma: false |
|
add_bias_linear: true |
|
add_qkv_bias: false |
|
gated_linear_unit: false |
|
activation_func: gelu |
|
activation_func_fp8_input_store: false |
|
num_moe_experts: null |
|
rotary_interleaved: false |
|
window_size: null |
|
normalization: LayerNorm |
|
qk_layernorm: false |
|
test_mode: false |
|
calculate_per_token_loss: false |
|
init_method: init_ |
|
output_layer_init_method: init_ |
|
init_method_std: 0.02 |
|
apply_query_key_layer_scaling: false |
|
attention_softmax_in_fp32: true |
|
bias_activation_fusion: false |
|
masked_softmax_fusion: false |
|
persist_layer_norm: false |
|
memory_efficient_layer_norm: false |
|
bias_dropout_fusion: false |
|
apply_rope_fusion: false |
|
recompute_granularity: null |
|
recompute_method: null |
|
recompute_num_layers: null |
|
distribute_saved_activations: null |
|
fp8: null |
|
fp8_margin: 0 |
|
fp8_interval: 1 |
|
fp8_amax_history_len: 1 |
|
fp8_amax_compute_algo: most_recent |
|
fp8_wgrad: true |
|
fp8_dot_product_attention: false |
|
fp8_multi_head_attention: false |
|
moe_router_load_balancing_type: aux_loss |
|
moe_router_topk: 2 |
|
moe_grouped_gemm: false |
|
moe_aux_loss_coeff: 0.0 |
|
moe_z_loss_coeff: null |
|
moe_input_jitter_eps: null |
|
moe_token_dropping: false |
|
moe_token_dispatcher_type: allgather |
|
moe_per_layer_logging: false |
|
moe_expert_capacity_factor: null |
|
moe_pad_expert_input_to_capacity: false |
|
moe_token_drop_policy: probs |
|
moe_layer_recompute: false |
|
clone_scatter_output_in_embedding: true |
|
disable_parameter_transpose_cache: false |
|
enable_cuda_graph: false |
|
target: nemo.collections.llm.gpt.model.base_v2.GPTModelV2 |
|
nemo_version: 2.0.0rc1 |
|
|