smajumdar / abc5

Model card · Files and versions · Community

abc5 / model_config.yaml

smajumdar's picture

Push model using huggingface_hub.

a540f7d verified 7 months ago

history blame contribute delete

3.46 kB

	tensor_model_parallel_size: 1
	pipeline_model_parallel_size: 1
	virtual_pipeline_model_parallel_size: null
	sequence_parallel: false
	context_parallel_size: 1
	expert_model_parallel_size: 1
	moe_extended_tp: false
	perform_initialization: true
	use_cpu_initialization: false
	fp16: false
	bf16: false
	params_dtype: float32
	timers: null
	finalize_model_grads_func: null
	grad_scale_func: null
	no_sync_func: null
	grad_sync_func: null
	param_sync_func: null
	deterministic_mode: false
	enable_autocast: false
	autocast_dtype: float32
	num_microbatches_with_partial_activation_checkpoints: null
	gradient_accumulation_fusion: false
	async_tensor_model_parallel_allreduce: false
	use_te_rng_tracker: false
	tp_comm_overlap: false
	tp_comm_bulk_wgrad: true
	tp_comm_bulk_dgrad: true
	tp_comm_overlap_ag: true
	tp_comm_overlap_rs: true
	tp_comm_overlap_rs_dgrad: false
	tp_comm_split_ag: true
	tp_comm_atomic_ag: false
	tp_comm_split_rs: true
	tp_comm_atomic_rs: false
	pipeline_dtype: null
	variable_seq_lengths: false
	overlap_p2p_comm: false
	batch_p2p_comm: true
	batch_p2p_sync: true
	use_ring_exchange_p2p: false
	deallocate_pipeline_outputs: false
	defer_embedding_wgrad_compute: false
	pipeline_model_parallel_split_rank: null
	cpu_offloading: false
	cpu_offloading_num_layers: 0
	_cpu_offloading_context: null
	cpu_offloading_activations: true
	cpu_offloading_weights: true
	barrier_with_L1_time: true
	fp16_lm_cross_entropy: false
	parallel_output: true
	share_embeddings_and_output_weights: false
	make_vocab_size_divisible_by: 128
	position_embedding_type: learned_absolute
	rotary_base: 10000
	rotary_percent: 1.0
	seq_len_interpolation_factor: null
	seq_length: 2048
	optim:
	name: fused_adam
	sched: null
	optimizer_fn: null
	tokenizer_filepath: null
	num_layers: 4
	hidden_size: 256
	num_attention_heads: 4
	num_query_groups: 4
	ffn_hidden_size: 256
	kv_channels: 64
	hidden_dropout: 0.1
	attention_dropout: 0.1
	fp32_residual_connection: false
	apply_residual_connection_post_layernorm: false
	layernorm_epsilon: 1.0e-05
	layernorm_zero_centered_gamma: false
	add_bias_linear: true
	add_qkv_bias: false
	gated_linear_unit: false
	activation_func: gelu
	activation_func_fp8_input_store: false
	num_moe_experts: null
	rotary_interleaved: false
	window_size: null
	normalization: LayerNorm
	qk_layernorm: false
	test_mode: false
	calculate_per_token_loss: false
	init_method: init_
	output_layer_init_method: init_
	init_method_std: 0.02
	apply_query_key_layer_scaling: false
	attention_softmax_in_fp32: true
	bias_activation_fusion: false
	masked_softmax_fusion: false
	persist_layer_norm: false
	memory_efficient_layer_norm: false
	bias_dropout_fusion: false
	apply_rope_fusion: false
	recompute_granularity: null
	recompute_method: null
	recompute_num_layers: null
	distribute_saved_activations: null
	fp8: null
	fp8_margin: 0
	fp8_interval: 1
	fp8_amax_history_len: 1
	fp8_amax_compute_algo: most_recent
	fp8_wgrad: true
	fp8_dot_product_attention: false
	fp8_multi_head_attention: false
	moe_router_load_balancing_type: aux_loss
	moe_router_topk: 2
	moe_grouped_gemm: false
	moe_aux_loss_coeff: 0.0
	moe_z_loss_coeff: null
	moe_input_jitter_eps: null
	moe_token_dropping: false
	moe_token_dispatcher_type: allgather
	moe_per_layer_logging: false
	moe_expert_capacity_factor: null
	moe_pad_expert_input_to_capacity: false
	moe_token_drop_policy: probs
	moe_layer_recompute: false
	clone_scatter_output_in_embedding: true
	disable_parameter_transpose_cache: false
	enable_cuda_graph: false
	target: nemo.collections.llm.gpt.model.base_v2.GPTModelV2
	nemo_version: 2.0.0rc1