{ "_name_or_path": "/mnt/petrelfs/huxuyang/LLaMA-MoE-v2/outputs/v2_mixtral/moe-res-droppad-nosys-all/3689429/checkpoint-5400", "add_rescale_bias": false, "architectures": [ "MixtralForCausalLM" ], "attention_bias": false, "attention_dropout": 0.0, "attn_experts": null, "auto_map": { "AutoConfig": "configuration_mixtral.MixtralConfig", "AutoModel": "modeling_mixtral.MixtralModel", "AutoModelForCausalLM": "modeling_mixtral.MixtralForCausalLM" }, "bos_token_id": 128000, "eos_token_id": 128009, "hidden_act": "silu", "hidden_size": 4096, "initializer_range": 0.02, "intermediate_size": 1792, "intermediate_size_residual": 1792, "max_position_embeddings": 8192, "model_type": "mixtral", "moe_type": "modulelist", "num_attention_heads": 32, "num_experts_per_tok": 1, "num_hidden_layers": 32, "num_key_value_heads": 8, "num_local_experts": 7, "num_moe_contract_layers": 0, "output_router_logits": true, "pretraining_tp": 1, "rms_norm_eps": 1e-05, "rope_scaling": null, "rope_theta": 500000.0, "router_aux_loss_coef": 0.01, "scale_factor": 4.0, "scale_factor_attn": null, "sliding_window": 4096, "tie_word_embeddings": false, "top_k_attn": null, "torch_dtype": "bfloat16", "transformers_version": "4.42.4", "use_attn_moe": false, "use_cache": false, "use_layer_wise_balance": false, "vocab_size": 128256 }