Question Answering
Transformers
Safetensors
English
doge
text-generation
custom_code
JingzeShi committed on
Commit a84e78f
verified
1 Parent(s): 25b547e

Upload DogeForCausalLM

Files changed (5)
  1. config.json +44 -37
  2. configuration_doge.py +58 -35
  3. generation_config.json +7 -7
  4. model.safetensors +2 -2
  5. modeling_doge.py +327 -199
config.json CHANGED
@@ -1,37 +1,44 @@
-{
-  "_name_or_path": "./results/Doge-20M/checkpoint-1792",
-  "architectures": [
-    "DogeForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "auto_map": {
-    "AutoConfig": "configuration_doge.DogeConfig",
-    "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
-  },
-  "bos_token_id": 1,
-  "eos_token_id": 2,
-  "expert_retrieval_size": 256,
-  "hidden_act": "silu",
-  "hidden_bias": false,
-  "hidden_dropout": 0.0,
-  "hidden_size": 256,
-  "initializer_range": 0.02,
-  "intermediate_size": 1024,
-  "is_moe": false,
-  "max_position_embeddings": 2048,
-  "model_type": "doge",
-  "num_attention_heads": 2,
-  "num_cdmmoe_experts": 4096,
-  "num_cdmmoe_experts_per_head": 8,
-  "num_cdmmoe_heads": 4,
-  "num_hidden_layers": 4,
-  "pad_token_id": 0,
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
-  "rope_theta": 10000.0,
-  "tie_word_embeddings": false,
-  "torch_dtype": "float32",
-  "transformers_version": "4.46.1",
-  "use_cache": true,
-  "vocab_size": 32768
-}
+{
+  "_name_or_path": "./results/Doge-20M-Instruct-DPO",
+  "architectures": [
+    "DogeForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_doge.DogeConfig",
+    "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
+  },
+  "bos_token_id": 0,
+  "dynamic_mask_ratio": 0.0,
+  "eos_token_id": 1,
+  "expert_retrieval_size": 256,
+  "hidden_act": "silu",
+  "hidden_bias": false,
+  "hidden_dropout": 0.0,
+  "hidden_size": 256,
+  "initializer_range": 0.02,
+  "intermediate_size": 512,
+  "is_moe": false,
+  "max_position_embeddings": 2048,
+  "model_type": "doge",
+  "num_attention_heads": 2,
+  "num_cdmmoe_experts": 2048,
+  "num_cdmmoe_experts_per_head": 8,
+  "num_cdmmoe_heads": 4,
+  "num_channels": 3,
+  "num_hidden_layers": 8,
+  "num_key_value_heads": 1,
+  "pad_token_id": 2,
+  "patch_size": 16,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "factor": 4.0,
+    "original_max_position_embeddings": 2048,
+    "rope_type": "dynamic"
+  },
+  "rope_theta": 10000.0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0.dev0",
+  "use_cache": true,
+  "vocab_size": 32768
+}
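The updated config adds grouped-query attention (`num_key_value_heads`), `dynamic_mask_ratio`, `num_channels`/`patch_size`, a dynamic RoPE scaling dict, and remapped special-token ids. As a minimal sketch (not part of this commit), the custom classes referenced in `auto_map` are loaded through the Auto classes with `trust_remote_code=True`; the repo id below is taken from the docstring example ([JingzeShi/Doge-20M](https://huggingface.co/JingzeShi/Doge-20M)) and is only an assumption for illustration — it may differ from the repository this commit belongs to.

```python
# Minimal sketch: loading a checkpoint whose config maps the Auto classes to the
# custom code shipped in this repo (configuration_doge.py / modeling_doge.py).
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "JingzeShi/Doge-20M"  # assumed repo id for illustration

# trust_remote_code=True is required because auto_map points at custom classes.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
# Fields introduced by this commit:
print(config.num_key_value_heads, config.dynamic_mask_ratio, config.rope_scaling)

model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
```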
configuration_doge.py CHANGED
@@ -25,20 +25,23 @@ from transformers.modeling_rope_utils import rope_config_validation
 class DogeConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge
-    model according to the specified arguments, defining the model architecture like [LoserCheems/doge-tiny-test](https://huggingface.co/LoserCheems/doge-tiny-test)
+    model according to the specified arguments, defining the model architecture like [JingzeShi/Doge-20M](https://huggingface.co/JingzeShi/Doge-20M).
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
 
     Args:
         vocab_size (`int`, *optional*, defaults to 32768):
-            Vocabulary size of the Doge model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`DogeModel`]
+            Vocabulary size of the Doge model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`DogeModel`]
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the input image.
+        patch_size (`int`, *optional*, defaults to 16):
+            Patch size of Vision Transformer Embeddings.
         hidden_size (`int`, *optional*, defaults to 1024):
            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 4096):
+        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimension of the CDMoE representations.
-        num_hidden_layers (`int`, *optional*, defaults to 16):
+        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        hidden_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the hidden layers.
@@ -51,24 +54,21 @@ class DogeConfig(PretrainedConfig):
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
-            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
-            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
-            accordingly.
+            Dictionary containing the scaling configuration for the RoPE embeddings.
+            NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly.
            Expected contents:
                `rope_type` (`str`):
-                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
-                    'llama3'], with 'default' being the original RoPE implementation.
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
-                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
-                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
-                    original maximum pre-trained length.
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
+                    In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
-                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
-                    pretraining.
+                    Used with 'dynamic', 'longrope' and 'llama3'.
+                    The original max position embeddings used during pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
-                    computation. If unspecified, it defaults to value recommended by the implementation, using the
-                    `factor` field to infer the suggested value.
+                    computation.
+                    If unspecified, it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
@@ -76,13 +76,11 @@ class DogeConfig(PretrainedConfig):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
-                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
-                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                    size divided by the number of attention heads divided by 2
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`).
+                    Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
-                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
-                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                    size divided by the number of attention heads divided by 2
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<`original_max_position_embeddings`).
+                    Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
@@ -100,15 +98,24 @@ class DogeConfig(PretrainedConfig):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to `None`):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention.
+            If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
+            When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group.
+            For more details checkout [this paper](https://arxiv.org/pdf/2305.13245.pdf).
+            If it is not specified, will default to `num_attention_heads`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
+        dynamic_mask_ratio (`float`, *optional*, defaults to 0.0, range [0, 1]):
+            The ratio to control the proportion of the dynamic mask filled with the minimum value.
        is_moe (`bool`, *optional*, defaults to `False`):
            Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize
-        num_cdmmoe_experts (`int`, *optional*, defaults to 4096):
+        num_cdmmoe_experts (`int`, *optional*, defaults to 2048):
            Number of Private Experts for the Cross Domain Mixture of Experts.
        num_cdmmoe_heads (`int`, *optional*, defaults to 4):
            Number of heads of Private Experts for the Cross Domain Mixture of Experts.
@@ -124,32 +131,42 @@ class DogeConfig(PretrainedConfig):
    def __init__(
        self,
        vocab_size=32768,
+        num_channels=3,
+        patch_size=16,
        hidden_size=1024,
-        intermediate_size=4096,
-        num_hidden_layers=16,
+        intermediate_size=2048,
+        num_hidden_layers=32,
        hidden_bias=False,
        hidden_dropout=0.0,
        hidden_act="silu",
        max_position_embeddings=2048,
        rope_theta=10000.0,
-        rope_scaling=None,
+        rope_scaling={
+            "rope_type": "dynamic",
+            "factor": 4.0,
+            "original_max_position_embeddings": 2048,
+        },
        initializer_range=0.02,
        rms_norm_eps=1e-06,
        use_cache=True,
-        pad_token_id=0,
-        bos_token_id=1,
-        eos_token_id=2,
-        tie_word_embeddings=False,
+        bos_token_id=0,
+        eos_token_id=1,
+        pad_token_id=2,
+        tie_word_embeddings=True,
        num_attention_heads=8,
+        num_key_value_heads=None,
        attention_dropout=0.0,
+        dynamic_mask_ratio=0.0,
        is_moe=False,
-        num_cdmmoe_experts=4096,
+        num_cdmmoe_experts=2048,
        num_cdmmoe_heads=4,
        num_cdmmoe_experts_per_head=8,
        expert_retrieval_size=256,
        **kwargs,
    ):
        self.vocab_size = vocab_size
+        self.num_channels = num_channels
+        self.patch_size = patch_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
@@ -162,12 +179,14 @@ class DogeConfig(PretrainedConfig):
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
-        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
        self.tie_word_embeddings = tie_word_embeddings
        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
        self.attention_dropout = attention_dropout
+        self.dynamic_mask_ratio = dynamic_mask_ratio
        self.is_moe = is_moe
        self.num_cdmmoe_experts = num_cdmmoe_experts
        self.num_cdmmoe_heads = num_cdmmoe_heads
@@ -180,10 +199,14 @@ class DogeConfig(PretrainedConfig):
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)
 
+        # for backward compatibility
+        if num_key_value_heads is None:
+            self.num_key_value_heads = num_attention_heads
+
        super().__init__(
-            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
+            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
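For reference, a minimal sketch (assuming `configuration_doge.py` from this commit is importable locally) of instantiating `DogeConfig` with the values this checkpoint uses; it also shows the backward-compatibility path where `num_key_value_heads=None` falls back to `num_attention_heads`.

```python
# Minimal sketch, assuming configuration_doge.py from this commit is on the Python path.
from configuration_doge import DogeConfig

config = DogeConfig(
    hidden_size=256,
    intermediate_size=512,
    num_hidden_layers=8,
    num_attention_heads=2,
    num_key_value_heads=1,   # 1 -> Multi Query Attention; equal to num_attention_heads -> MHA
    dynamic_mask_ratio=0.0,
    rope_scaling={"rope_type": "dynamic", "factor": 4.0, "original_max_position_embeddings": 2048},
)
assert config.num_key_value_heads == 1

# Backward-compatibility branch added in this commit:
legacy = DogeConfig(num_attention_heads=8, num_key_value_heads=None)
assert legacy.num_key_value_heads == 8
```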
generation_config.json CHANGED
@@ -1,7 +1,7 @@
-{
-  "_from_model_config": true,
-  "bos_token_id": 1,
-  "eos_token_id": 2,
-  "pad_token_id": 0,
-  "transformers_version": "4.46.1"
-}
+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 2,
+  "transformers_version": "4.49.0.dev0"
+}
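The generation config now carries the remapped special-token ids (`bos=0`, `eos=1`, `pad=2`), so `generate()` picks them up without extra arguments. A minimal sketch, assuming the repository also ships a tokenizer (not part of this diff) and reusing the same assumed repo id as above:

```python
# Minimal sketch; the repo id and the presence of a tokenizer are assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "JingzeShi/Doge-20M"  # assumed for illustration
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer("Hey, how are you doing?", return_tensors="pt")
# eos_token_id=1 / pad_token_id=2 come from generation_config.json, so they need not be passed here.
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```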
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e3e3a6abbaf8f67363291e553d97a4a937204c8f43c97abf14f7e8fa8f64ab54
-size 83917640
+oid sha256:ae49b37c117138c1880aa9d2f1c140436772eb3cc1f9d1c73a2e2a0643100b2b
+size 52482152
modeling_doge.py CHANGED
@@ -39,6 +39,7 @@ from transformers.modeling_utils import PreTrainedModel
39
  from transformers.utils import (
40
  add_start_docstrings,
41
  add_start_docstrings_to_model_forward,
 
42
  logging,
43
  replace_return_docstrings,
44
  )
@@ -49,6 +50,9 @@ try:
49
  except ImportError:
50
  einx_add = None
51
 
 
 
 
52
 
53
  logger = logging.get_logger(__name__)
54
 
@@ -79,7 +83,7 @@ class Residual(nn.Module):
79
  def __init__(self, hidden_size):
80
  super().__init__()
81
  self.weight = nn.Parameter(torch.ones(hidden_size))
82
-
83
  def forward(self, residual_states, hidden_states):
84
  return self.weight * residual_states + hidden_states
85
 
@@ -92,10 +96,10 @@ class RotaryEmbedding(nn.Module):
92
  super().__init__()
93
  self.rope_kwargs = {}
94
 
95
- if config.rope_scaling is None:
96
- self.rope_type = "default"
97
  else:
98
- self.rope_type = config.rope_scaling
99
  self.max_seq_len_cached = config.max_position_embeddings
100
  self.original_max_seq_len = config.max_position_embeddings
101
  self.base = config.rope_theta
@@ -133,6 +137,7 @@ class RotaryEmbedding(nn.Module):
133
  # core RoPE block
134
  inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
135
  position_ids_expanded = position_ids[:, None, :].float()
 
136
  device_type = x.device.type
137
  device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
138
  with torch.autocast(device_type=device_type, enabled=False):
@@ -141,6 +146,7 @@ class RotaryEmbedding(nn.Module):
141
  cos = emb.cos()
142
  sin = emb.sin()
143
 
 
144
  cos = cos * self.attention_scaling
145
  sin = sin * self.attention_scaling
146
 
@@ -168,11 +174,10 @@ def apply_QK_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
168
  Deprecated and unused.
169
  unsqueeze_dim (`int`, *optional*, defaults to 1):
170
  The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
171
- sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
172
- that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
173
- k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
174
- cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
175
- the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
176
  Returns:
177
  `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
178
  """
@@ -183,6 +188,18 @@ def apply_QK_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
183
  return q_embed, k_embed
184
 
185
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  class DogeDynamicMaskAttention(nn.Module):
187
  """Dynamic Mask Attention from 'Wonderful Matrices' paper."""
188
 
@@ -193,46 +210,26 @@ class DogeDynamicMaskAttention(nn.Module):
193
  self.layer_idx = layer_idx
194
  if layer_idx is None:
195
  logger.warning_once(
196
- f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
197
- "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
198
- "when creating this class."
199
  )
200
 
201
  self.hidden_dim = config.hidden_size
202
- self.num_attention_heads = config.num_attention_heads
 
 
 
203
  self.attention_dropout = config.attention_dropout
204
- self.attention_head_dim = self.hidden_dim // self.num_attention_heads
205
 
206
  # Q K V O projections
207
- self.q_proj = nn.Linear(
208
- self.hidden_dim,
209
- self.num_attention_heads * self.attention_head_dim,
210
- bias=config.hidden_bias,
211
- )
212
- self.k_proj = nn.Linear(
213
- self.hidden_dim,
214
- self.num_attention_heads * self.attention_head_dim,
215
- bias=config.hidden_bias,
216
- )
217
  # dynamic mask for the QK^T attention score matrix
218
- self.A = nn.Parameter(
219
- torch.ones(self.num_attention_heads)
220
- )
221
- self.dt_proj = nn.Linear(
222
- self.hidden_dim,
223
- self.num_attention_heads,
224
- bias=config.hidden_bias,
225
- )
226
- self.v_proj = nn.Linear(
227
- self.hidden_dim,
228
- self.num_attention_heads * self.attention_head_dim,
229
- bias=config.hidden_bias,
230
- )
231
- self.o_proj = nn.Linear(
232
- self.hidden_dim,
233
- self.hidden_dim,
234
- bias=config.hidden_bias,
235
- )
236
 
237
  def forward(
238
  self,
@@ -250,15 +247,9 @@ class DogeDynamicMaskAttention(nn.Module):
250
  key_states = self.k_proj(hidden_states)
251
  value_states = self.v_proj(hidden_states)
252
 
253
- query_states = query_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(
254
- 1, 2
255
- )
256
- key_states = key_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(
257
- 1, 2
258
- )
259
- value_states = value_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(
260
- 1, 2
261
- )
262
 
263
  cos, sin = position_embeddings
264
  query_states, key_states = apply_QK_rotary_pos_emb(query_states, key_states, cos, sin)
@@ -268,16 +259,25 @@ class DogeDynamicMaskAttention(nn.Module):
268
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
269
  key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
270
 
 
 
 
 
 
 
 
 
271
  # compute attention scores matrix
272
- attn_weights = torch.matmul(query_states, key_states.transpose(-1, -2)) / math.sqrt(self.attention_head_dim)
273
 
274
  # add mask to attention scores
275
- if attention_mask is not None:
276
- dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
277
- dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
278
- dynamic_mask = dynamic_mask < 1.0
279
- causal_mask = attention_mask[:, :, :, : key_states.shape[-2]].masked_fill(dynamic_mask[:, :, None, :], torch.finfo(hidden_states.dtype).min)
280
- attn_weights = attn_weights + causal_mask
 
281
 
282
  # upcast attention scores to fp32
283
  attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
@@ -292,8 +292,35 @@ class DogeDynamicMaskAttention(nn.Module):
292
 
293
  return attn_output, past_key_value
294
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
 
296
- class DogeSdpaDynamicMaskAttn(DogeDynamicMaskAttention):
297
 
298
  def forward(
299
  self,
@@ -311,9 +338,9 @@ class DogeSdpaDynamicMaskAttn(DogeDynamicMaskAttention):
311
  key_states = self.k_proj(hidden_states)
312
  value_states = self.v_proj(hidden_states)
313
 
314
- query_states = query_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(1, 2)
315
- key_states = key_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(1, 2)
316
- value_states = value_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(1, 2)
317
 
318
  cos, sin = position_embeddings
319
  query_states, key_states = apply_QK_rotary_pos_emb(query_states, key_states, cos, sin)
@@ -322,23 +349,31 @@ class DogeSdpaDynamicMaskAttn(DogeDynamicMaskAttention):
322
  # sin and cos are specific to RoPE models; cache_position needed for the static cache
323
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
324
  key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
 
 
 
325
 
326
- if attention_mask is not None:
327
- dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
328
- dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
329
- dynamic_mask = dynamic_mask < 1.0
330
- causal_mask = attention_mask[:, :, :, : key_states.shape[-2]].masked_fill(dynamic_mask[:, :, None, :], torch.finfo(hidden_states.dtype).min)
 
331
 
332
  query_states = query_states.contiguous()
333
  key_states = key_states.contiguous()
334
  value_states = value_states.contiguous()
335
 
 
 
336
  attn_output = F.scaled_dot_product_attention(
337
  query_states,
338
  key_states,
339
  value_states,
340
- attn_mask=causal_mask,
341
- dropout_p=self.attention_dropout,
 
342
  )
343
 
344
  attn_output = attn_output.transpose(1, 2).contiguous()
@@ -348,9 +383,70 @@ class DogeSdpaDynamicMaskAttn(DogeDynamicMaskAttention):
348
  return attn_output, past_key_value
349
 
350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  DOGE_ATTENTION_CLASSES = {
 
352
  "eager": DogeDynamicMaskAttention,
353
- "sdpa": DogeSdpaDynamicMaskAttn,
354
  }
355
 
356
 
@@ -362,21 +458,9 @@ class DogeMLP(nn.Module):
362
  self.intermediate_dim = config.intermediate_size
363
  self.act_fn = ACT2FN[config.hidden_act]
364
 
365
- self.gate_proj = nn.Linear(
366
- self.hidden_dim,
367
- self.intermediate_dim,
368
- bias=config.hidden_bias,
369
- )
370
- self.up_proj = nn.Linear(
371
- self.hidden_dim,
372
- self.intermediate_dim,
373
- bias=config.hidden_bias,
374
- )
375
- self.down_proj = nn.Linear(
376
- self.intermediate_dim,
377
- self.hidden_dim,
378
- bias=config.hidden_bias,
379
- )
380
 
381
  def forward(
382
  self,
@@ -402,30 +486,12 @@ class DogeCDMoE(DogeMLP):
402
  self.num_keys = int(math.sqrt(self.num_cdmmoe_experts))
403
 
404
  # queries and keys for retrieval experts
405
- self.queries = nn.Linear(
406
- self.hidden_dim,
407
- self.num_cdmmoe_heads * self.expert_retrieval_dim,
408
- bias=False,
409
- )
410
- self.keys = nn.Parameter(
411
- torch.zeros(
412
- self.num_cdmmoe_heads,
413
- self.num_keys,
414
- 2,
415
- self.expert_retrieval_dim // 2,
416
- )
417
- )
418
 
419
  # experts
420
- self.down_embed = nn.Embedding(
421
- self.num_cdmmoe_experts,
422
- self.hidden_dim,
423
- )
424
- self.up_embed = nn.Embedding(
425
- self.num_cdmmoe_experts,
426
- self.hidden_dim,
427
- )
428
-
429
 
430
  def forward(
431
  self,
@@ -468,13 +534,13 @@ class DogeDecoderLayer(nn.Module):
468
  super().__init__()
469
  self.hidden_dropout = config.hidden_dropout
470
 
471
- self.pre_sequence_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
472
- self.attn = DOGE_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
473
- self.post_sequence_residual = Residual(config.hidden_size)
474
 
475
- self.pre_state_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
476
  self.feed_forward = DogeMLP(config) if config.is_moe == False else DogeCDMoE(config)
477
- self.post_state_residual = Residual(config.hidden_size)
478
 
479
  def forward(
480
  self,
@@ -492,29 +558,25 @@ class DogeDecoderLayer(nn.Module):
492
  Args:
493
  hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
494
  attention_mask (`torch.FloatTensor`, *optional*):
495
- attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
496
- query_sequence_length, key_sequence_length)` if default attention is used.
497
  output_attentions (`bool`, *optional*):
498
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
499
- returned tensors for more detail.
500
  use_cache (`bool`, *optional*):
501
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
502
- (see `past_key_values`).
503
  past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
504
  cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
505
  Indices depicting the position of the input sequence tokens in the sequence
506
  position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
507
- Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
508
- with `head_dim` being the embedding dimension of each attention head.
509
  kwargs (`dict`, *optional*):
510
- Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
511
- into the model
512
  """
513
 
514
  # sequence transformation
515
  residual = hidden_states
516
- hidden_states = self.pre_sequence_layernorm(hidden_states)
517
- hidden_states, present_key_value = self.attn(
518
  hidden_states=hidden_states,
519
  attention_mask=attention_mask,
520
  position_ids=position_ids,
@@ -525,14 +587,14 @@ class DogeDecoderLayer(nn.Module):
525
  )
526
  self_attn_weights = None
527
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
528
- hidden_states = self.post_sequence_residual(residual, hidden_states)
529
 
530
  # state transformation
531
  residual = hidden_states
532
- hidden_states = self.pre_state_layernorm(hidden_states)
533
  hidden_states = self.feed_forward(hidden_states)
534
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
535
- hidden_states = self.post_state_residual(residual, hidden_states)
536
 
537
  outputs = (hidden_states,)
538
 
@@ -552,6 +614,7 @@ class DogePreTrainedModel(PreTrainedModel):
552
  supports_gradient_checkpointing = True
553
  _no_split_modules = ["DogeDecoderLayer"]
554
  _skip_keys_device_placement = ["past_key_values"]
 
555
  _supports_sdpa = True
556
  _supports_cache_class = True
557
  _supports_quantized_cache = True
@@ -572,11 +635,10 @@ class DogePreTrainedModel(PreTrainedModel):
572
  DOGE_INPUTS_DOCSTRING = r"""
573
  Args:
574
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
575
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
576
- it.
577
 
578
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
579
- [`PreTrainedTokenizer.__call__`] for details.
580
 
581
  [What are input IDs?](../glossary#input-ids)
582
  attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -587,60 +649,48 @@ DOGE_INPUTS_DOCSTRING = r"""
587
 
588
  [What are attention masks?](../glossary#attention-mask)
589
 
590
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
591
- [`PreTrainedTokenizer.__call__`] for details.
592
 
593
- If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
594
- `past_key_values`).
595
 
596
- If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
597
- and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
598
- information on the default strategy.
599
 
600
  - 1 indicates the head is **not masked**,
601
  - 0 indicates the head is **masked**.
602
  position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
603
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
604
- config.n_positions - 1]`.
605
 
606
  [What are position IDs?](../glossary#position-ids)
607
  past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
608
- Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
609
- blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
610
- returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
611
 
612
  Two formats are allowed:
613
- - a [`~cache_utils.Cache`] instance, see our
614
- [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
615
- - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
616
- shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
617
- cache format.
618
-
619
- The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
620
- legacy cache format will be returned.
621
-
622
- If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
623
- have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
624
- of shape `(batch_size, sequence_length)`.
625
  inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
626
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
627
- is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
628
- model's internal embedding lookup matrix.
629
  use_cache (`bool`, *optional*):
630
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
631
- `past_key_values`).
632
  output_attentions (`bool`, *optional*):
633
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
634
- tensors for more detail.
635
  output_hidden_states (`bool`, *optional*):
636
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
637
- more detail.
638
  return_dict (`bool`, *optional*):
639
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
640
  cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
641
- Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
642
- this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
643
- the complete sequence length.
644
  """
645
 
646
 
@@ -711,9 +761,9 @@ class DogeModel(DogePreTrainedModel):
711
  else:
712
  past_key_values = DynamicCache.from_legacy_cache(past_key_values)
713
  logger.warning_once(
714
- "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
715
- "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
716
- "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
717
  )
718
 
719
  if cache_position is None:
@@ -739,7 +789,7 @@ class DogeModel(DogePreTrainedModel):
739
  all_self_attns = () if output_attentions else None
740
  next_decoder_cache = None
741
 
742
- for decoder_layer in self.layers:
743
  if output_hidden_states:
744
  all_hidden_states += (hidden_states,)
745
 
@@ -842,18 +892,15 @@ class DogeModel(DogePreTrainedModel):
842
  **kwargs,
843
  ):
844
  """
845
- Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
846
- `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
847
 
848
  Args:
849
  attention_mask (`torch.Tensor`):
850
- A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
851
- `(batch_size, 1, query_length, key_value_length)`.
852
  sequence_length (`int`):
853
  The sequence length being processed.
854
  target_length (`int`):
855
- The target length: when generating with static cache, the mask should be as long as the static cache,
856
- to account for the 0 padding, the part of the cache that is not filled yet.
857
  dtype (`torch.dtype`):
858
  The dtype to use for the 4D attention mask.
859
  device (`torch.device`):
@@ -912,13 +959,13 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
912
 
913
  def set_output_embeddings(self, new_embeddings):
914
  self.lm_head = new_embeddings
 
 
 
915
 
916
  def set_decoder(self, decoder):
917
  self.model = decoder
918
 
919
- def get_decoder(self):
920
- return self.model
921
-
922
  @add_start_docstrings_to_model_forward(DOGE_INPUTS_DOCSTRING)
923
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
924
  def forward(
@@ -926,7 +973,7 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
926
  input_ids: torch.LongTensor = None,
927
  attention_mask: Optional[torch.Tensor] = None,
928
  position_ids: Optional[torch.LongTensor] = None,
929
- past_key_values: Optional[torch.Tensor] = None,
930
  inputs_embeds: Optional[torch.FloatTensor] = None,
931
  labels: Optional[torch.LongTensor] = None,
932
  use_cache: Optional[bool] = None,
@@ -935,19 +982,19 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
935
  return_dict: Optional[bool] = None,
936
  cache_position: Optional[torch.LongTensor] = None,
937
  num_logits_to_keep: int = 0,
938
- **loss_kwargs,
939
  ) -> Union[Tuple, CausalLMOutputWithPast]:
940
  r"""
941
  Args:
942
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
943
- Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
944
- config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
945
- (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
946
 
947
  num_logits_to_keep (`int`, *optional*):
948
- Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
949
- `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
950
- token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
951
 
952
  Returns:
953
  """
@@ -969,6 +1016,7 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
969
  output_hidden_states=output_hidden_states,
970
  return_dict=return_dict,
971
  cache_position=cache_position,
 
972
  )
973
 
974
  hidden_states = outputs[0]
@@ -978,7 +1026,7 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
978
 
979
  loss = None
980
  if labels is not None:
981
- loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.vocab_size, **loss_kwargs)
982
 
983
  if not return_dict:
984
  output = (logits,) + outputs[1:]
@@ -993,18 +1041,98 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
993
  )
994
 
995
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
996
  @add_start_docstrings(
997
  """
998
  The Doge Model transformer with a sequence classification head on top (linear layer).
999
 
1000
- [`DogeForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1001
- (e.g. GPT-2) do.
1002
 
1003
- Since it does classification on the last token, it requires to know the position of the last token. If a
1004
- `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1005
- no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1006
- padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1007
- each row of the batch).
1008
  """
1009
  )
1010
  class DogeForSequenceClassification(DogePreTrainedModel):
@@ -1041,9 +1169,9 @@ class DogeForSequenceClassification(DogePreTrainedModel):
1041
  ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1042
  r"""
1043
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1044
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1045
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1046
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1047
  """
1048
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1049
 
 
39
  from transformers.utils import (
40
  add_start_docstrings,
41
  add_start_docstrings_to_model_forward,
42
+ is_torch_greater_or_equal,
43
  logging,
44
  replace_return_docstrings,
45
  )
 
50
  except ImportError:
51
  einx_add = None
52
 
53
+ if is_torch_greater_or_equal("2.5"):
54
+ from torch.nn.attention.flex_attention import flex_attention
55
+
56
 
57
  logger = logging.get_logger(__name__)
58
 
 
83
  def __init__(self, hidden_size):
84
  super().__init__()
85
  self.weight = nn.Parameter(torch.ones(hidden_size))
86
+
87
  def forward(self, residual_states, hidden_states):
88
  return self.weight * residual_states + hidden_states
89
 
 
96
  super().__init__()
97
  self.rope_kwargs = {}
98
 
99
+ if config.rope_scaling is not None:
100
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
101
  else:
102
+ self.rope_type = "default"
103
  self.max_seq_len_cached = config.max_position_embeddings
104
  self.original_max_seq_len = config.max_position_embeddings
105
  self.base = config.rope_theta
 
137
  # core RoPE block
138
  inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
139
  position_ids_expanded = position_ids[:, None, :].float()
140
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
141
  device_type = x.device.type
142
  device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
143
  with torch.autocast(device_type=device_type, enabled=False):
 
146
  cos = emb.cos()
147
  sin = emb.sin()
148
 
149
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
150
  cos = cos * self.attention_scaling
151
  sin = sin * self.attention_scaling
152
 
 
174
  Deprecated and unused.
175
  unsqueeze_dim (`int`, *optional*, defaults to 1):
176
  The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
177
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k.
178
+ For example, note that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim].
179
+ Then, if q and k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k.
180
+ Similarly, if q and k have the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
 
181
  Returns:
182
  `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
183
  """
 
188
  return q_embed, k_embed
189
 
190
 
191
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
192
+ """
193
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep).
194
+ The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
195
+ """
196
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
197
+ if n_rep == 1:
198
+ return hidden_states
199
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
200
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
201
+
202
+
203
  class DogeDynamicMaskAttention(nn.Module):
204
  """Dynamic Mask Attention from 'Wonderful Matrices' paper."""
205
 
 
210
  self.layer_idx = layer_idx
211
  if layer_idx is None:
212
  logger.warning_once(
213
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. "
214
+ "Please make sure to provide a `layer_idx` when creating this class."
 
215
  )
216
 
217
  self.hidden_dim = config.hidden_size
218
+ self.num_heads = config.num_attention_heads
219
+ self.head_dim = self.hidden_dim // self.num_heads
220
+ self.num_key_value_heads = config.num_key_value_heads
221
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
222
  self.attention_dropout = config.attention_dropout
223
+ self.dynamic_mask_ratio = config.dynamic_mask_ratio
224
 
225
  # Q K V O projections
226
+ self.q_proj = nn.Linear(self.hidden_dim, self.num_heads * self.head_dim, bias=config.hidden_bias)
227
+ self.k_proj = nn.Linear(self.hidden_dim, self.num_key_value_heads * self.head_dim, bias=config.hidden_bias)
228
+ self.v_proj = nn.Linear(self.hidden_dim, self.num_key_value_heads * self.head_dim, bias=config.hidden_bias)
 
 
 
 
 
 
 
229
  # dynamic mask for the QK^T attention score matrix
230
+ self.A = nn.Parameter(torch.ones(self.num_heads))
231
+ self.dt_proj = nn.Linear(self.num_key_value_heads * self.head_dim, self.num_heads, bias=config.hidden_bias)
232
+ self.o_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=config.hidden_bias)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
  def forward(
235
  self,
 
247
  key_states = self.k_proj(hidden_states)
248
  value_states = self.v_proj(hidden_states)
249
 
250
+ query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
251
+ key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
252
+ value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
 
 
 
 
 
 
253
 
254
  cos, sin = position_embeddings
255
  query_states, key_states = apply_QK_rotary_pos_emb(query_states, key_states, cos, sin)
 
259
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
260
  key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
261
 
262
+ # calculate dynamic mask from value_states
263
+ dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
264
+ dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
265
+
266
+ # repeat key and value states
267
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
268
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
269
+
270
  # compute attention scores matrix
271
+ attn_weights = torch.matmul(query_states, key_states.transpose(-1, -2)) / math.sqrt(self.head_dim)
272
 
273
  # add mask to attention scores
274
+ attn_mask = self.prepare_dynamic_mask(
275
+ hidden_states=hidden_states,
276
+ dynamic_mask=dynamic_mask,
277
+ dynamic_mask_ratio=0.1,
278
+ attention_mask=attention_mask,
279
+ )
280
+ attn_weights = attn_weights + attn_mask
281
 
282
  # upcast attention scores to fp32
283
  attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
 
292
 
293
  return attn_output, past_key_value
294
 
295
+ def prepare_dynamic_mask(
296
+ self,
297
+ hidden_states: torch.Tensor,
298
+ dynamic_mask: torch.Tensor,
299
+ dynamic_mask_ratio: float = 0.0,
300
+ attention_mask: Optional[torch.Tensor] = None,
301
+ ):
302
+ """
303
+ Combine `dynamic_mask` with `attention_mask` to generate the final `attn_mask`.
304
+
305
+ Args:
306
+ hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
307
+ dynamic_mask (`torch.Tensor`): dynamic mask of shape `(batch_size, num_heads, key_sequence_length)`.
308
+ dynamic_mask_ratio (`float`, *optional*): Ratio from 0.0 to 1.0 used to control the proportion of the dynamic mask filled with the minimum value.
309
+ attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
310
+ """
311
+ min_type = torch.finfo(hidden_states.dtype).min
312
+ attn_mask = dynamic_mask[:, :, None, :]
313
+ if 0.0 < dynamic_mask_ratio < 1.0:
314
+ num_dynamic_mask = int(attn_mask.shape[-1] * dynamic_mask_ratio)
315
+ if num_dynamic_mask > 0:
316
+ rate_value = torch.kthvalue(attn_mask, num_dynamic_mask, dim=-1, keepdim=True).values
317
+ attn_mask = attn_mask.masked_fill(attn_mask < rate_value, min_type)
318
+ if attention_mask is not None:
319
+ attn_mask = attn_mask.masked_fill(attention_mask[:, :, :, : hidden_states.shape[-2]] == min_type, min_type)
320
+ return attn_mask
321
+
322
 
323
+ class DogeSdpaDynamicMaskAttention(DogeDynamicMaskAttention):
324
 
325
  def forward(
326
  self,
 
338
  key_states = self.k_proj(hidden_states)
339
  value_states = self.v_proj(hidden_states)
340
 
341
+ query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
342
+ key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
343
+ value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
344
 
345
  cos, sin = position_embeddings
346
  query_states, key_states = apply_QK_rotary_pos_emb(query_states, key_states, cos, sin)
 
349
  # sin and cos are specific to RoPE models; cache_position needed for the static cache
350
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
351
  key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
352
+
353
+ # calculate dynamic mask from value_states
354
+ dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
355
+ dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
356
 
357
+ attn_mask = self.prepare_dynamic_mask(
358
+ hidden_states=hidden_states,
359
+ dynamic_mask=dynamic_mask,
360
+ dynamic_mask_ratio=self.dynamic_mask_ratio,
361
+ attention_mask=attention_mask,
362
+ )
363
 
364
  query_states = query_states.contiguous()
365
  key_states = key_states.contiguous()
366
  value_states = value_states.contiguous()
367
 
368
+ # NOTE: As of pytorch 2.5.1, cuDNN's SDPA backward pass is still incorrect, so we disable cuDNN SDPA (see https://github.com/pytorch/pytorch/issues/138581)
369
+ torch.backends.cuda.enable_cudnn_sdp(False)
370
  attn_output = F.scaled_dot_product_attention(
371
  query_states,
372
  key_states,
373
  value_states,
374
+ attn_mask=attn_mask,
375
+ dropout_p=self.attention_dropout if self.training else 0.0,
376
+ enable_gqa=True,
377
  )
378
 
379
  attn_output = attn_output.transpose(1, 2).contiguous()
 
383
  return attn_output, past_key_value
384
 
385
 
386
+ class DogeFlexDynamicMaskAttention(DogeDynamicMaskAttention):
387
+
388
+ def forward(
389
+ self,
390
+ hidden_states: torch.Tensor,
391
+ attention_mask: Optional[torch.Tensor] = None,
392
+ position_ids: Optional[torch.LongTensor] = None,
393
+ past_key_value: Optional[Cache] = None,
394
+ cache_position: Optional[torch.LongTensor] = None,
395
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
396
+ **kwargs,
397
+ ) -> Tuple[torch.Tensor, Optional[Cache]]:
398
+ bsz, q_len, _ = hidden_states.shape
399
+
400
+ query_states = self.q_proj(hidden_states)
401
+ key_states = self.k_proj(hidden_states)
402
+ value_states = self.v_proj(hidden_states)
403
+
404
+ query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
405
+ key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
406
+ value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
407
+
408
+ cos, sin = position_embeddings
409
+ query_states, key_states = apply_QK_rotary_pos_emb(query_states, key_states, cos, sin)
410
+
411
+ if past_key_value is not None:
412
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
413
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
414
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
415
+
416
+ dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
417
+ dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
418
+
419
+ attn_mask = self.prepare_dynamic_mask(
420
+ hidden_states=hidden_states,
421
+ dynamic_mask=dynamic_mask,
422
+ dynamic_mask_ratio=self.dynamic_mask_ratio,
423
+ attention_mask=attention_mask,
424
+ )
425
+ # TODO: flex_attention: Captured buffers that require grad are not yet supported.
426
+ # NOTE: So we only use flex_attention in inference mode.
427
+ def dynamic_mask_mod(score, batch, head, q_idx, kv_idx):
428
+ score = score + attn_mask[batch][head][q_idx][kv_idx]
429
+ return score
430
+
431
+ attn_output = flex_attention(
432
+ query_states,
433
+ key_states,
434
+ value_states,
435
+ score_mod=dynamic_mask_mod,
436
+ enable_gqa=True,
437
+ )
438
+
439
+ attn_output = attn_output.transpose(1, 2).contiguous()
440
+ attn_output = attn_output.view(bsz, q_len, -1)
441
+ attn_output = self.o_proj(attn_output)
442
+
443
+ return attn_output, past_key_value
444
+
445
+
446
  DOGE_ATTENTION_CLASSES = {
447
+ "flex_attention": DogeFlexDynamicMaskAttention,
448
  "eager": DogeDynamicMaskAttention,
449
+ "sdpa": DogeSdpaDynamicMaskAttention,
450
  }
451
 
452
 
 
458
  self.intermediate_dim = config.intermediate_size
459
  self.act_fn = ACT2FN[config.hidden_act]
460
 
461
+ self.gate_proj = nn.Linear(self.hidden_dim, self.intermediate_dim, bias=config.hidden_bias)
462
+ self.up_proj = nn.Linear(self.hidden_dim, self.intermediate_dim, bias=config.hidden_bias)
463
+ self.down_proj = nn.Linear(self.intermediate_dim, self.hidden_dim, bias=config.hidden_bias)
 
 
 
 
 
 
 
 
 
 
 
 
464
 
465
  def forward(
466
  self,
 
486
  self.num_keys = int(math.sqrt(self.num_cdmmoe_experts))
487
 
488
  # queries and keys for retrieval experts
489
+ self.queries = nn.Linear(self.hidden_dim, self.num_cdmmoe_heads * self.expert_retrieval_dim, bias=False)
490
+ self.keys = nn.Parameter(torch.zeros(self.num_cdmmoe_heads, self.num_keys, 2, self.expert_retrieval_dim // 2))
 
 
 
 
 
 
 
 
 
 
 
491
 
492
  # experts
493
+ self.down_embed = nn.Embedding(self.num_cdmmoe_experts, self.hidden_dim)
494
+ self.up_embed = nn.Embedding(self.num_cdmmoe_experts, self.hidden_dim)
 
 
 
 
 
 
 
495
 
496
  def forward(
497
  self,
 
534
  super().__init__()
535
  self.hidden_dropout = config.hidden_dropout
536
 
537
+ self.pre_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
538
+ self.self_attn = DOGE_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
539
+ self.pre_residual = Residual(config.hidden_size)
540
 
541
+ self.post_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
542
  self.feed_forward = DogeMLP(config) if config.is_moe == False else DogeCDMoE(config)
543
+ self.post_residual = Residual(config.hidden_size)
544
 
545
  def forward(
546
  self,
 
558
  Args:
559
  hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
560
  attention_mask (`torch.FloatTensor`, *optional*):
561
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, query_sequence_length, key_sequence_length)` if default attention is used.
 
562
  output_attentions (`bool`, *optional*):
563
+ Whether or not to return the attentions tensors of all attention layers.
564
+ See `attentions` under returned tensors for more detail.
565
  use_cache (`bool`, *optional*):
566
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`).
 
567
  past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
568
  cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
569
  Indices depicting the position of the input sequence tokens in the sequence
570
  position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
571
+ Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, with `head_dim` being the embedding dimension of each attention head.
 
572
  kwargs (`dict`, *optional*):
573
+ Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code into the model
 
574
  """
575
 
576
  # sequence transformation
577
  residual = hidden_states
578
+ hidden_states = self.pre_layernorm(hidden_states)
579
+ hidden_states, present_key_value = self.self_attn(
580
  hidden_states=hidden_states,
581
  attention_mask=attention_mask,
582
  position_ids=position_ids,
 
587
  )
588
  self_attn_weights = None
589
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
590
+ hidden_states = self.pre_residual(residual, hidden_states)
591
 
592
  # state transformation
593
  residual = hidden_states
594
+ hidden_states = self.post_layernorm(hidden_states)
595
  hidden_states = self.feed_forward(hidden_states)
596
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
597
+ hidden_states = self.post_residual(residual, hidden_states)
598
 
599
  outputs = (hidden_states,)
600
 
 
614
  supports_gradient_checkpointing = True
615
  _no_split_modules = ["DogeDecoderLayer"]
616
  _skip_keys_device_placement = ["past_key_values"]
617
+ _supports_flex_attn = True
618
  _supports_sdpa = True
619
  _supports_cache_class = True
620
  _supports_quantized_cache = True
 
635
  DOGE_INPUTS_DOCSTRING = r"""
636
  Args:
637
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
638
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it.
 
639
 
640
+ Indices can be obtained using [`AutoTokenizer`].
641
+ See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
642
 
643
  [What are input IDs?](../glossary#input-ids)
644
  attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
 
649
 
650
  [What are attention masks?](../glossary#attention-mask)
651
 
652
+ Indices can be obtained using [`AutoTokenizer`].
653
+ See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
654
 
655
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see `past_key_values`).
 
656
 
657
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] and modify it to your needs.
658
+ See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
 
659
 
660
  - 1 indicates the head is **not masked**,
661
  - 0 indicates the head is **masked**.
662
  position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
663
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.
 
664
 
665
  [What are position IDs?](../glossary#position-ids)
666
  past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
667
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) that can be used to speed up sequential decoding.
668
+ This typically consists of the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
 
669
 
670
  Two formats are allowed:
671
+ - a [`~cache_utils.Cache`] instance, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
672
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`. This is also known as the legacy cache format.
673
+
674
+ The model will output the same cache format that is fed as input.
675
+ If no `past_key_values` are passed, the legacy cache format will be returned.
676
+
677
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` of shape `(batch_size, sequence_length)`.
 
 
 
 
 
678
  inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
679
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
680
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
 
681
  use_cache (`bool`, *optional*):
682
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`).
 
683
  output_attentions (`bool`, *optional*):
684
+ Whether or not to return the attentions tensors of all attention layers.
685
+ See `attentions` under returned tensors for more detail.
686
  output_hidden_states (`bool`, *optional*):
687
+ Whether or not to return the hidden states of all layers.
688
+ See `hidden_states` under returned tensors for more detail.
689
  return_dict (`bool`, *optional*):
690
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
691
  cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
692
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, this tensor is not affected by padding.
693
+ It is used to update the cache in the correct position and to infer the complete sequence length.
 
694
  """
695
 
696
 
 
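# A minimal usage sketch for the inputs documented above, passing a `Cache` instance instead
# of the legacy tuple-of-tuples. The checkpoint path is a placeholder; loading this model
# needs `trust_remote_code=True` because it ships custom configuration/modeling code.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

path = "./results/Doge-20M-Instruct-DPO"  # placeholder path to a Doge checkpoint
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True)

inputs = tokenizer("Hello", return_tensors="pt")
past_key_values = DynamicCache()
with torch.no_grad():
    out = model(**inputs, past_key_values=past_key_values, use_cache=True)
next_token_id = out.logits[:, -1].argmax(dim=-1)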
761
  else:
762
  past_key_values = DynamicCache.from_legacy_cache(past_key_values)
763
  logger.warning_once(
764
+ "We detected that you are passing `past_key_values` as a tuple of tuples. "
765
+ "This is deprecated and will be removed in v4.47. "
766
+ "Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
767
  )
768
 
769
  if cache_position is None:
 
789
  all_self_attns = () if output_attentions else None
790
  next_decoder_cache = None
791
 
792
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
793
  if output_hidden_states:
794
  all_hidden_states += (hidden_states,)
795
 
 
892
  **kwargs,
893
  ):
894
  """
895
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
 
896
 
897
  Args:
898
  attention_mask (`torch.Tensor`):
899
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
 
900
  sequence_length (`int`):
901
  The sequence length being processed.
902
  target_length (`int`):
903
+ The target length: when generating with a static cache, the mask should be as long as the static cache in order to account for the zero padding, i.e. the part of the cache that is not yet filled.
 
904
  dtype (`torch.dtype`):
905
  The dtype to use for the 4D attention mask.
906
  device (`torch.device`):
 
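# An illustrative construction of the 4D additive mask described above, assuming the usual
# recipe: build a causal pattern of shape (sequence_length, target_length) filled with the
# dtype minimum, zero out the allowed positions, then merge the 2D padding mask.
import torch

batch_size, sequence_length, target_length = 2, 4, 6
dtype, device = torch.float32, "cpu"
min_dtype = torch.finfo(dtype).min

cache_position = torch.arange(target_length - sequence_length, target_length, device=device)
causal = torch.full((sequence_length, target_length), min_dtype, dtype=dtype, device=device)
causal = causal.masked_fill(torch.arange(target_length, device=device) <= cache_position[:, None], 0.0)
mask_4d = causal[None, None, :, :].expand(batch_size, 1, -1, -1).clone()

attention_mask = torch.ones(batch_size, target_length, dtype=torch.long, device=device)
attention_mask[1, :2] = 0  # e.g. left padding in the second sequence
mask_4d = mask_4d.masked_fill(attention_mask[:, None, None, :] == 0, min_dtype)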
959
 
960
  def set_output_embeddings(self, new_embeddings):
961
  self.lm_head = new_embeddings
962
+
963
+ def get_decoder(self):
964
+ return self.model
965
 
966
  def set_decoder(self, decoder):
967
  self.model = decoder
968
 
 
 
 
969
  @add_start_docstrings_to_model_forward(DOGE_INPUTS_DOCSTRING)
970
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
971
  def forward(
 
973
  input_ids: torch.LongTensor = None,
974
  attention_mask: Optional[torch.Tensor] = None,
975
  position_ids: Optional[torch.LongTensor] = None,
976
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
977
  inputs_embeds: Optional[torch.FloatTensor] = None,
978
  labels: Optional[torch.LongTensor] = None,
979
  use_cache: Optional[bool] = None,
 
982
  return_dict: Optional[bool] = None,
983
  cache_position: Optional[torch.LongTensor] = None,
984
  num_logits_to_keep: int = 0,
985
+ **kwargs,
986
  ) -> Union[Tuple, CausalLMOutputWithPast]:
987
  r"""
988
  Args:
989
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
990
+ Labels for computing the masked language modeling loss.
991
+ Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring).
992
+ Tokens with indices set to `-100` are ignored (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
993
 
994
  num_logits_to_keep (`int`, *optional*):
995
+ Calculate logits for the last `num_logits_to_keep` tokens.
996
+ If `0`, calculate logits for all `input_ids` (special case).
997
+ Only last token logits are needed for generation, and calculating them only for that token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
998
 
999
  Returns:
1000
  """
 
1016
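# What `num_logits_to_keep` amounts to, sketched on dummy tensors: slice the last positions
# of the hidden states before the LM head so only those logits are materialized (sizes are
# illustrative; `num_logits_to_keep=0` keeps every position).
import torch
import torch.nn as nn

hidden_size, vocab_size, num_logits_to_keep = 256, 32768, 1
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
hidden_states = torch.randn(2, 10, hidden_size)               # (batch, seq_len, hidden_size)
logits = lm_head(hidden_states[:, -num_logits_to_keep:, :])   # (batch, 1, vocab_size)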
  output_hidden_states=output_hidden_states,
1017
  return_dict=return_dict,
1018
  cache_position=cache_position,
1019
+ **kwargs,
1020
  )
1021
 
1022
  hidden_states = outputs[0]
 
1026
 
1027
  loss = None
1028
  if labels is not None:
1029
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.vocab_size, **kwargs)
1030
 
1031
  if not return_dict:
1032
  output = (logits,) + outputs[1:]
 
1041
  )
1042
 
1043
 
1044
+ class DogePatchEmbedding(nn.Module):
1045
+ """
1046
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial `hidden_states` of shape `(batch_size, seq_len, hidden_size)` to be consumed by a Transformer.
1047
+ """
1048
+
1049
+ def __init__(self, config: DogeConfig):
1050
+ super().__init__()
1051
+
1052
+ self.num_channels = config.num_channels
1053
+ self.patch_size = config.patch_size
1054
+ self.hidden_dim = config.hidden_size
1055
+
1056
+ self.sequence_proj = nn.Conv2d(self.num_channels, self.hidden_dim, kernel_size=self.patch_size, stride=self.patch_size)
1057
+ self.state_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=config.hidden_bias)
1058
+
1059
+ def forward(
1060
+ self,
1061
+ pixel_values: torch.Tensor,
1062
+ ) -> torch.Tensor:
1063
+ image_embedding = self.sequence_proj(pixel_values).flatten(2).transpose(1, 2)
1064
+ image_embedding = self.state_proj(image_embedding)
1065
+ return image_embedding
1066
+
1067
+
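# Shape walk-through of the patch embedding above, assuming patch_size=16, num_channels=3 and
# hidden_size=256, with an illustrative 224x224 input:
import torch
import torch.nn as nn

patch_size, num_channels, hidden_size = 16, 3, 256
sequence_proj = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

pixel_values = torch.randn(1, num_channels, 224, 224)   # (batch, num_channels, height, width)
patches = sequence_proj(pixel_values)                   # (1, 256, 14, 14)
image_embedding = patches.flatten(2).transpose(1, 2)    # (1, 196, 256) = (batch, seq_len, hidden_size)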
1068
+ class DogeForCausalVLM(DogeForCausalLM):
1069
+ _tied_weights_keys = ["lm_head.weight"]
1070
+
1071
+ def __init__(self, config: DogeConfig):
1072
+ super().__init__(config)
1073
+ self.config = config
1074
+ self.pixel_embed = DogePatchEmbedding(config)
1075
+
1076
+ # Initialize weights and apply final processing
1077
+ self.post_init()
1078
+
1079
+ def forward(
1080
+ self,
1081
+ input_ids: torch.LongTensor = None,
1082
+ pixel_values: torch.FloatTensor = None,
1083
+ attention_mask: Optional[torch.Tensor] = None,
1084
+ position_ids: Optional[torch.LongTensor] = None,
1085
+ past_key_values: Optional[torch.Tensor] = None,
1086
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1087
+ labels: Optional[torch.LongTensor] = None,
1088
+ use_cache: Optional[bool] = None,
1089
+ output_attentions: Optional[bool] = None,
1090
+ output_hidden_states: Optional[bool] = None,
1091
+ return_dict: Optional[bool] = None,
1092
+ cache_position: Optional[torch.LongTensor] = None,
1093
+ num_logits_to_keep: int = 0,
1094
+ **loss_kwargs,
1095
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1096
+ # TODO: @wubingheng111: refer to Llava for implementing the forward method
1097
+ ...
1098
+
1099
+ def prepare_inputs_for_generation(
1100
+ self,
1101
+ input_ids=None,
1102
+ pixel_values=None,
1103
+ past_key_values=None,
1104
+ input_embeds=None,
1105
+ attention_mask=None,
1106
+ cache_position=None,
1107
+ num_logits_to_keep=None,
1108
+ **kwargs,
1109
+ ):
1110
+ model_inputs = self.model.prepare_inputs_for_generation(
1111
+ input_ids,
1112
+ past_key_values=past_key_values,
1113
+ inputs_embeds=input_embeds,
1114
+ attention_mask=attention_mask,
1115
+ cache_position=cache_position,
1116
+ num_logits_to_keep=num_logits_to_keep,
1117
+ **kwargs,
1118
+ )
1119
+
1120
+ if cache_position[0] == 0:
1121
+ model_inputs["pixel_values"] = pixel_values
1122
+
1123
+ return model_inputs
1124
+
1125
+
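# The VLM forward above is left as a TODO. For orientation only, one common Llava-style
# approach (an assumption, not the author's implementation) is to embed the image patches and
# prepend them to the token embeddings before running the language model:
import torch

batch, hidden_size = 2, 256
text_embeds = torch.randn(batch, 10, hidden_size)    # hypothetically: token embeddings of input_ids
image_embeds = torch.randn(batch, 196, hidden_size)  # hypothetically: self.pixel_embed(pixel_values)
inputs_embeds = torch.cat([image_embeds, text_embeds], dim=1)  # (batch, 196 + 10, hidden_size)
# ...then the language model would be called with inputs_embeds=inputs_embeds, with labels
# shifted/padded so that the loss is only computed on the text positions.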
1126
  @add_start_docstrings(
1127
  """
1128
  The Doge Model transformer with a sequence classification head on top (linear layer).
1129
 
1130
+ [`DogeForSequenceClassification`] uses the last token in order to do the classification, as other causal models (e.g. GPT-2) do.
 
1131
 
1132
+ Since it does classification on the last token, it needs to know the position of the last token.
1133
+ If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row.
1134
+ If no `pad_token_id` is defined, it simply takes the last value in each row of the batch.
1135
+ Since it cannot guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in each row of the batch).
 
1136
  """
1137
  )
1138
  class DogeForSequenceClassification(DogePreTrainedModel):
 
1169
  ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1170
  r"""
1171
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1172
+ Labels for computing the sequence classification/regression loss.
1173
+ Indices should be in `[0, ..., config.num_labels - 1]`.
1174
+ If `config.num_labels == 1`, a regression loss is computed (Mean-Square loss); if `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
1175
  """
1176
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1177