YongganFu committed
Commit 5bbbd14
1 Parent(s): 06f4e52

Upload HymbaForCausalLM

Files changed (3):
  1. README.md +3 -4
  2. config.json +9 -9
  3. modeling_hymba.py +161 -127
README.md CHANGED
@@ -1,12 +1,11 @@
 ---
+base_model:
+- nvidia/Hymba-1.5B-Base
 library_name: transformers
 license: other
 license_name: nvidia-open-model-license
-license_link: >-
-  https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf
+license_link: https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf
 pipeline_tag: text-generation
-base_model:
-- nvidia/Hymba-1.5B-Base
 ---
 
 # Hymba-1.5B-Instruct
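
The front-matter change is metadata-only: the `base_model` entry moves up and the `license_link` folded scalar (`>-`) is collapsed onto a single line. A quick sanity check (a minimal sketch, assuming PyYAML is available) confirms both spellings parse to the same URL string:

```python
import yaml  # PyYAML, assumed available for this check

old_front_matter = """\
license_link: >-
  https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf
"""
new_front_matter = """\
license_link: https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf
"""

# The folded-scalar form and the plain-scalar form yield the same string.
assert yaml.safe_load(old_front_matter) == yaml.safe_load(new_front_matter)
```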
config.json CHANGED
@@ -15,14 +15,6 @@
   "conv_dim": {
     "0": 3200,
     "1": 3200,
-    "2": 3200,
-    "3": 3200,
-    "4": 3200,
-    "5": 3200,
-    "6": 3200,
-    "7": 3200,
-    "8": 3200,
-    "9": 3200,
     "10": 3200,
     "11": 3200,
     "12": 3200,
@@ -33,6 +25,7 @@
     "17": 3200,
     "18": 3200,
     "19": 3200,
+    "2": 3200,
     "20": 3200,
     "21": 3200,
     "22": 3200,
@@ -43,8 +36,15 @@
     "27": 3200,
     "28": 3200,
     "29": 3200,
+    "3": 3200,
     "30": 3200,
-    "31": 3200
+    "31": 3200,
+    "4": 3200,
+    "5": 3200,
+    "6": 3200,
+    "7": 3200,
+    "8": 3200,
+    "9": 3200
   },
   "eos_token_id": 2,
   "global_attn_idx": [
modeling_hymba.py CHANGED
@@ -1579,146 +1579,133 @@ class HymbaBlock(nn.Module):
     def cuda_kernels_forward(self, hidden_states: torch.Tensor, cache_params: HybridMambaAttentionDynamicCache = None, attention_mask=None, position_ids=None, kv_last_layer=None, use_cache=False, use_swa=False):
         projected_states = self.in_proj(hidden_states).transpose(1, 2) ## (bs, latent_dim, seq_len)
 
-        if (
-            self.training and cache_params is None and not self.apply_inner_layernorms
-        ):  # Doesn't support outputting the states -> used for training
-            contextualized_states = mamba_inner_fn(
-                projected_states,
-                self.conv1d.weight,
-                self.conv1d.bias if self.use_conv_bias else None,
-                self.x_proj.weight,
-                self.dt_proj.weight,
-                self.out_proj.weight,
-                self.out_proj.bias.float() if self.use_bias else None,
-                -torch.exp(self.A_log.float()),
-                None,  # input-dependent B
-                None,  # input-dependent C
-                self.D.float(),
-                delta_bias=self.dt_proj.bias.float(),
-                delta_softplus=True,
-            )
-
-        else:
-            batch_size, seq_len, _ = hidden_states.shape
-            use_precomputed_states = (
-                cache_params is not None
-                and cache_params.has_previous_state
-                and seq_len == 1
-                and cache_params.conv_states[self.layer_idx].shape[0]
-                == cache_params.ssm_states[self.layer_idx].shape[0]
-                == batch_size
-                and use_cache
-            )
-
-            hidden_states, gate = projected_states.tensor_split((self.latent_dim,), dim=1)
-
-            conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
-
-            if self.reuse_kv:
-                query_states, hidden_states = hidden_states.tensor_split((self.attn_hidden_size,), dim=1)
-                query_states = query_states.transpose(1,2)
-            else:
-                query_states, key_states, value_states, hidden_states = hidden_states.tensor_split((self.attn_hidden_size, self.attn_hidden_size + self.k_hidden_size, self.attn_hidden_size + self.k_hidden_size + self.v_hidden_size), dim=1)
-
-                query_states = query_states.transpose(1,2)
-                key_states = key_states.transpose(1,2)
-                value_states = value_states.transpose(1,2)
-
-            if use_precomputed_states:
-                hidden_states = causal_conv1d_update(
-                    hidden_states.squeeze(-1),
-                    cache_params.conv_states[self.layer_idx],
-                    conv_weights,
-                    self.conv1d.bias,
-                    self.activation,
-                )
-                hidden_states = hidden_states.unsqueeze(-1)
-
-                cache_params.mamba_past_length[self.layer_idx] += seq_len
-            else:
-                if cache_params is not None:
-                    conv_states = nn.functional.pad(
-                        hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0)
-                    )
-
-                    cache_params.conv_states[self.layer_idx].copy_(conv_states)
-
-                    cache_params.mamba_past_length[self.layer_idx] += seq_len
-
-                hidden_states = causal_conv1d_fn(
-                    hidden_states, conv_weights, self.conv1d.bias, activation=self.activation
-                )
-
-            if self.reuse_kv:
-                assert kv_last_layer is not None
-                attn_outputs, attn_key_value = self.self_attn(attention_mask=attention_mask, position_ids=position_ids, query_states=query_states, kv_last_layer=kv_last_layer, use_swa=use_swa, use_cache=use_cache, past_key_value=cache_params)
-            else:
-                attn_outputs, attn_key_value = self.self_attn(attention_mask=attention_mask, position_ids=position_ids, query_states=query_states, key_states=key_states, value_states=value_states, use_swa=use_swa, use_cache=use_cache, past_key_value=cache_params)
-
-            ## Mamba head
-            index = 0
-            ssm_parameters = self.x_proj[index](hidden_states.transpose(1, 2))
-            time_step, B, C = torch.split(
-                ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
-            )
-            time_step, B, C = self._apply_layernorms(time_step, B, C)
-
-            if hasattr(self.dt_proj[index], "base_layer"):
-                time_proj_bias = self.dt_proj[index].base_layer.bias
-                self.dt_proj[index].base_layer.bias = None
-            else:
-                time_proj_bias = self.dt_proj[index].bias
-                self.dt_proj[index].bias = None
-            discrete_time_step = self.dt_proj[index](time_step).transpose(1, 2)  # [batch, intermediate_size, seq_len]
-
-            if hasattr(self.dt_proj[index], "base_layer"):
-                self.dt_proj[index].base_layer.bias = time_proj_bias
-            else:
-                self.dt_proj[index].bias = time_proj_bias
-
-            A = -torch.exp(self.A_log[index].float())
-
-            time_proj_bias = time_proj_bias.float() if time_proj_bias is not None else None
-            if 1:
-                if use_precomputed_states:
-                    scan_outputs = selective_state_update(
-                        cache_params.ssm_states[self.layer_idx],
-                        hidden_states[..., 0],
-                        discrete_time_step[..., 0],
-                        A,
-                        B[:, 0],
-                        C[:, 0],
-                        self.D[index],
-                        gate[..., 0],
-                        time_proj_bias,
-                        dt_softplus=True,
-                    ).unsqueeze(-1)
-                else:
-                    outputs = selective_scan_fn(
-                        hidden_states,
-                        discrete_time_step,
-                        A,
-                        B.transpose(1, 2),
-                        C.transpose(1, 2),
-                        self.D[index].float(),
-                        z=gate,
-                        delta_bias=time_proj_bias,
-                        delta_softplus=True,
-                        return_last_state=True,
-                    )
-
-                    if len(outputs) == 3:
-                        scan_outputs, ssm_state, _ = outputs
-                    else:
-                        scan_outputs, ssm_state = outputs
-
-                    if ssm_state is not None and cache_params is not None:
-                        cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
-
-                scan_outputs = scan_outputs.transpose(1, 2)
-
-            hidden_states = (self.pre_avg_layernorm1(attn_outputs) + self.pre_avg_layernorm2(scan_outputs)) / 2
-            contextualized_states = self.out_proj(hidden_states)
+        ## Handle padding for Mamba: Set padding tokens to 0
+        if projected_states.shape[-1] > 1 and attention_mask is not None and (attention_mask == 0).any():
+            projected_states = projected_states * attention_mask.unsqueeze(1).to(projected_states)
+
+        batch_size, seq_len, _ = hidden_states.shape
+        use_precomputed_states = (
+            cache_params is not None
+            and cache_params.has_previous_state
+            and seq_len == 1
+            and cache_params.conv_states[self.layer_idx].shape[0]
+            == cache_params.ssm_states[self.layer_idx].shape[0]
+            == batch_size
+            and use_cache
+        )
+
+        hidden_states, gate = projected_states.tensor_split((self.latent_dim,), dim=1)
+
+        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
+
+        if self.reuse_kv:
+            query_states, hidden_states = hidden_states.tensor_split((self.attn_hidden_size,), dim=1)
+            query_states = query_states.transpose(1,2)
+        else:
+            query_states, key_states, value_states, hidden_states = hidden_states.tensor_split((self.attn_hidden_size, self.attn_hidden_size + self.k_hidden_size, self.attn_hidden_size + self.k_hidden_size + self.v_hidden_size), dim=1)
+
+            query_states = query_states.transpose(1,2)
+            key_states = key_states.transpose(1,2)
+            value_states = value_states.transpose(1,2)
+
+        if use_precomputed_states:
+            hidden_states = causal_conv1d_update(
+                hidden_states.squeeze(-1),
+                cache_params.conv_states[self.layer_idx],
+                conv_weights,
+                self.conv1d.bias,
+                self.activation,
+            )
+            hidden_states = hidden_states.unsqueeze(-1)
+
+            cache_params.mamba_past_length[self.layer_idx] += seq_len
+        else:
+            if cache_params is not None:
+                conv_states = nn.functional.pad(
+                    hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0)
+                )
+
+                cache_params.conv_states[self.layer_idx].copy_(conv_states)
+
+                cache_params.mamba_past_length[self.layer_idx] += seq_len
+
+            hidden_states = causal_conv1d_fn(
+                hidden_states, conv_weights, self.conv1d.bias, activation=self.activation
+            )
+
+        ## Handle padding for Mamba: Set padding tokens to 0
+        if seq_len > 1 and attention_mask is not None and (attention_mask == 0).any():
+            hidden_states = hidden_states * attention_mask.unsqueeze(1).to(hidden_states)
+
+        if self.reuse_kv:
+            assert kv_last_layer is not None
+            attn_outputs, attn_key_value = self.self_attn(attention_mask=attention_mask, position_ids=position_ids, query_states=query_states, kv_last_layer=kv_last_layer, use_swa=use_swa, use_cache=use_cache, past_key_value=cache_params)
+        else:
+            attn_outputs, attn_key_value = self.self_attn(attention_mask=attention_mask, position_ids=position_ids, query_states=query_states, key_states=key_states, value_states=value_states, use_swa=use_swa, use_cache=use_cache, past_key_value=cache_params)
+
+        ## Mamba head
+        index = 0
+        ssm_parameters = self.x_proj[index](hidden_states.transpose(1, 2))
+        time_step, B, C = torch.split(
+            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
+        )
+        time_step, B, C = self._apply_layernorms(time_step, B, C)
+
+        if hasattr(self.dt_proj[index], "base_layer"):
+            time_proj_bias = self.dt_proj[index].base_layer.bias
+            self.dt_proj[index].base_layer.bias = None
+        else:
+            time_proj_bias = self.dt_proj[index].bias
+            self.dt_proj[index].bias = None
+        discrete_time_step = self.dt_proj[index](time_step).transpose(1, 2)  # [batch, intermediate_size, seq_len]
+
+        if hasattr(self.dt_proj[index], "base_layer"):
+            self.dt_proj[index].base_layer.bias = time_proj_bias
+        else:
+            self.dt_proj[index].bias = time_proj_bias
+
+        A = -torch.exp(self.A_log[index].float())
+
+        time_proj_bias = time_proj_bias.float() if time_proj_bias is not None else None
+        if use_precomputed_states:
+            scan_outputs = selective_state_update(
+                cache_params.ssm_states[self.layer_idx],
+                hidden_states[..., 0],
+                discrete_time_step[..., 0],
+                A,
+                B[:, 0],
+                C[:, 0],
+                self.D[index],
+                gate[..., 0],
+                time_proj_bias,
+                dt_softplus=True,
+            ).unsqueeze(-1)
+        else:
+            outputs = selective_scan_fn(
+                hidden_states,
+                discrete_time_step,
+                A,
+                B.transpose(1, 2),
+                C.transpose(1, 2),
+                self.D[index].float(),
+                z=gate,
+                delta_bias=time_proj_bias,
+                delta_softplus=True,
+                return_last_state=True,
+            )
+
+            if len(outputs) == 3:
+                scan_outputs, ssm_state, _ = outputs
+            else:
+                scan_outputs, ssm_state = outputs
+
+            if ssm_state is not None and cache_params is not None:
+                cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+
+        scan_outputs = scan_outputs.transpose(1, 2)
+
+        hidden_states = (self.pre_avg_layernorm1(attn_outputs) + self.pre_avg_layernorm2(scan_outputs)) / 2
+        contextualized_states = self.out_proj(hidden_states)
 
         return contextualized_states, attn_key_value
 
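The main change in `cuda_kernels_forward` is dropping the training-only `mamba_inner_fn` fast path (and the vestigial `if 1:` wrapper), so attention and SSM always run through one code path, and zeroing padded positions before the causal convolution and selective scan so that pad tokens cannot leak into the Mamba state. A minimal standalone sketch of that masking idea (hypothetical tensor and function names, not the model's own API):

```python
import torch

def mask_padded_channels(x: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Zero out padded time steps of a (batch, channels, seq_len) activation.

    x:              projected states fed to the causal conv / selective scan.
    attention_mask: (batch, seq_len), 1 for real tokens, 0 for padding.
    """
    if x.shape[-1] > 1 and attention_mask is not None and (attention_mask == 0).any():
        # Broadcast the mask over the channel dimension, matching x's dtype and device.
        x = x * attention_mask.unsqueeze(1).to(x)
    return x

# Toy example: the second sequence has two pad positions at the front.
states = torch.randn(2, 4, 5)                      # (batch, channels, seq_len)
mask = torch.tensor([[1, 1, 1, 1, 1],
                     [0, 0, 1, 1, 1]])
masked = mask_padded_channels(states, mask)
assert torch.all(masked[1, :, :2] == 0)
```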
@@ -2038,6 +2025,49 @@ class HymbaPreTrainedModel(PreTrainedModel):
 
 
 
+def shift_zeros_to_front(attention_mask, hidden_states, position_ids):
+    """
+    Move all zero entries in 'attention_mask' to the front of the sequence
+    and reorder 'hidden_states' accordingly, preserving the order of zeros
+    and the order of ones.
+
+    Args:
+        attention_mask: (batch_size, seq_len), values in {0, 1}.
+        hidden_states: (batch_size, seq_len, dim).
+
+    Returns:
+        shifted_mask: (batch_size, seq_len) with zeros at the front.
+        shifted_states: (batch_size, seq_len, dim) reordered accordingly.
+    """
+    B, L = attention_mask.shape
+    D = hidden_states.shape[-1]
+
+    shifted_mask = torch.empty_like(attention_mask)
+    shifted_states = torch.empty_like(hidden_states)
+    shifted_position_ids = torch.empty_like(position_ids)
+
+    # Process each batch row independently
+    for b in range(B):
+        row_mask = attention_mask[b]   # (seq_len,)
+        row_states = hidden_states[b]  # (seq_len, dim)
+        row_pos = position_ids[b]      # (seq_len,)
+
+        # Find positions of zeros and ones
+        zero_indices = torch.where(row_mask == 0)[0]
+        one_indices = torch.where(row_mask == 1)[0]
+
+        # Concatenate zero indices (in order) then one indices
+        new_order = torch.cat([zero_indices, one_indices], dim=0)
+
+        # Reorder mask and states
+        shifted_mask[b] = row_mask[new_order]
+        shifted_states[b] = row_states[new_order]
+        shifted_position_ids[b] = row_pos[new_order]
+
+    return shifted_mask, shifted_states, shifted_position_ids
+
+
+
 HYMBA_INPUTS_DOCSTRING = r"""
     Args: To be added later. Please refer to the forward function.
 """
@@ -2206,7 +2236,11 @@ class HymbaModel(HymbaPreTrainedModel):
 
         if position_ids is not None and position_ids.shape[1] != inputs_embeds.shape[1]:
             position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device).unsqueeze(0)
-
+
+        ## Handle paddings: Shift all padding tokens to the beginning of the sequence
+        if inputs_embeds.shape[1] > 1 and attention_mask is not None and (attention_mask == 0).any():
+            attention_mask, inputs_embeds, position_ids = shift_zeros_to_front(attention_mask, inputs_embeds, position_ids)
+
         attention_mask_raw = attention_mask
 
         if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
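
With the hunk above, a right-padded batch is converted to left padding (pads shifted to the front, with `position_ids` carried along) before the hybrid layers run, so the Mamba scan sees the real tokens last and the zeroed pad positions lead the sequence. A small illustration on concrete tensors (toy values; assumes `shift_zeros_to_front` from the hunk above is in scope):

```python
import torch

attention_mask = torch.tensor([[1, 1, 1, 1],
                               [1, 1, 0, 0]])          # second row is right-padded
inputs_embeds = torch.arange(8, dtype=torch.float).view(2, 4, 1)
position_ids = torch.arange(4).unsqueeze(0).repeat(2, 1)

mask, embeds, pos = shift_zeros_to_front(attention_mask, inputs_embeds, position_ids)
print(mask)
# tensor([[1, 1, 1, 1],
#         [0, 0, 1, 1]])  -> pads now lead the sequence; real tokens keep their relative order
```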
 