Crystalcareai committed
Commit 120f09f · verified · 1 Parent(s): ced45b7

Update modeling_quiet.py

Files changed (1)
  1. modeling_quiet.py +14 -56
modeling_quiet.py CHANGED
@@ -607,6 +607,19 @@ class QuietFlashAttention2(QuietAttention):
         else:
             # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
             causal = self.is_causal and query_length != 1
+
+        # Ensure attention_mask has the correct shape and values
+        if attention_mask is not None:
+            if attention_mask.dim() == 4:
+                # Convert 4D attention mask to 2D
+                attention_mask = attention_mask.squeeze(1).squeeze(1)
+            elif attention_mask.dim() != 2:
+                raise ValueError(
+                    f"Invalid attention mask dimension: {attention_mask.dim()}. Expected 2D or 4D mask."
+                )
+
+            # Ensure attention_mask has values of 0 and 1
+            attention_mask = attention_mask.to(torch.bool).to(torch.int32)
 
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
@@ -667,63 +680,8 @@ class QuietFlashAttention2(QuietAttention):
                 causal=causal,
                 window_size=(self.config.sliding_window, self.config.sliding_window),
             )
-            try:
-                attn_output_unpad = flash_attn_varlen_func(
-                    query_states,
-                    key_states,
-                    value_states,
-                    cu_seqlens_q=cu_seqlens_q,
-                    cu_seqlens_k=cu_seqlens_k,
-                    max_seqlen_q=max_seqlen_in_batch_q,
-                    max_seqlen_k=max_seqlen_in_batch_k,
-                    dropout_p=dropout,
-                    softmax_scale=softmax_scale,
-                    causal=causal,
-                )
-            except RuntimeError as e:
-                if "cu_seqlens_q must have shape (batch_size + 1)" in str(e):
-                    # Handle the case when cu_seqlens_q has an invalid shape
-                    if attention_mask is not None:
-                        # Ensure attention_mask has the correct shape
-                        if attention_mask.dim() == 2:
-                            # Convert 2D attention mask to 4D
-                            attention_mask = _prepare_4d_causal_attention_mask(
-                                attention_mask,
-                                (query_states.size(0), query_states.size(1)),
-                                query_states,
-                                past_key_values_length=0,
-                                sliding_window=0,
-                            )
-                        elif attention_mask.dim() != 4:
-                            raise ValueError(
-                                f"Invalid attention mask dimension: {attention_mask.dim()}. Expected 2D or 4D mask."
-                            )
-
-                        # Update cu_seqlens_q based on the attention mask
-                        cu_seqlens_q = attention_mask.sum(dim=-1).flatten().cumsum(dim=0).to(torch.int32)
-                        max_seqlen_in_batch_q = cu_seqlens_q[-1].item()
-
-                        # Retry flash_attn_varlen_func with updated cu_seqlens_q
-                        attn_output_unpad = flash_attn_varlen_func(
-                            query_states,
-                            key_states,
-                            value_states,
-                            cu_seqlens_q=cu_seqlens_q,
-                            cu_seqlens_k=cu_seqlens_k,
-                            max_seqlen_q=max_seqlen_in_batch_q,
-                            max_seqlen_k=max_seqlen_in_batch_k,
-                            dropout_p=dropout,
-                            softmax_scale=softmax_scale,
-                            causal=causal,
-                        )
-                    else:
-                        raise ValueError(
-                            "Attention mask is required for flash-attn when cu_seqlens_q has an invalid shape."
-                        )
-                else:
-                    raise e
 
-        return attn_output
+        return attn_output
 
     def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
         batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
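Note: the sketch below is not part of the commit; it is a minimal, self-contained illustration of what the newly added mask-normalization block does. The helper name normalize_attention_mask and the sample mask values are assumptions for illustration only; in the commit the same logic is inlined in QuietFlashAttention2 ahead of the padding check.

    import torch

    def normalize_attention_mask(attention_mask: torch.Tensor) -> torch.Tensor:
        # Mirrors the added block: accept a 2D (batch, seq_len) mask or a
        # 4D (batch, 1, 1, seq_len) mask and return a 2D 0/1 int32 mask.
        if attention_mask.dim() == 4:
            # Drop the two singleton dimensions to get back to (batch, seq_len)
            attention_mask = attention_mask.squeeze(1).squeeze(1)
        elif attention_mask.dim() != 2:
            raise ValueError(
                f"Invalid attention mask dimension: {attention_mask.dim()}. Expected 2D or 4D mask."
            )
        # The double cast maps any nonzero entry to 1 and zeros to 0
        return attention_mask.to(torch.bool).to(torch.int32)

    mask_4d = torch.tensor([[[[1.0, 1.0, 0.0]]],
                            [[[1.0, 0.0, 0.0]]]])  # illustrative shape (2, 1, 1, 3)
    print(normalize_attention_mask(mask_4d))
    # tensor([[1, 1, 0],
    #         [1, 0, 0]], dtype=torch.int32)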
 
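Note: the retry path removed above was keyed off flash-attn's requirement that cu_seqlens_q have shape (batch_size + 1). Below is a minimal sketch, under the assumption of a 2D 0/1 mask, of how such cumulative sequence-length offsets are typically derived; the variable names and sample values are illustrative and not taken from modeling_quiet.py.

    import torch
    import torch.nn.functional as F

    # Illustrative 2D mask: a batch of 2 sequences with 3 and 2 real tokens
    attention_mask = torch.tensor([[1, 1, 1, 0],
                                   [1, 1, 0, 0]], dtype=torch.int32)

    # Per-sequence token counts, then a cumulative sum padded with a leading
    # zero, giving the (batch_size + 1,)-shaped offsets flash_attn_varlen_func expects
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)   # tensor([3, 2])
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    max_seqlen_in_batch = seqlens_in_batch.max().item()

    print(cu_seqlens)           # tensor([0, 3, 5], dtype=torch.int32), shape (batch_size + 1,)
    print(max_seqlen_in_batch)  # 3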