Crystalcareai committed
Commit 6f9c805 · verified · 1 Parent(s): 3bbca75

Update modeling_quiet.py

Files changed (1)
  1. modeling_quiet.py +122 -109
modeling_quiet.py CHANGED
@@ -571,117 +571,133 @@ class QuietFlashAttention2(QuietAttention):
 
         return attn_output, attn_weights, past_key_value
 
-    def _flash_attention_forward(
-        self,
-        query_states,
-        key_states,
-        value_states,
-        attention_mask,
-        query_length,
-        dropout=0.0,
-        softmax_scale=None,
-        use_sliding_windows=False,
-    ):
-        """
-        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
-        first unpad the input, then computes the attention scores and pad the final attention scores.
-        Args:
-            query_states (`torch.Tensor`):
-                Input query states to be passed to Flash Attention API
-            key_states (`torch.Tensor`):
-                Input key states to be passed to Flash Attention API
-            value_states (`torch.Tensor`):
-                Input value states to be passed to Flash Attention API
-            attention_mask (`torch.Tensor`):
-                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
-                position of padding tokens and 1 for the position of non-padding tokens.
-            dropout (`int`, *optional*):
-                Attention dropout
-            softmax_scale (`float`, *optional*):
-                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
-            use_sliding_windows (`bool`, *optional*):
-                Whether to activate sliding window attention.
-        """
-        if not self._flash_attn_uses_top_left_mask:
-            causal = self.is_causal
-        else:
-            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
-            causal = self.is_causal and query_length != 1
-
-        # Ensure attention_mask has the correct shape and values
-        if attention_mask is not None:
-            if attention_mask.dim() == 4:
-                # Convert 4D attention mask to 2D
-                attention_mask = attention_mask.squeeze(1).squeeze(1)
-            elif attention_mask.dim() != 2:
-                raise ValueError(
-                    f"Invalid attention mask dimension: {attention_mask.dim()}. Expected 2D or 4D mask."
-                )
-
-            # Ensure attention_mask has values of 0 and 1
-            attention_mask = attention_mask.to(torch.bool).to(torch.int32)
-
-        # Contains at least one padding token in the sequence
-        if attention_mask is not None:
-            batch_size = query_states.shape[0]
-            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
-                query_states, key_states, value_states, attention_mask, query_length
-            )
-
-            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
-            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
-
-            if not use_sliding_windows:
-                attn_output_unpad = flash_attn_varlen_func(
-                    query_states,
-                    key_states,
-                    value_states,
-                    cu_seqlens_q=cu_seqlens_q,
-                    cu_seqlens_k=cu_seqlens_k,
-                    max_seqlen_q=max_seqlen_in_batch_q,
-                    max_seqlen_k=max_seqlen_in_batch_k,
-                    dropout_p=dropout,
-                    softmax_scale=softmax_scale,
-                    causal=causal,
-                )
-            else:
-                attn_output_unpad = flash_attn_varlen_func(
-                    query_states,
-                    key_states,
-                    value_states,
-                    cu_seqlens_q=cu_seqlens_q,
-                    cu_seqlens_k=cu_seqlens_k,
-                    max_seqlen_q=max_seqlen_in_batch_q,
-                    max_seqlen_k=max_seqlen_in_batch_k,
-                    dropout_p=dropout,
-                    softmax_scale=softmax_scale,
-                    causal=causal,
-                    window_size=(self.config.sliding_window, self.config.sliding_window),
-                )
-
-            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
-        else:
-            if not use_sliding_windows:
-                attn_output = flash_attn_func(
-                    query_states,
-                    key_states,
-                    value_states,
-                    dropout,
-                    softmax_scale=softmax_scale,
-                    causal=causal,
-                )
-            else:
-                attn_output = flash_attn_func(
-                    query_states,
-                    key_states,
-                    value_states,
-                    dropout,
-                    softmax_scale=softmax_scale,
-                    causal=causal,
-                    window_size=(self.config.sliding_window, self.config.sliding_window),
-                )
-
-        return attn_output
+    def _flash_attention_forward(
+        self,
+        query_states,
+        key_states,
+        value_states,
+        attention_mask,
+        query_length,
+        dropout=0.0,
+        softmax_scale=None,
+        use_sliding_windows=False,
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+        first unpad the input, then computes the attention scores and pad the final attention scores.
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            dropout (`int`, *optional*):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+            use_sliding_windows (`bool`, *optional*):
+                Whether to activate sliding window attention.
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
+        # Ensure attention_mask has the correct shape and values
+        if attention_mask is not None:
+            if attention_mask.dim() == 4:
+                # Convert 4D attention mask to 2D
+                attention_mask = attention_mask.squeeze(1).squeeze(1)
+            elif attention_mask.dim() != 2:
+                raise ValueError(
+                    f"Invalid attention mask dimension: {attention_mask.dim()}. Expected 2D or 4D mask."
+                )
+
+            # Ensure attention_mask has values of 0 and 1
+            attention_mask = attention_mask.to(torch.bool).to(torch.int32)
+
+        # Contains at least one padding token in the sequence
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+            # Create the cu_seqlens_q and cu_seqlens_k tensors
+            q_max_s, k_max_s = query_states.shape[1], key_states.shape[1]
+            qkv_max_s = max(q_max_s, k_max_s)
+
+            q_seqlens = torch.full((batch_size,), q_max_s, dtype=torch.int32, device=query_states.device)
+            k_seqlens = torch.full((batch_size,), k_max_s, dtype=torch.int32, device=key_states.device)
+
+        # Adjust the attention mask to match the sequence lengths
+        if attention_mask is not None:
+            q_seqlens = attention_mask.sum(dim=1).int()
+            k_seqlens = attention_mask.sum(dim=1).int()
+
+            # Convert seqlens to cumulative sequence lengths
+            cu_seqlens_q = torch.cat([torch.zeros(1, dtype=torch.int32, device=q_seqlens.device), q_seqlens.cumsum(dim=0)])
+            cu_seqlens_k = torch.cat([torch.zeros(1, dtype=torch.int32, device=k_seqlens.device), k_seqlens.cumsum(dim=0)])
+
+            if not use_sliding_windows:
+                attn_output_unpad = flash_attn_varlen_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_k=cu_seqlens_k,
+                    max_seqlen_q=qkv_max_s,
+                    max_seqlen_k=qkv_max_s,
+                    dropout_p=dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                )
+            else:
+                attn_output_unpad = flash_attn_varlen_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_k=cu_seqlens_k,
+                    max_seqlen_q=qkv_max_s,
+                    max_seqlen_k=qkv_max_s,
+                    dropout_p=dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                    window_size=(self.config.sliding_window, self.config.sliding_window),
+                )
+
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            if not use_sliding_windows:
+                attn_output = flash_attn_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                )
+            else:
+                attn_output = flash_attn_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                    window_size=(self.config.sliding_window, self.config.sliding_window),
+                )
+
+        return attn_output
 
     def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
         batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
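The substance of this hunk is how the 2D padding mask is turned into the cumulative sequence lengths (`cu_seqlens_q` / `cu_seqlens_k`) that `flash_attn_varlen_func` consumes. Below is a minimal, self-contained sketch of that construction on a hypothetical two-sequence batch; the mask values are illustrative only, and the `_upad_input` step and the kernel call itself are omitted.

import torch

# Hypothetical padding mask: 1 marks a real token, 0 marks padding (illustrative values).
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]], dtype=torch.int32)

# Per-sequence valid lengths, derived the same way as in the updated branch.
seqlens = attention_mask.sum(dim=1).int()    # lengths: [3, 5]

# Prepend a zero and take the cumulative sum to get the offsets that
# flash_attn_varlen_func expects for packed (unpadded) inputs.
cu_seqlens = torch.cat([torch.zeros(1, dtype=torch.int32), seqlens.cumsum(dim=0)])
print(cu_seqlens)                            # offsets: [0, 3, 8]

max_seqlen = int(seqlens.max())              # longest sequence in the batch: 5

Each consecutive pair of offsets delimits one sequence inside the flattened token layout that unpadding produces, which is the layout the varlen kernel operates on.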
 
@@ -1848,10 +1864,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
             [shift_labels, padding],
             dim=-1
         )
-
-        # Adjust the labels to account for the additional thinking tokens
-        new_rm_tokens = torch.cat([torch.full_like(new_rm_tokens[..., :self.n_ahead], self.tokenizer.pad_token_id, dtype=torch.long, device=new_rm_tokens.device), new_rm_tokens], dim=-1)
-
+
 
         # print((new_rm_tokens > self.vocab_size - 1).any().item())
         new_rm_tokens = torch.clamp(new_rm_tokens, 0, self.vocab_size - 1)
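For context on the second hunk: the removed line prepended `self.n_ahead` pad-token ids along the last dimension of `new_rm_tokens` before the clamp above. A standalone sketch of that prepend pattern, with purely illustrative values for `n_ahead`, the pad id, and the token tensor:

import torch

n_ahead = 4        # illustrative; the model reads this from its own attribute
pad_token_id = 0   # illustrative pad id

new_rm_tokens = torch.tensor([[11, 12, 13, 14, 15, 16]])

# Build a (batch, n_ahead) block of pad ids with matching dtype/device,
# then prepend it along the last dimension - the pattern this commit removes.
pad_block = torch.full_like(new_rm_tokens[..., :n_ahead], pad_token_id)
shifted = torch.cat([pad_block, new_rm_tokens], dim=-1)
print(shifted)     # tensor([[ 0,  0,  0,  0, 11, 12, 13, 14, 15, 16]])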