Crystalcareai committed: Update modeling_quiet.py

modeling_quiet.py CHANGED (+13 -31)
@@ -136,22 +136,17 @@ def save_tokens_with_rewards_to_pdf(input_ids, token_rewards, tokenizer, output_
     c.save()
 
 
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
 def _get_unpad_data(attention_mask):
     seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
-
     indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
-
-
-
-
-
-    max_seqlen_in_batch
-
-    # Ensure seqlens_in_batch has the correct shape before cumulative sum
-    seqlens_in_batch = seqlens_in_batch.view(-1)
-    cu_seqlens = torch.cat([torch.zeros(1, dtype=torch.int32, device=attention_mask.device), seqlens_in_batch.cumsum(dim=0)])
-
-    return indices, cu_seqlens, max_seqlen_in_batch
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
 
 
 # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Quiet
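For reference, a minimal sketch of what the rewritten _get_unpad_data returns for a toy right-padded attention mask; the toy mask and the printed values are illustrative assumptions, not taken from the repository:

import torch
import torch.nn.functional as F

def _get_unpad_data(attention_mask):
    # Mirrors the committed function: per-row token counts, flat indices of real
    # tokens, longest sequence length, and cumulative sequence lengths.
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, max_seqlen_in_batch

# Two sequences padded to length 4: the first has 3 real tokens, the second 2.
mask = torch.tensor([[1, 1, 1, 0],
                     [1, 1, 0, 0]])
indices, cu_seqlens, max_len = _get_unpad_data(mask)
print(indices)     # tensor([0, 1, 2, 4, 5])  -> flat positions of the real tokens
print(cu_seqlens)  # tensor([0, 3, 5], dtype=torch.int32)  -> cumulative lengths
print(max_len)     # 3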
@@ -545,7 +540,7 @@ class QuietFlashAttention2(QuietAttention):
             value_states = value_states.to(target_dtype)
 
         # Compute the causal mask
-        causal = self.config.
+        causal = self.config.causal
         if causal:
             if self._flash_attn_uses_top_left_mask:
                 # Compute the causal mask
@@ -583,7 +578,7 @@ class QuietFlashAttention2(QuietAttention):
                 indices_q,
                 cu_seq_lens,
                 max_seq_lens,
-            ) = self.
+            ) = self._upad_input(query_states, key_states, value_states, attention_mask, q_len)
 
             cu_seqlens_q, cu_seqlens_k = cu_seq_lens
             max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
@@ -656,7 +651,7 @@ class QuietFlashAttention2(QuietAttention):
 
         return attn_output
 
-    def
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
 
        # On the first iteration we need to properly re-create the padding mask
@@ -665,23 +660,10 @@ class QuietFlashAttention2(QuietAttention):
            attention_mask_num_tokens = attention_mask.shape[-1]
            attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
 
-        # Check if attention_mask is empty or all zeros
-        if attention_mask.numel() == 0 or attention_mask.sum() == 0:
-            # Return the original query_layer, key_layer, and value_layer without modifications
-            return (
-                query_layer,
-                key_layer,
-                value_layer,
-                torch.arange(batch_size, device=query_layer.device),
-                (torch.arange(batch_size + 1, device=query_layer.device), torch.arange(batch_size + 1, device=key_layer.device)),
-                (query_layer.shape[1], key_layer.shape[1]),
-            )
-
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
 
        key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
-        value_layer
-
+        value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
        if query_length == kv_seq_len:
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
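The restored value_layer line performs the same gather as the key_layer line above it. Below is a minimal sketch of that unpadding step, using torch.index_select as a stand-in for flash_attn's index_first_axis; the stand-in and the toy shapes are assumptions, not code from the repository:

import torch

batch_size, kv_seq_len, num_heads, head_dim = 2, 4, 1, 8
value_layer = torch.randn(batch_size, kv_seq_len, num_heads, head_dim)
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])

# indices_k as _get_unpad_data would produce them: flat positions of real tokens.
indices_k = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()

# Flatten (batch, seq) into one axis, then keep only the rows of real tokens.
flat = value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim)
unpadded_value = torch.index_select(flat, 0, indices_k)  # stand-in for index_first_axis
print(unpadded_value.shape)  # torch.Size([5, 1, 8]) -> only the 5 real tokens remain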
@@ -786,7 +768,7 @@ class QuietSdpaAttention(QuietAttention):
             attn_mask=attention_mask.to(query_states.device) if attention_mask is not None else None,
             dropout_p=self.attention_dropout if self.training else 0.0,
             # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
-
+            causal=self.is_causal and attention_mask is None and q_len > 1,
         )
 
         attn_output = attn_output.transpose(1, 2).contiguous()
|