Crystalcareai committed
Commit 72e45de · verified · 1 Parent(s): a00ce27

Update modeling_quiet.py

Files changed (1)
  1. modeling_quiet.py +14 -71
modeling_quiet.py CHANGED
@@ -42,12 +42,12 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
-from transformers.activations import ACT2FN
-from transformers.cache_utils import Cache, DynamicCache
-from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
-from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import (
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
@@ -134,34 +134,6 @@ def save_tokens_with_rewards_to_pdf(input_ids, token_rewards, tokenizer, output_
         previous_text = current_text
     c.showPage()
     c.save()
-
-def _prepare_4d_causal_attention_mask_for_sdpa(
-    attn_mask: Optional[torch.Tensor],
-    shape: Tuple[int, int],
-    inputs_embeds: Optional[torch.Tensor] = None,
-    past_key_values_length: int = 0,
-) -> torch.Tensor:
-    batch_size, seq_len = shape
-    if attn_mask is None:
-        attn_mask = torch.ones((batch_size, seq_len), dtype=torch.bool, device=inputs_embeds.device)
-    else:
-        attn_mask = attn_mask.bool()
-
-    # Extend the attention mask to account for past key/value states
-    if past_key_values_length > 0:
-        extended_attn_mask = torch.cat(
-            [
-                attn_mask.new_zeros(batch_size, seq_len, past_key_values_length),
-                attn_mask.unsqueeze(2),
-            ],
-            dim=2,
-        )
-        attn_mask = extended_attn_mask
-
-    attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)
-    causal_mask = torch.tril(torch.ones(seq_len, seq_len + past_key_values_length, device=attn_mask.device)).bool()
-    attn_mask = attn_mask & causal_mask.unsqueeze(0).unsqueeze(0)
-    return attn_mask
 
 
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
@@ -309,8 +281,8 @@ class QuietAttention(nn.Module):
         self.layer_idx = layer_idx
         if layer_idx is None:
             logger.warning_once(
-                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
-                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                 "when creating this class."
             )
 
@@ -601,7 +573,7 @@ class QuietFlashAttention2(QuietAttention):
             attention_mask (`torch.Tensor`):
                 The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                 position of padding tokens and 1 for the position of non-padding tokens.
-            dropout (`int`, *optional*):
+            dropout (`float`):
                 Attention dropout
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
@@ -719,7 +691,8 @@ class QuietFlashAttention2(QuietAttention):
         )
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Quiet
+# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Quiet
+# TODO @Arthur no longer copied from LLama after static cache
 class QuietSdpaAttention(QuietAttention):
     """
     Quiet attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
@@ -793,14 +766,14 @@ class QuietSdpaAttention(QuietAttention):
             query_states,
             key_states,
             value_states,
-            attn_mask=attention_mask.to(query_states.device) if attention_mask is not None else None,
+            attn_mask=attention_mask,
             dropout_p=self.attention_dropout if self.training else 0.0,
             # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
             is_causal=self.is_causal and attention_mask is None and q_len > 1,
         )
 
         attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
 
         attn_output = self.o_proj(attn_output)
 
@@ -1665,37 +1638,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
         else:
             with torch.set_grad_enabled(not self.train_only_thinking_embedding):
                 inputs_embeds = self.model.embed_tokens(input_ids)
-
-    def _update_inputs_for_thought_tokens(
-        self, input_ids, attention_mask, contains_start, contains_end
-    ):
-        batch_size = input_ids.size(0)
-        seq_len = input_ids.size(1)
-
-        if contains_start:
-            start_token_ids = torch.tensor(
-                [[self.start_token_id]] * batch_size, device=input_ids.device
-            )
-            input_ids = torch.cat([input_ids, start_token_ids], dim=1)
-            if attention_mask is not None:
-                start_attention_mask = torch.ones(
-                    (batch_size, 1), device=attention_mask.device
-                )
-                attention_mask = torch.cat([attention_mask, start_attention_mask], dim=1)
-
-        if contains_end:
-            end_token_ids = torch.tensor(
-                [[self.end_token_id]] * batch_size, device=input_ids.device
-            )
-            input_ids = torch.cat([input_ids, end_token_ids], dim=1)
-            if attention_mask is not None:
-                end_attention_mask = torch.ones(
-                    (batch_size, 1), device=attention_mask.device
-                )
-                attention_mask = torch.cat([attention_mask, end_attention_mask], dim=1)
-
-        return input_ids, attention_mask
-
+
         if self.n_ahead != 1 or self.n_ahead_talk != 1 or self.comparison_mode:
             if attention_mask is None:
                 base_attention_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=0).to(input_ids.device)
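For context, a minimal sketch (not part of this commit; the shapes and padding pattern below are illustrative assumptions) of how the upstream _prepare_4d_causal_attention_mask_for_sdpa helper imported here is typically paired with torch.nn.functional.scaled_dot_product_attention, which is why the deleted local reimplementation and the attention_mask.to(query_states.device) handling are no longer needed:

# Sketch: pairing the upstream SDPA mask helper with F.scaled_dot_product_attention.
# Shapes and the padding pattern are illustrative assumptions, not values from the model.
import torch
import torch.nn.functional as F
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa

batch_size, seq_len, num_heads, head_dim = 2, 8, 4, 16
inputs_embeds = torch.randn(batch_size, seq_len, num_heads * head_dim)

# 2D padding mask (1 = attend, 0 = padding); here the second sequence has two pad tokens.
attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
attention_mask[1, :2] = 0

# Expands the 2D mask into the 4D additive mask SDPA expects, created on the same
# device and dtype as inputs_embeds (it can also return None when the mask is all
# ones and the causal shortcut can be used instead).
mask_4d = _prepare_4d_causal_attention_mask_for_sdpa(
    attention_mask,
    (batch_size, seq_len),
    inputs_embeds,
    past_key_values_length=0,
)

query = key = value = torch.randn(batch_size, num_heads, seq_len, head_dim)
attn_output = F.scaled_dot_product_attention(
    query,
    key,
    value,
    attn_mask=mask_4d,
    dropout_p=0.0,
    # Mirrors the pattern in QuietSdpaAttention: rely on is_causal only when no mask is given.
    is_causal=mask_4d is None and seq_len > 1,
)
print(attn_output.shape)  # torch.Size([2, 4, 8, 16])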