Update modeling_quiet.py
modeling_quiet.py  CHANGED  (+14 -35)
@@ -1070,40 +1070,28 @@ class QuietModel(QuietPreTrainedModel):
                 " this may lead to unexpected behaviour for Flash Attention version of Quiet. Make sure to "
                 " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
             )
-        if self._attn_implementation == "flash_attention_2":
-            num_thought_tokens = 0
-            if self.use_start_thought_token:
-                num_thought_tokens += 1
-            if self.use_end_thought_token:
-                num_thought_tokens += 1
-            original_sequence_length = input_ids.shape[1]
-            seq_length = original_sequence_length + num_thought_tokens
-            # Convert 2D mask to 4D and adjust size
-            attention_mask = _prepare_4d_causal_attention_mask(
-                attention_mask,
-                (batch_size, seq_len),  # Adjust size
-                inputs_embeds,
-                past_key_values_length,
-                sliding_window=self.config.sliding_window,
-            )
+        if attention_mask is not None and self._attn_implementation == 'flash_attention_2':
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
-        elif self._attn_implementation == "sdpa" and not output_attentions:
+        elif self._attn_implementation == 'sdpa' and not output_attentions:
             # output_attentions=True can not be supported when using SDPA, and we fall back on
             # the manual implementation that requires a 4D causal mask in all cases.
-
-
-
-                (batch_size, seq_len),
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                (batch_size, seq_length),
                 inputs_embeds,
                 past_key_values_length,
             )
-
+        else:
+            # Check the shape of the attention mask
+            if attention_mask is not None and attention_mask.dim() == 2:
+                # Reshape the attention mask to 4D
+                attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
+
             # 4d mask is passed through the layers
-            seq_length = original_sequence_length + num_thought_tokens
             attention_mask = _prepare_4d_causal_attention_mask(
-                attention_mask,
-                (batch_size, seq_length),
+                attention_mask,
+                (batch_size, seq_length),
                 inputs_embeds,
                 past_key_values_length,
                 sliding_window=self.config.sliding_window,
@@ -1668,18 +1656,9 @@ class QuietForCausalLM(QuietPreTrainedModel):
                 [torch.ones((attention_mask.shape[0], past_key_values_length), dtype=attention_mask.dtype, device=attention_mask.device), attention_mask],
                 dim=-1
             )
-
-            num_thought_tokens = 0
-            if self.use_start_thought_token:
-                num_thought_tokens += 1
-            if self.use_end_thought_token:
-                num_thought_tokens += 1
-
-            original_sequence_length = input_ids.shape[1]
-            seq_length = original_sequence_length + num_thought_tokens
             # # if the attention mask
             attention_mask = _prepare_4d_causal_attention_mask(
-                attention_mask,
+                attention_mask,
                 (batch_size, seq_len),
                 inputs_embeds,
                 past_key_values_length,