Update modeling_quiet.py
modeling_quiet.py CHANGED (+10, -4)
@@ -1071,10 +1071,12 @@ class QuietModel(QuietPreTrainedModel):
                     " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                 )
         if self._attn_implementation == "flash_attention_2":
+
+            seq_length = original_sequence_length + num_thought_tokens
             # Convert 2D mask to 4D and adjust size
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask,
-                (batch_size, seq_len
+                (batch_size, seq_len),  # Adjust size
                 inputs_embeds,
                 past_key_values_length,
                 sliding_window=self.config.sliding_window,
@@ -1084,17 +1086,19 @@ class QuietModel(QuietPreTrainedModel):
         elif self._attn_implementation == "sdpa" and not output_attentions and attention_mask.dim() == 2 and False:
             # output_attentions=True can not be supported when using SDPA, and we fall back on
             # the manual implementation that requires a 4D causal mask in all cases.
+            seq_length = original_sequence_length + num_thought_tokens
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask,
-                (batch_size, seq_len
+                (batch_size, seq_len),
                 inputs_embeds,
                 past_key_values_length,
             )
         elif attention_mask is None or attention_mask.dim() == 2:
             # 4d mask is passed through the layers
+            seq_length = original_sequence_length + num_thought_tokens
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask,
-                (batch_size, seq_len
+                (batch_size, seq_len),
                 inputs_embeds,
                 past_key_values_length,
                 sliding_window=self.config.sliding_window,
@@ -1665,10 +1669,12 @@ class QuietForCausalLM(QuietPreTrainedModel):
             num_thought_tokens += 1
         if self.use_end_thought_token:
             num_thought_tokens += 1
+
+        seq_length = original_sequence_length + num_thought_tokens
         # # if the attention mask
         attention_mask = _prepare_4d_causal_attention_mask(
             attention_mask,
-            (batch_size, seq_len
+            (batch_size, seq_len),
            inputs_embeds,
            past_key_values_length,
            sliding_window=self.config.sliding_window,
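
For context, a minimal self-contained sketch of the call pattern the patch adjusts. It assumes the private `_prepare_4d_causal_attention_mask` helper from `transformers.modeling_attn_mask_utils` (the helper this modeling file calls, available in the transformers releases the file is written against); the batch size, hidden size, token counts, and sliding-window value below are illustrative assumptions, while `original_sequence_length`, `num_thought_tokens`, and `seq_length` mirror the names used in the diff.

# Minimal sketch (not the repository's code): how the adjusted sequence length
# feeds into the 4D causal-mask helper. Tensor sizes and the sliding-window
# value are made up for illustration.
import torch
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask

batch_size = 2
original_sequence_length = 8
num_thought_tokens = 3            # e.g. thought tokens plus start/end markers
past_key_values_length = 0

# The patch computes the query length over the original tokens plus the
# inserted thought tokens before building the mask.
seq_length = original_sequence_length + num_thought_tokens

inputs_embeds = torch.randn(batch_size, seq_length, 64)                  # dummy hidden states
attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long)    # 2D padding mask

# Returns a (batch_size, 1, seq_length, seq_length + past_key_values_length)
# mask with causal (and optional sliding-window) masking applied.
mask_4d = _prepare_4d_causal_attention_mask(
    attention_mask,
    (batch_size, seq_length),
    inputs_embeds,
    past_key_values_length,
    sliding_window=4096,  # illustrative; the model passes self.config.sliding_window
)
print(mask_4d.shape)  # torch.Size([2, 1, 11, 11])

The same pattern appears in each of the three branches the diff touches (the flash_attention_2, SDPA, and default mask-preparation paths in QuietModel, plus the corresponding call in QuietForCausalLM).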