Update modeling_quiet.py
modeling_quiet.py CHANGED (+10, -4)
@@ -1071,10 +1071,12 @@ class QuietModel(QuietPreTrainedModel):
                     " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                 )
         if self._attn_implementation == "flash_attention_2":
+
+            seq_length = original_sequence_length + num_thought_tokens
             # Convert 2D mask to 4D and adjust size
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask,
-                (batch_size, seq_len
+                (batch_size, seq_len),  # Adjust size
                 inputs_embeds,
                 past_key_values_length,
                 sliding_window=self.config.sliding_window,
@@ -1084,17 +1086,19 @@ class QuietModel(QuietPreTrainedModel):
         elif self._attn_implementation == "sdpa" and not output_attentions and attention_mask.dim() == 2 and False:
             # output_attentions=True can not be supported when using SDPA, and we fall back on
             # the manual implementation that requires a 4D causal mask in all cases.
+            seq_length = original_sequence_length + num_thought_tokens
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask,
-                (batch_size, seq_len
+                (batch_size, seq_len),
                 inputs_embeds,
                 past_key_values_length,
             )
         elif attention_mask is None or attention_mask.dim() == 2:
             # 4d mask is passed through the layers
+            seq_length = original_sequence_length + num_thought_tokens
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask,
-                (batch_size, seq_len
+                (batch_size, seq_len),
                 inputs_embeds,
                 past_key_values_length,
                 sliding_window=self.config.sliding_window,
@@ -1665,10 +1669,12 @@ class QuietForCausalLM(QuietPreTrainedModel):
             num_thought_tokens += 1
         if self.use_end_thought_token:
             num_thought_tokens += 1
+
+        seq_length = original_sequence_length + num_thought_tokens
         # # if the attention mask
         attention_mask = _prepare_4d_causal_attention_mask(
             attention_mask,
-            (batch_size, seq_len
+            (batch_size, seq_len),
            inputs_embeds,
            past_key_values_length,
            sliding_window=self.config.sliding_window,
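
For context, a minimal self-contained sketch of the call pattern the patch adjusts. It assumes the private `_prepare_4d_causal_attention_mask` helper from `transformers.modeling_attn_mask_utils` (the helper this modeling file calls, available in the transformers releases the file is written against); the batch size, hidden size, token counts, and sliding-window value below are illustrative assumptions, while `original_sequence_length`, `num_thought_tokens`, and `seq_length` mirror the names used in the diff.

# Minimal sketch (not the repository's code): how the adjusted sequence length
# feeds into the 4D causal-mask helper. Tensor sizes and the sliding-window
# value are made up for illustration.
import torch
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask

batch_size = 2
original_sequence_length = 8
num_thought_tokens = 3            # e.g. thought tokens plus start/end markers
past_key_values_length = 0

# The patch computes the query length over the original tokens plus the
# inserted thought tokens before building the mask.
seq_length = original_sequence_length + num_thought_tokens

inputs_embeds = torch.randn(batch_size, seq_length, 64)                  # dummy hidden states
attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long)    # 2D padding mask

# Returns a (batch_size, 1, seq_length, seq_length + past_key_values_length)
# mask with causal (and optional sliding-window) masking applied.
mask_4d = _prepare_4d_causal_attention_mask(
    attention_mask,
    (batch_size, seq_length),
    inputs_embeds,
    past_key_values_length,
    sliding_window=4096,  # illustrative; the model passes self.config.sliding_window
)
print(mask_4d.shape)  # torch.Size([2, 1, 11, 11])

The same pattern appears in each of the three branches the diff touches (the flash_attention_2, SDPA, and default mask-preparation paths in QuietModel, plus the corresponding call in QuietForCausalLM).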