Crystalcareai committed
Commit 45ddaba · verified · 1 Parent(s): 15cda82

Update modeling_quiet.py

Files changed (1)
  1. modeling_quiet.py +10 -4
modeling_quiet.py CHANGED
@@ -1071,10 +1071,12 @@ class QuietModel(QuietPreTrainedModel):
                     " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                 )
         if self._attn_implementation == "flash_attention_2":
+
+            seq_length = original_sequence_length + num_thought_tokens
             # Convert 2D mask to 4D and adjust size
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask,
-                (batch_size, seq_len + num_thought_tokens),  # Adjust size
+                (batch_size, seq_len),  # Adjust size
                 inputs_embeds,
                 past_key_values_length,
                 sliding_window=self.config.sliding_window,
@@ -1084,17 +1086,19 @@ class QuietModel(QuietPreTrainedModel):
         elif self._attn_implementation == "sdpa" and not output_attentions and attention_mask.dim() == 2 and False:
             # output_attentions=True can not be supported when using SDPA, and we fall back on
             # the manual implementation that requires a 4D causal mask in all cases.
+            seq_length = original_sequence_length + num_thought_tokens
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask,
-                (batch_size, seq_len + num_thought_tokens),
+                (batch_size, seq_len),
                 inputs_embeds,
                 past_key_values_length,
             )
         elif attention_mask is None or attention_mask.dim() == 2:
             # 4d mask is passed through the layers
+            seq_length = original_sequence_length + num_thought_tokens
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask,
-                (batch_size, seq_len + num_thought_tokens),
+                (batch_size, seq_len),
                 inputs_embeds,
                 past_key_values_length,
                 sliding_window=self.config.sliding_window,
@@ -1665,10 +1669,12 @@ class QuietForCausalLM(QuietPreTrainedModel):
             num_thought_tokens += 1
         if self.use_end_thought_token:
             num_thought_tokens += 1
+
+        seq_length = original_sequence_length + num_thought_tokens
         # # if the attention mask
         attention_mask = _prepare_4d_causal_attention_mask(
             attention_mask,
-            (batch_size, seq_len + num_thought_tokens),
+            (batch_size, seq_len),
             inputs_embeds,
             past_key_values_length,
             sliding_window=self.config.sliding_window,
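
For context, here is a minimal sketch (not part of the commit) of what the resized mask call does. `_prepare_4d_causal_attention_mask` from `transformers.modeling_attn_mask_utils` expands a 2D padding mask into the 4D causal mask the decoder layers consume, and the shape tuple passed as its second argument has to cover the prompt tokens plus any appended start/end thought tokens. The tensor sizes below are made-up example values, and `original_sequence_length` / `num_thought_tokens` simply mirror the names used in the diff.

import torch
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask

# Illustrative sizes only; not taken from the model config.
batch_size = 2
original_sequence_length = 5
num_thought_tokens = 2          # e.g. one start-of-thought + one end-of-thought token
hidden_size = 8
past_key_values_length = 0

# Inner length once thought tokens are appended to the prompt.
seq_len = original_sequence_length + num_thought_tokens

# 2D padding mask and embeddings covering the extended sequence.
attention_mask_2d = torch.ones(batch_size, seq_len, dtype=torch.long)
inputs_embeds = torch.zeros(batch_size, seq_len, hidden_size)

# Expand to the 4D causal mask; the (batch_size, seq_len) tuple must match
# the embedded length, thought tokens included.
attention_mask_4d = _prepare_4d_causal_attention_mask(
    attention_mask_2d,
    (batch_size, seq_len),
    inputs_embeds,
    past_key_values_length,
    sliding_window=4096,        # stand-in for config.sliding_window
)

print(attention_mask_4d.shape)  # torch.Size([2, 1, 7, 7])

If the helper is not available in your installed transformers version, the same logic is exposed through AttentionMaskConverter in the same module.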