Crystalcareai committed
Update modeling_quiet.py

modeling_quiet.py CHANGED (+44 -50)
@@ -373,27 +373,13 @@ class QuietAttention(nn.Module):
                     f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                 )
 
-            # print("Hidden states contains NaN before applying attention mask:", torch.isnan(hidden_states).any().item())
-            # print("Attention mask contains NaN:", torch.isnan(attention_mask).any().item())
-
             attn_weights = attn_weights + attention_mask
 
-        # print("Attention weights contains NaN after applying attention mask:", torch.isnan(attn_weights).any().item())
-
         # upcast attention to fp32
-        # print("Attention weights contains NaN before softmax:", torch.isnan(attn_weights).any().item())
-
         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
-
-        # print("Attention weights contains NaN after softmax:", torch.isnan(attn_weights).any().item())
         attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
-        # print("Attention weights contains NaN before matmul:", torch.isnan(attn_weights).any().item())
-        # print("Value states contains NaN before matmul:", torch.isnan(value_states).any().item())
-
         attn_output = torch.matmul(attn_weights, value_states)
 
-        # print("Attention output contains NaN:", torch.isnan(attn_output).any().item())
-
         if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
             raise ValueError(
                 f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
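The context lines kept above show the usual fp32-upcast softmax path. A minimal, self-contained sketch of that pattern (shapes and dtype are made up for illustration, not taken from the model config):

```python
import torch
import torch.nn as nn

# Hypothetical shapes/dtype for illustration only.
bsz, num_heads, q_len, kv_seq_len = 2, 4, 8, 8
attn_weights = torch.randn(bsz, num_heads, q_len, kv_seq_len, dtype=torch.bfloat16)
attention_mask = torch.zeros(bsz, 1, q_len, kv_seq_len, dtype=torch.bfloat16)

# Additive mask, softmax upcast to fp32 for numerical stability, then cast back,
# mirroring the retained lines in the hunk above.
attn_weights = attn_weights + attention_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(torch.bfloat16)
```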
@@ -1085,22 +1071,27 @@ class QuietModel(QuietPreTrainedModel):
                 " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
             )
 
-        if …  (15 more removed lines not shown)
+        if self._attn_implementation == "flash_attention_2":
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._attn_implementation == "sdpa" and not output_attentions and attention_mask.dim() == 2 and False:
+            # output_attentions=True can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+            )
+        elif attention_mask is None or attention_mask.dim() == 2:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+                sliding_window=self.config.sliding_window,
+            )
 
         hidden_states = inputs_embeds
 
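The added branch follows the standard Mistral-style mask handling: flash-attention keeps the 2D padding mask, while the default path expands it into a 4D additive causal mask (the SDPA branch is effectively disabled by the trailing `and False`). A minimal hand-rolled sketch of that expansion, with made-up sizes and without the transformers helpers:

```python
import torch

# Hypothetical sizes for illustration only.
batch_size, seq_length, past_key_values_length = 2, 5, 0
dtype = torch.float32
min_value = torch.finfo(dtype).min

# 2D padding mask: 1 = real token, 0 = padding.
attention_mask_2d = torch.ones(batch_size, seq_length, dtype=torch.long)

# Causal structure: query i may attend to keys 0..i (plus any cached past keys).
causal = torch.tril(
    torch.ones(seq_length, seq_length + past_key_values_length, dtype=torch.bool),
    diagonal=past_key_values_length,
)

# Combine with padding and convert to additive form:
# 0 where attention is allowed, a large negative value where it is blocked.
allowed = causal.unsqueeze(0) & attention_mask_2d.bool().unsqueeze(1)
mask_4d = torch.where(allowed, 0.0, min_value).unsqueeze(1)  # (bsz, 1, q_len, kv_len)
```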
@@ -1110,7 +1101,6 @@ class QuietModel(QuietPreTrainedModel):
         next_decoder_cache = None
 
         for decoder_layer in self.layers:
-            print(f"Hidden states contains NaN before layer {id}:", torch.isnan(hidden_states).any().item())
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
 
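The inline NaN print removed here (and the related ones elsewhere in this commit) can be reproduced without editing the modeling code, for example via forward hooks. A hypothetical debugging helper, not part of this commit:

```python
import torch

def attach_nan_checks(model):
    """Flag NaNs in per-module outputs via forward hooks (debugging aid only)."""
    handles = []
    for name, module in model.named_modules():
        def hook(mod, inputs, output, name=name):
            out = output[0] if isinstance(output, tuple) else output
            if torch.is_tensor(out) and torch.isnan(out).any():
                print(f"NaN detected in output of {name}")
        handles.append(module.register_forward_hook(hook))
    return handles  # call handle.remove() on each when finished
```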
@@ -1168,7 +1158,6 @@ def nonzero_mean(x, axis=None):
 
 def loss_mean(x):
     return x.sum() / (x != 0).sum()
-    print(f"Hidden states contains NaN after layer {id}:", torch.isnan(hidden_states).any().item())
 
 class QuietForCausalLM(QuietPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
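For reference, `loss_mean` (like `nonzero_mean` named in the hunk header) averages only over nonzero entries, so zeroed-out (e.g. masked) losses do not dilute the mean:

```python
import torch

x = torch.tensor([0.0, 2.0, 0.0, 4.0])
print(x.mean())                  # tensor(1.5000) -- plain mean counts the zeros
print(x.sum() / (x != 0).sum())  # tensor(3.)     -- loss_mean ignores them
```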
@@ -1353,8 +1342,6 @@ class QuietForCausalLM(QuietPreTrainedModel):
                 return_dict=return_dict,
             )
             new_key_values = outputs.past_key_values
-            print(f"Hidden states contains NaN: {torch.isnan(hidden_states).any().item()}")
-
             hidden_states = outputs[0]
             logits = self.lm_head(hidden_states)
             logits = logits[:, -1, :]  # Only consider the last token
@@ -1896,15 +1883,29 @@ class QuietForCausalLM(QuietPreTrainedModel):
         inputs_embeds = inputs_embeds.view(probabilities.size(0), probabilities.size(1), -1).to(self.model.embed_tokens.weight.dtype)
         inputs_embeds = inputs_embeds.view(probabilities.size(0), probabilities.size(1), -1).to(self.model.embed_tokens.weight.dtype)
 
-        if attention_mask …  (8 more removed lines not shown)
+        if len(attention_mask.shape) == 2:
+            breakpoint()
+        else:
+            original_attention = attention_mask[..., :attention_mask.shape[-2]]
+            if self.use_upper_triangular:
+                new_attention = original_attention
+            else:
+                original_attention = original_attention == attention_mask.max()
+                # because eye isn't implemented for BF16, we need to handle the case
+                if not attention_mask.dtype == torch.bfloat16:
+                    new_attention = torch.eye(
+                        seq_len, dtype=attention_mask.dtype, device=attention_mask.device
+                    )
+                else:
+                    new_attention = torch.eye(
+                        seq_len, dtype=torch.float32, device=attention_mask.device
+                    ).to(attention_mask.dtype)
+
+            new_attention = new_attention.view(1, 1, seq_len, seq_len).repeat(input_ids.shape[0], 1, 1, 1)
+            new_attention = new_attention * original_attention
+            new_attention[new_attention == 0] = attention_mask.min()
+            new_attention[new_attention == 1] = attention_mask.max()
+            attention_mask = torch.cat([attention_mask, new_attention], dim=-1)
         past_key_values = outputs.past_key_values
         position_ids = position_ids + 1
 
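Reading the added block: the existing additive mask is extended along the key axis with a batched identity block (intersected with the positions the original mask already allows), so that query position i may additionally attend to the i-th newly appended token; `attention_mask.max()` marks allowed positions and `attention_mask.min()` blocked ones. A toy version with stand-in mask values and made-up sizes:

```python
import torch

# Stand-ins for attention_mask.max() / attention_mask.min() in the real mask.
attend, blocked = 0.0, -1e9
bsz, seq_len = 1, 4

# Toy base mask: causal, additive form (0 = attend, -1e9 = blocked).
causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
base_mask = torch.where(causal, attend, blocked).expand(bsz, 1, seq_len, seq_len)

# Identity block along the key axis: query i also sees appended token i.
eye_block = torch.where(torch.eye(seq_len, dtype=torch.bool), attend, blocked)
eye_block = eye_block.expand(bsz, 1, seq_len, seq_len)

# The commit concatenates this block onto the last (key) dimension.
extended = torch.cat([base_mask, eye_block], dim=-1)  # (bsz, 1, seq_len, 2 * seq_len)
```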
@@ -1917,16 +1918,9 @@ class QuietForCausalLM(QuietPreTrainedModel):
                 else:
                     loss_logits = logits
                 shift_idx = 1 + max(0, ahead_idx - (self.n_ahead - 1))
-                # print("initial_loss_logits contains NaN:", torch.isnan(initial_loss_logits).any().item())
-                # print("logits contains NaN:", torch.isnan(logits).any().item())
-                # print("loss_logits contains NaN:", torch.isnan(loss_logits).any().item())
-
                 shift_logits = loss_logits[..., :-shift_idx, :].contiguous()
-                # print("shift_logits contains NaN:", torch.isnan(shift_logits).any().item())
                 shift_labels = labels[..., shift_idx:].contiguous()
                 # Flatten the tokens
-                # assert not torch.isnan(shift_logits).any(), "NaN values found in shift_logits"
-                # assert not torch.isnan(shift_labels).any(), "NaN values found in shift_labels"
                 loss_fct = CrossEntropyLoss(reduction="none")
                 shift_logits = shift_logits.view(-1, self.config.vocab_size)
                 shift_labels = shift_labels.view(-1)
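The retained lines compute a cross-entropy against labels shifted `shift_idx` positions ahead. A self-contained toy version with hypothetical sizes:

```python
import torch
from torch.nn import CrossEntropyLoss

# Hypothetical sizes for illustration.
bsz, seq_len, vocab_size, shift_idx = 2, 6, 11, 2
loss_logits = torch.randn(bsz, seq_len, vocab_size)
labels = torch.randint(vocab_size, (bsz, seq_len))

shift_logits = loss_logits[..., :-shift_idx, :].contiguous()  # predictions at 0 .. L-1-shift
shift_labels = labels[..., shift_idx:].contiguous()           # targets at shift .. L-1

loss_fct = CrossEntropyLoss(reduction="none")
per_token_loss = loss_fct(
    shift_logits.view(-1, vocab_size), shift_labels.view(-1)
).view(bsz, -1)
```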