Crystalcareai committed
Update modeling_quiet.py

modeling_quiet.py CHANGED (+75 -49)
@@ -23,6 +23,7 @@ import math
 import copy
 import os
 import time
+import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 import wandb
@@ -68,6 +69,73 @@ logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "QuietConfig"
 
+from reportlab.pdfgen import canvas
+from reportlab.lib.pagesizes import letter
+from reportlab.lib.colors import HexColor
+
+def save_tokens_with_rewards_to_pdf(input_ids, token_rewards, tokenizer, output_file="text.pdf", eps=0.2, eps2=0.5):
+    c = canvas.Canvas(output_file, pagesize=letter)
+    c.setFont("Courier", 8)
+    x, y = 50, 750
+    previous_text = ""
+    current_text = ""
+    for token_idx, reward in enumerate(token_rewards):
+        current_text = tokenizer.decode(input_ids[: token_idx + 1])
+        if current_text != previous_text:
+            diff_text = current_text[len(previous_text) :]
+            if "\n" in diff_text:
+                lines = diff_text.split("\n")
+                for line_idx, line in enumerate(lines):
+                    if line_idx > 0:
+                        x = 50
+                        y -= 12
+                    if abs(reward) < eps:
+                        opacity = 0
+                    elif abs(reward) > eps2:
+                        opacity = 0.8
+                    else:
+                        opacity = 0.8 * (abs(reward) - eps) / (eps2 - eps)
+                    text_width = c.stringWidth(line)
+                    if reward > 0:
+                        highlight_color = HexColor("#4CCD99")
+                    else:
+                        highlight_color = HexColor("#FFC700")
+                    highlight_color.alpha = opacity
+                    c.setFillColor(highlight_color)
+                    c.rect(x, y - 2, text_width, 10, fill=True, stroke=False)
+                    c.setFillColor(HexColor("#000000"))
+                    c.drawString(x, y, line)
+                    x += text_width
+            else:
+                if abs(reward) < eps:
+                    opacity = 0
+                elif abs(reward) > eps2:
+                    opacity = 0.8
+                else:
+                    opacity = 0.8 * (abs(reward) - eps) / (eps2 - eps)
+                text_width = c.stringWidth(diff_text)
+                if reward > 0:
+                    highlight_color = HexColor("#4CCD99")
+                else:
+                    highlight_color = HexColor("#FFC700")
+                highlight_color.alpha = opacity
+                c.setFillColor(highlight_color)
+                c.rect(x, y - 2, text_width, 10, fill=True, stroke=False)
+                c.setFillColor(HexColor("#000000"))
+                c.drawString(x, y, diff_text)
+                x += text_width
+            if x > 550:
+                x = 50
+                y -= 12
+                if y < 50:
+                    c.showPage()
+                    y = 750
+                    x = 50
+        previous_text = current_text
+    c.showPage()
+    c.save()
+
+
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 def _get_unpad_data(attention_mask):
     seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
@@ -257,13 +325,6 @@ class QuietAttention(nn.Module):
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-
-        if past_key_value is not None:
-            expected_attention_mask_size = (bsz, 1, q_len, q_len + past_key_value.get_usable_length(q_len, self.layer_idx))
-            if attention_mask.size() != expected_attention_mask_size:
-                # Assuming the attention mask is larger than expected, slice it to match the expected size
-                attention_mask = attention_mask[:, :, :, -expected_attention_mask_size[-1]:]
-
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
@@ -307,16 +368,11 @@ class QuietAttention(nn.Module):
         )
 
         if attention_mask is not None:
-            if attention_mask.dim() == 3:
-                attention_mask = attention_mask.unsqueeze(1)
-            elif attention_mask.dim() == 2:
-                attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-            if attention_mask.size(0) != bsz or attention_mask.size(-1) != kv_seq_len:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                 raise ValueError(
-                    f"Attention mask should be of size (
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                 )
-
+
             attn_weights = attn_weights + attention_mask
 
         # upcast attention to fp32
@@ -693,21 +749,11 @@ class QuietSdpaAttention(QuietAttention):
         value_states = repeat_kv(value_states, self.num_key_value_groups)
 
         if attention_mask is not None:
-            if attention_mask.dim() == 3:
-                attention_mask = attention_mask.unsqueeze(1)
-            elif attention_mask.dim() == 2:
-                attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-            if attention_mask is not None:
-                if attention_mask.dim() == 3:
-                    attention_mask = attention_mask.unsqueeze(1)
-                elif attention_mask.dim() == 2:
-                    attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-            if attention_mask.size(0) != bsz or attention_mask.size(-1) != kv_seq_len:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                 raise ValueError(
-                    f"Attention mask should be of size (
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                 )
+
         # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
         # Reference: https://github.com/pytorch/pytorch/issues/112577.
         if query_states.device.type == "cuda" and attention_mask is not None:
@@ -1281,27 +1327,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
         # Generate the continuation
         continuation_length = self.n_ahead - 2
         new_key_values = past_key_values
-
-        if self.n_ahead != 1 or self.n_ahead_talk != 1 or self.comparison_mode:
-            if attention_mask is None:
-                base_attention_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=0).to(input_ids.device)
-                base_attention_mask = base_attention_mask.view(1, 1, seq_len, seq_len)
-                base_attention_mask = base_attention_mask.repeat(input_ids.shape[0], 1, 1, 1)
-                attention_mask = base_attention_mask
-            elif attention_mask.dim() == 2:
-                if seq_len + past_key_values_length != attention_mask.shape[-1]:
-                    attention_mask = torch.cat(
-                        [torch.ones((attention_mask.shape[0], past_key_values_length), dtype=attention_mask.dtype, device=attention_mask.device), attention_mask],
-                        dim=-1
-                    )
-                attention_mask = _prepare_4d_causal_attention_mask(
-                    attention_mask,
-                    (batch_size, seq_len),
-                    inputs_embeds,
-                    past_key_values_length,
-                    sliding_window=self.config.sliding_window,
-                )
-
+
         start_time = time.time()
         for continuation_idx in range(continuation_length):
             outputs = self.model(