Crystalcareai committed on
Commit ef8c51a · verified · 1 Parent(s): 9e5308f

Upload 3 files

Files changed (3):
  1. config.json (+14 -4)
  2. configuration_quiet.py (+26 -5)
  3. modeling_quiet.py (+1050 -107)
config.json CHANGED
@@ -1,7 +1,9 @@
 {
+  "_name_or_path": "Crystalcareai/Quiet-Star-Custom",
   "architectures": [
     "QuietForCausalLM"
   ],
+  "attention_dropout": 0.0,
   "auto_map": {
     "AutoConfig": "configuration_quiet.QuietConfig",
     "AutoModel": "modeling_quiet.QuietModel",
@@ -14,9 +16,11 @@
   "initializer_range": 0.02,
   "intermediate_size": 14336,
   "max_position_embeddings": 32768,
+  "max_thoughts": 10,
+  "merged_lm_and_talk_heads": false,
+  "merged_lm_and_think_heads": true,
+  "merged_talk_heads": true,
   "model_type": "quiet",
-  "max_thoughts": 3,
-  "thought_length": 10,
   "num_attention_heads": 32,
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
@@ -25,7 +29,13 @@
   "sliding_window": 4096,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.34.0.dev0",
+  "transformers_version": "4.37.0.dev0",
   "use_cache": true,
-  "vocab_size": 32000
+  "use_complex_talk_head": true,
+  "use_complex_think_head": false,
+  "use_concat_talk_head": true,
+  "use_shallow_talk": false,
+  "use_shallow_think": true,
+  "use_weighted_talk_head": true,
+  "vocab_size": 32002
 }
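The new config swaps the old max_thoughts/thought_length pair for the Quiet-STaR head switches and grows the vocabulary from 32000 to 32002, presumably making room for the start/end-of-thought tokens used in modeling_quiet.py. A minimal loading sketch, assuming the `_name_or_path` repo above is accessible (`trust_remote_code` is needed because `QuietConfig` ships inside the repo, not in transformers):

    from transformers import AutoConfig

    # Hypothetical usage; the repo id is taken from "_name_or_path" above.
    config = AutoConfig.from_pretrained("Crystalcareai/Quiet-Star-Custom", trust_remote_code=True)
    assert config.max_thoughts == 10 and config.vocab_size == 32002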
configuration_quiet.py CHANGED
@@ -20,6 +20,11 @@ from transformers.utils import logging

 logger = logging.get_logger(__name__)

+QUIET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "quietai/Quiet-7B-v0.1": "https://huggingface.co/quietai/Quiet-7B-v0.1/resolve/main/config.json",
+    "quietai/Quiet-7B-Instruct-v0.1": "https://huggingface.co/quietai/Quiet-7B-Instruct-v0.1/resolve/main/config.json",
+}
+

 class QuietConfig(PretrainedConfig):
     r"""
@@ -111,13 +116,21 @@ class QuietConfig(PretrainedConfig):
         use_cache=True,
         pad_token_id=None,
         bos_token_id=1,
-        max_thoughts: int = 3,
-        thought_length: int = 10,
         eos_token_id=2,
         tie_word_embeddings=False,
         rope_theta=10000.0,
         sliding_window=4096,
         attention_dropout=0.0,
+        max_thoughts=16,
+        merged_talk_heads=True,
+        merged_lm_and_talk_heads=False,
+        merged_lm_and_think_heads=True,
+        use_concat_talk_head=True,
+        use_shallow_think=True,
+        use_shallow_talk=False,
+        use_complex_think_head=False,
+        use_complex_talk_head=True,
+        use_weighted_talk_head=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -137,10 +150,18 @@ class QuietConfig(PretrainedConfig):
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
-        self.max_thoughts = max_thoughts
-        self.thought_length = thought_length
         self.rope_theta = rope_theta
         self.attention_dropout = attention_dropout
+        self.max_thoughts = max_thoughts
+        self.merged_talk_heads = merged_talk_heads
+        self.merged_lm_and_talk_heads = merged_lm_and_talk_heads
+        self.merged_lm_and_think_heads = merged_lm_and_think_heads
+        self.use_concat_talk_head = use_concat_talk_head
+        self.use_shallow_think = use_shallow_think
+        self.use_shallow_talk = use_shallow_talk
+        self.use_complex_think_head = use_complex_think_head
+        self.use_complex_talk_head = use_complex_talk_head
+        self.use_weighted_talk_head = use_weighted_talk_head

         super().__init__(
             pad_token_id=pad_token_id,
@@ -148,4 +169,4 @@ class QuietConfig(PretrainedConfig):
             eos_token_id=eos_token_id,
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
-        )
+        )
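Note that the constructor default (`max_thoughts=16`) differs from the value shipped in config.json (`max_thoughts=10`); on load, the values stored in config.json win over the defaults. A quick construction sketch, assuming `configuration_quiet.py` is importable from the working directory:

    from configuration_quiet import QuietConfig

    config = QuietConfig(max_thoughts=10)   # override the constructor default of 16
    print(config.use_concat_talk_head)      # True
    print(config.use_weighted_talk_head)    # True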
modeling_quiet.py CHANGED
@@ -20,7 +20,20 @@
 """ PyTorch Quiet model."""
 import inspect
 import math
+import copy
+import os
+import time
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+import wandb
+from termcolor import colored
+from tqdm import tqdm
+import random
+import numpy as np
+from matplotlib.colors import LinearSegmentedColormap, LogNorm
 import warnings
+from collections import defaultdict
 from typing import List, Optional, Tuple, Union

 import torch
@@ -56,13 +69,79 @@ logger = logging.get_logger(__name__)

 _CONFIG_FOR_DOC = "QuietConfig"

+from reportlab.pdfgen import canvas
+from reportlab.lib.pagesizes import letter
+from reportlab.lib.colors import HexColor
+
+def save_tokens_with_rewards_to_pdf(input_ids, token_rewards, tokenizer, output_file="text.pdf", eps=0.2, eps2=0.5):
+    c = canvas.Canvas(output_file, pagesize=letter)
+    c.setFont("Courier", 8)
+    x, y = 50, 750
+    previous_text = ""
+    current_text = ""
+    for token_idx, reward in enumerate(token_rewards):
+        current_text = tokenizer.decode(input_ids[: token_idx + 1])
+        if current_text != previous_text:
+            diff_text = current_text[len(previous_text) :]
+            if "\n" in diff_text:
+                lines = diff_text.split("\n")
+                for line_idx, line in enumerate(lines):
+                    if line_idx > 0:
+                        x = 50
+                        y -= 12
+                    if abs(reward) < eps:
+                        opacity = 0
+                    elif abs(reward) > eps2:
+                        opacity = 0.8
+                    else:
+                        opacity = 0.8 * (abs(reward) - eps) / (eps2 - eps)
+                    text_width = c.stringWidth(line)
+                    if reward > 0:
+                        highlight_color = HexColor("#4CCD99")
+                    else:
+                        highlight_color = HexColor("#FFC700")
+                    highlight_color.alpha = opacity
+                    c.setFillColor(highlight_color)
+                    c.rect(x, y - 2, text_width, 10, fill=True, stroke=False)
+                    c.setFillColor(HexColor("#000000"))
+                    c.drawString(x, y, line)
+                    x += text_width
+            else:
+                if abs(reward) < eps:
+                    opacity = 0
+                elif abs(reward) > eps2:
+                    opacity = 0.8
+                else:
+                    opacity = 0.8 * (abs(reward) - eps) / (eps2 - eps)
+                text_width = c.stringWidth(diff_text)
+                if reward > 0:
+                    highlight_color = HexColor("#4CCD99")
+                else:
+                    highlight_color = HexColor("#FFC700")
+                highlight_color.alpha = opacity
+                c.setFillColor(highlight_color)
+                c.rect(x, y - 2, text_width, 10, fill=True, stroke=False)
+                c.setFillColor(HexColor("#000000"))
+                c.drawString(x, y, diff_text)
+                x += text_width
+            if x > 550:
+                x = 50
+                y -= 12
+            if y < 50:
+                c.showPage()
+                y = 750
+                x = 50
+        previous_text = current_text
+    c.showPage()
+    c.save()
+

 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 def _get_unpad_data(attention_mask):
     seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
     indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
     max_seqlen_in_batch = seqlens_in_batch.max().item()
-    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
     return (
         indices,
         cu_seqlens,
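save_tokens_with_rewards_to_pdf renders each decoded token onto a PDF, highlighting positive rewards in green and negative ones in yellow, with opacity ramping between `eps` and `eps2`. A hedged usage sketch; the tokenizer choice and the per-token rewards below are fabricated for illustration:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")  # assumption: any HF tokenizer works
    ids = tokenizer("Thinking before talking helps.")["input_ids"]
    rewards = [0.0, 0.6, -0.3, 0.9, 0.1, 0.4, 0.0][: len(ids)]  # made-up per-token rewards
    save_tokens_with_rewards_to_pdf(ids, rewards, tokenizer, output_file="rewards.pdf")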
@@ -85,11 +164,10 @@ class QuietRMSNorm(nn.Module):
         hidden_states = hidden_states.to(torch.float32)
         variance = hidden_states.pow(2).mean(-1, keepdim=True)
         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-        return self.weight * hidden_states.to(input_dtype)
+        return hidden_states.to(input_dtype) * self.weight.to(hidden_states.device)


-# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Quiet
-# TODO @Arthur no longer copied from LLama after static cache
+# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Quiet
 class QuietRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
@@ -97,7 +175,7 @@ class QuietRotaryEmbedding(nn.Module):
         self.dim = dim
         self.max_position_embeddings = max_position_embeddings
         self.base = base
-        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)

         # Build here to make `torch.jit.trace` work.
@@ -107,7 +185,7 @@ class QuietRotaryEmbedding(nn.Module):

     def _set_cos_sin_cache(self, seq_len, device, dtype):
         self.max_seq_len_cached = seq_len
-        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

         freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
@@ -134,8 +212,7 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)


-# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
-# TODO @Arthur no longer copied from LLama after static cache
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.

@@ -204,8 +281,8 @@ class QuietAttention(nn.Module):
         self.layer_idx = layer_idx
         if layer_idx is None:
             logger.warning_once(
-                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
-                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
+                "lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                 "when creating this class."
             )

@@ -496,7 +573,7 @@ class QuietFlashAttention2(QuietAttention):
             attention_mask (`torch.Tensor`):
                 The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                 position of padding tokens and 1 for the position of non-padding tokens.
-            dropout (`float`):
+            dropout (`float`, *optional*):
                 Attention dropout
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
@@ -614,8 +691,7 @@ class QuietFlashAttention2(QuietAttention):
     )


-# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Quiet
-# TODO @Arthur no longer copied from LLama after static cache
+# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Quiet
 class QuietSdpaAttention(QuietAttention):
     """
     Quiet attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
@@ -689,14 +765,14 @@ class QuietSdpaAttention(QuietAttention):
             query_states,
             key_states,
             value_states,
-            attn_mask=attention_mask,
+            attn_mask=attention_mask.to(query_states.device) if attention_mask is not None else None,
             dropout_p=self.attention_dropout if self.training else 0.0,
             # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
             is_causal=self.is_causal and attention_mask is None and q_len > 1,
         )

         attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

         attn_output = self.o_proj(attn_output)

@@ -762,7 +838,7 @@ class QuietDecoderLayer(nn.Module):
             output_attentions=output_attentions,
             use_cache=use_cache,
         )
-        hidden_states = residual + hidden_states
+        hidden_states = residual.to(hidden_states.device) + hidden_states

         # Fully Connected
         residual = hidden_states
@@ -928,35 +1004,6 @@ class QuietModel(QuietPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value

-    def _generate_thoughts(self, hidden_states, max_length):
-        thought_ids = []
-        thought_embeddings = []
-
-        for _ in range(self.config.max_thoughts):
-            thought_id = torch.LongTensor([[self.config.start_token_id]]).to(hidden_states.device)
-            thought_embedding = self.embed_tokens(thought_id)
-
-            for _ in range(max_length):
-                outputs = self.forward(
-                    inputs_embeds=thought_embedding,
-                    attention_mask=None,
-                    use_cache=True,
-                )
-                logits = outputs.logits[:, -1, :]
-                next_token_id = torch.argmax(logits, dim=-1)
-
-                if next_token_id == self.config.end_token_id:
-                    break
-
-                thought_id = torch.cat([thought_id, next_token_id.unsqueeze(0)], dim=-1)
-                thought_embedding = torch.cat([thought_embedding, self.embed_tokens(next_token_id.unsqueeze(0))], dim=1)
-
-            thought_ids.append(thought_id.squeeze(0))
-            thought_embeddings.append(thought_embedding.squeeze(0))
-
-        return thought_ids, thought_embeddings
-
-
     @add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
     def forward(
         self,
@@ -1027,7 +1074,7 @@ class QuietModel(QuietPreTrainedModel):
         if self._attn_implementation == "flash_attention_2":
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
-        elif self._attn_implementation == "sdpa" and not output_attentions:
+        elif self._attn_implementation == "sdpa" and not output_attentions and attention_mask.dim() == 2 and False:
             # output_attentions=True can not be supported when using SDPA, and we fall back on
             # the manual implementation that requires a 4D causal mask in all cases.
             attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
@@ -1036,7 +1083,7 @@
             inputs_embeds,
             past_key_values_length,
         )
-        else:
+        elif attention_mask is None or attention_mask.dim() == 2:
             # 4d mask is passed through the layers
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask,
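The appended `and False` makes the SDPA branch unreachable, so a 2D mask now always flows into `_prepare_4d_causal_attention_mask` (flash-attention keeps its 2D passthrough). A reduced sketch of the resulting branch logic, not the model's own code:

    def pick_mask_path(attn_impl, output_attentions, mask_dim):
        if attn_impl == "flash_attention_2":
            return "2d-passthrough"
        elif attn_impl == "sdpa" and not output_attentions and mask_dim == 2 and False:
            return "sdpa-4d"  # dead branch: `and False` can never hold
        elif mask_dim is None or mask_dim == 2:
            return "manual-4d"

    assert pick_mask_path("sdpa", False, 2) == "manual-4d"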
@@ -1104,37 +1151,132 @@ class QuietModel(QuietPreTrainedModel):
         attentions=all_self_attns,
     )

+def nonzero_mean(x, axis=None):
+    if axis is not None:
+        return x.sum(axis) / (x != 0).sum(axis)
+    return x.sum() / (x != 0).sum()
+
+def loss_mean(x):
+    return x.sum() / (x != 0).sum()

 class QuietForCausalLM(QuietPreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+
     def __init__(self, config):
         super().__init__(config)
         self.model = QuietModel(config)
+        self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        self.mixing_head = nn.Sequential(
-            nn.Linear(config.hidden_size * 2, config.hidden_size),
-            nn.ReLU(),
-            nn.Linear(config.hidden_size, 1),
-        )
-
         self.max_thoughts = config.max_thoughts
-        self.thought_length = config.thought_length
+        self.merged_lm_and_talk_heads = config.merged_lm_and_talk_heads
+        self.use_concat_talk_head = config.use_concat_talk_head
+        self.use_shallow_talk = config.use_shallow_talk
+        self.use_complex_talk_head = config.use_complex_talk_head
+        self.use_weighted_talk_head = config.use_weighted_talk_head
+        # the weighted head will output a single value, so it can't be passed to the lm head
+        assert not (self.use_weighted_talk_head and self.use_shallow_talk)
+
+        self.n_ahead = 1
+        self.n_ahead_talk = 1
+        self.n_passes = 1
+        self.n_tokens_print = 1
+        self.gradient_accumulation_steps = 1
+        self.training_steps = 0
+        self.tokenizer = None
+        self.start_token_id = None
+        self.end_token_id = None
+        self.rm_initialized = False
+        self.residual_talk_head = True
+        self.thought_init_std_scale = 1e-2
+
+        self.final_only_mode = False
+        self.first_and_last_mode = True
+        self.first_only = False
+        self.original_loss_weight = 0.5
+
+        self.cumulative_residual = False
+        self.clever_residual = False
+        self.skip_residual = False
+        self.no_residual = True
+
+        self.optimize_lm_head_only_at_start = False
+        self.optimize_model_only_at_start = False
+
+        if self.optimize_model_only_at_start:
+            raise NotImplementedError
+        self.train_only_thinking_embedding = False
+        self.weighted_embeddings = False
+        self.use_start_thought_token = True
+        self.use_end_thought_token = True
+        self.initialize_thought_embedding_to_normal = False
+        self.initial_start_token = "---"
+        self.initial_end_token = "---"
+        self.output_logits_at_the_end = True
+
+        self.wandb_enabled = False
+        self.gumbel_temperature = 0.001
+
         self.use_policy_loss = True
+        self.include_policy_loss = True
+        self.trice_mode = True
         self.remove_negative_rewards = True
+        self.use_policy_loss_for_end_thought = True

-        self.post_init()
+        self.base_original_mode = False
+        self.original_mode = False
+
+        self.thought_prefix = "(Let's think step by step"
+        self.tokenized_thought_prefix = None
+        self.log_dict = defaultdict(int)
+        self.eval_log_dict = defaultdict(int)
+        self.print_final_only = True
+        self.loss_mean = loss_mean
+        self.all_rewards = []
+        self.all_unreduced_losses = []
+        self.kill_after = 100
+
+        self.start_embedding = nn.Parameter(torch.zeros(2, self.model.config.hidden_size))
+        self.end_embedding = nn.Parameter(torch.zeros(2, self.model.config.hidden_size))
+
+        self.policy_loss_beta = 1e6
+        self.embedding_scale = 1e2
+        self.reinforce_temperature = 3
+        self.base_loss_beta = 1
+
+        # Not used in the paper:
+        self.use_thought_prefix = False
+        self.use_reparam_for_thought_embeddings = False
+        self.use_upper_triangular = False
+        self.subtract_mean_reward = False
+        self.comparison_mode = False
+        self.gumbel_detach = True
+
+        # For visualization
+        self.eval_mode = False

-    def calculate_policy_loss(self, thoughts, rewards):
-        thought_log_probs = []
-        for thought in thoughts:
-            thought_log_prob = self.lm_head(thought).log_softmax(dim=-1)
-            thought_log_probs.append(thought_log_prob)
-
-        thought_log_probs = torch.stack(thought_log_probs, dim=1)  # (batch_size, num_thoughts, seq_length, vocab_size)
-        thought_probs = torch.exp(thought_log_probs)
-
-        policy_loss = -torch.mean(thought_log_probs * rewards.unsqueeze(-1).unsqueeze(-1))
-
-        return policy_loss
+        num_talk = 1
+        talk_input_dim = config.hidden_size if not self.use_concat_talk_head else config.hidden_size * 2
+        if self.use_weighted_talk_head:
+            talk_output_dim = 1
+        else:
+            talk_output_dim = config.hidden_size if self.use_shallow_talk else config.vocab_size
+
+        if not self.merged_lm_and_talk_heads:
+            if self.use_complex_talk_head:
+                self.talk_head = nn.ModuleList([nn.Sequential(
+                    nn.Linear(talk_input_dim, config.hidden_size),
+                    nn.ReLU(),
+                    nn.Linear(config.hidden_size, config.hidden_size),
+                    nn.ReLU(),
+                    nn.Linear(config.hidden_size, talk_output_dim, bias=False)
+                )])
+            else:
+                self.talk_head = nn.ModuleList([nn.Sequential(
+                    nn.Linear(talk_input_dim, talk_output_dim, bias=False)
+                )])

+        # Initialize weights and apply final processing
+        self.post_init()

     def get_input_embeddings(self):
         return self.model.embed_tokens
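With the defaults set above (`use_concat_talk_head=True`, `use_complex_talk_head=True`, `use_weighted_talk_head=True`), the talk head takes the base and post-thought hidden states concatenated and emits one mixing weight per position. A standalone shape sketch, with `hidden_size` taken from config.json and the batch/sequence sizes fabricated:

    import torch
    import torch.nn as nn

    hidden_size, batch, seq = 4096, 2, 8
    talk_head = nn.Sequential(                   # mirrors the use_complex_talk_head branch
        nn.Linear(hidden_size * 2, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, 1, bias=False),   # talk_output_dim == 1 for the weighted head
    )
    base_hidden = torch.randn(batch, seq, hidden_size)
    talk_hidden = torch.randn(batch, seq, hidden_size)
    w = talk_head(torch.cat([base_hidden, talk_hidden], dim=-1))  # (batch, seq, 1)
    mixed = base_hidden * (1 - w) + talk_hidden * w               # interpolation used by the weighted head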
@@ -1154,6 +1296,125 @@ class QuietForCausalLM(QuietPreTrainedModel):
     def get_decoder(self):
         return self.model

+    @torch.no_grad()
+    def infer(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        batch_size, seq_len = input_ids.shape
+
+        # Save the original input_ids and attention_mask for later use
+        original_input_ids = input_ids.clone()
+        original_attention_mask = attention_mask.clone() if attention_mask is not None else None
+
+        # Append the start thought token to the input sequence
+        start_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|startthought|>")
+        input_ids = torch.cat([input_ids, torch.tensor([[start_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
+        seq_len += 1
+
+        # Update the attention mask
+        if attention_mask is not None:
+            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+
+        # Generate the continuation
+        continuation_length = self.n_ahead - 2
+        new_key_values = past_key_values
+        generated_tokens = []
+
+        for continuation_idx in range(continuation_length):
+            outputs = self.model(
+                input_ids=input_ids if continuation_idx == 0 else next_token_id.unsqueeze(-1).to(input_ids.device),
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=new_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=True,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+            new_key_values = outputs.past_key_values
+            hidden_states = outputs[0]
+            logits = self.lm_head(hidden_states)
+            logits = logits[:, -1, :]  # Only consider the last token
+
+            # Apply Gumbel-Softmax to the logits
+            next_token_logits = F.gumbel_softmax(logits, tau=self.gumbel_temperature, hard=True, dim=-1)
+            next_token_id = torch.argmax(next_token_logits, dim=-1)
+
+            # Append the generated token to the input sequence
+            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
+            generated_tokens.append(next_token_id)
+            seq_len += 1
+
+            # Update the attention mask
+            if attention_mask is not None:
+                attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+
+            # Update the position ids
+            if position_ids is not None:
+                position_ids = torch.cat([position_ids, (position_ids[:, -1] + 1).unsqueeze(-1)], dim=-1)
+
+        # Append the end thought token to the input sequence
+        end_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
+        input_ids = torch.cat([input_ids, torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
+        seq_len += 1
+
+        # Update the attention mask
+        if attention_mask is not None:
+            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+
+        # Get the hidden states before and after the thought
+        outputs_before = self.model(
+            input_ids=original_input_ids,
+            attention_mask=original_attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states_before = outputs_before[0][:, -1:, :]
+
+        # two new tokens: last continuation token and end thought token
+        outputs_after = self.model(
+            input_ids=torch.cat([next_token_id.unsqueeze(-1).to(input_ids.device), torch.tensor(end_thought_token_id).unsqueeze(-1).unsqueeze(-1).to(input_ids.device)], dim=-1),
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=new_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states_after = outputs_after[0][:, -1:, :]
+
+        # Apply the talk head to get the mixing weight
+        mixing_weight = self.talk_head[0](torch.cat([hidden_states_before, hidden_states_after], dim=-1))
+
+        # Apply the mixing weight to the hidden states
+        mixed_hidden_states = (1 - mixing_weight) * hidden_states_before + mixing_weight * hidden_states_after
+
+        # Apply the language model head to get the final logits
+        logits = self.lm_head(mixed_hidden_states)
+
+        # Decode the logits to get the generated text
+        generated_tokens = torch.cat(generated_tokens, dim=-1)
+        generated_text = self.tokenizer.decode(generated_tokens.squeeze(), skip_special_tokens=True)
+
+        return generated_text
+
     @add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
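infer wraps the prompt in <|startthought|> ... <|endthought|>, rolls the model forward `n_ahead - 2` steps, and samples each thought token with a hard Gumbel-softmax at `gumbel_temperature`. The sampling step in isolation (logits fabricated; 32002 matches the new `vocab_size`):

    import torch
    import torch.nn.functional as F

    logits = torch.randn(1, 32002)  # stand-in next-token logits
    # hard=True yields a one-hot sample while keeping a straight-through gradient path
    one_hot = F.gumbel_softmax(logits, tau=0.001, hard=True, dim=-1)
    next_token_id = one_hot.argmax(dim=-1)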
 
1455
  >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1456
  "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1457
  ```"""
1458
+ log_dict = self.log_dict if self.training else self.eval_log_dict
1459
+
1460
+ if self.training and self.kill_after is not None and self.training_steps // self.gradient_accumulation_steps > self.kill_after:
1461
+ raise ValueError("Killed after")
1462
+
1463
+ if not self.training:
1464
+ n_ahead_talk_to_restore = self.n_ahead_talk
1465
+ n_passes_to_restore = self.n_passes
1466
+ self.n_ahead_talk = 1
1467
+ self.n_passes = 1
1468
 
1469
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1470
  output_hidden_states = (
 
1472
  )
1473
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1474
 
1475
+ assert self.cumulative_residual or self.clever_residual or self.skip_residual or self.no_residual
1476
+ assert not (self.skip_residual and self.use_policy_loss)
 
 
 
 
 
 
 
 
 
 
 
 
 
1477
 
1478
+ if self.tokenized_thought_prefix is None and self.use_thought_prefix:
1479
+ self.tokenized_thought_prefix = self.tokenizer(self.thought_prefix, return_tensors="pt", add_special_tokens=False)["input_ids"]
 
1480
 
1481
+ def apply_head(head, states, detach=False):
1482
+ if detach:
1483
+ head_weight = head.weight.detach()
1484
+ else:
1485
+ head_weight = head.weight
1486
+ head_weight = head_weight.to(states.device)
1487
+ return (head_weight @ states.transpose(-1, -2)).transpose(-1, -2).contiguous()
1488
+
1489
+ def idx_if_sequential(head, idx=0):
1490
+ if isinstance(head, nn.Sequential) or isinstance(head, nn.ModuleList):
1491
+ return idx_if_sequential(head[idx], idx=idx)
1492
+ return head
1493
+
1494
+ def none_repeat_interleave(x, n):
1495
+ if x is None:
1496
+ return x
1497
+ return x.repeat_interleave(n, dim=0)
1498
+
1499
+ if self.n_passes > 1:
1500
+ input_ids = none_repeat_interleave(input_ids, self.n_passes)
1501
+ attention_mask = none_repeat_interleave(attention_mask, self.n_passes)
1502
+ position_ids = none_repeat_interleave(position_ids, self.n_passes)
1503
+ inputs_embeds = none_repeat_interleave(inputs_embeds, self.n_passes)
1504
+ labels = none_repeat_interleave(labels, self.n_passes)
1505
+ if past_key_values is not None:
1506
+ past_key_values = [none_repeat_interleave(p, self.n_passes) for p in past_key_values]
1507
+ cur_token_indices = torch.arange(input_ids.shape[1], device=input_ids.device)
1508
+
1509
+ self.tokenizer_has_start_thought_token = True
1510
+ self.tokenizer_has_end_thought_token = True
1511
+ if self.start_token_id is None:
1512
+ self.start_token_id = self.tokenizer.convert_tokens_to_ids("<|startthought|>")
1513
+ if self.start_token_id == 0:
1514
+ self.start_token_id = self.tokenizer.bos_token_id
1515
+ self.tokenizer_has_start_thought_token = False
1516
+ elif self.use_start_thought_token:
1517
+ # base_start_id = self.tokenizer.convert_tokens_to_ids(self.initial_start_token)
1518
+ base_start_id = self.tokenizer.encode(self.initial_start_token, add_special_tokens=False)[0]
1519
+ if self.initialize_thought_embedding_to_normal:
1520
+ self.start_embedding.data = torch.zeros_like(self.start_embedding.data)
1521
+ else:
1522
+ self.start_embedding.data[0] = self.model.embed_tokens.weight.data[base_start_id].clone().detach() / self.embedding_scale
1523
+ self.start_embedding.data[1] = torch.log(self.model.embed_tokens.weight.data.std(dim=0) * self.thought_init_std_scale / self.embedding_scale)
1524
+ if self.end_token_id is None:
1525
+ self.end_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
1526
+ if self.end_token_id == 0:
1527
+ self.end_token_id = self.tokenizer.eos_token_id
1528
+ self.tokenizer_has_end_thought_token = False
1529
+ elif self.use_end_thought_token:
1530
+ # base_end_id = self.tokenizer.convert_tokens_to_ids(self.initial_end_token)
1531
+ base_end_id = self.tokenizer.encode(self.initial_end_token, add_special_tokens=False)[0]
1532
+ if self.initialize_thought_embedding_to_normal:
1533
+ self.end_embedding.data = torch.zeros_like(self.end_embedding.data)
1534
+ else:
1535
+ self.end_embedding.data[0] = self.model.embed_tokens.weight.data[base_end_id].clone().detach() / self.embedding_scale
1536
+ self.end_embedding.data[1] = torch.log(self.model.embed_tokens.weight.data.std(dim=0) * self.thought_init_std_scale / self.embedding_scale)
1537
+
1538
+ if not self.rm_initialized and (self.n_ahead > 1 or not self.base_original_mode):
1539
+ self.rm_initialized = True
1540
+ if not self.use_shallow_talk:
1541
+ head = self.talk_head[0]
1542
+ cur_head = head[-1] if isinstance(head, nn.Sequential) else head
1543
+ talk_input_dim = cur_head.weight.data.shape[1]
1544
+ talk_output_dim = 1 if self.use_weighted_talk_head else self.lm_head.weight.data.shape[0]
1545
+ cur_head.weight.data = torch.zeros(talk_output_dim, talk_input_dim, device=cur_head.weight.device, dtype=cur_head.weight.dtype)
1546
+ else:
1547
+ # convert to identity transform
1548
+ def lambda_transform(cur_head):
1549
+ if cur_head.weight.data.shape[0] != cur_head.weight.data.shape[1]:
1550
+ return torch.cat([
1551
+ torch.eye(
1552
+ cur_head.weight.data.shape[0],
1553
+ device=cur_head.weight.device,
1554
+ dtype=cur_head.weight.dtype
1555
+ ),
1556
+ torch.zeros(
1557
+ cur_head.weight.data.shape[0],
1558
+ cur_head.weight.data.shape[1] - cur_head.weight.data.shape[0],
1559
+ device=cur_head.weight.device,
1560
+ dtype=cur_head.weight.dtype
1561
+ )], dim=1)
1562
+ return torch.eye(
1563
+ cur_head.weight.data.shape[0],
1564
+ device=cur_head.weight.device,
1565
+ dtype=cur_head.weight.dtype
1566
+ )
1567
+ if isinstance(self.talk_head[0], nn.Sequential):
1568
+ for cur_head in self.talk_head[0]:
1569
+ # if it has weights
1570
+ if hasattr(cur_head, "weight"):
1571
+ cur_head.weight.data = lambda_transform(cur_head)
1572
+ else:
1573
+ self.talk_head[-1].weight.data = lambda_transform(self.talk_head[0])
1574
 
1575
  loss = None
1576
+ prev_rm_tokens = None
1577
+ cur_rm_tokens = None
1578
+ prev_rm_logits = None
1579
+ prev_sample_probs = None
1580
+ did_skip_sampling = None
1581
+ skip_sampling = None
1582
+ sample_probs = None
1583
+ hidden_states = None
1584
+ logits = None
1585
+ talk_kl_penalty = None
1586
+ rm_logits = None
1587
+ residual_logits = None
1588
+ probabilities_2d = None
1589
+ prev_probabilities_2d = None
1590
+ policy_reward = None
1591
+ logits_to_output = None
1592
+ batch_size, seq_len = input_ids.shape
1593
+ base_input_ids = input_ids.clone()
1594
+ loss_list = []
1595
+ dqn_loss_list = []
1596
+ sampled_token_history = []
1597
+ sample_probs_history = []
1598
+ action_loglikelihoods_list = []
1599
+
1600
+ if self.use_end_thought_token or self.use_start_thought_token:
1601
+ if not self.use_reparam_for_thought_embeddings:
1602
+ start_embedding = self.start_embedding[0].unsqueeze(0) * self.embedding_scale
1603
+ end_embedding = self.end_embedding[0].unsqueeze(0) * self.embedding_scale
1604
+ else:
1605
+ start_embedding = self.start_embedding * self.embedding_scale
1606
+ end_embedding = self.end_embedding * self.embedding_scale
1607
+ base_embeddings = self.model.embed_tokens.weight
1608
+ if self.train_only_thinking_embedding:
1609
+ base_embeddings = base_embeddings.detach()
1610
+ # # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1611
+ fwd_iters = 1 if self.original_mode else self.n_ahead + self.n_ahead_talk - 1
1612
+ for ahead_idx in range(fwd_iters):
1613
+ past_key_values_length = 0
1614
+ if past_key_values is not None:
1615
+ use_legacy_cache = not isinstance(past_key_values, Cache)
1616
+ if use_legacy_cache:
1617
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
1618
+ past_key_values_length = past_key_values.get_usable_length(seq_len)
1619
+
1620
+ if position_ids is None:
1621
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
1622
+ position_ids = torch.arange(
1623
+ past_key_values_length, seq_len + past_key_values_length, dtype=torch.long, device=device
1624
+ )
1625
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_len)
1626
+ else:
1627
+ position_ids = position_ids.view(-1, seq_len).long()
1628
+
1629
+ if inputs_embeds is None:
1630
+ contains_start = self.use_start_thought_token and (input_ids == self.start_token_id).any()
1631
+ contains_end = self.use_end_thought_token and (input_ids == self.end_token_id).any()
1632
+ contains_thought = contains_start or contains_end
1633
+ if contains_thought:
1634
+ thought_id = self.start_token_id if contains_start else self.end_token_id
1635
+ cur_thought_embedding = start_embedding if contains_start else end_embedding
1636
+ if self.use_reparam_for_thought_embeddings:
1637
+ inputs_embeds = torch.randn(batch_size, seq_len, self.model.config.hidden_size, device=input_ids.device, dtype=cur_thought_embedding.dtype)
1638
+ inputs_embeds = inputs_embeds.detach() * torch.exp(cur_thought_embedding[1]) + cur_thought_embedding[0]
1639
+ if contains_start:
1640
+ sampled_start = inputs_embeds.clone().detach()
1641
+ if contains_end:
1642
+ sampled_end = inputs_embeds.clone().detach()
1643
+ else:
1644
+ inputs_embeds = cur_thought_embedding.unsqueeze(0).repeat(batch_size, seq_len, 1)
1645
+ else:
1646
+ with torch.set_grad_enabled(not self.train_only_thinking_embedding):
1647
+ inputs_embeds = self.model.embed_tokens(input_ids)
1648
+
1649
+ if self.n_ahead != 1 or self.n_ahead_talk != 1 or self.comparison_mode:
1650
+ if attention_mask is None:
1651
+ base_attention_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=0).to(input_ids.device)
1652
+ base_attention_mask = base_attention_mask.view(1, 1, seq_len, seq_len)
1653
+ base_attention_mask = base_attention_mask.repeat(input_ids.shape[0], 1, 1, 1)
1654
+ attention_mask = base_attention_mask
1655
+ breakpoint()
1656
+ elif attention_mask.dim() == 2:
1657
+ if seq_len + past_key_values_length != attention_mask.shape[-1]:
1658
+ breakpoint()
1659
+ attention_mask = torch.cat(
1660
+ [torch.ones((attention_mask.shape[0], past_key_values_length), dtype=attention_mask.dtype, device=attention_mask.device), attention_mask],
1661
+ dim=-1
1662
+ )
1663
+ # # if the attention mask
1664
+ attention_mask = _prepare_4d_causal_attention_mask(
1665
+ attention_mask,
1666
+ (batch_size, seq_len),
1667
+ inputs_embeds,
1668
+ past_key_values_length,
1669
+ sliding_window=self.config.sliding_window,
1670
+ )
1671
+
1672
+ outputs = self.model(
1673
+ # input_ids=input_ids,
1674
+ attention_mask=attention_mask,
1675
+ position_ids=position_ids,
1676
+ past_key_values=past_key_values,
1677
+ inputs_embeds=inputs_embeds,
1678
+ use_cache=use_cache,
1679
+ output_attentions=output_attentions,
1680
+ output_hidden_states=output_hidden_states,
1681
+ return_dict=return_dict,
1682
+ )
1683
+
1684
+ prev_hidden_states = hidden_states
1685
+ hidden_states = outputs[0]
1686
+ prev_rm_logits = rm_logits # for policy gradient
1687
+ prev_rm_tokens = cur_rm_tokens # for policy gradient
1688
+
1689
+ if ahead_idx == 0:
1690
+ hidden_states_lm = hidden_states
1691
+ logits = self.lm_head(hidden_states_lm)
1692
+ base_hidden_states = hidden_states.clone()
1693
+ initial_loss_logits = logits.clone()
1694
+ if self.optimize_lm_head_only_at_start or self.optimize_model_only_at_start:
1695
+ logits = logits.detach()
1696
+ base_hidden_states = base_hidden_states.detach()
1697
+ if self.optimize_model_only_at_start:
1698
+ hidden_states = hidden_states.detach()
1699
+ base_logits = logits.clone()
1700
+ else:
1701
+ talk_hidden_states = hidden_states
1702
+ if self.merged_lm_and_talk_heads:
1703
+ assert self.no_residual
1704
+ residual_logits = self.lm_head(hidden_states)
1705
+ talk_hidden_states = hidden_states
1706
+ else:
1707
+ if ahead_idx > self.n_ahead - 1:
1708
+ cur_base_hidden = torch.cat([
1709
+ base_hidden_states[..., ahead_idx - self.n_ahead + 1:, :],
1710
+ base_hidden_states[..., :ahead_idx - self.n_ahead + 1, :]
1711
+ ], dim=-2)
1712
+ else:
1713
+ cur_base_hidden = base_hidden_states
1714
+
1715
+ if self.use_concat_talk_head:
1716
+ # concatenate the hidden states with the original hidden states
1717
+ head_input_hidden_states = torch.cat([cur_base_hidden, talk_hidden_states], dim=-1)
1718
+ else:
1719
+ head_input_hidden_states = talk_hidden_states
1720
+
1721
+ residual_logits = self.talk_head[0](head_input_hidden_states)
1722
+ if self.use_shallow_talk:
1723
+ residual_logits = apply_head(self.lm_head, residual_logits, detach=self.optimize_lm_head_only_at_start)
1724
+ residual_logits = residual_logits.to(logits.device)
1725
+ if self.use_weighted_talk_head:
1726
+ # combine the cur_base_hidden with the talk_hidden_states according to the weighted head
1727
+ residual_logits = cur_base_hidden * (1 - residual_logits) + talk_hidden_states * residual_logits
1728
+ residual_logits = apply_head(self.lm_head, residual_logits, detach=self.optimize_lm_head_only_at_start)
1729
+
1730
+ assert sum([self.cumulative_residual, self.clever_residual, self.skip_residual, self.no_residual]) == 1
1731
+ if self.clever_residual:
1732
+ if ahead_idx >= self.n_ahead - 1:
1733
+ # get the logits shifted according to the current talk ahead
1734
+ cur_base_logits = torch.cat([
1735
+ base_logits[..., ahead_idx - self.n_ahead + 1:, :],
1736
+ base_logits[..., :ahead_idx - self.n_ahead + 1, :]
1737
+ ], dim=-2)
1738
+ if self.optimize_lm_head_only_at_start:
1739
+ cur_base_logits = cur_base_logits.detach()
1740
+ logits = cur_base_logits + residual_logits
1741
+ else:
1742
+ logits += residual_logits / self.n_ahead
1743
+ elif self.cumulative_residual:
1744
+ if self.residual_talk_head:
1745
+ if ahead_idx < self.n_ahead:
1746
+ logits += residual_logits
1747
+ else:
1748
+ # get the logits shifted according to the current talk ahead
1749
+ cur_base_logits = torch.cat([
1750
+ base_logits[..., ahead_idx - self.n_ahead + 1:, :],
1751
+ base_logits[..., :ahead_idx - self.n_ahead + 1, :]
1752
+ ], dim=-2)
1753
+ if self.optimize_lm_head_only_at_start:
1754
+ cur_base_logits = cur_base_logits.detach()
1755
+ logits = cur_base_logits + residual_logits
1756
+ else:
1757
+ if ahead_idx < self.n_ahead:
1758
+ logits += residual_logits
1759
+ else:
1760
+ logits = residual_logits
1761
+ elif self.skip_residual:
1762
+ if ahead_idx >= self.n_ahead:
1763
+ # get the logits shifted according to the current talk ahead
1764
+ cur_base_logits = torch.cat([
1765
+ base_logits[..., ahead_idx - self.n_ahead + 1:, :],
1766
+ base_logits[..., :ahead_idx - self.n_ahead + 1, :]
1767
+ ], dim=-2)
1768
+ if self.optimize_lm_head_only_at_start:
1769
+ cur_base_logits = cur_base_logits.detach()
1770
+ logits = cur_base_logits
1771
+ elif self.no_residual:
1772
+ logits = residual_logits
1773
+ else:
1774
+ logits = base_logits + residual_logits
1775
+
1776
+ attempted = False
1777
+ talk_loss_list = []
1778
+ if self.original_mode or (self.n_ahead == 1) or (self.comparison_mode and ahead_idx == 0):# or (self.optimize_lm_head_only_at_start and ahead_idx == 0):
1779
+ loss = None
1780
+ attempted = True
1781
+
1782
+ if labels is not None:
1783
+ for shift_amount in range(self.n_ahead_talk):
1784
+ # Shift so that tokens < n predict n
1785
+ # ab[cde]f
1786
+ # abc[def]
1787
+ if ahead_idx == 0 and self.optimize_lm_head_only_at_start:
1788
+ loss_logits = initial_loss_logits
1789
+ else:
1790
+ loss_logits = logits
1791
+ shift_logits = loss_logits[..., shift_amount:-1, :].contiguous()
1792
+ shift_labels = labels[..., 1 + shift_amount:].contiguous()
1793
+ # Flatten the tokens
1794
+ loss_fct = CrossEntropyLoss(reduction="none")
1795
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1796
+ shift_labels = shift_labels.view(-1).clone()
1797
+ # Enable model parallelism
1798
+ shift_labels[shift_labels == self.tokenizer.pad_token_id] = -100
1799
+ shift_labels = shift_labels.to(shift_logits.device)
1800
+ loss = loss_fct(shift_logits, shift_labels)
1801
+ if not self.comparison_mode and not (self.optimize_lm_head_only_at_start and (self.n_ahead + self.n_ahead_talk > 2)) or self.original_mode:
1802
+ loss_list.append(loss)
1803
+ talk_loss_list.append(nonzero_mean(loss).detach())
1804
+
1805
+ if not attempted or self.comparison_mode:
1806
+ rm_hidden_states = hidden_states
1807
+ # print("Magnitude of RM hidden states before RM head", rm_hidden_states.norm())
1808
+ rm_logits = apply_head(self.lm_head, rm_hidden_states, detach=self.optimize_lm_head_only_at_start)
1809
+
1810
+ # don't allow it to predict the thinking token
1811
+ if self.tokenizer_has_start_thought_token:
1812
+ rm_logits[..., self.start_token_id] = -1e10
1813
+ if self.tokenizer_has_end_thought_token:
1814
+ rm_logits[..., self.end_token_id] = -1e10
1815
+ probabilities = rm_logits
1816
+ if probabilities_2d is not None:
1817
+ prev_probabilities_2d = probabilities_2d.clone()
1818
+ probabilities_2d = probabilities.view(-1, probabilities.size(-1))
1819
+
1820
+ did_skip_sampling = skip_sampling
1821
+ skip_sampling = False
1822
+ if ahead_idx == 0 and self.use_start_thought_token:
1823
+ override_token = self.start_token_id
1824
+ elif self.use_thought_prefix and ahead_idx < self.tokenized_thought_prefix.shape[-1]:
1825
+ override_token = self.tokenized_thought_prefix[..., ahead_idx]
1826
+ elif ahead_idx == self.n_ahead - 2 and self.use_end_thought_token:
1827
+ override_token = self.end_token_id
1828
+ else:
1829
+ override_token = None
1830
+ if override_token is not None and self.n_ahead > 1:
1831
+ # always start with the start token
1832
+ probabilities_2d = torch.zeros_like(probabilities_2d)
1833
+ probabilities_2d[:, override_token] = 1.0
1834
+ skip_sampling = True
1835
+ elif ahead_idx >= self.n_ahead - 1:
1836
+ if labels is not None: # we're in the talk phase
1837
+ cur_talk_n = ahead_idx - (self.n_ahead - 1) + 1
1838
+ # print("Setting rm to labels", cur_talk_n, "during", ahead_idx)
1839
+ shift_labels = labels[..., cur_talk_n:].contiguous().to(probabilities_2d.device)
1840
+ padding = torch.full_like(
1841
+ labels[..., :cur_talk_n],
1842
+ self.tokenizer.pad_token_id,
1843
+ dtype=torch.long,
1844
+ device=shift_labels.device
1845
+ )
1846
+ new_rm_tokens = torch.cat(
1847
+ [shift_labels, padding],
1848
+ dim=-1
1849
+ )
1850
+ # convert rm tokens to one-hot
1851
+ probabilities_2d = F.one_hot(new_rm_tokens, num_classes=self.vocab_size).reshape(-1, self.vocab_size).to(probabilities_2d.dtype)
1852
+ skip_sampling = True
1853
+ else:
1854
+ continue
1855
+ temperature = self.gumbel_temperature if self.training else 0.001
1856
+ prev_sample_probs = sample_probs
1857
+ sample_probs = probabilities_2d
1858
+ if ahead_idx < self.n_ahead - 1 and not skip_sampling:
1859
+ probabilities_2d = F.gumbel_softmax(sample_probs, tau=temperature, hard=True, dim=-1)
1860
+ if self.gumbel_detach:
1861
+ probabilities_2d = probabilities_2d.detach()
1862
+ sampled_token_history.append(probabilities_2d.argmax(dim=-1).detach().cpu())
1863
+ # convert rm logits directly to embeddings
1864
+ contains_start = self.use_start_thought_token and (probabilities_2d[..., self.start_token_id].sum() > 0)
1865
+ contains_end = self.use_end_thought_token and (probabilities_2d[..., self.end_token_id].sum() > 0)
1866
+ contains_thought = contains_start or contains_end
1867
+
+                    if not contains_thought:
+                        with torch.set_grad_enabled(not self.train_only_thinking_embedding):
+                            inputs_embeds = probabilities_2d @ (self.model.embed_tokens.weight.to(probabilities.device).to(probabilities.dtype))
+                    else:
+                        thought_id = self.start_token_id if contains_start else self.end_token_id
+                        cur_thought_embedding = start_embedding if contains_start else end_embedding
+                        if self.use_reparam_for_thought_embeddings:
+                            inputs_embeds = torch.randn(batch_size, seq_len, self.model.config.hidden_size, device=input_ids.device, dtype=cur_thought_embedding.dtype)
+                            inputs_embeds = inputs_embeds * torch.exp(cur_thought_embedding[1]) + cur_thought_embedding[0]
+                            if contains_start:
+                                sampled_start = inputs_embeds.clone().detach()
+                            else:
+                                sampled_end = inputs_embeds.clone().detach()
+                        else:
+                            inputs_embeds = cur_thought_embedding.unsqueeze(0).repeat(batch_size, seq_len, 1)
+                    inputs_embeds = inputs_embeds.view(probabilities.size(0), probabilities.size(1), -1).to(self.model.embed_tokens.weight.dtype)
+
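+                # Grow the 4D attention mask by one block of seq_len columns per thought step.
+                # The appended block is a validity-masked identity, so position i attends only
+                # to the thought token that position i itself generated; parallel thoughts
+                # never leak between positions.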
+                if len(attention_mask.shape) == 2:
+                    # the mask should have been expanded to 4D (batch, 1, seq, seq) earlier in forward
+                    raise ValueError("Expected a 4D attention mask at this point in the forward pass.")
+                else:
+                    original_attention = attention_mask[..., :attention_mask.shape[-2]]
+                    if self.use_upper_triangular:
+                        new_attention = original_attention
+                    else:
+                        original_attention = original_attention == attention_mask.max()
+                        # torch.eye isn't implemented for bfloat16, so build it in float32 and cast
+                        if not attention_mask.dtype == torch.bfloat16:
+                            new_attention = torch.eye(
+                                seq_len, dtype=attention_mask.dtype, device=attention_mask.device
+                            )
+                        else:
+                            new_attention = torch.eye(
+                                seq_len, dtype=torch.float32, device=attention_mask.device
+                            ).to(attention_mask.dtype)
+
+                        new_attention = new_attention.view(1, 1, seq_len, seq_len).repeat(input_ids.shape[0], 1, 1, 1)
+                        new_attention = new_attention * original_attention
+                        new_attention[new_attention == 0] = attention_mask.min()
+                        new_attention[new_attention == 1] = attention_mask.max()
+                    attention_mask = torch.cat([attention_mask, new_attention], dim=-1)
+                past_key_values = outputs.past_key_values
+                position_ids = position_ids + 1  # each appended thought token advances every stream's position by one
+
+                if labels is not None and (self.n_ahead > 1 or not self.base_original_mode):
+                    # Shift so that tokens < n predict n
+                    # logits: abcdef -> bcdef? -> cdef??
+                    # labels: abcdef -> ?bcdef -> ??cdef
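+                    # e.g. with n_ahead = 4: thinking steps keep the usual one-token shift,
+                    # while talk step k (ahead_idx = n_ahead - 1 + k) predicts k + 1 tokens
+                    # ahead, so shift_idx below works out to 1 + k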
+                    if ahead_idx == 0 and self.optimize_lm_head_only_at_start:
+                        loss_logits = initial_loss_logits
+                    else:
+                        loss_logits = logits
+                    shift_idx = 1 + max(0, ahead_idx - (self.n_ahead - 1))
+                    shift_logits = loss_logits[..., :-shift_idx, :].contiguous()
+                    shift_labels = labels[..., shift_idx:].contiguous()
+                    # Flatten the tokens
+                    loss_fct = CrossEntropyLoss(reduction="none")
+                    shift_logits = shift_logits.view(-1, self.config.vocab_size)
+                    shift_labels = shift_labels.view(-1)
+                    # Enable model parallelism
+                    shift_labels = shift_labels.to(shift_logits.device)
+                    # ignore padding when computing the loss
+                    shift_labels = torch.where(shift_labels == self.tokenizer.pad_token_id, -100, shift_labels)
+                    unreduced_loss = loss_fct(shift_logits, shift_labels)
+                    if torch.any(unreduced_loss != unreduced_loss):  # NaN check: NaN != NaN
+                        raise ValueError("NaN loss")
+                    unreduced_loss = unreduced_loss.reshape(logits.shape[0], -1)
+                    loss_list.append(unreduced_loss)
+
+
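+                # REINFORCE reward: the per-token drop in LM loss attributable to the thought,
+                # e.g. a position whose loss falls from 2.1 to 1.8 nats earns reward +0.3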
+                if self.use_policy_loss and ahead_idx > 0 and (ahead_idx > 1 or not self.use_start_thought_token):
+                    # we treat the change in loss as the reward
+                    previous_loss = loss_list[-2]
+                    # for example, suppose n_ahead = 3 and n_ahead_talk = 2
+                    # note that we end at self.n_ahead + self.n_ahead_talk - 2
+                    # in this case, 5 - 2 = 3, so we end at ahead_idx = 3
+                    # we also predict the next token at ahead_idx = 2
+                    # when we get to ahead_idx = 2, we predict ahead
+                    # so we shift by 1
+                    # note that this is ahead_idx = n_ahead - 1
+                    # when we get to ahead_idx = 3, we predict ahead
+                    # so we shift by 2
+                    # note that this is ahead_idx = n_ahead
+                    if ahead_idx < self.n_ahead - 1:
+                        shift_amount = 0
+                        original_dqn_reward = (previous_loss - unreduced_loss).detach()
+                        if self.first_and_last_mode:
+                            original_dqn_reward = original_dqn_reward * 0.0
+                    else:
+                        # logits vs cur_policy_shift_logits
+                        # let's look at rm_logits and prev_rm_logits
+                        shift_amount = max(0, ahead_idx - (self.n_ahead - 1))
+                        # let's say shift_amount = 2
+                        # abcdefg -> bcdefg? -> cdefg??
+                        # logits = [a b]c d e f[g]
+                        # labels = [a b c]d e f g
+                        cur_policy_shift_logits = initial_loss_logits[..., shift_amount:-1, :].contiguous().detach()
+                        cur_policy_shift_labels = labels[..., 1 + shift_amount:].contiguous()
+                        # Flatten the tokens
+                        cur_policy_loss_fct = CrossEntropyLoss(reduction="none")
+                        cur_policy_shift_logits = cur_policy_shift_logits.view(-1, self.config.vocab_size)
+                        cur_policy_shift_labels = cur_policy_shift_labels.view(-1).clone()
+                        # Enable model parallelism
+                        cur_policy_shift_labels[cur_policy_shift_labels == self.tokenizer.pad_token_id] = -100
+                        cur_policy_reward_base_loss = cur_policy_loss_fct(
+                            cur_policy_shift_logits, cur_policy_shift_labels.to(cur_policy_shift_logits.device)
+                        ).reshape(logits.shape[0], -1)
+                        original_dqn_reward = cur_policy_reward_base_loss.detach() - unreduced_loss
+
+                    if not did_skip_sampling:
+                        nonzero_indices = prev_probabilities_2d.nonzero()
+                        action_loglikelihoods = F.log_softmax(prev_sample_probs / self.reinforce_temperature, dim=-1)[nonzero_indices[:, 0], nonzero_indices[:, 1]]
+                        action_loglikelihoods_2d = action_loglikelihoods.reshape(batch_size, -1)[:, :-1 - shift_amount]
+                        action_loglikelihoods_list.append(action_loglikelihoods_2d)
+                    if policy_reward is None:
+                        policy_reward = original_dqn_reward[:, :-(self.n_ahead_talk - shift_amount)]
+                    else:
+                        if self.n_ahead_talk > shift_amount:
+                            added_reward = original_dqn_reward[:, :-(self.n_ahead_talk - shift_amount)]
+                        else:
+                            added_reward = original_dqn_reward
+                        policy_reward += added_reward
+
+                if self.use_policy_loss and ahead_idx == self.n_ahead + self.n_ahead_talk - 2:
+                    # only compute during the thinking phase
+                    if self.use_reparam_for_thought_embeddings and (self.use_start_thought_token or self.use_end_thought_token):
+                        # sampled_start, sampled_end
+                        # calculate the log likelihood of the start and end embeddings sampled from a multivariate normal distribution
+                        # with mean start_embedding[0] and log standard deviation start_embedding[1]
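+                        # per-dimension: log N(x; mu, sigma) = -0.5 * ((x - mu) / sigma)^2 - log(sigma) - 0.5 * log(2 * pi)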
+                        if self.use_start_thought_token:
+                            exp_start_std = torch.exp(start_embedding[1])
+                            start_loglikelihood = -0.5 * (sampled_start.detach() - start_embedding[0]) ** 2 / exp_start_std ** 2 - start_embedding[1] - 0.5 * math.log(2 * math.pi)
+                            start_loglikelihood = start_loglikelihood.mean(dim=-1)
+                        if self.use_end_thought_token:
+                            exp_end_std = torch.exp(end_embedding[1])
+                            end_loglikelihood = -0.5 * (sampled_end.detach() - end_embedding[0]) ** 2 / exp_end_std ** 2 - end_embedding[1] - 0.5 * math.log(2 * math.pi)
+                            end_loglikelihood = end_loglikelihood.mean(dim=-1)
+                        # we use the mean instead of the sum to prevent dependence on the dimensionality of the embeddings
+                        if self.use_end_thought_token and self.use_policy_loss_for_end_thought:
+                            action_loglikelihoods_list.append(end_loglikelihood)
+                        if self.use_start_thought_token:
+                            action_loglikelihoods_list.append(start_loglikelihood)
+
+                if ahead_idx == self.n_ahead + self.n_ahead_talk - 2 and self.eval_mode:
+                    with torch.no_grad():
+                        # compute quantiles of the absolute rewards to calibrate the visualization thresholds
+                        filtered_tokens = input_ids[:, :policy_reward.shape[-1]].cpu().detach().numpy().flatten()
+                        filtered_tokens_mask = filtered_tokens != self.tokenizer.pad_token_id
+                        filtered_tokens = filtered_tokens[filtered_tokens_mask]
+                        filtered_rewards = policy_reward.float().cpu().detach().numpy()[:, :seq_len - self.n_ahead_talk].flatten()
+                        filtered_rewards = filtered_rewards[filtered_tokens_mask]
+
+                        abs_reward_list = np.abs(policy_reward.float().cpu().detach().numpy()[:, :seq_len - self.n_ahead_talk].flatten())
+                        abs_reward_list = abs_reward_list[filtered_tokens_mask]
+                        medium_quantile = np.quantile(abs_reward_list, 0.5)
+                        upper_quantile = np.quantile(abs_reward_list, 0.95)
+
+                        save_tokens_with_rewards_to_pdf(
+                            filtered_tokens,
+                            [0] + filtered_rewards.tolist(),
+                            self.tokenizer,
+                            output_file=f"texts/rewards_talk_{self.n_ahead_talk}_{self.training_steps}.pdf",
+                            eps=medium_quantile,
+                            eps2=upper_quantile,
+                        )
+
+                        def plot_kde(data, losses):
+                            sns.set(style="whitegrid")
+                            # Create the KDE plot
+                            sns.kdeplot(data, fill=True)
+                            # Set the plot title and labels
+                            plt.title("KDE Plot")
+                            plt.xlabel("Value")
+                            plt.ylabel("Density")
+                            # Save the plot
+                            plt.savefig(f"texts/kde_talk_{self.n_ahead_talk}_{self.training_steps}.pdf")
+                            # Close the plot
+                            plt.close()
+
+                            # Step 1: Create a base color palette
+                            base_colors = sns.color_palette("light:#5A9", n_colors=256)  # more colors for a smoother gradient
+                            base_cmap = LinearSegmentedColormap.from_list("log_light", base_colors)
+                            log_norm = LogNorm(vmin=1e-3, vmax=10)
+
+                            sns.kdeplot(x=data, y=losses, fill=True, levels=20, norm=log_norm, cut=0, linewidths=0)
+                            # limit y to 0 to 25 and x to -1 to 1
+                            plt.xlim(-1, 1)
+                            plt.ylim(0, 25)
+                            plt.savefig(f"texts/jointer_talk_{self.n_ahead_talk}_{self.training_steps}.pdf")
+                            plt.close()
+
+                        self.all_rewards.extend(filtered_rewards)
+                        self.all_unreduced_losses.extend(unreduced_loss[:, :-1].flatten()[filtered_tokens_mask].float().flatten().cpu().detach().numpy())
+                        plot_kde(self.all_rewards, self.all_unreduced_losses)
+
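+        # one REINFORCE term per recorded sampling step: weight the variance-reduced
+        # reward by the log-likelihood of the actions that earned it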
+        for action_loglikelihoods_2d in action_loglikelihoods_list:
+            train_policy_reward = policy_reward
+
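+            # TRICE-style baseline: with n_passes rollouts of the same example, subtracting
+            # the per-example mean reward across passes acts as a control variate and
+            # reduces the variance of the policy gradient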
+            if self.trice_mode and self.n_passes > 1:
+                batched_policy_reward = train_policy_reward.reshape(-1, self.n_passes, train_policy_reward.shape[-1])
+                # average over the passes
+                train_policy_reward = batched_policy_reward - batched_policy_reward.mean(dim=1, keepdim=True)
+                train_policy_reward = train_policy_reward.reshape(-1, train_policy_reward.shape[-1])
+
+            if self.subtract_mean_reward:
+                train_policy_reward = train_policy_reward - train_policy_reward.mean()
+            # discard rewards below the mean
+            if self.remove_negative_rewards:
+                fixed_policy_reward = train_policy_reward.detach().clamp(min=0)
+            else:
+                fixed_policy_reward = train_policy_reward.detach()
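+            # score-function estimator: minimizing -reward * log pi(action) increases the
+            # likelihood of thoughts that achieved positive reward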
+            actor_loss = -fixed_policy_reward * action_loglikelihoods_2d[:, :policy_reward.shape[-1]].to(policy_reward.device)
+            if action_loglikelihoods_2d.mean() < -1e4 and not self.use_policy_loss_just_for_thoughts:
+                # This will only happen when we force the next token to be the end of thought token
+                break
+            dqn_loss_list.append(actor_loss.mean())
+
+        if loss_list:
+            if self.first_and_last_mode:
+                loss = sum(
+                    self.loss_mean(loss_list[-(i + 1)]) for i in range(self.n_ahead_talk)
+                ) * (1 - self.original_loss_weight) / self.n_ahead_talk
+                loss = loss + self.loss_mean(loss_list[0]) * self.original_loss_weight
+                # Let's NaN out the others
+                # e.g. if n_ahead_talk = 2 and the list is 5 long, we want to NaN out 1, 2 but keep 0, 3, 4
+                for i in range(1, len(loss_list) - self.n_ahead_talk):
+                    loss_list[i] = loss_list[i] * math.nan
+            elif self.first_only:
+                loss = self.loss_mean(loss_list[0])
+            elif self.final_only_mode:
+                loss = sum(
+                    self.loss_mean(loss_list[-i]) for i in range(1, self.n_ahead_talk + 1)
+                ) / self.n_ahead_talk
+            else:
+                loss = None
+                for i in range(len(loss_list)):
+                    cur_loss = self.loss_mean(loss_list[i])
+                    if loss is not None:
+                        loss = loss + cur_loss.to(loss.device)
+                    else:
+                        loss = cur_loss
+                loss = loss / len(loss_list)
+
+            loss = loss * self.base_loss_beta
+
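+        # blend the REINFORCE term into the total loss, weighted by policy_loss_beta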
+        if dqn_loss_list:
+            dqn_loss = sum(dqn_loss_list) / len(dqn_loss_list)
+            if self.include_policy_loss:
+                if loss is not None:
+                    loss += dqn_loss * self.policy_loss_beta
+                else:
+                    loss = dqn_loss * self.policy_loss_beta
 
         if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        base_log_dict = {
+            f"loss_{i}": nonzero_mean(loss_list[i]) for i in range(len(loss_list))
+        }
+
+        if loss is not None:
+            base_log_dict["loss_train"] = loss.item()
+
+        for loss_key, loss_val in base_log_dict.items():
+            log_dict[loss_key] += loss_val / self.n_tokens_print
+
+        if self.use_policy_loss and policy_reward is not None:
+            log_dict["policy_loss"] += dqn_loss / self.n_tokens_print
+            log_dict["policy_reward"] += policy_reward.mean() / self.n_tokens_print
 
+        if not loss_list:
+            if loss is not None:
+                log_dict["loss_0"] += loss / self.n_tokens_print
+        else:
+            log_dict["loss_final"] += nonzero_mean(loss_list[-1]) / self.n_tokens_print
+            log_dict["loss_talk"] += sum(nonzero_mean(cur_loss_item) for cur_loss_item in loss_list[-self.n_ahead_talk:]) / self.n_ahead_talk / self.n_tokens_print
+
+        # also log relative losses to loss_0
+        if loss_list:
+            for i in range(len(loss_list)):
+                talk_idx = min(max(i - (self.n_ahead - 1), 0), len(talk_loss_list) - 1)
+                if not talk_loss_list:
+                    cur_talk_loss = nonzero_mean(loss_list[0])
+                else:
+                    cur_talk_loss = talk_loss_list[talk_idx]
+                log_dict[f"rel_loss_{i}"] += (nonzero_mean(loss_list[i]) - cur_talk_loss) / self.n_tokens_print
+        if self.training:
+            self.training_steps += 1
+        try:
+            if self.wandb_enabled:
+                if self.training_steps % self.n_tokens_print == 0 or not self.training:
+                    if not self.training:
+                        new_log_dict = {}
+                        for key in list(log_dict.keys()):
+                            new_log_dict["eval_" + key] = log_dict[key]
+                        log_dict = new_log_dict
+                    log_dict["training_steps"] = self.training_steps
+                    log_dict["batch_size"] = batch_size
+                    log_dict["example_steps"] = self.training_steps * batch_size * self.gradient_accumulation_steps
+                    if self.n_ahead > 1:
+                        log_dict["compute_steps"] = self.training_steps * batch_size * (self.n_ahead + self.n_ahead_talk - 1) * self.gradient_accumulation_steps
+                    else:  # there's no overhead for talk tokens if there's no thinking
+                        log_dict["compute_steps"] = self.training_steps * batch_size * self.gradient_accumulation_steps
+                    # remove all NaN entries so they don't pollute the charts (NaN != NaN)
+                    for key in list(log_dict.keys()):
+                        if log_dict[key] != log_dict[key]:
+                            del log_dict[key]
+                    if self.training:
+                        wandb.log(log_dict)
+                    if self.training:
+                        self.log_dict = defaultdict(int)
+                    else:
+                        self.eval_log_dict = defaultdict(int)
+        except Exception:
+            # logging must never interrupt the forward pass
+            pass
+
+        if not self.training:
+            self.n_ahead_talk = n_ahead_talk_to_restore
+            self.n_passes = n_passes_to_restore
         return CausalLMOutputWithPast(
+            loss=loss,
+            logits=(rm_logits if self.n_ahead > 1 else logits) if not self.output_logits_at_the_end else logits,
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
 
+
     def prepare_inputs_for_generation(
         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):

 
         # Keep only the unprocessed tokens:
         # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+        # some of the inputs are exclusively passed as part of the cache (e.g. when passing inputs_embeds as
         # input)
         if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
             input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]