Crystalcareai committed
Commit 8d2b1eb · verified · 1 Parent(s): c0c99ee

Upload 3 files

Files changed (3):
  1. config.json +4 -14
  2. configuration_quiet.py +26 -44
  3. modeling_quiet.py +114 -1115
config.json CHANGED
@@ -1,9 +1,7 @@
 {
-  "_name_or_path": "Crystalcareai/Quiet-Star-Custom",
   "architectures": [
     "QuietForCausalLM"
   ],
-  "attention_dropout": 0.0,
   "auto_map": {
     "AutoConfig": "configuration_quiet.QuietConfig",
     "AutoModel": "modeling_quiet.QuietModel",
@@ -16,11 +14,9 @@
   "initializer_range": 0.02,
   "intermediate_size": 14336,
   "max_position_embeddings": 32768,
-  "max_thoughts": 10,
-  "merged_lm_and_talk_heads": false,
-  "merged_lm_and_think_heads": true,
-  "merged_talk_heads": true,
   "model_type": "quiet",
+  "max_thoughts": 3,
+  "thought_length": 10,
   "num_attention_heads": 32,
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
@@ -29,13 +25,7 @@
   "sliding_window": 4096,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.37.0.dev0",
+  "transformers_version": "4.34.0.dev0",
   "use_cache": true,
-  "use_complex_talk_head": true,
-  "use_complex_think_head": false,
-  "use_concat_talk_head": true,
-  "use_shallow_talk": false,
-  "use_shallow_think": true,
-  "use_weighted_talk_head": true,
-  "vocab_size": 32002
+  "vocab_size": 32000
 }
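
Note (not part of the commit): a minimal sketch of how the updated config.json can be inspected, assuming the repo id Crystalcareai/Quiet-Star-Custom (taken from the removed `_name_or_path` field) and that the custom code referenced in `auto_map` loads via `trust_remote_code`:

```python
from transformers import AutoConfig

# Hypothetical repo id; adjust to wherever this config.json is hosted.
config = AutoConfig.from_pretrained(
    "Crystalcareai/Quiet-Star-Custom",
    trust_remote_code=True,  # auto_map points at configuration_quiet.QuietConfig
)

print(config.max_thoughts)    # 3 after this commit (was 10)
print(config.thought_length)  # 10, newly added by this commit
print(config.vocab_size)      # 32000 after this commit (was 32002)
```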
configuration_quiet.py CHANGED
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2023 Quiet AI and the HuggingFace Inc. team. All rights reserved.
+# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,28 +12,26 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Quiet model configuration"""
+""" Mistral model configuration"""

-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging


 logger = logging.get_logger(__name__)

-QUIET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "quietai/Quiet-7B-v0.1": "https://huggingface.co/quietai/Quiet-7B-v0.1/resolve/main/config.json",
-    "quietai/Quiet-7B-Instruct-v0.1": "https://huggingface.co/quietai/Quiet-7B-Instruct-v0.1/resolve/main/config.json",
-}

+from ..deprecated._archive_maps import MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP  # noqa: F401, E402

-class QuietConfig(PretrainedConfig):
+
+class MistralConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`QuietModel`]. It is used to instantiate an
-    Quiet model according to the specified arguments, defining the model architecture. Instantiating a configuration
-    with the defaults will yield a similar configuration to that of the Quiet-7B-v0.1 or Quiet-7B-Instruct-v0.1.
+    This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an
+    Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1.

-    [quietai/Quiet-7B-v0.1](https://huggingface.co/quietai/Quiet-7B-v0.1)
-    [quietai/Quiet-7B-Instruct-v0.1](https://huggingface.co/quietai/Quiet-7B-Instruct-v0.1)
+    [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
+    [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)

     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -41,8 +39,8 @@ class QuietConfig(PretrainedConfig):

     Args:
         vocab_size (`int`, *optional*, defaults to 32000):
-            Vocabulary size of the Quiet model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`QuietModel`]
+            Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`MistralModel`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 14336):
@@ -61,7 +59,7 @@ class QuietConfig(PretrainedConfig):
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
         max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
-            The maximum sequence length that this model might ever be used with. Quiet's sliding window attention
+            The maximum sequence length that this model might ever be used with. Mistral's sliding window attention
             allows sequence of up to 4096*32 tokens.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
@@ -86,19 +84,19 @@ class QuietConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.

     ```python
-    >>> from transformers import QuietModel, QuietConfig
+    >>> from transformers import MistralModel, MistralConfig

-    >>> # Initializing a Quiet 7B style configuration
-    >>> configuration = QuietConfig()
+    >>> # Initializing a Mistral 7B style configuration
+    >>> configuration = MistralConfig()

-    >>> # Initializing a model from the Quiet 7B style configuration
-    >>> model = QuietModel(configuration)
+    >>> # Initializing a model from the Mistral 7B style configuration
+    >>> model = MistralModel(configuration)

     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""

-    model_type = "quiet"
+    model_type = "mistral"
     keys_to_ignore_at_inference = ["past_key_values"]

     def __init__(
@@ -116,21 +114,13 @@ class QuietConfig(PretrainedConfig):
         use_cache=True,
         pad_token_id=None,
         bos_token_id=1,
+        max_thoughts: int = 3,
+        thought_length: int = 10,
         eos_token_id=2,
         tie_word_embeddings=False,
         rope_theta=10000.0,
         sliding_window=4096,
         attention_dropout=0.0,
-        max_thoughts=16,
-        merged_talk_heads=True,
-        merged_lm_and_talk_heads=False,
-        merged_lm_and_think_heads=True,
-        use_concat_talk_head=True,
-        use_shallow_think=True,
-        use_shallow_talk=False,
-        use_complex_think_head=False,
-        use_complex_talk_head=True,
-        use_weighted_talk_head=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -150,18 +140,10 @@ class QuietConfig(PretrainedConfig):
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
+        self.max_thoughts = max_thoughts
+        self.thought_length = thought_length
         self.rope_theta = rope_theta
         self.attention_dropout = attention_dropout
-        self.max_thoughts = max_thoughts
-        self.merged_talk_heads = merged_talk_heads
-        self.merged_lm_and_talk_heads = merged_lm_and_talk_heads
-        self.merged_lm_and_think_heads = merged_lm_and_think_heads
-        self.use_concat_talk_head = use_concat_talk_head
-        self.use_shallow_think = use_shallow_think
-        self.use_shallow_talk = use_shallow_talk
-        self.use_complex_think_head = use_complex_think_head
-        self.use_complex_talk_head = use_complex_talk_head
-        self.use_weighted_talk_head = use_weighted_talk_head

         super().__init__(
             pad_token_id=pad_token_id,
@@ -169,4 +151,4 @@ class QuietConfig(PretrainedConfig):
             eos_token_id=eos_token_id,
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
-        )
+        )
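
Note (illustration only, not part of the commit): after this change the configuration class stores the two new thought-related arguments as plain attributes. The sketch below assumes the file is importable on its own; as committed it uses transformers-internal relative imports (`from ...configuration_utils import PretrainedConfig`), which suggests it was copied from the transformers source tree and may need absolute `transformers.*` imports to load as standalone remote code. The import path below is hypothetical.

```python
# Hypothetical import; assumes the relative imports in configuration_quiet.py
# have been resolved so the module can be loaded directly.
from configuration_quiet import MistralConfig

cfg = MistralConfig(max_thoughts=3, thought_length=10)

# __init__ stores the new arguments verbatim alongside the usual Mistral fields.
assert cfg.max_thoughts == 3
assert cfg.thought_length == 10
assert cfg.vocab_size == 32000  # docstring default for vocab_size
```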
modeling_quiet.py CHANGED
@@ -1,5 +1,5 @@
1
  # coding=utf-8
2
- # Copyright 2023 Quiet AI and the HuggingFace Inc. team. All rights reserved.
3
  #
4
  # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
  # and OPT implementations in this library. It has been modified from its
@@ -17,23 +17,10 @@
17
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
  # See the License for the specific language governing permissions and
19
  # limitations under the License.
20
- """ PyTorch Quiet model."""
21
  import inspect
22
  import math
23
- import copy
24
- import os
25
- import time
26
- import pandas as pd
27
- import seaborn as sns
28
- import matplotlib.pyplot as plt
29
- import wandb
30
- from termcolor import colored
31
- from tqdm import tqdm
32
- import random
33
- import numpy as np
34
- from matplotlib.colors import LinearSegmentedColormap, LogNorm
35
  import warnings
36
- from collections import defaultdict
37
  from typing import List, Optional, Tuple, Union
38
 
39
  import torch
@@ -42,12 +29,12 @@ import torch.utils.checkpoint
42
  from torch import nn
43
  from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
44
 
45
- from transformers.activations import ACT2FN
46
- from transformers.cache_utils import Cache, DynamicCache
47
- from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
48
- from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
49
- from transformers.modeling_utils import PreTrainedModel
50
- from transformers.utils import (
51
  add_start_docstrings,
52
  add_start_docstrings_to_model_forward,
53
  is_flash_attn_2_available,
@@ -55,7 +42,7 @@ from transformers.utils import (
55
  logging,
56
  replace_return_docstrings,
57
  )
58
- from .configuration_quiet import QuietConfig
59
 
60
 
61
  if is_flash_attn_2_available():
@@ -67,73 +54,7 @@ if is_flash_attn_2_available():
67
 
68
  logger = logging.get_logger(__name__)
69
 
70
- _CONFIG_FOR_DOC = "QuietConfig"
71
-
72
- from reportlab.pdfgen import canvas
73
- from reportlab.lib.pagesizes import letter
74
- from reportlab.lib.colors import HexColor
75
-
76
- def save_tokens_with_rewards_to_pdf(input_ids, token_rewards, tokenizer, output_file="text.pdf", eps=0.2, eps2=0.5):
77
- c = canvas.Canvas(output_file, pagesize=letter)
78
- c.setFont("Courier", 8)
79
- x, y = 50, 750
80
- previous_text = ""
81
- current_text = ""
82
- for token_idx, reward in enumerate(token_rewards):
83
- current_text = tokenizer.decode(input_ids[: token_idx + 1])
84
- if current_text != previous_text:
85
- diff_text = current_text[len(previous_text) :]
86
- if "\n" in diff_text:
87
- lines = diff_text.split("\n")
88
- for line_idx, line in enumerate(lines):
89
- if line_idx > 0:
90
- x = 50
91
- y -= 12
92
- if abs(reward) < eps:
93
- opacity = 0
94
- elif abs(reward) > eps2:
95
- opacity = 0.8
96
- else:
97
- opacity = 0.8 * (abs(reward) - eps) / (eps2 - eps)
98
- text_width = c.stringWidth(line)
99
- if reward > 0:
100
- highlight_color = HexColor("#4CCD99")
101
- else:
102
- highlight_color = HexColor("#FFC700")
103
- highlight_color.alpha = opacity
104
- c.setFillColor(highlight_color)
105
- c.rect(x, y - 2, text_width, 10, fill=True, stroke=False)
106
- c.setFillColor(HexColor("#000000"))
107
- c.drawString(x, y, line)
108
- x += text_width
109
- else:
110
- if abs(reward) < eps:
111
- opacity = 0
112
- elif abs(reward) > eps2:
113
- opacity = 0.8
114
- else:
115
- opacity = 0.8 * (abs(reward) - eps) / (eps2 - eps)
116
- text_width = c.stringWidth(diff_text)
117
- if reward > 0:
118
- highlight_color = HexColor("#4CCD99")
119
- else:
120
- highlight_color = HexColor("#FFC700")
121
- highlight_color.alpha = opacity
122
- c.setFillColor(highlight_color)
123
- c.rect(x, y - 2, text_width, 10, fill=True, stroke=False)
124
- c.setFillColor(HexColor("#000000"))
125
- c.drawString(x, y, diff_text)
126
- x += text_width
127
- if x > 550:
128
- x = 50
129
- y -= 12
130
- if y < 50:
131
- c.showPage()
132
- y = 750
133
- x = 50
134
- previous_text = current_text
135
- c.showPage()
136
- c.save()
137
 
138
 
139
  # Copied from transformers.models.llama.modeling_llama._get_unpad_data
@@ -141,7 +62,7 @@ def _get_unpad_data(attention_mask):
141
  seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
142
  indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
143
  max_seqlen_in_batch = seqlens_in_batch.max().item()
144
- cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
145
  return (
146
  indices,
147
  cu_seqlens,
@@ -149,11 +70,11 @@ def _get_unpad_data(attention_mask):
149
  )
150
 
151
 
152
- # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Quiet
153
- class QuietRMSNorm(nn.Module):
154
  def __init__(self, hidden_size, eps=1e-6):
155
  """
156
- QuietRMSNorm is equivalent to T5LayerNorm
157
  """
158
  super().__init__()
159
  self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -164,18 +85,19 @@ class QuietRMSNorm(nn.Module):
164
  hidden_states = hidden_states.to(torch.float32)
165
  variance = hidden_states.pow(2).mean(-1, keepdim=True)
166
  hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
167
- return hidden_states.to(input_dtype) * self.weight.to(hidden_states.device)
168
 
169
 
170
- # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Quiet
171
- class QuietRotaryEmbedding(nn.Module):
 
172
  def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
173
  super().__init__()
174
 
175
  self.dim = dim
176
  self.max_position_embeddings = max_position_embeddings
177
  self.base = base
178
- inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
179
  self.register_buffer("inv_freq", inv_freq, persistent=False)
180
 
181
  # Build here to make `torch.jit.trace` work.
@@ -185,7 +107,7 @@ class QuietRotaryEmbedding(nn.Module):
185
 
186
  def _set_cos_sin_cache(self, seq_len, device, dtype):
187
  self.max_seq_len_cached = seq_len
188
- t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
189
 
190
  freqs = torch.outer(t, self.inv_freq)
191
  # Different from paper, but it uses a different permutation in order to obtain the same calculation
@@ -212,7 +134,8 @@ def rotate_half(x):
212
  return torch.cat((-x2, x1), dim=-1)
213
 
214
 
215
- # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
 
216
  def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
217
  """Applies Rotary Position Embedding to the query and key tensors.
218
 
@@ -241,7 +164,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
241
  return q_embed, k_embed
242
 
243
 
244
- class QuietMLP(nn.Module):
245
  def __init__(self, config):
246
  super().__init__()
247
  self.config = config
@@ -269,20 +192,20 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
269
  return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
270
 
271
 
272
- class QuietAttention(nn.Module):
273
  """
274
  Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
275
  and "Generating Long Sequences with Sparse Transformers".
276
  """
277
 
278
- def __init__(self, config: QuietConfig, layer_idx: Optional[int] = None):
279
  super().__init__()
280
  self.config = config
281
  self.layer_idx = layer_idx
282
  if layer_idx is None:
283
  logger.warning_once(
284
- f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
285
- "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
286
  "when creating this class."
287
  )
288
 
@@ -306,7 +229,7 @@ class QuietAttention(nn.Module):
306
  self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
307
  self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
308
 
309
- self.rotary_emb = QuietRotaryEmbedding(
310
  self.head_dim,
311
  max_position_embeddings=self.max_position_embeddings,
312
  base=self.rope_theta,
@@ -397,9 +320,9 @@ class QuietAttention(nn.Module):
397
  return attn_output, attn_weights, past_key_value
398
 
399
 
400
- class QuietFlashAttention2(QuietAttention):
401
  """
402
- Quiet flash attention module. This module inherits from `QuietAttention` as the weights of the module stays
403
  untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
404
  flash attention and deal with padding tokens in case the input contains any of them.
405
  """
@@ -573,7 +496,7 @@ class QuietFlashAttention2(QuietAttention):
573
  attention_mask (`torch.Tensor`):
574
  The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
575
  position of padding tokens and 1 for the position of non-padding tokens.
576
- dropout (`int`, *optional*):
577
  Attention dropout
578
  softmax_scale (`float`, *optional*):
579
  The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
@@ -691,15 +614,16 @@ class QuietFlashAttention2(QuietAttention):
691
  )
692
 
693
 
694
- # Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Quiet
695
- class QuietSdpaAttention(QuietAttention):
 
696
  """
697
- Quiet attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
698
- `QuietAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
699
  SDPA API.
700
  """
701
 
702
- # Adapted from QuietAttention.forward
703
  def forward(
704
  self,
705
  hidden_states: torch.Tensor,
@@ -712,7 +636,7 @@ class QuietSdpaAttention(QuietAttention):
712
  if output_attentions:
713
  # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
714
  logger.warning_once(
715
- "QuietModel is using QuietSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
716
  'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
717
  )
718
  return super().forward(
@@ -765,37 +689,37 @@ class QuietSdpaAttention(QuietAttention):
765
  query_states,
766
  key_states,
767
  value_states,
768
- attn_mask=attention_mask.to(query_states.device) if attention_mask is not None else None,
769
  dropout_p=self.attention_dropout if self.training else 0.0,
770
  # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
771
  is_causal=self.is_causal and attention_mask is None and q_len > 1,
772
  )
773
 
774
  attn_output = attn_output.transpose(1, 2).contiguous()
775
- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
776
 
777
  attn_output = self.o_proj(attn_output)
778
 
779
  return attn_output, None, past_key_value
780
 
781
 
782
- QUIET_ATTENTION_CLASSES = {
783
- "eager": QuietAttention,
784
- "flash_attention_2": QuietFlashAttention2,
785
- "sdpa": QuietSdpaAttention,
786
  }
787
 
788
 
789
- class QuietDecoderLayer(nn.Module):
790
- def __init__(self, config: QuietConfig, layer_idx: int):
791
  super().__init__()
792
  self.hidden_size = config.hidden_size
793
 
794
- self.self_attn = QUIET_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
795
 
796
- self.mlp = QuietMLP(config)
797
- self.input_layernorm = QuietRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
798
- self.post_attention_layernorm = QuietRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
799
 
800
  def forward(
801
  self,
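
Aside (not part of the diff): the `*_ATTENTION_CLASSES` dict above is what the decoder layer's `__init__` indexes with `config._attn_implementation`, so the attention backend is selected at load time. A hedged sketch, assuming the repo id below and that the custom model code loads via `trust_remote_code`:

```python
import torch
from transformers import AutoModelForCausalLM

# Hypothetical repo id; attn_implementation must be one of the keys in the
# attention-classes dict: "eager", "sdpa", or "flash_attention_2".
model = AutoModelForCausalLM.from_pretrained(
    "Crystalcareai/Quiet-Star-Custom",
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",
    trust_remote_code=True,
)
```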
@@ -838,7 +762,7 @@ class QuietDecoderLayer(nn.Module):
838
  output_attentions=output_attentions,
839
  use_cache=use_cache,
840
  )
841
- hidden_states = residual.to(hidden_states.device) + hidden_states
842
 
843
  # Fully Connected
844
  residual = hidden_states
@@ -857,7 +781,7 @@ class QuietDecoderLayer(nn.Module):
857
  return outputs
858
 
859
 
860
- QUIET_START_DOCSTRING = r"""
861
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
862
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
863
  etc.)
@@ -867,7 +791,7 @@ QUIET_START_DOCSTRING = r"""
867
  and behavior.
868
 
869
  Parameters:
870
- config ([`QuietConfig`]):
871
  Model configuration class with all the parameters of the model. Initializing with a config file does not
872
  load the weights associated with the model, only the configuration. Check out the
873
  [`~PreTrainedModel.from_pretrained`] method to load the model weights.
@@ -875,14 +799,14 @@ QUIET_START_DOCSTRING = r"""
875
 
876
 
877
  @add_start_docstrings(
878
- "The bare Quiet Model outputting raw hidden-states without any specific head on top.",
879
- QUIET_START_DOCSTRING,
880
  )
881
- class QuietPreTrainedModel(PreTrainedModel):
882
- config_class = QuietConfig
883
  base_model_prefix = "model"
884
  supports_gradient_checkpointing = True
885
- _no_split_modules = ["QuietDecoderLayer"]
886
  _skip_keys_device_placement = "past_key_values"
887
  _supports_flash_attn_2 = True
888
  _supports_sdpa = True
@@ -900,7 +824,7 @@ class QuietPreTrainedModel(PreTrainedModel):
900
  module.weight.data[module.padding_idx].zero_()
901
 
902
 
903
- QUIET_INPUTS_DOCSTRING = r"""
904
  Args:
905
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
906
  Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -971,28 +895,28 @@ QUIET_INPUTS_DOCSTRING = r"""
971
 
972
 
973
  @add_start_docstrings(
974
- "The bare Quiet Model outputting raw hidden-states without any specific head on top.",
975
- QUIET_START_DOCSTRING,
976
  )
977
- class QuietModel(QuietPreTrainedModel):
978
  """
979
- Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`QuietDecoderLayer`]
980
 
981
  Args:
982
- config: QuietConfig
983
  """
984
 
985
- def __init__(self, config: QuietConfig):
986
  super().__init__(config)
987
  self.padding_idx = config.pad_token_id
988
  self.vocab_size = config.vocab_size
989
 
990
  self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
991
  self.layers = nn.ModuleList(
992
- [QuietDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
993
  )
994
  self._attn_implementation = config._attn_implementation
995
- self.norm = QuietRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
996
 
997
  self.gradient_checkpointing = False
998
  # Initialize weights and apply final processing
@@ -1004,7 +928,7 @@ class QuietModel(QuietPreTrainedModel):
1004
  def set_input_embeddings(self, value):
1005
  self.embed_tokens = value
1006
 
1007
- @add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
1008
  def forward(
1009
  self,
1010
  input_ids: torch.LongTensor = None,
@@ -1067,14 +991,14 @@ class QuietModel(QuietPreTrainedModel):
1067
  if is_padding_right:
1068
  raise ValueError(
1069
  "You are attempting to perform batched generation with padding_side='right'"
1070
- " this may lead to unexpected behaviour for Flash Attention version of Quiet. Make sure to "
1071
  " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
1072
  )
1073
 
1074
  if self._attn_implementation == "flash_attention_2":
1075
  # 2d mask is passed through the layers
1076
  attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
1077
- elif self._attn_implementation == "sdpa" and not output_attentions and attention_mask.dim() == 2 and False:
1078
  # output_attentions=True can not be supported when using SDPA, and we fall back on
1079
  # the manual implementation that requires a 4D causal mask in all cases.
1080
  attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
@@ -1083,7 +1007,7 @@ class QuietModel(QuietPreTrainedModel):
1083
  inputs_embeds,
1084
  past_key_values_length,
1085
  )
1086
- elif attention_mask is None or attention_mask.dim() == 2:
1087
  # 4d mask is passed through the layers
1088
  attention_mask = _prepare_4d_causal_attention_mask(
1089
  attention_mask,
@@ -1151,129 +1075,15 @@ class QuietModel(QuietPreTrainedModel):
1151
  attentions=all_self_attns,
1152
  )
1153
 
1154
- def nonzero_mean(x, axis=None):
1155
- if axis is not None:
1156
- return x.sum(axis) / (x != 0).sum(axis)
1157
- return x.sum() / (x != 0).sum()
1158
-
1159
- def loss_mean(x):
1160
- return x.sum() / (x != 0).sum()
1161
 
1162
- class QuietForCausalLM(QuietPreTrainedModel):
1163
  _tied_weights_keys = ["lm_head.weight"]
1164
 
1165
  def __init__(self, config):
1166
  super().__init__(config)
1167
- self.model = QuietModel(config)
1168
  self.vocab_size = config.vocab_size
1169
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1170
- self.max_thoughts = config.max_thoughts
1171
- self.merged_lm_and_talk_heads = config.merged_lm_and_talk_heads
1172
- self.use_concat_talk_head = config.use_concat_talk_head
1173
- self.use_shallow_talk = config.use_shallow_talk
1174
- self.use_complex_talk_head = config.use_complex_talk_head
1175
- self.use_weighted_talk_head = config.use_weighted_talk_head
1176
- # the weighted head will output a single value, so it can't be passed to the lm head
1177
- assert not (self.use_weighted_talk_head and self.use_shallow_talk)
1178
-
1179
- self.n_ahead = 1
1180
- self.n_ahead_talk = 1
1181
- self.n_passes = 1
1182
- self.n_tokens_print = 1
1183
- self.gradient_accumulation_steps = 1
1184
- self.training_steps = 0
1185
- self.tokenizer = None
1186
- self.start_token_id = None
1187
- self.end_token_id = None
1188
- self.rm_initialized = False
1189
- self.residual_talk_head = True
1190
- self.thought_init_std_scale = 1e-2
1191
-
1192
- self.final_only_mode = False
1193
- self.first_and_last_mode = True
1194
- self.first_only = False
1195
- self.original_loss_weight = 0.5
1196
-
1197
- self.cumulative_residual = False
1198
- self.clever_residual = False
1199
- self.skip_residual = False
1200
- self.no_residual = True
1201
-
1202
- self.optimize_lm_head_only_at_start = False
1203
- self.optimize_model_only_at_start = False
1204
-
1205
- if self.optimize_model_only_at_start:
1206
- raise NotImplementedError
1207
- self.train_only_thinking_embedding = False
1208
- self.weighted_embeddings = False
1209
- self.use_start_thought_token = True
1210
- self.use_end_thought_token = True
1211
- self.initialize_thought_embedding_to_normal = False
1212
- self.initial_start_token = "---"
1213
- self.initial_end_token = "---"
1214
- self.output_logits_at_the_end = True
1215
-
1216
- self.wandb_enabled = False
1217
- self.gumbel_temperature = 0.001
1218
-
1219
- self.use_policy_loss = True
1220
- self.include_policy_loss = True
1221
- self.trice_mode = True
1222
- self.remove_negative_rewards = True
1223
- self.use_policy_loss_for_end_thought = True
1224
-
1225
- self.base_original_mode = False
1226
- self.original_mode = False
1227
-
1228
- self.thought_prefix = "(Let's think step by step"
1229
- self.tokenized_thought_prefix = None
1230
- self.log_dict = defaultdict(int)
1231
- self.eval_log_dict = defaultdict(int)
1232
- self.print_final_only = True
1233
- self.loss_mean = loss_mean
1234
- self.all_rewards = []
1235
- self.all_unreduced_losses = []
1236
- self.kill_after = 100
1237
-
1238
- self.start_embedding = nn.Parameter(torch.zeros(2, self.model.config.hidden_size))
1239
- self.end_embedding = nn.Parameter(torch.zeros(2, self.model.config.hidden_size))
1240
-
1241
- self.policy_loss_beta = 1e6
1242
- self.embedding_scale = 1e2
1243
- self.reinforce_temperature = 3
1244
- self.base_loss_beta = 1
1245
-
1246
- # Not used in the paper:
1247
- self.use_thought_prefix = False
1248
- self.use_reparam_for_thought_embeddings = False
1249
- self.use_upper_triangular = False
1250
- self.subtract_mean_reward = False
1251
- self.comparison_mode = False
1252
- self.gumbel_detach = True
1253
-
1254
- # For visualization
1255
- self.eval_mode = False
1256
-
1257
- num_talk = 1
1258
- talk_input_dim = config.hidden_size if not self.use_concat_talk_head else config.hidden_size * 2
1259
- if self.use_weighted_talk_head:
1260
- talk_output_dim = 1
1261
- else:
1262
- talk_output_dim = config.hidden_size if self.use_shallow_talk else config.vocab_size
1263
-
1264
- if not self.merged_lm_and_talk_heads:
1265
- if self.use_complex_talk_head:
1266
- self.talk_head = nn.ModuleList([nn.Sequential(
1267
- nn.Linear(talk_input_dim, config.hidden_size),
1268
- nn.ReLU(),
1269
- nn.Linear(config.hidden_size, config.hidden_size),
1270
- nn.ReLU(),
1271
- nn.Linear(config.hidden_size, talk_output_dim, bias=False)
1272
- )])
1273
- else:
1274
- self.talk_head = nn.ModuleList([nn.Sequential(
1275
- nn.Linear(talk_input_dim, talk_output_dim, bias=False)
1276
- )])
1277
 
1278
  # Initialize weights and apply final processing
1279
  self.post_init()
@@ -1296,126 +1106,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
1296
  def get_decoder(self):
1297
  return self.model
1298
 
1299
- @torch.no_grad()
1300
- def infer(
1301
- self,
1302
- input_ids: torch.LongTensor,
1303
- attention_mask: Optional[torch.Tensor] = None,
1304
- position_ids: Optional[torch.LongTensor] = None,
1305
- past_key_values: Optional[List[torch.FloatTensor]] = None,
1306
- inputs_embeds: Optional[torch.FloatTensor] = None,
1307
- use_cache: Optional[bool] = None,
1308
- output_attentions: Optional[bool] = None,
1309
- output_hidden_states: Optional[bool] = None,
1310
- return_dict: Optional[bool] = None,
1311
- ):
1312
- batch_size, seq_len = input_ids.shape
1313
-
1314
- # Save the original input_ids and attention_mask for later use
1315
- original_input_ids = input_ids.clone()
1316
- original_attention_mask = attention_mask.clone() if attention_mask is not None else None
1317
-
1318
- # Append the start thought token to the input sequence
1319
- start_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|startthought|>")
1320
- input_ids = torch.cat([input_ids, torch.tensor([[start_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
1321
- seq_len += 1
1322
-
1323
- # Update the attention mask
1324
- if attention_mask is not None:
1325
- attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
1326
-
1327
- # Generate the continuation
1328
- continuation_length = self.n_ahead - 2
1329
- new_key_values = past_key_values
1330
- generated_tokens = []
1331
-
1332
- for continuation_idx in range(continuation_length):
1333
- outputs = self.model(
1334
- input_ids=input_ids if continuation_idx == 0 else next_token_id.unsqueeze(-1).to(input_ids.device),
1335
- attention_mask=attention_mask,
1336
- position_ids=position_ids,
1337
- past_key_values=new_key_values,
1338
- inputs_embeds=inputs_embeds,
1339
- use_cache=True,
1340
- output_attentions=output_attentions,
1341
- output_hidden_states=output_hidden_states,
1342
- return_dict=return_dict,
1343
- )
1344
- new_key_values = outputs.past_key_values
1345
- hidden_states = outputs[0]
1346
- logits = self.lm_head(hidden_states)
1347
- logits = logits[:, -1, :] # Only consider the last token
1348
-
1349
- # Apply Gumbel-Softmax to the logits
1350
- next_token_logits = F.gumbel_softmax(logits, tau=self.gumbel_temperature, hard=True, dim=-1)
1351
- next_token_id = torch.argmax(next_token_logits, dim=-1)
1352
-
1353
- # Append the generated token to the input sequence
1354
- input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
1355
- generated_tokens.append(next_token_id)
1356
- seq_len += 1
1357
-
1358
- # Update the attention mask
1359
- if attention_mask is not None:
1360
- attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
1361
-
1362
- # Update the position ids
1363
- if position_ids is not None:
1364
- position_ids = torch.cat([position_ids, (position_ids[:, -1] + 1).unsqueeze(-1)], dim=-1)
1365
-
1366
- # Append the end thought token to the input sequence
1367
- end_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
1368
- input_ids = torch.cat([input_ids, torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
1369
- seq_len += 1
1370
-
1371
- # Update the attention mask
1372
- if attention_mask is not None:
1373
- attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
1374
-
1375
- # Get the hidden states before and after the thought
1376
- outputs_before = self.model(
1377
- input_ids=original_input_ids,
1378
- attention_mask=original_attention_mask,
1379
- position_ids=position_ids,
1380
- past_key_values=past_key_values,
1381
- inputs_embeds=inputs_embeds,
1382
- use_cache=use_cache,
1383
- output_attentions=output_attentions,
1384
- output_hidden_states=output_hidden_states,
1385
- return_dict=return_dict,
1386
- )
1387
- hidden_states_before = outputs_before[0][:, -1:, :]
1388
-
1389
- # two new tokens: last continuation token and end thought token
1390
- outputs_after = self.model(
1391
- input_ids=torch.cat([next_token_id.unsqueeze(-1).to(input_ids.device), torch.tensor(end_thought_token_id).unsqueeze(-1).unsqueeze(-1).to(input_ids.device)], dim=-1),
1392
- attention_mask=attention_mask,
1393
- position_ids=position_ids,
1394
- past_key_values=new_key_values,
1395
- inputs_embeds=inputs_embeds,
1396
- use_cache=use_cache,
1397
- output_attentions=output_attentions,
1398
- output_hidden_states=output_hidden_states,
1399
- return_dict=return_dict,
1400
- )
1401
- hidden_states_after = outputs_after[0][:, -1:, :]
1402
-
1403
- # Apply the talk head to get the mixing weight
1404
- mixing_weight = self.talk_head[0](torch.cat([hidden_states_before, hidden_states_after], dim=-1))
1405
-
1406
- # Apply the mixing weight to the hidden states
1407
- mixed_hidden_states = (1 - mixing_weight) * hidden_states_before + mixing_weight * hidden_states_after
1408
-
1409
- # Apply the language model head to get the final logits
1410
- logits = self.lm_head(mixed_hidden_states)
1411
-
1412
- # Decode the logits to get the generated text
1413
- generated_tokens = torch.cat(generated_tokens, dim=-1)
1414
- generated_text = self.tokenizer.decode(generated_tokens.squeeze(), skip_special_tokens=True)
1415
-
1416
- return generated_text
1417
-
1418
- @add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
1419
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1420
  def forward(
1421
  self,
@@ -1442,10 +1133,10 @@ class QuietForCausalLM(QuietPreTrainedModel):
1442
  Example:
1443
 
1444
  ```python
1445
- >>> from transformers import AutoTokenizer, QuietForCausalLM
1446
 
1447
- >>> model = QuietForCausalLM.from_pretrained("quietai/Quiet-7B-v0.1")
1448
- >>> tokenizer = AutoTokenizer.from_pretrained("quietai/Quiet-7B-v0.1")
1449
 
1450
  >>> prompt = "Hey, are you conscious? Can you talk to me?"
1451
  >>> inputs = tokenizer(prompt, return_tensors="pt")
@@ -1455,16 +1146,6 @@ class QuietForCausalLM(QuietPreTrainedModel):
1455
  >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1456
  "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1457
  ```"""
1458
- log_dict = self.log_dict if self.training else self.eval_log_dict
1459
-
1460
- if self.training and self.kill_after is not None and self.training_steps // self.gradient_accumulation_steps > self.kill_after:
1461
- raise ValueError("Killed after")
1462
-
1463
- if not self.training:
1464
- n_ahead_talk_to_restore = self.n_ahead_talk
1465
- n_passes_to_restore = self.n_passes
1466
- self.n_ahead_talk = 1
1467
- self.n_passes = 1
1468
 
1469
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1470
  output_hidden_states = (
@@ -1472,730 +1153,48 @@ class QuietForCausalLM(QuietPreTrainedModel):
1472
  )
1473
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1474
 
1475
- assert self.cumulative_residual or self.clever_residual or self.skip_residual or self.no_residual
1476
- assert not (self.skip_residual and self.use_policy_loss)
1477
-
1478
- if self.tokenized_thought_prefix is None and self.use_thought_prefix:
1479
- self.tokenized_thought_prefix = self.tokenizer(self.thought_prefix, return_tensors="pt", add_special_tokens=False)["input_ids"]
 
 
 
 
 
 
 
1480
 
1481
- def apply_head(head, states, detach=False):
1482
- if detach:
1483
- head_weight = head.weight.detach()
1484
- else:
1485
- head_weight = head.weight
1486
- head_weight = head_weight.to(states.device)
1487
- return (head_weight @ states.transpose(-1, -2)).transpose(-1, -2).contiguous()
1488
-
1489
- def idx_if_sequential(head, idx=0):
1490
- if isinstance(head, nn.Sequential) or isinstance(head, nn.ModuleList):
1491
- return idx_if_sequential(head[idx], idx=idx)
1492
- return head
1493
-
1494
- def none_repeat_interleave(x, n):
1495
- if x is None:
1496
- return x
1497
- return x.repeat_interleave(n, dim=0)
1498
-
1499
- if self.n_passes > 1:
1500
- input_ids = none_repeat_interleave(input_ids, self.n_passes)
1501
- attention_mask = none_repeat_interleave(attention_mask, self.n_passes)
1502
- position_ids = none_repeat_interleave(position_ids, self.n_passes)
1503
- inputs_embeds = none_repeat_interleave(inputs_embeds, self.n_passes)
1504
- labels = none_repeat_interleave(labels, self.n_passes)
1505
- if past_key_values is not None:
1506
- past_key_values = [none_repeat_interleave(p, self.n_passes) for p in past_key_values]
1507
- cur_token_indices = torch.arange(input_ids.shape[1], device=input_ids.device)
1508
-
1509
- self.tokenizer_has_start_thought_token = True
1510
- self.tokenizer_has_end_thought_token = True
1511
- if self.start_token_id is None:
1512
- self.start_token_id = self.tokenizer.convert_tokens_to_ids("<|startthought|>")
1513
- if self.start_token_id == 0:
1514
- self.start_token_id = self.tokenizer.bos_token_id
1515
- self.tokenizer_has_start_thought_token = False
1516
- elif self.use_start_thought_token:
1517
- # base_start_id = self.tokenizer.convert_tokens_to_ids(self.initial_start_token)
1518
- base_start_id = self.tokenizer.encode(self.initial_start_token, add_special_tokens=False)[0]
1519
- if self.initialize_thought_embedding_to_normal:
1520
- self.start_embedding.data = torch.zeros_like(self.start_embedding.data)
1521
- else:
1522
- self.start_embedding.data[0] = self.model.embed_tokens.weight.data[base_start_id].clone().detach() / self.embedding_scale
1523
- self.start_embedding.data[1] = torch.log(self.model.embed_tokens.weight.data.std(dim=0) * self.thought_init_std_scale / self.embedding_scale)
1524
- if self.end_token_id is None:
1525
- self.end_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
1526
- if self.end_token_id == 0:
1527
- self.end_token_id = self.tokenizer.eos_token_id
1528
- self.tokenizer_has_end_thought_token = False
1529
- elif self.use_end_thought_token:
1530
- # base_end_id = self.tokenizer.convert_tokens_to_ids(self.initial_end_token)
1531
- base_end_id = self.tokenizer.encode(self.initial_end_token, add_special_tokens=False)[0]
1532
- if self.initialize_thought_embedding_to_normal:
1533
- self.end_embedding.data = torch.zeros_like(self.end_embedding.data)
1534
- else:
1535
- self.end_embedding.data[0] = self.model.embed_tokens.weight.data[base_end_id].clone().detach() / self.embedding_scale
1536
- self.end_embedding.data[1] = torch.log(self.model.embed_tokens.weight.data.std(dim=0) * self.thought_init_std_scale / self.embedding_scale)
1537
-
1538
- if not self.rm_initialized and (self.n_ahead > 1 or not self.base_original_mode):
1539
- self.rm_initialized = True
1540
- if not self.use_shallow_talk:
1541
- head = self.talk_head[0]
1542
- cur_head = head[-1] if isinstance(head, nn.Sequential) else head
1543
- talk_input_dim = cur_head.weight.data.shape[1]
1544
- talk_output_dim = 1 if self.use_weighted_talk_head else self.lm_head.weight.data.shape[0]
1545
- cur_head.weight.data = torch.zeros(talk_output_dim, talk_input_dim, device=cur_head.weight.device, dtype=cur_head.weight.dtype)
1546
- else:
1547
- # convert to identity transform
1548
- def lambda_transform(cur_head):
1549
- if cur_head.weight.data.shape[0] != cur_head.weight.data.shape[1]:
1550
- return torch.cat([
1551
- torch.eye(
1552
- cur_head.weight.data.shape[0],
1553
- device=cur_head.weight.device,
1554
- dtype=cur_head.weight.dtype
1555
- ),
1556
- torch.zeros(
1557
- cur_head.weight.data.shape[0],
1558
- cur_head.weight.data.shape[1] - cur_head.weight.data.shape[0],
1559
- device=cur_head.weight.device,
1560
- dtype=cur_head.weight.dtype
1561
- )], dim=1)
1562
- return torch.eye(
1563
- cur_head.weight.data.shape[0],
1564
- device=cur_head.weight.device,
1565
- dtype=cur_head.weight.dtype
1566
- )
1567
- if isinstance(self.talk_head[0], nn.Sequential):
1568
- for cur_head in self.talk_head[0]:
1569
- # if it has weights
1570
- if hasattr(cur_head, "weight"):
1571
- cur_head.weight.data = lambda_transform(cur_head)
1572
- else:
1573
- self.talk_head[-1].weight.data = lambda_transform(self.talk_head[0])
1574
 
1575
  loss = None
1576
- prev_rm_tokens = None
1577
- cur_rm_tokens = None
1578
- prev_rm_logits = None
1579
- prev_sample_probs = None
1580
- did_skip_sampling = None
1581
- skip_sampling = None
1582
- sample_probs = None
1583
- hidden_states = None
1584
- logits = None
1585
- talk_kl_penalty = None
1586
- rm_logits = None
1587
- residual_logits = None
1588
- probabilities_2d = None
1589
- prev_probabilities_2d = None
1590
- policy_reward = None
1591
- logits_to_output = None
1592
- batch_size, seq_len = input_ids.shape
1593
- base_input_ids = input_ids.clone()
1594
- loss_list = []
1595
- dqn_loss_list = []
1596
- sampled_token_history = []
1597
- sample_probs_history = []
1598
- action_loglikelihoods_list = []
1599
-
1600
- if self.use_end_thought_token or self.use_start_thought_token:
1601
- if not self.use_reparam_for_thought_embeddings:
1602
- start_embedding = self.start_embedding[0].unsqueeze(0) * self.embedding_scale
1603
- end_embedding = self.end_embedding[0].unsqueeze(0) * self.embedding_scale
1604
- else:
1605
- start_embedding = self.start_embedding * self.embedding_scale
1606
- end_embedding = self.end_embedding * self.embedding_scale
1607
- base_embeddings = self.model.embed_tokens.weight
1608
- if self.train_only_thinking_embedding:
1609
- base_embeddings = base_embeddings.detach()
1610
- # # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1611
- fwd_iters = 1 if self.original_mode else self.n_ahead + self.n_ahead_talk - 1
1612
- for ahead_idx in range(fwd_iters):
1613
- past_key_values_length = 0
1614
- if past_key_values is not None:
1615
- use_legacy_cache = not isinstance(past_key_values, Cache)
1616
- if use_legacy_cache:
1617
- past_key_values = DynamicCache.from_legacy_cache(past_key_values)
1618
- past_key_values_length = past_key_values.get_usable_length(seq_len)
1619
-
1620
- if position_ids is None:
1621
- device = input_ids.device if input_ids is not None else inputs_embeds.device
1622
- position_ids = torch.arange(
1623
- past_key_values_length, seq_len + past_key_values_length, dtype=torch.long, device=device
1624
- )
1625
- position_ids = position_ids.unsqueeze(0).view(-1, seq_len)
1626
- else:
1627
- position_ids = position_ids.view(-1, seq_len).long()
1628
-
1629
- if inputs_embeds is None:
1630
- contains_start = self.use_start_thought_token and (input_ids == self.start_token_id).any()
1631
- contains_end = self.use_end_thought_token and (input_ids == self.end_token_id).any()
1632
- contains_thought = contains_start or contains_end
1633
- if contains_thought:
1634
- thought_id = self.start_token_id if contains_start else self.end_token_id
1635
- cur_thought_embedding = start_embedding if contains_start else end_embedding
1636
- if self.use_reparam_for_thought_embeddings:
1637
- inputs_embeds = torch.randn(batch_size, seq_len, self.model.config.hidden_size, device=input_ids.device, dtype=cur_thought_embedding.dtype)
1638
- inputs_embeds = inputs_embeds.detach() * torch.exp(cur_thought_embedding[1]) + cur_thought_embedding[0]
1639
- if contains_start:
1640
- sampled_start = inputs_embeds.clone().detach()
1641
- if contains_end:
1642
- sampled_end = inputs_embeds.clone().detach()
1643
- else:
1644
- inputs_embeds = cur_thought_embedding.unsqueeze(0).repeat(batch_size, seq_len, 1)
1645
- else:
1646
- with torch.set_grad_enabled(not self.train_only_thinking_embedding):
1647
- inputs_embeds = self.model.embed_tokens(input_ids)
1648
-
1649
- if self.n_ahead != 1 or self.n_ahead_talk != 1 or self.comparison_mode:
1650
- if attention_mask is None:
1651
- base_attention_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=0).to(input_ids.device)
1652
- base_attention_mask = base_attention_mask.view(1, 1, seq_len, seq_len)
1653
- base_attention_mask = base_attention_mask.repeat(input_ids.shape[0], 1, 1, 1)
1654
- attention_mask = base_attention_mask
1655
- breakpoint()
1656
- elif attention_mask.dim() == 2:
1657
- if seq_len + past_key_values_length != attention_mask.shape[-1]:
1658
- breakpoint()
1659
- attention_mask = torch.cat(
1660
- [torch.ones((attention_mask.shape[0], past_key_values_length), dtype=attention_mask.dtype, device=attention_mask.device), attention_mask],
1661
- dim=-1
1662
- )
1663
- # # if the attention mask
1664
- attention_mask = _prepare_4d_causal_attention_mask(
1665
- attention_mask,
1666
- (batch_size, seq_len),
1667
- inputs_embeds,
1668
- past_key_values_length,
1669
- sliding_window=self.config.sliding_window,
1670
- )
1671
-
1672
- outputs = self.model(
1673
- # input_ids=input_ids,
1674
- attention_mask=attention_mask,
1675
- position_ids=position_ids,
1676
- past_key_values=past_key_values,
1677
- inputs_embeds=inputs_embeds,
1678
- use_cache=use_cache,
1679
- output_attentions=output_attentions,
1680
- output_hidden_states=output_hidden_states,
1681
- return_dict=return_dict,
1682
- )
1683
-
1684
- prev_hidden_states = hidden_states
1685
- hidden_states = outputs[0]
1686
- prev_rm_logits = rm_logits # for policy gradient
1687
- prev_rm_tokens = cur_rm_tokens # for policy gradient
1688
-
1689
- if ahead_idx == 0:
1690
- hidden_states_lm = hidden_states
1691
- logits = self.lm_head(hidden_states_lm)
1692
- base_hidden_states = hidden_states.clone()
1693
- initial_loss_logits = logits.clone()
1694
- if self.optimize_lm_head_only_at_start or self.optimize_model_only_at_start:
1695
- logits = logits.detach()
1696
- base_hidden_states = base_hidden_states.detach()
1697
- if self.optimize_model_only_at_start:
1698
- hidden_states = hidden_states.detach()
1699
- base_logits = logits.clone()
1700
- else:
1701
- talk_hidden_states = hidden_states
1702
- if self.merged_lm_and_talk_heads:
1703
- assert self.no_residual
1704
- residual_logits = self.lm_head(hidden_states)
1705
- talk_hidden_states = hidden_states
1706
- else:
1707
- if ahead_idx > self.n_ahead - 1:
1708
- cur_base_hidden = torch.cat([
1709
- base_hidden_states[..., ahead_idx - self.n_ahead + 1:, :],
1710
- base_hidden_states[..., :ahead_idx - self.n_ahead + 1, :]
1711
- ], dim=-2)
1712
- else:
1713
- cur_base_hidden = base_hidden_states
1714
-
1715
- if self.use_concat_talk_head:
1716
- # concatenate the hidden states with the original hidden states
1717
- head_input_hidden_states = torch.cat([cur_base_hidden, talk_hidden_states], dim=-1)
1718
- else:
1719
- head_input_hidden_states = talk_hidden_states
1720
-
1721
- residual_logits = self.talk_head[0](head_input_hidden_states)
1722
- if self.use_shallow_talk:
1723
- residual_logits = apply_head(self.lm_head, residual_logits, detach=self.optimize_lm_head_only_at_start)
1724
- residual_logits = residual_logits.to(logits.device)
1725
- if self.use_weighted_talk_head:
1726
- # combine the cur_base_hidden with the talk_hidden_states according to the weighted head
1727
- residual_logits = cur_base_hidden * (1 - residual_logits) + talk_hidden_states * residual_logits
1728
- residual_logits = apply_head(self.lm_head, residual_logits, detach=self.optimize_lm_head_only_at_start)
1729
-
1730
- assert sum([self.cumulative_residual, self.clever_residual, self.skip_residual, self.no_residual]) == 1
1731
- if self.clever_residual:
1732
- if ahead_idx >= self.n_ahead - 1:
1733
- # get the logits shifted according to the current talk ahead
1734
- cur_base_logits = torch.cat([
1735
- base_logits[..., ahead_idx - self.n_ahead + 1:, :],
1736
- base_logits[..., :ahead_idx - self.n_ahead + 1, :]
1737
- ], dim=-2)
1738
- if self.optimize_lm_head_only_at_start:
1739
- cur_base_logits = cur_base_logits.detach()
1740
- logits = cur_base_logits + residual_logits
1741
- else:
1742
- logits += residual_logits / self.n_ahead
1743
- elif self.cumulative_residual:
1744
- if self.residual_talk_head:
1745
- if ahead_idx < self.n_ahead:
1746
- logits += residual_logits
1747
- else:
1748
- # get the logits shifted according to the current talk ahead
1749
- cur_base_logits = torch.cat([
1750
- base_logits[..., ahead_idx - self.n_ahead + 1:, :],
1751
- base_logits[..., :ahead_idx - self.n_ahead + 1, :]
1752
- ], dim=-2)
1753
- if self.optimize_lm_head_only_at_start:
1754
- cur_base_logits = cur_base_logits.detach()
1755
- logits = cur_base_logits + residual_logits
1756
- else:
1757
- if ahead_idx < self.n_ahead:
1758
- logits += residual_logits
1759
- else:
1760
- logits = residual_logits
1761
- elif self.skip_residual:
1762
- if ahead_idx >= self.n_ahead:
1763
- # get the logits shifted according to the current talk ahead
1764
- cur_base_logits = torch.cat([
1765
- base_logits[..., ahead_idx - self.n_ahead + 1:, :],
1766
- base_logits[..., :ahead_idx - self.n_ahead + 1, :]
1767
- ], dim=-2)
1768
- if self.optimize_lm_head_only_at_start:
1769
- cur_base_logits = cur_base_logits.detach()
1770
- logits = cur_base_logits
1771
- elif self.no_residual:
1772
- logits = residual_logits
1773
- else:
1774
- logits = base_logits + residual_logits
1775
-
1776
- attempted = False
1777
- talk_loss_list = []
1778
- if self.original_mode or (self.n_ahead == 1) or (self.comparison_mode and ahead_idx == 0):# or (self.optimize_lm_head_only_at_start and ahead_idx == 0):
1779
- loss = None
1780
- attempted = True
1781
-
1782
- if labels is not None:
1783
- for shift_amount in range(self.n_ahead_talk):
1784
- # Shift so that tokens < n predict n
1785
- # ab[cde]f
1786
- # abc[def]
1787
- if ahead_idx == 0 and self.optimize_lm_head_only_at_start:
1788
- loss_logits = initial_loss_logits
1789
- else:
1790
- loss_logits = logits
1791
- shift_logits = loss_logits[..., shift_amount:-1, :].contiguous()
1792
- shift_labels = labels[..., 1 + shift_amount:].contiguous()
1793
- # Flatten the tokens
1794
- loss_fct = CrossEntropyLoss(reduction="none")
1795
- shift_logits = shift_logits.view(-1, self.config.vocab_size)
1796
- shift_labels = shift_labels.view(-1).clone()
1797
- # Enable model parallelism
1798
- shift_labels[shift_labels == self.tokenizer.pad_token_id] = -100
1799
- shift_labels = shift_labels.to(shift_logits.device)
1800
- loss = loss_fct(shift_logits, shift_labels)
1801
- if not self.comparison_mode and not (self.optimize_lm_head_only_at_start and (self.n_ahead + self.n_ahead_talk > 2)) or self.original_mode:
1802
- loss_list.append(loss)
1803
- talk_loss_list.append(nonzero_mean(loss).detach())
1804
-
1805
- if not attempted or self.comparison_mode:
1806
- rm_hidden_states = hidden_states
1807
- # print("Magnitude of RM hidden states before RM head", rm_hidden_states.norm())
1808
- rm_logits = apply_head(self.lm_head, rm_hidden_states, detach=self.optimize_lm_head_only_at_start)
1809
-
1810
- # don't allow it to predict the thinking token
1811
- if self.tokenizer_has_start_thought_token:
1812
- rm_logits[..., self.start_token_id] = -1e10
1813
- if self.tokenizer_has_end_thought_token:
1814
- rm_logits[..., self.end_token_id] = -1e10
1815
- probabilities = rm_logits
1816
- if probabilities_2d is not None:
1817
- prev_probabilities_2d = probabilities_2d.clone()
1818
- probabilities_2d = probabilities.view(-1, probabilities.size(-1))
1819
-
1820
- did_skip_sampling = skip_sampling
1821
- skip_sampling = False
1822
- if ahead_idx == 0 and self.use_start_thought_token:
1823
- override_token = self.start_token_id
1824
- elif self.use_thought_prefix and ahead_idx < self.tokenized_thought_prefix.shape[-1]:
1825
- override_token = self.tokenized_thought_prefix[..., ahead_idx]
1826
- elif ahead_idx == self.n_ahead - 2 and self.use_end_thought_token:
1827
- override_token = self.end_token_id
1828
- else:
1829
- override_token = None
1830
- if override_token is not None and self.n_ahead > 1:
1831
- # always start with the start token
1832
- probabilities_2d = torch.zeros_like(probabilities_2d)
1833
- probabilities_2d[:, override_token] = 1.0
1834
- skip_sampling = True
1835
- elif ahead_idx >= self.n_ahead - 1:
1836
- if labels is not None: # we're in the talk phase
1837
- cur_talk_n = ahead_idx - (self.n_ahead - 1) + 1
1838
- # print("Setting rm to labels", cur_talk_n, "during", ahead_idx)
1839
- shift_labels = labels[..., cur_talk_n:].contiguous().to(probabilities_2d.device)
1840
- padding = torch.full_like(
1841
- labels[..., :cur_talk_n],
1842
- self.tokenizer.pad_token_id,
1843
- dtype=torch.long,
1844
- device=shift_labels.device
1845
- )
1846
- new_rm_tokens = torch.cat(
1847
- [shift_labels, padding],
1848
- dim=-1
1849
- )
1850
- # convert rm tokens to one-hot
1851
- probabilities_2d = F.one_hot(new_rm_tokens, num_classes=self.vocab_size).reshape(-1, self.vocab_size).to(probabilities_2d.dtype)
1852
- skip_sampling = True
1853
- else:
1854
- continue
1855
- temperature = self.gumbel_temperature if self.training else 0.001
1856
- prev_sample_probs = sample_probs
1857
- sample_probs = probabilities_2d
1858
- if ahead_idx < self.n_ahead - 1 and not skip_sampling:
1859
- probabilities_2d = F.gumbel_softmax(sample_probs, tau=temperature, hard=True, dim=-1)
1860
- if self.gumbel_detach:
1861
- probabilities_2d = probabilities_2d.detach()
1862
- sampled_token_history.append(probabilities_2d.argmax(dim=-1).detach().cpu())
1863
- # convert rm logits directly to embeddings
1864
- contains_start = self.use_start_thought_token and (probabilities_2d[..., self.start_token_id].sum() > 0)
1865
- contains_end = self.use_end_thought_token and (probabilities_2d[..., self.end_token_id].sum() > 0)
1866
- contains_thought = contains_start or contains_end
1867
-
1868
- if not contains_thought:
1869
- with torch.set_grad_enabled(not self.train_only_thinking_embedding):
1870
- inputs_embeds = probabilities_2d @ (self.model.embed_tokens.weight.to(probabilities.device).to(probabilities.dtype))
1871
- else:
1872
- thought_id = self.start_token_id if contains_start else self.end_token_id
1873
- cur_thought_embedding = start_embedding if contains_start else end_embedding
1874
- if self.use_reparam_for_thought_embeddings:
1875
- inputs_embeds = torch.randn(batch_size, seq_len, self.model.config.hidden_size, device=input_ids.device, dtype=cur_thought_embedding.dtype)
1876
- inputs_embeds = inputs_embeds * torch.exp(cur_thought_embedding[1]) + cur_thought_embedding[0]
1877
- if contains_start:
1878
- sampled_start = inputs_embeds.clone().detach()
1879
- else:
1880
- sampled_end = inputs_embeds.clone().detach()
1881
- else:
1882
- inputs_embeds = cur_thought_embedding.unsqueeze(0).repeat(batch_size, seq_len, 1)
1883
- inputs_embeds = inputs_embeds.view(probabilities.size(0), probabilities.size(1), -1).to(self.model.embed_tokens.weight.dtype)
1884
- inputs_embeds = inputs_embeds.view(probabilities.size(0), probabilities.size(1), -1).to(self.model.embed_tokens.weight.dtype)
1885
-
1886
- if len(attention_mask.shape) == 2:
1887
- breakpoint()
1888
- else:
1889
- original_attention = attention_mask[..., :attention_mask.shape[-2]]
1890
- if self.use_upper_triangular:
1891
- new_attention = original_attention
1892
- else:
1893
- original_attention = original_attention == attention_mask.max()
1894
- # because eye isn't implemented for BF16, we need to handle the case
1895
- if not attention_mask.dtype == torch.bfloat16:
1896
- new_attention = torch.eye(
1897
- seq_len, dtype=attention_mask.dtype, device=attention_mask.device
1898
- )
1899
- else:
1900
- new_attention = torch.eye(
1901
- seq_len, dtype=torch.float32, device=attention_mask.device
1902
- ).to(attention_mask.dtype)
1903
-
1904
- new_attention = new_attention.view(1, 1, seq_len, seq_len).repeat(input_ids.shape[0], 1, 1, 1)
1905
- new_attention = new_attention * original_attention
1906
- new_attention[new_attention == 0] = attention_mask.min()
1907
- new_attention[new_attention == 1] = attention_mask.max()
1908
- attention_mask = torch.cat([attention_mask, new_attention], dim=-1)
1909
- past_key_values = outputs.past_key_values
1910
- position_ids = position_ids + 1
1911
-
1912
- if labels is not None and (self.n_ahead > 1 or not self.base_original_mode):
1913
- # Shift so that tokens < n predict n
1914
- # logits: abcdef -> bcdef? -> cdef??
1915
- # labels: abcdef -> ?bcdef -> ??cdef
1916
- if ahead_idx == 0 and self.optimize_lm_head_only_at_start:
1917
- loss_logits = initial_loss_logits
1918
- else:
1919
- loss_logits = logits
1920
- shift_idx = 1 + max(0, ahead_idx - (self.n_ahead - 1))
1921
- shift_logits = loss_logits[..., :-shift_idx, :].contiguous()
1922
- shift_labels = labels[..., shift_idx:].contiguous()
1923
- # Flatten the tokens
1924
- loss_fct = CrossEntropyLoss(reduction="none")
1925
- shift_logits = shift_logits.view(-1, self.config.vocab_size)
1926
- shift_labels = shift_labels.view(-1)
1927
- # Enable model parallelism
1928
- shift_labels = shift_labels.to(shift_logits.device)
1929
- # if shift_labels.min() == self.tokenizer.pad_token_id:
1930
- shift_labels = torch.where(shift_labels == self.tokenizer.pad_token_id, -100, shift_labels)
1931
- unreduced_loss = loss_fct(shift_logits, shift_labels)
1932
- if torch.any(unreduced_loss != unreduced_loss):
1933
- raise ValueError("NaN loss")
1934
- unreduced_loss = unreduced_loss.reshape(logits.shape[0], -1)
1935
- loss_list.append(unreduced_loss)
1936
-
1937
-
1938
- if self.use_policy_loss and ahead_idx > 0 and (ahead_idx > 1 or not self.use_start_thought_token):
1939
- # we treat the change in loss as the reward
1940
- previous_loss = loss_list[-2]
1941
- # for example, suppose n_ahead = 3 and n_ahead_talk = 2
1942
- # note that we end at self.n_ahead + self.n_ahead_talk - 2
1943
- # in this case, 5 - 2 = 3, so we end at ahead_idx = 3
1944
- # we also predict the next token at ahead_idx = 2
1945
- # when we get to ahead_idx = 2, we predict ahead
1946
- # so we shift by 1
1947
- # note that this is ahead_idx = n_ahead - 1
1948
- # when we get to ahead_idx = 3, we predict ahead
1949
- # so we shift by 2
1950
- # note that this is ahead_idx = n_ahead
1951
- if ahead_idx < self.n_ahead - 1:
1952
- shift_amount = 0
1953
- original_dqn_reward = (previous_loss - unreduced_loss).detach()
1954
- if self.first_and_last_mode:
1955
- original_dqn_reward = original_dqn_reward * 0.0
1956
- else:
1957
- # logits vs cur_policy_shift_logits
1958
- # let's look at rm_logits and prev_rm_logits
1959
- shift_amount = max(0, ahead_idx - (self.n_ahead - 1))
1960
- # let's say shift_amount = 2
1961
- # abcdefg -> bcdefg? -> cdefg??
1962
- # logits = [a b]c d e f[g]
1963
- # labels = [a b c]d e f g
1964
- cur_policy_shift_logits = initial_loss_logits[..., shift_amount:-1, :].contiguous().detach()
1965
- cur_policy_shift_labels = labels[..., 1 + shift_amount:].contiguous()
1966
- # Flatten the tokens
1967
- cur_policy_loss_fct = CrossEntropyLoss(reduction="none")
1968
- cur_policy_shift_logits = cur_policy_shift_logits.view(-1, self.config.vocab_size)
1969
- cur_policy_shift_labels = cur_policy_shift_labels.view(-1).clone()
1970
- # Enable model parallelism
1971
- cur_policy_shift_labels[cur_policy_shift_labels == self.tokenizer.pad_token_id] = -100
1972
- cur_policy_shift_labels = cur_policy_shift_labels.to(cur_policy_shift_labels.device)
1973
- cur_policy_reward_base_loss = loss_fct(
1974
- cur_policy_shift_logits, cur_policy_shift_labels.to(cur_policy_shift_logits.device)
1975
- ).reshape(logits.shape[0], -1)
1976
- original_dqn_reward = cur_policy_reward_base_loss.detach() - unreduced_loss
1977
-
1978
- if not did_skip_sampling:
1979
- nonzero_indices = prev_probabilities_2d.nonzero()
1980
- action_loglikelihoods = F.log_softmax(prev_sample_probs / self.reinforce_temperature, dim=-1)[nonzero_indices[:, 0], nonzero_indices[:, 1]]
1981
- action_loglikelihoods_2d = action_loglikelihoods.reshape(batch_size, -1)[:, :-1 - shift_amount]
1982
- action_loglikelihoods_list.append(action_loglikelihoods_2d)
1983
- if policy_reward is None:
1984
- policy_reward = original_dqn_reward[:, :-(self.n_ahead_talk - shift_amount)]
1985
- else:
1986
- if self.n_ahead_talk > shift_amount:
1987
- added_reward = original_dqn_reward[:, :-(self.n_ahead_talk - shift_amount)]
1988
- else:
1989
- added_reward = original_dqn_reward
1990
- policy_reward += added_reward
1991
-
1992
- if self.use_policy_loss and ahead_idx == self.n_ahead + self.n_ahead_talk - 2:
1993
- # only compute during the thinking phase
1994
- if self.use_reparam_for_thought_embeddings and (self.use_start_thought_token or self.use_end_thought_token):
1995
- # sampled_start, sampled_end
1996
- # calculate the log likelihood of the start and end embeddings sampled from a multivariate normal distribution
1997
- # with mean start_embedding[0] and standard deviation start_embedding[1]
1998
- if self.use_start_thought_token:
1999
- exp_start_std = torch.exp(start_embedding[1])
2000
- start_loglikelihood = -0.5 * (sampled_start.detach() - start_embedding[0]) ** 2 / exp_start_std ** 2 - start_embedding[1] - 0.5 * math.log(2 * math.pi)
2001
- start_loglikelihood = start_loglikelihood.mean(dim=-1)
2002
- if self.use_end_thought_token:
2003
- exp_end_std = torch.exp(end_embedding[1])
2004
- end_loglikelihood = -0.5 * (sampled_end.detach() - end_embedding[0]) ** 2 / exp_end_std ** 2 - end_embedding[1] - 0.5 * math.log(2 * math.pi)
2005
- end_loglikelihood = end_loglikelihood.mean(dim=-1)
2006
- # we use the mean instead of the sum to prevent dependence on the dimensionality of the embeddings
2007
- if self.use_end_thought_token and self.use_policy_loss_for_end_thought:
2008
- action_loglikelihoods_list.append(end_loglikelihood)
2009
- if self.use_start_thought_token:
2010
- action_loglikelihoods_list.append(start_loglikelihood)
2011
-
2012
- if ahead_idx == self.n_ahead + self.n_ahead_talk - 2 and self.eval_mode:
2013
- with torch.no_grad():
2014
- # calculate the 0.75 quantile of the rewards
2015
- filtered_tokens = input_ids[:, :policy_reward.shape[-1]].cpu().detach().numpy().flatten()
2016
- filtered_tokens_mask = filtered_tokens != self.tokenizer.pad_token_id
2017
- filtered_tokens = filtered_tokens[filtered_tokens_mask]
2018
- filtered_rewards = policy_reward.float().cpu().detach().numpy()[:, :seq_len - self.n_ahead_talk].flatten()
2019
- filtered_rewards = filtered_rewards[filtered_tokens_mask]
2020
-
2021
- abs_reward_list = np.abs(policy_reward.float().cpu().detach().numpy()[:, :seq_len - self.n_ahead_talk].flatten())
2022
- abs_reward_list = abs_reward_list[filtered_tokens_mask]
2023
- medium_quantile = np.quantile(abs_reward_list, 0.5)
2024
- upper_quantile = np.quantile(abs_reward_list, 0.95)
2025
-
2026
- save_tokens_with_rewards_to_pdf(
2027
- filtered_tokens,
2028
- [0] + filtered_rewards.tolist(),
2029
- self.tokenizer,
2030
- output_file=f"texts/rewards_talk_{self.n_ahead_talk}_{self.training_steps}.pdf",
2031
- eps=medium_quantile,
2032
- eps2=upper_quantile,
2033
- )
2034
-
2035
- def plot_kde(data, losses):
2036
- sns.set(style="whitegrid")
2037
- # Create the KDE plot
2038
- sns.kdeplot(data, fill=True)
2039
- # Set the plot title and labels
2040
- plt.title("KDE Plot")
2041
- plt.xlabel("Value")
2042
- plt.ylabel("Density")
2043
- # Save the plot
2044
- plt.savefig(f"texts/kde_talk_{self.n_ahead_talk}_{self.training_steps}.pdf")
2045
- # Close the plot
2046
- plt.close()
2047
-
2048
- # Step 1: Create a base color palette
2049
- base_colors = sns.color_palette("light:#5A9", n_colors=256) # More colors for a smoother gradient
2050
- base_cmap = LinearSegmentedColormap.from_list("log_light", base_colors)
2051
- log_norm = LogNorm(vmin=1e-3, vmax=10)
2052
-
2053
- sns.kdeplot(x=data, y=losses, fill=True, levels=20, norm=log_norm, cut=0, linewidths=0)
2054
- # limit y to 0 to 25 and x to -1 to 1
2055
- plt.xlim(-1, 1)
2056
- plt.ylim(0, 25)
2057
- plt.savefig(f"texts/jointer_talk_{self.n_ahead_talk}_{self.training_steps}.pdf")
2058
- plt.close()
2059
-
2060
- self.all_rewards.extend(filtered_rewards)
2061
- self.all_unreduced_losses.extend(unreduced_loss[:, :-1].flatten()[filtered_tokens_mask].float().flatten().cpu().detach().numpy())
2062
- plot_kde(self.all_rewards, self.all_unreduced_losses)
2063
-
2064
- for action_loglikelihoods_2d in action_loglikelihoods_list:
2065
- train_policy_reward = policy_reward
2066
-
2067
- # discard rewards below the mean
2068
- if self.trice_mode and self.n_passes > 1:
2069
- batched_policy_reward = train_policy_reward.reshape(-1, self.n_passes, train_policy_reward.shape[-1])
2070
- # average over the passes
2071
- train_policy_reward = batched_policy_reward - batched_policy_reward.mean(dim=1, keepdim=True)
2072
- train_policy_reward = train_policy_reward.reshape(-1, train_policy_reward.shape[-1])
2073
-
2074
- if self.subtract_mean_reward:
2075
- train_policy_reward = train_policy_reward - train_policy_reward.mean()
2076
- if self.remove_negative_rewards:
2077
- fixed_policy_reward = train_policy_reward.detach().clamp(min=0)
2078
- else:
2079
- fixed_policy_reward = train_policy_reward.detach()
2080
- actor_loss = -fixed_policy_reward * action_loglikelihoods_2d[:, :policy_reward.shape[-1]].to(policy_reward.device)
2081
- if action_loglikelihoods_2d.mean() < -1e4 and not self.use_policy_loss_just_for_thoughts:
2082
- # This will only happen when we force the next token to be the end of thought token
2083
- break
2084
- dqn_loss_list.append(actor_loss.mean())
2085
-
2086
- if loss_list:
2087
- if self.first_and_last_mode:
2088
- loss = sum(
2089
- self.loss_mean(loss_list[-(i + 1)]) for i in range(self.n_ahead_talk)
2090
- ) * (1 - self.original_loss_weight) / self.n_ahead_talk
2091
- loss = loss + self.loss_mean(loss_list[0]) * self.original_loss_weight
2092
- # Let's NaN out the others
2093
- # e.g. if n_ahead_talk = 2 and the list is 5 long, we want to NaN out 1, 2 but keep 0, 3, 4
2094
- for i in range(1, len(loss_list) - self.n_ahead_talk):
2095
- loss_list[i] = loss_list[i] * math.nan
2096
- elif self.first_only:
2097
- loss = self.loss_mean(loss_list[0])
2098
- elif self.final_only_mode:
2099
- loss = sum(
2100
- self.loss_mean(loss_list[-i]) for i in range(1, self.n_ahead_talk + 1)
2101
- ) / self.n_ahead_talk
2102
- else:
2103
- loss = None
2104
- for i in range(len(loss_list)):
2105
- cur_loss = self.loss_mean(loss_list[i])
2106
- if loss is not None:
2107
- loss = loss + cur_loss.to(loss.device)
2108
- else:
2109
- loss = cur_loss
2110
- loss = loss / len(loss_list)
2111
-
2112
- loss = loss * self.base_loss_beta
2113
-
2114
- if dqn_loss_list:
2115
- dqn_loss = sum(dqn_loss_list) / len(dqn_loss_list)
2116
- if self.include_policy_loss:
2117
- if loss is not None:
2118
- loss += dqn_loss * self.policy_loss_beta
2119
- else:
2120
- loss = dqn_loss * self.policy_loss_beta
2121
 
2122
  if not return_dict:
2123
  output = (logits,) + outputs[1:]
2124
  return (loss,) + output if loss is not None else output
2125
-
2126
- base_log_dict = {
2127
- f"loss_{i}": nonzero_mean(loss_list[i]) for i in range(len(loss_list))
2128
- }
2129
-
2130
- if loss is not None:
2131
- base_log_dict["loss_train"] = loss.item()
2132
-
2133
- for loss_key, loss_val in base_log_dict.items():
2134
- log_dict[loss_key] += loss_val / self.n_tokens_print
2135
-
2136
- if self.use_policy_loss and policy_reward is not None:
2137
- log_dict["policy_loss"] += dqn_loss / self.n_tokens_print
2138
- log_dict["policy_reward"] += policy_reward.mean() / self.n_tokens_print
2139
-
2140
- if not loss_list:
2141
- if loss is not None:
2142
- log_dict["loss_0"] += loss / self.n_tokens_print
2143
- else:
2144
- log_dict["loss_final"] += nonzero_mean(loss_list[-1]) / self.n_tokens_print
2145
- log_dict["loss_talk"] += sum(nonzero_mean(cur_loss_item) for cur_loss_item in loss_list[-self.n_ahead_talk:]) / self.n_ahead_talk / self.n_tokens_print
2146
-
2147
- # also log relative losses to loss_0
2148
- if loss_list:
2149
- for i in range(len(loss_list)):
2150
- talk_idx = min(max(i - (self.n_ahead - 1), 0), len(talk_loss_list) - 1)
2151
- if not talk_loss_list:
2152
- cur_talk_loss = nonzero_mean(loss_list[0])
2153
- else:
2154
- cur_talk_loss = talk_loss_list[talk_idx]
2155
- log_dict[f"rel_loss_{i}"] += (nonzero_mean(loss_list[i]) - cur_talk_loss) / self.n_tokens_print
2156
- if self.training:
2157
- self.training_steps += 1
2158
- try:
2159
- # if self.training_steps % (self.gradient_accumulation_steps * 256) == 0:
2160
- if self.wandb_enabled:
2161
- if self.training_steps % (self.n_tokens_print) == 0 or not self.training:# and "0" in str(loss.device):
2162
- if not self.training:
2163
- new_log_dict = {}
2164
- for key in list(log_dict.keys()):
2165
- new_log_dict["eval_" + key] = log_dict[key]
2166
- log_dict = new_log_dict
2167
- log_dict["training_steps"] = self.training_steps
2168
- log_dict["batch_size"] = batch_size
2169
- log_dict["example_steps"] = self.training_steps * batch_size * self.gradient_accumulation_steps
2170
- if self.n_ahead > 1:
2171
- log_dict["compute_steps"] = self.training_steps * batch_size * (self.n_ahead + self.n_ahead_talk - 1) * self.gradient_accumulation_steps
2172
- else: # There's no overhead for talk tokens if there's no thinking
2173
- log_dict["compute_steps"] = self.training_steps * batch_size * self.gradient_accumulation_steps
2174
- # remove all nans
2175
- for key in list(log_dict.keys()):
2176
- if log_dict[key] != log_dict[key]:
2177
- del log_dict[key]
2178
- if self.training:
2179
- wandb.log(log_dict)
2180
- if self.training:
2181
- self.log_dict = defaultdict(int)
2182
- else:
2183
- self.eval_log_dict = defaultdict(int)
2184
- except Exception as e:
2185
- pass
2186
-
2187
- if not self.training:
2188
- self.n_ahead_talk = n_ahead_talk_to_restore
2189
- self.n_passes = n_passes_to_restore
2190
  return CausalLMOutputWithPast(
2191
- loss=loss if loss is not None else None,
2192
- logits=(rm_logits if self.n_ahead > 1 else logits) if not self.output_logits_at_the_end else logits,
2193
  past_key_values=outputs.past_key_values,
2194
  hidden_states=outputs.hidden_states,
2195
  attentions=outputs.attentions,
2196
  )
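Aside on the removed Quiet-STaR forward pass above: each thought token is drawn with a straight-through Gumbel-softmax, and the resulting (soft) one-hot is pushed back through the embedding matrix so the sampling step stays differentiable. A minimal sketch of that step, with illustrative names rather than the module's real attributes:

```python
import torch
import torch.nn.functional as F

def sample_thought_embedding(rm_logits, embed_weight, tau=1.0, training=True):
    # rm_logits: (batch * seq_len, vocab) scores for the next thought token
    # embed_weight: (vocab, hidden) tied input-embedding matrix
    temperature = tau if training else 1e-3               # near-greedy at eval time
    probs_2d = F.gumbel_softmax(rm_logits, tau=temperature, hard=True, dim=-1)
    # hard=True: one-hot in the forward pass, soft gradients in the backward pass
    next_embeds = probs_2d @ embed_weight                 # differentiable embedding lookup
    return probs_2d.argmax(dim=-1), next_embeds
```

This mirrors the `F.gumbel_softmax(..., hard=True)` call and the `probabilities_2d @ embed_tokens.weight` mixing in the deleted code.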
2197
 
2198
-
2199
  def prepare_inputs_for_generation(
2200
  self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
2201
  ):
@@ -2211,7 +1210,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
2211
 
2212
  # Keep only the unprocessed tokens:
2213
  # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
2214
- # some of the inputs are exclusively passed as part of the cache (e.g. when passing inputs_embeds as
2215
  # input)
2216
  if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
2217
  input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
@@ -2265,9 +1264,9 @@ class QuietForCausalLM(QuietPreTrainedModel):
2265
 
2266
  @add_start_docstrings(
2267
  """
2268
- The Quiet Model transformer with a sequence classification head on top (linear layer).
2269
 
2270
- [`QuietForSequenceClassification`] uses the last token in order to do the classification, as other causal models
2271
  (e.g. GPT-2) do.
2272
 
2273
  Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -2276,14 +1275,14 @@ class QuietForCausalLM(QuietPreTrainedModel):
2276
  padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
2277
  each row of the batch).
2278
  """,
2279
- QUIET_START_DOCSTRING,
2280
  )
2281
- # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Quiet, LLAMA->QUIET
2282
- class QuietForSequenceClassification(QuietPreTrainedModel):
2283
  def __init__(self, config):
2284
  super().__init__(config)
2285
  self.num_labels = config.num_labels
2286
- self.model = QuietModel(config)
2287
  self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
2288
 
2289
  # Initialize weights and apply final processing
@@ -2295,7 +1294,7 @@ class QuietForSequenceClassification(QuietPreTrainedModel):
2295
  def set_input_embeddings(self, value):
2296
  self.model.embed_tokens = value
2297
 
2298
- @add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
2299
  def forward(
2300
  self,
2301
  input_ids: torch.LongTensor = None,
 
1
  # coding=utf-8
2
+ # Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
3
  #
4
  # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
  # and OPT implementations in this library. It has been modified from its
 
17
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
  # See the License for the specific language governing permissions and
19
  # limitations under the License.
20
+ """ PyTorch Mistral model."""
21
  import inspect
22
  import math
 
 
23
  import warnings
 
24
  from typing import List, Optional, Tuple, Union
25
 
26
  import torch
 
29
  from torch import nn
30
  from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
31
 
32
+ from ...activations import ACT2FN
33
+ from ...cache_utils import Cache, DynamicCache
34
+ from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
35
+ from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
36
+ from ...modeling_utils import PreTrainedModel
37
+ from ...utils import (
38
  add_start_docstrings,
39
  add_start_docstrings_to_model_forward,
40
  is_flash_attn_2_available,
 
42
  logging,
43
  replace_return_docstrings,
44
  )
45
+ from .configuration_mistral import MistralConfig
46
 
47
 
48
  if is_flash_attn_2_available():
 
54
 
55
  logger = logging.get_logger(__name__)
56
 
57
+ _CONFIG_FOR_DOC = "MistralConfig"
 
58
 
59
 
60
  # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 
62
  seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
63
  indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
64
  max_seqlen_in_batch = seqlens_in_batch.max().item()
65
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
66
  return (
67
  indices,
68
  cu_seqlens,
 
70
  )
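A quick worked example of the unpadding helper above (nothing model-specific is assumed): the per-row token counts become sequence boundaries via a prefix sum, which is what the flash-attention unpadded path consumes.

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]], dtype=torch.int32)
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)           # tensor([3, 2])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
print(indices)      # tensor([0, 1, 2, 4, 5]): positions of real (non-pad) tokens
print(cu_seqlens)   # tensor([0, 3, 5], dtype=torch.int32): per-sequence boundaries
```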
71
 
72
 
73
+ # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mistral
74
+ class MistralRMSNorm(nn.Module):
75
  def __init__(self, hidden_size, eps=1e-6):
76
  """
77
+ MistralRMSNorm is equivalent to T5LayerNorm
78
  """
79
  super().__init__()
80
  self.weight = nn.Parameter(torch.ones(hidden_size))
 
85
  hidden_states = hidden_states.to(torch.float32)
86
  variance = hidden_states.pow(2).mean(-1, keepdim=True)
87
  hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
88
+ return self.weight * hidden_states.to(input_dtype)
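The RMSNorm above rescales by the root mean square over the hidden dimension only; there is no mean subtraction and no bias. A functional sketch equivalent to the forward shown:

```python
import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    input_dtype = x.dtype
    x = x.to(torch.float32)                        # compute the statistics in fp32
    variance = x.pow(2).mean(-1, keepdim=True)     # mean of squares over the hidden dim
    x = x * torch.rsqrt(variance + eps)
    return weight * x.to(input_dtype)              # learned per-channel gain
```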
89
 
90
 
91
+ # copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral
92
+ # TODO @Arthur no longer copied from LLama after static cache
93
+ class MistralRotaryEmbedding(nn.Module):
94
  def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
95
  super().__init__()
96
 
97
  self.dim = dim
98
  self.max_position_embeddings = max_position_embeddings
99
  self.base = base
100
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
101
  self.register_buffer("inv_freq", inv_freq, persistent=False)
102
 
103
  # Build here to make `torch.jit.trace` work.
 
107
 
108
  def _set_cos_sin_cache(self, seq_len, device, dtype):
109
  self.max_seq_len_cached = seq_len
110
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
111
 
112
  freqs = torch.outer(t, self.inv_freq)
113
  # Different from paper, but it uses a different permutation in order to obtain the same calculation
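The permutation note above refers to the cos/sin cache layout: instead of interleaving even/odd frequencies as in the RoPE paper, the frequencies are duplicated along the last dimension. A small sketch of how the cache is built; the `torch.cat((freqs, freqs), dim=-1)` line is the standard Llama/Mistral layout and is assumed here, since the hunk cuts off before it:

```python
import torch

dim, base, seq_len = 8, 10000.0, 6
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))   # (dim // 2,)
t = torch.arange(seq_len).float()
freqs = torch.outer(t, inv_freq)                                     # (seq_len, dim // 2)
emb = torch.cat((freqs, freqs), dim=-1)                              # duplicated, not interleaved
cos_cached, sin_cached = emb.cos(), emb.sin()                        # cached up to max_seq_len
```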
 
134
  return torch.cat((-x2, x1), dim=-1)
135
 
136
 
137
+ # copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
138
+ # TODO @Arthur no longer copied from LLama after static cache
139
  def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
140
  """Applies Rotary Position Embedding to the query and key tensors.
141
 
 
164
  return q_embed, k_embed
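With `rotate_half` defined as above, the body of `apply_rotary_pos_emb` amounts to rotating every query/key feature pair by position-dependent angles. A stripped-down sketch; gathering `cos`/`sin` by `position_ids` and the unsqueeze for broadcasting are left out:

```python
import torch

def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(q, k, cos, sin):
    # cos/sin are assumed already broadcastable to q and k
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
```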
165
 
166
 
167
+ class MistralMLP(nn.Module):
168
  def __init__(self, config):
169
  super().__init__()
170
  self.config = config
 
192
  return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
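The `repeat_kv` helper ending above is the grouped-query-attention plumbing: the key/value heads are tiled `n_rep = num_heads // num_key_value_heads` times so they line up with the query heads. A self-contained sketch of the same expansion:

```python
import torch

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # (batch, num_key_value_heads, seq_len, head_dim)
    #   -> (batch, num_key_value_heads * n_rep, seq_len, head_dim)
    batch, num_kv_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    expanded = hidden_states[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, slen, head_dim)
    return expanded.reshape(batch, num_kv_heads * n_rep, slen, head_dim)
```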
193
 
194
 
195
+ class MistralAttention(nn.Module):
196
  """
197
  Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
198
  and "Generating Long Sequences with Sparse Transformers".
199
  """
200
 
201
+ def __init__(self, config: MistralConfig, layer_idx: Optional[int] = None):
202
  super().__init__()
203
  self.config = config
204
  self.layer_idx = layer_idx
205
  if layer_idx is None:
206
  logger.warning_once(
207
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
208
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
209
  "when creating this class."
210
  )
211
 
 
229
  self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
230
  self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
231
 
232
+ self.rotary_emb = MistralRotaryEmbedding(
233
  self.head_dim,
234
  max_position_embeddings=self.max_position_embeddings,
235
  base=self.rope_theta,
 
320
  return attn_output, attn_weights, past_key_value
321
 
322
 
323
+ class MistralFlashAttention2(MistralAttention):
324
  """
325
+ Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stays
326
  untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
327
  flash attention and deal with padding tokens in case the input contains any of them.
328
  """
 
496
  attention_mask (`torch.Tensor`):
497
  The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
498
  position of padding tokens and 1 for the position of non-padding tokens.
499
+ dropout (`float`):
500
  Attention dropout
501
  softmax_scale (`float`, *optional*):
502
  The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
 
614
  )
615
 
616
 
617
+ # copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral
618
+ # TODO @Arthur no longer copied from LLama after static cache
619
+ class MistralSdpaAttention(MistralAttention):
620
  """
621
+ Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
622
+ `MistralAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
623
  SDPA API.
624
  """
625
 
626
+ # Adapted from MistralAttention.forward
627
  def forward(
628
  self,
629
  hidden_states: torch.Tensor,
 
636
  if output_attentions:
637
  # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
638
  logger.warning_once(
639
+ "MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
640
  'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
641
  )
642
  return super().forward(
 
689
  query_states,
690
  key_states,
691
  value_states,
692
+ attn_mask=attention_mask,
693
  dropout_p=self.attention_dropout if self.training else 0.0,
694
  # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
695
  is_causal=self.is_causal and attention_mask is None and q_len > 1,
696
  )
697
 
698
  attn_output = attn_output.transpose(1, 2).contiguous()
699
+ attn_output = attn_output.view(bsz, q_len, self.hidden_size)
700
 
701
  attn_output = self.o_proj(attn_output)
702
 
703
  return attn_output, None, past_key_value
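The SDPA attention variant above ultimately boils down to one call to `torch.nn.functional.scaled_dot_product_attention`. A minimal sketch of that call and the reshape that follows, assuming inputs already shaped `(batch, num_heads, seq_len, head_dim)`:

```python
import torch
import torch.nn.functional as F

def sdpa_attention(query, key, value, attention_mask=None, dropout_p=0.0):
    q_len = query.shape[-2]
    attn_output = F.scaled_dot_product_attention(
        query, key, value,
        attn_mask=attention_mask,                          # additive 4D mask or None
        dropout_p=dropout_p,
        is_causal=attention_mask is None and q_len > 1,    # mirrors the check above
    )
    batch, num_heads, _, head_dim = attn_output.shape
    # (batch, num_heads, q_len, head_dim) -> (batch, q_len, hidden_size)
    return attn_output.transpose(1, 2).reshape(batch, q_len, num_heads * head_dim)
```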
704
 
705
 
706
+ MISTRAL_ATTENTION_CLASSES = {
707
+ "eager": MistralAttention,
708
+ "flash_attention_2": MistralFlashAttention2,
709
+ "sdpa": MistralSdpaAttention,
710
  }
711
 
712
 
713
+ class MistralDecoderLayer(nn.Module):
714
+ def __init__(self, config: MistralConfig, layer_idx: int):
715
  super().__init__()
716
  self.hidden_size = config.hidden_size
717
 
718
+ self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
719
 
720
+ self.mlp = MistralMLP(config)
721
+ self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
722
+ self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
723
 
724
  def forward(
725
  self,
 
762
  output_attentions=output_attentions,
763
  use_cache=use_cache,
764
  )
765
+ hidden_states = residual + hidden_states
766
 
767
  # Fully Connected
768
  residual = hidden_states
 
781
  return outputs
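The decoder layer above follows the usual pre-norm residual pattern: normalize, attend, add the residual back, then normalize, run the MLP, and add the residual again. Schematically, as a sketch with the attention call's mask/position/cache arguments abstracted away:

```python
def decoder_block(x, input_layernorm, self_attn, post_attention_layernorm, mlp):
    # Self-attention sub-block (pre-norm + residual)
    residual = x
    x = residual + self_attn(input_layernorm(x))
    # Feed-forward sub-block (pre-norm + residual)
    residual = x
    x = residual + mlp(post_attention_layernorm(x))
    return x
```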
782
 
783
 
784
+ MISTRAL_START_DOCSTRING = r"""
785
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
786
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
787
  etc.)
 
791
  and behavior.
792
 
793
  Parameters:
794
+ config ([`MistralConfig`]):
795
  Model configuration class with all the parameters of the model. Initializing with a config file does not
796
  load the weights associated with the model, only the configuration. Check out the
797
  [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 
799
 
800
 
801
  @add_start_docstrings(
802
+ "The bare Mistral Model outputting raw hidden-states without any specific head on top.",
803
+ MISTRAL_START_DOCSTRING,
804
  )
805
+ class MistralPreTrainedModel(PreTrainedModel):
806
+ config_class = MistralConfig
807
  base_model_prefix = "model"
808
  supports_gradient_checkpointing = True
809
+ _no_split_modules = ["MistralDecoderLayer"]
810
  _skip_keys_device_placement = "past_key_values"
811
  _supports_flash_attn_2 = True
812
  _supports_sdpa = True
 
824
  module.weight.data[module.padding_idx].zero_()
825
 
826
 
827
+ MISTRAL_INPUTS_DOCSTRING = r"""
828
  Args:
829
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
830
  Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
 
895
 
896
 
897
  @add_start_docstrings(
898
+ "The bare Mistral Model outputting raw hidden-states without any specific head on top.",
899
+ MISTRAL_START_DOCSTRING,
900
  )
901
+ class MistralModel(MistralPreTrainedModel):
902
  """
903
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`]
904
 
905
  Args:
906
+ config: MistralConfig
907
  """
908
 
909
+ def __init__(self, config: MistralConfig):
910
  super().__init__(config)
911
  self.padding_idx = config.pad_token_id
912
  self.vocab_size = config.vocab_size
913
 
914
  self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
915
  self.layers = nn.ModuleList(
916
+ [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
917
  )
918
  self._attn_implementation = config._attn_implementation
919
+ self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
920
 
921
  self.gradient_checkpointing = False
922
  # Initialize weights and apply final processing
 
928
  def set_input_embeddings(self, value):
929
  self.embed_tokens = value
930
 
931
+ @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
932
  def forward(
933
  self,
934
  input_ids: torch.LongTensor = None,
 
991
  if is_padding_right:
992
  raise ValueError(
993
  "You are attempting to perform batched generation with padding_side='right'"
994
+ " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
995
  " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
996
  )
997
 
998
  if self._attn_implementation == "flash_attention_2":
999
  # 2d mask is passed through the layers
1000
  attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
1001
+ elif self._attn_implementation == "sdpa" and not output_attentions:
1002
  # output_attentions=True can not be supported when using SDPA, and we fall back on
1003
  # the manual implementation that requires a 4D causal mask in all cases.
1004
  attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
 
1007
  inputs_embeds,
1008
  past_key_values_length,
1009
  )
1010
+ else:
1011
  # 4d mask is passed through the layers
1012
  attention_mask = _prepare_4d_causal_attention_mask(
1013
  attention_mask,
 
1075
  attentions=all_self_attns,
1076
  )
1077
 
1078
 
1079
+ class MistralForCausalLM(MistralPreTrainedModel):
1080
  _tied_weights_keys = ["lm_head.weight"]
1081
 
1082
  def __init__(self, config):
1083
  super().__init__(config)
1084
+ self.model = MistralModel(config)
1085
  self.vocab_size = config.vocab_size
1086
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
 
1087
 
1088
  # Initialize weights and apply final processing
1089
  self.post_init()
 
1106
  def get_decoder(self):
1107
  return self.model
1108
 
1109
+ @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
 
1110
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1111
  def forward(
1112
  self,
 
1133
  Example:
1134
 
1135
  ```python
1136
+ >>> from transformers import AutoTokenizer, MistralForCausalLM
1137
 
1138
+ >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
1139
+ >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
1140
 
1141
  >>> prompt = "Hey, are you conscious? Can you talk to me?"
1142
  >>> inputs = tokenizer(prompt, return_tensors="pt")
 
1146
  >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1147
  "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1148
  ```"""
 
1149
 
1150
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1151
  output_hidden_states = (
 
1153
  )
1154
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1155
 
1156
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1157
+ outputs = self.model(
1158
+ input_ids=input_ids,
1159
+ attention_mask=attention_mask,
1160
+ position_ids=position_ids,
1161
+ past_key_values=past_key_values,
1162
+ inputs_embeds=inputs_embeds,
1163
+ use_cache=use_cache,
1164
+ output_attentions=output_attentions,
1165
+ output_hidden_states=output_hidden_states,
1166
+ return_dict=return_dict,
1167
+ )
1168
 
1169
+ hidden_states = outputs[0]
1170
+ logits = self.lm_head(hidden_states)
1171
+ logits = logits.float()
1172
 
1173
  loss = None
1174
+ if labels is not None:
1175
+ # Shift so that tokens < n predict n
1176
+ shift_logits = logits[..., :-1, :].contiguous()
1177
+ shift_labels = labels[..., 1:].contiguous()
1178
+ # Flatten the tokens
1179
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1180
+ shift_labels = shift_labels.view(-1)
1181
+ # Ensure tensors are on the same device
1182
+ shift_labels = shift_labels.to(shift_logits.device)
1183
+ loss_fct = CrossEntropyLoss()
1184
+ loss = loss_fct(shift_logits, shift_labels)
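The label handling above is the standard next-token objective: logits at position t are scored against the token at position t+1, and any label equal to -100 (the default `ignore_index` of `CrossEntropyLoss`) is skipped. A tiny illustration of the shapes involved:

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size = 11
logits = torch.randn(2, 5, vocab_size)            # (batch, seq_len, vocab)
labels = torch.randint(0, vocab_size, (2, 5))     # aligned with the input positions

shift_logits = logits[..., :-1, :].contiguous()   # predictions for positions 0..3
shift_labels = labels[..., 1:].contiguous()       # targets are positions 1..4

loss_fct = CrossEntropyLoss()                     # ignore_index defaults to -100
loss = loss_fct(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
```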
 
1185
 
1186
  if not return_dict:
1187
  output = (logits,) + outputs[1:]
1188
  return (loss,) + output if loss is not None else output
1189
+
 
1190
  return CausalLMOutputWithPast(
1191
+ loss=loss,
1192
+ logits=logits,
1193
  past_key_values=outputs.past_key_values,
1194
  hidden_states=outputs.hidden_states,
1195
  attentions=outputs.attentions,
1196
  )
1197
 
 
1198
  def prepare_inputs_for_generation(
1199
  self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
1200
  ):
 
1210
 
1211
  # Keep only the unprocessed tokens:
1212
  # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
1213
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
1214
  # input)
1215
  if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
1216
  input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
 
1264
 
1265
  @add_start_docstrings(
1266
  """
1267
+ The Mistral Model transformer with a sequence classification head on top (linear layer).
1268
 
1269
+ [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1270
  (e.g. GPT-2) do.
1271
 
1272
  Since it does classification on the last token, it requires to know the position of the last token. If a
 
1275
  padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1276
  each row of the batch).
1277
  """,
1278
+ MISTRAL_START_DOCSTRING,
1279
  )
1280
+ # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mistral, LLAMA->MISTRAL
1281
+ class MistralForSequenceClassification(MistralPreTrainedModel):
1282
  def __init__(self, config):
1283
  super().__init__(config)
1284
  self.num_labels = config.num_labels
1285
+ self.model = MistralModel(config)
1286
  self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1287
 
1288
  # Initialize weights and apply final processing
 
1294
  def set_input_embeddings(self, value):
1295
  self.model.embed_tokens = value
1296
 
1297
+ @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
1298
  def forward(
1299
  self,
1300
  input_ids: torch.LongTensor = None,