@torch.no_grad()
    def infer(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Sample a short "thought" and return logits mixed from before/after it.

        The method appends the ``<|startthought|>`` token to ``input_ids``,
        samples ``self.n_ahead - 2`` continuation tokens one at a time (hard
        Gumbel-Softmax over ``self.lm_head`` logits, reusing the key/value
        cache returned by ``self.model``), then closes the thought with
        ``<|endthought|>``. The last-position hidden state of the original
        prompt ("before") and of the closed thought ("after") are blended
        with the weight produced by ``self.talk_head[0]`` and projected
        through ``self.lm_head``.

        Args:
            input_ids: 2-D tensor of token ids; dim 0 is used as batch size.
            attention_mask: Optional mask for ``input_ids``. One extra column
                of ones (same dtype/device as the mask) is appended for every
                token this method adds.
            position_ids: Forwarded unchanged to every ``self.model`` call.
            past_key_values: Cache forwarded to the first generation step and
                to the "before" pass.
            inputs_embeds: Forwarded unchanged to every ``self.model`` call.
            use_cache: Forwarded to the "before"/"after" passes; generation
                steps always run with ``use_cache=True``.
            output_attentions: Forwarded to ``self.model``.
            output_hidden_states: Forwarded to ``self.model``.
            return_dict: Forwarded to ``self.model``.

        Returns:
            ``self.lm_head`` applied to the mixed hidden state of the last
            position (the sequence dimension is kept with size 1). When
            ``self.n_ahead - 2 <= 0`` no thought is generated and the logits
            of the last position of the original prompt are returned.
        """
        # NOTE(review): the same ``position_ids`` / ``inputs_embeds`` are passed
        # to calls whose ``input_ids`` have different lengths; this presumably
        # only works when both are None -- confirm against callers.
        batch_size = input_ids.shape[0]
        device = input_ids.device
        ids_dtype = input_ids.dtype

        def _token_column(token_id):
            # (batch_size, 1) column holding one special-token id per row.
            return torch.full((batch_size, 1), token_id, dtype=ids_dtype, device=device)

        def _extend_mask(mask):
            # Grow the mask by one attended position, keeping its dtype/device.
            if mask is None:
                return None
            return torch.cat([mask, mask.new_ones((batch_size, 1))], dim=-1)

        # Keep the untouched prompt for the "before" pass.
        original_input_ids = input_ids.clone()
        original_attention_mask = attention_mask.clone() if attention_mask is not None else None

        # Open the thought: prompt + <|startthought|>.
        start_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|startthought|>")
        step_input_ids = torch.cat([input_ids, _token_column(start_thought_token_id)], dim=-1)
        attention_mask = _extend_mask(attention_mask)

        # Sample the thought tokens. The first step feeds the whole prompt;
        # later steps feed only the newly sampled token and rely on the cache.
        continuation_length = self.n_ahead - 2
        new_key_values = past_key_values
        next_token_id = None  # stays None when no continuation step runs
        for _ in range(continuation_length):
            outputs = self.model(
                input_ids=step_input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_values=new_key_values,
                inputs_embeds=inputs_embeds,
                use_cache=True,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            new_key_values = outputs.past_key_values

            # Logits for the last position only.
            logits = self.lm_head(outputs[0])[:, -1, :]

            # Hard Gumbel-Softmax yields a one-hot sample; argmax recovers its id.
            one_hot_sample = F.gumbel_softmax(logits, tau=self.gumbel_temperature, hard=True, dim=-1)
            next_token_id = torch.argmax(one_hot_sample, dim=-1)

            # The sampled token is the next step's input and one more attended position.
            step_input_ids = next_token_id.unsqueeze(-1).to(device)
            attention_mask = _extend_mask(attention_mask)

        # Hidden state of the last prompt position, without any thought.
        outputs_before = self.model(
            input_ids=original_input_ids,
            attention_mask=original_attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states_before = outputs_before[0][:, -1:, :]

        if next_token_id is None:
            # No thought was generated (continuation_length <= 0): fall back to
            # the logits of the last position of the original input.
            return self.lm_head(hidden_states_before)

        # Close the thought. The last sampled token has not been fed to the
        # model yet, so it is sent together with <|endthought|> on top of the cache.
        end_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
        attention_mask = _extend_mask(attention_mask)
        closing_input_ids = torch.cat(
            [next_token_id.unsqueeze(-1).to(device), _token_column(end_thought_token_id)],
            dim=-1,
        )
        outputs_after = self.model(
            input_ids=closing_input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=new_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states_after = outputs_after[0][:, -1:, :]

        # The talk head sees both states and produces the interpolation weight.
        mixing_weight = self.talk_head[0](torch.cat([hidden_states_before, hidden_states_after], dim=-1))
        mixed_hidden_states = (1 - mixing_weight) * hidden_states_before + mixing_weight * hidden_states_after

        # Project the blended state to vocabulary logits.
        return self.lm_head(mixed_hidden_states)