Crystalcareai committed
Commit 3328933 · verified · 1 Parent(s): 5d9b936

Update modeling_quiet.py

Files changed (1)
  1. modeling_quiet.py +113 -114
modeling_quiet.py CHANGED
@@ -1311,120 +1311,119 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
         elif isinstance(module, nn.Embedding):
             nn.init.xavier_uniform_(module.weight)
 
-    @torch.no_grad()
-
-    def infer(
-        self,
-        input_ids: torch.LongTensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ):
-        batch_size, seq_len = input_ids.shape
-
-        # Save the original input_ids and attention_mask for later use
-        original_input_ids = input_ids.clone()
-        original_attention_mask = attention_mask.clone() if attention_mask is not None else None
-
-        # Append the start thought token to the input sequence
-        start_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|startthought|>")
-        input_ids = torch.cat([input_ids, torch.tensor([[start_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
-        seq_len += 1
-
-        # Update the attention mask
-        if attention_mask is not None:
-            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
-
-        # Generate the continuation
-        continuation_length = self.n_ahead - 2
-        new_key_values = past_key_values
-
-        # Initialize next_token_id with a default value
-        next_token_id = torch.zeros(batch_size, dtype=torch.long).to(input_ids.device)
-
-        start_time = time.time()
-        for continuation_idx in range(continuation_length):
-            outputs = self.model(
-                input_ids=input_ids if continuation_idx == 0 else next_token_id.unsqueeze(-1).to(input_ids.device),
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_values=new_key_values,
-                inputs_embeds=inputs_embeds,
-                use_cache=True,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-                return_dict=return_dict,
-            )
-            new_key_values = outputs.past_key_values
-
-            hidden_states = outputs[0]
-
-            logits = self.lm_head(hidden_states)
-            logits = logits[:, -1, :]  # Only consider the last token
-
-            # Apply Gumbel-Softmax to the logits
-            next_token_logits = F.gumbel_softmax(logits, tau=self.gumbel_temperature, hard=True, dim=-1)
-            next_token_id = torch.argmax(next_token_logits, dim=-1)
-
-            # Append the generated token to the input sequence
-            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
-            seq_len += 1
-
-            # Update the attention mask
-            if attention_mask is not None:
-                attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
-
-        # Append the end thought token to the input sequence
-        end_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
-        input_ids = torch.cat([input_ids, torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
-        seq_len += 1
-
-        # Update the attention mask
-        if attention_mask is not None:
-            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
-
-        # Get the hidden states before and after the thought
-        outputs_before = self.model(
-            input_ids=original_input_ids,
-            attention_mask=original_attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states_before = outputs_before[0][:, -1:, :]
-
-        # two new tokens: last continuation token and end thought token
-        outputs_after = self.model(
-            input_ids=torch.cat([next_token_id.unsqueeze(-1).to(input_ids.device), torch.tensor([[end_thought_token_id]]).to(input_ids.device)], dim=-1),
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=new_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states_after = outputs_after[0][:, -1:, :]
-
-        # Apply the talk head to get the mixing weight
-        mixing_weight = self.talk_head[0](torch.cat([hidden_states_before, hidden_states_after], dim=-1))
-
-        # Apply the mixing weight to the hidden states
-        mixed_hidden_states = (1 - mixing_weight) * hidden_states_before + mixing_weight * hidden_states_after
-
-        # Apply the language model head to get the final logits
-        logits = self.lm_head(mixed_hidden_states)
-        return logits
+    @torch.no_grad()
+    def infer(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        batch_size, seq_len = input_ids.shape
+
+        # Save the original input_ids and attention_mask for later use
+        original_input_ids = input_ids.clone()
+        original_attention_mask = attention_mask.clone() if attention_mask is not None else None
+
+        # Append the start thought token to the input sequence
+        start_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|startthought|>")
+        input_ids = torch.cat([input_ids, torch.tensor([[start_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
+        seq_len += 1
+
+        # Update the attention mask
+        if attention_mask is not None:
+            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+
+        # Generate the continuation
+        continuation_length = self.n_ahead - 2
+        new_key_values = past_key_values
+
+        # Initialize next_token_id with a default value
+        next_token_id = torch.zeros(batch_size, dtype=torch.long).to(input_ids.device)
+
+        start_time = time.time()
+        for continuation_idx in range(continuation_length):
+            outputs = self.model(
+                input_ids=input_ids if continuation_idx == 0 else next_token_id.unsqueeze(-1).to(input_ids.device),
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=new_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=True,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+            new_key_values = outputs.past_key_values
+
+            hidden_states = outputs[0]
+
+            logits = self.lm_head(hidden_states)
+            logits = logits[:, -1, :]  # Only consider the last token
+
+            # Apply Gumbel-Softmax to the logits
+            next_token_logits = F.gumbel_softmax(logits, tau=self.gumbel_temperature, hard=True, dim=-1)
+            next_token_id = torch.argmax(next_token_logits, dim=-1)
+
+            # Append the generated token to the input sequence
+            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
+            seq_len += 1
+
+            # Update the attention mask
+            if attention_mask is not None:
+                attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+
+        # Append the end thought token to the input sequence
+        end_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
+        input_ids = torch.cat([input_ids, torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
+        seq_len += 1
+
+        # Update the attention mask
+        if attention_mask is not None:
+            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+
+        # Get the hidden states before and after the thought
+        outputs_before = self.model(
+            input_ids=original_input_ids,
+            attention_mask=original_attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states_before = outputs_before[0][:, -1:, :]
+
+        # two new tokens: last continuation token and end thought token
+        outputs_after = self.model(
+            input_ids=torch.cat([next_token_id.unsqueeze(-1).to(input_ids.device), torch.tensor([[end_thought_token_id]]).to(input_ids.device)], dim=-1),
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=new_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states_after = outputs_after[0][:, -1:, :]
+
+        # Apply the talk head to get the mixing weight
+        mixing_weight = self.talk_head[0](torch.cat([hidden_states_before, hidden_states_after], dim=-1))
+
+        # Apply the mixing weight to the hidden states
+        mixed_hidden_states = (1 - mixing_weight) * hidden_states_before + mixing_weight * hidden_states_after
+
+        # Apply the language model head to get the final logits
+        logits = self.lm_head(mixed_hidden_states)
+        return logits
 
     @add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
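
For reference, a minimal sketch of how the infer() method shown in this diff might be called. It is not part of the commit: the checkpoint path is a placeholder, and attaching the tokenizer to the model instance is an assumption based on the self.tokenizer lookups inside infer().

# Hedged usage sketch (illustrative only, not from the commit).
# "path/to/quiet-star-checkpoint" is a placeholder, not a real model id.
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "path/to/quiet-star-checkpoint"  # placeholder path
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)
model.tokenizer = tokenizer  # assumed: infer() resolves <|startthought|>/<|endthought|> via self.tokenizer
model.eval()

inputs = tokenizer("2 + 2 =", return_tensors="pt")
logits = model.infer(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
)
# infer() returns logits of shape (batch, 1, vocab_size) for the token that
# follows the hidden thought; greedy-pick a single next token from them.
next_token_id = logits[:, -1, :].argmax(dim=-1)
print(tokenizer.decode(next_token_id))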