Crystalcareai committed on
Commit 1eb16fa · verified · 1 Parent(s): 9b4bbbb

Update modeling_quiet.py

Files changed (1)
  1. modeling_quiet.py +110 -112
modeling_quiet.py CHANGED
@@ -1311,119 +1311,117 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
         elif isinstance(module, nn.Embedding):
             nn.init.xavier_uniform_(module.weight)
 
-    @torch.no_grad()
-    def infer(
-        self,
-        input_ids: torch.LongTensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ):
-        batch_size, seq_len = input_ids.shape
-
-        # Save the original input_ids and attention_mask for later use
-        original_input_ids = input_ids.clone()
-        original_attention_mask = attention_mask.clone() if attention_mask is not None else None
-
-        # Append the start thought token to the input sequence
-        start_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|startthought|>")
-        input_ids = torch.cat([input_ids, torch.tensor([[start_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
-        seq_len += 1
-
-        # Update the attention mask
-        if attention_mask is not None:
-            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
-
-        # Generate the continuation
-        continuation_length = self.n_ahead - 2
-        new_key_values = past_key_values
-
-        # Initialize next_token_id with a default value
-        next_token_id = torch.zeros(batch_size, dtype=torch.long).to(input_ids.device)
-
-        start_time = time.time()
-        for continuation_idx in range(continuation_length):
-            outputs = self.model(
-                input_ids=input_ids if continuation_idx == 0 else next_token_id.unsqueeze(-1).to(input_ids.device),
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_values=new_key_values,
-                inputs_embeds=inputs_embeds,
-                use_cache=True,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-                return_dict=return_dict,
-            )
-            new_key_values = outputs.past_key_values
-
-            hidden_states = outputs[0]
-
-            logits = self.lm_head(hidden_states)
-            logits = logits[:, -1, :]  # Only consider the last token
-
-            # Apply Gumbel-Softmax to the logits
-            next_token_logits = F.gumbel_softmax(logits, tau=self.gumbel_temperature, hard=True, dim=-1)
-            next_token_id = torch.argmax(next_token_logits, dim=-1)
-
-            # Append the generated token to the input sequence
-            # input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
-            seq_len += 1
-
-            # Update the attention mask
-            if attention_mask is not None:
-                attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
-
-        # Append the end thought token to the input sequence
-        end_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
-        input_ids = torch.cat([input_ids, torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
-        seq_len += 1
-
-        # Update the attention mask
-        if attention_mask is not None:
-            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
-
-        # Get the hidden states before and after the thought
-        outputs_before = self.model(
-            input_ids=original_input_ids,
-            attention_mask=original_attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states_before = outputs_before[0][:, -1:, :]
-
-        # two new tokens: last continuation token and end thought token
-        outputs_after = self.model(
-            input_ids=torch.cat([next_token_id.unsqueeze(-1).to(input_ids.device), torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1),
-            attention_mask=torch.cat([attention_mask[:, -1:], torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1),
-            position_ids=position_ids,
-            past_key_values=new_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states_after = outputs_after[0][:, -1:, :]
-
-        # Apply the talk head to get the mixing weight
-        mixing_weight = self.talk_head[0](torch.cat([hidden_states_before, hidden_states_after], dim=-1))
-
-        # Apply the mixing weight to the hidden states
-        mixed_hidden_states = (1 - mixing_weight) * hidden_states_before + mixing_weight * hidden_states_after
-
-        # Apply the language model head to get the final logits
-        logits = self.lm_head(mixed_hidden_states)
-        return logits
+    @torch.no_grad()
+    def infer(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        batch_size, seq_len = input_ids.shape
+
+        # Save the original input_ids and attention_mask for later use
+        original_input_ids = input_ids.clone()
+        original_attention_mask = attention_mask.clone() if attention_mask is not None else None
+
+        # Append the start thought token to the input sequence
+        start_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|startthought|>")
+        input_ids = torch.cat([input_ids, torch.tensor([[start_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
+        seq_len += 1
+
+        # Update the attention mask
+        if attention_mask is not None:
+            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+
+        # Generate the continuation
+        continuation_length = self.n_ahead - 2
+        new_key_values = past_key_values
+
+        start_time = time.time()
+        for continuation_idx in range(continuation_length):
+            outputs = self.model(
+                input_ids=input_ids if continuation_idx == 0 else next_token_id.unsqueeze(-1).to(input_ids.device),
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=new_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=True,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+            new_key_values = outputs.past_key_values
+
+            hidden_states = outputs[0]
+
+            logits = self.lm_head(hidden_states)
+            logits = logits[:, -1, :]  # Only consider the last token
+
+            # Apply Gumbel-Softmax to the logits
+            next_token_logits = F.gumbel_softmax(logits, tau=self.gumbel_temperature, hard=True, dim=-1)
+            next_token_id = torch.argmax(next_token_logits, dim=-1)
+
+            # Append the generated token to the input sequence
+            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
+            seq_len += 1
+
+            # Update the attention mask
+            if attention_mask is not None:
+                attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+
+        # Append the end thought token to the input sequence
+        end_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
+        input_ids = torch.cat([input_ids, torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
+        seq_len += 1
+
+        # Update the attention mask
+        if attention_mask is not None:
+            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+
+        # Get the hidden states before and after the thought
+        outputs_before = self.model(
+            input_ids=original_input_ids,
+            attention_mask=original_attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states_before = outputs_before[0][:, -1:, :]
+
+        # two new tokens: last continuation token and end thought token
+        outputs_after = self.model(
+            input_ids=torch.cat([next_token_id.unsqueeze(-1).to(input_ids.device), torch.tensor(end_thought_token_id).unsqueeze(-1).unsqueeze(-1).to(input_ids.device)], dim=-1),
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=new_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states_after = outputs_after[0][:, -1:, :]
+
+        # Apply the talk head to get the mixing weight
+        mixing_weight = self.talk_head[0](torch.cat([hidden_states_before, hidden_states_after], dim=-1))
+
+        # Apply the mixing weight to the hidden states
+        mixed_hidden_states = (1 - mixing_weight) * hidden_states_before + mixing_weight * hidden_states_after
+
+        # Apply the language model head to get the final logits
+        logits = self.lm_head(mixed_hidden_states)
+        return logits
 
     @add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
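
For reference, a minimal sketch of how the updated infer method might be called once this custom modeling code is loaded with trust_remote_code. The repo id, prompt, and the explicit model.tokenizer assignment are illustrative assumptions and are not part of this commit.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "path/to/quiet-star-checkpoint"  # placeholder: substitute the actual repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
model.eval()

# infer() looks up "<|startthought|>" / "<|endthought|>" through self.tokenizer,
# so attach the tokenizer to the model if it is not already set (assumption).
model.tokenizer = tokenizer

inputs = tokenizer("The capital of France is", return_tensors="pt")

# Returns logits of shape (batch, 1, vocab_size) after mixing the pre- and
# post-thought hidden states with the talk head.
logits = model.infer(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
)

next_token_id = logits[:, -1, :].argmax(dim=-1)
print(tokenizer.decode(next_token_id))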