Crystalcareai committed
Commit 42469fd · verified · 1 Parent(s): e281793

Update modeling_quiet.py

Files changed (1)
  1. modeling_quiet.py +173 -174
modeling_quiet.py CHANGED
@@ -1315,180 +1315,179 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
         return model


-    @torch.no_grad()
-    def infer(
-        self,
-        input_ids: torch.LongTensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ):
-        batch_size, seq_len = input_ids.shape
-
-        # Save the original input_ids and attention_mask for later use
-        original_input_ids = input_ids.clone()
-        original_attention_mask = attention_mask.clone() if attention_mask is not None else None
-
-        # Append the start thought token to the input sequence
-        start_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|startthought|>")
-        input_ids = torch.cat([input_ids, torch.tensor([[start_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
-        seq_len += 1
-
-        # Update the attention mask
-        if attention_mask is not None:
-            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
-
-        # Generate the continuation
-        continuation_length = self.n_ahead - 2
-        new_key_values = past_key_values
-        next_token_id_defined = False # Flag to check if next_token_id is defined
-
-        start_time = time.time()
-        for continuation_idx in range(continuation_length):
-            outputs = self.model(
-                input_ids=input_ids if continuation_idx == 0 else next_token_id.unsqueeze(-1).to(input_ids.device),
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_values=new_key_values,
-                inputs_embeds=inputs_embeds,
-                use_cache=True,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-                return_dict=return_dict,
-            )
-            new_key_values = outputs.past_key_values
-
-            hidden_states = outputs[0]
-
-            logits = self.lm_head(hidden_states)
-            logits = logits[:, -1, :] # Only consider the last token
-
-            # Apply Gumbel-Softmax to the logits
-            next_token_logits = F.gumbel_softmax(logits, tau=self.gumbel_temperature, hard=True, dim=-1)
-            next_token_id = torch.argmax(next_token_logits, dim=-1)
-
-            # Append the generated token to the input sequence
-            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
-            seq_len += 1
-
-            # Update the attention mask
-            if attention_mask is not None:
-                attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
-
-            next_token_id_defined = True # Set the flag to True after next_token_id is defined
-
-        # Check if next_token_id is defined before using it
-        if next_token_id_defined:
-            # Append the end thought token to the input sequence
-            end_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
-            input_ids = torch.cat([input_ids, torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
-            seq_len += 1
-
-            # Update the attention mask
-            if attention_mask is not None:
-                attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
-
-        # Get the hidden states before and after the thought
-        outputs_before = self.model(
-            input_ids=original_input_ids,
-            attention_mask=original_attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states_before = outputs_before[0][:, -1:, :]
-
-        # Only execute if next_token_id is defined
-        if next_token_id_defined:
-            outputs_after = self.model(
-                input_ids=torch.cat([next_token_id.unsqueeze(-1).to(input_ids.device), torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1),
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_values=new_key_values,
-                inputs_embeds=inputs_embeds,
-                use_cache=use_cache,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-                return_dict=return_dict,
-            )
-            hidden_states_after = outputs_after[0][:, -1:, :]
-
-            # Apply the talk head to get the mixing weight
-            mixing_weight = self.talk_head[0](torch.cat([hidden_states_before, hidden_states_after], dim=-1))
-
-            # Apply the mixing weight to the hidden states
-            mixed_hidden_states = (1 - mixing_weight) * hidden_states_before + mixing_weight * hidden_states_after
-
-            # Apply the language model head to get the final logits
-            logits = self.lm_head(mixed_hidden_states)
-
-            if not return_dict:
-                return logits
-
-            return BaseModelOutputWithPast(
-                logits=logits,
-                past_key_values=new_key_values,
-                hidden_states=outputs_after.hidden_states if output_hidden_states else None,
-                attentions=outputs_after.attentions if output_attentions else None,
-            )
-        else:
-            # Handle the case where next_token_id is not defined (e.g., continuation_length <= 0)
-            # This part of the code needs to be adapted based on how you want to handle this scenario.
-            # As a placeholder, returning the logits from the last state of the original input.
-            logits = self.lm_head(hidden_states_before)
-
-            if not return_dict:
-                return logits
-
-            return BaseModelOutputWithPast(
-                logits=logits,
-                past_key_values=past_key_values,
-                hidden_states=outputs_before.hidden_states if output_hidden_states else None,
-                attentions=outputs_before.attentions if output_attentions else None,
-            )
-
-    @torch.no_grad()
-    def generate(
-        self,
-        input_ids: torch.LongTensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict_in_generate: Optional[bool] = None,
-        **model_kwargs,
-    ) -> Union[BaseModelOutputWithPast, torch.LongTensor]:
-        return_dict = return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate
-
-        output = self.infer(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        if return_dict:
-            return output
-        else:
-            return output.logits
-    @add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
+    @torch.no_grad()
+    def infer(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        batch_size, seq_len = input_ids.shape
+
+        # Save the original input_ids and attention_mask for later use
+        original_input_ids = input_ids.clone()
+        original_attention_mask = attention_mask.clone() if attention_mask is not None else None
+
+        # Append the start thought token to the input sequence
+        start_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|startthought|>")
+        input_ids = torch.cat([input_ids, torch.tensor([[start_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
+        seq_len += 1
+
+        # Update the attention mask
+        if attention_mask is not None:
+            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+
+        # Generate the continuation
+        continuation_length = self.n_ahead - 2
+        new_key_values = past_key_values
+        next_token_id_defined = False # Flag to check if next_token_id is defined
+
+        start_time = time.time()
+        for continuation_idx in range(continuation_length):
+            outputs = self.model(
+                input_ids=input_ids if continuation_idx == 0 else next_token_id.unsqueeze(-1).to(input_ids.device),
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=new_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=True,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+            new_key_values = outputs.past_key_values
+
+            hidden_states = outputs[0]
+
+            logits = self.lm_head(hidden_states)
+            logits = logits[:, -1, :] # Only consider the last token
+
+            # Apply Gumbel-Softmax to the logits
+            next_token_logits = F.gumbel_softmax(logits, tau=self.gumbel_temperature, hard=True, dim=-1)
+            next_token_id = torch.argmax(next_token_logits, dim=-1)
+
+            # Append the generated token to the input sequence
+            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
+            seq_len += 1
+
+            # Update the attention mask
+            if attention_mask is not None:
+                attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+
+            next_token_id_defined = True # Set the flag to True after next_token_id is defined
+
+        # Check if next_token_id is defined before using it
+        if next_token_id_defined:
+            # Append the end thought token to the input sequence
+            end_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
+            input_ids = torch.cat([input_ids, torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
+            seq_len += 1
+
+            # Update the attention mask
+            if attention_mask is not None:
+                attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+
+        # Get the hidden states before and after the thought
+        outputs_before = self.model(
+            input_ids=original_input_ids,
+            attention_mask=original_attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states_before = outputs_before[0][:, -1:, :]
+
+        # Only execute if next_token_id is defined
+        if next_token_id_defined:
+            outputs_after = self.model(
+                input_ids=torch.cat([next_token_id.unsqueeze(-1).to(input_ids.device), torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1),
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=new_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+            hidden_states_after = outputs_after[0][:, -1:, :]
+
+            # Apply the talk head to get the mixing weight
+            mixing_weight = self.talk_head[0](torch.cat([hidden_states_before, hidden_states_after], dim=-1))
+
+            # Apply the mixing weight to the hidden states
+            mixed_hidden_states = (1 - mixing_weight) * hidden_states_before + mixing_weight * hidden_states_after
+
+            # Apply the language model head to get the final logits
+            logits = self.lm_head(mixed_hidden_states)
+
+            if not return_dict:
+                return logits
+
+            return BaseModelOutputWithPast(
+                logits=logits,
+                past_key_values=new_key_values,
+                hidden_states=outputs_after.hidden_states if output_hidden_states else None,
+                attentions=outputs_after.attentions if output_attentions else None,
+            )
+        else:
+            # Handle the case where next_token_id is not defined (e.g., continuation_length <= 0)
+            # This part of the code needs to be adapted based on how you want to handle this scenario.
+            # As a placeholder, returning the logits from the last state of the original input.
+            logits = self.lm_head(hidden_states_before)
+
+            if not return_dict:
+                return logits
+
+            return BaseModelOutputWithPast(
+                logits=logits,
+                past_key_values=past_key_values,
+                hidden_states=outputs_before.hidden_states if output_hidden_states else None,
+                attentions=outputs_before.attentions if output_attentions else None,
+            )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict_in_generate: Optional[bool] = None,
+        **model_kwargs,
+    ) -> Union[BaseModelOutputWithPast, torch.LongTensor]:
+        return_dict = return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate
+
+        output = self.infer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if return_dict:
+            return output
+        else:
+            return output.logits
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
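
For readers skimming the hunk: infer() appends <|startthought|> to the prompt, samples a short thought continuation with hard Gumbel-Softmax, closes it with <|endthought|>, and then blends the last hidden state of the original prompt with the last hidden state after the thought using a learned mixing weight before applying the LM head. The sketch below isolates that sampling-and-mixing step with toy tensors; the sizes, the temperature value, and the sigmoid-bounded stand-in for self.talk_head[0] are illustrative assumptions, not values taken from this repository.

import torch
import torch.nn as nn
import torch.nn.functional as F

# Toy sizes -- illustrative only, not the values used by modeling_quiet.py.
batch_size, hidden_size, vocab_size = 2, 16, 100

# One step of the thought continuation: hard Gumbel-Softmax over the logits,
# then argmax, mirroring the sampling inside the continuation loop
# (tau stands in for self.gumbel_temperature).
step_logits = torch.randn(batch_size, vocab_size)
one_hot = F.gumbel_softmax(step_logits, tau=1.0, hard=True, dim=-1)
next_token_id = torch.argmax(one_hot, dim=-1)  # shape: (batch_size,)

# Stand-ins for the model's heads. In the real class these are
# self.talk_head[0] and self.lm_head; here they are freshly initialized,
# and bounding the mixing weight with a sigmoid is an assumption.
talk_head = nn.Sequential(nn.Linear(2 * hidden_size, 1), nn.Sigmoid())
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)

# Last-position hidden states before and after the inserted thought,
# playing the role of hidden_states_before / hidden_states_after in infer().
hidden_states_before = torch.randn(batch_size, 1, hidden_size)
hidden_states_after = torch.randn(batch_size, 1, hidden_size)

# Mixing weight from the concatenated states, then interpolation, then the
# LM head, matching the formula in the diff.
mixing_weight = talk_head(torch.cat([hidden_states_before, hidden_states_after], dim=-1))
mixed_hidden_states = (1 - mixing_weight) * hidden_states_before + mixing_weight * hidden_states_after
logits = lm_head(mixed_hidden_states)
print(logits.shape)  # torch.Size([2, 1, 100])

Only the hard Gumbel-Softmax sampling and the interpolation (1 - w) * before + w * after are taken from the diff; whether the real talk head bounds the weight to [0, 1] is not visible in this hunk.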