Crystalcareai committed · Commit c1ae7ac · verified · 1 Parent(s): 40c682f

Update modeling_quiet.py

Files changed (1):
  1. modeling_quiet.py +110 -39
modeling_quiet.py CHANGED
@@ -1307,47 +1307,118 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
             nn.init.constant_(module.bias, 0)
         elif isinstance(module, nn.Embedding):
             nn.init.xavier_uniform_(module.weight)
-
+
     @torch.no_grad()
-    def generate(self, input_ids, attention_mask=None, streamer=None, **kwargs):
-        if attention_mask is None:
-            attention_mask = torch.ones_like(input_ids)
+    def infer(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        batch_size, seq_len = input_ids.shape
+
+        # Save the original input_ids and attention_mask for later use
+        original_input_ids = input_ids.clone()
+        original_attention_mask = attention_mask.clone() if attention_mask is not None else None
+
+        # Append the start thought token to the input sequence
+        start_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|startthought|>")
+        input_ids = torch.cat([input_ids, torch.tensor([[start_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
+        seq_len += 1
+
+        # Update the attention mask
+        if attention_mask is not None:
+            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+
+        # Generate the continuation
+        continuation_length = self.n_ahead - 2
+        new_key_values = past_key_values
 
-        max_length = kwargs.get("max_length", 20)
-        temp = kwargs.get("temperature", 1.0)
-
-        with torch.no_grad():
-            finished_generating = torch.zeros(len(input_ids), dtype=torch.bool, device=input_ids.device)
-            for cur_token_idx in range(max_length):
-                # Sample the next token
-                new_ids = self(
-                    input_ids[~finished_generating],
-                    attention_mask=attention_mask[~finished_generating]
-                )['logits']
-                # Mask out the start and end thought tokens so we don't accidentally sample them
-                new_ids[:, :, self.tokenizer.vocab_size:] = -float("inf")
-                for list_idx, answer_idx in enumerate((~finished_generating).nonzero(as_tuple=True)[0]):
-                    # Find the index of the last token that is not padding
-                    base_answer_ids = input_ids[answer_idx]
-                    new_answer_ids = new_ids[list_idx]
-                    last_token_idx = (base_answer_ids != self.tokenizer.pad_token_id).nonzero(as_tuple=True)[0].max()
-
-                    new_ids_sampled = torch.multinomial(
-                        torch.nn.functional.softmax(new_answer_ids[last_token_idx] / temp, dim=-1), 1)
-                    # Assign the new id to the last token
-                    if last_token_idx + 1 >= len(base_answer_ids):
-                        # Add padding everywhere
-                        new_padding = torch.full((len(input_ids), 1), self.tokenizer.pad_token_id, dtype=torch.long,
-                                                 device=input_ids.device)
-                        input_ids = torch.cat([input_ids, new_padding], dim=-1)
-                        attention_mask = torch.cat([attention_mask, torch.zeros_like(new_padding)], dim=-1)
-                    attention_mask[answer_idx, last_token_idx + 1] = 1
-                    input_ids[answer_idx, last_token_idx + 1] = new_ids_sampled
-                    if new_ids_sampled == self.tokenizer.eos_token_id or new_ids_sampled == self.tokenizer.bos_token_id or new_ids_sampled == self.tokenizer.pad_token_id:
-                        finished_generating[answer_idx] = 1
-                if finished_generating.all():
-                    break
-        return input_ids, attention_mask
+        start_time = time.time()
+        for continuation_idx in range(continuation_length):
+            outputs = self.model(
+                input_ids=input_ids if continuation_idx == 0 else next_token_id.unsqueeze(-1).to(input_ids.device),
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=new_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=True,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+            new_key_values = outputs.past_key_values
+
+            hidden_states = outputs[0]
+
+            logits = self.lm_head(hidden_states)
+            logits = logits[:, -1, :]  # Only consider the last token
+
+            # Apply Gumbel-Softmax to the logits
+            next_token_logits = F.gumbel_softmax(logits, tau=self.gumbel_temperature, hard=True, dim=-1)
+            next_token_id = torch.argmax(next_token_logits, dim=-1)
+
+            # Append the generated token to the input sequence
+            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
+            seq_len += 1
+
+            # Update the attention mask
+            if attention_mask is not None:
+                attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+
+        # Append the end thought token to the input sequence
+        end_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
+        input_ids = torch.cat([input_ids, torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
+        seq_len += 1
+
+        # Update the attention mask
+        if attention_mask is not None:
+            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+
+        # Get the hidden states before and after the thought
+        outputs_before = self.model(
+            input_ids=original_input_ids,
+            attention_mask=original_attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states_before = outputs_before[0][:, -1:, :]
+
+        # two new tokens: last continuation token and end thought token
+        outputs_after = self.model(
+            input_ids=torch.cat([next_token_id.unsqueeze(-1).to(input_ids.device), torch.tensor(end_thought_token_id).unsqueeze(-1).unsqueeze(-1).to(input_ids.device)], dim=-1),
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=new_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states_after = outputs_after[0][:, -1:, :]
+
+        # Apply the talk head to get the mixing weight
+        mixing_weight = self.talk_head[0](torch.cat([hidden_states_before, hidden_states_after], dim=-1))
+
+        # Apply the mixing weight to the hidden states
+        mixed_hidden_states = (1 - mixing_weight) * hidden_states_before + mixing_weight * hidden_states_after
+
+        # Apply the language model head to get the final logits
+        logits = self.lm_head(mixed_hidden_states)
+        return logits
+
 
     @add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
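
The new infer method appends a <|startthought|> token, rolls out n_ahead - 2 thought tokens with Gumbel-Softmax sampling, appends <|endthought|>, and then mixes the pre- and post-thought hidden states through the talk head before applying the LM head. Below is a minimal usage sketch, not part of the commit: it assumes the checkpoint maps AutoModelForCausalLM to QuietForCausalLM via trust_remote_code, that n_ahead, gumbel_temperature, and the thought tokens are already configured, and that the tokenizer must be attached to the model by hand; the repo id is a placeholder.

# Hypothetical usage sketch for the new infer() path (assumptions noted above).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your-org/quiet-star-checkpoint"  # placeholder, not a real repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
model.tokenizer = tokenizer  # infer() reads the thought-token ids from self.tokenizer
model.eval()

inputs = tokenizer("The capital of France is", return_tensors="pt")
# infer() is wrapped in @torch.no_grad() and returns talk-head-mixed logits
# for the next token, shaped (batch, 1, vocab).
logits = model.infer(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
)
next_id = logits[:, -1, :].argmax(dim=-1)
print(tokenizer.decode(next_id.tolist()))

Note that, unlike the removed generate method, infer produces logits for a single next-token prediction rather than a completed sequence, so any multi-token decoding loop has to be written by the caller.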