Crystalcareai committed
Update modeling_quiet.py

modeling_quiet.py  CHANGED  (+43 -43)
@@ -1024,14 +1024,16 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
         # Update the attention mask
         if attention_mask is not None:
             attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+        else:
+            attention_mask = torch.ones((batch_size, seq_len)).to(input_ids.device)
 
         # Generate the continuation
         continuation_length = self.n_ahead - 2
         new_key_values = past_key_values
-
+
         # Initialize next_token_id with a default value
         next_token_id = torch.zeros(batch_size, dtype=torch.long).to(input_ids.device)
-
+
         start_time = time.time()
         for continuation_idx in range(continuation_length):
             outputs = self.model(
@@ -1057,59 +1059,57 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
             next_token_id = torch.argmax(next_token_logits, dim=-1)
 
             # Append the generated token to the input sequence
-
+            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
             seq_len += 1
 
             # Update the attention mask
-
-            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
+            attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
 
         # Append the end thought token to the input sequence
-
-
-
+        end_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
+        input_ids = torch.cat([input_ids, torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
+        seq_len += 1
 
-
-        if attention_mask is not None:
+        # Update the attention mask
         attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Get the hidden states before and after the thought
+        outputs_before = self.model(
+            input_ids=original_input_ids,
+            attention_mask=original_attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states_before = outputs_before[0][:, -1:, :]
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # two new tokens: last continuation token and end thought token
+        outputs_after = self.model(
+            input_ids=torch.cat([next_token_id.unsqueeze(-1).to(input_ids.device), torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1),
+            attention_mask=attention_mask[:, -2:],
+            position_ids=position_ids,
+            past_key_values=new_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states_after = outputs_after[0][:, -1:, :]
 
-
-
+        # Apply the talk head to get the mixing weight
+        mixing_weight = self.talk_head[0](torch.cat([hidden_states_before, hidden_states_after], dim=-1))
 
-
-
+        # Apply the mixing weight to the hidden states
+        mixed_hidden_states = (1 - mixing_weight) * hidden_states_before + mixing_weight * hidden_states_after
 
-
-
-
+        # Apply the language model head to get the final logits
+        logits = self.lm_head(mixed_hidden_states)
+        return logits
 
     @torch.no_grad()
     def generate(