Crystalcareai committed
Update modeling_quiet.py

modeling_quiet.py  CHANGED  (+96 -29)
@@ -2128,6 +2128,50 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
            del start_embedding
            del end_embedding
            torch.cuda.empty_cache()
+
+        if not self.training:
+            # Inference mode
+            if max_length is None:
+                max_length = self.config.max_length
+
+            finished_generating = torch.zeros(batch_size, dtype=torch.bool, device=input_ids.device)
+            for cur_token_idx in range(max_length):
+                outputs = self.model(
+                    input_ids=input_ids,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_values=past_key_values,
+                    inputs_embeds=inputs_embeds,
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                    output_hidden_states=output_hidden_states,
+                    return_dict=return_dict,
+                )
+                hidden_states = outputs[0]
+                logits = self.lm_head(hidden_states)
+
+                # Mask out the start and end thought tokens
+                logits[:, :, self.start_token_id] = -float("inf")
+                logits[:, :, self.end_token_id] = -float("inf")
+
+                for batch_idx in range(batch_size):
+                    if not finished_generating[batch_idx]:
+                        last_token_idx = (input_ids[batch_idx] != self.tokenizer.pad_token_id).nonzero(as_tuple=True)[0].max()
+                        new_id_sampled = torch.multinomial(
+                            torch.nn.functional.softmax(logits[batch_idx, last_token_idx] / temperature, dim=-1), 1
+                        )
+                        if last_token_idx + 1 >= input_ids.shape[1]:
+                            # Add padding
+                            new_padding = torch.full((batch_size, 1), self.tokenizer.pad_token_id, dtype=torch.long, device=input_ids.device)
+                            input_ids = torch.cat([input_ids, new_padding], dim=-1)
+                            attention_mask = torch.cat([attention_mask, torch.zeros_like(new_padding)], dim=-1)
+                        attention_mask[batch_idx, last_token_idx + 1] = 1
+                        input_ids[batch_idx, last_token_idx + 1] = new_id_sampled
+                        if new_id_sampled == self.tokenizer.eos_token_id or new_id_sampled == self.tokenizer.bos_token_id or new_id_sampled == self.tokenizer.pad_token_id:
+                            finished_generating[batch_idx] = True
+
+                if finished_generating.all():
+                    break

        return CausalLMOutputWithPast(
            loss=loss if loss is not None else None,
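A note on the added inference path: at each step it re-runs the model over the padded batch, masks the thought-boundary tokens out of the logits, and samples one token per unfinished sequence until every row hits an end-of-text token or max_length. The standalone sketch below is not part of the commit; `sample_next_token`, the toy vocabulary size, and the special-token ids are illustrative assumptions, and it only demonstrates the mask-then-temperature-sample step in isolation.

import torch

def sample_next_token(logits_row, start_token_id, end_token_id, temperature=1.0):
    # logits_row: 1-D tensor of vocabulary logits at the position being extended.
    # Mask the thought-boundary tokens so they can never be sampled at inference,
    # mirroring the -float("inf") assignments in the diff above.
    logits_row = logits_row.clone()
    logits_row[start_token_id] = -float("inf")
    logits_row[end_token_id] = -float("inf")
    probs = torch.nn.functional.softmax(logits_row / temperature, dim=-1)
    return torch.multinomial(probs, 1)

# Toy usage: a hypothetical 10-token vocabulary with arbitrary special-token ids.
logits = torch.randn(10)
next_id = sample_next_token(logits, start_token_id=7, end_token_id=8, temperature=0.7)
print(next_id.item())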
@@ -2169,36 +2213,59 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
-
-
-
-
-
+        # Omit tokens covered by past_key_values
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]
+                max_cache_length = None
+
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusively passed as part of the cache (e.g. when passing inputs_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
-
-
-
-
-
-
-
-
-
-
-
-            )
-
-        # Expand the attention mask to the correct shape
-        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-        attention_mask = attention_mask.expand(input_ids.shape[0], 1, input_ids.shape[1], input_ids.shape[1])
-
-        return {
-            "input_ids": input_ids,
-            "past_key_values": past_key_values,
-            "use_cache": kwargs.get("use_cache"),
-            "attention_mask": attention_mask,
-            "inputs_embeds": inputs_embeds,
-        }
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
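For context, the new `past_key_values` handling mirrors the usual transformers pattern of feeding the model only the tokens the cache has not yet seen. The sketch below is a simplified illustration, not the committed method: `trim_input_ids` is a hypothetical helper and the toy tensors stand in for real generation state; it reproduces the three numbered cases from the comments above.

import torch

def trim_input_ids(input_ids, attention_mask, past_length):
    # Case 1: attention_mask is longer than input_ids, so part of the input lives
    # only in the cache (e.g. a first step driven by inputs_embeds); keep the tail.
    if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
        return input_ids[:, -(attention_mask.shape[1] - past_length):]
    # Case 2: the cache covers only a prefix of input_ids; drop that prefix.
    if past_length < input_ids.shape[1]:
        return input_ids[:, past_length:]
    # Case 3: otherwise assume input_ids already holds only unprocessed tokens.
    return input_ids

# Toy usage: one sequence of 6 prompt tokens, with 4 already in the cache.
ids = torch.arange(6).unsqueeze(0)
mask = torch.ones(1, 6, dtype=torch.long)
print(trim_input_ids(ids, mask, past_length=4))  # tensor([[4, 5]])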
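One more illustration, covering the `position_ids` block added in this hunk: the cumulative sum over the attention mask assigns increasing positions only to attended tokens, and padded slots are filled with a dummy value of 1, as in the masked_fill_ line above. A self-contained toy example (not part of the model code):

import torch

# A left-padded batch: 0 marks padding, 1 marks real tokens.
attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])

# Positions count only attended tokens; padded slots get the dummy value 1.
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])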