zhiqu22 committed on
Commit e57eb93 · 1 Parent(s): 23f06b4

synch-up with 466m

Files changed (1)
  1. modeling_mitre.py +110 -34
modeling_mitre.py CHANGED
@@ -11,8 +11,6 @@ from transformers.utils import logging
 from transformers.generation import GenerationMixin
 from transformers.modeling_utils import PreTrainedModel
 from transformers.activations import ACT2FN
-from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
-from transformers.integrations.fsdp import is_fsdp_managed_module
 from transformers.modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     Seq2SeqLMOutput,
@@ -75,10 +73,6 @@ class MitreSdpaAttention(nn.Module):
         past_key_value: Optional[Tuple[torch.Tensor]] = None,
         attention_mask: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        """
-        Input shape: Batch x Time x Channel
-        Output objects: attn_output, attn_weights (always be None), past_key_value
-        """
         """
         1. MitreModel is using MitreSdpaAttention, which is modified from M2M100SdpaAttention.
         Notably, both of them do not support `output_attentions=True` or `layer_head_mask` not None,
@@ -360,6 +354,8 @@ class MitreDecoder(MitrePreTrainedModel):
 
         elif past_key_values_length > 0:
             # in generation
+            # this branch is only used in fairseq, not in huggingface,
+            # because we reuse the mask stored in the cache.
             mask = torch.zeros(past_key_values_length + 1)
             mask = mask.to(embeds, copy=True)
             batch_mask = mask.unsqueeze(0).expand(b, -1).clone().contiguous()
@@ -374,7 +370,6 @@ class MitreDecoder(MitrePreTrainedModel):
         batch_mask = batch_mask.view(b, 1, batch_mask.shape[-2], batch_mask.shape[-1])
         return batch_mask
 
-
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -531,7 +526,6 @@ class MitreDecoder(MitrePreTrainedModel):
                 cache_value[:, :, src_length - max_register_num:, :]
             )
             next_decoder_cache += (clipped_rep,)
-
 
         if past_key_values_length == 0:
             hidden_states = hidden_states[:,src_length:,:]
@@ -759,6 +753,7 @@ class MitreForConditionalGeneration(MitrePreTrainedModel, GenerationMixin):
 
     @staticmethod
     def _reorder_register_cache(t, beam_idx):
+        """A customized reorder method for the register cache."""
        return t.index_select(dim=0, index=beam_idx.to(t.device))
 
    @staticmethod
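
A minimal standalone sketch (not part of the diff, with made-up values) of what '_reorder_register_cache' does: beam search re-selects the rows of every per-beam cache tensor along dim 0 using the beam indices chosen by the scorer.

import torch

# made-up per-beam cache entry: one register count per (batch * beam) row
register_nums = torch.tensor([3, 4, 5, 6])
beam_idx = torch.tensor([1, 0, 3, 3])   # beams chosen by the scorer this step
reordered = register_nums.index_select(dim=0, index=beam_idx)
print(reordered.tolist())               # [4, 3, 6, 6]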
@@ -782,20 +777,32 @@ class MitreForConditionalGeneration(MitrePreTrainedModel, GenerationMixin):
     ):
         """
         Inference with beam search.
-        This code is simplified from 'transformers.generation.utils.GenerationMixin.generate'.
-        This code follows the style of m2m and nllb.
-        Therefore, there are two points need improvement.
-        TODO
-        1. early_stop in beam search.
-           Current early_stop is at the beam search level instead of model level. Specficially,
-           although beamscorer generates eos to the sequence, the sequence is filled by 'pad(1)'.
-           As a result, the sequence, which has already finished, will be computed by the model
-           continuously. We plan to remove the finished token as Fairseq's style.
-        2. build self-attention mask.
-           Current building happens within the model. Thus, when running beam search, we have to
-           create a mask whose size is (beam_size * batch_size) from scratch. If we create the mask
-           outside of the model, we can create the mask by duplicating beam_size times.
-           Moreover, we can prepare a cache of mask in beam search to avoid create mask many times.
+        This code is improved from 'transformers.generation.utils.GenerationMixin.generate'.
+        There are **two main improved points**:
+        1. 'soft early_stop' in beam search.
+           a) problem in the vanilla version.
+              Multilingual translation models, e.g., NLLB and M2M, adopt the 'vanilla early_stop'
+              in BeamSearchScorer (the official implementation provided by HuggingFace), i.e.,
+              a sequence that has been marked as ended is still filled with 'pad(1)'; in other
+              words, the ended sequence is still fed into the model, resulting in a heavy memory waste.
+           b) our improvement.
+              We implement a soft early_stop to resolve the problem. Specifically, we do not change
+              anything in BeamSearchScorer, to keep the code flexible; instead, we remove the ended
+              sequences from the input. Then, given that the shape of the output hidden states changes,
+              we insert placeholders to keep the shape of BeamSearchScorer's states.
+              Based on our tests, this improvement roughly halves the memory cost.
+        2. mask reusing.
+           a) problem: registers need attention masks at each step.
+              A sequence possibly consists of 4 parts, i.e., pads, source tokens, registers, and target
+              tokens. In training, we mask all tokens before the registers for the generation of target
+              tokens. As a result, in generation, we cannot allow the target tokens to 'see' the pads.
+              So we need a mask at each step, leading to wasted computation.
+           b) our improvement.
+              First, we truncate the source tokens to save cost.
+              Second, given that some source tokens still remain as placeholders,
+              we modify the mask generation compared to our code in fairseq.
+              Third, to avoid re-generating masks, we add the mask to 'registering_cache'.
+              Then we manage its order like the kv cache in beam search, and append a column of 0. at every step.
         """
         if generation_config != None:
             assert type(generation_config) is GenerationConfig
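
To make the 'soft early_stop' bookkeeping described above concrete, here is a minimal standalone sketch (illustrative only, with made-up shapes and values): the model scores only the still-running beams, and the compacted scores and tokens are scattered back into the full (batch_size * beam_size, ...) shapes that BeamSearchScorer expects, using 0. and the pad id as placeholders.

import torch

batch_size, beam_size, vocab_size, pad_token_id = 1, 4, 8, 1   # made-up values
done_mask = torch.tensor([False, True, False, True])           # beams 1 and 3 have ended

# the model only scored the running beams, so the score matrix is compacted
compact_scores = torch.randn(int((~done_mask).sum()), vocab_size)

# scatter back to full shape; finished rows get the placeholder 0.
restored_scores = torch.zeros(batch_size * beam_size, vocab_size)
restored_scores[~done_mask] = compact_scores

# the same trick for the token matrix, with pad_token_id as the placeholder
compact_tokens = torch.tensor([[2, 5, 7], [2, 6, 4]])
restored_tokens = torch.full((batch_size * beam_size, compact_tokens.shape[1]), pad_token_id)
restored_tokens[~done_mask] = compact_tokens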
@@ -830,20 +837,24 @@ class MitreForConditionalGeneration(MitrePreTrainedModel, GenerationMixin):
 
         input_ids = self._expand_inputs_for_generation(input_ids, beam_size)
         decoder_input_ids = self._expand_inputs_for_generation(decoder_input_ids, beam_size)
-        # decoder_input_ids.to(device)
         cur_len = decoder_input_ids.shape[1]
 
         this_peer_finished = False
         past_key_values = None
-        registering_cache = None
+        registering_cache= None
         attention_mask = None
+        # done_mask marks the ended sequences;
+        # (~done_mask) marks the running sequences.
+        done_mask = None
 
+        # we follow the style of M2M and NLLB,
+        # so we simplify the initialization of these two processors.
         logits_processor = LogitsProcessorList()
         stopping_criteria = StoppingCriteriaList()
 
         beam_scores = torch.zeros((batch_size, beam_size), dtype=torch.float, device=input_ids.device)
         beam_scores[:, 1:] = -1e9
-        beam_scores = beam_scores.view((batch_size * beam_size,))
+        beam_scores = beam_scores.view((batch_size * beam_size,))
         while not this_peer_finished:
 
             if past_key_values is not None:
@@ -856,23 +867,52 @@ class MitreForConditionalGeneration(MitrePreTrainedModel, GenerationMixin):
                 attention_mask = torch.cat((attention_mask, attention_mask[..., -1:]), dim=-1)
             else:
                 decoder_input_ids_for_generation = decoder_input_ids
-
-            outputs = self(input_ids, decoder_input_ids_for_generation, past_key_values=past_key_values, use_cache=True, registering_cache=registering_cache)
 
+            outputs = self(
+                input_ids,
+                decoder_input_ids_for_generation,
+                attention_mask=attention_mask,
+                past_key_values=past_key_values,
+                use_cache=True,
+                registering_cache=registering_cache
+            )
             del input_ids
             input_ids = None
 
             past_key_values = outputs.past_key_values
             registering_cache = outputs.registering_cache
-
             next_token_logits = outputs.logits[:, -1, :].clone().float()
-            next_token_logits = next_token_logits.to(device)
+            del outputs
 
+            next_token_logits = next_token_logits.to(device)
             next_token_scores = nn.functional.log_softmax(
                 next_token_logits, dim=-1
             ) # (batch_size * num_beams, vocab_size)
 
             next_token_scores_processed = logits_processor(decoder_input_ids, next_token_scores)
+
+            # if any sequence has ended, we have to keep the shape of the Scorer's states.
+            # Details are described at the head of this function.
+            if done_mask is not None:
+                if done_mask.any():
+                    # the placeholder for scores is '0.'
+                    restored_tensor = torch.zeros(
+                        (batch_size * beam_size, next_token_scores_processed.shape[1]),
+                        dtype=next_token_scores_processed.dtype,
+                        device=next_token_scores_processed.device
+                    )
+                    restored_tensor[~done_mask] = next_token_scores_processed
+                    next_token_scores_processed = restored_tensor
+                    # the placeholder for tokens is 'pad_token_id'
+                    restored_tokens = torch.full(
+                        (batch_size * beam_size, decoder_input_ids.shape[1]),
+                        self.generation_config.pad_token_id,
+                        dtype=decoder_input_ids.dtype,
+                        device=device
+                    )
+                    restored_tokens[~done_mask] = decoder_input_ids
+                    decoder_input_ids = restored_tokens
+
             next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(
                 next_token_scores_processed
             )
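
A small illustrative sketch (not part of the diff) of the mask-reuse idea in the loop above, assuming the convention that the cached mask is additive and that its last column corresponds to an always-visible target position: each step adds exactly one key/value position, so the cached mask is extended by one column instead of being rebuilt from scratch.

import torch

# made-up cached additive mask of shape (batch * beam, 1, 1, kv_len); 0. = visible
cached_mask = torch.zeros(2, 1, 1, 5)
cached_mask[..., :2] = float("-inf")   # e.g. two padded source positions stay hidden

# the newly generated target token is visible, so copying the last column (a 0.)
# extends the mask cheaply to the new key/value length
cached_mask = torch.cat((cached_mask, cached_mask[..., -1:]), dim=-1)
print(cached_mask.shape)               # torch.Size([2, 1, 1, 6])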
@@ -891,6 +931,7 @@ class MitreForConditionalGeneration(MitrePreTrainedModel, GenerationMixin):
 
             next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
             next_tokens = next_tokens % vocab_size
+
             beam_outputs = beam_scorer.process(
                 decoder_input_ids,
                 next_token_scores,
@@ -903,14 +944,49 @@ class MitreForConditionalGeneration(MitrePreTrainedModel, GenerationMixin):
             beam_scores = beam_outputs["next_beam_scores"]
             beam_next_tokens = beam_outputs["next_beam_tokens"]
             beam_idx = beam_outputs["next_beam_indices"]
-            decoder_input_ids = torch.cat([decoder_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
 
-            del outputs
+            # 'last_done_mask' is used for reordering the cache;
+            # details are described in the next code block.
+            if done_mask is not None:
+                last_done_mask = done_mask
+
+            # get the newest status of the sequences,
+            # then filter beam_idx
+            done_mask = beam_scorer._done.clone().view(-1)
+            done_mask = self._expand_inputs_for_generation(done_mask, beam_size)
+            beam_idx = beam_idx[~done_mask]
+
+            decoder_input_ids = torch.cat([decoder_input_ids[beam_idx, :], beam_next_tokens[~done_mask].unsqueeze(-1)], dim=-1)
+
+            # different from processing tokens, the caches' order is decided by 'tokens',
+            # 'done_mask', and 'beam_idx' simultaneously.
+            if decoder_input_ids_for_generation.shape[0] < beam_next_tokens.shape[0]:
+                # Take care! If the number of running sequences is smaller than the number of
+                # input sequences, it means the Scorer has just decided to end some of them, but
+                # the cache still follows the last status. Therefore, we should employ the last
+                # done mask rather than the newest one.
+                if (~done_mask).sum() < decoder_input_ids_for_generation.shape[0]:
+                    count_mask = last_done_mask
+                else:
+                    count_mask = done_mask
+                # For biasing beam_idx.
+                # Example:
+                # done_mask with beam size of 2: [f, f, t, t, f, f]
+                # beam_idx: [0, 0, 2, 2, 4, 5]
+                # reorder_idx: [0-0, 0-0, 4-2, 5-2]
+                prefix_sum = torch.cat([
+                    torch.zeros_like(count_mask[:1], dtype=torch.long),
+                    torch.cumsum(count_mask.long(), dim=0)
+                ], dim=0)
+                reorder_idx = beam_idx - prefix_sum[beam_idx]
+                not_done = ~done_mask[beam_idx]
+                reorder_idx = reorder_idx[not_done]
+            else:
+                reorder_idx = beam_idx
 
-            past_key_values = self._reorder_cache(past_key_values, beam_idx)
-            registering_cache["register_nums"] = self._reorder_register_cache(registering_cache["register_nums"], beam_idx)
+            past_key_values = self._reorder_cache(past_key_values, reorder_idx)
+            registering_cache["register_nums"] = self._reorder_register_cache(registering_cache["register_nums"], reorder_idx)
             if registering_cache["attention_mask"] is not None:
-                registering_cache["attention_mask"] = self._reorder_register_cache(registering_cache["attention_mask"], beam_idx)
+                registering_cache["attention_mask"] = self._reorder_register_cache(registering_cache["attention_mask"], reorder_idx)
 
             cur_len = cur_len + 1
 
 
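The prefix-sum trick in the comments above can be checked with a small standalone example (illustrative only), reproducing the worked example from the diff: with a beam size of 2, beams 2 and 3 have just ended, and beam_idx has already been filtered down to the running beams.

import torch

done_mask = torch.tensor([False, False, True, True, False, False])   # [f, f, t, t, f, f]
beam_idx = torch.tensor([0, 0, 4, 5])   # already filtered to the running beams

# prefix_sum[i] = number of finished rows before row i, i.e. how far each
# surviving cache row has shifted left after the finished rows were dropped
prefix_sum = torch.cat([
    torch.zeros(1, dtype=torch.long),
    torch.cumsum(done_mask.long(), dim=0)
], dim=0)
reorder_idx = beam_idx - prefix_sum[beam_idx]
print(reorder_idx.tolist())             # [0, 0, 2, 3]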