hugohrban committed
Commit 3528543 · verified · 1 parent: bc13d43

Update modeling_progen.py

Files changed (1)
  1. modeling_progen.py (+35 -145)
modeling_progen.py CHANGED
@@ -32,7 +32,6 @@ from transformers.modeling_outputs import (
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
-from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
 from .configuration_progen import ProGenConfig
 
 
@@ -52,12 +51,11 @@ def fixed_pos_embedding(x, seq_dim=1, seq_len=None):
     return torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)
 
 
-def rotate_every_two(x):
+def rotate_every_two(x: torch.Tensor):
     x1 = x[:, :, :, ::2]
     x2 = x[:, :, :, 1::2]
     x = torch.stack((-x2, x1), axis=-1)
-    return x.flatten(-2)  # in einsum notation: rearrange(x, '... d j -> ... (d j)')
-
+    return x.flatten(-2)
 
 def apply_rotary_pos_emb(x, sincos, offset=0):
     sin, cos = map(
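Note: rotate_every_two is the pairwise rotation used by the rotary position embedding; it maps (x0, x1, x2, x3, ...) to (-x1, x0, -x3, x2, ...) along the last dimension. A minimal standalone sketch of that behaviour (illustrative only, not part of this commit; the toy tensor shape is made up):

import torch

def rotate_every_two(x):
    # pairs (x0, x1) along the last dim become (-x1, x0)
    x1 = x[..., ::2]
    x2 = x[..., 1::2]
    return torch.stack((-x2, x1), dim=-1).flatten(-2)

x = torch.arange(8.0).view(1, 1, 1, 8)  # toy (batch, heads, seq, dim) tensor
print(rotate_every_two(x))  # tensor([[[[-1., 0., -3., 2., -5., 4., -7., 6.]]]])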
@@ -74,20 +72,21 @@ class ProGenAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
 
-        max_positions = config.max_position_embeddings
+        max_positions = config.n_positions
         self.register_buffer(
             "bias",
             torch.tril(
                 torch.ones((max_positions, max_positions), dtype=torch.bool)
             ).view(1, 1, max_positions, max_positions),
+            persistent=False
         )
-        self.register_buffer("masked_bias", torch.tensor(-1e9))
+        self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)  # approx. -inf
 
         self.attn_dropout = nn.Dropout(config.attn_pdrop)
         self.resid_dropout = nn.Dropout(config.resid_pdrop)
 
-        self.embed_dim = config.hidden_size
-        self.num_attention_heads = config.num_attention_heads
+        self.embed_dim = config.embed_dim
+        self.num_attention_heads = config.n_head
         self.head_dim = self.embed_dim // self.num_attention_heads
         if self.head_dim * self.num_attention_heads != self.embed_dim:
            raise ValueError(
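Note: persistent=False keeps the causal "bias" mask and "masked_bias" buffers out of the checkpoint's state_dict; they are rebuilt in __init__ rather than loaded from disk. A small sketch of that behaviour (class name and max_positions are made up for illustration):

import torch
import torch.nn as nn

class CausalMaskDemo(nn.Module):
    def __init__(self, max_positions=8):
        super().__init__()
        mask = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool))
        self.register_buffer("bias", mask.view(1, 1, max_positions, max_positions), persistent=False)
        self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)

m = CausalMaskDemo()
print(list(m.state_dict().keys()))  # [] -- non-persistent buffers are not serialized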
@@ -103,12 +102,12 @@ class ProGenAttention(nn.Module):
         if config.rotary_dim is not None:
             self.rotary_dim = config.rotary_dim
 
-    def _split_heads(self, x, n_head, dim_head, mp_num):
-        reshaped = x.reshape(x.shape[:-1] + (n_head // mp_num, dim_head))
-        reshaped = reshaped.reshape(x.shape[:-2] + (-1,) + reshaped.shape[-1:])
-        return reshaped
+    def _split_heads(self, x: torch.Tensor, n_head, dim_head) -> torch.Tensor:
+        x = x.reshape(x.shape[:-2] + (-1,))  # (B, T, 8 * E // 8)
+        x = x.reshape(x.shape[:-1] + (n_head, dim_head))  # (B, T, n_heads, dim_head)
+        return x
 
-    def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
+    def _merge_heads(self, tensor, num_attention_heads, attn_head_size) -> torch.Tensor:
         """
         Merges attn_head_size dim and num_attn_heads dim into n_positions
         """
@@ -140,17 +139,17 @@ class ProGenAttention(nn.Module):
         # Keep the attention weights computation in fp32 to avoid overflow issues
         query = query.to(torch.float32)
         key = key.to(torch.float32)
-        #print("q.shape", query.shape)
-        #print("k.shape", key.shape)
-        attn_weights = query @ key.transpose(-1, -2)
+
+        attn_weights = query @ key.transpose(-1, -2)  # (B, n_heads, T, T)
 
         attn_weights = attn_weights / self.scale_attn
+
+        # attend only to previous positions
         attn_weights = torch.where(
             causal_mask, attn_weights, self.masked_bias.to(attn_weights.dtype)
         )
 
         if attention_mask is not None:
-            # Apply the attention mask
             attn_weights = attn_weights + attention_mask
 
         attn_weights = F.softmax(attn_weights, dim=-1)
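Note: the block above is standard scaled dot-product attention with a causal mask applied before the softmax. A condensed, self-contained sketch (head_dim ** 0.5 stands in for self.scale_attn, which is an assumption about how the scale is defined):

import torch
import torch.nn.functional as F

B, H, T, d = 1, 2, 4, 8
query, key, value = (torch.randn(B, H, T, d) for _ in range(3))

causal_mask = torch.tril(torch.ones(T, T, dtype=torch.bool)).view(1, 1, T, T)
masked_bias = torch.tensor(-1e9)

attn_weights = query.float() @ key.float().transpose(-1, -2)         # (B, H, T, T)
attn_weights = attn_weights / (d ** 0.5)
attn_weights = torch.where(causal_mask, attn_weights, masked_bias)   # attend only to previous positions
attn_weights = F.softmax(attn_weights, dim=-1)
attn_output = attn_weights @ value.float()                           # (B, H, T, d)
print(attn_output.shape)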
@@ -160,7 +159,7 @@ class ProGenAttention(nn.Module):
         if head_mask is not None:
             attn_weights = attn_weights * head_mask
 
-        attn_output = attn_weights @ value
+        attn_output = attn_weights @ value  # (B, n_heads, T, dim_head)
 
         return attn_output, attn_weights
 
@@ -173,24 +172,16 @@ class ProGenAttention(nn.Module):
         use_cache=False,
         output_attentions=False,
     ):
-        qkv = self.qkv_proj(hidden_states)
-        # TODO(enijkamp): factor out number of logical TPU-v3/v4 cores or make forward pass agnostic
-        # mp_num = 4
+        qkv = self.qkv_proj(hidden_states)  # (B, T, 3 * E)
+
         mp_num = 8
-        qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))
+        qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))  # (B, T, 8, 3 * E // 8)
 
-        local_dim = self.head_dim * self.num_attention_heads // mp_num
-        query, value, key = torch.split(qkv_split, local_dim, dim=-1)
-        query = self._split_heads(
-            query, self.num_attention_heads, self.head_dim, mp_num=mp_num
-        )
-        key = self._split_heads(
-            key, self.num_attention_heads, self.head_dim, mp_num=mp_num
-        )
+        query, value, key = torch.split(qkv_split, self.embed_dim // mp_num, dim=-1)  # 3 * (B, T, 8, E // 8)
 
-        value = self._split_heads(
-            value, self.num_attention_heads, self.head_dim, mp_num=mp_num
-        )
+        query = self._split_heads(query, self.num_attention_heads, self.head_dim)  # (B, T, n_heads, dim_head)
+        key = self._split_heads(key, self.num_attention_heads, self.head_dim)  # (B, T, n_heads, dim_head)
+        value = self._split_heads(value, self.num_attention_heads, self.head_dim)  # (B, T, n_heads, dim_head)
         value = value.permute(0, 2, 1, 3)
 
         seq_len = key.shape[1]
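Note: the fused qkv output is still reshaped into mp_num = 8 groups before splitting, because the checkpoint stores query, value and key interleaved in that packed layout; only the split size changes from local_dim to self.embed_dim // mp_num, which is the same quantity. A shape sketch with made-up dimensions:

import torch
import torch.nn as nn

B, T, embed_dim, mp_num = 2, 5, 1024, 8
qkv_proj = nn.Linear(embed_dim, 3 * embed_dim, bias=False)

qkv = qkv_proj(torch.randn(B, T, embed_dim))              # (B, T, 3 * E)
qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))    # (B, T, 8, 3 * E // 8)
query, value, key = torch.split(qkv_split, embed_dim // mp_num, dim=-1)
print(query.shape, value.shape, key.shape)                # each: (B, T, 8, E // 8)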
@@ -237,7 +228,7 @@ class ProGenAttention(nn.Module):
             query, key, value, attention_mask, head_mask
         )
 
-        attn_output = self._merge_heads(
+        attn_output = self._merge_heads(  # (B, T, E)
             attn_output, self.num_attention_heads, self.head_dim
         )
 
@@ -256,7 +247,7 @@ class ProGenMLP(nn.Module):
         self, intermediate_size, config
     ):  # in MLP: intermediate_size= 4 * embed_dim
         super().__init__()
-        embed_dim = config.n_embd
+        embed_dim = config.embed_dim
 
         self.fc_in = nn.Linear(embed_dim, intermediate_size)
         self.fc_out = nn.Linear(intermediate_size, embed_dim)
@@ -275,8 +266,8 @@ class ProGenMLP(nn.Module):
 class ProGenBlock(nn.Module):
     def __init__(self, config):
         super().__init__()
-        inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
-        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * config.embed_dim
+        self.ln_1 = nn.LayerNorm(config.embed_dim, eps=config.layer_norm_epsilon)
         self.attn = ProGenAttention(config)
         self.mlp = ProGenMLP(inner_dim, config)
 
@@ -302,7 +293,7 @@ class ProGenBlock(nn.Module):
         attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
         outputs = attn_outputs[1:]
 
-        feed_forward_hidden_states = self.mlp(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)  # (B, T, E)
         hidden_states = attn_output + feed_forward_hidden_states + residual
 
         if use_cache:
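Note: the block keeps the GPT-J-style parallel residual: attention and the MLP both read the same LayerNorm output and their results are summed with the residual, rather than being applied one after the other. A toy sketch with placeholder sub-modules (the Linear/Sequential stand-ins are not the real ProGenAttention/ProGenMLP):

import torch
import torch.nn as nn

class ToyParallelBlock(nn.Module):
    def __init__(self, embed_dim=64):
        super().__init__()
        self.ln_1 = nn.LayerNorm(embed_dim)
        self.attn = nn.Linear(embed_dim, embed_dim)  # placeholder for self-attention
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim), nn.GELU(), nn.Linear(4 * embed_dim, embed_dim)
        )

    def forward(self, x):
        residual = x
        hidden = self.ln_1(x)
        return self.attn(hidden) + self.mlp(hidden) + residual  # parallel, not sequential

print(ToyParallelBlock()(torch.randn(2, 5, 64)).shape)  # torch.Size([2, 5, 64])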
@@ -321,7 +312,7 @@ class ProGenPreTrainedModel(PreTrainedModel):
 
     config_class = ProGenConfig
     base_model_prefix = "transformer"
-    is_parallelizable = True
+    is_parallelizable = False
 
     def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)
@@ -347,61 +338,16 @@ class ProGenModel(ProGenPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.vocab_size_emb = config.vocab_size_emb
-        self.embed_dim = config.n_embd
+        self.embed_dim = config.embed_dim
         self.wte = nn.Embedding(config.vocab_size_emb, self.embed_dim)
         self.drop = nn.Dropout(config.embd_pdrop)
         self.h = nn.ModuleList([ProGenBlock(config) for _ in range(config.n_layer)])
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
         self.rotary_dim = min(
-            config.rotary_dim, config.n_positions // config.num_attention_heads
+            config.rotary_dim, config.n_positions // config.n_head
         )
         self.init_weights()
 
-        # Model parallel
-        self.model_parallel = False
-        self.device_map = None
-
-    def parallelize(self, device_map=None):
-        # Check validity of device_map
-        self.device_map = (
-            get_device_map(len(self.h), range(torch.cuda.device_count()))
-            if device_map is None
-            else device_map
-        )
-        assert_device_map(self.device_map, len(self.h))
-        self.model_parallel = True
-        self.first_device = (
-            "cpu"
-            if "cpu" in self.device_map.keys()
-            else "cuda:" + str(min(self.device_map.keys()))
-        )
-        self.last_device = "cuda:" + str(max(self.device_map.keys()))
-        self.wte = self.wte.to(self.first_device)
-        # Load onto devices
-        for k, v in self.device_map.items():
-            for block in v:
-                cuda_device = "cuda:" + str(k)
-                self.h[block] = self.h[block].to(cuda_device)
-        # ln_f to last
-        self.ln_f = self.ln_f.to(self.last_device)
-
-    def deparallelize(self):
-        self.model_parallel = False
-        self.device_map = None
-        self.first_device = "cpu"
-        self.last_device = "cpu"
-        self.wte = self.wte.to("cpu")
-        for index in range(len(self.h)):
-            self.h[index] = self.h[index].to("cpu")
-        self.ln_f = self.ln_f.to("cpu")
-        torch.cuda.empty_cache()
-
-    def get_input_embeddings(self):
-        return self.wte
-
-    def set_input_embeddings(self, new_embeddings):
-        self.wte = new_embeddings
-
     def forward(
         self,
         input_ids=None,
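Note: with parallelize()/deparallelize() removed and is_parallelizable set to False, the old layer-by-layer model parallelism is no longer exposed by these classes. If multi-device placement is still needed, the accelerate-backed device_map argument of from_pretrained is the usual alternative; a hedged usage sketch (the checkpoint name is illustrative, and trust_remote_code is assumed to be required because this is custom modeling code):

from transformers import AutoModelForCausalLM

# hypothetical checkpoint name; device_map="auto" requires the accelerate package
model = AutoModelForCausalLM.from_pretrained(
    "hugohrban/progen2-small",
    trust_remote_code=True,
    device_map="auto",
)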
@@ -510,19 +456,6 @@ class ProGenModel(ProGenPreTrainedModel):
         all_self_attentions = () if output_attentions else None
         all_hidden_states = () if output_hidden_states else None
         for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
-            # Model parallel
-            if self.model_parallel:
-                torch.cuda.set_device(hidden_states.device)
-                # Ensure layer_past is on same device as hidden_states (might not be correct)
-                if layer_past is not None:
-                    layer_past = tuple(
-                        past_state.to(hidden_states.device) for past_state in layer_past
-                    )
-                # Ensure that attention_mask is always on the same device as hidden_states
-                if attention_mask is not None:
-                    attention_mask = attention_mask.to(hidden_states.device)
-                if isinstance(head_mask, torch.Tensor):
-                    head_mask = head_mask.to(hidden_states.device)
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
@@ -567,12 +500,6 @@ class ProGenModel(ProGenPreTrainedModel):
                     outputs[2 if use_cache else 1],
                 )
 
-            # Model Parallel: If it's the last layer for that device, put things on the next device
-            if self.model_parallel:
-                for k, v in self.device_map.items():
-                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
-                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
-
         hidden_states = self.ln_f(hidden_states)
 
         hidden_states = hidden_states.view(*output_shape)
@@ -591,7 +518,7 @@ class ProGenModel(ProGenPreTrainedModel):
                 ]
                 if v is not None
             )
-        # print("hidden_states", hidden_states.shape)
+
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
             past_key_values=presents,
@@ -610,37 +537,9 @@ class ProGenForCausalLM(ProGenPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.transformer = ProGenModel(config)
-        self.lm_head = nn.Linear(config.n_embd, config.vocab_size_lm_head)
+        self.lm_head = nn.Linear(config.embed_dim, config.vocab_size_lm_head)
         self.init_weights()
 
-        # Model parallel
-        self.model_parallel = False
-        self.device_map = None
-
-    # def parallelize(self, device_map=None):
-    #     self.device_map = (
-    #         get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
-    #         if device_map is None
-    #         else device_map
-    #     )
-    #     assert_device_map(self.device_map, len(self.transformer.h))
-    #     self.transformer.parallelize(self.device_map)
-    #     self.lm_head = self.lm_head.to(self.transformer.first_device)
-    #     self.model_parallel = True
-
-    # def deparallelize(self):
-    #     self.transformer.deparallelize()
-    #     self.transformer = self.transformer.to("cpu")
-    #     self.lm_head = self.lm_head.to("cpu")
-    #     self.model_parallel = False
-    #     torch.cuda.empty_cache()
-
-    # def get_output_embeddings(self):
-    #     return None
-
-    # def set_output_embeddings(self, new_embeddings):
-    #     return
-
     def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
         token_type_ids = kwargs.get("token_type_ids", None)
         # only last token for inputs_ids if past is defined in kwargs
@@ -650,7 +549,6 @@ class ProGenForCausalLM(ProGenPreTrainedModel):
             token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
 
         attention_mask = kwargs.get("attention_mask", None)
-        # print("attention_mask", attention_mask)
         position_ids = kwargs.get("position_ids", None)
 
         if attention_mask is not None and position_ids is None:
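Note: right after this context line, position_ids are derived from the attention mask when the caller did not supply them. That body falls outside the hunk, so the snippet below shows the common GPT-style derivation rather than this file's exact code (treat it as an assumption): positions are the cumulative sum of the mask minus one, with padded slots given a dummy value.

import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1]])        # toy left-padded mask
position_ids = attention_mask.long().cumsum(-1) - 1     # [[-1, -1, 0, 1, 2]]
position_ids.masked_fill_(attention_mask == 0, 1)       # padded slots get a dummy position
print(position_ids)                                     # tensor([[1, 1, 0, 1, 2]])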
@@ -694,8 +592,7 @@ class ProGenForCausalLM(ProGenPreTrainedModel):
         return_dict = (
             return_dict if return_dict is not None else self.config.use_return_dict
         )
-        # print("here")
-        # print(attention_mask)
+
         transformer_outputs = self.transformer(
             input_ids,
             past_key_values=past_key_values,
@@ -711,11 +608,6 @@ class ProGenForCausalLM(ProGenPreTrainedModel):
         )
         hidden_states = transformer_outputs[0]
 
-        # Set device for model parallelism
-        if self.model_parallel:
-            torch.cuda.set_device(self.transformer.first_device)
-            hidden_states = hidden_states.to(self.lm_head.weight.device)
-
         # make sure sampling in fp16 works correctly and
         # compute loss in fp32 to match with mesh-tf version
         # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
@@ -726,12 +618,10 @@ class ProGenForCausalLM(ProGenPreTrainedModel):
             # Shift so that tokens < n predict n
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(
                 shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
             )
-
             loss = loss.to(hidden_states.dtype)
 
         if not return_dict:
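Note: the loss block above is the standard causal-LM objective: the logits at position t are scored against the token at position t + 1, so the last logit and the first label are dropped before the flattened cross-entropy. A small self-contained check with random tensors:

import torch
from torch.nn import CrossEntropyLoss

B, T, V = 2, 6, 32
lm_logits = torch.randn(B, T, V)
labels = torch.randint(0, V, (B, T))

shift_logits = lm_logits[..., :-1, :].contiguous()   # (B, T - 1, V)
shift_labels = labels[..., 1:].contiguous()          # (B, T - 1)

loss = CrossEntropyLoss()(
    shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
)
print(loss)  # scalar tensor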
 