Crystalcareai committed
Commit fe54712 · verified · 1 Parent(s): 28b5873

Update modeling_gemmoe.py

Files changed (1)
  1. modeling_gemmoe.py +7 -29
modeling_gemmoe.py CHANGED
@@ -96,10 +96,8 @@ class GemmoeRMSNorm(nn.Module):
         normed_x = normed_x.type_as(x)
         return normed_x * (self.weight + 1)
 
-
 ALL_LAYERNORM_LAYERS.append(GemmoeRMSNorm)
 
-
 class GemmoeRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
@@ -110,10 +108,11 @@ class GemmoeRotaryEmbedding(nn.Module):
 
     def _set_cos_sin_cache(self, seq_len, device, dtype):
         self.max_seq_len_cached = seq_len
-        freq_exponents = (torch.arange(self.dim // 2, dtype=torch.float32, device="cpu").float())
-        timescale = self.base ** (freq_exponents / (self.dim / 2))
+        freq_exponents = (2.0 / self.dim) * (torch.arange(self.dim // 2, dtype=torch.float32, device="cpu").float())
+        timescale = self.base ** freq_exponents
         positions = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.float32).float()
-        radians_new = positions[..., None] / timescale[None, :]
+        radians_new = positions[..., None] / timescale[None, None, :]
+        radians_new = radians_new.squeeze(0)
         emb = torch.cat((radians_new, radians_new), dim=-1)
         cos = emb.cos().to(device=device, dtype=dtype, non_blocking=True)
         sin = emb.sin().to(device=device, dtype=dtype, non_blocking=True)
@@ -127,7 +126,6 @@ class GemmoeRotaryEmbedding(nn.Module):
             self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
         return self.cos_cached[:seq_len], self.sin_cached[:seq_len]
 
-
 # Copied from transformers.models.llama.modeling_llama.rotate_half
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
@@ -136,34 +134,14 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)
 
 
-# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
-    """Applies Rotary Position Embedding to the query and key tensors.
-
-    Args:
-        q (`torch.Tensor`): The query tensor.
-        k (`torch.Tensor`): The key tensor.
-        cos (`torch.Tensor`): The cosine part of the rotary embedding.
-        sin (`torch.Tensor`): The sine part of the rotary embedding.
-        position_ids (`torch.Tensor`, *optional*):
-            Deprecated and unused.
-        unsqueeze_dim (`int`, *optional*, defaults to 1):
-            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
-            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
-            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
-            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
-            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
-            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
-    Returns:
-        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
-    """
-    cos = cos.unsqueeze(unsqueeze_dim)
-    sin = sin.unsqueeze(unsqueeze_dim)
+    seq_len, dim = q.shape[-2], q.shape[-1]
+    cos = cos[:seq_len].view(1, 1, seq_len, dim)
+    sin = sin[:seq_len].view(1, 1, seq_len, dim)
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed
 
-
 # Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Gemmoe
 class GemmoeMLP(nn.Module):
     def __init__(self, config):
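Note on the `_set_cos_sin_cache` change: the rewrite is algebraically a no-op. `base ** (k / (dim / 2))` and `base ** ((2.0 / dim) * k)` are the same timescale, and indexing with `timescale[None, None, :]` followed by `squeeze(0)` leaves `radians_new` with the same `[seq_len, dim // 2]` shape as `timescale[None, :]` did. A minimal sketch checking that equivalence (the `dim`, `base`, and sequence-length values below are made up for illustration, not taken from the Gemmoe config):

import torch

# Hypothetical sizes, chosen only to exercise the two formulas.
dim, base = 8, 10000.0
k = torch.arange(dim // 2, dtype=torch.float32)
positions = torch.arange(16, dtype=torch.float32)

# Before this commit.
old_timescale = base ** (k / (dim / 2))
old_radians = positions[..., None] / old_timescale[None, :]

# After this commit.
new_timescale = base ** ((2.0 / dim) * k)
new_radians = (positions[..., None] / new_timescale[None, None, :]).squeeze(0)

assert torch.allclose(old_timescale, new_timescale)
assert old_radians.shape == new_radians.shape == (16, dim // 2)
assert torch.allclose(old_radians, new_radians)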
 
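The new `apply_rotary_pos_emb` drops the copied docstring and the `unsqueeze_dim` logic (the argument is kept only for signature compatibility and is now unused) and instead slices and reshapes the cached cos/sin tables itself. The sketch below is an inference from the new body, not something the commit documents: it assumes `forward()` hands back 2-D tables of shape `[seq_len, head_dim]` and that `q`/`k` are laid out as `[batch, heads, seq_len, head_dim]`, which is what the `.view(1, 1, seq_len, dim)` reshape implies. Sizes are placeholders.

import torch

# Placeholder sizes; the real values come from the model config.
batch, heads, seq_len, head_dim = 2, 4, 16, 8

q = torch.randn(batch, heads, seq_len, head_dim)

# Stands in for cos_cached[:seq_len]: a 2-D table of shape [seq_len, head_dim].
cos = torch.randn(seq_len, head_dim)

# Mirrors the new function body: slice and reshape to [1, 1, seq_len, head_dim]
# so the table broadcasts over the batch and head dimensions of q and k.
cos = cos[:q.shape[-2]].view(1, 1, q.shape[-2], q.shape[-1])
assert (q * cos).shape == (batch, heads, seq_len, head_dim)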