gugarosa committed
Commit 534cce7 · verified · 1 Parent(s): 94d2ad2

fix(root): Updating to the almost-ready release candidate.

Files changed (2)
  1. configuration_phi3.py +4 -0
  2. modeling_phi3.py +132 -92
configuration_phi3.py CHANGED
@@ -87,6 +87,8 @@ class Phi3Config(PretrainedConfig):
            contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either `su` or `yarn` and
            the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
            divided by the number of attention heads divided by 2.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 32000):
            The id of the "end-of-sequence" token.
        pad_token_id (`int`, *optional*, defaults to 32000):
@@ -132,6 +134,7 @@ class Phi3Config(PretrainedConfig):
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
+        bos_token_id=1,
        eos_token_id=32000,
        pad_token_id=32000,
        sliding_window=None,
@@ -162,6 +165,7 @@ class Phi3Config(PretrainedConfig):
        self.sliding_window = sliding_window

        super().__init__(
+            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
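
The docstring above pins down the `rope_scaling` contract (`type` must be `su` or `yarn`, and each factor list must have `hidden_size / num_attention_heads / 2` entries, i.e. half the per-head dimension), and `bos_token_id` now defaults to 1 and is forwarded to `PretrainedConfig`. A minimal sketch of a valid instantiation follows; the sizes and factor values are illustrative placeholders, not the shipped defaults, and it assumes `configuration_phi3.py` is importable from the working directory:

    from configuration_phi3 import Phi3Config

    hidden_size, num_attention_heads = 3072, 32
    factor_len = hidden_size // num_attention_heads // 2  # 48 entries per factor list

    config = Phi3Config(
        hidden_size=hidden_size,
        num_attention_heads=num_attention_heads,
        bos_token_id=1,
        eos_token_id=32000,
        pad_token_id=32000,
        rope_scaling={
            "type": "su",                        # must be "su" or "yarn"
            "short_factor": [1.0] * factor_len,  # placeholder values
            "long_factor": [1.0] * factor_len,   # placeholder values
        },
    )
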
modeling_phi3.py CHANGED
@@ -40,6 +40,7 @@ from transformers.utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
@@ -107,7 +108,7 @@ def _get_unpad_data(attention_mask):
    )


-# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Phi3
+# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3
class Phi3RotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
@@ -115,93 +116,131 @@ class Phi3RotaryEmbedding(nn.Module):
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
-        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.register_buffer("inv_freq", None, persistent=False)

-        # Build here to make `torch.jit.trace` work.
-        self._set_cos_sin_cache(
-            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
-        )
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
-
-        freqs = torch.outer(t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
-
-    def forward(self, x, seq_len=None):
+    @torch.no_grad()
+    def forward(self, x, position_ids, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
-        if seq_len > self.max_seq_len_cached:
-            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
-
-        return (
-            self.cos_cached[:seq_len].to(dtype=x.dtype),
-            self.sin_cached[:seq_len].to(dtype=x.dtype),
-        )
-
-
-class _Phi3ScaledRotaryEmbedding(nn.Module):
+        if self.inv_freq is None:
+            self.inv_freq = 1.0 / (
+                self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim)
+            )
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # Force float32 since bfloat16 loses precision on long contexts
+        # See https://github.com/huggingface/transformers/pull/29285
+        device_type = x.device.type
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class Phi3SuScaledRotaryEmbedding(Phi3RotaryEmbedding):
    def __init__(
        self,
        dim,
        short_factor,
        long_factor,
-        max_position_embeddings=2048,
        original_max_position_embeddings=2048,
+        max_position_embeddings=2048,
        base=10000,
+        device=None,
    ):
-        super().__init__()
+        super().__init__(dim, max_position_embeddings, base, device)

-        self.dim = dim
        self.short_factor = short_factor
        self.long_factor = long_factor
-        self.max_position_embeddings = max_position_embeddings
        self.original_max_position_embeddings = original_max_position_embeddings
-        self.base = base

-    def _calc_mscale(self, scale):
-        raise NotImplementedError("`_calc_mscale` should be implemented in subclasses")
+    def _calc_scaling_factor(self, scale):
+        if scale <= 1.0:
+            return 1.0
+        return math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))

    @torch.no_grad()
-    def forward(self, x, seq_len=None):
-        if seq_len is None:
-            seq_len = x.shape[-2]
-        t = torch.arange(seq_len, device=x.device, dtype=torch.float32)
-
-        if seq_len > self.original_max_position_embeddings:
-            t = torch.arange(seq_len, device=x.device, dtype=torch.float32)
-            rescale_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
+    def forward(self, x, position_ids, seq_len=None):
+        position_ids_expanded = position_ids[:, None, :].float()
+        if position_ids_expanded.shape[-1] > self.original_max_position_embeddings:
+            ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
        else:
-            t = torch.arange(self.original_max_position_embeddings, device=x.device, dtype=torch.float32)
-            rescale_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
-
-        inv_freq = 1.0 / (
-            rescale_factors * (self.base ** (torch.arange(0, self.dim, 2).float().to(x.device) / self.dim))
-        )
-
-        freqs = torch.outer(t, inv_freq)
-        mscale = self._calc_mscale(self.max_position_embeddings / self.original_max_position_embeddings)
-        emb = torch.cat((freqs, freqs), dim=-1)
-
-        return (emb.cos() * mscale).to(x.dtype), (emb.sin() * mscale).to(x.dtype)
-
-
-class Phi3SuScaledRotaryEmbedding(_Phi3ScaledRotaryEmbedding):
-    def _calc_mscale(self, scale):
-        if scale <= 1.0:
-            return 1.0
-        return math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
-
-
-class Phi3YarnScaledRotaryEmbedding(_Phi3ScaledRotaryEmbedding):
-    def _calc_mscale(self, scale):
-        if scale <= 1.0:
-            return 1.0
-        return 0.1 * math.log(scale) + 1.0
+            ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
+
+        if self.inv_freq is None:
+            self.inv_freq = 1.0 / (
+                ext_factors
+                * self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim)
+            )
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+
+        # Force float32 since bfloat16 loses precision on long contexts
+        # See https://github.com/huggingface/transformers/pull/29285
+        device_type = x.device.type
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            scaling_factor = self._calc_scaling_factor(
+                self.max_position_embeddings / self.original_max_position_embeddings
+            )
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * scaling_factor
+            sin = emb.sin() * scaling_factor
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding):
+    def __init__(
+        self,
+        dim,
+        short_factor,
+        long_factor,
+        original_max_position_embeddings=2048,
+        max_position_embeddings=2048,
+        base=10000,
+        device=None,
+    ):
+        super().__init__(dim, max_position_embeddings, base, device)
+
+        self.short_factor = short_factor
+        self.long_factor = long_factor
+        self.original_max_position_embeddings = original_max_position_embeddings
+
+    def _calc_scaling_factor(self, scale):
+        if scale <= 1.0:
+            return 1.0
+        return 0.1 * math.log(scale) + 1.0
+
+    @torch.no_grad()
+    def forward(self, x, position_ids, seq_len=None):
+        position_ids_expanded = position_ids[:, None, :].float()
+        if position_ids_expanded.shape[-1] > self.original_max_position_embeddings:
+            ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
+        else:
+            ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
+
+        if self.inv_freq is None:
+            self.inv_freq = 1.0 / (
+                ext_factors
+                * self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim)
+            )
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+
+        # Force float32 since bfloat16 loses precision on long contexts
+        # See https://github.com/huggingface/transformers/pull/29285
+        device_type = x.device.type
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            scaling_factor = self._calc_scaling_factor(
+                self.max_position_embeddings / self.original_max_position_embeddings
+            )
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * scaling_factor
+            sin = emb.sin() * scaling_factor
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


# Copied from transformers.models.llama.modeling_llama.rotate_half
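
The reworked rotary classes above build `inv_freq` lazily on first use, take `position_ids` directly (so cos/sin come back already gathered per token, shaped `[batch, seq_len, dim]`), and the `su`/`yarn` variants rescale the frequencies with `short_factor`/`long_factor` before multiplying by a scaling factor: `sqrt(1 + ln(s) / ln(original_max_position_embeddings))` for `su` and `0.1 * ln(s) + 1` for `yarn`, where `s = max_position_embeddings / original_max_position_embeddings`. A standalone sketch of that math, using toy sizes and placeholder factors rather than the released values:

    import math
    import torch

    dim, base = 8, 10000.0                   # toy head_dim; the model uses hidden_size // num_attention_heads
    original_max, max_pos = 4096, 131072     # illustrative long-context extension
    ext_factors = torch.ones(dim // 2)       # placeholder for short_factor / long_factor

    position_ids = torch.arange(6)[None, :]  # [batch=1, seq_len=6]
    inv_freq = 1.0 / (ext_factors * base ** (torch.arange(0, dim, 2).float() / dim))

    freqs = position_ids[..., None].float() * inv_freq  # [1, 6, dim // 2]
    emb = torch.cat((freqs, freqs), dim=-1)              # [1, 6, dim]

    scale = max_pos / original_max
    su_scaling = math.sqrt(1 + math.log(scale) / math.log(original_max)) if scale > 1.0 else 1.0
    yarn_scaling = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
    cos, sin = emb.cos() * su_scaling, emb.sin() * su_scaling
    print(cos.shape, su_scaling, yarn_scaling)           # torch.Size([1, 6, 8]) ...
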
@@ -212,7 +251,8 @@ def rotate_half(x):
    return torch.cat((-x2, x1), dim=-1)


-def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
@@ -220,9 +260,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
-        position_ids (`torch.Tensor`):
-            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
-            used to pass offsetted position ids when working with a KV-cache.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
@@ -233,12 +272,11 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
-    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
-    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
-    # Need fp32 here to match logits
-    q_embed = (q.float() * cos.float()) + (rotate_half(q).float() * sin.float())
-    k_embed = (k.float() * cos.float()) + (rotate_half(k).float() * sin.float())
-    return q_embed.to(q.dtype), k_embed.to(k.dtype)
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed


class Phi3MLP(nn.Module):
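
After this change, `apply_rotary_pos_emb` expects `cos`/`sin` that are already per position (as returned by the rotary modules above) and only inserts a head axis via `unsqueeze_dim`, applying the rotation in the tensors' native dtype; `position_ids` is kept only for signature compatibility. A self-contained toy sketch of the broadcasting, with `rotate_half` copied from this file and arbitrary sizes:

    import torch

    def rotate_half(x):
        x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
        return torch.cat((-x2, x1), dim=-1)

    bsz, num_heads, seq_len, head_dim = 1, 2, 6, 8
    q = torch.randn(bsz, num_heads, seq_len, head_dim)
    k = torch.randn(bsz, num_heads, seq_len, head_dim)
    cos = torch.randn(bsz, seq_len, head_dim)  # shape produced by the rotary modules above
    sin = torch.randn(bsz, seq_len, head_dim)

    # unsqueeze_dim=1 gives cos/sin shape [bsz, 1, seq_len, head_dim] so they broadcast over heads
    cos_, sin_ = cos.unsqueeze(1), sin.unsqueeze(1)
    q_embed = (q * cos_) + (rotate_half(q) * sin_)
    k_embed = (k * cos_) + (rotate_half(k) * sin_)
    print(q_embed.shape, k_embed.shape)        # both torch.Size([1, 2, 6, 8])
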
@@ -252,12 +290,12 @@ class Phi3MLP(nn.Module):
        self.activation_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
-        y = self.gate_up_proj(hidden_states)
+        up_states = self.gate_up_proj(hidden_states)

-        gate, y = y.chunk(2, dim=-1)
-        y = y * self.activation_fn(gate)
+        gate, up_states = up_states.chunk(2, dim=-1)
+        up_states = up_states * self.activation_fn(gate)

-        return self.down_proj(y)
+        return self.down_proj(up_states)


# Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi
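
`Phi3MLP` keeps a single fused `gate_up_proj` projection whose output is chunked into a gate half and an up half; the activated gate scales the up states before `down_proj`. A toy sketch of the same dataflow, assuming a SiLU activation (the model reads the actual activation from `config.hidden_act`) and made-up sizes:

    import torch
    from torch import nn

    hidden_size, intermediate_size = 16, 32      # toy sizes, not the released config
    gate_up_proj = nn.Linear(hidden_size, 2 * intermediate_size, bias=False)
    down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
    activation_fn = nn.SiLU()

    hidden_states = torch.randn(2, 5, hidden_size)
    up_states = gate_up_proj(hidden_states)              # [2, 5, 2 * intermediate_size]
    gate, up_states = up_states.chunk(2, dim=-1)         # two [2, 5, intermediate_size] halves
    output = down_proj(up_states * activation_fn(gate))  # [2, 5, hidden_size]
    print(output.shape)
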
@@ -296,6 +334,7 @@ class Phi3Attention(nn.Module):
        self.max_position_embeddings = config.max_position_embeddings
        self.original_max_position_embeddings = config.original_max_position_embeddings
        self.rope_theta = config.rope_theta
+        self.rope_scaling = config.rope_scaling
        self.is_causal = True

        if (self.head_dim * self.num_heads) != self.hidden_size:
@@ -310,7 +349,7 @@ class Phi3Attention(nn.Module):
        self._init_rope()

    def _init_rope(self):
-        if self.config.rope_scaling is None:
+        if self.rope_scaling is None:
            self.rotary_emb = Phi3RotaryEmbedding(
                self.head_dim,
                max_position_embeddings=self.max_position_embeddings,
@@ -318,30 +357,30 @@ class Phi3Attention(nn.Module):
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
+            short_factor = self.config.rope_scaling["short_factor"]
+            long_factor = self.config.rope_scaling["long_factor"]
+
            if scaling_type == "su":
                self.rotary_emb = Phi3SuScaledRotaryEmbedding(
                    self.head_dim,
-                    self.config.rope_scaling["short_factor"],
-                    self.config.rope_scaling["long_factor"],
-                    max_position_embeddings=self.config.max_position_embeddings,
-                    original_max_position_embeddings=self.config.original_max_position_embeddings,
-                    base=self.config.rope_theta,
+                    short_factor,
+                    long_factor,
+                    max_position_embeddings=self.max_position_embeddings,
+                    original_max_position_embeddings=self.original_max_position_embeddings,
+                    base=self.rope_theta,
                )
            elif scaling_type == "yarn":
                self.rotary_emb = Phi3YarnScaledRotaryEmbedding(
                    self.head_dim,
-                    self.config.rope_scaling["short_factor"],
-                    self.config.rope_scaling["long_factor"],
-                    max_position_embeddings=self.config.max_position_embeddings,
-                    original_max_position_embeddings=self.config.original_max_position_embeddings,
-                    base=self.config.rope_theta,
+                    short_factor,
+                    long_factor,
+                    max_position_embeddings=self.max_position_embeddings,
+                    original_max_position_embeddings=self.original_max_position_embeddings,
+                    base=self.rope_theta,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

-    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
-
    def forward(
        self,
        hidden_states: torch.Tensor,
@@ -374,7 +413,8 @@ class Phi3Attention(nn.Module):
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
+
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
@@ -494,7 +534,7 @@ class Phi3FlashAttention2(Phi3Attention):

        # Because the input can be padded, the absolute sequence length depends on the max position id.
        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
-        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
+        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=rotary_seq_len)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
@@ -781,7 +821,7 @@ class Phi3SdpaAttention(Phi3Attention):
        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
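
Across all three attention paths (eager `Phi3Attention`, `Phi3FlashAttention2`, `Phi3SdpaAttention`) the rotary module is now called with `position_ids` and returns per-token cos/sin, which are then passed to the new `apply_rotary_pos_emb`. A minimal end-to-end sketch with toy sizes, assuming the classes above are importable from `modeling_phi3`:

    import torch
    from modeling_phi3 import Phi3RotaryEmbedding, apply_rotary_pos_emb

    bsz, num_heads, seq_len, head_dim = 1, 2, 6, 8
    rotary_emb = Phi3RotaryEmbedding(head_dim, max_position_embeddings=4096, base=10000)

    query_states = torch.randn(bsz, num_heads, seq_len, head_dim)
    key_states = torch.randn(bsz, num_heads, seq_len, head_dim)
    value_states = torch.randn(bsz, num_heads, seq_len, head_dim)
    position_ids = torch.arange(seq_len)[None, :]      # [bsz, seq_len]

    cos, sin = rotary_emb(value_states, position_ids)  # the seq_len kwarg is now optional
    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
    print(query_states.shape, key_states.shape)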
 
 