Crystalcareai committed
Update modeling_quiet.py

modeling_quiet.py CHANGED (+6 -322)
@@ -47,20 +47,11 @@ from transformers.utils import (
     logging,
     replace_return_docstrings,
 )
-import transformers
 from .configuration_quiet import QuietConfig
 
 import time
 from typing import Optional, List
 
-
-if is_flash_attn_2_available():
-    from flash_attn import flash_attn_func, flash_attn_varlen_func
-    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
-
-    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
-
-
 logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "QuietConfig"
@@ -408,312 +399,6 @@ class QuietAttention(nn.Module):
         return attn_output, attn_weights, past_key_value
 
 
-class QuietFlashAttention2(QuietAttention):
-    """
-    Quiet flash attention module. This module inherits from `QuietAttention` as the weights of the module stays
-    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
-    flash attention and deal with padding tokens in case the input contains any of them.
-    """
-
-    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
-        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
-        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        **kwargs,
-    ):
-        if "padding_mask" in kwargs:
-            warnings.warn(
-                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-            )
-
-            # overwrite attention_mask with padding_mask
-            attention_mask = kwargs.pop("padding_mask")
-        bsz, q_len, _ = hidden_states.size()
-
-        query_states = self.q_proj(hidden_states)
-        key_states = self.k_proj(hidden_states)
-        value_states = self.v_proj(hidden_states)
-
-        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            if self.layer_idx is None:
-                raise ValueError(
-                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
-                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
-                    "with a layer index."
-                )
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-
-        # Because the input can be padded, the absolute sequence length depends on the max position id.
-        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
-        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
-
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-
-        use_sliding_windows = (
-            _flash_supports_window_size
-            and getattr(self.config, "sliding_window", None) is not None
-            and kv_seq_len > self.config.sliding_window
-        )
-
-        if not _flash_supports_window_size:
-            logger.warning_once(
-                "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
-                " make sure to upgrade flash-attn library."
-            )
-
-        if past_key_value is not None:
-            # Activate slicing cache only if the config has a value `sliding_windows` attribute
-            cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
-            if (
-                getattr(self.config, "sliding_window", None) is not None
-                and kv_seq_len > self.config.sliding_window
-                and cache_has_contents
-            ):
-                slicing_tokens = 1 - self.config.sliding_window
-
-                past_key = past_key_value[self.layer_idx][0]
-                past_value = past_key_value[self.layer_idx][1]
-
-                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
-                past_value = past_value[:, :, slicing_tokens:, :].contiguous()
-
-                if past_key.shape[-2] != self.config.sliding_window - 1:
-                    raise ValueError(
-                        f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
-                        f" {past_key.shape}"
-                    )
-
-                if attention_mask is not None:
-                    attention_mask = attention_mask[:, slicing_tokens:]
-                    attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
-
-            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
-            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-
-        # repeat k/v heads if n_kv_heads < n_heads
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-        dropout_rate = 0.0 if not self.training else self.attention_dropout
-
-        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
-        # therefore the input hidden states gets silently casted in float32. Hence, we need
-        # cast them back in float16 just to be sure everything works as expected.
-        input_dtype = query_states.dtype
-        if input_dtype == torch.float32:
-            if torch.is_autocast_enabled():
-                target_dtype = torch.get_autocast_gpu_dtype()
-            # Handle the case where the model is quantized
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
-            else:
-                target_dtype = self.q_proj.weight.dtype
-
-            logger.warning_once(
-                f"The input hidden states seems to be silently casted in float32, this might be related to"
-                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
-                f" {target_dtype}."
-            )
-
-            query_states = query_states.to(target_dtype)
-            key_states = key_states.to(target_dtype)
-            value_states = value_states.to(target_dtype)
-
-        # Reashape to the expected shape for Flash Attention
-        query_states = query_states.transpose(1, 2)
-        key_states = key_states.transpose(1, 2)
-        value_states = value_states.transpose(1, 2)
-
-        attn_output = self._flash_attention_forward(
-            query_states,
-            key_states,
-            value_states,
-            attention_mask,
-            q_len,
-            dropout=dropout_rate,
-            use_sliding_windows=use_sliding_windows,
-        )
-
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
-        attn_output = self.o_proj(attn_output)
-
-        if not output_attentions:
-            attn_weights = None
-
-        return attn_output, attn_weights, past_key_value
-
-    def _flash_attention_forward(
-        self,
-        query_states,
-        key_states,
-        value_states,
-        attention_mask,
-        query_length,
-        dropout=0.0,
-        softmax_scale=None,
-        use_sliding_windows=False,
-    ):
-        """
-        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
-        first unpad the input, then computes the attention scores and pad the final attention scores.
-        Args:
-            query_states (`torch.Tensor`):
-                Input query states to be passed to Flash Attention API
-            key_states (`torch.Tensor`):
-                Input key states to be passed to Flash Attention API
-            value_states (`torch.Tensor`):
-                Input value states to be passed to Flash Attention API
-            attention_mask (`torch.Tensor`):
-                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
-                position of padding tokens and 1 for the position of non-padding tokens.
-            dropout (`int`, *optional*):
-                Attention dropout
-            softmax_scale (`float`, *optional*):
-                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
-            use_sliding_windows (`bool`, *optional*):
-                Whether to activate sliding window attention.
-        """
-        if not self._flash_attn_uses_top_left_mask:
-            causal = self.is_causal
-        else:
-            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
-            causal = self.is_causal and query_length != 1
-
-        # Ensure attention_mask has the correct shape and values
-        if attention_mask is not None:
-            if attention_mask.dim() == 4:
-                # Convert 4D attention mask to 2D
-                attention_mask = attention_mask.squeeze(1).squeeze(1)
-            elif attention_mask.dim() != 2:
-                raise ValueError(
-                    f"Invalid attention mask dimension: {attention_mask.dim()}. Expected 2D or 4D mask."
-                )
-
-            # Ensure attention_mask has values of 0 and 1
-            attention_mask = attention_mask.to(torch.bool).to(torch.int32)
-
-        # Contains at least one padding token in the sequence
-        if attention_mask is not None:
-            batch_size = query_states.shape[0]
-            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
-                query_states, key_states, value_states, attention_mask, query_length
-            )
-
-            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
-            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
-
-            if not use_sliding_windows:
-                attn_output_unpad = flash_attn_varlen_func(
-                    query_states,
-                    key_states,
-                    value_states,
-                    cu_seqlens_q=cu_seqlens_q,
-                    cu_seqlens_k=cu_seqlens_k,
-                    max_seqlen_q=max_seqlen_in_batch_q,
-                    max_seqlen_k=max_seqlen_in_batch_k,
-                    dropout_p=dropout,
-                    softmax_scale=softmax_scale,
-                    causal=causal,
-                )
-            else:
-                attn_output_unpad = flash_attn_varlen_func(
-                    query_states,
-                    key_states,
-                    value_states,
-                    cu_seqlens_q=cu_seqlens_q,
-                    cu_seqlens_k=cu_seqlens_k,
-                    max_seqlen_q=max_seqlen_in_batch_q,
-                    max_seqlen_k=max_seqlen_in_batch_k,
-                    dropout_p=dropout,
-                    softmax_scale=softmax_scale,
-                    causal=causal,
-                    window_size=(self.config.sliding_window, self.config.sliding_window),
-                )
-
-            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
-        else:
-            if not use_sliding_windows:
-                attn_output = flash_attn_func(
-                    query_states,
-                    key_states,
-                    value_states,
-                    dropout,
-                    softmax_scale=softmax_scale,
-                    causal=causal,
-                )
-            else:
-                attn_output = flash_attn_func(
-                    query_states,
-                    key_states,
-                    value_states,
-                    dropout,
-                    softmax_scale=softmax_scale,
-                    causal=causal,
-                    window_size=(self.config.sliding_window, self.config.sliding_window),
-                )
-
-        return attn_output
-
-    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
-        batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
-
-        # On the first iteration we need to properly re-create the padding mask
-        # by slicing it on the proper place
-        if kv_seq_len != attention_mask.shape[-1]:
-            attention_mask_num_tokens = attention_mask.shape[-1]
-            attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
-
-        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
-
-        key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
-        value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
-
-        if query_length == kv_seq_len:
-            query_layer = index_first_axis(
-                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
-            )
-            cu_seqlens_q = cu_seqlens_k
-            max_seqlen_in_batch_q = max_seqlen_in_batch_k
-            indices_q = indices_k
-        elif query_length == 1:
-            max_seqlen_in_batch_q = 1
-            cu_seqlens_q = torch.arange(
-                batch_size + 1, dtype=torch.int32, device=query_layer.device
-            )  # There is a memcpy here, that is very bad.
-            indices_q = cu_seqlens_q[:-1]
-            query_layer = query_layer.squeeze(1)
-        else:
-            # The -q_len: slice assumes left padding.
-            attention_mask = attention_mask[:, -query_length:]
-            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
-
-        return (
-            query_layer,
-            key_layer,
-            value_layer,
-            indices_q,
-            (cu_seqlens_q, cu_seqlens_k),
-            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
-        )
-
-
 # Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Quiet
 class QuietSdpaAttention(QuietAttention):
     """
@@ -1567,16 +1252,15 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
                 return x
             return x.repeat_interleave(n, dim=0)
 
-        if self.n_passes > 1
+        if self.n_passes > 1:
             input_ids = none_repeat_interleave(input_ids, self.n_passes)
-        attention_mask = none_repeat_interleave(attention_mask, self.n_passes)
-        position_ids = none_repeat_interleave(position_ids, self.n_passes)
-        inputs_embeds = none_repeat_interleave(inputs_embeds, self.n_passes)
-        labels = none_repeat_interleave(labels, self.n_passes)
+            attention_mask = none_repeat_interleave(attention_mask, self.n_passes)
+            position_ids = none_repeat_interleave(position_ids, self.n_passes)
+            inputs_embeds = none_repeat_interleave(inputs_embeds, self.n_passes)
+            labels = none_repeat_interleave(labels, self.n_passes)
             if past_key_values is not None:
                 past_key_values = [none_repeat_interleave(p, self.n_passes) for p in past_key_values]
-
-            cur_token_indices = torch.arange(input_ids.shape[1], device=input_ids.device)
+        cur_token_indices = torch.arange(input_ids.shape[1], device=input_ids.device)
 
         self.tokenizer_has_start_thought_token = True
         self.tokenizer_has_end_thought_token = True
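For context, a minimal usage sketch (not part of the commit): with `QuietFlashAttention2` and the `flash_attn` imports removed, the attention backends left in modeling_quiet.py are the eager `QuietAttention` and `QuietSdpaAttention`, so a caller would request one of those. The repo id below is a placeholder, and this assumes the custom class wires up the standard `attn_implementation` switch.

# Hypothetical sketch; "org/quiet-star-model" is a placeholder repo id, not taken from this commit.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "org/quiet-star-model"  # placeholder: replace with the actual checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,       # modeling_quiet.py ships with the checkpoint as custom code
    attn_implementation="sdpa",   # flash_attention_2 is no longer available in this file; "eager" also works
)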