Crystalcareai committed
Update modeling_quiet.py
modeling_quiet.py  +156 -64
CHANGED
@@ -32,11 +32,11 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.generation.utils import GenerationMixin
 from transformers.generation.stopping_criteria import StoppingCriteriaList, validate_stopping_criteria
-from transformers import TextStreamer
+from transformers import TextStreamer
 
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
-from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
@@ -65,62 +65,62 @@ logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "QuietConfig"
 
 
-# (56 removed lines: a commented-out block whose text is not shown in this rendering)
+def _prepare_4d_causal_attention_mask_for_sdpa(attention_mask, input_shape, inputs_embeds, past_key_values_length):
+    # Build the 4D additive attention mask expected by the SDPA attention path
+    bsz, tgt_len = input_shape
+
+    # Create a 4D attention mask from a lower-dimensional tensor mask.
+    # The shape of the output attention mask is (batch_size, 1, tgt_len, src_len).
+    # The input values are either 0 or 1, where 0 means padding and 1 means non-padding.
+    combined_attention_mask = None
+    if attention_mask is not None:
+        # If attention_mask already has a shape of (batch_size, 1, tgt_len, src_len),
+        # it can be used directly.
+        if attention_mask.dim() == 4:
+            combined_attention_mask = attention_mask
+        # If attention_mask has a shape of (batch_size, 1, tgt_len),
+        # expand it to (batch_size, 1, tgt_len, src_len).
+        elif attention_mask.dim() == 3:
+            expanded_attn_mask = attention_mask[:, None, :, :]
+            combined_attention_mask = expanded_attn_mask
+        # If attention_mask has a shape of (batch_size, tgt_len),
+        # expand it to (batch_size, 1, tgt_len, src_len).
+        elif attention_mask.dim() == 2:
+            # Provided a padding mask of dimensions [batch_size, seq_length]
+            # - if the model is a decoder, apply a causal mask in addition to the padding mask
+            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            if past_key_values_length > 0:
+                attention_mask = attention_mask.to(dtype=torch.long)
+                attention_mask = attention_mask[:, past_key_values_length:]
+            expanded_attn_mask = attention_mask[:, None, None, :]
+            combined_attention_mask = expanded_attn_mask
+        else:
+            raise ValueError(
+                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
+                    input_shape, attention_mask.shape
+                )
+            )
+
+    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+    # masked positions, this operation will create a tensor which is 0.0 for
+    # positions we want to attend and -10000.0 for masked positions.
+    # Since we are adding it to the raw scores before the softmax, this is
+    # effectively the same as removing these entirely.
+    if combined_attention_mask is not None:
+        # Clamp the mask values to the 0/1 range before converting
+        combined_attention_mask = combined_attention_mask.clamp(min=0, max=1)
+
+        # Convert the attention mask to bfloat16
+        combined_attention_mask = combined_attention_mask.to(torch.bfloat16)
+
+        # Turn the 0/1 mask into an additive mask: 0.0 where attended, -10000.0 where masked
+        combined_attention_mask = (1.0 - combined_attention_mask) * -10000.0
+    else:
+        combined_attention_mask = torch.zeros(
+            (bsz, 1, tgt_len, tgt_len), dtype=torch.bfloat16, device=inputs_embeds.device
+        )
+
+    return combined_attention_mask
 
 
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
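For reference, a minimal sketch of what the new _prepare_4d_causal_attention_mask_for_sdpa helper returns for a padded batch. The import path and tensors below are illustrative assumptions, not part of the commit:

import torch
from modeling_quiet import _prepare_4d_causal_attention_mask_for_sdpa  # assumed local import path

# Two sequences of length 4; the last position of the second one is padding.
attention_mask = torch.tensor([[1, 1, 1, 1],
                               [1, 1, 1, 0]])
inputs_embeds = torch.zeros(2, 4, 8)  # only consulted for its device, and only when no mask is given

mask_4d = _prepare_4d_causal_attention_mask_for_sdpa(
    attention_mask, (2, 4), inputs_embeds, past_key_values_length=0
)
print(mask_4d.shape)     # torch.Size([2, 1, 1, 4]) -- the 2D branch broadcasts over query positions
print(mask_4d[1, 0, 0])  # 0.0 for visible positions, roughly -1e4 (in bfloat16) for the padded one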
@@ -774,7 +774,7 @@ class QuietSdpaAttention(QuietAttention):
            raise ValueError(
                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
            )
-
+
        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and attention_mask is not None:
@@ -1182,7 +1182,7 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
         self.n_tokens_print = 1
         self.gradient_accumulation_steps = 1
         self.training_steps = 0
-        self.tokenizer =
+        self.tokenizer = None
         self.start_token_id = None
         self.end_token_id = None
         self.rm_initialized = False
@@ -1238,6 +1238,7 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
         self.embedding_scale = 1e2
         self.temperature = nn.Parameter(torch.ones(1))
         self.max_temperature = config.max_temperature
+        self.complexity_factor = config.complexity_factor
         self.reinforce_temperature = 3
         self.base_loss_beta = 1
         self.thinking_usefulness_head = nn.Linear(self.model.config.hidden_size, 1)
@@ -1424,9 +1425,67 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
         logits = self.lm_head(mixed_hidden_states)
         return logits
 
-
-
-
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        max_length: Optional[int] = None,
+        min_length: Optional[int] = None,
+        do_sample: Optional[bool] = None,
+        early_stopping: Optional[bool] = None,
+        num_beams: Optional[int] = None,
+        temperature: Optional[float] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        repetition_penalty: Optional[float] = None,
+        bad_words_ids: Optional[Iterable[int]] = None,
+        bos_token_id: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        length_penalty: Optional[float] = None,
+        no_repeat_ngram_size: Optional[int] = None,
+        encoder_no_repeat_ngram_size: Optional[int] = None,
+        num_return_sequences: Optional[int] = None,
+        max_time: Optional[float] = None,
+        max_new_tokens: Optional[int] = None,
+        decoder_start_token_id: Optional[int] = None,
+        use_cache: Optional[bool] = None,
+        num_beam_groups: Optional[int] = None,
+        diversity_penalty: Optional[float] = None,
+        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_scores: Optional[bool] = None,
+        return_dict_in_generate: Optional[bool] = None,
+        forced_bos_token_id: Optional[int] = None,
+        forced_eos_token_id: Optional[int] = None,
+        remove_invalid_values: Optional[bool] = None,
+        synced_gpus: Optional[bool] = False,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        **model_kwargs,
+    ):
+        # Validate stopping criteria
+        stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
+
+        # Set default values for attention_mask and max_new_tokens
+        if "attention_mask" not in model_kwargs:
+            attention_mask = torch.where(input_ids != self.tokenizer.pad_token_id, 1, 0).to(input_ids.device)
+            model_kwargs["attention_mask"] = attention_mask
+        if max_new_tokens is None:
+            max_new_tokens = 512
+
+        streamer = TextStreamer(self.tokenizer, skip_prompt=False, skip_special_tokens=True)
+
+        # Call the custom generate function
+        output_ids, _ = custom_generate(
+            self,
+            input_ids=input_ids,
+            streamer=streamer,
+            max_new_tokens=max_new_tokens,
+            **model_kwargs,
+        )
+
+        return output_ids
 
     @add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
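As a usage sketch (not part of the commit): the new generate() override builds its attention mask from self.tokenizer.pad_token_id, streams through a TextStreamer, and delegates to the module's custom_generate helper, so a tokenizer has to be attached to the model first. The checkpoint name below is a placeholder:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Crystalcareai/quiet-star-custom"  # placeholder checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token  # the override indexes pad_token_id when no mask is passed

model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.bfloat16)
model.tokenizer = tokenizer  # initialized to None in this commit, so the caller must set it

inputs = tokenizer("What is 12 * 7?", return_tensors="pt")
output_ids = model.generate(input_ids=inputs.input_ids, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))  # assumes custom_generate returns token ids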
@@ -1616,7 +1675,8 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
         sample_probs_history = []
         action_loglikelihoods_list = []
 
-
+        complexity_scores = self.compute_complexity_scores(input_ids, attention_mask)
+        temperature = self.temperature * complexity_scores.unsqueeze(-1)
 
         if self.use_end_thought_token or self.use_start_thought_token:
             if not self.use_reparam_for_thought_embeddings:
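A toy illustration of what the two added lines compute, with made-up scores and evaluated outside the model:

import torch

temperature = torch.ones(1)                    # mirrors nn.Parameter(torch.ones(1)) in __init__
complexity_scores = torch.tensor([0.4, 0.9])   # one score per sequence in the batch
per_sequence_temperature = temperature * complexity_scores.unsqueeze(-1)
print(per_sequence_temperature)                # tensor([[0.4000], [0.9000]]) -- shape (batch, 1)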
@@ -1674,12 +1734,15 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
             base_attention_mask = base_attention_mask.view(1, 1, seq_len, seq_len)
             base_attention_mask = base_attention_mask.repeat(input_ids.shape[0], 1, 1, 1)
             attention_mask = base_attention_mask
+            breakpoint()
         elif attention_mask.dim() == 2:
             if seq_len + past_key_values_length != attention_mask.shape[-1]:
+                breakpoint()
                 attention_mask = torch.cat(
                     [torch.ones((attention_mask.shape[0], past_key_values_length), dtype=attention_mask.dtype, device=attention_mask.device), attention_mask],
                     dim=-1
                 )
+            # # if the attention mask
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask,
                 (batch_size, seq_len),
@@ -1697,8 +1760,10 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
                 use_cache=use_cache,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
+                # output_router_logits=output_router_logits,
                 return_dict=return_dict,
             )
+
             prev_hidden_states = hidden_states
             hidden_states = outputs[0]
             prev_rm_logits = rm_logits # for policy gradient
@@ -2125,6 +2190,33 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
             attentions=outputs.attentions,
         )
 
+
+
+    def compute_complexity_scores(self, input_ids, attention_mask):
+        # Compute complexity scores based on input sequence characteristics
+        # Example: normalize sequence lengths and consider the presence of rare tokens
+        seq_lengths = torch.sum(attention_mask, dim=-1)
+        max_length = torch.max(seq_lengths)
+        length_scores = seq_lengths / max_length
+
+        # Compute the proportion of rare tokens in each sequence
+        rare_token_ids = self.get_rare_token_ids()
+        rare_token_mask = torch.isin(input_ids, rare_token_ids)
+        rare_token_counts = torch.sum(rare_token_mask, dim=-1)
+        rare_token_scores = rare_token_counts / seq_lengths
+
+        # Combine length scores and rare token scores
+        complexity_scores = self.complexity_factor * length_scores + (1 - self.complexity_factor) * rare_token_scores
+        return complexity_scores
+
+    def get_rare_token_ids(self):
+        # Approximate rare token IDs with an embedding-matrix heuristic and a frequency threshold
+        frequency_threshold = 1e-4
+        token_counts = torch.bincount(self.model.embed_tokens.weight.argmax(dim=-1))
+        total_tokens = torch.sum(token_counts)
+        rare_token_mask = token_counts / total_tokens < frequency_threshold
+        rare_token_ids = torch.nonzero(rare_token_mask).squeeze(-1)
+        return rare_token_ids
 
 
     def prepare_inputs_for_generation(
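Finally, a standalone sketch of the scoring arithmetic in compute_complexity_scores, with toy tensors and a hypothetical complexity_factor of 0.7 (in the model the factor comes from config.complexity_factor and the rare IDs from get_rare_token_ids):

import torch

complexity_factor = 0.7                      # stands in for config.complexity_factor
attention_mask = torch.tensor([[1, 1, 1, 1],
                               [1, 1, 0, 0]])
input_ids = torch.tensor([[5, 17, 9042, 2],
                          [5, 8, 0, 0]])
rare_token_ids = torch.tensor([9042])        # pretend this ID falls under the frequency threshold

seq_lengths = attention_mask.sum(dim=-1)                                        # tensor([4, 2])
length_scores = seq_lengths / seq_lengths.max()                                 # tensor([1.0000, 0.5000])
rare_scores = torch.isin(input_ids, rare_token_ids).sum(dim=-1) / seq_lengths   # tensor([0.2500, 0.0000])

complexity_scores = complexity_factor * length_scores + (1 - complexity_factor) * rare_scores
print(complexity_scores)                     # tensor([0.7750, 0.3500])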
|