Crystalcareai committed (verified)
Commit d00c49d · Parent(s): 890bc4c

Update generate.py

Files changed (1): generate.py (+48 -58)
generate.py CHANGED
@@ -45,56 +45,55 @@ def custom_generate(
     synced_gpus=None,
     **kwargs,
 ):
-    with torch.no_grad():
-        finished_generating = torch.zeros(len(input_ids), dtype=torch.bool, device=input_ids.device)
-
-        while not finished_generating.all() and input_ids.shape[1] < max_length:
-            # Sample the next token
-            new_ids = self(
-                input_ids[~finished_generating],
-                attention_mask=attention_mask[~finished_generating] if attention_mask is not None else None,
-                **kwargs
-            )['logits']
-
-            # Mask out the start and end thought tokens so we don't accidentally sample them
-            new_ids[:, :, self.tokenizer.vocab_size:] = -float("inf")
-
-            for list_idx, answer_idx in enumerate((~finished_generating).nonzero(as_tuple=True)[0]):
-                # Find the index of the last token that is not padding
-                base_answer_ids = input_ids[answer_idx]
-                new_answer_ids = new_ids[list_idx]
-                last_token_idx = (base_answer_ids != self.tokenizer.pad_token_id).nonzero(as_tuple=True)[0].max()
-
-                new_ids_sampled = torch.multinomial(
-                    torch.nn.functional.softmax(new_answer_ids[last_token_idx] / kwargs.get("temperature", 1.0), dim=-1), 1)
-
-                # Assign the new id to the last token
-                if last_token_idx + 1 >= len(base_answer_ids):
-                    # Add padding everywhere
-                    new_padding = torch.full((len(input_ids), 1), self.tokenizer.pad_token_id, dtype=torch.long,
-                                             device=input_ids.device)
-                    input_ids = torch.cat([input_ids, new_padding], dim=-1)
-                    if attention_mask is not None:
-                        attention_mask = torch.cat([attention_mask, torch.zeros_like(new_padding)], dim=-1)
-
-                if attention_mask is not None:
-                    attention_mask[answer_idx, last_token_idx + 1] = 1
-                input_ids[answer_idx, last_token_idx + 1] = new_ids_sampled
-
-                if new_ids_sampled == self.tokenizer.eos_token_id or new_ids_sampled == self.tokenizer.bos_token_id or new_ids_sampled == self.tokenizer.pad_token_id:
-                    finished_generating[answer_idx] = 1
-
-                # Check if the end token is generated
-                if new_ids_sampled == self.tokenizer.convert_tokens_to_ids("<|/assistant|>"):
-                    finished_generating[answer_idx] = 1
-
-                if streamer is not None:
-                    streamer.put(new_ids_sampled)
-
-        generated_token_ids = input_ids.tolist()
-
-        return generated_token_ids
+    with torch.no_grad():
+        finished_generating = torch.zeros(len(input_ids), dtype=torch.bool, device=input_ids.device)
+
+        while not finished_generating.all() and input_ids.shape[1] < max_length:
+            # Sample the next token
+            new_ids = self(
+                input_ids[~finished_generating],
+                attention_mask=attention_mask[~finished_generating] if attention_mask is not None else None,
+                **kwargs
+            )['logits']
+
+            # Mask out the start and end thought tokens so we don't accidentally sample them
+            new_ids[:, :, self.tokenizer.vocab_size:] = -float("inf")
+
+            for list_idx, answer_idx in enumerate((~finished_generating).nonzero(as_tuple=True)[0]):
+                # Find the index of the last token that is not padding
+                base_answer_ids = input_ids[answer_idx]
+                new_answer_ids = new_ids[list_idx]
+                last_token_idx = (base_answer_ids != self.tokenizer.pad_token_id).nonzero(as_tuple=True)[0].max()
+
+                new_ids_sampled = torch.multinomial(
+                    torch.nn.functional.softmax(new_answer_ids[last_token_idx] / temperature, dim=-1), 1)
+
+                # Assign the new id to the last token
+                if last_token_idx + 1 >= len(base_answer_ids):
+                    # Add padding everywhere
+                    new_padding = torch.full((len(input_ids), 1), self.tokenizer.pad_token_id, dtype=torch.long,
+                                             device=input_ids.device)
+                    input_ids = torch.cat([input_ids, new_padding], dim=-1)
+                    if attention_mask is not None:
+                        attention_mask = torch.cat([attention_mask, torch.zeros_like(new_padding)], dim=-1)
+
+                if attention_mask is not None:
+                    attention_mask[answer_idx, last_token_idx + 1] = 1
+                input_ids[answer_idx, last_token_idx + 1] = new_ids_sampled
+
+                if new_ids_sampled == self.tokenizer.eos_token_id or new_ids_sampled == self.tokenizer.bos_token_id or new_ids_sampled == self.tokenizer.pad_token_id:
+                    finished_generating[answer_idx] = 1
+
+                # Check if the end token is generated
+                if new_ids_sampled == self.tokenizer.convert_tokens_to_ids("</s>"):
+                    finished_generating[answer_idx] = 1
+
+                if streamer is not None:
+                    streamer.put(new_ids_sampled)
+
+        generated_token_ids = input_ids.tolist()
+
+        return generated_token_ids
 
 
-
 def generate(
@@ -158,15 +157,6 @@ def generate(
     self.use_complex_talk_head = use_complex_talk_head
     self.use_weighted_talk_head = use_weighted_talk_head
 
-    # Set model properties
-    self.use_end_thought_token = True
-    self.use_start_thought_token = True
-    self.n_ahead = n_ahead
-    self.n_passes = 1
-    self.eval_mode = True
-    self.first_run = False
-    self.rm_initialized = True
-    self.original_mode = False
 
     # Generate using the custom generate function
     generated_token_ids = custom_generate(
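In short: custom_generate now takes temperature as an explicit parameter instead of reading kwargs.get("temperature", 1.0), which also keeps a caller-supplied temperature from leaking into the self(...) forward call through **kwargs; the extra stop check matches "</s>" instead of "<|/assistant|>"; and generate no longer force-overrides the model's thought-token and eval-mode attributes. The sampling step itself is unchanged: logits above tokenizer.vocab_size are masked to -inf (so thought tokens are never drawn), then the last non-padding position is scaled by 1/temperature, softmaxed, and sampled with torch.multinomial. A standalone sketch of that step, with an invented vocabulary size:

import torch

def sample_next_token(logits: torch.Tensor, temperature: float = 1.0) -> torch.Tensor:
    # logits: (vocab_size,) scores at the last non-padding position of one sequence.
    # Dividing by temperature < 1.0 sharpens the distribution; > 1.0 flattens it.
    probs = torch.nn.functional.softmax(logits / temperature, dim=-1)
    # Draw one token id from the resulting categorical distribution.
    return torch.multinomial(probs, num_samples=1)

logits = torch.randn(32000)  # toy vocabulary size, not taken from the repo
print(sample_next_token(logits, temperature=0.7))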
 
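Both versions also share the ragged-batch bookkeeping around that step: a boolean finished_generating mask selects the rows that still need tokens, each row's write position is one past its last non-padding token, and when that position would fall off the end of the tensor a padding column is appended to the whole batch so it stays rectangular. A toy illustration of those three moves (all values invented):

import torch

pad_id = 0
input_ids = torch.tensor([[5, 6, 7], [8, 9, 10]])
finished = torch.tensor([True, False])

# 1. Only rows that are still generating go through the forward pass.
active_rows = input_ids[~finished]  # tensor([[ 8,  9, 10]])

# 2. The write position is one past the last non-padding token of a row.
last_idx = (input_ids[1] != pad_id).nonzero(as_tuple=True)[0].max()  # tensor(2)

# 3. If that position falls off the end, grow every row by one pad column;
#    the attention mask (not shown here) marks the new slots as inactive.
if last_idx + 1 >= input_ids.shape[1]:
    pad_col = torch.full((input_ids.size(0), 1), pad_id, dtype=torch.long)
    input_ids = torch.cat([input_ids, pad_col], dim=-1)

print(active_rows.shape, last_idx.item(), input_ids.shape)  # (1, 3) 2 (2, 4)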
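For callers, the practical upshot is that temperature must now reach custom_generate as a real argument, and that generation stops on "</s>", EOS, BOS, or pad. A hypothetical invocation sketch: the repo id and prompt are placeholders, the wiring of generate onto the model through trust_remote_code is assumed rather than shown in this diff, and the final indexing relies on custom_generate returning plain Python lists (input_ids.tolist()):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "Crystalcareai/<model>"  # placeholder id, not taken from this commit
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)

inputs = tokenizer("Q: What is 7 * 8?\nA:", return_tensors="pt")
token_ids = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=128,
    temperature=0.7,  # explicit parameter after this commit, not a kwargs fallback
)
# custom_generate returns input_ids.tolist(), i.e. a list of token-id lists
print(tokenizer.decode(token_ids[0], skip_special_tokens=True))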