Crystalcareai committed
Commit 53a9463 · verified · 1 Parent(s): 27d3137

Update generate.py

Files changed (1)
  1. generate.py +19 -26
generate.py CHANGED
@@ -6,6 +6,7 @@ from transformers.generation.utils import (
 )
 from transformers import TextStreamer
 
+
 def custom_generate(
     self,
     input_ids,
@@ -44,19 +45,19 @@ def custom_generate(
 ):
     if input_ids is None or input_ids.nelement() == 0:
         # If input_ids is None or an empty tensor, create a default input tensor
-        input_ids = torch.LongTensor([[self.tokenizer.bos_token_id]]).to(self.device)
-        attention_mask = torch.ones_like(input_ids).to(self.device)
+        input_ids = torch.LongTensor([[self.tokenizer.bos_token_id]])
+        attention_mask = torch.ones_like(input_ids)
 
     device = input_ids.device
     with torch.no_grad():
         batch_size = input_ids.shape[0]
+        if max_new_tokens is None:
+            raise ValueError("max_new_tokens must be provided.")
+
         finished_generating = torch.zeros(batch_size, dtype=torch.bool, device=device)
-
-        max_length = input_ids.shape[1] + max_new_tokens
         generated_token_ids = torch.full((batch_size, max_new_tokens), self.tokenizer.pad_token_id, dtype=torch.long, device=device)
 
-        cur_token_idx = 0
-        while cur_token_idx < max_new_tokens:
+        for cur_token_idx in range(max_new_tokens):
             # Sample the next token
             new_ids = self(
                 input_ids[~finished_generating],
@@ -80,7 +81,7 @@ def custom_generate(
             if last_token_idx + 1 >= len(base_answer_ids):
                 # Add padding everywhere
                 new_padding = torch.full((batch_size, 1), self.tokenizer.pad_token_id, dtype=torch.long,
-                                          device=device)
+                                         device=device)
                 input_ids = torch.cat([input_ids, new_padding], dim=-1)
                 if attention_mask is not None:
                     attention_mask = torch.cat([attention_mask, torch.zeros_like(new_padding)], dim=-1)
@@ -88,15 +89,15 @@ def custom_generate(
             if attention_mask is not None:
                 attention_mask[answer_idx, last_token_idx + 1] = 1
             input_ids[answer_idx, last_token_idx + 1] = new_ids_sampled
+            generated_token_ids[answer_idx, cur_token_idx] = new_ids_sampled
 
-            if cur_token_idx < max_new_tokens:
-                generated_token_ids[answer_idx, cur_token_idx] = new_ids_sampled
-
-            if new_ids_sampled == self.tokenizer.eos_token_id or cur_token_idx + 1 == max_new_tokens:
+            if new_ids_sampled == self.tokenizer.eos_token_id or new_ids_sampled == self.tokenizer.bos_token_id or new_ids_sampled == self.tokenizer.pad_token_id:
                 finished_generating[answer_idx] = 1
 
-            cur_token_idx += 1
-
+            # Check if the end token is generated
+            if new_ids_sampled == self.tokenizer.convert_tokens_to_ids("</s>"):
+                finished_generating[answer_idx] = 1
+
             if finished_generating.all():
                 break
 
@@ -105,6 +106,7 @@ def custom_generate(
 
     return generated_token_ids
 
+
 def generate(
     self,
     input_ids,
@@ -152,12 +154,11 @@ def generate(
     use_weighted_talk_head=True,
     trust_remote_code=True,
     torch_dtype=torch.bfloat16,
-    dynamic_temperature=None,
     **model_kwargs,
 ):
-
+    # Set default value for max_new_tokens if not provided
     if max_new_tokens is None:
-        max_new_tokens = 128
+        max_new_tokens = 20  # Set a reasonable default value
 
     # Set model attributes
     self.max_thoughts = n_ahead + n_ahead_talk + 1
@@ -185,16 +186,11 @@ def generate(
     if isinstance(input_ids, str):
         input_ids = self.tokenizer.encode(input_ids, return_tensors='pt')
 
-    # Move input_ids and attention_mask to the same device as the model
-    input_ids = input_ids.to(self.device)
-    if attention_mask is not None:
-        attention_mask = attention_mask.to(self.device)
-
     generated_token_ids = custom_generate(
         self,
-        input_ids=input_ids,
+        input_ids=input_ids,  # Pass input_ids explicitly
         attention_mask=attention_mask,
-        max_new_tokens=max_new_tokens,
+        max_new_tokens=max_new_tokens,  # Pass max_new_tokens explicitly
         min_length=min_length,
         do_sample=do_sample,
         early_stopping=early_stopping,
@@ -227,7 +223,4 @@ def generate(
         **model_kwargs,
     )
 
-    # Convert the generated token IDs tensor to text
-    generated_text = self.tokenizer.decode(generated_token_ids[0], skip_special_tokens=False)
-
     return generated_token_ids
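
For context, a minimal usage sketch of the updated entry point (not part of the commit). The model id below is a hypothetical placeholder, and the sketch assumes the repository's modeling code wires generate() from generate.py onto the model when it is loaded with trust_remote_code=True. Two behavior changes visible in the diff matter to callers: the .to(self.device) calls were removed, so inputs must be moved to the model's device before the call, and the function now returns only the generated token ids, so decoding happens on the caller's side.

# Usage sketch only; model id and wiring are assumptions, not from this commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Crystalcareai/<model-repo>"  # hypothetical placeholder

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,  # required so the custom generate() above is used
)
model.tokenizer = tokenizer  # custom_generate() reads self.tokenizer

# The commit removed the .to(self.device) calls inside generate(), so move
# the inputs to the model's device before calling it.
inputs = tokenizer("Q: What is 2 + 2?\nA:", return_tensors="pt").to(model.device)

# max_new_tokens now defaults to 20 when omitted; pass it explicitly for more.
token_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=64,
)

# generate() now returns only the generated token ids (the internal decode
# step was removed), so decode on the caller side.
print(tokenizer.decode(token_ids[0], skip_special_tokens=True))

Since generated_token_ids is pre-filled with pad_token_id, sequences that finish early come back padded to max_new_tokens; decoding with skip_special_tokens=True strips that padding.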