Crystalcareai committed
Commit 0399b0e · verified · 1 Parent(s): 67b00f4

Update inference.py

Files changed (1):
  1. inference.py +35 -34
inference.py CHANGED
@@ -1,48 +1,49 @@
-import gc
 import torch
-from tqdm import tqdm
-from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM, AutoConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
+from accelerate import infer_auto_device_map, init_empty_weights, dispatch_model

 model_path = "Crystalcareai/Quiet-Star-Custom"

-# Load model
-config = AutoConfig.from_pretrained(model_path, max_position_embeddings=2048, use_cache=False, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(
-    model_path,
-    config=config,
-    device_map="auto",
-    low_cpu_mem_usage=True,
-    torch_dtype=torch.bfloat16,
-    trust_remote_code=True,
-)
+n_ahead = 8
+n_ahead_talk = 4
+merged_talk_heads = True
+
+model = AutoModelForCausalLM.from_pretrained(model_path,
+    max_thoughts=n_ahead + n_ahead_talk + 1,
+    merged_talk_heads=merged_talk_heads,
+    merged_lm_and_talk_heads=False,
+    merged_lm_and_think_heads=True,
+    use_concat_talk_head=True,
+    use_shallow_think=True,
+    use_shallow_talk=False,
+    use_complex_think_head=False,
+    use_complex_talk_head=True,
+    use_weighted_talk_head=True,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+
+model.eval()

 tokenizer = AutoTokenizer.from_pretrained(model_path)
-model.tokenizer = tokenizer  # Assign the tokenizer to the model instance
+model.tokenizer = tokenizer  # Set the tokenizer attribute of the model
+
 streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)

 # Convert prompt to tokens
 prompt_template = "[INST] {prompt} [/INST]"
-
-prompt = "You're standing on the surface of the Earth. "\
-    "You walk one mile south, one mile west and one mile north. "\
-    "You end up exactly where you started. Where are you?"
+prompt = "It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: To begin with, Lesley is a close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy. It follows that Lesley is a great-grandfather of Leroy. Is the argument, given the explicitly stated premises, deductively valid or invalid?"

 input_ids = tokenizer(
     prompt_template.format(prompt=prompt),
     return_tensors='pt'
-).input_ids.cuda()
-
-# Generate output
-generation_output = model.generate(
-    input_ids,
-    max_length=1024,
-    do_sample=True,
-    top_k=50,
-    top_p=0.95,
-    num_return_sequences=1,
-    streamer=streamer,
-)
-
-# Decode the output
-generated_text = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
-print(generated_text)
+).input_ids.to(model.device)
+
+attention_mask = torch.ones_like(input_ids)
+
+max_length = 1024
+
+output_ids, _ = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, streamer=streamer)
+
+print(tokenizer.decode(output_ids[0], skip_special_tokens=False))
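
Note: with the values committed here, max_thoughts works out to n_ahead + n_ahead_talk + 1 = 8 + 4 + 1 = 13. For reference, a minimal sketch of the generation step the new script performs, factored into a helper; the name run_inference is illustrative and not part of the commit, and it assumes, as the `output_ids, _ = model.generate(...)` unpacking in the diff implies, that the model's custom remote-code generate() returns a tuple whose first element is the generated token ids:

import torch

def run_inference(model, tokenizer, prompt, max_length=1024):
    # Format the prompt with the same [INST] template the committed script uses.
    prompt_template = "[INST] {prompt} [/INST]"
    input_ids = tokenizer(
        prompt_template.format(prompt=prompt),
        return_tensors="pt",
    ).input_ids.to(model.device)
    # Explicit all-ones attention mask, matching the committed script
    # (a single unpadded sequence, so every position is attended).
    attention_mask = torch.ones_like(input_ids)
    # Assumption: the custom generate() returns (token_ids, ...) per the diff.
    output_ids, _ = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=False)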