Crystalcareai committed
Commit 0399b0e · verified · 1 Parent(s): 67b00f4

Update inference.py

Files changed (1):
  1. inference.py +35 -34
inference.py CHANGED
@@ -1,48 +1,49 @@
-import gc
 import torch
-from tqdm import tqdm
-from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM, AutoConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
+from accelerate import infer_auto_device_map, init_empty_weights, dispatch_model

 model_path = "Crystalcareai/Quiet-Star-Custom"

-# Load model
-config = AutoConfig.from_pretrained(model_path, max_position_embeddings=2048, use_cache=False, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(
-    model_path,
-    config=config,
-    device_map="auto",
-    low_cpu_mem_usage=True,
-    torch_dtype=torch.bfloat16,
-    trust_remote_code=True,
-)
+n_ahead = 8
+n_ahead_talk = 4
+merged_talk_heads = True
+
+model = AutoModelForCausalLM.from_pretrained(model_path,
+    max_thoughts=n_ahead + n_ahead_talk + 1,
+    merged_talk_heads=merged_talk_heads,
+    merged_lm_and_talk_heads=False,
+    merged_lm_and_think_heads=True,
+    use_concat_talk_head=True,
+    use_shallow_think=True,
+    use_shallow_talk=False,
+    use_complex_think_head=False,
+    use_complex_talk_head=True,
+    use_weighted_talk_head=True,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+
+model.eval()

 tokenizer = AutoTokenizer.from_pretrained(model_path)
-model.tokenizer = tokenizer  # Assign the tokenizer to the model instance
+model.tokenizer = tokenizer  # Set the tokenizer attribute of the model
+
 streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)

 # Convert prompt to tokens
 prompt_template = "[INST] {prompt} [/INST]"
-
-prompt = "You're standing on the surface of the Earth. "\
-    "You walk one mile south, one mile west and one mile north. "\
-    "You end up exactly where you started. Where are you?"
+prompt = "It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: To begin with, Lesley is a close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy. It follows that Lesley is a great-grandfather of Leroy. Is the argument, given the explicitly stated premises, deductively valid or invalid?"

 input_ids = tokenizer(
     prompt_template.format(prompt=prompt),
     return_tensors='pt'
-).input_ids.cuda()
-
-# Generate output
-generation_output = model.generate(
-    input_ids,
-    max_length=1024,
-    do_sample=True,
-    top_k=50,
-    top_p=0.95,
-    num_return_sequences=1,
-    streamer=streamer,
-)
-
-# Decode the output
-generated_text = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
-print(generated_text)
+).input_ids.to(model.device)
+
+attention_mask = torch.ones_like(input_ids)
+
+max_length = 1024
+
+output_ids, _ = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, streamer=streamer)
+
+print(tokenizer.decode(output_ids[0], skip_special_tokens=False))
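
Note: with the values committed here, max_thoughts works out to n_ahead + n_ahead_talk + 1 = 8 + 4 + 1 = 13. For reference, a minimal sketch of the generation step the new script performs, factored into a helper; the name run_inference is illustrative and not part of the commit, and it assumes, as the `output_ids, _ = model.generate(...)` unpacking in the diff implies, that the model's custom remote-code generate() returns a tuple whose first element is the generated token ids:

import torch

def run_inference(model, tokenizer, prompt, max_length=1024):
    # Format the prompt with the same [INST] template the committed script uses.
    prompt_template = "[INST] {prompt} [/INST]"
    input_ids = tokenizer(
        prompt_template.format(prompt=prompt),
        return_tensors="pt",
    ).input_ids.to(model.device)
    # Explicit all-ones attention mask, matching the committed script
    # (a single unpadded sequence, so every position is attended).
    attention_mask = torch.ones_like(input_ids)
    # Assumption: the custom generate() returns (token_ids, ...) per the diff.
    output_ids, _ = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=False)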