seba committed on
Commit
edfe3bc
·
verified ·
1 Parent(s): b6f3c8a

python sample generation script

Browse files
Files changed (1) hide show
  1. coreml_example.py +90 -0
coreml_example.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import numpy as np
3
+ from argparse import ArgumentParser
4
+ from transformers import AutoTokenizer
5
+
6
# Command-line interface. Arguments are parsed *before* the tokenizer is
# loaded so that ``--help`` and usage errors return immediately instead of
# waiting on the tokenizer download/initialisation.
parser = ArgumentParser(
    description="Generate text from a prompt with a Core ML language model."
)
parser.add_argument(
    "--model_path",
    "--model-path",
    required=True,
    help="Path to the model: a .mlpackage, or otherwise a compiled model.",
)
parser.add_argument("--prompt", "-p", required=True, help="Prompt text to continue.")
parser.add_argument(
    "--max-tokens",
    "--max_tokens",
    type=int,
    default=100,
    help="Number of decoding steps to run after the prompt (default: 100).",
)
parser.add_argument(
    "--min_p",
    "--min-p",
    type=float,
    default=0.3,
    help="Discard tokens whose weight relative to the most likely token is "
    "below this value (default: 0.3).",
)
parser.add_argument(
    "--temp", type=float, default=1.0, help="Sampling temperature (default: 1.0)."
)
args = parser.parse_args()

# Reject values that would otherwise fail late or silently degrade sampling:
# a zero temperature divides by zero, and min_p above 1 removes every token
# (the most likely token always has relative weight exactly 1).
if args.max_tokens < 0:
    parser.error("--max-tokens must be non-negative")
if args.temp <= 0:
    parser.error("--temp must be greater than 0")
if not 0.0 <= args.min_p <= 1.0:
    parser.error("--min_p must be between 0 and 1")

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
15
+
16
+ import coremltools as ct
17
+
18
print("Loading model...")
# A path ending in ".mlpackage" (ignoring trailing slashes) is opened with
# MLModel; every other path is handed to CompiledMLModel.
is_package = args.model_path.rstrip("/").endswith(".mlpackage")
model_cls = ct.models.MLModel if is_package else ct.models.CompiledMLModel

# Two handles onto the same model file, one per named function. "length_1" is
# loaded first, then "length_64", both restricted to CPU + Neural Engine.
mf_model_1, mf_model_64 = (
    model_cls(
        args.model_path,
        compute_units=ct.ComputeUnit.CPU_AND_NE,
        function_name=function_name,
    )
    for function_name in ("length_1", "length_64")
)
41
+
42
+
43
def min_p_sample(logits, min_p, temp):
    """Sample token ids from ``logits`` with temperature and min-p filtering.

    The logits are divided by ``temp``, turned into weights relative to the
    most likely token (which therefore has weight exactly 1), and every token
    whose relative weight is below ``min_p`` is discarded. One token is then
    drawn from the remaining weights by inverse-CDF sampling.

    Args:
        logits: NumPy array of scores with the vocabulary on axis 1.
        min_p: Relative-weight threshold; tokens below it are never sampled.
        temp: Sampling temperature the logits are divided by.

    Returns:
        ``int32`` array of sampled indices with axis 1 removed.
    """
    # Scale *before* shifting by the maximum. Scaling only the maximum would
    # leave the shifted values on the wrong scale whenever temp != 1.
    scaled = logits * (1 / temp)
    scaled = scaled - np.max(scaled, axis=1, keepdims=True)
    weights = np.exp(scaled)
    weights[weights < min_p] = 0
    # Unnormalised CDF along the vocabulary axis; its last entry is the total
    # remaining mass, used as the upper bound of the uniform draw.
    cdf = np.cumsum(weights, axis=1)
    draw = np.random.uniform(high=cdf[:, -1:])
    # First index whose cumulative weight exceeds the draw.
    return np.argmax(cdf > draw, axis=1).astype(np.int32)
54
+
55
+
56
# Number of token slots passed to the "length_64" function in one call; the
# prompt is padded up to this size.
PROMPT_WINDOW = 64

# --- Prompt processing ------------------------------------------------------
length = len(tokenizer(args.prompt)["input_ids"])
# An empty prompt would index position -1 below, and a prompt longer than the
# window would not fit the fixed-size input, so fail early with a clear message.
if not 0 < length <= PROMPT_WINDOW:
    raise SystemExit(
        f"Prompt encodes to {length} tokens; expected between 1 and {PROMPT_WINDOW}."
    )
input_ids = tokenizer(
    args.prompt, return_tensors="np", padding="max_length", max_length=PROMPT_WINDOW
)["input_ids"].astype(np.int32)
print("Prompt:", args.prompt)

# One state object is shared by the prompt call and every decoding call.
state = mf_model_64.make_state()
# perf_counter is monotonic and high-resolution, which suits short benchmarks.
start = time.perf_counter()
pred = mf_model_64.predict(
    {"input_ids": input_ids, "query_pos1": np.array([0], dtype=np.int32)}, state
)
prompt_time = time.perf_counter() - start

# Keep only the logits at the last real prompt token.
# NOTE(review): assumes the tokenizer pads on the right, so index length - 1
# on the last axis is the final prompt token — confirm.
logits = pred["logits"][..., [length - 1]]
input_ids = min_p_sample(logits, args.min_p, args.temp)
print("Generated:")
print(tokenizer.decode(input_ids[0]), end="", flush=True)

# --- Token-by-token generation ----------------------------------------------
start = time.perf_counter()
for i in range(args.max_tokens):
    # "query_pos1" advances by one per step, starting right after the prompt.
    pred = mf_model_1.predict(
        {"input_ids": input_ids, "query_pos1": np.array([i + length], dtype=np.int32)},
        state,
    )
    input_ids = min_p_sample(pred["logits"], args.min_p, args.temp)
    # Flush each token immediately so the output streams to the terminal.
    print(tokenizer.decode(input_ids[0]), end="", flush=True)
print("", "=" * 10)
generation_time = time.perf_counter() - start

# --- Throughput report ------------------------------------------------------
print(
    "Prompt:",
    length / prompt_time,
    "tokens-per-sec",
    f"({PROMPT_WINDOW / prompt_time} considering the processed padding)",
)
print("Generation:", args.max_tokens / generation_time, "tokens-per-sec")