calebnwokocha
commited on
Upload 21 files
Browse files- GPT2.cbp +63 -0
- GPT2.cscope_file_list +22 -0
- GPT2.depend +240 -0
- GPT2.layout +80 -0
- common-ggml.cpp +244 -0
- common-ggml.h +18 -0
- common.cpp +911 -0
- common.h +343 -0
- dr_wav.h +0 -0
- ggml-aarch64.c +0 -0
- ggml-aarch64.h +39 -0
- ggml-common.h +0 -0
- ggml-cpu-impl.h +614 -0
- ggml-impl.h +209 -0
- ggml-model-gpt-2-774M.bin +3 -0
- ggml-quants.c +0 -0
- ggml-quants.h +147 -0
- ggml.c +0 -0
- ggml.h +0 -0
- main-ctx.cpp +841 -0
- quantize.cpp +184 -0
GPT2.cbp
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
2 |
+
<CodeBlocks_project_file>
|
3 |
+
<FileVersion major="1" minor="6" />
|
4 |
+
<Project>
|
5 |
+
<Option title="GPT2" />
|
6 |
+
<Option pch_mode="2" />
|
7 |
+
<Option compiler="gcc" />
|
8 |
+
<Build>
|
9 |
+
<Target title="Debug">
|
10 |
+
<Option output="bin/Debug/GPT2" prefix_auto="1" extension_auto="1" />
|
11 |
+
<Option object_output="obj/Debug/" />
|
12 |
+
<Option type="1" />
|
13 |
+
<Option compiler="gcc" />
|
14 |
+
<Compiler>
|
15 |
+
<Add option="-g" />
|
16 |
+
</Compiler>
|
17 |
+
</Target>
|
18 |
+
<Target title="Release">
|
19 |
+
<Option output="bin/Release/GPT2" prefix_auto="1" extension_auto="1" />
|
20 |
+
<Option object_output="obj/Release/" />
|
21 |
+
<Option type="1" />
|
22 |
+
<Option compiler="gcc" />
|
23 |
+
<Compiler>
|
24 |
+
<Add option="-O2" />
|
25 |
+
</Compiler>
|
26 |
+
<Linker>
|
27 |
+
<Add option="-s" />
|
28 |
+
</Linker>
|
29 |
+
</Target>
|
30 |
+
</Build>
|
31 |
+
<Compiler>
|
32 |
+
<Add option="-Wall" />
|
33 |
+
<Add option="-fexceptions" />
|
34 |
+
</Compiler>
|
35 |
+
<Unit filename="GPT2.cbp" />
|
36 |
+
<Unit filename="GPT2.layout" />
|
37 |
+
<Unit filename="common-ggml.cpp" />
|
38 |
+
<Unit filename="common-ggml.h" />
|
39 |
+
<Unit filename="common.cpp" />
|
40 |
+
<Unit filename="common.h" />
|
41 |
+
<Unit filename="dr_wav.h" />
|
42 |
+
<Unit filename="ggml-aarch64.c">
|
43 |
+
<Option compilerVar="CC" />
|
44 |
+
</Unit>
|
45 |
+
<Unit filename="ggml-aarch64.h" />
|
46 |
+
<Unit filename="ggml-common.h" />
|
47 |
+
<Unit filename="ggml-cpu-impl.h" />
|
48 |
+
<Unit filename="ggml-impl.h" />
|
49 |
+
<Unit filename="ggml-quants.c">
|
50 |
+
<Option compilerVar="CC" />
|
51 |
+
</Unit>
|
52 |
+
<Unit filename="ggml-quants.h" />
|
53 |
+
<Unit filename="ggml.c">
|
54 |
+
<Option compilerVar="CC" />
|
55 |
+
</Unit>
|
56 |
+
<Unit filename="ggml.h" />
|
57 |
+
<Unit filename="main-ctx.cpp" />
|
58 |
+
<Unit filename="quantize.cpp" />
|
59 |
+
<Extensions>
|
60 |
+
<lib_finder disable_auto="1" />
|
61 |
+
</Extensions>
|
62 |
+
</Project>
|
63 |
+
</CodeBlocks_project_file>
|
GPT2.cscope_file_list
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-alloc.h"
|
2 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-alloc.c"
|
3 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-impl.h"
|
4 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common-ggml.cpp"
|
5 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-quants.c"
|
6 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-aarch64.h"
|
7 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common.cpp"
|
8 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-quants.h"
|
9 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-backend-impl.h"
|
10 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml.h"
|
11 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-common.h"
|
12 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml.c"
|
13 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-backend.h"
|
14 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common.h"
|
15 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\GPT2.cbp"
|
16 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common-ggml.h"
|
17 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-cpu-impl.h"
|
18 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-aarch64.c"
|
19 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\GPT2.layout"
|
20 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-backend.cpp"
|
21 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\dr_wav.h"
|
22 |
+
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\main-ctx.cpp"
|
GPT2.depend
ADDED
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# depslib dependency file v1.0
|
2 |
+
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\common-ggml.cpp
|
3 |
+
"common-ggml.h"
|
4 |
+
<regex>
|
5 |
+
<map>
|
6 |
+
|
7 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\common-ggml.h
|
8 |
+
"ggml.h"
|
9 |
+
<fstream>
|
10 |
+
<vector>
|
11 |
+
<string>
|
12 |
+
|
13 |
+
1730691388 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml.h
|
14 |
+
<stdbool.h>
|
15 |
+
<stddef.h>
|
16 |
+
<stdint.h>
|
17 |
+
<stdio.h>
|
18 |
+
|
19 |
+
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-alloc.c
|
20 |
+
"ggml-alloc.h"
|
21 |
+
"ggml-backend-impl.h"
|
22 |
+
"ggml.h"
|
23 |
+
"ggml-impl.h"
|
24 |
+
<assert.h>
|
25 |
+
<limits.h>
|
26 |
+
<stdarg.h>
|
27 |
+
<stdio.h>
|
28 |
+
<stdlib.h>
|
29 |
+
<string.h>
|
30 |
+
|
31 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-alloc.h
|
32 |
+
"ggml.h"
|
33 |
+
|
34 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-backend-impl.h
|
35 |
+
"ggml-backend.h"
|
36 |
+
|
37 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-backend.h
|
38 |
+
"ggml.h"
|
39 |
+
"ggml-alloc.h"
|
40 |
+
|
41 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-impl.h
|
42 |
+
"ggml.h"
|
43 |
+
<assert.h>
|
44 |
+
<stdlib.h>
|
45 |
+
<stdbool.h>
|
46 |
+
<stdint.h>
|
47 |
+
|
48 |
+
1730735604 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\common.cpp
|
49 |
+
"common.h"
|
50 |
+
"dr_wav.h"
|
51 |
+
<cmath>
|
52 |
+
<cstring>
|
53 |
+
<fstream>
|
54 |
+
<regex>
|
55 |
+
<locale>
|
56 |
+
<codecvt>
|
57 |
+
<sstream>
|
58 |
+
<fcntl.h>
|
59 |
+
<io.h>
|
60 |
+
|
61 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\common.h
|
62 |
+
<string>
|
63 |
+
<map>
|
64 |
+
<vector>
|
65 |
+
<random>
|
66 |
+
<thread>
|
67 |
+
<ctime>
|
68 |
+
<fstream>
|
69 |
+
<sstream>
|
70 |
+
|
71 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\dr_wav.h
|
72 |
+
"dr_wav.h"
|
73 |
+
<stddef.h>
|
74 |
+
<stdlib.h>
|
75 |
+
<string.h>
|
76 |
+
<limits.h>
|
77 |
+
<stdio.h>
|
78 |
+
<wchar.h>
|
79 |
+
<assert.h>
|
80 |
+
<errno.h>
|
81 |
+
|
82 |
+
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-backend.cpp
|
83 |
+
<windows.h>
|
84 |
+
"ggml-backend-impl.h"
|
85 |
+
"ggml-alloc.h"
|
86 |
+
"ggml-impl.h"
|
87 |
+
<assert.h>
|
88 |
+
<limits.h>
|
89 |
+
<stdarg.h>
|
90 |
+
<stdio.h>
|
91 |
+
<stdlib.h>
|
92 |
+
<string.h>
|
93 |
+
<string>
|
94 |
+
<vector>
|
95 |
+
<sys/types.h>
|
96 |
+
<sys/sysctl.h>
|
97 |
+
"ggml-cuda.h"
|
98 |
+
"ggml-metal.h"
|
99 |
+
"ggml-sycl.h"
|
100 |
+
"ggml-vulkan.h"
|
101 |
+
"ggml-blas.h"
|
102 |
+
"ggml-rpc.h"
|
103 |
+
"ggml-amx.h"
|
104 |
+
"ggml-cann.h"
|
105 |
+
<hbwmalloc.h>
|
106 |
+
|
107 |
+
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-aarch64.c
|
108 |
+
"ggml-common.h"
|
109 |
+
"ggml-quants.h"
|
110 |
+
"ggml-impl.h"
|
111 |
+
"ggml-cpu-impl.h"
|
112 |
+
<math.h>
|
113 |
+
<string.h>
|
114 |
+
<assert.h>
|
115 |
+
<float.h>
|
116 |
+
<stdlib.h>
|
117 |
+
<stdio.h>
|
118 |
+
"ggml-aarch64.h"
|
119 |
+
|
120 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-common.h
|
121 |
+
<stdint.h>
|
122 |
+
<metal_stdlib>
|
123 |
+
<musa_fp16.h>
|
124 |
+
<cuda_fp16.h>
|
125 |
+
<cstdint>
|
126 |
+
<hip/hip_fp16.h>
|
127 |
+
<cstdint>
|
128 |
+
<sycl/half_type.hpp>
|
129 |
+
<cstdint>
|
130 |
+
<stdint.h>
|
131 |
+
<metal_stdlib>
|
132 |
+
<cstdint>
|
133 |
+
<cstdint>
|
134 |
+
|
135 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-quants.h
|
136 |
+
"ggml-common.h"
|
137 |
+
"ggml.h"
|
138 |
+
|
139 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-cpu-impl.h
|
140 |
+
"ggml.h"
|
141 |
+
"ggml-impl.h"
|
142 |
+
<stdlib.h>
|
143 |
+
<stdbool.h>
|
144 |
+
<string.h>
|
145 |
+
<math.h>
|
146 |
+
<arm_sve.h>
|
147 |
+
<sys/prctl.h>
|
148 |
+
<arm_neon.h>
|
149 |
+
<wasm_simd128.h>
|
150 |
+
<altivec.h>
|
151 |
+
<intrin.h>
|
152 |
+
<immintrin.h>
|
153 |
+
<riscv_vector.h>
|
154 |
+
<lasxintrin.h>
|
155 |
+
<lsxintrin.h>
|
156 |
+
<arm_sve.h>
|
157 |
+
|
158 |
+
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-aarch64.h
|
159 |
+
"ggml-common.h"
|
160 |
+
"ggml.h"
|
161 |
+
|
162 |
+
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-quants.c
|
163 |
+
"ggml-common.h"
|
164 |
+
"ggml-quants.h"
|
165 |
+
"ggml-impl.h"
|
166 |
+
"ggml-cpu-impl.h"
|
167 |
+
<math.h>
|
168 |
+
<string.h>
|
169 |
+
<assert.h>
|
170 |
+
<float.h>
|
171 |
+
<stdlib.h>
|
172 |
+
<stdio.h>
|
173 |
+
|
174 |
+
1730734998 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml.c
|
175 |
+
"ggml-impl.h"
|
176 |
+
"ggml-cpu-impl.h"
|
177 |
+
"ggml-quants.h"
|
178 |
+
"ggml.h"
|
179 |
+
"ggml-aarch64.h"
|
180 |
+
<malloc.h>
|
181 |
+
<alloca.h>
|
182 |
+
<assert.h>
|
183 |
+
<errno.h>
|
184 |
+
<time.h>
|
185 |
+
<math.h>
|
186 |
+
<stdlib.h>
|
187 |
+
<string.h>
|
188 |
+
<stdint.h>
|
189 |
+
<inttypes.h>
|
190 |
+
<stdio.h>
|
191 |
+
<float.h>
|
192 |
+
<limits.h>
|
193 |
+
<stdarg.h>
|
194 |
+
<signal.h>
|
195 |
+
<syscall.h>
|
196 |
+
<omp.h>
|
197 |
+
<llamafile/sgemm.h>
|
198 |
+
<windows.h>
|
199 |
+
<stdatomic.h>
|
200 |
+
<pthread.h>
|
201 |
+
<stdatomic.h>
|
202 |
+
<sched.h>
|
203 |
+
<pthread_np.h>
|
204 |
+
<sys/types.h>
|
205 |
+
<sys/stat.h>
|
206 |
+
<unistd.h>
|
207 |
+
<hbwmalloc.h>
|
208 |
+
<unistd.h>
|
209 |
+
<mach/mach.h>
|
210 |
+
<TargetConditionals.h>
|
211 |
+
<sys/wait.h>
|
212 |
+
<unwind.h>
|
213 |
+
<dlfcn.h>
|
214 |
+
<stdio.h>
|
215 |
+
<execinfo.h>
|
216 |
+
<Accelerate/Accelerate.h>
|
217 |
+
<sys/auxv.h>
|
218 |
+
<sys/sysctl.h>
|
219 |
+
"windows.h"
|
220 |
+
<sys/types.h>
|
221 |
+
<sys/resource.h>
|
222 |
+
|
223 |
+
1730683892 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\main-alloc.cpp
|
224 |
+
|
225 |
+
1730737838 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\main-ctx.cpp
|
226 |
+
|
227 |
+
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\quantize.cpp
|
228 |
+
"ggml.h"
|
229 |
+
"common.h"
|
230 |
+
"common-ggml.h"
|
231 |
+
<cassert>
|
232 |
+
<cmath>
|
233 |
+
<cstdio>
|
234 |
+
<cstring>
|
235 |
+
<fstream>
|
236 |
+
<map>
|
237 |
+
<string>
|
238 |
+
<vector>
|
239 |
+
<regex>
|
240 |
+
|
GPT2.layout
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
2 |
+
<CodeBlocks_layout_file>
|
3 |
+
<FileVersion major="1" minor="0" />
|
4 |
+
<ActiveTarget name="Debug" />
|
5 |
+
<File name="ggml-impl.h" open="1" top="0" tabpos="10" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
6 |
+
<Cursor>
|
7 |
+
<Cursor1 position="6388" topLine="0" />
|
8 |
+
</Cursor>
|
9 |
+
</File>
|
10 |
+
<File name="common-ggml.cpp" open="1" top="0" tabpos="4" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
11 |
+
<Cursor>
|
12 |
+
<Cursor1 position="223" topLine="135" />
|
13 |
+
</Cursor>
|
14 |
+
</File>
|
15 |
+
<File name="ggml-quants.c" open="1" top="0" tabpos="11" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
16 |
+
<Cursor>
|
17 |
+
<Cursor1 position="2705" topLine="0" />
|
18 |
+
</Cursor>
|
19 |
+
</File>
|
20 |
+
<File name="ggml-aarch64.h" open="1" top="0" tabpos="8" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
21 |
+
<Cursor>
|
22 |
+
<Cursor1 position="1519" topLine="0" />
|
23 |
+
</Cursor>
|
24 |
+
</File>
|
25 |
+
<File name="common.cpp" open="1" top="0" tabpos="5" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
26 |
+
<Cursor>
|
27 |
+
<Cursor1 position="152" topLine="0" />
|
28 |
+
</Cursor>
|
29 |
+
</File>
|
30 |
+
<File name="ggml-quants.h" open="1" top="0" tabpos="13" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
31 |
+
<Cursor>
|
32 |
+
<Cursor1 position="0" topLine="128" />
|
33 |
+
</Cursor>
|
34 |
+
</File>
|
35 |
+
<File name="quantize.cpp" open="1" top="1" tabpos="15" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
36 |
+
<Cursor>
|
37 |
+
<Cursor1 position="4241" topLine="139" />
|
38 |
+
</Cursor>
|
39 |
+
</File>
|
40 |
+
<File name="ggml.h" open="1" top="0" tabpos="1" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
41 |
+
<Cursor>
|
42 |
+
<Cursor1 position="8069" topLine="212" />
|
43 |
+
</Cursor>
|
44 |
+
</File>
|
45 |
+
<File name="ggml-common.h" open="1" top="0" tabpos="14" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
46 |
+
<Cursor>
|
47 |
+
<Cursor1 position="0" topLine="0" />
|
48 |
+
</Cursor>
|
49 |
+
</File>
|
50 |
+
<File name="ggml.c" open="1" top="0" tabpos="6" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
51 |
+
<Cursor>
|
52 |
+
<Cursor1 position="522" topLine="0" />
|
53 |
+
</Cursor>
|
54 |
+
</File>
|
55 |
+
<File name="common.h" open="1" top="0" tabpos="2" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
56 |
+
<Cursor>
|
57 |
+
<Cursor1 position="0" topLine="0" />
|
58 |
+
</Cursor>
|
59 |
+
</File>
|
60 |
+
<File name="common-ggml.h" open="1" top="0" tabpos="3" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
61 |
+
<Cursor>
|
62 |
+
<Cursor1 position="141" topLine="0" />
|
63 |
+
</Cursor>
|
64 |
+
</File>
|
65 |
+
<File name="ggml-cpu-impl.h" open="1" top="0" tabpos="9" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
66 |
+
<Cursor>
|
67 |
+
<Cursor1 position="0" topLine="0" />
|
68 |
+
</Cursor>
|
69 |
+
</File>
|
70 |
+
<File name="ggml-aarch64.c" open="1" top="0" tabpos="7" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
71 |
+
<Cursor>
|
72 |
+
<Cursor1 position="442" topLine="0" />
|
73 |
+
</Cursor>
|
74 |
+
</File>
|
75 |
+
<File name="main-ctx.cpp" open="1" top="0" tabpos="12" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
|
76 |
+
<Cursor>
|
77 |
+
<Cursor1 position="114" topLine="659" />
|
78 |
+
</Cursor>
|
79 |
+
</File>
|
80 |
+
</CodeBlocks_layout_file>
|
common-ggml.cpp
ADDED
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include "common-ggml.h"
|
2 |
+
|
3 |
+
#include <regex>
|
4 |
+
#include <map>
|
5 |
+
|
6 |
+
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
|
7 |
+
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
|
8 |
+
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
|
9 |
+
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
|
10 |
+
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
|
11 |
+
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
|
12 |
+
{"q2_k", GGML_FTYPE_MOSTLY_Q2_K},
|
13 |
+
{"q3_k", GGML_FTYPE_MOSTLY_Q3_K},
|
14 |
+
{"q4_k", GGML_FTYPE_MOSTLY_Q4_K},
|
15 |
+
{"q5_k", GGML_FTYPE_MOSTLY_Q5_K},
|
16 |
+
{"q6_k", GGML_FTYPE_MOSTLY_Q6_K},
|
17 |
+
};
|
18 |
+
|
19 |
+
void ggml_print_ftypes(FILE * fp) {
|
20 |
+
for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
|
21 |
+
fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
|
22 |
+
}
|
23 |
+
}
|
24 |
+
|
25 |
+
enum ggml_ftype ggml_parse_ftype(const char * str) {
|
26 |
+
enum ggml_ftype ftype;
|
27 |
+
if (str[0] == 'q') {
|
28 |
+
const auto it = GGML_FTYPE_MAP.find(str);
|
29 |
+
if (it == GGML_FTYPE_MAP.end()) {
|
30 |
+
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
|
31 |
+
return GGML_FTYPE_UNKNOWN;
|
32 |
+
}
|
33 |
+
ftype = it->second;
|
34 |
+
} else {
|
35 |
+
ftype = (enum ggml_ftype) atoi(str);
|
36 |
+
}
|
37 |
+
|
38 |
+
return ftype;
|
39 |
+
}
|
40 |
+
|
41 |
+
bool ggml_common_quantize_0(
|
42 |
+
std::ifstream & finp,
|
43 |
+
std::ofstream & fout,
|
44 |
+
const ggml_ftype ftype,
|
45 |
+
const std::vector<std::string> & to_quant,
|
46 |
+
const std::vector<std::string> & to_skip) {
|
47 |
+
|
48 |
+
ggml_type qtype = GGML_TYPE_F32;
|
49 |
+
|
50 |
+
switch (ftype) {
|
51 |
+
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
|
52 |
+
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
|
53 |
+
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
|
54 |
+
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
|
55 |
+
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
|
56 |
+
case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break;
|
57 |
+
case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break;
|
58 |
+
case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break;
|
59 |
+
case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break;
|
60 |
+
case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break;
|
61 |
+
case GGML_FTYPE_UNKNOWN:
|
62 |
+
case GGML_FTYPE_ALL_F32:
|
63 |
+
case GGML_FTYPE_MOSTLY_F16:
|
64 |
+
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
65 |
+
case GGML_FTYPE_MOSTLY_IQ2_XXS:
|
66 |
+
case GGML_FTYPE_MOSTLY_IQ2_XS:
|
67 |
+
case GGML_FTYPE_MOSTLY_IQ2_S:
|
68 |
+
case GGML_FTYPE_MOSTLY_IQ3_XXS:
|
69 |
+
case GGML_FTYPE_MOSTLY_IQ3_S:
|
70 |
+
case GGML_FTYPE_MOSTLY_IQ1_S:
|
71 |
+
case GGML_FTYPE_MOSTLY_IQ4_NL:
|
72 |
+
case GGML_FTYPE_MOSTLY_IQ4_XS:
|
73 |
+
case GGML_FTYPE_MOSTLY_IQ1_M:
|
74 |
+
case GGML_FTYPE_MOSTLY_BF16:
|
75 |
+
case GGML_FTYPE_MOSTLY_Q4_0_4_4:
|
76 |
+
case GGML_FTYPE_MOSTLY_Q4_0_4_8:
|
77 |
+
case GGML_FTYPE_MOSTLY_Q4_0_8_8:
|
78 |
+
{
|
79 |
+
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
|
80 |
+
return false;
|
81 |
+
}
|
82 |
+
};
|
83 |
+
|
84 |
+
if (!ggml_is_quantized(qtype)) {
|
85 |
+
fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
|
86 |
+
return false;
|
87 |
+
}
|
88 |
+
|
89 |
+
size_t total_size_org = 0;
|
90 |
+
size_t total_size_new = 0;
|
91 |
+
|
92 |
+
std::vector<float> work;
|
93 |
+
|
94 |
+
std::vector<uint8_t> data_u8;
|
95 |
+
std::vector<ggml_fp16_t> data_f16;
|
96 |
+
std::vector<float> data_f32;
|
97 |
+
|
98 |
+
while (true) {
|
99 |
+
int32_t n_dims;
|
100 |
+
int32_t length;
|
101 |
+
int32_t ttype;
|
102 |
+
|
103 |
+
finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
104 |
+
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
|
105 |
+
finp.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
106 |
+
|
107 |
+
if (finp.eof()) {
|
108 |
+
break;
|
109 |
+
}
|
110 |
+
|
111 |
+
int32_t nelements = 1;
|
112 |
+
int32_t ne[4] = { 1, 1, 1, 1 };
|
113 |
+
for (int i = 0; i < n_dims; ++i) {
|
114 |
+
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
115 |
+
nelements *= ne[i];
|
116 |
+
}
|
117 |
+
|
118 |
+
std::string name(length, 0);
|
119 |
+
finp.read (&name[0], length);
|
120 |
+
|
121 |
+
printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));
|
122 |
+
|
123 |
+
bool quantize = false;
|
124 |
+
|
125 |
+
// check if we should quantize this tensor
|
126 |
+
for (const auto & s : to_quant) {
|
127 |
+
if (std::regex_match(name, std::regex(s))) {
|
128 |
+
quantize = true;
|
129 |
+
break;
|
130 |
+
}
|
131 |
+
}
|
132 |
+
|
133 |
+
// check if we should skip this tensor
|
134 |
+
for (const auto & s : to_skip) {
|
135 |
+
if (std::regex_match(name, std::regex(s))) {
|
136 |
+
quantize = false;
|
137 |
+
break;
|
138 |
+
}
|
139 |
+
}
|
140 |
+
|
141 |
+
// quantize only 2D tensors
|
142 |
+
quantize &= (n_dims == 2);
|
143 |
+
|
144 |
+
if (quantize) {
|
145 |
+
if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
|
146 |
+
fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
147 |
+
return false;
|
148 |
+
}
|
149 |
+
|
150 |
+
if (ttype == GGML_TYPE_F16) {
|
151 |
+
data_f16.resize(nelements);
|
152 |
+
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
|
153 |
+
data_f32.resize(nelements);
|
154 |
+
for (int i = 0; i < nelements; ++i) {
|
155 |
+
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
|
156 |
+
}
|
157 |
+
} else {
|
158 |
+
data_f32.resize(nelements);
|
159 |
+
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
|
160 |
+
}
|
161 |
+
|
162 |
+
ttype = qtype;
|
163 |
+
} else {
|
164 |
+
const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
|
165 |
+
|
166 |
+
data_u8.resize(nelements*bpe);
|
167 |
+
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
|
168 |
+
}
|
169 |
+
|
170 |
+
fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
171 |
+
fout.write(reinterpret_cast<char *>(&length), sizeof(length));
|
172 |
+
fout.write(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
173 |
+
for (int i = 0; i < n_dims; ++i) {
|
174 |
+
fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
175 |
+
}
|
176 |
+
fout.write(&name[0], length);
|
177 |
+
|
178 |
+
if (quantize) {
|
179 |
+
work.resize(nelements); // for quantization
|
180 |
+
|
181 |
+
size_t cur_size = 0;
|
182 |
+
switch ((ggml_type) ttype) {
|
183 |
+
case GGML_TYPE_Q4_0:
|
184 |
+
case GGML_TYPE_Q4_1:
|
185 |
+
case GGML_TYPE_Q5_0:
|
186 |
+
case GGML_TYPE_Q5_1:
|
187 |
+
case GGML_TYPE_Q8_0:
|
188 |
+
case GGML_TYPE_Q2_K:
|
189 |
+
case GGML_TYPE_Q3_K:
|
190 |
+
case GGML_TYPE_Q4_K:
|
191 |
+
case GGML_TYPE_Q5_K:
|
192 |
+
case GGML_TYPE_Q6_K:
|
193 |
+
{
|
194 |
+
cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], nullptr);
|
195 |
+
} break;
|
196 |
+
case GGML_TYPE_F32:
|
197 |
+
case GGML_TYPE_F16:
|
198 |
+
case GGML_TYPE_I8:
|
199 |
+
case GGML_TYPE_I16:
|
200 |
+
case GGML_TYPE_I32:
|
201 |
+
case GGML_TYPE_I64:
|
202 |
+
case GGML_TYPE_F64:
|
203 |
+
case GGML_TYPE_Q8_1:
|
204 |
+
case GGML_TYPE_Q8_K:
|
205 |
+
case GGML_TYPE_IQ2_XXS:
|
206 |
+
case GGML_TYPE_IQ2_XS:
|
207 |
+
case GGML_TYPE_IQ2_S:
|
208 |
+
case GGML_TYPE_IQ3_XXS:
|
209 |
+
case GGML_TYPE_IQ3_S:
|
210 |
+
case GGML_TYPE_IQ1_S:
|
211 |
+
case GGML_TYPE_IQ4_NL:
|
212 |
+
case GGML_TYPE_IQ4_XS:
|
213 |
+
case GGML_TYPE_IQ1_M:
|
214 |
+
case GGML_TYPE_BF16:
|
215 |
+
case GGML_TYPE_Q4_0_4_4:
|
216 |
+
case GGML_TYPE_Q4_0_4_8:
|
217 |
+
case GGML_TYPE_Q4_0_8_8:
|
218 |
+
case GGML_TYPE_TQ1_0:
|
219 |
+
case GGML_TYPE_TQ2_0:
|
220 |
+
case GGML_TYPE_COUNT:
|
221 |
+
{
|
222 |
+
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
223 |
+
return false;
|
224 |
+
}
|
225 |
+
}
|
226 |
+
|
227 |
+
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
|
228 |
+
total_size_new += cur_size;
|
229 |
+
|
230 |
+
printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
|
231 |
+
} else {
|
232 |
+
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
|
233 |
+
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
|
234 |
+
total_size_new += data_u8.size();
|
235 |
+
}
|
236 |
+
|
237 |
+
total_size_org += nelements * sizeof(float);
|
238 |
+
}
|
239 |
+
|
240 |
+
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
241 |
+
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
|
242 |
+
|
243 |
+
return true;
|
244 |
+
}
|
common-ggml.h
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#pragma once
|
2 |
+
|
3 |
+
#include "ggml.h"
|
4 |
+
|
5 |
+
#include <fstream>
|
6 |
+
#include <vector>
|
7 |
+
#include <string>
|
8 |
+
|
9 |
+
enum ggml_ftype ggml_parse_ftype(const char * str);
|
10 |
+
|
11 |
+
void ggml_print_ftypes(FILE * fp = stderr);
|
12 |
+
|
13 |
+
bool ggml_common_quantize_0(
|
14 |
+
std::ifstream & finp,
|
15 |
+
std::ofstream & fout,
|
16 |
+
const ggml_ftype ftype,
|
17 |
+
const std::vector<std::string> & to_quant,
|
18 |
+
const std::vector<std::string> & to_skip);
|
common.cpp
ADDED
@@ -0,0 +1,911 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#define _USE_MATH_DEFINES // for M_PI
|
2 |
+
|
3 |
+
#include "common.h"
|
4 |
+
|
5 |
+
// third-party utilities
|
6 |
+
// use your favorite implementations
|
7 |
+
#define DR_WAV_IMPLEMENTATION
|
8 |
+
#include "dr_wav.h"
|
9 |
+
|
10 |
+
#include <cmath>
|
11 |
+
#include <cstring>
|
12 |
+
#include <fstream>
|
13 |
+
#include <regex>
|
14 |
+
#include <locale>
|
15 |
+
#include <codecvt>
|
16 |
+
#include <sstream>
|
17 |
+
|
18 |
+
#if defined(_MSC_VER)
|
19 |
+
#pragma warning(disable: 4244 4267) // possible loss of data
|
20 |
+
#endif
|
21 |
+
|
22 |
+
#ifdef _WIN32
|
23 |
+
#include <fcntl.h>
|
24 |
+
#include <io.h>
|
25 |
+
#endif
|
26 |
+
|
27 |
+
#ifdef WHISPER_FFMPEG
|
28 |
+
// as implemented in ffmpeg_trancode.cpp only embedded in common lib if whisper built with ffmpeg support
|
29 |
+
extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
|
30 |
+
#endif
|
31 |
+
|
32 |
+
// Function to check if the next argument exists
|
33 |
+
static std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
|
34 |
+
if (i + 1 < argc && argv[i + 1][0] != '-') {
|
35 |
+
return argv[++i];
|
36 |
+
} else {
|
37 |
+
fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
|
38 |
+
gpt_print_usage(argc, argv, params);
|
39 |
+
exit(0);
|
40 |
+
}
|
41 |
+
}
|
42 |
+
|
43 |
+
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
44 |
+
for (int i = 1; i < argc; i++) {
|
45 |
+
std::string arg = argv[i];
|
46 |
+
|
47 |
+
if (arg == "-s" || arg == "--seed") {
|
48 |
+
params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
49 |
+
} else if (arg == "-t" || arg == "--threads") {
|
50 |
+
params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
51 |
+
} else if (arg == "-p" || arg == "--prompt") {
|
52 |
+
params.prompt = get_next_arg(i, argc, argv, arg, params);
|
53 |
+
} else if (arg == "-n" || arg == "--n_predict") {
|
54 |
+
params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
55 |
+
} else if (arg == "-np" || arg == "--n_parallel") {
|
56 |
+
params.n_parallel = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
57 |
+
} else if (arg == "--top_k") {
|
58 |
+
params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
59 |
+
} else if (arg == "--top_p") {
|
60 |
+
params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
|
61 |
+
} else if (arg == "--temp") {
|
62 |
+
params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
|
63 |
+
} else if (arg == "--repeat-last-n") {
|
64 |
+
params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
65 |
+
} else if (arg == "--repeat-penalty") {
|
66 |
+
params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params));
|
67 |
+
} else if (arg == "-b" || arg == "--batch_size") {
|
68 |
+
params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params));
|
69 |
+
} else if (arg == "-c" || arg == "--context") {
|
70 |
+
params.n_ctx= std::stoi(get_next_arg(i, argc, argv, arg, params));
|
71 |
+
} else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
|
72 |
+
params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
73 |
+
} else if (arg == "--ignore-eos") {
|
74 |
+
params.ignore_eos = true;
|
75 |
+
} else if (arg == "-m" || arg == "--model") {
|
76 |
+
params.model = get_next_arg(i, argc, argv, arg, params);
|
77 |
+
} else if (arg == "-i" || arg == "--interactive") {
|
78 |
+
params.interactive = true;
|
79 |
+
} else if (arg == "-ip" || arg == "--interactive-port") {
|
80 |
+
params.interactive = true;
|
81 |
+
params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params));
|
82 |
+
} else if (arg == "-h" || arg == "--help") {
|
83 |
+
gpt_print_usage(argc, argv, params);
|
84 |
+
exit(0);
|
85 |
+
} else if (arg == "-f" || arg == "--file") {
|
86 |
+
get_next_arg(i, argc, argv, arg, params);
|
87 |
+
std::ifstream file(argv[i]);
|
88 |
+
if (!file) {
|
89 |
+
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
90 |
+
break;
|
91 |
+
}
|
92 |
+
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
93 |
+
if (params.prompt.back() == '\n') {
|
94 |
+
params.prompt.pop_back();
|
95 |
+
}
|
96 |
+
} else if (arg == "-tt" || arg == "--token_test") {
|
97 |
+
params.token_test = get_next_arg(i, argc, argv, arg, params);
|
98 |
+
}
|
99 |
+
else {
|
100 |
+
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
101 |
+
gpt_print_usage(argc, argv, params);
|
102 |
+
exit(0);
|
103 |
+
}
|
104 |
+
}
|
105 |
+
|
106 |
+
return true;
|
107 |
+
}
|
108 |
+
|
109 |
+
// Print the command-line help for the GPT example programs to stderr.
// Defaults shown are taken from the provided `params`.
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help            show this help message and exit\n");
    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
    fprintf(stderr, "                        prompt to start generation with (default: random)\n");
    fprintf(stderr, "  -f FNAME, --file FNAME\n");
    fprintf(stderr, "                        load prompt from a file\n");
    fprintf(stderr, "  -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
    fprintf(stderr, "                        test tokenization\n");
    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, "  -c N, --context N     context / KV cache size (default: %d)\n", params.n_ctx);
    fprintf(stderr, "  --ignore-eos          ignore EOS token during generation\n");
    fprintf(stderr, "  -ngl N, --gpu-layers N  number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "\n");
}
|
136 |
+
|
137 |
+
// Return a random short prompt to seed generation when the user provides none.
std::string gpt_random_prompt(std::mt19937 & rng) {
    static const char * k_prompts[10] = {
        "So", "Once upon a time", "When", "The", "After",
        "If", "import", "He", "She", "They",
    };
    // rng() % 10 always lands in [0, 9], so every prompt is reachable
    return k_prompts[rng() % 10];
}
|
154 |
+
|
155 |
+
// Strip leading and trailing whitespace (space, \t, \n, \r, \f, \v) from `s`.
std::string trim(const std::string & s) {
    const char * ws = " \t\n\r\f\v";
    const size_t first = s.find_first_not_of(ws);
    if (first == std::string::npos) {
        return ""; // all whitespace (or empty)
    }
    const size_t last = s.find_last_not_of(ws);
    return s.substr(first, last - first + 1);
}
|
159 |
+
|
160 |
+
// Return a copy of `s` with every occurrence of `from` replaced by `to`.
// Scanning resumes just past each inserted replacement, so the replacement
// text itself is never re-matched.
std::string replace(const std::string & s, const std::string & from, const std::string & to) {
    std::string out = s;
    for (size_t at = out.find(from); at != std::string::npos; at = out.find(from, at + to.size())) {
        out.replace(at, from.size(), to);
    }
    return out;
}
|
169 |
+
|
170 |
+
// Register `token` as a special token; gpt_tokenize() matches special tokens
// verbatim before applying the regular word-splitting rules.
void gpt_vocab::add_special_token(const std::string & token) {
    special_tokens.push_back(token);
}
|
173 |
+
|
174 |
+
// Parse a flat JSON object of the form {"token": id, ...} into a
// token -> id map. This is a hand-rolled scanner, not a general JSON parser:
// it only handles one level of string keys with integer (or quoted) values,
// as produced by GPT-2 style vocab files. Exits the process if the file
// cannot be opened; returns an empty map if the file does not start with '{'.
std::map<std::string, int32_t> json_parse(const std::string & fname) {
    std::map<std::string, int32_t> result;

    // read file into string
    std::string json;
    {
        std::ifstream ifs(fname);
        if (!ifs) {
            fprintf(stderr, "Failed to open %s\n", fname.c_str());
            exit(1);
        }

        json = std::string((std::istreambuf_iterator<char>(ifs)),
                (std::istreambuf_iterator<char>()));
    }

    if (json[0] != '{') {
        return result;
    }

    // parse json
    {
        bool has_key = false;   // true while scanning a value (key already read)
        bool in_token = false;  // true while inside a quoted string

        std::string str_key = "";
        std::string str_val = "";

        int n = json.size();
        for (int i = 1; i < n; ++i) {
            if (!in_token) {
                if (json[i] == ' ') continue;
                if (json[i] == '"') {
                    in_token = true;
                    continue;
                }
            } else {
                if (json[i] == '\\' && i+1 < n) {
                    // keep the backslash verbatim, then fall through below
                    // to append the escaped character as well
                    if (has_key == false) {
                        str_key += json[i];
                    } else {
                        str_val += json[i];
                    }
                    ++i;
                } else if (json[i] == '"') {
                    if (has_key == false) {
                        // closing quote of a key: skip spaces and ':' and
                        // decide whether the value is quoted or bare
                        has_key = true;
                        ++i;
                        while (json[i] == ' ') ++i;
                        ++i; // :
                        while (json[i] == ' ') ++i;
                        if (json[i] != '\"') {
                            // bare (numeric) value: read until ',' or '}'
                            while (json[i] != ',' && json[i] != '}') {
                                str_val += json[i++];
                            }
                            has_key = false;
                        } else {
                            // quoted value: re-enter string scanning
                            in_token = true;
                            continue;
                        }
                    } else {
                        has_key = false;
                    }

                    // undo GPT-2 byte-level encodings used in vocab files
                    str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
                    str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
                    str_key = ::replace(str_key, "\\\"", "\""); // \\\" -> "

                    try {
                        result[str_key] = std::stoi(str_val);
                    } catch (...) {
                        // non-integer value: silently skip this entry
                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());

                    }
                    str_key = "";
                    str_val = "";
                    in_token = false;
                    continue;
                }
                // ordinary character (or the character after a backslash):
                // append to whichever of key/value is being scanned
                if (has_key == false) {
                    str_key += json[i];
                } else {
                    str_val += json[i];
                }
            }
        }
    }

    return result;
}
|
264 |
+
|
265 |
+
// Convert a wide string to a UTF-8 encoded narrow string.
// NOTE(review): std::wstring_convert / std::codecvt_utf8 are deprecated
// since C++17; kept for compatibility with the existing code base.
std::string convert_to_utf8(const std::wstring & input) {
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    return converter.to_bytes(input);
}
|
269 |
+
|
270 |
+
|
271 |
+
// Convert a UTF-8 encoded narrow string to a wide string.
// NOTE(review): std::wstring_convert / std::codecvt_utf8 are deprecated
// since C++17; throws std::range_error on invalid UTF-8 input.
std::wstring convert_to_wstring(const std::string & input) {
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    return converter.from_bytes(input);
}
|
275 |
+
|
276 |
+
// Split `str` into GPT-2 style word pieces (contractions, letter runs,
// digit runs, punctuation runs, whitespace) and append them to `words`.
// The pattern mirrors the original GPT-2 BPE pre-tokenizer regex.
void gpt_split_words(std::string str, std::vector<std::string>& words) {
    const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
    const std::regex re(pattern);
    std::smatch m;

    while (std::regex_search(str, m, re)) {
        // the pattern has no capture groups, so this appends just the full match
        for (auto x : m) {
            words.push_back(x);
        }
        // continue scanning after the match
        str = m.suffix();
    }
}
|
288 |
+
|
289 |
+
// Tokenize `text` into vocab ids: first split out special tokens verbatim,
// then split the remaining text with GPT-2 word-splitting rules, and finally
// encode each word by greedy longest-prefix match against the vocabulary.
// Unknown single characters are reported to stderr and skipped.
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
    std::vector<std::string> words;

    // first split the text into words
    {
        std::string str = text;

        // Generate the subpattern from the special_tokens vector if it's not empty
        if (!vocab.special_tokens.empty()) {
            // escape regex metacharacters in the special tokens before
            // joining them into an alternation pattern
            const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
            std::string special_tokens_subpattern;
            for (const auto & token : vocab.special_tokens) {
                if (!special_tokens_subpattern.empty()) {
                    special_tokens_subpattern += "|";
                }
                special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
            }

            std::regex re(special_tokens_subpattern);
            std::smatch m;
            // Split the text by special tokens.
            while (std::regex_search(str, m, re)) {
                // Split the substrings in-between special tokens into words.
                gpt_split_words(m.prefix(), words);
                // Add matched special tokens as words.
                for (auto x : m) {
                    words.push_back(x);
                }
                str = m.suffix();
            }
            // Remaining text without special tokens will be handled below.
        }

        gpt_split_words(str, words);
    }

    // find the longest token that forms each word in words:
    std::vector<gpt_vocab::id> tokens;
    for (const auto & word : words) {
        for (int i = 0; i < (int) word.size(); ){
            // try the longest suffix-free candidate first, shrinking by one
            // character until a vocab entry matches
            for (int j = word.size() - 1; j >= i; j--){
                auto cand = word.substr(i, j-i+1);
                auto it = vocab.token_to_id.find(cand);
                if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
                    tokens.push_back(it->second);
                    i = j + 1;
                    break;
                }
                else if (j == i){ // word.substr(i, 1) has no matching
                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
                    i++;
                }
            }
        }
    }

    return tokens;
}
|
347 |
+
|
348 |
+
static std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
|
349 |
+
std::vector<gpt_vocab::id> output;
|
350 |
+
std::stringstream ss(input);
|
351 |
+
std::string token;
|
352 |
+
|
353 |
+
while (std::getline(ss, token, delimiter)) {
|
354 |
+
output.push_back(std::stoi(token));
|
355 |
+
}
|
356 |
+
|
357 |
+
return output;
|
358 |
+
}
|
359 |
+
|
360 |
+
// Read tokenizer test cases from `fpath_test`. Each line has the form
//   <text> => <id>,<id>,...
// and is parsed into a map from the input text to its expected token ids.
// Lines without the " => " separator are skipped; an empty path yields an
// empty map.
static std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
    if (fpath_test.empty()){
        fprintf(stderr, "%s : No test file found.\n", __func__);
        return std::map<std::string, std::vector<gpt_vocab::id>>();
    }

    std::map<std::string, std::vector<gpt_vocab::id>> tests;

    auto fin = std::ifstream(fpath_test, std::ios_base::in);
    const char * delimeter = " => "; // separates text from the id list
    const char del_tok = ',';        // separates ids within the list
    std::string line;
    while (std::getline(fin, line)) {
        size_t delimiterPos = line.find(delimeter);
        if (delimiterPos != std::string::npos) {
            std::string text = line.substr(0, delimiterPos);
            std::string s_tokens = line.substr(delimiterPos + std::strlen(delimeter));
            tests[text] = parse_tokens_from_string(s_tokens, del_tok);
        }
    }
    return tests;
}
|
382 |
+
|
383 |
+
// Run gpt_tokenize() against the reference cases in `fpath_test` and print
// every mismatch (expected "hf" ids vs produced "ggml" ids) plus a summary.
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test){
    std::map<std::string, std::vector<gpt_vocab::id>> tests = extract_tests_from_file(fpath_test);

    size_t n_fails = 0;

    for (const auto & test : tests) {
        std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, test.first);

        if (tokens != test.second){
            n_fails++;

            // print out failure cases
            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test.first.c_str());
            fprintf(stderr, "%s : tokens in hf:   ", __func__);
            for (const auto & t : test.second) {
                fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
            }
            fprintf(stderr, "\n");
            fprintf(stderr, "%s : tokens in ggml: ", __func__);
            for (const auto & t : tokens) {
                fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
            }
            fprintf(stderr, "\n");
        }
    }

    fprintf(stderr, "%s : %zu tests failed out of %zu tests.\n", __func__, n_fails, tests.size());
}
|
411 |
+
|
412 |
+
// Load a GPT vocabulary from the JSON file `fname`, filling both the
// token->id and id->token maps of `vocab`. Exits the process if the file
// cannot be opened (via json_parse); otherwise always returns true.
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());

    vocab.token_to_id = ::json_parse(fname);

    // build the reverse id -> token mapping
    for (const auto & kv : vocab.token_to_id) {
        vocab.id_to_token[kv.second] = kv.first;
    }

    printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());

    // print the vocabulary
    //for (auto kv : vocab.token_to_id) {
    //    printf("'%s' -> %d\n", kv.first.data(), kv.second);
    //}

    return true;
}
|
430 |
+
|
431 |
+
// Sample a token id from `logits` using temperature scaling followed by
// top-k and nucleus (top-p) filtering.
// `top_k` is clamped into [1, n_logits]; values outside that range would
// previously make std::partial_sort iterate past the end of the buffer (UB).
gpt_vocab::id gpt_sample_top_k_top_p(
        const gpt_vocab & vocab,
        const float * logits,
        int    top_k,
        double top_p,
        double temp,
        std::mt19937 & rng) {
    int n_logits = vocab.id_to_token.size();

    // clamp top_k to a valid range
    if (top_k <= 0 || top_k > n_logits) {
        top_k = n_logits;
    }

    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const double scale = 1.0/temp;
        for (int i = 0; i < n_logits; ++i) {
            logits_id.push_back(std::make_pair(logits[i]*scale, i));
        }
    }

    // find the top K tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
        return a.first > b.first;
    });

    logits_id.resize(top_k);

    double maxl = -INFINITY;
    for (const auto & kv : logits_id) {
        maxl = std::max(maxl, kv.first);
    }

    // compute probs for the top K tokens (softmax, shifted by maxl for
    // numerical stability)
    std::vector<double> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        double p = exp(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    // nucleus filtering: keep the smallest prefix of tokens whose cumulative
    // probability reaches top_p, then renormalize
    if (top_p < 1.0f) {
        double cumsum = 0.0f;
        for (int i = 0; i < top_k; i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                top_k = i + 1;
                probs.resize(top_k);
                logits_id.resize(top_k);
                break;
            }
        }

        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;
}
|
510 |
+
|
511 |
+
// Sample a token id using temperature, top-k and top-p filtering, with a
// CTRL-style repetition penalty applied to any token appearing in the last
// `repeat_last_n` entries of `last_n_tokens_data`.
// temp <= 0 selects the argmax token deterministically.
// Fixes vs original: top_k is clamped to [1, n_logits] (out-of-range values
// made std::partial_sort read past the end - UB) and repeat_last_n is
// clamped to the window size (larger values made `end() - repeat_last_n`
// underflow the vector - UB).
gpt_vocab::id gpt_sample_top_k_top_p_repeat(
        const gpt_vocab & vocab,
        const float * logits,
        const int32_t * last_n_tokens_data,
        size_t last_n_tokens_data_size,
        int top_k,
        double top_p,
        double temp,
        int repeat_last_n,
        float repeat_penalty,
        std::mt19937 & rng) {

    int n_logits = vocab.id_to_token.size();

    const auto * plogits = logits;

    const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size);

    if (temp <= 0) {
        // select the token with the highest logit directly
        float max_logit = plogits[0];
        gpt_vocab::id max_id = 0;

        for (int i = 1; i < n_logits; ++i) {
            if (plogits[i] > max_logit) {
                max_logit = plogits[i];
                max_id = i;
            }
        }
        return max_id;
    }

    // clamp top_k and the penalty window to valid ranges
    if (top_k <= 0 || top_k > n_logits) {
        top_k = n_logits;
    }
    if (repeat_last_n > (int) last_n_tokens.size()) {
        repeat_last_n = (int) last_n_tokens.size();
    }

    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const float scale = 1.0f/temp;
        for (int i = 0; i < n_logits; ++i) {
            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
            if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) {
                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
                if (plogits[i] < 0.0f) {
                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
                } else {
                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
                }
            } else {
                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
            }
        }
    }

    // find the top K tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
        return a.first > b.first;
    });

    logits_id.resize(top_k);

    double maxl = -INFINITY;
    for (const auto & kv : logits_id) {
        maxl = std::max(maxl, kv.first);
    }

    // compute probs for the top K tokens (softmax, shifted by maxl for
    // numerical stability)
    std::vector<double> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        double p = exp(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    // nucleus filtering: keep the smallest prefix of tokens whose cumulative
    // probability reaches top_p, then renormalize
    if (top_p < 1.0f) {
        double cumsum = 0.0f;
        for (int i = 0; i < top_k; i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                top_k = i + 1;
                probs.resize(top_k);
                logits_id.resize(top_k);
                break;
            }
        }

        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;

}
|
626 |
+
|
627 |
+
// Return true when `buf` holds a complete in-memory RIFF/WAVE file.
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
// WAV ref:  https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
// The 32-bit chunk size at offset 4 is read with memcpy (the previous
// reinterpret_cast was a potentially misaligned, aliasing-violating load)
// and compared in 64-bit to avoid uint32 wrap-around on `chunk_size + 8`.
// NOTE(review): assumes a little-endian host, matching the RIFF on-disk
// byte order - same assumption as the original code.
bool is_wav_buffer(const std::string buf) {
    if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") {
        return false;
    }

    uint32_t chunk_size = 0;
    memcpy(&chunk_size, buf.data() + 4, sizeof(chunk_size));
    if ((uint64_t) chunk_size + 8 != buf.size()) {
        return false;
    }

    return true;
}
|
641 |
+
|
642 |
+
// Load 16-bit PCM audio into mono float samples in [-1, 1).
// `fname` may be "-" (read raw WAV bytes from stdin), an in-memory WAV
// buffer, or a file path (decoded via ffmpeg when built with WHISPER_FFMPEG
// and the file is not a plain WAV). When `stereo` is true, per-channel
// samples are also returned in `pcmf32s`.
// The audio must be COMMON_SAMPLE_RATE, 16-bit, and mono or stereo;
// returns false on any open/format error.
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
    drwav wav;
    std::vector<uint8_t> wav_data; // used for pipe input from stdin or ffmpeg decoding output

    if (fname == "-") {
        {
            #ifdef _WIN32
            // stdin defaults to text mode on Windows; switch to binary
            _setmode(_fileno(stdin), _O_BINARY);
            #endif

            uint8_t buf[1024];
            while (true)
            {
                const size_t n = fread(buf, 1, sizeof(buf), stdin);
                if (n == 0) {
                    break;
                }
                wav_data.insert(wav_data.end(), buf, buf + n);
            }
        }

        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
            fprintf(stderr, "error: failed to open WAV file from stdin\n");
            return false;
        }

        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
    }
    else if (is_wav_buffer(fname)) {
        // `fname` itself is a complete WAV file held in memory
        if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
            fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
            return false;
        }
    }
    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
#if defined(WHISPER_FFMPEG)
        if (ffmpeg_decode_audio(fname, wav_data) != 0) {
            fprintf(stderr, "error: failed to ffmpeg decode '%s' \n", fname.c_str());
            return false;
        }
        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
            fprintf(stderr, "error: failed to read wav data as wav \n");
            return false;
        }
#else
        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
        return false;
#endif
    }

    if (wav.channels != 1 && wav.channels != 2) {
        fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
        drwav_uninit(&wav);
        return false;
    }

    if (stereo && wav.channels != 2) {
        fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
        drwav_uninit(&wav);
        return false;
    }

    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
        fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
        drwav_uninit(&wav);
        return false;
    }

    if (wav.bitsPerSample != 16) {
        fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
        drwav_uninit(&wav);
        return false;
    }

    // NOTE(review): for the in-memory path this divides the whole buffer size
    // (header included) by the frame size, so `n` slightly over-estimates the
    // frame count; it is only used as an upper bound for the reads below -
    // verify if exact frame counts ever matter here
    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);

    std::vector<int16_t> pcm16;
    pcm16.resize(n*wav.channels);
    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
    drwav_uninit(&wav);

    // convert to mono, float
    pcmf32.resize(n);
    if (wav.channels == 1) {
        for (uint64_t i = 0; i < n; i++) {
            pcmf32[i] = float(pcm16[i])/32768.0f;
        }
    } else {
        // average the two interleaved channels
        for (uint64_t i = 0; i < n; i++) {
            pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
        }
    }

    if (stereo) {
        // convert to stereo, float
        pcmf32s.resize(2);

        pcmf32s[0].resize(n);
        pcmf32s[1].resize(n);
        for (uint64_t i = 0; i < n; i++) {
            pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
            pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
        }
    }

    return true;
}
|
749 |
+
|
750 |
+
// In-place single-pole RC high-pass filter: y[i] = alpha * (y[i-1] + x[i] - x[i-1]).
// `cutoff` is the -3 dB frequency in Hz, `sample_rate` in Hz. data[0] is
// left unchanged.
// Fixes vs original: (1) reading data[0] on an empty vector was UB;
// (2) the loop read data[i-1] AFTER it had been overwritten with the
// filtered output, which degenerated the recurrence to data[i] *= alpha -
// the previous *input* sample is now cached before overwriting.
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    if (data.empty()) {
        return; // nothing to filter
    }

    const float rc    = 1.0f / (2.0f * M_PI * cutoff);
    const float dt    = 1.0f / sample_rate;
    const float alpha = dt / (rc + dt);

    float y      = data[0]; // previous output
    float prev_x = data[0]; // previous (unfiltered) input

    for (size_t i = 1; i < data.size(); i++) {
        const float x = data[i];
        y = alpha * (y + x - prev_x);
        prev_x = x;
        data[i] = y;
    }
}
|
762 |
+
|
763 |
+
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
|
764 |
+
const int n_samples = pcmf32.size();
|
765 |
+
const int n_samples_last = (sample_rate * last_ms) / 1000;
|
766 |
+
|
767 |
+
if (n_samples_last >= n_samples) {
|
768 |
+
// not enough samples - assume no speech
|
769 |
+
return false;
|
770 |
+
}
|
771 |
+
|
772 |
+
if (freq_thold > 0.0f) {
|
773 |
+
high_pass_filter(pcmf32, freq_thold, sample_rate);
|
774 |
+
}
|
775 |
+
|
776 |
+
float energy_all = 0.0f;
|
777 |
+
float energy_last = 0.0f;
|
778 |
+
|
779 |
+
for (int i = 0; i < n_samples; i++) {
|
780 |
+
energy_all += fabsf(pcmf32[i]);
|
781 |
+
|
782 |
+
if (i >= n_samples - n_samples_last) {
|
783 |
+
energy_last += fabsf(pcmf32[i]);
|
784 |
+
}
|
785 |
+
}
|
786 |
+
|
787 |
+
energy_all /= n_samples;
|
788 |
+
energy_last /= n_samples_last;
|
789 |
+
|
790 |
+
if (verbose) {
|
791 |
+
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
|
792 |
+
}
|
793 |
+
|
794 |
+
if (energy_last > vad_thold*energy_all) {
|
795 |
+
return false;
|
796 |
+
}
|
797 |
+
|
798 |
+
return true;
|
799 |
+
}
|
800 |
+
|
801 |
+
// Levenshtein-based string similarity in [0, 1]:
//   1 - edit_distance(s0, s1) / max(len(s0), len(s1))
// Two empty strings are defined as identical (returns 1.0f) - the original
// formula divided by zero (NaN) in that case.
float similarity(const std::string & s0, const std::string & s1) {
    if (s0.empty() && s1.empty()) {
        return 1.0f;
    }

    const size_t len0 = s0.size() + 1;
    const size_t len1 = s1.size() + 1;

    // two-row rolling dynamic program for the edit distance
    std::vector<int> col(len1, 0);
    std::vector<int> prevCol(len1, 0);

    for (size_t i = 0; i < len1; i++) {
        prevCol[i] = i;
    }

    for (size_t i = 0; i < len0; i++) {
        col[0] = i;
        for (size_t j = 1; j < len1; j++) {
            // min of: insertion, deletion, substitution/match
            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (i > 0 && s0[i - 1] == s1[j - 1] ? 0 : 1));
        }
        col.swap(prevCol);
    }

    const float dist = prevCol[len1 - 1];

    return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
|
824 |
+
|
825 |
+
bool sam_params_parse(int argc, char ** argv, sam_params & params) {
|
826 |
+
for (int i = 1; i < argc; i++) {
|
827 |
+
std::string arg = argv[i];
|
828 |
+
|
829 |
+
if (arg == "-s" || arg == "--seed") {
|
830 |
+
params.seed = std::stoi(argv[++i]);
|
831 |
+
} else if (arg == "-t" || arg == "--threads") {
|
832 |
+
params.n_threads = std::stoi(argv[++i]);
|
833 |
+
} else if (arg == "-m" || arg == "--model") {
|
834 |
+
params.model = argv[++i];
|
835 |
+
} else if (arg == "-i" || arg == "--inp") {
|
836 |
+
params.fname_inp = argv[++i];
|
837 |
+
} else if (arg == "-o" || arg == "--out") {
|
838 |
+
params.fname_out = argv[++i];
|
839 |
+
} else if (arg == "-h" || arg == "--help") {
|
840 |
+
sam_print_usage(argc, argv, params);
|
841 |
+
exit(0);
|
842 |
+
} else {
|
843 |
+
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
844 |
+
sam_print_usage(argc, argv, params);
|
845 |
+
exit(0);
|
846 |
+
}
|
847 |
+
}
|
848 |
+
|
849 |
+
return true;
|
850 |
+
}
|
851 |
+
|
852 |
+
// Print the command-line help for the SAM example program to stderr.
// Defaults shown are taken from the provided `params`.
void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help            show this help message and exit\n");
    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "  -i FNAME, --inp FNAME\n");
    fprintf(stderr, "                        input file (default: %s)\n", params.fname_inp.c_str());
    fprintf(stderr, "  -o FNAME, --out FNAME\n");
    fprintf(stderr, "                        output file (default: %s)\n", params.fname_out.c_str());
    fprintf(stderr, "\n");
}
|
867 |
+
|
868 |
+
// Format a timestamp given in centiseconds as "HH:MM:SS.mmm"
// (or "HH:MM:SS,mmm" when `comma` is true, e.g. for SRT subtitles).
//   500  -> 00:00:05.000
//   6000 -> 00:01:00.000
std::string to_timestamp(int64_t t, bool comma) {
    int64_t msec = t * 10;

    const int64_t hr  = msec / (1000 * 60 * 60);
    msec %= 1000 * 60 * 60;
    const int64_t min = msec / (1000 * 60);
    msec %= 1000 * 60;
    const int64_t sec = msec / 1000;
    msec %= 1000;

    char buf[32];
    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);

    return std::string(buf);
}
|
884 |
+
|
885 |
+
// Convert a timestamp `t` (centiseconds) to a sample index at
// `whisper_sample_rate`, clamped to the valid range [0, n_samples - 1].
int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate) {
    const int sample = (int) ((t*whisper_sample_rate)/100);
    return std::max(0, std::min(n_samples - 1, sample));
}
|
888 |
+
|
889 |
+
// Return true when `fileName` exists and can be opened for reading.
bool is_file_exist(const char *fileName)
{
    return std::ifstream(fileName).good();
}
|
894 |
+
|
895 |
+
// Write `text` to `path`, then run: `command voice_id path` via the shell.
// Returns false if the file cannot be opened or the command exits non-zero.
// NOTE(review): `command` and `path` are interpolated into a shell command
// line via system() — callers must not pass untrusted input here.
bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id)
{
    std::ofstream speak_file(path.c_str());
    if (speak_file.fail()) {
        fprintf(stderr, "%s: failed to open speak_file\n", __func__);
        return false;
    }

    speak_file.write(text.c_str(), text.size());
    speak_file.close();

    const std::string cmd = command + " " + std::to_string(voice_id) + " " + path;
    if (system(cmd.c_str()) != 0) {
        fprintf(stderr, "%s: failed to speak\n", __func__);
        return false;
    }

    return true;
}
|
common.h
ADDED
@@ -0,0 +1,343 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Various helper functions and utilities
|
2 |
+
|
3 |
+
#pragma once
|
4 |
+
|
5 |
+
#include <string>
|
6 |
+
#include <map>
|
7 |
+
#include <vector>
|
8 |
+
#include <random>
|
9 |
+
#include <thread>
|
10 |
+
#include <ctime>
|
11 |
+
#include <fstream>
|
12 |
+
#include <sstream>
|
13 |
+
|
14 |
+
#define COMMON_SAMPLE_RATE 16000
|
15 |
+
|
16 |
+
//
|
17 |
+
// GPT CLI argument parsing
|
18 |
+
//
|
19 |
+
|
20 |
+
// Command-line parameters for the GPT examples (parsed by gpt_params_parse).
struct gpt_params {
    int32_t seed         = -1;   // RNG seed (-1 => seed from current time)
    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t n_predict    = 200;  // number of new tokens to predict
    int32_t n_parallel   = 1;    // number of parallel generation streams
    int32_t n_batch      = 32;   // batch size for prompt processing
    int32_t n_ctx        = 2048; // context size (this is the KV cache max size)
    int32_t n_gpu_layers = 0;    // number of layers to offload to the GPU

    bool ignore_eos = false;     // keep generating past the EOS token

    // sampling parameters
    int32_t top_k          = 40;
    float   top_p          = 0.9f;
    float   temp           = 0.9f;
    int32_t repeat_last_n  = 64;    // window of recent tokens considered for the repeat penalty
    float   repeat_penalty = 1.00f; // 1.0 == no penalty

    std::string model      = "models/gpt-2-117M/ggml-model.bin"; // model path
    std::string prompt     = "";                                 // initial prompt text
    std::string token_test = "";                                 // tokenizer test file (see test_gpt_tokenizer)

    bool    interactive      = false; // interactive mode
    int32_t interactive_port = -1;    // port to listen on in interactive mode (-1 == disabled)
};
|
45 |
+
|
46 |
+
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
47 |
+
|
48 |
+
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
|
49 |
+
|
50 |
+
std::string gpt_random_prompt(std::mt19937 & rng);
|
51 |
+
|
52 |
+
//
|
53 |
+
// Vocab utils
|
54 |
+
//
|
55 |
+
|
56 |
+
std::string trim(const std::string & s);
|
57 |
+
|
58 |
+
std::string replace(
|
59 |
+
const std::string & s,
|
60 |
+
const std::string & from,
|
61 |
+
const std::string & to);
|
62 |
+
|
63 |
+
// Vocabulary: bidirectional mapping between token strings and integer ids,
// plus a list of user-registered special tokens.
struct gpt_vocab {
    using id    = int32_t;
    using token = std::string;

    std::map<token, id> token_to_id; // token string -> token id
    std::map<id, token> id_to_token; // token id -> token string
    std::vector<std::string> special_tokens; // tokens registered via add_special_token

    // Register `token` as a special token (defined in common.cpp).
    void add_special_token(const std::string & token);
};
|
73 |
+
|
74 |
+
// poor-man's JSON parsing
|
75 |
+
std::map<std::string, int32_t> json_parse(const std::string & fname);
|
76 |
+
|
77 |
+
std::string convert_to_utf8(const std::wstring & input);
|
78 |
+
|
79 |
+
std::wstring convert_to_wstring(const std::string & input);
|
80 |
+
|
81 |
+
void gpt_split_words(std::string str, std::vector<std::string>& words);
|
82 |
+
|
83 |
+
// split text into tokens
|
84 |
+
//
|
85 |
+
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
|
86 |
+
//
|
87 |
+
// Regex (Python):
|
88 |
+
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
|
89 |
+
//
|
90 |
+
// Regex (C++):
|
91 |
+
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
|
92 |
+
//
|
93 |
+
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
|
94 |
+
|
95 |
+
// test outputs of gpt_tokenize
|
96 |
+
//
|
97 |
+
// - compare with tokens generated by the huggingface tokenizer
|
98 |
+
// - test cases are chosen based on the model's main language (under 'prompt' directory)
|
99 |
+
// - if all sentences are tokenized identically, print 'All tests passed.'
|
100 |
+
// - otherwise, print sentence, huggingface tokens, ggml tokens
|
101 |
+
//
|
102 |
+
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test);
|
103 |
+
|
104 |
+
// load the tokens from encoder.json
|
105 |
+
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
|
106 |
+
|
107 |
+
// sample next token given probabilities for each embedding
|
108 |
+
//
|
109 |
+
// - consider only the top K tokens
|
110 |
+
// - from them, consider only the top tokens with cumulative probability > P
|
111 |
+
//
|
112 |
+
// TODO: not sure if this implementation is correct
|
113 |
+
// TODO: temperature is not implemented
|
114 |
+
//
|
115 |
+
gpt_vocab::id gpt_sample_top_k_top_p(
|
116 |
+
const gpt_vocab & vocab,
|
117 |
+
const float * logits,
|
118 |
+
int top_k,
|
119 |
+
double top_p,
|
120 |
+
double temp,
|
121 |
+
std::mt19937 & rng);
|
122 |
+
|
123 |
+
gpt_vocab::id gpt_sample_top_k_top_p_repeat(
|
124 |
+
const gpt_vocab & vocab,
|
125 |
+
const float * logits,
|
126 |
+
const int32_t * last_n_tokens_data,
|
127 |
+
size_t last_n_tokens_data_size,
|
128 |
+
int top_k,
|
129 |
+
double top_p,
|
130 |
+
double temp,
|
131 |
+
int repeat_last_n,
|
132 |
+
float repeat_penalty,
|
133 |
+
std::mt19937 & rng);
|
134 |
+
|
135 |
+
//
|
136 |
+
// Audio utils
|
137 |
+
//
|
138 |
+
|
139 |
+
// Check if a buffer is a WAV audio file
|
140 |
+
bool is_wav_buffer(const std::string buf);
|
141 |
+
|
142 |
+
// Read WAV audio file and store the PCM data into pcmf32
|
143 |
+
// fname can be a buffer of WAV data instead of a filename
|
144 |
+
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
|
145 |
+
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
|
146 |
+
bool read_wav(
|
147 |
+
const std::string & fname,
|
148 |
+
std::vector<float> & pcmf32,
|
149 |
+
std::vector<std::vector<float>> & pcmf32s,
|
150 |
+
bool stereo);
|
151 |
+
|
152 |
+
// Write PCM data into WAV audio file
//
// Streams float samples (assumed normalized to [-1, 1]) into a 16-bit PCM
// WAV file. The RIFF and data chunk sizes are back-patched after every
// write, so the file header stays consistent with the data written so far.
class wav_writer {
private:
    std::ofstream file;       // output stream for the active WAV file
    uint32_t dataSize = 0;    // PCM payload bytes written so far
    std::string wav_filename; // path of the currently open file

    // Emit the canonical 44-byte PCM WAV header. Both size fields are
    // written as zero placeholders and patched later by write_audio().
    bool write_header(const uint32_t sample_rate,
                      const uint16_t bits_per_sample,
                      const uint16_t channels) {
        file.write("RIFF", 4);
        file.write("\0\0\0\0", 4); // RIFF chunk size placeholder
        file.write("WAVE", 4);
        file.write("fmt ", 4);

        const uint32_t sub_chunk_size = 16; // "fmt " chunk payload size for PCM
        const uint16_t audio_format   = 1;  // 1 == uncompressed PCM
        const uint32_t byte_rate      = sample_rate * channels * bits_per_sample / 8;
        const uint16_t block_align    = channels * bits_per_sample / 8;

        file.write(reinterpret_cast<const char *>(&sub_chunk_size),  4);
        file.write(reinterpret_cast<const char *>(&audio_format),    2);
        file.write(reinterpret_cast<const char *>(&channels),        2);
        file.write(reinterpret_cast<const char *>(&sample_rate),     4);
        file.write(reinterpret_cast<const char *>(&byte_rate),       4);
        file.write(reinterpret_cast<const char *>(&block_align),     2);
        file.write(reinterpret_cast<const char *>(&bits_per_sample), 2);
        file.write("data", 4);
        file.write("\0\0\0\0", 4); // data chunk size placeholder

        return true;
    }

    // Append samples as int16 and back-patch the two header size fields.
    // It is assumed that PCM data is normalized to a range from -1 to 1.
    bool write_audio(const float * data, size_t length) {
        for (size_t i = 0; i < length; ++i) {
            const int16_t intSample = int16_t(data[i] * 32767);
            file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
            dataSize += sizeof(int16_t);
        }
        if (file.is_open()) {
            // RIFF chunk size == file size minus the 8-byte "RIFF"+size prefix
            file.seekp(4, std::ios::beg);
            uint32_t fileSize = 36 + dataSize;
            file.write(reinterpret_cast<char *>(&fileSize), 4);
            // data chunk size lives at byte offset 40
            file.seekp(40, std::ios::beg);
            file.write(reinterpret_cast<char *>(&dataSize), 4);
            // return to the end so subsequent samples append correctly
            file.seekp(0, std::ios::end);
        }
        return true;
    }

    // Ensure `filename` is the open output file; switching files closes the
    // previous one and resets the payload counter.
    bool open_wav(const std::string & filename) {
        if (filename != wav_filename && file.is_open()) {
            file.close();
        }
        if (!file.is_open()) {
            file.open(filename, std::ios::binary);
            wav_filename = filename;
            dataSize = 0;
        }
        return file.is_open();
    }

public:
    // Open (or re-open) `filename` and write a fresh header.
    bool open(const std::string & filename,
              const uint32_t sample_rate,
              const uint16_t bits_per_sample,
              const uint16_t channels) {
        if (!open_wav(filename)) {
            return false;
        }
        write_header(sample_rate, bits_per_sample, channels);
        return true;
    }

    bool close() {
        file.close();
        return true;
    }

    // Append `length` normalized float samples to the file.
    bool write(const float * data, size_t length) {
        return write_audio(data, length);
    }

    ~wav_writer() {
        if (file.is_open()) {
            file.close();
        }
    }
};
|
248 |
+
|
249 |
+
|
250 |
+
// Apply a high-pass frequency filter to PCM audio
|
251 |
+
// Suppresses frequencies below cutoff Hz
|
252 |
+
void high_pass_filter(
|
253 |
+
std::vector<float> & data,
|
254 |
+
float cutoff,
|
255 |
+
float sample_rate);
|
256 |
+
|
257 |
+
// Basic voice activity detection (VAD) using audio energy adaptive threshold
|
258 |
+
bool vad_simple(
|
259 |
+
std::vector<float> & pcmf32,
|
260 |
+
int sample_rate,
|
261 |
+
int last_ms,
|
262 |
+
float vad_thold,
|
263 |
+
float freq_thold,
|
264 |
+
bool verbose);
|
265 |
+
|
266 |
+
// compute similarity between two strings using Levenshtein distance
|
267 |
+
float similarity(const std::string & s0, const std::string & s1);
|
268 |
+
|
269 |
+
//
|
270 |
+
// SAM argument parsing
|
271 |
+
//
|
272 |
+
|
273 |
+
struct sam_params {
|
274 |
+
int32_t seed = -1; // RNG seed
|
275 |
+
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
276 |
+
|
277 |
+
std::string model = "models/sam-vit-b/ggml-model-f16.bin"; // model path
|
278 |
+
std::string fname_inp = "img.jpg";
|
279 |
+
std::string fname_out = "img.out";
|
280 |
+
};
|
281 |
+
|
282 |
+
bool sam_params_parse(int argc, char ** argv, sam_params & params);
|
283 |
+
|
284 |
+
void sam_print_usage(int argc, char ** argv, const sam_params & params);
|
285 |
+
|
286 |
+
//
|
287 |
+
// Terminal utils
|
288 |
+
//
|
289 |
+
|
290 |
+
// NOTE: both macros are fully parenthesized so expansion is safe regardless
// of argument expression or surrounding precedence (the original UNCUBE left
// its argument and body unparenthesized — a classic macro-hygiene hazard).
// Arguments are still evaluated multiple times; do not pass side effects.
#define SQR(X) ((X) * (X))
#define UNCUBE(x) ((x) < 48 ? 0 : (x) < 115 ? 1 : ((x) - 35) / 40)

/**
 * Quantizes 24-bit RGB to xterm256 code range [16,256).
 *
 * Chooses whichever of the 6x6x6 color cube (codes 16..231) or the 24-step
 * grayscale ramp (codes 232..255) is closer to the input in squared RGB
 * distance.
 */
static int rgb2xterm256(int r, int g, int b) {
    unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377}; // cube axis channel values
    int av, ir, ig, ib, il, qr, qg, qb, ql;
    // perceptual luma (Rec. 601 weights), rounded via +.5 then truncation
    av = r * .299 + g * .587 + b * .114 + .5;
    // il: nearest grayscale ramp index; ql: that ramp entry's gray level
    ql = (il = av > 238 ? 23 : (av - 3) / 10) * 10 + 8;
    // ir/ig/ib: nearest cube indices; qr/qg/qb: the quantized channel values
    qr = cube[(ir = UNCUBE(r))];
    qg = cube[(ig = UNCUBE(g))];
    qb = cube[(ib = UNCUBE(b))];
    if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
        SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
        return ir * 36 + ig * 6 + ib + 020;   // color cube starts at 16
    return il + 0350;                         // grayscale ramp starts at 232
}
|
309 |
+
|
310 |
+
static std::string set_xterm256_foreground(int r, int g, int b) {
|
311 |
+
int x = rgb2xterm256(r, g, b);
|
312 |
+
std::ostringstream oss;
|
313 |
+
oss << "\033[38;5;" << x << "m";
|
314 |
+
return oss.str();
|
315 |
+
}
|
316 |
+
|
317 |
+
// Lowest is red, middle is yellow, highest is green. Color scheme from
|
318 |
+
// Paul Tol; it is colorblind friendly https://personal.sron.nl/~pault/
|
319 |
+
const std::vector<std::string> k_colors = {
|
320 |
+
set_xterm256_foreground(220, 5, 12),
|
321 |
+
set_xterm256_foreground(232, 96, 28),
|
322 |
+
set_xterm256_foreground(241, 147, 45),
|
323 |
+
set_xterm256_foreground(246, 193, 65),
|
324 |
+
set_xterm256_foreground(247, 240, 86),
|
325 |
+
set_xterm256_foreground(144, 201, 135),
|
326 |
+
set_xterm256_foreground( 78, 178, 101),
|
327 |
+
};
|
328 |
+
|
329 |
+
//
|
330 |
+
// Other utils
|
331 |
+
//
|
332 |
+
|
333 |
+
// convert timestamp to string, 6000 -> 01:00.000
|
334 |
+
std::string to_timestamp(int64_t t, bool comma = false);
|
335 |
+
|
336 |
+
// given a timestamp get the sample
|
337 |
+
int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate);
|
338 |
+
|
339 |
+
// check if file exists using ifstream
|
340 |
+
bool is_file_exist(const char *fileName);
|
341 |
+
|
342 |
+
// write text to file, and call system("command voice_id file")
|
343 |
+
bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id);
|
dr_wav.h
ADDED
The diff for this file is too large to render.
See raw diff
|
|
ggml-aarch64.c
ADDED
The diff for this file is too large to render.
See raw diff
|
|
ggml-aarch64.h
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
|
2 |
+
#pragma once
|
3 |
+
|
4 |
+
#define GGML_COMMON_DECL_C
|
5 |
+
#include "ggml-common.h"
|
6 |
+
|
7 |
+
#include "ggml.h"
|
8 |
+
|
9 |
+
// GGML internal header
|
10 |
+
|
11 |
+
#ifdef __cplusplus
|
12 |
+
extern "C" {
|
13 |
+
#endif
|
14 |
+
|
15 |
+
// Quantization
|
16 |
+
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
17 |
+
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
18 |
+
|
19 |
+
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
|
20 |
+
|
21 |
+
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
22 |
+
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
23 |
+
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
24 |
+
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
25 |
+
|
26 |
+
// GEMV
|
27 |
+
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
28 |
+
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
29 |
+
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
30 |
+
|
31 |
+
// GEMM
|
32 |
+
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
33 |
+
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
34 |
+
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
35 |
+
|
36 |
+
#ifdef __cplusplus
|
37 |
+
}
|
38 |
+
#endif
|
39 |
+
|
ggml-common.h
ADDED
The diff for this file is too large to render.
See raw diff
|
|
ggml-cpu-impl.h
ADDED
@@ -0,0 +1,614 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#pragma once
|
2 |
+
|
3 |
+
// GGML CPU internal header
|
4 |
+
|
5 |
+
#include "ggml.h"
|
6 |
+
#include "ggml-impl.h"
|
7 |
+
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
8 |
+
//#include <stddef.h>
|
9 |
+
#include <stdbool.h>
|
10 |
+
#include <string.h> // memcpy
|
11 |
+
#include <math.h> // fabsf
|
12 |
+
|
13 |
+
|
14 |
+
#ifdef __cplusplus
|
15 |
+
extern "C" {
|
16 |
+
#endif
|
17 |
+
|
18 |
+
#if defined(_MSC_VER)
|
19 |
+
|
20 |
+
#define m512bh(p) p
|
21 |
+
#define m512i(p) p
|
22 |
+
|
23 |
+
#else
|
24 |
+
|
25 |
+
#define m512bh(p) (__m512bh)(p)
|
26 |
+
#define m512i(p) (__m512i)(p)
|
27 |
+
|
28 |
+
#endif
|
29 |
+
|
30 |
+
/**
|
31 |
+
* Converts brain16 to float32.
|
32 |
+
*
|
33 |
+
* The bfloat16 floating point format has the following structure:
|
34 |
+
*
|
35 |
+
* ┌sign
|
36 |
+
* │
|
37 |
+
* │ ┌exponent
|
38 |
+
* │ │
|
39 |
+
* │ │ ┌mantissa
|
40 |
+
* │ │ │
|
41 |
+
* │┌──┴───┐┌─┴───┐
|
42 |
+
* 0b0000000000000000 brain16
|
43 |
+
*
|
44 |
+
* Since bf16 has the same number of exponent bits as a 32bit float,
|
45 |
+
* encoding and decoding numbers becomes relatively straightforward.
|
46 |
+
*
|
47 |
+
* ┌sign
|
48 |
+
* │
|
49 |
+
* │ ┌exponent
|
50 |
+
* │ │
|
51 |
+
* │ │ ┌mantissa
|
52 |
+
* │ │ │
|
53 |
+
* │┌──┴───┐┌─┴───────────────────┐
|
54 |
+
* 0b00000000000000000000000000000000 IEEE binary32
|
55 |
+
*
|
56 |
+
* For comparison, the standard fp16 format has fewer exponent bits.
|
57 |
+
*
|
58 |
+
* ┌sign
|
59 |
+
* │
|
60 |
+
* │ ┌exponent
|
61 |
+
* │ │
|
62 |
+
* │ │ ┌mantissa
|
63 |
+
* │ │ │
|
64 |
+
* │┌─┴─┐┌─┴──────┐
|
65 |
+
* 0b0000000000000000 IEEE binary16
|
66 |
+
*
|
67 |
+
* @see IEEE 754-2008
|
68 |
+
*/
|
69 |
+
// Widen a bfloat16 value to float32. bf16 is the top 16 bits of an IEEE
// binary32, so the conversion is a single left shift into the float's bit
// pattern (type-punned through a union, which is well-defined in C).
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
    union {
        float    f;
        uint32_t i;
    } conv;
    conv.i = (uint32_t) h.bits << 16;
    return conv.f;
}
|
77 |
+
|
78 |
+
/**
|
79 |
+
* Converts float32 to brain16.
|
80 |
+
*
|
81 |
+
* This is binary identical with Google Brain float conversion.
|
82 |
+
* Floats shall round to nearest even, and NANs shall be quiet.
|
83 |
+
* Subnormals aren't flushed to zero, except perhaps when used.
|
84 |
+
* This code should vectorize nicely if using modern compilers.
|
85 |
+
*/
|
86 |
+
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
87 |
+
ggml_bf16_t h;
|
88 |
+
union {
|
89 |
+
float f;
|
90 |
+
uint32_t i;
|
91 |
+
} u;
|
92 |
+
u.f = s;
|
93 |
+
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
|
94 |
+
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
95 |
+
return h;
|
96 |
+
}
|
97 |
+
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
98 |
+
return h;
|
99 |
+
}
|
100 |
+
|
101 |
+
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
102 |
+
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
103 |
+
|
104 |
+
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
105 |
+
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
106 |
+
#ifndef __FMA__
|
107 |
+
#define __FMA__
|
108 |
+
#endif
|
109 |
+
#ifndef __F16C__
|
110 |
+
#define __F16C__
|
111 |
+
#endif
|
112 |
+
#endif
|
113 |
+
|
114 |
+
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
115 |
+
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
116 |
+
#ifndef __SSE3__
|
117 |
+
#define __SSE3__
|
118 |
+
#endif
|
119 |
+
#ifndef __SSSE3__
|
120 |
+
#define __SSSE3__
|
121 |
+
#endif
|
122 |
+
#endif
|
123 |
+
|
124 |
+
#if defined(__ARM_FEATURE_SVE)
|
125 |
+
#include <arm_sve.h>
|
126 |
+
#include <sys/prctl.h>
|
127 |
+
#endif
|
128 |
+
|
129 |
+
// 16-bit float
|
130 |
+
// on Arm, we use __fp16
|
131 |
+
// on x86, we use uint16_t
|
132 |
+
#if defined(__ARM_NEON)
|
133 |
+
|
134 |
+
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
135 |
+
//
|
136 |
+
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
137 |
+
//
|
138 |
+
#include <arm_neon.h>
|
139 |
+
|
140 |
+
#ifdef _MSC_VER
|
141 |
+
|
142 |
+
typedef uint16_t ggml_fp16_internal_t;
|
143 |
+
|
144 |
+
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
145 |
+
|
146 |
+
#else
|
147 |
+
|
148 |
+
typedef __fp16 ggml_fp16_internal_t;
|
149 |
+
|
150 |
+
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
151 |
+
|
152 |
+
#endif // _MSC_VER
|
153 |
+
|
154 |
+
#if !defined(__aarch64__)
|
155 |
+
|
156 |
+
// 32-bit ARM compatibility
|
157 |
+
|
158 |
+
// vaddlvq_s16
|
159 |
+
// vpaddq_s16
|
160 |
+
// vpaddq_s32
|
161 |
+
// vaddvq_s32
|
162 |
+
// vaddvq_f32
|
163 |
+
// vmaxvq_f32
|
164 |
+
// vcvtnq_s32_f32
|
165 |
+
// vzip1_u8
|
166 |
+
// vzip2_u8
|
167 |
+
|
168 |
+
inline static int32_t vaddlvq_s16(int16x8_t v) {
|
169 |
+
int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
|
170 |
+
return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
|
171 |
+
}
|
172 |
+
|
173 |
+
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
174 |
+
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
175 |
+
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
176 |
+
return vcombine_s16(a0, b0);
|
177 |
+
}
|
178 |
+
|
179 |
+
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
|
180 |
+
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
|
181 |
+
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
|
182 |
+
return vcombine_s32(a0, b0);
|
183 |
+
}
|
184 |
+
|
185 |
+
inline static int32_t vaddvq_s32(int32x4_t v) {
|
186 |
+
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
187 |
+
}
|
188 |
+
|
189 |
+
inline static float vaddvq_f32(float32x4_t v) {
|
190 |
+
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
191 |
+
}
|
192 |
+
|
193 |
+
inline static float vmaxvq_f32(float32x4_t v) {
|
194 |
+
return
|
195 |
+
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
196 |
+
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
197 |
+
}
|
198 |
+
|
199 |
+
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
200 |
+
int32x4_t res;
|
201 |
+
|
202 |
+
res[0] = roundf(vgetq_lane_f32(v, 0));
|
203 |
+
res[1] = roundf(vgetq_lane_f32(v, 1));
|
204 |
+
res[2] = roundf(vgetq_lane_f32(v, 2));
|
205 |
+
res[3] = roundf(vgetq_lane_f32(v, 3));
|
206 |
+
|
207 |
+
return res;
|
208 |
+
}
|
209 |
+
|
210 |
+
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
211 |
+
uint8x8_t res;
|
212 |
+
|
213 |
+
res[0] = a[0]; res[1] = b[0];
|
214 |
+
res[2] = a[1]; res[3] = b[1];
|
215 |
+
res[4] = a[2]; res[5] = b[2];
|
216 |
+
res[6] = a[3]; res[7] = b[3];
|
217 |
+
|
218 |
+
return res;
|
219 |
+
}
|
220 |
+
|
221 |
+
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
222 |
+
uint8x8_t res;
|
223 |
+
|
224 |
+
res[0] = a[4]; res[1] = b[4];
|
225 |
+
res[2] = a[5]; res[3] = b[5];
|
226 |
+
res[4] = a[6]; res[5] = b[6];
|
227 |
+
res[6] = a[7]; res[7] = b[7];
|
228 |
+
|
229 |
+
return res;
|
230 |
+
}
|
231 |
+
|
232 |
+
// vld1q_s16_x2
|
233 |
+
// vld1q_u8_x2
|
234 |
+
// vld1q_u8_x4
|
235 |
+
// vld1q_s8_x2
|
236 |
+
// vld1q_s8_x4
|
237 |
+
// TODO: double-check these work correctly
|
238 |
+
|
239 |
+
typedef struct ggml_int16x8x2_t {
|
240 |
+
int16x8_t val[2];
|
241 |
+
} ggml_int16x8x2_t;
|
242 |
+
|
243 |
+
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
|
244 |
+
ggml_int16x8x2_t res;
|
245 |
+
|
246 |
+
res.val[0] = vld1q_s16(ptr + 0);
|
247 |
+
res.val[1] = vld1q_s16(ptr + 8);
|
248 |
+
|
249 |
+
return res;
|
250 |
+
}
|
251 |
+
|
252 |
+
typedef struct ggml_uint8x16x2_t {
|
253 |
+
uint8x16_t val[2];
|
254 |
+
} ggml_uint8x16x2_t;
|
255 |
+
|
256 |
+
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
|
257 |
+
ggml_uint8x16x2_t res;
|
258 |
+
|
259 |
+
res.val[0] = vld1q_u8(ptr + 0);
|
260 |
+
res.val[1] = vld1q_u8(ptr + 16);
|
261 |
+
|
262 |
+
return res;
|
263 |
+
}
|
264 |
+
|
265 |
+
typedef struct ggml_uint8x16x4_t {
|
266 |
+
uint8x16_t val[4];
|
267 |
+
} ggml_uint8x16x4_t;
|
268 |
+
|
269 |
+
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
|
270 |
+
ggml_uint8x16x4_t res;
|
271 |
+
|
272 |
+
res.val[0] = vld1q_u8(ptr + 0);
|
273 |
+
res.val[1] = vld1q_u8(ptr + 16);
|
274 |
+
res.val[2] = vld1q_u8(ptr + 32);
|
275 |
+
res.val[3] = vld1q_u8(ptr + 48);
|
276 |
+
|
277 |
+
return res;
|
278 |
+
}
|
279 |
+
|
280 |
+
typedef struct ggml_int8x16x2_t {
|
281 |
+
int8x16_t val[2];
|
282 |
+
} ggml_int8x16x2_t;
|
283 |
+
|
284 |
+
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
|
285 |
+
ggml_int8x16x2_t res;
|
286 |
+
|
287 |
+
res.val[0] = vld1q_s8(ptr + 0);
|
288 |
+
res.val[1] = vld1q_s8(ptr + 16);
|
289 |
+
|
290 |
+
return res;
|
291 |
+
}
|
292 |
+
|
293 |
+
typedef struct ggml_int8x16x4_t {
|
294 |
+
int8x16_t val[4];
|
295 |
+
} ggml_int8x16x4_t;
|
296 |
+
|
297 |
+
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
298 |
+
ggml_int8x16x4_t res;
|
299 |
+
|
300 |
+
res.val[0] = vld1q_s8(ptr + 0);
|
301 |
+
res.val[1] = vld1q_s8(ptr + 16);
|
302 |
+
res.val[2] = vld1q_s8(ptr + 32);
|
303 |
+
res.val[3] = vld1q_s8(ptr + 48);
|
304 |
+
|
305 |
+
return res;
|
306 |
+
}
|
307 |
+
|
308 |
+
// NOTE: not tested
|
309 |
+
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
310 |
+
int8x16_t res;
|
311 |
+
|
312 |
+
res[ 0] = a[b[ 0]];
|
313 |
+
res[ 1] = a[b[ 1]];
|
314 |
+
res[ 2] = a[b[ 2]];
|
315 |
+
res[ 3] = a[b[ 3]];
|
316 |
+
res[ 4] = a[b[ 4]];
|
317 |
+
res[ 5] = a[b[ 5]];
|
318 |
+
res[ 6] = a[b[ 6]];
|
319 |
+
res[ 7] = a[b[ 7]];
|
320 |
+
res[ 8] = a[b[ 8]];
|
321 |
+
res[ 9] = a[b[ 9]];
|
322 |
+
res[10] = a[b[10]];
|
323 |
+
res[11] = a[b[11]];
|
324 |
+
res[12] = a[b[12]];
|
325 |
+
res[13] = a[b[13]];
|
326 |
+
res[14] = a[b[14]];
|
327 |
+
res[15] = a[b[15]];
|
328 |
+
|
329 |
+
return res;
|
330 |
+
}
|
331 |
+
|
332 |
+
// NOTE: not tested
|
333 |
+
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
334 |
+
uint8x16_t res;
|
335 |
+
|
336 |
+
res[ 0] = a[b[ 0]];
|
337 |
+
res[ 1] = a[b[ 1]];
|
338 |
+
res[ 2] = a[b[ 2]];
|
339 |
+
res[ 3] = a[b[ 3]];
|
340 |
+
res[ 4] = a[b[ 4]];
|
341 |
+
res[ 5] = a[b[ 5]];
|
342 |
+
res[ 6] = a[b[ 6]];
|
343 |
+
res[ 7] = a[b[ 7]];
|
344 |
+
res[ 8] = a[b[ 8]];
|
345 |
+
res[ 9] = a[b[ 9]];
|
346 |
+
res[10] = a[b[10]];
|
347 |
+
res[11] = a[b[11]];
|
348 |
+
res[12] = a[b[12]];
|
349 |
+
res[13] = a[b[13]];
|
350 |
+
res[14] = a[b[14]];
|
351 |
+
res[15] = a[b[15]];
|
352 |
+
|
353 |
+
return res;
|
354 |
+
}
|
355 |
+
|
356 |
+
#else
|
357 |
+
|
358 |
+
#define ggml_int16x8x2_t int16x8x2_t
|
359 |
+
#define ggml_uint8x16x2_t uint8x16x2_t
|
360 |
+
#define ggml_uint8x16x4_t uint8x16x4_t
|
361 |
+
#define ggml_int8x16x2_t int8x16x2_t
|
362 |
+
#define ggml_int8x16x4_t int8x16x4_t
|
363 |
+
|
364 |
+
#define ggml_vld1q_s16_x2 vld1q_s16_x2
|
365 |
+
#define ggml_vld1q_u8_x2 vld1q_u8_x2
|
366 |
+
#define ggml_vld1q_u8_x4 vld1q_u8_x4
|
367 |
+
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
368 |
+
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
369 |
+
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
370 |
+
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
371 |
+
|
372 |
+
#endif // !defined(__aarch64__)
|
373 |
+
|
374 |
+
#if !defined(__ARM_FEATURE_DOTPROD)
|
375 |
+
|
376 |
+
// Fallback for NEON targets without __ARM_FEATURE_DOTPROD: emulate the
// vdotq_s32 int8 dot-product-accumulate (the #else branch maps this name
// directly to vdotq_s32). Widening-multiply both 8-lane halves to int16,
// pairwise-widen-add each product vector to int32, then accumulate into acc.
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); // low 8 lanes: a[i]*b[i]
    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); // high 8 lanes

    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
}
|
382 |
+
|
383 |
+
#else
|
384 |
+
|
385 |
+
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
|
386 |
+
|
387 |
+
#endif // !defined(__ARM_FEATURE_DOTPROD)
|
388 |
+
|
389 |
+
#endif // defined(__ARM_NEON)
|
390 |
+
|
391 |
+
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
392 |
+
|
393 |
+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
394 |
+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
395 |
+
|
396 |
+
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
397 |
+
|
398 |
+
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
399 |
+
ggml_fp16_internal_t tmp;
|
400 |
+
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
401 |
+
return (float)tmp;
|
402 |
+
}
|
403 |
+
|
404 |
+
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
405 |
+
ggml_fp16_t res;
|
406 |
+
ggml_fp16_internal_t tmp = f;
|
407 |
+
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
408 |
+
return res;
|
409 |
+
}
|
410 |
+
|
411 |
+
#else
|
412 |
+
|
413 |
+
#ifdef __wasm_simd128__
|
414 |
+
#include <wasm_simd128.h>
|
415 |
+
#else
|
416 |
+
#ifdef __POWER9_VECTOR__
|
417 |
+
#include <altivec.h>
|
418 |
+
#undef bool
|
419 |
+
#define bool _Bool
|
420 |
+
#else
|
421 |
+
#if defined(_MSC_VER) || defined(__MINGW32__)
|
422 |
+
#include <intrin.h>
|
423 |
+
#else
|
424 |
+
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
425 |
+
#if !defined(__riscv)
|
426 |
+
#include <immintrin.h>
|
427 |
+
#endif
|
428 |
+
#endif
|
429 |
+
#endif
|
430 |
+
#endif
|
431 |
+
#endif
|
432 |
+
|
433 |
+
#ifdef __riscv_v_intrinsic
|
434 |
+
#include <riscv_vector.h>
|
435 |
+
#endif
|
436 |
+
|
437 |
+
#if defined(__loongarch64)
|
438 |
+
#if defined(__loongarch_asx)
|
439 |
+
#include <lasxintrin.h>
|
440 |
+
#endif
|
441 |
+
#if defined(__loongarch_sx)
|
442 |
+
#include <lsxintrin.h>
|
443 |
+
#endif
|
444 |
+
#endif
|
445 |
+
|
446 |
+
#if defined(__loongarch_asx)
|
447 |
+
|
448 |
+
typedef union {
|
449 |
+
int32_t i;
|
450 |
+
float f;
|
451 |
+
} ft_union;
|
452 |
+
|
453 |
+
/* float type data load instructions */
|
454 |
+
static __m128 __lsx_vreplfr2vr_s(float val) {
|
455 |
+
ft_union fi_tmpval = {.f = val};
|
456 |
+
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
|
457 |
+
}
|
458 |
+
|
459 |
+
static __m256 __lasx_xvreplfr2vr_s(float val) {
|
460 |
+
ft_union fi_tmpval = {.f = val};
|
461 |
+
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
|
462 |
+
}
|
463 |
+
#endif
|
464 |
+
|
465 |
+
#ifdef __F16C__
|
466 |
+
|
467 |
+
#ifdef _MSC_VER
|
468 |
+
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
469 |
+
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
470 |
+
#else
|
471 |
+
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
472 |
+
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
473 |
+
#endif
|
474 |
+
|
475 |
+
#elif defined(__POWER9_VECTOR__)
|
476 |
+
|
477 |
+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
478 |
+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
479 |
+
/* the inline asm below is about 12% faster than the lookup method */
|
480 |
+
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
481 |
+
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
482 |
+
|
483 |
+
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
484 |
+
register float f;
|
485 |
+
register double d;
|
486 |
+
__asm__(
|
487 |
+
"mtfprd %0,%2\n"
|
488 |
+
"xscvhpdp %0,%0\n"
|
489 |
+
"frsp %1,%0\n" :
|
490 |
+
/* temp */ "=d"(d),
|
491 |
+
/* out */ "=f"(f):
|
492 |
+
/* in */ "r"(h));
|
493 |
+
return f;
|
494 |
+
}
|
495 |
+
|
496 |
+
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
497 |
+
register double d;
|
498 |
+
register ggml_fp16_t r;
|
499 |
+
__asm__( /* xscvdphp can work on double or single precision */
|
500 |
+
"xscvdphp %0,%2\n"
|
501 |
+
"mffprd %1,%0\n" :
|
502 |
+
/* temp */ "=d"(d),
|
503 |
+
/* out */ "=r"(r):
|
504 |
+
/* in */ "f"(f));
|
505 |
+
return r;
|
506 |
+
}
|
507 |
+
|
508 |
+
#else
|
509 |
+
|
510 |
+
// FP16 <-> FP32
|
511 |
+
// ref: https://github.com/Maratyszcza/FP16
|
512 |
+
|
513 |
+
static inline float fp32_from_bits(uint32_t w) {
|
514 |
+
union {
|
515 |
+
uint32_t as_bits;
|
516 |
+
float as_value;
|
517 |
+
} fp32;
|
518 |
+
fp32.as_bits = w;
|
519 |
+
return fp32.as_value;
|
520 |
+
}
|
521 |
+
|
522 |
+
static inline uint32_t fp32_to_bits(float f) {
|
523 |
+
union {
|
524 |
+
float as_value;
|
525 |
+
uint32_t as_bits;
|
526 |
+
} fp32;
|
527 |
+
fp32.as_value = f;
|
528 |
+
return fp32.as_bits;
|
529 |
+
}
|
530 |
+
|
531 |
+
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
532 |
+
const uint32_t w = (uint32_t) h << 16;
|
533 |
+
const uint32_t sign = w & UINT32_C(0x80000000);
|
534 |
+
const uint32_t two_w = w + w;
|
535 |
+
|
536 |
+
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
537 |
+
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
538 |
+
const float exp_scale = 0x1.0p-112f;
|
539 |
+
#else
|
540 |
+
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
|
541 |
+
#endif
|
542 |
+
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
543 |
+
|
544 |
+
const uint32_t magic_mask = UINT32_C(126) << 23;
|
545 |
+
const float magic_bias = 0.5f;
|
546 |
+
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
547 |
+
|
548 |
+
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
549 |
+
const uint32_t result = sign |
|
550 |
+
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
551 |
+
return fp32_from_bits(result);
|
552 |
+
}
|
553 |
+
|
554 |
+
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
555 |
+
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
556 |
+
const float scale_to_inf = 0x1.0p+112f;
|
557 |
+
const float scale_to_zero = 0x1.0p-110f;
|
558 |
+
#else
|
559 |
+
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
|
560 |
+
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
|
561 |
+
#endif
|
562 |
+
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
563 |
+
|
564 |
+
const uint32_t w = fp32_to_bits(f);
|
565 |
+
const uint32_t shl1_w = w + w;
|
566 |
+
const uint32_t sign = w & UINT32_C(0x80000000);
|
567 |
+
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
568 |
+
if (bias < UINT32_C(0x71000000)) {
|
569 |
+
bias = UINT32_C(0x71000000);
|
570 |
+
}
|
571 |
+
|
572 |
+
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
573 |
+
const uint32_t bits = fp32_to_bits(base);
|
574 |
+
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
575 |
+
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
576 |
+
const uint32_t nonsign = exp_bits + mantissa_bits;
|
577 |
+
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
578 |
+
}
|
579 |
+
|
580 |
+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
581 |
+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
582 |
+
|
583 |
+
#endif // __F16C__
|
584 |
+
|
585 |
+
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
|
586 |
+
|
587 |
+
#ifdef __ARM_FEATURE_SVE
|
588 |
+
#include <arm_sve.h>
|
589 |
+
#endif // __ARM_FEATURE_SVE
|
590 |
+
|
591 |
+
// precomputed f32 table for f16 (256 KB)
|
592 |
+
// defined in ggml.c, initialized in ggml_init()
|
593 |
+
extern float ggml_table_f32_f16[1 << 16];
|
594 |
+
|
595 |
+
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
596 |
+
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
597 |
+
// This is also true for POWER9.
|
598 |
+
#if !defined(GGML_FP16_TO_FP32)
|
599 |
+
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
600 |
+
uint16_t s;
|
601 |
+
memcpy(&s, &f, sizeof(uint16_t));
|
602 |
+
return ggml_table_f32_f16[s];
|
603 |
+
}
|
604 |
+
|
605 |
+
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
606 |
+
#endif
|
607 |
+
|
608 |
+
#if !defined(GGML_FP32_TO_FP16)
|
609 |
+
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
610 |
+
#endif
|
611 |
+
|
612 |
+
#ifdef __cplusplus
|
613 |
+
}
|
614 |
+
#endif
|
ggml-impl.h
ADDED
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#pragma once
|
2 |
+
|
3 |
+
// GGML internal header
|
4 |
+
|
5 |
+
#include "ggml.h"
|
6 |
+
|
7 |
+
#include <assert.h>
|
8 |
+
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
9 |
+
#include <stdbool.h>
|
10 |
+
#include <stdint.h>
|
11 |
+
|
12 |
+
#ifdef __cplusplus
|
13 |
+
extern "C" {
|
14 |
+
#endif
|
15 |
+
|
16 |
+
#undef MIN
|
17 |
+
#undef MAX
|
18 |
+
|
19 |
+
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
20 |
+
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
21 |
+
|
22 |
+
// required for mmap as gguf only guarantees 32-byte alignment
|
23 |
+
#define TENSOR_ALIGNMENT 32
|
24 |
+
|
25 |
+
// static_assert should be a #define, but if it's not,
|
26 |
+
// fall back to the _Static_assert C11 keyword.
|
27 |
+
// if C99 - static_assert is noop
|
28 |
+
// ref: https://stackoverflow.com/a/53923785/4039976
|
29 |
+
#ifndef __cplusplus
|
30 |
+
#ifndef static_assert
|
31 |
+
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
32 |
+
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
33 |
+
#else
|
34 |
+
#define static_assert(cond, msg) struct global_scope_noop_trick
|
35 |
+
#endif
|
36 |
+
#endif
|
37 |
+
#endif
|
38 |
+
|
39 |
+
//
|
40 |
+
// logging
|
41 |
+
//
|
42 |
+
|
43 |
+
GGML_ATTRIBUTE_FORMAT(2, 3)
|
44 |
+
void ggml_log_internal (enum ggml_log_level level, const char * format, ...);
|
45 |
+
void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
|
46 |
+
|
47 |
+
#define GGML_LOG(...) ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
|
48 |
+
#define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
49 |
+
#define GGML_LOG_WARN(...) ggml_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
50 |
+
#define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
51 |
+
#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
52 |
+
#define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
|
53 |
+
|
54 |
+
// bitset
|
55 |
+
|
56 |
+
typedef uint32_t ggml_bitset_t;
|
57 |
+
|
58 |
+
static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
|
59 |
+
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
|
60 |
+
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
|
61 |
+
|
62 |
+
static size_t ggml_bitset_size(size_t n) {
|
63 |
+
return (n + BITSET_MASK) >> BITSET_SHR;
|
64 |
+
}
|
65 |
+
|
66 |
+
static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) {
|
67 |
+
return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK)));
|
68 |
+
}
|
69 |
+
|
70 |
+
static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) {
|
71 |
+
bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK));
|
72 |
+
}
|
73 |
+
|
74 |
+
static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
|
75 |
+
bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK));
|
76 |
+
}
|
77 |
+
|
78 |
+
// hash set
|
79 |
+
|
80 |
+
#define GGML_HASHSET_FULL ((size_t)-1)
|
81 |
+
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
|
82 |
+
|
83 |
+
struct ggml_hash_set {
|
84 |
+
size_t size;
|
85 |
+
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
|
86 |
+
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
|
87 |
+
};
|
88 |
+
|
89 |
+
struct ggml_hash_set ggml_hash_set_new(size_t size);
|
90 |
+
void ggml_hash_set_free(struct ggml_hash_set * hash_set);
|
91 |
+
|
92 |
+
// returns the minimum size for a hash set that can hold min_sz elements
|
93 |
+
size_t ggml_hash_size(size_t min_sz);
|
94 |
+
|
95 |
+
// remove all elements from the hash set
|
96 |
+
void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
|
97 |
+
|
98 |
+
// returns true if key is in the hash set
|
99 |
+
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
100 |
+
|
101 |
+
// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
|
102 |
+
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
103 |
+
|
104 |
+
// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
|
105 |
+
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
106 |
+
|
107 |
+
// return index, asserts if table is full
|
108 |
+
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
109 |
+
|
110 |
+
// hash function for ggml_tensor
|
111 |
+
static inline size_t ggml_hash(const struct ggml_tensor * p) {
|
112 |
+
// the last 4 bits are always zero due to alignment
|
113 |
+
return (size_t)(uintptr_t)p >> 4;
|
114 |
+
}
|
115 |
+
|
116 |
+
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
|
117 |
+
size_t h = ggml_hash(key) % hash_set->size;
|
118 |
+
|
119 |
+
// linear probing
|
120 |
+
size_t i = h;
|
121 |
+
while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) {
|
122 |
+
i = (i + 1) % hash_set->size;
|
123 |
+
if (i == h) {
|
124 |
+
// visited all hash table entries -> not found
|
125 |
+
return GGML_HASHSET_FULL;
|
126 |
+
}
|
127 |
+
}
|
128 |
+
return i;
|
129 |
+
}
|
130 |
+
|
131 |
+
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
|
132 |
+
size_t i = ggml_hash_find(hash_set, key);
|
133 |
+
return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i);
|
134 |
+
}
|
135 |
+
|
136 |
+
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
|
137 |
+
size_t h = ggml_hash(key) % hash_set->size;
|
138 |
+
|
139 |
+
// linear probing
|
140 |
+
size_t i = h;
|
141 |
+
do {
|
142 |
+
if (!ggml_bitset_get(hash_set->used, i)) {
|
143 |
+
ggml_bitset_set(hash_set->used, i);
|
144 |
+
hash_set->keys[i] = key;
|
145 |
+
return i;
|
146 |
+
}
|
147 |
+
if (hash_set->keys[i] == key) {
|
148 |
+
return GGML_HASHSET_ALREADY_EXISTS;
|
149 |
+
}
|
150 |
+
i = (i + 1) % hash_set->size;
|
151 |
+
} while (i != h);
|
152 |
+
|
153 |
+
// visited all hash table entries -> not found
|
154 |
+
GGML_ABORT("fatal error");
|
155 |
+
}
|
156 |
+
|
157 |
+
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
|
158 |
+
size_t h = ggml_hash(key) % hash_set->size;
|
159 |
+
|
160 |
+
// linear probing
|
161 |
+
size_t i = h;
|
162 |
+
do {
|
163 |
+
if (!ggml_bitset_get(hash_set->used, i)) {
|
164 |
+
ggml_bitset_set(hash_set->used, i);
|
165 |
+
hash_set->keys[i] = key;
|
166 |
+
return i;
|
167 |
+
}
|
168 |
+
if (hash_set->keys[i] == key) {
|
169 |
+
return i;
|
170 |
+
}
|
171 |
+
i = (i + 1) % hash_set->size;
|
172 |
+
} while (i != h);
|
173 |
+
|
174 |
+
// visited all hash table entries -> not found
|
175 |
+
GGML_ABORT("fatal error");
|
176 |
+
}
|
177 |
+
|
178 |
+
// computation graph
|
179 |
+
|
180 |
+
enum ggml_cgraph_eval_order {
|
181 |
+
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
182 |
+
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
183 |
+
GGML_CGRAPH_EVAL_ORDER_COUNT
|
184 |
+
};
|
185 |
+
|
186 |
+
struct ggml_cgraph {
|
187 |
+
int size;
|
188 |
+
int n_nodes;
|
189 |
+
int n_leafs;
|
190 |
+
|
191 |
+
struct ggml_tensor ** nodes;
|
192 |
+
struct ggml_tensor ** grads;
|
193 |
+
struct ggml_tensor ** leafs;
|
194 |
+
|
195 |
+
struct ggml_hash_set visited_hash_set;
|
196 |
+
|
197 |
+
enum ggml_cgraph_eval_order order;
|
198 |
+
};
|
199 |
+
|
200 |
+
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
201 |
+
|
202 |
+
// Memory allocation
|
203 |
+
|
204 |
+
void * ggml_aligned_malloc(size_t size);
|
205 |
+
void ggml_aligned_free(void * ptr, size_t size);
|
206 |
+
|
207 |
+
#ifdef __cplusplus
|
208 |
+
}
|
209 |
+
#endif
|
ggml-model-gpt-2-774M.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:44b54a6ab261de692b791d6492940de6e606182158e60d59a630c26a38e3ccf8
|
3 |
+
size 1552422809
|
ggml-quants.c
ADDED
The diff for this file is too large to render.
See raw diff
|
|
ggml-quants.h
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#pragma once
|
2 |
+
|
3 |
+
#define GGML_COMMON_DECL_C
|
4 |
+
#include "ggml-common.h"
|
5 |
+
|
6 |
+
#include "ggml.h"
|
7 |
+
|
8 |
+
// GGML internal header
|
9 |
+
|
10 |
+
#ifdef __cplusplus
|
11 |
+
extern "C" {
|
12 |
+
#endif
|
13 |
+
|
14 |
+
// Quantization
|
15 |
+
void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
16 |
+
void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
17 |
+
void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
18 |
+
void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
19 |
+
void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
20 |
+
void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
21 |
+
|
22 |
+
void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
23 |
+
void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
24 |
+
void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
25 |
+
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
26 |
+
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
27 |
+
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
28 |
+
|
29 |
+
void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
|
30 |
+
void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
|
31 |
+
|
32 |
+
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
33 |
+
void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
34 |
+
void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
35 |
+
void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
36 |
+
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
37 |
+
|
38 |
+
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
39 |
+
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
40 |
+
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
41 |
+
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
42 |
+
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
43 |
+
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
44 |
+
|
45 |
+
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
46 |
+
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
47 |
+
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
48 |
+
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
49 |
+
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
50 |
+
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
51 |
+
|
52 |
+
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
53 |
+
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
54 |
+
|
55 |
+
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
56 |
+
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
57 |
+
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
58 |
+
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
59 |
+
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
60 |
+
|
61 |
+
// Dequantization
|
62 |
+
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
63 |
+
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
64 |
+
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
65 |
+
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
66 |
+
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
67 |
+
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
68 |
+
|
69 |
+
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
70 |
+
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
71 |
+
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
72 |
+
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
73 |
+
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
74 |
+
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
75 |
+
|
76 |
+
void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
77 |
+
void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
78 |
+
|
79 |
+
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
80 |
+
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
81 |
+
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
82 |
+
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
83 |
+
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
84 |
+
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
85 |
+
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
86 |
+
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
87 |
+
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
88 |
+
|
89 |
+
// Dot product
|
90 |
+
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
91 |
+
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
92 |
+
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
93 |
+
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
94 |
+
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
95 |
+
|
96 |
+
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
97 |
+
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
98 |
+
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
99 |
+
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
100 |
+
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
101 |
+
|
102 |
+
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
103 |
+
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
104 |
+
|
105 |
+
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
106 |
+
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
107 |
+
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
108 |
+
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
109 |
+
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
110 |
+
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
111 |
+
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
112 |
+
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
113 |
+
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
114 |
+
|
115 |
+
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
116 |
+
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
117 |
+
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
118 |
+
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
119 |
+
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
120 |
+
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
121 |
+
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
122 |
+
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
123 |
+
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
124 |
+
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
125 |
+
|
126 |
+
size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
127 |
+
size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
128 |
+
|
129 |
+
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
130 |
+
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
131 |
+
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
132 |
+
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
133 |
+
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
134 |
+
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
135 |
+
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
136 |
+
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
137 |
+
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
138 |
+
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
139 |
+
|
140 |
+
void iq2xs_init_impl(enum ggml_type type);
|
141 |
+
void iq2xs_free_impl(enum ggml_type type);
|
142 |
+
void iq3xs_init_impl(int grid_size);
|
143 |
+
void iq3xs_free_impl(int grid_size);
|
144 |
+
|
145 |
+
#ifdef __cplusplus
|
146 |
+
}
|
147 |
+
#endif
|
ggml.c
ADDED
The diff for this file is too large to render.
See raw diff
|
|
ggml.h
ADDED
The diff for this file is too large to render.
See raw diff
|
|
main-ctx.cpp
ADDED
@@ -0,0 +1,841 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//#include "ggml.h"
|
2 |
+
//
|
3 |
+
//#include "common.h"
|
4 |
+
//#include "common-ggml.h"
|
5 |
+
//
|
6 |
+
//#include <cassert>
|
7 |
+
//#include <cmath>
|
8 |
+
//#include <cstdio>
|
9 |
+
//#include <cstring>
|
10 |
+
//#include <fstream>
|
11 |
+
//#include <map>
|
12 |
+
//#include <string>
|
13 |
+
//#include <vector>
|
14 |
+
//
|
15 |
+
//#if defined(_MSC_VER)
|
16 |
+
//#pragma warning(disable: 4244 4267) // possible loss of data
|
17 |
+
//#endif
|
18 |
+
//
|
19 |
+
//// default hparams (GPT-2 117M)
|
20 |
+
//struct gpt2_hparams {
|
21 |
+
// int32_t n_vocab = 50257; // Vocabulary size remains the same
|
22 |
+
// int32_t n_ctx = 1024; // Maximum context length (sequence length)
|
23 |
+
// int32_t n_embd = 1024; // Embedding dimensionality
|
24 |
+
// int32_t n_head = 16; // Number of attention heads
|
25 |
+
// int32_t n_layer = 24; // Number of transformer layers
|
26 |
+
// int32_t ftype = 1; // Set to 1 for FP16 precision (optional)
|
27 |
+
// float eps = 1e-5f; // Small constant for numerical stability
|
28 |
+
//};
|
29 |
+
//
|
30 |
+
//struct gpt2_layer {
|
31 |
+
// // normalization
|
32 |
+
// struct ggml_tensor * ln_1_g;
|
33 |
+
// struct ggml_tensor * ln_1_b;
|
34 |
+
//
|
35 |
+
// struct ggml_tensor * ln_2_g;
|
36 |
+
// struct ggml_tensor * ln_2_b;
|
37 |
+
//
|
38 |
+
// // attention
|
39 |
+
// struct ggml_tensor * c_attn_attn_w;
|
40 |
+
// struct ggml_tensor * c_attn_attn_b;
|
41 |
+
//
|
42 |
+
// struct ggml_tensor * c_attn_proj_w;
|
43 |
+
// struct ggml_tensor * c_attn_proj_b;
|
44 |
+
//
|
45 |
+
// // mlp
|
46 |
+
// struct ggml_tensor * c_mlp_fc_w;
|
47 |
+
// struct ggml_tensor * c_mlp_fc_b;
|
48 |
+
//
|
49 |
+
// struct ggml_tensor * c_mlp_proj_w;
|
50 |
+
// struct ggml_tensor * c_mlp_proj_b;
|
51 |
+
//};
|
52 |
+
//
|
53 |
+
//struct gpt2_model {
|
54 |
+
// gpt2_hparams hparams;
|
55 |
+
//
|
56 |
+
// // normalization
|
57 |
+
// struct ggml_tensor * ln_f_g;
|
58 |
+
// struct ggml_tensor * ln_f_b;
|
59 |
+
//
|
60 |
+
// struct ggml_tensor * wte; // position embedding
|
61 |
+
// struct ggml_tensor * wpe; // token embedding
|
62 |
+
// struct ggml_tensor * lm_head; // language model head
|
63 |
+
//
|
64 |
+
// std::vector<gpt2_layer> layers;
|
65 |
+
//
|
66 |
+
// // key + value memory
|
67 |
+
// struct ggml_tensor * memory_k;
|
68 |
+
// struct ggml_tensor * memory_v;
|
69 |
+
//
|
70 |
+
// //
|
71 |
+
// struct ggml_context * ctx_w;
|
72 |
+
// std::map<std::string, struct ggml_tensor *> tensors;
|
73 |
+
//};
|
74 |
+
//
|
75 |
+
//// load the model's weights from a file
|
76 |
+
//bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
|
77 |
+
// printf("%s: loading model from '%s'\n", __func__, fname.c_str());
|
78 |
+
//
|
79 |
+
// auto fin = std::ifstream(fname, std::ios::binary);
|
80 |
+
// if (!fin) {
|
81 |
+
// fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
82 |
+
// return false;
|
83 |
+
// }
|
84 |
+
//
|
85 |
+
// // verify magic
|
86 |
+
// {
|
87 |
+
// uint32_t magic;
|
88 |
+
// fin.read((char *) &magic, sizeof(magic));
|
89 |
+
// if (magic != GGML_FILE_MAGIC) {
|
90 |
+
// fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
91 |
+
// return false;
|
92 |
+
// }
|
93 |
+
// }
|
94 |
+
//
|
95 |
+
// // load hparams
|
96 |
+
// {
|
97 |
+
// auto & hparams = model.hparams;
|
98 |
+
//
|
99 |
+
// fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
100 |
+
// fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
101 |
+
// fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
102 |
+
// fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
103 |
+
// fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
104 |
+
// fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
105 |
+
//
|
106 |
+
// const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
107 |
+
//
|
108 |
+
// printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
109 |
+
// printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
110 |
+
// printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
111 |
+
// printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
112 |
+
// printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
113 |
+
// printf("%s: ftype = %d\n", __func__, hparams.ftype);
|
114 |
+
// printf("%s: qntvr = %d\n", __func__, qntvr);
|
115 |
+
//
|
116 |
+
// hparams.ftype %= GGML_QNT_VERSION_FACTOR;
|
117 |
+
// }
|
118 |
+
//
|
119 |
+
// // load vocab
|
120 |
+
// {
|
121 |
+
// int32_t n_vocab = 0;
|
122 |
+
// fin.read((char *) &n_vocab, sizeof(n_vocab));
|
123 |
+
//
|
124 |
+
// if (n_vocab != model.hparams.n_vocab) {
|
125 |
+
// fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
126 |
+
// __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
|
127 |
+
// return false;
|
128 |
+
// }
|
129 |
+
//
|
130 |
+
// std::string word;
|
131 |
+
// std::vector<char> buf(128);
|
132 |
+
//
|
133 |
+
// for (int i = 0; i < n_vocab; i++) {
|
134 |
+
// uint32_t len;
|
135 |
+
// fin.read((char *) &len, sizeof(len));
|
136 |
+
//
|
137 |
+
// buf.resize(len);
|
138 |
+
// fin.read((char *) buf.data(), len);
|
139 |
+
// word.assign(buf.data(), len);
|
140 |
+
//
|
141 |
+
// vocab.token_to_id[word] = i;
|
142 |
+
// vocab.id_to_token[i] = word;
|
143 |
+
// }
|
144 |
+
// }
|
145 |
+
//
|
146 |
+
// // for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
147 |
+
// // in order to save memory and also to speed up the computation
|
148 |
+
// ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
149 |
+
// if (wtype == GGML_TYPE_COUNT) {
|
150 |
+
// fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
|
151 |
+
// __func__, fname.c_str(), model.hparams.ftype);
|
152 |
+
// return false;
|
153 |
+
// }
|
154 |
+
//
|
155 |
+
// auto & ctx = model.ctx_w;
|
156 |
+
//
|
157 |
+
// size_t ctx_size = 0;
|
158 |
+
//
|
159 |
+
// {
|
160 |
+
// const auto & hparams = model.hparams;
|
161 |
+
//
|
162 |
+
// const int n_embd = hparams.n_embd;
|
163 |
+
// const int n_layer = hparams.n_layer;
|
164 |
+
// const int n_ctx = hparams.n_ctx;
|
165 |
+
// const int n_vocab = hparams.n_vocab;
|
166 |
+
//
|
167 |
+
// ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
|
168 |
+
// ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
|
169 |
+
//
|
170 |
+
// ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // wte
|
171 |
+
// ctx_size += ggml_row_size(GGML_TYPE_F32, n_ctx*n_embd); // wpe
|
172 |
+
// ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // lm_head
|
173 |
+
//
|
174 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
|
175 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
|
176 |
+
//
|
177 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
|
178 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
|
179 |
+
//
|
180 |
+
// ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
|
181 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b
|
182 |
+
//
|
183 |
+
// ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
|
184 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b
|
185 |
+
//
|
186 |
+
// ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
|
187 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
|
188 |
+
//
|
189 |
+
// ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
|
190 |
+
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_proj_b
|
191 |
+
//
|
192 |
+
// ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
|
193 |
+
// ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
|
194 |
+
//
|
195 |
+
// ctx_size += (6 + 12*n_layer)*512; // object overhead
|
196 |
+
//
|
197 |
+
// printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
|
198 |
+
// printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
199 |
+
// }
|
200 |
+
//
|
201 |
+
// // create the ggml context
|
202 |
+
// {
|
203 |
+
// struct ggml_init_params params = {
|
204 |
+
// /*.mem_size =*/ ctx_size,
|
205 |
+
// /*.mem_buffer =*/ NULL,
|
206 |
+
// /*.no_alloc =*/ false,
|
207 |
+
// };
|
208 |
+
//
|
209 |
+
// model.ctx_w = ggml_init(params);
|
210 |
+
// if (!model.ctx_w) {
|
211 |
+
// fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
212 |
+
// return false;
|
213 |
+
// }
|
214 |
+
// }
|
215 |
+
//
|
216 |
+
// // prepare memory for the weights
|
217 |
+
// {
|
218 |
+
// const auto & hparams = model.hparams;
|
219 |
+
//
|
220 |
+
// const int n_embd = hparams.n_embd;
|
221 |
+
// const int n_layer = hparams.n_layer;
|
222 |
+
// const int n_ctx = hparams.n_ctx;
|
223 |
+
// const int n_vocab = hparams.n_vocab;
|
224 |
+
//
|
225 |
+
// model.layers.resize(n_layer);
|
226 |
+
//
|
227 |
+
// model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
228 |
+
// model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
229 |
+
//
|
230 |
+
// model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
231 |
+
// model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
|
232 |
+
// model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
233 |
+
//
|
234 |
+
// // map by name
|
235 |
+
// model.tensors["model/ln_f/g"] = model.ln_f_g;
|
236 |
+
// model.tensors["model/ln_f/b"] = model.ln_f_b;
|
237 |
+
//
|
238 |
+
// model.tensors["model/wte"] = model.wte;
|
239 |
+
// model.tensors["model/wpe"] = model.wpe;
|
240 |
+
// model.tensors["model/lm_head"] = model.lm_head;
|
241 |
+
//
|
242 |
+
// for (int i = 0; i < n_layer; ++i) {
|
243 |
+
// auto & layer = model.layers[i];
|
244 |
+
//
|
245 |
+
// layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
246 |
+
// layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
247 |
+
//
|
248 |
+
// layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
249 |
+
// layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
250 |
+
//
|
251 |
+
// layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
|
252 |
+
// layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
|
253 |
+
//
|
254 |
+
// layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
255 |
+
// layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
256 |
+
//
|
257 |
+
// layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
|
258 |
+
// layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
|
259 |
+
//
|
260 |
+
// layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
261 |
+
// layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
262 |
+
//
|
263 |
+
// // map by name
|
264 |
+
// model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
|
265 |
+
// model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
|
266 |
+
//
|
267 |
+
// model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
|
268 |
+
// model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
|
269 |
+
//
|
270 |
+
// model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
|
271 |
+
// model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
|
272 |
+
//
|
273 |
+
// model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
|
274 |
+
// model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
|
275 |
+
//
|
276 |
+
// model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
|
277 |
+
// model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
|
278 |
+
//
|
279 |
+
// model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
|
280 |
+
// model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
|
281 |
+
// }
|
282 |
+
// }
|
283 |
+
//
|
284 |
+
// // key + value memory
|
285 |
+
// {
|
286 |
+
// const auto & hparams = model.hparams;
|
287 |
+
//
|
288 |
+
// const int n_embd = hparams.n_embd;
|
289 |
+
// const int n_layer = hparams.n_layer;
|
290 |
+
// const int n_ctx = hparams.n_ctx;
|
291 |
+
//
|
292 |
+
// const int n_mem = n_layer*n_ctx;
|
293 |
+
// const int n_elements = n_embd*n_mem;
|
294 |
+
//
|
295 |
+
// model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
|
296 |
+
// model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
|
297 |
+
//
|
298 |
+
// const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
299 |
+
//
|
300 |
+
// printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
301 |
+
// }
|
302 |
+
//
|
303 |
+
// // load weights
|
304 |
+
// {
|
305 |
+
// size_t total_size = 0;
|
306 |
+
//
|
307 |
+
// bool has_lm_head = false;
|
308 |
+
//
|
309 |
+
// while (true) {
|
310 |
+
// int32_t n_dims;
|
311 |
+
// int32_t length;
|
312 |
+
// int32_t ttype;
|
313 |
+
//
|
314 |
+
// fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
315 |
+
// fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
316 |
+
// fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
317 |
+
//
|
318 |
+
// if (fin.eof()) {
|
319 |
+
// break;
|
320 |
+
// }
|
321 |
+
//
|
322 |
+
// int32_t nelements = 1;
|
323 |
+
// int32_t ne[2] = { 1, 1 };
|
324 |
+
// for (int i = 0; i < n_dims; ++i) {
|
325 |
+
// fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
326 |
+
// nelements *= ne[i];
|
327 |
+
// }
|
328 |
+
//
|
329 |
+
// std::string name(length, 0);
|
330 |
+
// fin.read(&name[0], length);
|
331 |
+
//
|
332 |
+
// if (model.tensors.find(name) == model.tensors.end()) {
|
333 |
+
// fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
|
334 |
+
// return false;
|
335 |
+
// }
|
336 |
+
//
|
337 |
+
// auto tensor = model.tensors[name];
|
338 |
+
// if (ggml_nelements(tensor) != nelements) {
|
339 |
+
// fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
|
340 |
+
// return false;
|
341 |
+
// }
|
342 |
+
//
|
343 |
+
// if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
344 |
+
// fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
345 |
+
// __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
|
346 |
+
// return false;
|
347 |
+
// }
|
348 |
+
//
|
349 |
+
// // for debugging
|
350 |
+
// if (0) {
|
351 |
+
// printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
352 |
+
// }
|
353 |
+
//
|
354 |
+
// const size_t bpe = ggml_type_size(ggml_type(ttype));
|
355 |
+
//
|
356 |
+
// if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
357 |
+
// fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
358 |
+
// __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
|
359 |
+
// return false;
|
360 |
+
// }
|
361 |
+
//
|
362 |
+
// fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
363 |
+
//
|
364 |
+
// // GPT-2 models share the WTE tensor as the LM head
|
365 |
+
// if (name == "model/wte" && has_lm_head == false) {
|
366 |
+
// memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
|
367 |
+
// }
|
368 |
+
//
|
369 |
+
// if (name == "model/lm_head") {
|
370 |
+
// has_lm_head = true;
|
371 |
+
// }
|
372 |
+
//
|
373 |
+
// total_size += ggml_nbytes(tensor);
|
374 |
+
// }
|
375 |
+
//
|
376 |
+
// printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
|
377 |
+
// }
|
378 |
+
//
|
379 |
+
// fin.close();
|
380 |
+
//
|
381 |
+
// return true;
|
382 |
+
//}
|
383 |
+
//
|
384 |
+
//// evaluate the transformer
|
385 |
+
////
|
386 |
+
//// - model: the model
|
387 |
+
//// - n_threads: number of threads to use
|
388 |
+
//// - n_past: the context size so far
|
389 |
+
//// - embd_inp: the embeddings of the tokens in the context
|
390 |
+
//// - embd_w: the predicted logits for the next token
|
391 |
+
////
|
392 |
+
//bool gpt2_eval(
|
393 |
+
// const gpt2_model & model,
|
394 |
+
// const int n_threads,
|
395 |
+
// const int n_past,
|
396 |
+
// const std::vector<gpt_vocab::id> & embd_inp,
|
397 |
+
// std::vector<float> & embd_w,
|
398 |
+
// size_t & mem_per_token) {
|
399 |
+
// const int N = embd_inp.size();
|
400 |
+
//
|
401 |
+
// const auto & hparams = model.hparams;
|
402 |
+
//
|
403 |
+
// const int n_embd = hparams.n_embd;
|
404 |
+
// const int n_layer = hparams.n_layer;
|
405 |
+
// const int n_ctx = hparams.n_ctx;
|
406 |
+
// const int n_head = hparams.n_head;
|
407 |
+
// const int n_vocab = hparams.n_vocab;
|
408 |
+
//
|
409 |
+
// static size_t buf_size = 256u*1024*1024;
|
410 |
+
// static void * buf = malloc(buf_size);
|
411 |
+
//
|
412 |
+
// if (mem_per_token > 0 && mem_per_token*N > buf_size) {
|
413 |
+
// const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
|
414 |
+
// //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
415 |
+
//
|
416 |
+
// // reallocate
|
417 |
+
// buf_size = buf_size_new;
|
418 |
+
// buf = realloc(buf, buf_size);
|
419 |
+
// if (buf == nullptr) {
|
420 |
+
// fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
|
421 |
+
// return false;
|
422 |
+
// }
|
423 |
+
// }
|
424 |
+
//
|
425 |
+
// struct ggml_init_params params = {
|
426 |
+
// /*.mem_size =*/ buf_size,
|
427 |
+
// /*.mem_buffer =*/ buf,
|
428 |
+
// /*.no_alloc =*/ false,
|
429 |
+
// };
|
430 |
+
//
|
431 |
+
// struct ggml_context * ctx0 = ggml_init(params);
|
432 |
+
// struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
433 |
+
//
|
434 |
+
// struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
435 |
+
// memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
|
436 |
+
//
|
437 |
+
// struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
438 |
+
// for (int i = 0; i < N; ++i) {
|
439 |
+
// ((int32_t *) position->data)[i] = n_past + i;
|
440 |
+
// }
|
441 |
+
//
|
442 |
+
// // wte + wpe
|
443 |
+
// struct ggml_tensor * inpL =
|
444 |
+
// ggml_add(ctx0,
|
445 |
+
// ggml_get_rows(ctx0, model.wte, embd),
|
446 |
+
// ggml_get_rows(ctx0, model.wpe, position));
|
447 |
+
//
|
448 |
+
// for (int il = 0; il < n_layer; ++il) {
|
449 |
+
// struct ggml_tensor * cur;
|
450 |
+
//
|
451 |
+
// // norm
|
452 |
+
// {
|
453 |
+
// // [ 768, N]
|
454 |
+
// cur = ggml_norm(ctx0, inpL, hparams.eps);
|
455 |
+
//
|
456 |
+
// // cur = ln_1_g*cur + ln_1_b
|
457 |
+
// // [ 768, N]
|
458 |
+
// cur = ggml_add(ctx0,
|
459 |
+
// ggml_mul(ctx0,
|
460 |
+
// ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
|
461 |
+
// cur),
|
462 |
+
// ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
|
463 |
+
// }
|
464 |
+
//
|
465 |
+
// // attn
|
466 |
+
// // [2304, 768] - model.layers[il].c_attn_attn_w
|
467 |
+
// // [2304, 1] - model.layers[il].c_attn_attn_b
|
468 |
+
// // [ 768, N] - cur (in)
|
469 |
+
// // [2304, N] - cur (out)
|
470 |
+
// //
|
471 |
+
// // cur = attn_w*cur + attn_b
|
472 |
+
// // [2304, N]
|
473 |
+
// {
|
474 |
+
// cur = ggml_mul_mat(ctx0,
|
475 |
+
// model.layers[il].c_attn_attn_w,
|
476 |
+
// cur);
|
477 |
+
//
|
478 |
+
// cur = ggml_add(ctx0,
|
479 |
+
// ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
|
480 |
+
// cur);
|
481 |
+
// }
|
482 |
+
//
|
483 |
+
// // self-attention
|
484 |
+
// {
|
485 |
+
// struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
|
486 |
+
// struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
|
487 |
+
// struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
|
488 |
+
//
|
489 |
+
// // store key and value to memory
|
490 |
+
// if (N >= 1) {
|
491 |
+
// struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
|
492 |
+
// struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
|
493 |
+
//
|
494 |
+
// ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
495 |
+
// ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
496 |
+
// }
|
497 |
+
//
|
498 |
+
// // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
|
499 |
+
// // [64, N, 12]
|
500 |
+
// struct ggml_tensor * Q =
|
501 |
+
// ggml_permute(ctx0,
|
502 |
+
// ggml_cpy(ctx0,
|
503 |
+
// Qcur,
|
504 |
+
// ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
|
505 |
+
// 0, 2, 1, 3);
|
506 |
+
//
|
507 |
+
// // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
|
508 |
+
// // [64, n_past + N, 12]
|
509 |
+
// struct ggml_tensor * K =
|
510 |
+
// ggml_permute(ctx0,
|
511 |
+
// ggml_reshape_3d(ctx0,
|
512 |
+
// ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
|
513 |
+
// n_embd/n_head, n_head, n_past + N),
|
514 |
+
// 0, 2, 1, 3);
|
515 |
+
//
|
516 |
+
// // GG: flash attention
|
517 |
+
// //struct ggml_tensor * V =
|
518 |
+
// // ggml_cpy(ctx0,
|
519 |
+
// // ggml_permute(ctx0,
|
520 |
+
// // ggml_reshape_3d(ctx0,
|
521 |
+
// // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
522 |
+
// // n_embd/n_head, n_head, n_past + N),
|
523 |
+
// // 1, 2, 0, 3),
|
524 |
+
// // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
|
525 |
+
//
|
526 |
+
// //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
|
527 |
+
//
|
528 |
+
// // K * Q
|
529 |
+
// // [n_past + N, N, 12]
|
530 |
+
// struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
531 |
+
//
|
532 |
+
// // KQ_scaled = KQ / sqrt(n_embd/n_head)
|
533 |
+
// // [n_past + N, N, 12]
|
534 |
+
// struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));
|
535 |
+
//
|
536 |
+
// // KQ_masked = mask_past(KQ_scaled)
|
537 |
+
// // [n_past + N, N, 12]
|
538 |
+
// struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
539 |
+
//
|
540 |
+
// // KQ = soft_max(KQ_masked)
|
541 |
+
// // [n_past + N, N, 12]
|
542 |
+
// struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
543 |
+
//
|
544 |
+
// // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
545 |
+
// // [n_past + N, 64, 12]
|
546 |
+
// struct ggml_tensor * V_trans =
|
547 |
+
// ggml_cpy(ctx0,
|
548 |
+
// ggml_permute(ctx0,
|
549 |
+
// ggml_reshape_3d(ctx0,
|
550 |
+
// ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
551 |
+
// n_embd/n_head, n_head, n_past + N),
|
552 |
+
// 1, 2, 0, 3),
|
553 |
+
// ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
|
554 |
+
//
|
555 |
+
// // KQV = transpose(V) * KQ_soft_max
|
556 |
+
// // [64, N, 12]
|
557 |
+
// struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
558 |
+
//
|
559 |
+
// // KQV_merged = KQV.permute(0, 2, 1, 3)
|
560 |
+
// // [64, 12, N]
|
561 |
+
// struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
562 |
+
//
|
563 |
+
// // cur = KQV_merged.contiguous().view(n_embd, N)
|
564 |
+
// // [768, N]
|
565 |
+
// cur = ggml_cpy(ctx0,
|
566 |
+
// KQV_merged,
|
567 |
+
// ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
568 |
+
// }
|
569 |
+
//
|
570 |
+
// // projection
|
571 |
+
// // [ 768, 768] - model.layers[il].c_attn_proj_w
|
572 |
+
// // [ 768, 1] - model.layers[il].c_attn_proj_b
|
573 |
+
// // [ 768, N] - cur (in)
|
574 |
+
// // [ 768, N] - cur (out)
|
575 |
+
// //
|
576 |
+
// // cur = proj_w*cur + proj_b
|
577 |
+
// // [768, N]
|
578 |
+
// {
|
579 |
+
// cur = ggml_mul_mat(ctx0,
|
580 |
+
// model.layers[il].c_attn_proj_w,
|
581 |
+
// cur);
|
582 |
+
//
|
583 |
+
// cur = ggml_add(ctx0,
|
584 |
+
// ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
|
585 |
+
// cur);
|
586 |
+
// }
|
587 |
+
//
|
588 |
+
// // add the input
|
589 |
+
// cur = ggml_add(ctx0, cur, inpL);
|
590 |
+
//
|
591 |
+
// struct ggml_tensor * inpFF = cur;
|
592 |
+
//
|
593 |
+
// // feed-forward network
|
594 |
+
// {
|
595 |
+
// // norm
|
596 |
+
// {
|
597 |
+
// cur = ggml_norm(ctx0, inpFF, hparams.eps);
|
598 |
+
//
|
599 |
+
// // cur = ln_2_g*cur + ln_2_b
|
600 |
+
// // [ 768, N]
|
601 |
+
// cur = ggml_add(ctx0,
|
602 |
+
// ggml_mul(ctx0,
|
603 |
+
// ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
|
604 |
+
// cur),
|
605 |
+
// ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
|
606 |
+
// }
|
607 |
+
//
|
608 |
+
// // fully connected
|
609 |
+
// // [3072, 768] - model.layers[il].c_mlp_fc_w
|
610 |
+
// // [3072, 1] - model.layers[il].c_mlp_fc_b
|
611 |
+
// // [ 768, N] - cur (in)
|
612 |
+
// // [3072, N] - cur (out)
|
613 |
+
// //
|
614 |
+
// // cur = fc_w*cur + fc_b
|
615 |
+
// // [3072, N]
|
616 |
+
// cur = ggml_mul_mat(ctx0,
|
617 |
+
// model.layers[il].c_mlp_fc_w,
|
618 |
+
// cur);
|
619 |
+
//
|
620 |
+
// cur = ggml_add(ctx0,
|
621 |
+
// ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
|
622 |
+
// cur);
|
623 |
+
//
|
624 |
+
// // GELU activation
|
625 |
+
// // [3072, N]
|
626 |
+
// cur = ggml_gelu(ctx0, cur);
|
627 |
+
//
|
628 |
+
// // projection
|
629 |
+
// // [ 768, 3072] - model.layers[il].c_mlp_proj_w
|
630 |
+
// // [ 768, 1] - model.layers[il].c_mlp_proj_b
|
631 |
+
// // [3072, N] - cur (in)
|
632 |
+
// // [ 768, N] - cur (out)
|
633 |
+
// //
|
634 |
+
// // cur = proj_w*cur + proj_b
|
635 |
+
// // [768, N]
|
636 |
+
// cur = ggml_mul_mat(ctx0,
|
637 |
+
// model.layers[il].c_mlp_proj_w,
|
638 |
+
// cur);
|
639 |
+
//
|
640 |
+
// cur = ggml_add(ctx0,
|
641 |
+
// ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
|
642 |
+
// cur);
|
643 |
+
// }
|
644 |
+
//
|
645 |
+
// // input for next layer
|
646 |
+
// inpL = ggml_add(ctx0, cur, inpFF);
|
647 |
+
// }
|
648 |
+
//
|
649 |
+
// // norm
|
650 |
+
// {
|
651 |
+
// // [ 768, N]
|
652 |
+
// inpL = ggml_norm(ctx0, inpL, hparams.eps);
|
653 |
+
//
|
654 |
+
// // inpL = ln_f_g*inpL + ln_f_b
|
655 |
+
// // [ 768, N]
|
656 |
+
// inpL = ggml_add(ctx0,
|
657 |
+
// ggml_mul(ctx0,
|
658 |
+
// ggml_repeat(ctx0, model.ln_f_g, inpL),
|
659 |
+
// inpL),
|
660 |
+
// ggml_repeat(ctx0, model.ln_f_b, inpL));
|
661 |
+
// }
|
662 |
+
//
|
663 |
+
// // inpL = WTE * inpL
|
664 |
+
// // [ 768, 50257] - model.lm_head
|
665 |
+
// // [ 768, N] - inpL
|
666 |
+
// inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
|
667 |
+
//
|
668 |
+
// // logits -> probs
|
669 |
+
// //inpL = ggml_soft_max_inplace(ctx0, inpL);
|
670 |
+
//
|
671 |
+
// // run the computation
|
672 |
+
// ggml_build_forward_expand(gf, inpL);
|
673 |
+
// ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
|
674 |
+
//
|
675 |
+
// //if (n_past%100 == 0) {
|
676 |
+
// // ggml_graph_print (&gf);
|
677 |
+
// // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
|
678 |
+
// //}
|
679 |
+
//
|
680 |
+
// //embd_w.resize(n_vocab*N);
|
681 |
+
// //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
682 |
+
//
|
683 |
+
// // return result just for the last token
|
684 |
+
// embd_w.resize(n_vocab);
|
685 |
+
// memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
686 |
+
//
|
687 |
+
// if (mem_per_token == 0) {
|
688 |
+
// mem_per_token = ggml_used_mem(ctx0)/N;
|
689 |
+
// }
|
690 |
+
// //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
|
691 |
+
//
|
692 |
+
// ggml_free(ctx0);
|
693 |
+
//
|
694 |
+
// return true;
|
695 |
+
//}
|
696 |
+
//
|
697 |
+
//int main(int argc, char ** argv) {
|
698 |
+
// ggml_time_init();
|
699 |
+
//
|
700 |
+
// const int64_t t_main_start_us = ggml_time_us();
|
701 |
+
//
|
702 |
+
// gpt_params params;
|
703 |
+
// params.model = "ggml-model-gpt-2-774M.bin";
|
704 |
+
//
|
705 |
+
// if (gpt_params_parse(argc, argv, params) == false) {
|
706 |
+
// return 1;
|
707 |
+
// }
|
708 |
+
//
|
709 |
+
// if (params.seed < 0) {
|
710 |
+
// params.seed = time(NULL);
|
711 |
+
// }
|
712 |
+
//
|
713 |
+
// printf("%s: seed = %d\n", __func__, params.seed);
|
714 |
+
//
|
715 |
+
// std::mt19937 rng(params.seed);
|
716 |
+
// if (params.prompt.empty()) {
|
717 |
+
// params.prompt = gpt_random_prompt(rng);
|
718 |
+
// }
|
719 |
+
//
|
720 |
+
// int64_t t_load_us = 0;
|
721 |
+
//
|
722 |
+
// gpt_vocab vocab;
|
723 |
+
// gpt2_model model;
|
724 |
+
//
|
725 |
+
// // load the model
|
726 |
+
// {
|
727 |
+
// const int64_t t_start_us = ggml_time_us();
|
728 |
+
//
|
729 |
+
// if (!gpt2_model_load(params.model, model, vocab)) {
|
730 |
+
// fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
731 |
+
// return 1;
|
732 |
+
// }
|
733 |
+
//
|
734 |
+
// t_load_us = ggml_time_us() - t_start_us;
|
735 |
+
//
|
736 |
+
// test_gpt_tokenizer(vocab, params.token_test);
|
737 |
+
// }
|
738 |
+
//
|
739 |
+
// while(true) {
|
740 |
+
// int n_past = 0;
|
741 |
+
//
|
742 |
+
// int64_t t_sample_us = 0;
|
743 |
+
// int64_t t_predict_us = 0;
|
744 |
+
//
|
745 |
+
// std::vector<float> logits;
|
746 |
+
//
|
747 |
+
// // tokenize the prompt
|
748 |
+
// std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
|
749 |
+
//
|
750 |
+
// params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
|
751 |
+
//
|
752 |
+
// printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
753 |
+
// printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
|
754 |
+
// for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
|
755 |
+
// printf("%d ", embd_inp[i]);
|
756 |
+
// }
|
757 |
+
// printf("\n\n");
|
758 |
+
//
|
759 |
+
// // submit the input prompt token-by-token
|
760 |
+
// // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
|
761 |
+
// std::vector<gpt_vocab::id> embd;
|
762 |
+
//
|
763 |
+
// // determine the required inference memory per token:
|
764 |
+
// size_t mem_per_token = 0;
|
765 |
+
// gpt2_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
766 |
+
//
|
767 |
+
// for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
|
768 |
+
// // predict
|
769 |
+
// if (embd.size() > 0) {
|
770 |
+
// const int64_t t_start_us = ggml_time_us();
|
771 |
+
//
|
772 |
+
// if (!gpt2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
|
773 |
+
// printf("Failed to predict\n");
|
774 |
+
// return 1;
|
775 |
+
// }
|
776 |
+
//
|
777 |
+
// t_predict_us += ggml_time_us() - t_start_us;
|
778 |
+
// }
|
779 |
+
//
|
780 |
+
// n_past += embd.size();
|
781 |
+
// embd.clear();
|
782 |
+
//
|
783 |
+
// if (i >= embd_inp.size()) {
|
784 |
+
// // sample next token
|
785 |
+
// const int top_k = params.top_k;
|
786 |
+
// const float top_p = params.top_p;
|
787 |
+
// const float temp = params.temp;
|
788 |
+
//
|
789 |
+
// const int n_vocab = model.hparams.n_vocab;
|
790 |
+
//
|
791 |
+
// gpt_vocab::id id = 0;
|
792 |
+
//
|
793 |
+
// {
|
794 |
+
// const int64_t t_start_sample_us = ggml_time_us();
|
795 |
+
//
|
796 |
+
// id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
|
797 |
+
//
|
798 |
+
// t_sample_us += ggml_time_us() - t_start_sample_us;
|
799 |
+
// }
|
800 |
+
//
|
801 |
+
// // add it to the context
|
802 |
+
// embd.push_back(id);
|
803 |
+
// } else {
|
804 |
+
// // if here, it means we are still processing the input prompt
|
805 |
+
// for (size_t k = i; k < embd_inp.size(); k++) {
|
806 |
+
// embd.push_back(embd_inp[k]);
|
807 |
+
// if (int32_t(embd.size()) >= params.n_batch) {
|
808 |
+
// break;
|
809 |
+
// }
|
810 |
+
// }
|
811 |
+
// i += embd.size() - 1;
|
812 |
+
// }
|
813 |
+
//
|
814 |
+
// // display text
|
815 |
+
// for (auto id : embd) {
|
816 |
+
// printf("%s", vocab.id_to_token[id].c_str());
|
817 |
+
// }
|
818 |
+
// fflush(stdout);
|
819 |
+
//
|
820 |
+
// // end of text token
|
821 |
+
// if (embd.back() == 50256) {
|
822 |
+
// // report timing
|
823 |
+
// {
|
824 |
+
// const int64_t t_main_end_us = ggml_time_us();
|
825 |
+
//
|
826 |
+
// printf("\n\n");
|
827 |
+
// printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
|
828 |
+
// printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
|
829 |
+
// printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
|
830 |
+
// printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
|
831 |
+
// printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
|
832 |
+
// }
|
833 |
+
// break;
|
834 |
+
// }
|
835 |
+
// }
|
836 |
+
// }
|
837 |
+
//
|
838 |
+
// ggml_free(model.ctx_w);
|
839 |
+
//
|
840 |
+
// return 0;
|
841 |
+
//}
|
quantize.cpp
ADDED
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include "ggml.h"
|
2 |
+
|
3 |
+
#include "common.h"
|
4 |
+
#include "common-ggml.h"
|
5 |
+
|
6 |
+
#include <cassert>
|
7 |
+
#include <cmath>
|
8 |
+
#include <cstdio>
|
9 |
+
#include <cstring>
|
10 |
+
#include <fstream>
|
11 |
+
#include <map>
|
12 |
+
#include <string>
|
13 |
+
#include <vector>
|
14 |
+
#include <regex>
|
15 |
+
|
16 |
+
// default hparams (GPT-2 117M)
|
17 |
+
struct gpt2_hparams {
|
18 |
+
int32_t n_vocab = 50257;
|
19 |
+
int32_t n_ctx = 1024;
|
20 |
+
int32_t n_embd = 768;
|
21 |
+
int32_t n_head = 12;
|
22 |
+
int32_t n_layer = 12;
|
23 |
+
int32_t ftype = 1;
|
24 |
+
};
|
25 |
+
|
26 |
+
// quantize a model
|
27 |
+
bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
|
28 |
+
gpt_vocab vocab;
|
29 |
+
|
30 |
+
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
|
31 |
+
|
32 |
+
auto finp = std::ifstream(fname_inp, std::ios::binary);
|
33 |
+
if (!finp) {
|
34 |
+
fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
|
35 |
+
return false;
|
36 |
+
}
|
37 |
+
|
38 |
+
auto fout = std::ofstream(fname_out, std::ios::binary);
|
39 |
+
if (!fout) {
|
40 |
+
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
|
41 |
+
return false;
|
42 |
+
}
|
43 |
+
|
44 |
+
// verify magic
|
45 |
+
{
|
46 |
+
uint32_t magic;
|
47 |
+
finp.read((char *) &magic, sizeof(magic));
|
48 |
+
if (magic != GGML_FILE_MAGIC) {
|
49 |
+
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
|
50 |
+
return false;
|
51 |
+
}
|
52 |
+
|
53 |
+
fout.write((char *) &magic, sizeof(magic));
|
54 |
+
}
|
55 |
+
|
56 |
+
gpt2_hparams hparams;
|
57 |
+
|
58 |
+
// load hparams
|
59 |
+
{
|
60 |
+
finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
61 |
+
finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
62 |
+
finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
63 |
+
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
64 |
+
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
65 |
+
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
66 |
+
|
67 |
+
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
68 |
+
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
|
69 |
+
|
70 |
+
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
71 |
+
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
72 |
+
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
73 |
+
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
74 |
+
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
75 |
+
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
|
76 |
+
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
|
77 |
+
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
|
78 |
+
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
|
79 |
+
|
80 |
+
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
81 |
+
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
82 |
+
fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
83 |
+
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
|
84 |
+
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
85 |
+
fout.write((char *) &ftype_dst, sizeof(ftype_dst));
|
86 |
+
}
|
87 |
+
|
88 |
+
// load vocab
|
89 |
+
{
|
90 |
+
int32_t n_vocab = 0;
|
91 |
+
finp.read ((char *) &n_vocab, sizeof(n_vocab));
|
92 |
+
fout.write((char *) &n_vocab, sizeof(n_vocab));
|
93 |
+
|
94 |
+
if (n_vocab != hparams.n_vocab) {
|
95 |
+
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
96 |
+
__func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
|
97 |
+
return false;
|
98 |
+
}
|
99 |
+
|
100 |
+
std::string word;
|
101 |
+
for (int i = 0; i < n_vocab; i++) {
|
102 |
+
uint32_t len;
|
103 |
+
finp.read ((char *) &len, sizeof(len));
|
104 |
+
fout.write((char *) &len, sizeof(len));
|
105 |
+
|
106 |
+
word.resize(len);
|
107 |
+
finp.read ((char *) word.data(), len);
|
108 |
+
fout.write((char *) word.data(), len);
|
109 |
+
|
110 |
+
vocab.token_to_id[word] = i;
|
111 |
+
vocab.id_to_token[i] = word;
|
112 |
+
}
|
113 |
+
}
|
114 |
+
|
115 |
+
// regexes of tensor names to be quantized
|
116 |
+
const std::vector<std::string> to_quant = {
|
117 |
+
"model/wte",
|
118 |
+
"model/lm_head",
|
119 |
+
"model/h.*/attn/c_attn/w",
|
120 |
+
"model/h.*/attn/c_proj/w",
|
121 |
+
"model/h.*/mlp/c_fc/w",
|
122 |
+
"model/h.*/mlp/c_proj/w",
|
123 |
+
};
|
124 |
+
|
125 |
+
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
|
126 |
+
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
|
127 |
+
return false;
|
128 |
+
}
|
129 |
+
|
130 |
+
finp.close();
|
131 |
+
fout.close();
|
132 |
+
|
133 |
+
return true;
|
134 |
+
}
|
135 |
+
|
136 |
+
// usage:
|
137 |
+
// ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type
|
138 |
+
//
|
139 |
+
int main(int argc, char ** argv) {
|
140 |
+
if (argc != 4) {
|
141 |
+
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
|
142 |
+
ggml_print_ftypes(stderr);
|
143 |
+
return 1;
|
144 |
+
}
|
145 |
+
|
146 |
+
// needed to initialize f16 tables
|
147 |
+
{
|
148 |
+
struct ggml_init_params params = { 0, NULL, false };
|
149 |
+
struct ggml_context * ctx = ggml_init(params);
|
150 |
+
ggml_free(ctx);
|
151 |
+
}
|
152 |
+
|
153 |
+
const std::string fname_inp = argv[1];
|
154 |
+
const std::string fname_out = argv[2];
|
155 |
+
|
156 |
+
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
|
157 |
+
|
158 |
+
const int64_t t_main_start_us = ggml_time_us();
|
159 |
+
|
160 |
+
int64_t t_quantize_us = 0;
|
161 |
+
|
162 |
+
// load the model
|
163 |
+
{
|
164 |
+
const int64_t t_start_us = ggml_time_us();
|
165 |
+
|
166 |
+
if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
|
167 |
+
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
|
168 |
+
return 1;
|
169 |
+
}
|
170 |
+
|
171 |
+
t_quantize_us = ggml_time_us() - t_start_us;
|
172 |
+
}
|
173 |
+
|
174 |
+
// report timing
|
175 |
+
{
|
176 |
+
const int64_t t_main_end_us = ggml_time_us();
|
177 |
+
|
178 |
+
printf("\n");
|
179 |
+
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
|
180 |
+
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
|
181 |
+
}
|
182 |
+
|
183 |
+
return 0;
|
184 |
+
}
|