committed on
Upload 21 files
Browse files
- GPT2.cbp +63 -0
- GPT2.cscope_file_list +22 -0
- GPT2.depend +240 -0
- GPT2.layout +80 -0
- common-ggml.cpp +244 -0
- common-ggml.h +18 -0
- common.cpp +911 -0
- common.h +343 -0
- dr_wav.h +0 -0
- ggml-aarch64.c +0 -0
- ggml-aarch64.h +39 -0
- ggml-common.h +0 -0
- ggml-cpu-impl.h +614 -0
- ggml-impl.h +209 -0
- ggml-model-gpt-2-774M.bin +3 -0
- ggml-quants.c +0 -0
- ggml-quants.h +147 -0
- ggml.c +0 -0
- ggml.h +0 -0
- main-ctx.cpp +841 -0
- quantize.cpp +184 -0
@@ -0,0 +1,63 @@
1 |
<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
2 |
3 |
<FileVersion major="1" minor="6" />
4 |
5 |
<Option title="GPT2" />
6 |
<Option pch_mode="2" />
7 |
<Option compiler="gcc" />
8 |
9 |
<Target title="Debug">
10 |
<Option output="bin/Debug/GPT2" prefix_auto="1" extension_auto="1" />
11 |
<Option object_output="obj/Debug/" />
12 |
<Option type="1" />
13 |
<Option compiler="gcc" />
14 |
15 |
<Add option="-g" />
16 |
17 |
18 |
<Target title="Release">
19 |
<Option output="bin/Release/GPT2" prefix_auto="1" extension_auto="1" />
20 |
<Option object_output="obj/Release/" />
21 |
<Option type="1" />
22 |
<Option compiler="gcc" />
23 |
24 |
<Add option="-O2" />
25 |
26 |
27 |
<Add option="-s" />
28 |
29 |
30 |
31 |
32 |
<Add option="-Wall" />
33 |
<Add option="-fexceptions" />
34 |
35 |
<Unit filename="GPT2.cbp" />
36 |
<Unit filename="GPT2.layout" />
37 |
<Unit filename="common-ggml.cpp" />
38 |
<Unit filename="common-ggml.h" />
39 |
<Unit filename="common.cpp" />
40 |
<Unit filename="common.h" />
41 |
<Unit filename="dr_wav.h" />
42 |
<Unit filename="ggml-aarch64.c">
43 |
<Option compilerVar="CC" />
44 |
45 |
<Unit filename="ggml-aarch64.h" />
46 |
<Unit filename="ggml-common.h" />
47 |
<Unit filename="ggml-cpu-impl.h" />
48 |
<Unit filename="ggml-impl.h" />
49 |
<Unit filename="ggml-quants.c">
50 |
<Option compilerVar="CC" />
51 |
52 |
<Unit filename="ggml-quants.h" />
53 |
<Unit filename="ggml.c">
54 |
<Option compilerVar="CC" />
55 |
56 |
<Unit filename="ggml.h" />
57 |
<Unit filename="main-ctx.cpp" />
58 |
<Unit filename="quantize.cpp" />
59 |
60 |
<lib_finder disable_auto="1" />
61 |
62 |
63 |
@@ -0,0 +1,22 @@
1 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-alloc.h"
2 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-alloc.c"
3 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-impl.h"
4 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common-ggml.cpp"
5 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-quants.c"
6 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-aarch64.h"
7 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common.cpp"
8 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-quants.h"
9 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-backend-impl.h"
10 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml.h"
11 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-common.h"
12 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml.c"
13 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-backend.h"
14 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common.h"
15 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\GPT2.cbp"
16 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\common-ggml.h"
17 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-cpu-impl.h"
18 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-aarch64.c"
19 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\GPT2.layout"
20 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\ggml-backend.cpp"
21 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\dr_wav.h"
22 |
"C:\Users\Caleb P. Nwokocha\CodeBlocksProjects\GPT2\main-ctx.cpp"
@@ -0,0 +1,240 @@
1 |
# depslib dependency file v1.0
2 |
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\common-ggml.cpp
3 |
4 |
5 |
6 |
7 |
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\common-ggml.h
8 |
9 |
10 |
11 |
12 |
13 |
1730691388 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml.h
14 |
15 |
16 |
17 |
18 |
19 |
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-alloc.c
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-alloc.h
32 |
33 |
34 |
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-backend-impl.h
35 |
36 |
37 |
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-backend.h
38 |
39 |
40 |
41 |
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-impl.h
42 |
43 |
44 |
45 |
46 |
47 |
48 |
1730735604 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\common.cpp
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\common.h
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\dr_wav.h
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-backend.cpp
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-aarch64.c
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-common.h
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-quants.h
136 |
137 |
138 |
139 |
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-cpu-impl.h
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
1730534952 c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-aarch64.h
159 |
160 |
161 |
162 |
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml-quants.c
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
1730734998 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\ggml.c
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
1730683892 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\main-alloc.cpp
224 |
225 |
1730737838 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\main-ctx.cpp
226 |
227 |
1730534952 source:c:\users\caleb p. nwokocha\codeblocksprojects\gpt2\quantize.cpp
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
@@ -0,0 +1,80 @@
1 |
<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
2 |
3 |
<FileVersion major="1" minor="0" />
4 |
<ActiveTarget name="Debug" />
5 |
<File name="ggml-impl.h" open="1" top="0" tabpos="10" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
6 |
7 |
<Cursor1 position="6388" topLine="0" />
8 |
9 |
10 |
<File name="common-ggml.cpp" open="1" top="0" tabpos="4" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
11 |
12 |
<Cursor1 position="223" topLine="135" />
13 |
14 |
15 |
<File name="ggml-quants.c" open="1" top="0" tabpos="11" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
16 |
17 |
<Cursor1 position="2705" topLine="0" />
18 |
19 |
20 |
<File name="ggml-aarch64.h" open="1" top="0" tabpos="8" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
21 |
22 |
<Cursor1 position="1519" topLine="0" />
23 |
24 |
25 |
<File name="common.cpp" open="1" top="0" tabpos="5" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
26 |
27 |
<Cursor1 position="152" topLine="0" />
28 |
29 |
30 |
<File name="ggml-quants.h" open="1" top="0" tabpos="13" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
31 |
32 |
<Cursor1 position="0" topLine="128" />
33 |
34 |
35 |
<File name="quantize.cpp" open="1" top="1" tabpos="15" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
36 |
37 |
<Cursor1 position="4241" topLine="139" />
38 |
39 |
40 |
<File name="ggml.h" open="1" top="0" tabpos="1" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
41 |
42 |
<Cursor1 position="8069" topLine="212" />
43 |
44 |
45 |
<File name="ggml-common.h" open="1" top="0" tabpos="14" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
46 |
47 |
<Cursor1 position="0" topLine="0" />
48 |
49 |
50 |
<File name="ggml.c" open="1" top="0" tabpos="6" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
51 |
52 |
<Cursor1 position="522" topLine="0" />
53 |
54 |
55 |
<File name="common.h" open="1" top="0" tabpos="2" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
56 |
57 |
<Cursor1 position="0" topLine="0" />
58 |
59 |
60 |
<File name="common-ggml.h" open="1" top="0" tabpos="3" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
61 |
62 |
<Cursor1 position="141" topLine="0" />
63 |
64 |
65 |
<File name="ggml-cpu-impl.h" open="1" top="0" tabpos="9" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
66 |
67 |
<Cursor1 position="0" topLine="0" />
68 |
69 |
70 |
<File name="ggml-aarch64.c" open="1" top="0" tabpos="7" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
71 |
72 |
<Cursor1 position="442" topLine="0" />
73 |
74 |
75 |
<File name="main-ctx.cpp" open="1" top="0" tabpos="12" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
76 |
77 |
<Cursor1 position="114" topLine="659" />
78 |
79 |
80 |
@@ -0,0 +1,244 @@
1 |
#include "common-ggml.h"
2 |
3 |
#include <regex>
4 |
#include <map>
5 |
6 |
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
7 |
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
8 |
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
9 |
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
10 |
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
11 |
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
void ggml_print_ftypes(FILE * fp) {
20 |
for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
21 |
fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
22 |
23 |
24 |
25 |
enum ggml_ftype ggml_parse_ftype(const char * str) {
26 |
enum ggml_ftype ftype;
27 |
if (str[0] == 'q') {
28 |
const auto it = GGML_FTYPE_MAP.find(str);
29 |
if (it == GGML_FTYPE_MAP.end()) {
30 |
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
31 |
32 |
33 |
ftype = it->second;
34 |
} else {
35 |
ftype = (enum ggml_ftype) atoi(str);
36 |
37 |
38 |
return ftype;
39 |
40 |
41 |
bool ggml_common_quantize_0(
42 |
std::ifstream & finp,
43 |
std::ofstream & fout,
44 |
const ggml_ftype ftype,
45 |
const std::vector<std::string> & to_quant,
46 |
const std::vector<std::string> & to_skip) {
47 |
48 |
ggml_type qtype = GGML_TYPE_F32;
49 |
50 |
switch (ftype) {
51 |
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
52 |
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
53 |
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
54 |
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
55 |
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
56 |
case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break;
57 |
case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break;
58 |
case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break;
59 |
case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break;
60 |
case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break;
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
case GGML_FTYPE_MOSTLY_Q4_0_4_4:
76 |
case GGML_FTYPE_MOSTLY_Q4_0_4_8:
77 |
case GGML_FTYPE_MOSTLY_Q4_0_8_8:
78 |
79 |
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
80 |
return false;
81 |
82 |
83 |
84 |
if (!ggml_is_quantized(qtype)) {
85 |
fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
86 |
return false;
87 |
88 |
89 |
size_t total_size_org = 0;
90 |
size_t total_size_new = 0;
91 |
92 |
std::vector<float> work;
93 |
94 |
std::vector<uint8_t> data_u8;
95 |
std::vector<ggml_fp16_t> data_f16;
96 |
std::vector<float> data_f32;
97 |
98 |
while (true) {
99 |
int32_t n_dims;
100 |
int32_t length;
101 |
int32_t ttype;
102 |
103 |
+<char *>(&n_dims), sizeof(n_dims));
104 |
+<char *>(&length), sizeof(length));
105 |
+<char *>(&ttype), sizeof(ttype));
106 |
107 |
if (finp.eof()) {
108 |
109 |
110 |
111 |
int32_t nelements = 1;
112 |
int32_t ne[4] = { 1, 1, 1, 1 };
113 |
for (int i = 0; i < n_dims; ++i) {
114 |
+ (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
115 |
nelements *= ne[i];
116 |
117 |
118 |
std::string name(length, 0);
119 |
+ (&name[0], length);
120 |
121 |
printf("%64s - [%5d, %5d, %5d], type = %6s ",, ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));
122 |
123 |
bool quantize = false;
124 |
125 |
// check if we should quantize this tensor
126 |
for (const auto & s : to_quant) {
127 |
if (std::regex_match(name, std::regex(s))) {
128 |
quantize = true;
129 |
130 |
131 |
132 |
133 |
// check if we should skip this tensor
134 |
for (const auto & s : to_skip) {
135 |
if (std::regex_match(name, std::regex(s))) {
136 |
quantize = false;
137 |
138 |
139 |
140 |
141 |
// quantize only 2D tensors
142 |
quantize &= (n_dims == 2);
143 |
144 |
if (quantize) {
145 |
if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
146 |
fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
147 |
return false;
148 |
149 |
150 |
if (ttype == GGML_TYPE_F16) {
151 |
152 |
+<char *>(, nelements * sizeof(ggml_fp16_t));
153 |
154 |
for (int i = 0; i < nelements; ++i) {
155 |
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
156 |
157 |
} else {
158 |
159 |
+<char *>(, nelements * sizeof(float));
160 |
161 |
162 |
ttype = qtype;
163 |
} else {
164 |
const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
165 |
166 |
167 |
+<char *>(, nelements * bpe);
168 |
169 |
170 |
fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
171 |
fout.write(reinterpret_cast<char *>(&length), sizeof(length));
172 |
fout.write(reinterpret_cast<char *>(&ttype), sizeof(ttype));
173 |
for (int i = 0; i < n_dims; ++i) {
174 |
fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
175 |
176 |
fout.write(&name[0], length);
177 |
178 |
if (quantize) {
179 |
work.resize(nelements); // for quantization
180 |
181 |
size_t cur_size = 0;
182 |
switch ((ggml_type) ttype) {
183 |
case GGML_TYPE_Q4_0:
184 |
case GGML_TYPE_Q4_1:
185 |
case GGML_TYPE_Q5_0:
186 |
case GGML_TYPE_Q5_1:
187 |
case GGML_TYPE_Q8_0:
188 |
case GGML_TYPE_Q2_K:
189 |
case GGML_TYPE_Q3_K:
190 |
case GGML_TYPE_Q4_K:
191 |
case GGML_TYPE_Q5_K:
192 |
case GGML_TYPE_Q6_K:
193 |
194 |
cur_size = ggml_quantize_chunk((ggml_type) ttype,,, 0, nelements/ne[0], ne[0], nullptr);
195 |
} break;
196 |
case GGML_TYPE_F32:
197 |
case GGML_TYPE_F16:
198 |
case GGML_TYPE_I8:
199 |
case GGML_TYPE_I16:
200 |
case GGML_TYPE_I32:
201 |
case GGML_TYPE_I64:
202 |
case GGML_TYPE_F64:
203 |
case GGML_TYPE_Q8_1:
204 |
case GGML_TYPE_Q8_K:
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
case GGML_TYPE_BF16:
215 |
case GGML_TYPE_Q4_0_4_4:
216 |
case GGML_TYPE_Q4_0_4_8:
217 |
case GGML_TYPE_Q4_0_8_8:
218 |
case GGML_TYPE_TQ1_0:
219 |
case GGML_TYPE_TQ2_0:
220 |
221 |
222 |
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
223 |
return false;
224 |
225 |
226 |
227 |
fout.write(reinterpret_cast<char *>(, cur_size);
228 |
total_size_new += cur_size;
229 |
230 |
printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
231 |
} else {
232 |
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
233 |
fout.write(reinterpret_cast<char *>(, data_u8.size());
234 |
total_size_new += data_u8.size();
235 |
236 |
237 |
total_size_org += nelements * sizeof(float);
238 |
239 |
240 |
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
241 |
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
242 |
243 |
return true;
244 |
@@ -0,0 +1,18 @@
1 |
#pragma once
2 |
3 |
#include "ggml.h"
4 |
5 |
#include <fstream>
6 |
#include <vector>
7 |
#include <string>
8 |
9 |
enum ggml_ftype ggml_parse_ftype(const char * str);
10 |
11 |
void ggml_print_ftypes(FILE * fp = stderr);
12 |
13 |
bool ggml_common_quantize_0(
14 |
std::ifstream & finp,
15 |
std::ofstream & fout,
16 |
const ggml_ftype ftype,
17 |
const std::vector<std::string> & to_quant,
18 |
const std::vector<std::string> & to_skip);
@@ -0,0 +1,911 @@
1 |
#define _USE_MATH_DEFINES // for M_PI
2 |
3 |
#include "common.h"
4 |
5 |
// third-party utilities
6 |
// use your favorite implementations
7 |
8 |
#include "dr_wav.h"
9 |
10 |
#include <cmath>
11 |
#include <cstring>
12 |
#include <fstream>
13 |
#include <regex>
14 |
#include <locale>
15 |
#include <codecvt>
16 |
#include <sstream>
17 |
18 |
#if defined(_MSC_VER)
19 |
#pragma warning(disable: 4244 4267) // possible loss of data
20 |
21 |
22 |
#ifdef _WIN32
23 |
#include <fcntl.h>
24 |
#include <io.h>
25 |
26 |
27 |
28 |
// implemented in ffmpeg_transcode.cpp, which is only embedded in the common lib when whisper is built with ffmpeg support
29 |
extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
30 |
31 |
32 |
// Function to check if the next argument exists
33 |
static std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
34 |
if (i + 1 < argc && argv[i + 1][0] != '-') {
35 |
return argv[++i];
36 |
} else {
37 |
fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
38 |
gpt_print_usage(argc, argv, params);
39 |
40 |
41 |
42 |
43 |
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
44 |
for (int i = 1; i < argc; i++) {
45 |
std::string arg = argv[i];
46 |
47 |
if (arg == "-s" || arg == "--seed") {
48 |
params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
49 |
} else if (arg == "-t" || arg == "--threads") {
50 |
params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
51 |
} else if (arg == "-p" || arg == "--prompt") {
52 |
params.prompt = get_next_arg(i, argc, argv, arg, params);
53 |
} else if (arg == "-n" || arg == "--n_predict") {
54 |
params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params));
55 |
} else if (arg == "-np" || arg == "--n_parallel") {
56 |
params.n_parallel = std::stoi(get_next_arg(i, argc, argv, arg, params));
57 |
} else if (arg == "--top_k") {
58 |
params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params));
59 |
} else if (arg == "--top_p") {
60 |
params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
61 |
} else if (arg == "--temp") {
62 |
params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
63 |
} else if (arg == "--repeat-last-n") {
64 |
params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
65 |
} else if (arg == "--repeat-penalty") {
66 |
params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params));
67 |
} else if (arg == "-b" || arg == "--batch_size") {
68 |
params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params));
69 |
} else if (arg == "-c" || arg == "--context") {
70 |
params.n_ctx= std::stoi(get_next_arg(i, argc, argv, arg, params));
71 |
} else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
72 |
params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
73 |
} else if (arg == "--ignore-eos") {
74 |
params.ignore_eos = true;
75 |
} else if (arg == "-m" || arg == "--model") {
76 |
params.model = get_next_arg(i, argc, argv, arg, params);
77 |
} else if (arg == "-i" || arg == "--interactive") {
78 |
params.interactive = true;
79 |
} else if (arg == "-ip" || arg == "--interactive-port") {
80 |
params.interactive = true;
81 |
params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params));
82 |
} else if (arg == "-h" || arg == "--help") {
83 |
gpt_print_usage(argc, argv, params);
84 |
85 |
} else if (arg == "-f" || arg == "--file") {
86 |
get_next_arg(i, argc, argv, arg, params);
87 |
std::ifstream file(argv[i]);
88 |
if (!file) {
89 |
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
90 |
91 |
92 |
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
93 |
if (params.prompt.back() == '\n') {
94 |
95 |
96 |
} else if (arg == "-tt" || arg == "--token_test") {
97 |
params.token_test = get_next_arg(i, argc, argv, arg, params);
98 |
99 |
else {
100 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
101 |
gpt_print_usage(argc, argv, params);
102 |
103 |
104 |
105 |
106 |
return true;
107 |
108 |
109 |
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
110 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
111 |
fprintf(stderr, "\n");
112 |
fprintf(stderr, "options:\n");
113 |
fprintf(stderr, " -h, --help show this help message and exit\n");
114 |
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
115 |
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
116 |
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
117 |
fprintf(stderr, " prompt to start generation with (default: random)\n");
118 |
fprintf(stderr, " -f FNAME, --file FNAME\n");
119 |
fprintf(stderr, " load prompt from a file\n");
120 |
fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
121 |
fprintf(stderr, " test tokenization\n");
122 |
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
123 |
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
124 |
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
125 |
fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
126 |
fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
127 |
fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
128 |
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
129 |
fprintf(stderr, " -c N, --context N context / KV cache size (default: %d)\n", params.n_ctx);
130 |
fprintf(stderr, " --ignore-eos ignore EOS token during generation\n");
131 |
fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
132 |
fprintf(stderr, " -m FNAME, --model FNAME\n");
133 |
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
134 |
fprintf(stderr, "\n");
135 |
136 |
137 |
std::string gpt_random_prompt(std::mt19937 & rng) {
138 |
const int r = rng() % 10;
139 |
switch (r) {
140 |
case 0: return "So";
141 |
case 1: return "Once upon a time";
142 |
case 2: return "When";
143 |
case 3: return "The";
144 |
case 4: return "After";
145 |
case 5: return "If";
146 |
case 6: return "import";
147 |
case 7: return "He";
148 |
case 8: return "She";
149 |
case 9: return "They";
150 |
151 |
152 |
return "The";
153 |
154 |
155 |
// Returns a copy of `s` with leading and trailing whitespace removed.
// "Whitespace" is whatever the regex class `\s` matches; whitespace in the
// interior of the string is left untouched.
//
//   s      - input string (not modified)
//   return - the trimmed copy; empty if `s` is empty or all whitespace
std::string trim(const std::string & s) {
    // Compiled once and reused: constructing a std::regex is far more expensive
    // than applying it, and the pattern never changes between calls.
    static const std::regex edge_whitespace("^\\s+|\\s+$");
    return std::regex_replace(s, edge_whitespace, "");
}
159 |
160 |
// Returns a copy of `s` in which every non-overlapping occurrence of `from`
// is replaced by `to`.
//
// The scan resumes just past each inserted `to`, so text introduced by a
// replacement is never matched again (replacing "a" with "aa" terminates).
//
//   s      - input string (not modified)
//   from   - substring to search for; if empty, `s` is returned unchanged
//   to     - replacement text (may be empty to delete occurrences)
//   return - the rewritten string
std::string replace(const std::string & s, const std::string & from, const std::string & to) {
    std::string result = s;

    // An empty pattern matches at every position without consuming any input,
    // so the loop below would keep inserting `to` forever. Treat it as
    // "nothing to replace".
    if (from.empty()) {
        return result;
    }

    size_t pos = 0;
    while ((pos = result.find(from, pos)) != std::string::npos) {
        result.replace(pos, from.length(), to);
        pos += to.length(); // skip over the text we just inserted
    }
    return result;
}
169 |
170 |
void gpt_vocab::add_special_token(const std::string & token) {
171 |
172 |
173 |
174 |
std::map<std::string, int32_t> json_parse(const std::string & fname) {
175 |
std::map<std::string, int32_t> result;
176 |
177 |
// read file into string
178 |
std::string json;
179 |
180 |
std::ifstream ifs(fname);
181 |
if (!ifs) {
182 |
fprintf(stderr, "Failed to open %s\n", fname.c_str());
183 |
184 |
185 |
186 |
json = std::string((std::istreambuf_iterator<char>(ifs)),
187 |
188 |
189 |
190 |
if (json[0] != '{') {
191 |
return result;
192 |
193 |
194 |
// parse json
195 |
196 |
bool has_key = false;
197 |
bool in_token = false;
198 |
199 |
std::string str_key = "";
200 |
std::string str_val = "";
201 |
202 |
int n = json.size();
203 |
for (int i = 1; i < n; ++i) {
204 |
if (!in_token) {
205 |
if (json[i] == ' ') continue;
206 |
if (json[i] == '"') {
207 |
in_token = true;
208 |
209 |
210 |
} else {
211 |
if (json[i] == '\\' && i+1 < n) {
212 |
if (has_key == false) {
213 |
str_key += json[i];
214 |
} else {
215 |
str_val += json[i];
216 |
217 |
218 |
} else if (json[i] == '"') {
219 |
if (has_key == false) {
220 |
has_key = true;
221 |
222 |
while (json[i] == ' ') ++i;
223 |
++i; // :
224 |
while (json[i] == ' ') ++i;
225 |
if (json[i] != '\"') {
226 |
while (json[i] != ',' && json[i] != '}') {
227 |
str_val += json[i++];
228 |
229 |
has_key = false;
230 |
} else {
231 |
in_token = true;
232 |
233 |
234 |
} else {
235 |
has_key = false;
236 |
237 |
238 |
str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
239 |
str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
240 |
str_key = ::replace(str_key, "\\\"", "\""); // \\\" -> "
241 |
242 |
try {
243 |
result[str_key] = std::stoi(str_val);
244 |
} catch (...) {
245 |
//fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
246 |
247 |
248 |
str_key = "";
249 |
str_val = "";
250 |
in_token = false;
251 |
252 |
253 |
if (has_key == false) {
254 |
str_key += json[i];
255 |
} else {
256 |
str_val += json[i];
257 |
258 |
259 |
260 |
261 |
262 |
return result;
263 |
264 |
265 |
std::string convert_to_utf8(const std::wstring & input) {
266 |
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
267 |
return converter.to_bytes(input);
268 |
269 |
270 |
271 |
std::wstring convert_to_wstring(const std::string & input) {
272 |
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
273 |
return converter.from_bytes(input);
274 |
275 |
276 |
void gpt_split_words(std::string str, std::vector<std::string>& words) {
277 |
const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
278 |
const std::regex re(pattern);
279 |
std::smatch m;
280 |
281 |
while (std::regex_search(str, m, re)) {
282 |
for (auto x : m) {
283 |
284 |
285 |
str = m.suffix();
286 |
287 |
288 |
289 |
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
290 |
std::vector<std::string> words;
291 |
292 |
// first split the text into words
293 |
294 |
std::string str = text;
295 |
296 |
// Generate the subpattern from the special_tokens vector if it's not empty
297 |
if (!vocab.special_tokens.empty()) {
298 |
const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
299 |
std::string special_tokens_subpattern;
300 |
for (const auto & token : vocab.special_tokens) {
301 |
if (!special_tokens_subpattern.empty()) {
302 |
special_tokens_subpattern += "|";
303 |
304 |
special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
305 |
306 |
307 |
std::regex re(special_tokens_subpattern);
308 |
std::smatch m;
309 |
// Split the text by special tokens.
310 |
while (std::regex_search(str, m, re)) {
311 |
// Split the substrings in-between special tokens into words.
312 |
gpt_split_words(m.prefix(), words);
313 |
// Add matched special tokens as words.
314 |
for (auto x : m) {
315 |
316 |
317 |
str = m.suffix();
318 |
319 |
// Remaining text without special tokens will be handled below.
320 |
321 |
322 |
gpt_split_words(str, words);
323 |
324 |
325 |
// find the longest token that forms each word in words:
326 |
std::vector<gpt_vocab::id> tokens;
327 |
for (const auto & word : words) {
328 |
for (int i = 0; i < (int) word.size(); ){
329 |
for (int j = word.size() - 1; j >= i; j--){
330 |
auto cand = word.substr(i, j-i+1);
331 |
auto it = vocab.token_to_id.find(cand);
332 |
if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
333 |
334 |
i = j + 1;
335 |
336 |
337 |
else if (j == i){ // word.substr(i, 1) has no matching
338 |
fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
339 |
340 |
341 |
342 |
343 |
344 |
345 |
return tokens;
346 |
347 |
348 |
static std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
349 |
std::vector<gpt_vocab::id> output;
350 |
std::stringstream ss(input);
351 |
std::string token;
352 |
353 |
while (std::getline(ss, token, delimiter)) {
354 |
355 |
356 |
357 |
return output;
358 |
359 |
360 |
static std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
361 |
if (fpath_test.empty()){
362 |
fprintf(stderr, "%s : No test file found.\n", __func__);
363 |
return std::map<std::string, std::vector<gpt_vocab::id>>();
364 |
365 |
366 |
std::map<std::string, std::vector<gpt_vocab::id>> tests;
367 |
368 |
auto fin = std::ifstream(fpath_test, std::ios_base::in);
369 |
const char * delimeter = " => ";
370 |
const char del_tok = ',';
371 |
std::string line;
372 |
while (std::getline(fin, line)) {
373 |
size_t delimiterPos = line.find(delimeter);
374 |
if (delimiterPos != std::string::npos) {
375 |
std::string text = line.substr(0, delimiterPos);
376 |
std::string s_tokens = line.substr(delimiterPos + std::strlen(delimeter));
377 |
tests[text] = parse_tokens_from_string(s_tokens, del_tok);
378 |
379 |
380 |
return tests;
381 |
382 |
383 |
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test){
384 |
std::map<std::string, std::vector<gpt_vocab::id>> tests = extract_tests_from_file(fpath_test);
385 |
386 |
size_t n_fails = 0;
387 |
388 |
for (const auto & test : tests) {
389 |
std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, test.first);
390 |
391 |
if (tokens != test.second){
392 |
393 |
394 |
// print out failure cases
395 |
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test.first.c_str());
396 |
fprintf(stderr, "%s : tokens in hf: ", __func__);
397 |
for (const auto & t : test.second) {
398 |
fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
399 |
400 |
fprintf(stderr, "\n");
401 |
fprintf(stderr, "%s : tokens in ggml: ", __func__);
402 |
for (const auto & t : tokens) {
403 |
fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
404 |
405 |
fprintf(stderr, "\n");
406 |
407 |
408 |
409 |
fprintf(stderr, "%s : %zu tests failed out of %zu tests.\n", __func__, n_fails, tests.size());
410 |
411 |
412 |
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
413 |
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
414 |
415 |
vocab.token_to_id = ::json_parse(fname);
416 |
417 |
for (const auto & kv : vocab.token_to_id) {
418 |
vocab.id_to_token[kv.second] = kv.first;
419 |
420 |
421 |
printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
422 |
423 |
// print the vocabulary
424 |
//for (auto kv : vocab.token_to_id) {
425 |
// printf("'%s' -> %d\n",, kv.second);
426 |
427 |
428 |
return true;
429 |
430 |
431 |
// Samples the next token id from `logits` using top-K followed by top-P
// (nucleus) filtering.
//
//   vocab  - vocabulary; only its size (id_to_token.size()) is used here
//   logits - one raw score per vocabulary entry (must hold that many values)
//   top_k  - keep only the K highest-scoring tokens; values <= 0 or larger
//            than the vocabulary size mean "keep all tokens"
//   top_p  - keep the smallest prefix of the sorted tokens whose cumulative
//            probability reaches top_p (values >= 1 disable this step)
//   temp   - temperature; each logit is multiplied by 1/temp. A value <= 0
//            selects the highest logit deterministically (greedy decoding)
//   rng    - random generator used to draw from the final distribution
//
// Precondition: the vocabulary is not empty.
gpt_vocab::id gpt_sample_top_k_top_p(
        const gpt_vocab & vocab,
        const float * logits,
        int    top_k,
        double top_p,
        double temp,
        std::mt19937 & rng) {
    const int n_logits = vocab.id_to_token.size();

    // A non-positive temperature would give an infinite or negative scale;
    // treat it as greedy decoding, like gpt_sample_top_k_top_p_repeat does.
    if (temp <= 0) {
        gpt_vocab::id best_id = 0;
        for (int i = 1; i < n_logits; ++i) {
            if (logits[i] > logits[best_id]) {
                best_id = i;
            }
        }
        return best_id;
    }

    // partial_sort needs begin() + top_k to stay inside the range
    if (top_k <= 0 || top_k > n_logits) {
        top_k = n_logits;
    }

    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const double scale = 1.0/temp;
        for (int i = 0; i < n_logits; ++i) {
            logits_id.push_back(std::make_pair(logits[i]*scale, i));
        }
    }

    // find the top K tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
        return a.first > b.first;
    });

    logits_id.resize(top_k);

    // subtract the maximum before exponentiating for numerical stability
    double maxl = -INFINITY;
    for (const auto & kv : logits_id) {
        maxl = std::max(maxl, kv.first);
    }

    // compute probs for the top K tokens
    std::vector<double> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        double p = exp(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    if (top_p < 1.0f) {
        double cumsum = 0.0f;
        for (int i = 0; i < top_k; i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                // keep only the nucleus
                top_k = i + 1;
                probs.resize(top_k);
                logits_id.resize(top_k);
                break;
            }
        }

        // renormalize the surviving probabilities
        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }

    //printf("\n");
    //for (int i = 0; i < (int) probs.size(); i++) {
    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
    //}
    //exit(0);

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;
}
510 |
511 |
// Samples the next token id like gpt_sample_top_k_top_p, additionally
// penalizing tokens that appear among the most recent tokens.
//
//   vocab                   - vocabulary; only its size is used here
//   logits                  - one raw score per vocabulary entry
//   last_n_tokens_data      - history of generated token ids (oldest first)
//   last_n_tokens_data_size - number of ids in that history
//   top_k                   - keep the K best tokens; <= 0 or too large means all
//   top_p                   - nucleus threshold (values >= 1 disable it)
//   temp                    - temperature; <= 0 selects the highest raw logit
//   repeat_last_n           - how many trailing history entries are penalized;
//                             clamped to the history size, <= 0 disables
//   repeat_penalty          - penalty factor applied to repeated tokens
//   rng                     - random generator for the final draw
//
// Precondition: the vocabulary is not empty.
gpt_vocab::id gpt_sample_top_k_top_p_repeat(
        const gpt_vocab & vocab,
        const float * logits,
        const int32_t * last_n_tokens_data,
        size_t last_n_tokens_data_size,
        int    top_k,
        double top_p,
        double temp,
        int repeat_last_n,
        float repeat_penalty,
        std::mt19937 & rng) {

    const int n_logits = vocab.id_to_token.size();

    const auto * plogits = logits;

    if (temp <= 0) {
        // select the token with the highest logit directly
        float max_logit = plogits[0];
        gpt_vocab::id max_id = 0;

        for (int i = 1; i < n_logits; ++i) {
            if (plogits[i] > max_logit) {
                max_logit = plogits[i];
                max_id = i;
            }
        }
        return max_id;
    }

    // partial_sort needs begin() + top_k to stay inside the range
    if (top_k <= 0 || top_k > n_logits) {
        top_k = n_logits;
    }

    // Membership table of the token ids found in the penalty window. The
    // window is clamped to the available history so it never starts before
    // the first element; ids outside [0, n_logits) cannot match any logit.
    std::vector<char> is_recent(n_logits > 0 ? n_logits : 0, 0);
    if (repeat_last_n > 0 && last_n_tokens_data != nullptr) {
        const size_t window = std::min(static_cast<size_t>(repeat_last_n), last_n_tokens_data_size);
        for (size_t j = last_n_tokens_data_size - window; j < last_n_tokens_data_size; ++j) {
            const int32_t tok = last_n_tokens_data[j];
            if (tok >= 0 && tok < n_logits) {
                is_recent[tok] = 1;
            }
        }
    }

    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const float scale = 1.0f/temp;
        for (int i = 0; i < n_logits; ++i) {
            // repetition penalty from the CTRL paper
            if (is_recent[i]) {
                // if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
                if (plogits[i] < 0.0f) {
                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
                } else {
                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
                }
            } else {
                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
            }
        }
    }

    // find the top K tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
        return a.first > b.first;
    });

    logits_id.resize(top_k);

    // subtract the maximum before exponentiating for numerical stability
    double maxl = -INFINITY;
    for (const auto & kv : logits_id) {
        maxl = std::max(maxl, kv.first);
    }

    // compute probs for the top K tokens
    std::vector<double> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        double p = exp(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    if (top_p < 1.0f) {
        double cumsum = 0.0f;
        for (int i = 0; i < top_k; i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                // keep only the nucleus
                top_k = i + 1;
                probs.resize(top_k);
                logits_id.resize(top_k);
                break;
            }
        }

        // renormalize the surviving probabilities
        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }

//    printf("\n");
//    for (int i = 0; i < (int) probs.size(); i++) {
//    for (int i = 0; i < 10; i++) {
//        printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
//    }

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;
}
625 |
626 |
627 |
// Returns true when `buf` holds a complete RIFF/WAVE container: it starts with
// the "RIFF" tag, carries the "WAVE" form type at offset 8, and the RIFF chunk
// size stored at offset 4 plus the 8-byte RIFF header equals the buffer length.
//
// The parameter is taken by value to stay in sync with the declaration in the
// header.
bool is_wav_buffer(const std::string buf) {
    // RIFF ref: the container starts with "RIFF" <u32 size> <form type>
    // WAV ref:  the form type of a WAV file is "WAVE"
    if (buf.size() < 12 || buf.compare(0, 4, "RIFF") != 0 || buf.compare(8, 4, "WAVE") != 0) {
        return false;
    }

    // RIFF sizes are little-endian; assemble the value byte by byte instead of
    // reading through a casted pointer (no aliasing or alignment assumptions,
    // and correct on big-endian hosts too).
    const unsigned char * p = reinterpret_cast<const unsigned char *>(buf.data()) + 4;
    const uint32_t chunk_size =
          static_cast<uint32_t>(p[0])
        | static_cast<uint32_t>(p[1]) << 8
        | static_cast<uint32_t>(p[2]) << 16
        | static_cast<uint32_t>(p[3]) << 24;

    // the chunk size does not count the "RIFF" tag and the size field itself
    return static_cast<size_t>(chunk_size) + 8 == buf.size();
}
641 |
642 |
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
643 |
drwav wav;
644 |
std::vector<uint8_t> wav_data; // used for pipe input from stdin or ffmpeg decoding output
645 |
646 |
if (fname == "-") {
647 |
648 |
#ifdef _WIN32
649 |
_setmode(_fileno(stdin), _O_BINARY);
650 |
651 |
652 |
uint8_t buf[1024];
653 |
while (true)
654 |
655 |
const size_t n = fread(buf, 1, sizeof(buf), stdin);
656 |
if (n == 0) {
657 |
658 |
659 |
wav_data.insert(wav_data.end(), buf, buf + n);
660 |
661 |
662 |
663 |
if (drwav_init_memory(&wav,, wav_data.size(), nullptr) == false) {
664 |
fprintf(stderr, "error: failed to open WAV file from stdin\n");
665 |
return false;
666 |
667 |
668 |
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
669 |
670 |
else if (is_wav_buffer(fname)) {
671 |
if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
672 |
fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
673 |
return false;
674 |
675 |
676 |
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
677 |
#if defined(WHISPER_FFMPEG)
678 |
if (ffmpeg_decode_audio(fname, wav_data) != 0) {
679 |
fprintf(stderr, "error: failed to ffmpeg decode '%s' \n", fname.c_str());
680 |
return false;
681 |
682 |
if (drwav_init_memory(&wav,, wav_data.size(), nullptr) == false) {
683 |
fprintf(stderr, "error: failed to read wav data as wav \n");
684 |
return false;
685 |
686 |
687 |
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
688 |
return false;
689 |
690 |
691 |
692 |
if (wav.channels != 1 && wav.channels != 2) {
693 |
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
694 |
695 |
return false;
696 |
697 |
698 |
if (stereo && wav.channels != 2) {
699 |
fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
700 |
701 |
return false;
702 |
703 |
704 |
if (wav.sampleRate != COMMON_SAMPLE_RATE) {
705 |
fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
706 |
707 |
return false;
708 |
709 |
710 |
if (wav.bitsPerSample != 16) {
711 |
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
712 |
713 |
return false;
714 |
715 |
716 |
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
717 |
718 |
std::vector<int16_t> pcm16;
719 |
720 |
drwav_read_pcm_frames_s16(&wav, n,;
721 |
722 |
723 |
// convert to mono, float
724 |
725 |
if (wav.channels == 1) {
726 |
for (uint64_t i = 0; i < n; i++) {
727 |
pcmf32[i] = float(pcm16[i])/32768.0f;
728 |
729 |
} else {
730 |
for (uint64_t i = 0; i < n; i++) {
731 |
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
732 |
733 |
734 |
735 |
if (stereo) {
736 |
// convert to stereo, float
737 |
738 |
739 |
740 |
741 |
for (uint64_t i = 0; i < n; i++) {
742 |
pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
743 |
pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
744 |
745 |
746 |
747 |
return true;
748 |
749 |
750 |
// Applies a first-order (RC) high-pass filter to `data` in place, attenuating
// content below `cutoff` Hz for a signal sampled at `sample_rate` Hz:
//
//     y[i] = alpha * (y[i-1] + x[i] - x[i-1]),   alpha = RC / (RC + dt)
//
// The first sample is left unchanged. Nothing is done when the buffer has
// fewer than two samples or when `cutoff` / `sample_rate` is not positive.
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    if (data.size() < 2 || !(cutoff > 0.0f) || !(sample_rate > 0.0f)) {
        return;
    }

    const double pi   = 3.14159265358979323846;
    const float rc    = 1.0f / (2.0f * pi * cutoff);
    const float dt    = 1.0f / sample_rate;
    const float alpha = rc / (rc + dt);

    // Keep the previous *input* sample separately: data[i - 1] is overwritten
    // with the filtered value before the next iteration reads it.
    float x_prev = data[0];
    float y      = data[0];

    for (size_t i = 1; i < data.size(); i++) {
        const float x = data[i];
        y = alpha * (y + x - x_prev);
        x_prev  = x;
        data[i] = y;
    }
}
762 |
763 |
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
764 |
const int n_samples = pcmf32.size();
765 |
const int n_samples_last = (sample_rate * last_ms) / 1000;
766 |
767 |
if (n_samples_last >= n_samples) {
768 |
// not enough samples - assume no speech
769 |
return false;
770 |
771 |
772 |
if (freq_thold > 0.0f) {
773 |
high_pass_filter(pcmf32, freq_thold, sample_rate);
774 |
775 |
776 |
float energy_all = 0.0f;
777 |
float energy_last = 0.0f;
778 |
779 |
for (int i = 0; i < n_samples; i++) {
780 |
energy_all += fabsf(pcmf32[i]);
781 |
782 |
if (i >= n_samples - n_samples_last) {
783 |
energy_last += fabsf(pcmf32[i]);
784 |
785 |
786 |
787 |
energy_all /= n_samples;
788 |
energy_last /= n_samples_last;
789 |
790 |
if (verbose) {
791 |
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
792 |
793 |
794 |
if (energy_last > vad_thold*energy_all) {
795 |
return false;
796 |
797 |
798 |
return true;
799 |
800 |
801 |
// Returns a similarity score in [0, 1] for two strings, computed as
// 1 - levenshtein(s0, s1) / max(|s0|, |s1|). Identical strings (including two
// empty strings) score 1; strings with nothing in common score 0.
float similarity(const std::string & s0, const std::string & s1) {
    // two empty strings are identical; this also avoids dividing by zero below
    if (s0.empty() && s1.empty()) {
        return 1.0f;
    }

    const size_t len0 = s0.size() + 1;
    const size_t len1 = s1.size() + 1;

    // two rolling rows of the edit-distance table
    std::vector<int> col(len1, 0);
    std::vector<int> prevCol(len1, 0);

    for (size_t i = 0; i < len1; i++) {
        prevCol[i] = i;
    }

    for (size_t i = 0; i < len0; i++) {
        col[0] = i;
        for (size_t j = 1; j < len1; j++) {
            // min of insertion, deletion and (mis)match
            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (i > 0 && s0[i - 1] == s1[j - 1] ? 0 : 1));
        }
        col.swap(prevCol);
    }

    const float dist = prevCol[len1 - 1];

    return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
824 |
825 |
bool sam_params_parse(int argc, char ** argv, sam_params & params) {
826 |
for (int i = 1; i < argc; i++) {
827 |
std::string arg = argv[i];
828 |
829 |
if (arg == "-s" || arg == "--seed") {
830 |
params.seed = std::stoi(argv[++i]);
831 |
} else if (arg == "-t" || arg == "--threads") {
832 |
params.n_threads = std::stoi(argv[++i]);
833 |
} else if (arg == "-m" || arg == "--model") {
834 |
params.model = argv[++i];
835 |
} else if (arg == "-i" || arg == "--inp") {
836 |
params.fname_inp = argv[++i];
837 |
} else if (arg == "-o" || arg == "--out") {
838 |
params.fname_out = argv[++i];
839 |
} else if (arg == "-h" || arg == "--help") {
840 |
sam_print_usage(argc, argv, params);
841 |
842 |
} else {
843 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
844 |
sam_print_usage(argc, argv, params);
845 |
846 |
847 |
848 |
849 |
return true;
850 |
851 |
852 |
void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
853 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
854 |
fprintf(stderr, "\n");
855 |
fprintf(stderr, "options:\n");
856 |
fprintf(stderr, " -h, --help show this help message and exit\n");
857 |
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
858 |
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
859 |
fprintf(stderr, " -m FNAME, --model FNAME\n");
860 |
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
861 |
fprintf(stderr, " -i FNAME, --inp FNAME\n");
862 |
fprintf(stderr, " input file (default: %s)\n", params.fname_inp.c_str());
863 |
fprintf(stderr, " -o FNAME, --out FNAME\n");
864 |
fprintf(stderr, " output file (default: %s)\n", params.fname_out.c_str());
865 |
fprintf(stderr, "\n");
866 |
867 |
868 |
// Formats a timestamp given in units of 10 ms as "HH:MM:SS.mmm"
// (or "HH:MM:SS,mmm" when `comma` is set).
//
//    500 -> 00:00:05.000
//   6000 -> 00:01:00.000
std::string to_timestamp(int64_t t, bool comma) {
    constexpr int64_t k_ms_per_sec  = 1000;
    constexpr int64_t k_ms_per_min  = 60 * k_ms_per_sec;
    constexpr int64_t k_ms_per_hour = 60 * k_ms_per_min;

    const int64_t total_ms = t * 10;

    const int64_t hours   =  total_ms / k_ms_per_hour;
    const int64_t minutes = (total_ms % k_ms_per_hour) / k_ms_per_min;
    const int64_t seconds = (total_ms % k_ms_per_min)  / k_ms_per_sec;
    const int64_t millis  =  total_ms % k_ms_per_sec;

    const char * separator = comma ? "," : ".";

    char text[32];
    snprintf(text, sizeof(text), "%02d:%02d:%02d%s%03d", (int) hours, (int) minutes, (int) seconds, separator, (int) millis);

    return std::string(text);
}
884 |
885 |
// Converts a timestamp `t` (units of 10 ms) to a sample index for audio
// sampled at `whisper_sample_rate` Hz, clamped to [0, n_samples - 1]
// (and to 0 when n_samples is not positive).
int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate) {
    // Clamp in 64 bits first: narrowing the raw product to int could wrap to a
    // negative value for large timestamps and wrongly clamp to 0.
    const int64_t sample = (t*whisper_sample_rate)/100;
    const int64_t last   = static_cast<int64_t>(n_samples) - 1;

    return static_cast<int>(std::max<int64_t>(0, std::min<int64_t>(last, sample)));
}
888 |
889 |
bool is_file_exist(const char *fileName)
890 |
891 |
std::ifstream infile(fileName);
892 |
return infile.good();
893 |
894 |
895 |
bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id)
896 |
897 |
std::ofstream speak_file(path.c_str());
898 |
if ( {
899 |
fprintf(stderr, "%s: failed to open speak_file\n", __func__);
900 |
return false;
901 |
} else {
902 |
speak_file.write(text.c_str(), text.size());
903 |
904 |
int ret = system((command + " " + std::to_string(voice_id) + " " + path).c_str());
905 |
if (ret != 0) {
906 |
fprintf(stderr, "%s: failed to speak\n", __func__);
907 |
return false;
908 |
909 |
910 |
return true;
911 |
@@ -0,0 +1,343 @@
1 |
// Various helper functions and utilities
2 |
3 |
#pragma once
4 |
5 |
#include <string>
6 |
#include <map>
7 |
#include <vector>
8 |
#include <random>
9 |
#include <thread>
10 |
#include <ctime>
11 |
#include <fstream>
12 |
#include <sstream>
13 |
14 |
#define COMMON_SAMPLE_RATE 16000
15 |
16 |
17 |
// GPT CLI argument parsing
18 |
19 |
20 |
struct gpt_params {
21 |
int32_t seed = -1; // RNG seed
22 |
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
23 |
int32_t n_predict = 200; // new tokens to predict
24 |
int32_t n_parallel = 1; // number of parallel streams
25 |
int32_t n_batch = 32; // batch size for prompt processing
26 |
int32_t n_ctx = 2048; // context size (this is the KV cache max size)
27 |
int32_t n_gpu_layers = 0; // number of layers to offload to the GPU
28 |
29 |
bool ignore_eos = false; // ignore EOS token when generating text
30 |
31 |
// sampling parameters
32 |
int32_t top_k = 40;
33 |
float top_p = 0.9f;
34 |
float temp = 0.9f;
35 |
int32_t repeat_last_n = 64;
36 |
float repeat_penalty = 1.00f;
37 |
38 |
std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
39 |
std::string prompt = "";
40 |
std::string token_test = "";
41 |
42 |
bool interactive = false;
43 |
int32_t interactive_port = -1;
44 |
45 |
46 |
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
47 |
48 |
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
49 |
50 |
std::string gpt_random_prompt(std::mt19937 & rng);
51 |
52 |
53 |
// Vocab utils
54 |
55 |
56 |
std::string trim(const std::string & s);
57 |
58 |
std::string replace(
59 |
const std::string & s,
60 |
const std::string & from,
61 |
const std::string & to);
62 |
63 |
struct gpt_vocab {
64 |
using id = int32_t;
65 |
using token = std::string;
66 |
67 |
std::map<token, id> token_to_id;
68 |
std::map<id, token> id_to_token;
69 |
std::vector<std::string> special_tokens;
70 |
71 |
void add_special_token(const std::string & token);
72 |
73 |
74 |
// poor-man's JSON parsing
75 |
std::map<std::string, int32_t> json_parse(const std::string & fname);
76 |
77 |
std::string convert_to_utf8(const std::wstring & input);
78 |
79 |
std::wstring convert_to_wstring(const std::string & input);
80 |
81 |
void gpt_split_words(std::string str, std::vector<std::string>& words);
82 |
83 |
// split text into tokens
84 |
85 |
// ref:
86 |
87 |
// Regex (Python):
88 |
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
89 |
90 |
// Regex (C++):
91 |
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
92 |
93 |
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
94 |
95 |
// test outputs of gpt_tokenize
96 |
97 |
// - compare with tokens generated by the huggingface tokenizer
98 |
// - test cases are chosen based on the model's main language (under 'prompt' directory)
99 |
// - if all sentences are tokenized identically, print 'All tests passed.'
100 |
// - otherwise, print sentence, huggingface tokens, ggml tokens
101 |
102 |
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test);
103 |
104 |
// load the tokens from encoder.json
105 |
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
106 |
107 |
// sample next token given probabilities for each embedding
108 |
109 |
// - consider only the top K tokens
110 |
// - from them, consider only the top tokens with cumulative probability > P
111 |
112 |
// TODO: not sure if this implementation is correct
113 |
// NOTE: temperature is applied by scaling every logit by 1/temp before the top-K selection
114 |
115 |
gpt_vocab::id gpt_sample_top_k_top_p(
116 |
const gpt_vocab & vocab,
117 |
const float * logits,
118 |
int top_k,
119 |
double top_p,
120 |
double temp,
121 |
std::mt19937 & rng);
122 |
123 |
gpt_vocab::id gpt_sample_top_k_top_p_repeat(
124 |
const gpt_vocab & vocab,
125 |
const float * logits,
126 |
const int32_t * last_n_tokens_data,
127 |
size_t last_n_tokens_data_size,
128 |
int top_k,
129 |
double top_p,
130 |
double temp,
131 |
int repeat_last_n,
132 |
float repeat_penalty,
133 |
std::mt19937 & rng);
134 |
135 |
136 |
// Audio utils
137 |
138 |
139 |
// Check if a buffer is a WAV audio file
140 |
bool is_wav_buffer(const std::string buf);
141 |
142 |
// Read WAV audio file and store the PCM data into pcmf32
143 |
// fname can be a buffer of WAV data instead of a filename
144 |
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
145 |
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
146 |
bool read_wav(
147 |
const std::string & fname,
148 |
std::vector<float> & pcmf32,
149 |
std::vector<std::vector<float>> & pcmf32s,
150 |
bool stereo);
151 |
152 |
// Write PCM data into WAV audio file
153 |
// Streams float PCM samples into a 16-bit PCM WAV file.
//
// Usage: open() once, write() any number of times, close(). The RIFF and data
// size fields in the header are refreshed after every write(), so the file is
// a valid WAV at any point. Header fields are written in host byte order.
class wav_writer {
private:
    std::ofstream file;
    uint32_t dataSize = 0;      // number of audio bytes written so far
    std::string wav_filename;   // name of the file currently (or last) opened

    // Writes the 44-byte canonical PCM header with zeroed size placeholders.
    // Returns false when the stream reports a failure.
    bool write_header(const uint32_t sample_rate,
                      const uint16_t bits_per_sample,
                      const uint16_t channels) {

        file.write("RIFF", 4);
        file.write("\0\0\0\0", 4);    // Placeholder for file size
        file.write("WAVE", 4);
        file.write("fmt ", 4);

        const uint32_t sub_chunk_size = 16;
        const uint16_t audio_format = 1;      // PCM format
        const uint32_t byte_rate = sample_rate * channels * bits_per_sample / 8;
        const uint16_t block_align = channels * bits_per_sample / 8;

        file.write(reinterpret_cast<const char *>(&sub_chunk_size), 4);
        file.write(reinterpret_cast<const char *>(&audio_format), 2);
        file.write(reinterpret_cast<const char *>(&channels), 2);
        file.write(reinterpret_cast<const char *>(&sample_rate), 4);
        file.write(reinterpret_cast<const char *>(&byte_rate), 4);
        file.write(reinterpret_cast<const char *>(&block_align), 2);
        file.write(reinterpret_cast<const char *>(&bits_per_sample), 2);
        file.write("data", 4);
        file.write("\0\0\0\0", 4);    // Placeholder for data size

        return file.good();
    }

    // Appends `length` samples and refreshes the size fields in the header.
    // It is assumed that PCM data is normalized to a range from -1 to 1;
    // values outside that range are clamped (NaN is written as silence) so the
    // conversion to int16_t cannot overflow.
    bool write_audio(const float * data, size_t length) {
        if (!file.is_open()) {
            return false;
        }

        for (size_t i = 0; i < length; ++i) {
            float sample = data[i];
            if (sample != sample) {
                sample = 0.0f;
            } else if (sample > 1.0f) {
                sample = 1.0f;
            } else if (sample < -1.0f) {
                sample = -1.0f;
            }

            const int16_t intSample = int16_t(sample * 32767);
            file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
            dataSize += sizeof(int16_t);
        }

        // patch the RIFF chunk size (offset 4) and the data chunk size (offset 40)
        file.seekp(4, std::ios::beg);
        uint32_t fileSize = 36 + dataSize;
        file.write(reinterpret_cast<char *>(&fileSize), 4);
        file.seekp(40, std::ios::beg);
        file.write(reinterpret_cast<char *>(&dataSize), 4);
        file.seekp(0, std::ios::end);

        return file.good();
    }

    // Makes `filename` the current output file, closing a different file that
    // may still be open. Returns whether a file is open afterwards.
    bool open_wav(const std::string & filename) {
        if (filename != wav_filename) {
            if (file.is_open()) {
                file.close();
            }
        }
        if (!file.is_open()) {
            file.open(filename, std::ios::binary);
            wav_filename = filename;
            dataSize = 0;
        }
        return file.is_open();
    }

public:
    // Opens `filename` for writing and emits the WAV header. Calling open()
    // again for the file that is already open is a no-op, so no second header
    // ends up inside the audio data.
    bool open(const std::string & filename,
              const    uint32_t   sample_rate,
              const    uint16_t   bits_per_sample,
              const    uint16_t   channels) {

        if (file.is_open() && filename == wav_filename) {
            return true;
        }

        if (!open_wav(filename)) {
            return false;
        }

        return write_header(sample_rate, bits_per_sample, channels);
    }

    // Closes the output file; always returns true.
    bool close() {
        file.close();
        return true;
    }

    // Appends `length` float samples; returns false when no file is open or
    // the stream reports a failure.
    bool write(const float * data, size_t length) {
        return write_audio(data, length);
    }

    ~wav_writer() {
        if (file.is_open()) {
            file.close();
        }
    }
};
248 |
249 |
250 |
// Apply a high-pass frequency filter to PCM audio
251 |
// Suppresses frequencies below cutoff Hz
252 |
void high_pass_filter(
253 |
std::vector<float> & data,
254 |
float cutoff,
255 |
float sample_rate);
256 |
257 |
// Basic voice activity detection (VAD) using audio energy adaptive threshold
258 |
bool vad_simple(
259 |
std::vector<float> & pcmf32,
260 |
int sample_rate,
261 |
int last_ms,
262 |
float vad_thold,
263 |
float freq_thold,
264 |
bool verbose);
265 |
266 |
// compute similarity between two strings using Levenshtein distance
267 |
float similarity(const std::string & s0, const std::string & s1);
268 |
269 |
270 |
// SAM argument parsing
271 |
272 |
273 |
struct sam_params {
274 |
int32_t seed = -1; // RNG seed
275 |
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
276 |
277 |
std::string model = "models/sam-vit-b/ggml-model-f16.bin"; // model path
278 |
std::string fname_inp = "img.jpg";
279 |
std::string fname_out = "img.out";
280 |
281 |
282 |
bool sam_params_parse(int argc, char ** argv, sam_params & params);
283 |
284 |
void sam_print_usage(int argc, char ** argv, const sam_params & params);
285 |
286 |
287 |
// Terminal utils
288 |
289 |
290 |
#define SQR(X) ((X) * (X))
// Maps an 8-bit channel value to its index (0..5) in the xterm colour cube.
// Fully parenthesized so it can be used inside any expression.
#define UNCUBE(x) ((x) < 48 ? 0 : (x) < 115 ? 1 : ((x) - 35) / 40)

/**
 * Quantizes 24-bit RGB to xterm256 code range [16,256).
 *
 * Picks whichever is closer to the input colour: the nearest entry of the
 * 6x6x6 colour cube (codes 16..231) or the nearest step of the 24-level
 * grayscale ramp (codes 232..255). Channel values outside [0,255] are clamped.
 */
static int rgb2xterm256(int r, int g, int b) {
    static const unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};

    // keep the cube index within 0..5 for out-of-range input
    r = r < 0 ? 0 : (r > 255 ? 255 : r);
    g = g < 0 ? 0 : (g > 255 ? 255 : g);
    b = b < 0 ? 0 : (b > 255 ? 255 : b);

    // perceived brightness, rounded to nearest
    const int av = r * .299 + g * .587 + b * .114 + .5;

    // nearest grayscale step and its intensity
    const int il = av > 238 ? 23 : (av - 3) / 10;
    const int ql = il * 10 + 8;

    // nearest colour-cube entry per channel and its intensity
    const int ir = UNCUBE(r);
    const int ig = UNCUBE(g);
    const int ib = UNCUBE(b);
    const int qr = cube[ir];
    const int qg = cube[ig];
    const int qb = cube[ib];

    if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
        SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
        return ir * 36 + ig * 6 + ib + 020;
    return il + 0350;
}
309 |
310 |
static std::string set_xterm256_foreground(int r, int g, int b) {
311 |
int x = rgb2xterm256(r, g, b);
312 |
std::ostringstream oss;
313 |
oss << "\033[38;5;" << x << "m";
314 |
return oss.str();
315 |
316 |
317 |
// Lowest is red, middle is yellow, highest is green. Color scheme from
318 |
// Paul Tol; it is colorblind friendly
319 |
const std::vector<std::string> k_colors = {
320 |
set_xterm256_foreground(220, 5, 12),
321 |
set_xterm256_foreground(232, 96, 28),
322 |
set_xterm256_foreground(241, 147, 45),
323 |
set_xterm256_foreground(246, 193, 65),
324 |
set_xterm256_foreground(247, 240, 86),
325 |
set_xterm256_foreground(144, 201, 135),
326 |
set_xterm256_foreground( 78, 178, 101),
327 |
328 |
329 |
330 |
// Other utils
331 |
332 |
333 |
// convert timestamp (in units of 10 ms) to string, 6000 -> 00:01:00.000
334 |
std::string to_timestamp(int64_t t, bool comma = false);
335 |
336 |
// given a timestamp get the sample
337 |
int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate);
338 |
339 |
// check if file exists using ifstream
340 |
bool is_file_exist(const char *fileName);
341 |
342 |
// write text to file, and call system("command voice_id file")
343 |
bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id);
The diff for this file is too large to render.
See raw diff
The diff for this file is too large to render.
See raw diff
@@ -0,0 +1,39 @@
1 |
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
2 |
#pragma once
3 |
4 |
5 |
#include "ggml-common.h"
6 |
7 |
#include "ggml.h"
8 |
9 |
// GGML internal header
10 |
11 |
#ifdef __cplusplus
12 |
extern "C" {
13 |
14 |
15 |
// Quantization
16 |
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
17 |
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
18 |
19 |
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
20 |
21 |
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
22 |
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
23 |
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
24 |
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
25 |
26 |
27 |
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
28 |
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
29 |
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
30 |
31 |
32 |
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
33 |
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
34 |
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
35 |
36 |
#ifdef __cplusplus
37 |
38 |
39 |
The diff for this file is too large to render.
See raw diff
@@ -0,0 +1,614 @@
1 |
#pragma once
2 |
3 |
// GGML CPU internal header
4 |
5 |
#include "ggml.h"
6 |
#include "ggml-impl.h"
7 |
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug:
8 |
//#include <stddef.h>
9 |
#include <stdbool.h>
10 |
#include <string.h> // memcpy
11 |
#include <math.h> // fabsf
12 |
13 |
14 |
#ifdef __cplusplus
15 |
extern "C" {
16 |
17 |
18 |
#if defined(_MSC_VER)
19 |
20 |
#define m512bh(p) p
21 |
#define m512i(p) p
22 |
23 |
24 |
25 |
#define m512bh(p) (__m512bh)(p)
26 |
#define m512i(p) (__m512i)(p)
27 |
28 |
29 |
30 |
31 |
* Converts brain16 to float32.
32 |
33 |
* The bfloat16 floating point format has the following structure:
34 |
35 |
* ┌sign
36 |
* │
37 |
* │ ┌exponent
38 |
* │ │
39 |
* │ │ ┌mantissa
40 |
* │ │ │
41 |
* │┌──┴───┐┌─┴───┐
42 |
* 0b0000000000000000 brain16
43 |
44 |
* Since bf16 has the same number of exponent bits as a 32bit float,
45 |
* encoding and decoding numbers becomes relatively straightforward.
46 |
47 |
* ┌sign
48 |
* │
49 |
* │ ┌exponent
50 |
* │ │
51 |
* │ │ ┌mantissa
52 |
* │ │ │
53 |
* │┌──┴───┐┌─┴───────────────────┐
54 |
* 0b00000000000000000000000000000000 IEEE binary32
55 |
56 |
* For comparison, the standard fp16 format has fewer exponent bits.
57 |
58 |
* ┌sign
59 |
* │
60 |
* │ ┌exponent
61 |
* │ │
62 |
* │ │ ┌mantissa
63 |
* │ │ │
64 |
* │┌─┴─┐┌─┴──────┐
65 |
* 0b0000000000000000 IEEE binary16
66 |
67 |
* @see IEEE 754-2008
68 |
69 |
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
70 |
union {
71 |
float f;
72 |
uint32_t i;
73 |
} u;
74 |
u.i = (uint32_t)h.bits << 16;
75 |
return u.f;
76 |
77 |
78 |
79 |
* Converts float32 to brain16.
80 |
81 |
* This is binary identical with Google Brain float conversion.
82 |
* Floats shall round to nearest even, and NANs shall be quiet.
83 |
* Subnormals aren't flushed to zero, except perhaps when used.
84 |
* This code should vectorize nicely if using modern compilers.
85 |
86 |
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
87 |
ggml_bf16_t h;
88 |
union {
89 |
float f;
90 |
uint32_t i;
91 |
} u;
92 |
u.f = s;
93 |
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
94 |
h.bits = (u.i >> 16) | 64; /* force to quiet */
95 |
return h;
96 |
97 |
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
98 |
return h;
99 |
100 |
101 |
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
102 |
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
103 |
104 |
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
105 |
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
106 |
#ifndef __FMA__
107 |
#define __FMA__
108 |
109 |
#ifndef __F16C__
110 |
#define __F16C__
111 |
112 |
113 |
114 |
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
115 |
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
116 |
#ifndef __SSE3__
117 |
#define __SSE3__
118 |
119 |
#ifndef __SSSE3__
120 |
#define __SSSE3__
121 |
122 |
123 |
124 |
#if defined(__ARM_FEATURE_SVE)
125 |
#include <arm_sve.h>
126 |
#include <sys/prctl.h>
127 |
128 |
129 |
// 16-bit float
130 |
// on Arm, we use __fp16
131 |
// on x86, we use uint16_t
132 |
#if defined(__ARM_NEON)
133 |
134 |
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
135 |
136 |
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
137 |
138 |
#include <arm_neon.h>
139 |
140 |
#ifdef _MSC_VER
141 |
142 |
typedef uint16_t ggml_fp16_internal_t;
143 |
144 |
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
145 |
146 |
147 |
148 |
typedef __fp16 ggml_fp16_internal_t;
149 |
150 |
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
151 |
152 |
#endif // _MSC_VER
153 |
154 |
#if !defined(__aarch64__)
155 |
156 |
// 32-bit ARM compatibility
157 |
158 |
// vaddlvq_s16
159 |
// vpaddq_s16
160 |
// vpaddq_s32
161 |
// vaddvq_s32
162 |
// vaddvq_f32
163 |
// vmaxvq_f32
164 |
// vcvtnq_s32_f32
165 |
// vzip1_u8
166 |
// vzip2_u8
167 |
168 |
inline static int32_t vaddlvq_s16(int16x8_t v) {
169 |
int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
170 |
return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
171 |
172 |
173 |
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
174 |
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
175 |
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
176 |
return vcombine_s16(a0, b0);
177 |
178 |
179 |
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
180 |
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
181 |
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
182 |
return vcombine_s32(a0, b0);
183 |
184 |
185 |
inline static int32_t vaddvq_s32(int32x4_t v) {
186 |
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
187 |
188 |
189 |
inline static float vaddvq_f32(float32x4_t v) {
190 |
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
191 |
192 |
193 |
inline static float vmaxvq_f32(float32x4_t v) {
194 |
195 |
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
196 |
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
197 |
198 |
199 |
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
200 |
int32x4_t res;
201 |
202 |
res[0] = roundf(vgetq_lane_f32(v, 0));
203 |
res[1] = roundf(vgetq_lane_f32(v, 1));
204 |
res[2] = roundf(vgetq_lane_f32(v, 2));
205 |
res[3] = roundf(vgetq_lane_f32(v, 3));
206 |
207 |
return res;
208 |
209 |
210 |
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
211 |
uint8x8_t res;
212 |
213 |
res[0] = a[0]; res[1] = b[0];
214 |
res[2] = a[1]; res[3] = b[1];
215 |
res[4] = a[2]; res[5] = b[2];
216 |
res[6] = a[3]; res[7] = b[3];
217 |
218 |
return res;
219 |
220 |
221 |
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
222 |
uint8x8_t res;
223 |
224 |
res[0] = a[4]; res[1] = b[4];
225 |
res[2] = a[5]; res[3] = b[5];
226 |
res[4] = a[6]; res[5] = b[6];
227 |
res[6] = a[7]; res[7] = b[7];
228 |
229 |
return res;
230 |
231 |
232 |
// vld1q_s16_x2
233 |
// vld1q_u8_x2
234 |
// vld1q_u8_x4
235 |
// vld1q_s8_x2
236 |
// vld1q_s8_x4
237 |
// TODO: double-check these work correctly
238 |
239 |
typedef struct ggml_int16x8x2_t {
240 |
int16x8_t val[2];
241 |
} ggml_int16x8x2_t;
242 |
243 |
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
244 |
ggml_int16x8x2_t res;
245 |
246 |
res.val[0] = vld1q_s16(ptr + 0);
247 |
res.val[1] = vld1q_s16(ptr + 8);
248 |
249 |
return res;
250 |
251 |
252 |
typedef struct ggml_uint8x16x2_t {
253 |
uint8x16_t val[2];
254 |
} ggml_uint8x16x2_t;
255 |
256 |
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
257 |
ggml_uint8x16x2_t res;
258 |
259 |
res.val[0] = vld1q_u8(ptr + 0);
260 |
res.val[1] = vld1q_u8(ptr + 16);
261 |
262 |
return res;
263 |
264 |
265 |
typedef struct ggml_uint8x16x4_t {
266 |
uint8x16_t val[4];
267 |
} ggml_uint8x16x4_t;
268 |
269 |
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
270 |
ggml_uint8x16x4_t res;
271 |
272 |
res.val[0] = vld1q_u8(ptr + 0);
273 |
res.val[1] = vld1q_u8(ptr + 16);
274 |
res.val[2] = vld1q_u8(ptr + 32);
275 |
res.val[3] = vld1q_u8(ptr + 48);
276 |
277 |
return res;
278 |
279 |
280 |
typedef struct ggml_int8x16x2_t {
281 |
int8x16_t val[2];
282 |
} ggml_int8x16x2_t;
283 |
284 |
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
285 |
ggml_int8x16x2_t res;
286 |
287 |
res.val[0] = vld1q_s8(ptr + 0);
288 |
res.val[1] = vld1q_s8(ptr + 16);
289 |
290 |
return res;
291 |
292 |
293 |
typedef struct ggml_int8x16x4_t {
294 |
int8x16_t val[4];
295 |
} ggml_int8x16x4_t;
296 |
297 |
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
298 |
ggml_int8x16x4_t res;
299 |
300 |
res.val[0] = vld1q_s8(ptr + 0);
301 |
res.val[1] = vld1q_s8(ptr + 16);
302 |
res.val[2] = vld1q_s8(ptr + 32);
303 |
res.val[3] = vld1q_s8(ptr + 48);
304 |
305 |
return res;
306 |
307 |
308 |
// NOTE: not tested
309 |
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
310 |
int8x16_t res;
311 |
312 |
res[ 0] = a[b[ 0]];
313 |
res[ 1] = a[b[ 1]];
314 |
res[ 2] = a[b[ 2]];
315 |
res[ 3] = a[b[ 3]];
316 |
res[ 4] = a[b[ 4]];
317 |
res[ 5] = a[b[ 5]];
318 |
res[ 6] = a[b[ 6]];
319 |
res[ 7] = a[b[ 7]];
320 |
res[ 8] = a[b[ 8]];
321 |
res[ 9] = a[b[ 9]];
322 |
res[10] = a[b[10]];
323 |
res[11] = a[b[11]];
324 |
res[12] = a[b[12]];
325 |
res[13] = a[b[13]];
326 |
res[14] = a[b[14]];
327 |
res[15] = a[b[15]];
328 |
329 |
return res;
330 |
331 |
332 |
// NOTE: not tested
333 |
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
334 |
uint8x16_t res;
335 |
336 |
res[ 0] = a[b[ 0]];
337 |
res[ 1] = a[b[ 1]];
338 |
res[ 2] = a[b[ 2]];
339 |
res[ 3] = a[b[ 3]];
340 |
res[ 4] = a[b[ 4]];
341 |
res[ 5] = a[b[ 5]];
342 |
res[ 6] = a[b[ 6]];
343 |
res[ 7] = a[b[ 7]];
344 |
res[ 8] = a[b[ 8]];
345 |
res[ 9] = a[b[ 9]];
346 |
res[10] = a[b[10]];
347 |
res[11] = a[b[11]];
348 |
res[12] = a[b[12]];
349 |
res[13] = a[b[13]];
350 |
res[14] = a[b[14]];
351 |
res[15] = a[b[15]];
352 |
353 |
return res;
354 |
355 |
356 |
357 |
358 |
#define ggml_int16x8x2_t int16x8x2_t
359 |
#define ggml_uint8x16x2_t uint8x16x2_t
360 |
#define ggml_uint8x16x4_t uint8x16x4_t
361 |
#define ggml_int8x16x2_t int8x16x2_t
362 |
#define ggml_int8x16x4_t int8x16x4_t
363 |
364 |
#define ggml_vld1q_s16_x2 vld1q_s16_x2
365 |
#define ggml_vld1q_u8_x2 vld1q_u8_x2
366 |
#define ggml_vld1q_u8_x4 vld1q_u8_x4
367 |
#define ggml_vld1q_s8_x2 vld1q_s8_x2
368 |
#define ggml_vld1q_s8_x4 vld1q_s8_x4
369 |
#define ggml_vqtbl1q_s8 vqtbl1q_s8
370 |
#define ggml_vqtbl1q_u8 vqtbl1q_u8
371 |
372 |
#endif // !defined(__aarch64__)
373 |
374 |
#if !defined(__ARM_FEATURE_DOTPROD)
375 |
376 |
// Fallback used when __ARM_FEATURE_DOTPROD is not available: accumulates the
// int8 products of `a` and `b` into `acc`.
// The low and high halves are multiplied with widening to int16, pairwise-added
// into int32 lanes, and then added to `acc`.
// NOTE(review): the products land in different lanes than with the hardware
// vdotq_s32 (which sums four consecutive bytes per lane); only the total over
// all four lanes agrees - confirm callers always reduce across lanes.
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));

    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
}
382 |
383 |
384 |
385 |
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
386 |
387 |
#endif // !defined(__ARM_FEATURE_DOTPROD)
388 |
389 |
#endif // defined(__ARM_NEON)
390 |
391 |
#if defined(__ARM_NEON) && !defined(_MSC_VER)
392 |
393 |
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
394 |
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
395 |
396 |
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
397 |
398 |
// NEON path: converts a half-precision value to float.
// The bits of `h` are copied into the compiler's native half type
// (ggml_fp16_internal_t) and widened with an ordinary cast.
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    ggml_fp16_internal_t tmp;
    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
    return (float)tmp;
}
403 |
404 |
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
405 |
ggml_fp16_t res;
406 |
ggml_fp16_internal_t tmp = f;
407 |
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
408 |
return res;
409 |
410 |
411 |
412 |
413 |
#ifdef __wasm_simd128__
414 |
#include <wasm_simd128.h>
415 |
416 |
#ifdef __POWER9_VECTOR__
417 |
#include <altivec.h>
418 |
#undef bool
419 |
#define bool _Bool
420 |
421 |
#if defined(_MSC_VER) || defined(__MINGW32__)
422 |
#include <intrin.h>
423 |
424 |
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
425 |
#if !defined(__riscv)
426 |
#include <immintrin.h>
427 |
428 |
429 |
430 |
431 |
432 |
433 |
#ifdef __riscv_v_intrinsic
434 |
#include <riscv_vector.h>
435 |
436 |
437 |
#if defined(__loongarch64)
438 |
#if defined(__loongarch_asx)
439 |
#include <lasxintrin.h>
440 |
441 |
#if defined(__loongarch_sx)
442 |
#include <lsxintrin.h>
443 |
444 |
445 |
446 |
#if defined(__loongarch_asx)
447 |
448 |
// Overlays a float and an int32_t so the bit pattern of one can be read as the other.
typedef union {
    int32_t i;
    float f;
} ft_union;
452 |
453 |
/* float type data load instructions */
454 |
static __m128 __lsx_vreplfr2vr_s(float val) {
455 |
ft_union fi_tmpval = {.f = val};
456 |
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
457 |
458 |
459 |
static __m256 __lasx_xvreplfr2vr_s(float val) {
460 |
ft_union fi_tmpval = {.f = val};
461 |
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
462 |
463 |
464 |
465 |
#ifdef __F16C__
466 |
467 |
#ifdef _MSC_VER
468 |
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
469 |
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
470 |
471 |
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
472 |
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
473 |
474 |
475 |
#elif defined(__POWER9_VECTOR__)
476 |
477 |
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
478 |
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
479 |
/* the inline asm below is about 12% faster than the lookup method */
480 |
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
481 |
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
482 |
483 |
// POWER9 path: converts a half-precision value to float with inline assembly.
// `h` is moved into a floating-point register (mtfprd), converted from half
// precision (xscvhpdp) and rounded to single precision (frsp).
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    register float f;
    register double d;
    __asm__(
        "mtfprd %0,%2\n"
        "xscvhpdp %0,%0\n"
        "frsp %1,%0\n" :
        /* temp */ "=d"(d),
        /* out */  "=f"(f):
        /* in */   "r"(h));
    return f;
}
495 |
496 |
// POWER9 path: converts a float to half precision with inline assembly.
// The value is converted in a floating-point register (xscvdphp) and the
// result is moved to a general-purpose register (mffprd).
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
    register double d;
    register ggml_fp16_t r;
    __asm__( /* xscvdphp can work on double or single precision */
        "xscvdphp %0,%2\n"
        "mffprd %1,%0\n" :
        /* temp */ "=d"(d),
        /* out */  "=r"(r):
        /* in */   "f"(f));
    return r;
}
507 |
508 |
509 |
510 |
// FP16 <-> FP32
511 |
// ref:
512 |
513 |
// Reinterprets the 32-bit pattern `w` as a float.
// memcpy is used for the reinterpretation so the helper is well-defined both
// in C and when this header is included from C++.
static inline float fp32_from_bits(uint32_t w) {
    float value;
    memcpy(&value, &w, sizeof(value));
    return value;
}
521 |
522 |
// Returns the 32-bit pattern of the float `f`.
// memcpy is used for the reinterpretation so the helper is well-defined both
// in C and when this header is included from C++.
static inline uint32_t fp32_to_bits(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));
    return bits;
}
530 |
531 |
// Portable software conversion from half precision to float.
// Two candidate results are computed with integer/float bit tricks - one valid
// when the half exponent field is non-zero, one valid when it is zero - and the
// right one is selected at the end, so the function has no data-dependent
// branches besides that final select.
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    // half bits placed in the upper 16 bits of a 32-bit word
    const uint32_t w = (uint32_t) h << 16;
    const uint32_t sign = w & UINT32_C(0x80000000);
    // doubling shifts the sign bit out of the word
    const uint32_t two_w = w + w;

    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float exp_scale = 0x1.0p-112f;
#else
    // same value as 0x1.0p-112f, spelled as a bit pattern for compilers
    // without hexadecimal float literals
    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;

    const uint32_t magic_mask = UINT32_C(126) << 23;
    const float magic_bias = 0.5f;
    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;

    // two_w < 2^27 means the half exponent field is zero (zero or subnormal input)
    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
    const uint32_t result = sign |
        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
    return fp32_from_bits(result);
}
553 |
554 |
// Portable software conversion from float to half precision.
// The magnitude is rescaled and added to an exponent-dependent bias, and the
// half exponent and mantissa fields are then extracted from the bits of that
// sum. NaN inputs produce 0x7E00 with the input's sign bit.
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float scale_to_inf = 0x1.0p+112f;
    const float scale_to_zero = 0x1.0p-110f;
#else
    // same constants spelled as bit patterns for compilers without
    // hexadecimal float literals
    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;

    const uint32_t w = fp32_to_bits(f);
    // doubling shifts the sign bit out of the word
    const uint32_t shl1_w = w + w;
    const uint32_t sign = w & UINT32_C(0x80000000);
    // exponent field of the input, clamped from below
    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
    if (bias < UINT32_C(0x71000000)) {
        bias = UINT32_C(0x71000000);
    }

    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
    const uint32_t bits = fp32_to_bits(base);
    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
    const uint32_t nonsign = exp_bits + mantissa_bits;
    // shl1_w > 0xFF000000: exponent all ones with a non-zero mantissa, i.e. NaN
    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
579 |
580 |
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
581 |
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
582 |
583 |
#endif // __F16C__
584 |
585 |
#endif // defined(__ARM_NEON) && !defined(_MSC_VER)
586 |
587 |
588 |
#include <arm_sve.h>
589 |
#endif // __ARM_FEATURE_SVE
590 |
591 |
// precomputed f32 table for f16 (256 KB)
592 |
// defined in ggml.c, initialized in ggml_init()
593 |
extern float ggml_table_f32_f16[1 << 16];
594 |
595 |
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
596 |
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
597 |
// This is also true for POWER9.
598 |
#if !defined(GGML_FP16_TO_FP32)
599 |
// Converts a half-precision value to float by table lookup: the 16 bits of `f`
// are used as an index into ggml_table_f32_f16.
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
    return ggml_table_f32_f16[s];
}
604 |
605 |
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
606 |
607 |
608 |
#if !defined(GGML_FP32_TO_FP16)
609 |
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
610 |
611 |
612 |
#ifdef __cplusplus
613 |
614 |
@@ -0,0 +1,209 @@
1 |
#pragma once
2 |
3 |
// GGML internal header
4 |
5 |
#include "ggml.h"
6 |
7 |
#include <assert.h>
8 |
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug:
9 |
#include <stdbool.h>
10 |
#include <stdint.h>
11 |
12 |
#ifdef __cplusplus
13 |
extern "C" {
14 |
15 |
16 |
#undef MIN
17 |
#undef MAX
18 |
19 |
#define MIN(a, b) ((a) < (b) ? (a) : (b))
20 |
#define MAX(a, b) ((a) > (b) ? (a) : (b))
21 |
22 |
// required for mmap as gguf only guarantees 32-byte alignment
23 |
24 |
25 |
// static_assert should be a #define, but if it's not,
26 |
// fall back to the _Static_assert C11 keyword.
27 |
// if C99 - static_assert is noop
28 |
// ref:
29 |
#ifndef __cplusplus
30 |
#ifndef static_assert
31 |
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
32 |
#define static_assert(cond, msg) _Static_assert(cond, msg)
33 |
34 |
#define static_assert(cond, msg) struct global_scope_noop_trick
35 |
36 |
37 |
38 |
39 |
40 |
// logging
41 |
42 |
43 |
44 |
void ggml_log_internal (enum ggml_log_level level, const char * format, ...);
45 |
void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
46 |
47 |
#define GGML_LOG(...) ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
48 |
#define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
49 |
#define GGML_LOG_WARN(...) ggml_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
50 |
#define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
51 |
#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
52 |
#define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
53 |
54 |
// bitset
55 |
56 |
typedef uint32_t ggml_bitset_t;
57 |
58 |
static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
59 |
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
60 |
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
61 |
62 |
static size_t ggml_bitset_size(size_t n) {
63 |
return (n + BITSET_MASK) >> BITSET_SHR;
64 |
65 |
66 |
static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) {
67 |
return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK)));
68 |
69 |
70 |
static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) {
71 |
bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK));
72 |
73 |
74 |
static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
75 |
bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK));
76 |
77 |
78 |
// hash set
79 |
80 |
#define GGML_HASHSET_FULL ((size_t)-1)
81 |
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
82 |
83 |
struct ggml_hash_set {
84 |
size_t size;
85 |
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
86 |
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
87 |
88 |
89 |
struct ggml_hash_set ggml_hash_set_new(size_t size);
90 |
void ggml_hash_set_free(struct ggml_hash_set * hash_set);
91 |
92 |
// returns the minimum size for a hash set that can hold min_sz elements
93 |
size_t ggml_hash_size(size_t min_sz);
94 |
95 |
// remove all elements from the hash set
96 |
void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
97 |
98 |
// returns true if key is in the hash set
99 |
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
100 |
101 |
// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
102 |
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
103 |
104 |
// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
105 |
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
106 |
107 |
// return index, asserts if table is full
108 |
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
109 |
110 |
// hash function for ggml_tensor
111 |
// Hashes a tensor by its address: the pointer value shifted right by 4 bits.
static inline size_t ggml_hash(const struct ggml_tensor * p) {
    // the last 4 bits are always zero due to alignment
    return (size_t)(uintptr_t)p >> 4;
}
115 |
116 |
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
117 |
size_t h = ggml_hash(key) % hash_set->size;
118 |
119 |
// linear probing
120 |
size_t i = h;
121 |
while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) {
122 |
i = (i + 1) % hash_set->size;
123 |
if (i == h) {
124 |
// visited all hash table entries -> not found
125 |
126 |
127 |
128 |
return i;
129 |
130 |
131 |
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
132 |
size_t i = ggml_hash_find(hash_set, key);
133 |
return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i);
134 |
135 |
136 |
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
137 |
size_t h = ggml_hash(key) % hash_set->size;
138 |
139 |
// linear probing
140 |
size_t i = h;
141 |
do {
142 |
if (!ggml_bitset_get(hash_set->used, i)) {
143 |
ggml_bitset_set(hash_set->used, i);
144 |
hash_set->keys[i] = key;
145 |
return i;
146 |
147 |
if (hash_set->keys[i] == key) {
148 |
149 |
150 |
i = (i + 1) % hash_set->size;
151 |
} while (i != h);
152 |
153 |
// visited all hash table entries -> not found
154 |
GGML_ABORT("fatal error");
155 |
156 |
157 |
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
158 |
size_t h = ggml_hash(key) % hash_set->size;
159 |
160 |
// linear probing
161 |
size_t i = h;
162 |
do {
163 |
if (!ggml_bitset_get(hash_set->used, i)) {
164 |
ggml_bitset_set(hash_set->used, i);
165 |
hash_set->keys[i] = key;
166 |
return i;
167 |
168 |
if (hash_set->keys[i] == key) {
169 |
return i;
170 |
171 |
i = (i + 1) % hash_set->size;
172 |
} while (i != h);
173 |
174 |
// visited all hash table entries -> not found
175 |
GGML_ABORT("fatal error");
176 |
177 |
178 |
// computation graph
179 |
180 |
enum ggml_cgraph_eval_order {
181 |
182 |
183 |
184 |
185 |
186 |
struct ggml_cgraph {
187 |
int size;
188 |
int n_nodes;
189 |
int n_leafs;
190 |
191 |
struct ggml_tensor ** nodes;
192 |
struct ggml_tensor ** grads;
193 |
struct ggml_tensor ** leafs;
194 |
195 |
struct ggml_hash_set visited_hash_set;
196 |
197 |
enum ggml_cgraph_eval_order order;
198 |
199 |
200 |
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
201 |
202 |
// Memory allocation
203 |
204 |
void * ggml_aligned_malloc(size_t size);
205 |
void ggml_aligned_free(void * ptr, size_t size);
206 |
207 |
#ifdef __cplusplus
208 |
209 |
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:44b54a6ab261de692b791d6492940de6e606182158e60d59a630c26a38e3ccf8
3 |
size 1552422809
The diff for this file is too large to render.
See raw diff
@@ -0,0 +1,147 @@
1 |
#pragma once
2 |
3 |
4 |
#include "ggml-common.h"
5 |
6 |
#include "ggml.h"
7 |
8 |
// GGML internal header
9 |
10 |
#ifdef __cplusplus
11 |
extern "C" {
12 |
13 |
14 |
// Quantization
15 |
void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
16 |
void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
17 |
void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
18 |
void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
19 |
void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
20 |
void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
21 |
22 |
void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
23 |
void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
24 |
void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
25 |
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
26 |
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
27 |
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
28 |
29 |
void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
30 |
void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
31 |
32 |
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
33 |
void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
34 |
void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
35 |
void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
36 |
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
37 |
38 |
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
39 |
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
40 |
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
41 |
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
42 |
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
43 |
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
44 |
45 |
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
46 |
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
47 |
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
48 |
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
49 |
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
50 |
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
51 |
52 |
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
53 |
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
54 |
55 |
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
56 |
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
57 |
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
58 |
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
59 |
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
60 |
61 |
// Dequantization
62 |
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
63 |
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
64 |
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
65 |
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
66 |
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
67 |
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
68 |
69 |
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
70 |
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
71 |
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
72 |
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
73 |
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
74 |
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
75 |
76 |
void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
77 |
void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
78 |
79 |
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
80 |
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
81 |
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
82 |
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
83 |
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
84 |
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
85 |
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
86 |
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
87 |
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
88 |
89 |
// Dot product
90 |
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
91 |
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
92 |
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
93 |
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
94 |
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
95 |
96 |
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
97 |
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
98 |
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
99 |
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
100 |
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
101 |
102 |
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
103 |
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
104 |
105 |
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
106 |
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
107 |
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
108 |
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
109 |
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
110 |
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
111 |
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
112 |
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
113 |
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
114 |
115 |
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
116 |
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
117 |
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
118 |
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
119 |
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
120 |
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
121 |
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
122 |
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
123 |
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
124 |
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
125 |
126 |
size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
127 |
size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
128 |
129 |
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
130 |
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
131 |
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
132 |
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
133 |
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
134 |
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
135 |
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
136 |
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
137 |
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
138 |
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
139 |
140 |
void iq2xs_init_impl(enum ggml_type type);
141 |
void iq2xs_free_impl(enum ggml_type type);
142 |
void iq3xs_init_impl(int grid_size);
143 |
void iq3xs_free_impl(int grid_size);
144 |
145 |
#ifdef __cplusplus
146 |
147 |
The diff for this file is too large to render.
See raw diff
The diff for this file is too large to render.
See raw diff
@@ -0,0 +1,841 @@
1 |
//#include "ggml.h"
2 |
3 |
//#include "common.h"
4 |
//#include "common-ggml.h"
5 |
6 |
//#include <cassert>
7 |
//#include <cmath>
8 |
//#include <cstdio>
9 |
//#include <cstring>
10 |
//#include <fstream>
11 |
//#include <map>
12 |
//#include <string>
13 |
//#include <vector>
14 |
15 |
//#if defined(_MSC_VER)
16 |
//#pragma warning(disable: 4244 4267) // possible loss of data
17 |
18 |
19 |
//// default hparams (GPT-2 345M "medium": n_embd = 1024, n_head = 16, n_layer = 24)
20 |
//struct gpt2_hparams {
21 |
// int32_t n_vocab = 50257; // Vocabulary size remains the same
22 |
// int32_t n_ctx = 1024; // Maximum context length (sequence length)
23 |
// int32_t n_embd = 1024; // Embedding dimensionality
24 |
// int32_t n_head = 16; // Number of attention heads
25 |
// int32_t n_layer = 24; // Number of transformer layers
26 |
// int32_t ftype = 1; // Set to 1 for FP16 precision (optional)
27 |
// float eps = 1e-5f; // Small constant for numerical stability
28 |
29 |
30 |
//struct gpt2_layer {
31 |
// // normalization
32 |
// struct ggml_tensor * ln_1_g;
33 |
// struct ggml_tensor * ln_1_b;
34 |
35 |
// struct ggml_tensor * ln_2_g;
36 |
// struct ggml_tensor * ln_2_b;
37 |
38 |
// // attention
39 |
// struct ggml_tensor * c_attn_attn_w;
40 |
// struct ggml_tensor * c_attn_attn_b;
41 |
42 |
// struct ggml_tensor * c_attn_proj_w;
43 |
// struct ggml_tensor * c_attn_proj_b;
44 |
45 |
// // mlp
46 |
// struct ggml_tensor * c_mlp_fc_w;
47 |
// struct ggml_tensor * c_mlp_fc_b;
48 |
49 |
// struct ggml_tensor * c_mlp_proj_w;
50 |
// struct ggml_tensor * c_mlp_proj_b;
51 |
52 |
53 |
//struct gpt2_model {
54 |
// gpt2_hparams hparams;
55 |
56 |
// // normalization
57 |
// struct ggml_tensor * ln_f_g;
58 |
// struct ggml_tensor * ln_f_b;
59 |
60 |
// struct ggml_tensor * wte; // token embedding
61 |
// struct ggml_tensor * wpe; // position embedding
62 |
// struct ggml_tensor * lm_head; // language model head
63 |
64 |
// std::vector<gpt2_layer> layers;
65 |
66 |
// // key + value memory
67 |
// struct ggml_tensor * memory_k;
68 |
// struct ggml_tensor * memory_v;
69 |
70 |
// //
71 |
// struct ggml_context * ctx_w;
72 |
// std::map<std::string, struct ggml_tensor *> tensors;
73 |
74 |
75 |
//// load the model's weights from a file
76 |
//bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
77 |
// printf("%s: loading model from '%s'\n", __func__, fname.c_str());
78 |
79 |
// auto fin = std::ifstream(fname, std::ios::binary);
80 |
// if (!fin) {
81 |
// fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
82 |
// return false;
83 |
// }
84 |
85 |
// // verify magic
86 |
// {
87 |
// uint32_t magic;
88 |
// fin.read((char *) &magic, sizeof(magic));
89 |
// if (magic != GGML_FILE_MAGIC) {
90 |
// fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
91 |
// return false;
92 |
// }
93 |
// }
94 |
95 |
// // load hparams
96 |
// {
97 |
// auto & hparams = model.hparams;
98 |
99 |
// fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
100 |
// fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
101 |
// fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
102 |
// fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
103 |
// fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
104 |
// fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
105 |
106 |
// const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
107 |
108 |
// printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
109 |
// printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
110 |
// printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
111 |
// printf("%s: n_head = %d\n", __func__, hparams.n_head);
112 |
// printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
113 |
// printf("%s: ftype = %d\n", __func__, hparams.ftype);
114 |
// printf("%s: qntvr = %d\n", __func__, qntvr);
115 |
116 |
// hparams.ftype %= GGML_QNT_VERSION_FACTOR;
117 |
// }
118 |
119 |
// // load vocab
120 |
// {
121 |
// int32_t n_vocab = 0;
122 |
// fin.read((char *) &n_vocab, sizeof(n_vocab));
123 |
124 |
// if (n_vocab != model.hparams.n_vocab) {
125 |
// fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
126 |
// __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
127 |
// return false;
128 |
// }
129 |
130 |
// std::string word;
131 |
// std::vector<char> buf(128);
132 |
133 |
// for (int i = 0; i < n_vocab; i++) {
134 |
// uint32_t len;
135 |
// fin.read((char *) &len, sizeof(len));
136 |
137 |
// buf.resize(len);
138 |
// fin.read((char *) buf.data(), len);
139 |
// word.assign(buf.data(), len);
140 |
141 |
// vocab.token_to_id[word] = i;
142 |
// vocab.id_to_token[i] = word;
143 |
// }
144 |
// }
145 |
146 |
// // for the big tensors, we have the option to store the data in 16-bit floats or quantized
147 |
// // in order to save memory and also to speed up the computation
148 |
// ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
149 |
// if (wtype == GGML_TYPE_COUNT) {
150 |
// fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
151 |
// __func__, fname.c_str(), model.hparams.ftype);
152 |
// return false;
153 |
// }
154 |
155 |
// auto & ctx = model.ctx_w;
156 |
157 |
// size_t ctx_size = 0;
158 |
159 |
// {
160 |
// const auto & hparams = model.hparams;
161 |
162 |
// const int n_embd = hparams.n_embd;
163 |
// const int n_layer = hparams.n_layer;
164 |
// const int n_ctx = hparams.n_ctx;
165 |
// const int n_vocab = hparams.n_vocab;
166 |
167 |
// ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
168 |
// ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
169 |
170 |
// ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // wte
171 |
// ctx_size += ggml_row_size(GGML_TYPE_F32, n_ctx*n_embd); // wpe
172 |
// ctx_size += ggml_row_size(wtype, n_vocab*n_embd); // lm_head
173 |
174 |
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
175 |
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
176 |
177 |
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
178 |
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
179 |
180 |
// ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
181 |
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b
182 |
183 |
// ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
184 |
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b
185 |
186 |
// ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
187 |
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
188 |
189 |
// ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
190 |
// ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_proj_b
191 |
192 |
// ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
193 |
// ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
194 |
195 |
// ctx_size += (6 + 12*n_layer)*512; // object overhead
196 |
197 |
// printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
198 |
// printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
199 |
// }
200 |
201 |
// // create the ggml context
202 |
// {
203 |
// struct ggml_init_params params = {
204 |
// /*.mem_size =*/ ctx_size,
205 |
// /*.mem_buffer =*/ NULL,
206 |
// /*.no_alloc =*/ false,
207 |
// };
208 |
209 |
// model.ctx_w = ggml_init(params);
210 |
// if (!model.ctx_w) {
211 |
// fprintf(stderr, "%s: ggml_init() failed\n", __func__);
212 |
// return false;
213 |
// }
214 |
// }
215 |
216 |
// // prepare memory for the weights
217 |
// {
218 |
// const auto & hparams = model.hparams;
219 |
220 |
// const int n_embd = hparams.n_embd;
221 |
// const int n_layer = hparams.n_layer;
222 |
// const int n_ctx = hparams.n_ctx;
223 |
// const int n_vocab = hparams.n_vocab;
224 |
225 |
// model.layers.resize(n_layer);
226 |
227 |
// model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
228 |
// model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
229 |
230 |
// model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
231 |
// model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
232 |
// model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
233 |
234 |
// // map by name
235 |
// model.tensors["model/ln_f/g"] = model.ln_f_g;
236 |
// model.tensors["model/ln_f/b"] = model.ln_f_b;
237 |
238 |
// model.tensors["model/wte"] = model.wte;
239 |
// model.tensors["model/wpe"] = model.wpe;
240 |
// model.tensors["model/lm_head"] = model.lm_head;
241 |
242 |
// for (int i = 0; i < n_layer; ++i) {
243 |
// auto & layer = model.layers[i];
244 |
245 |
// layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
246 |
// layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
247 |
248 |
// layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
249 |
// layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
250 |
251 |
// layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
252 |
// layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
253 |
254 |
// layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
255 |
// layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
256 |
257 |
// layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
258 |
// layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
259 |
260 |
// layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
261 |
// layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
262 |
263 |
// // map by name
264 |
// model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
265 |
// model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
266 |
267 |
// model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
268 |
// model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
269 |
270 |
// model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
271 |
// model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
272 |
273 |
// model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
274 |
// model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
275 |
276 |
// model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
277 |
// model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
278 |
279 |
// model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
280 |
// model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
281 |
// }
282 |
// }
283 |
284 |
// // key + value memory
285 |
// {
286 |
// const auto & hparams = model.hparams;
287 |
288 |
// const int n_embd = hparams.n_embd;
289 |
// const int n_layer = hparams.n_layer;
290 |
// const int n_ctx = hparams.n_ctx;
291 |
292 |
// const int n_mem = n_layer*n_ctx;
293 |
// const int n_elements = n_embd*n_mem;
294 |
295 |
// model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
296 |
// model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
297 |
298 |
// const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
299 |
300 |
// printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
301 |
// }
302 |
303 |
// // load weights
304 |
// {
305 |
// size_t total_size = 0;
306 |
307 |
// bool has_lm_head = false;
308 |
309 |
// while (true) {
310 |
// int32_t n_dims;
311 |
// int32_t length;
312 |
// int32_t ttype;
313 |
314 |
// fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
315 |
// fin.read(reinterpret_cast<char *>(&length), sizeof(length));
316 |
// fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
317 |
318 |
// if (fin.eof()) {
319 |
// break;
320 |
// }
321 |
322 |
// int32_t nelements = 1;
323 |
// int32_t ne[2] = { 1, 1 };
324 |
// for (int i = 0; i < n_dims; ++i) {
325 |
// fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
326 |
// nelements *= ne[i];
327 |
// }
328 |
329 |
// std::string name(length, 0);
330 |
// fin.read(&name[0], length);
331 |
332 |
// if (model.tensors.find(name) == model.tensors.end()) {
333 |
// fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
334 |
// return false;
335 |
// }
336 |
337 |
// auto tensor = model.tensors[name];
338 |
// if (ggml_nelements(tensor) != nelements) {
339 |
// fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
340 |
// return false;
341 |
// }
342 |
343 |
// if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
344 |
// fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
345 |
// __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
346 |
// return false;
347 |
// }
348 |
349 |
// // for debugging
350 |
// if (0) {
351 |
// printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
352 |
// }
353 |
354 |
// const size_t bpe = ggml_type_size(ggml_type(ttype));
355 |
356 |
// if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
357 |
// fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
358 |
// __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
359 |
// return false;
360 |
// }
361 |
362 |
// fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
363 |
364 |
// // GPT-2 models share the WTE tensor as the LM head
365 |
// if (name == "model/wte" && has_lm_head == false) {
366 |
// memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
367 |
// }
368 |
369 |
// if (name == "model/lm_head") {
370 |
// has_lm_head = true;
371 |
// }
372 |
373 |
// total_size += ggml_nbytes(tensor);
374 |
// }
375 |
376 |
// printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
377 |
// }
378 |
379 |
// fin.close();
380 |
381 |
// return true;
382 |
383 |
384 |
//// evaluate the transformer
385 |
386 |
//// - model: the model
387 |
//// - n_threads: number of threads to use
388 |
//// - n_past: the context size so far
389 |
//// - embd_inp: the embeddings of the tokens in the context
390 |
//// - embd_w: the predicted logits for the next token
391 |
392 |
//bool gpt2_eval(
393 |
// const gpt2_model & model,
394 |
// const int n_threads,
395 |
// const int n_past,
396 |
// const std::vector<gpt_vocab::id> & embd_inp,
397 |
// std::vector<float> & embd_w,
398 |
// size_t & mem_per_token) {
399 |
// const int N = embd_inp.size();
400 |
401 |
// const auto & hparams = model.hparams;
402 |
403 |
// const int n_embd = hparams.n_embd;
404 |
// const int n_layer = hparams.n_layer;
405 |
// const int n_ctx = hparams.n_ctx;
406 |
// const int n_head = hparams.n_head;
407 |
// const int n_vocab = hparams.n_vocab;
408 |
409 |
// static size_t buf_size = 256u*1024*1024;
410 |
// static void * buf = malloc(buf_size);
411 |
412 |
// if (mem_per_token > 0 && mem_per_token*N > buf_size) {
413 |
// const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
414 |
// //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
415 |
416 |
// // reallocate
417 |
// buf_size = buf_size_new;
418 |
// buf = realloc(buf, buf_size);
419 |
// if (buf == nullptr) {
420 |
// fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
421 |
// return false;
422 |
// }
423 |
// }
424 |
425 |
// struct ggml_init_params params = {
426 |
// /*.mem_size =*/ buf_size,
427 |
// /*.mem_buffer =*/ buf,
428 |
// /*.no_alloc =*/ false,
429 |
// };
430 |
431 |
// struct ggml_context * ctx0 = ggml_init(params);
432 |
// struct ggml_cgraph * gf = ggml_new_graph(ctx0);
433 |
434 |
// struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
435 |
// memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
436 |
437 |
// struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
438 |
// for (int i = 0; i < N; ++i) {
439 |
// ((int32_t *) position->data)[i] = n_past + i;
440 |
// }
441 |
442 |
// // wte + wpe
443 |
// struct ggml_tensor * inpL =
444 |
// ggml_add(ctx0,
445 |
// ggml_get_rows(ctx0, model.wte, embd),
446 |
// ggml_get_rows(ctx0, model.wpe, position));
447 |
448 |
// for (int il = 0; il < n_layer; ++il) {
449 |
// struct ggml_tensor * cur;
450 |
451 |
// // norm
452 |
// {
453 |
// // [ 768, N]
454 |
// cur = ggml_norm(ctx0, inpL, hparams.eps);
455 |
456 |
// // cur = ln_1_g*cur + ln_1_b
457 |
// // [ 768, N]
458 |
// cur = ggml_add(ctx0,
459 |
// ggml_mul(ctx0,
460 |
// ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
461 |
// cur),
462 |
// ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
463 |
// }
464 |
465 |
// // attn
466 |
// // [2304, 768] - model.layers[il].c_attn_attn_w
467 |
// // [2304, 1] - model.layers[il].c_attn_attn_b
468 |
// // [ 768, N] - cur (in)
469 |
// // [2304, N] - cur (out)
470 |
// //
471 |
// // cur = attn_w*cur + attn_b
472 |
// // [2304, N]
473 |
// {
474 |
// cur = ggml_mul_mat(ctx0,
475 |
// model.layers[il].c_attn_attn_w,
476 |
// cur);
477 |
478 |
// cur = ggml_add(ctx0,
479 |
// ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
480 |
// cur);
481 |
// }
482 |
483 |
// // self-attention
484 |
// {
485 |
// struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
486 |
// struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
487 |
// struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
488 |
489 |
// // store key and value to memory
490 |
// if (N >= 1) {
491 |
// struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
492 |
// struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
493 |
494 |
// ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
495 |
// ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
496 |
// }
497 |
498 |
// // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
499 |
// // [64, N, 12]
500 |
// struct ggml_tensor * Q =
501 |
// ggml_permute(ctx0,
502 |
// ggml_cpy(ctx0,
503 |
// Qcur,
504 |
// ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
505 |
// 0, 2, 1, 3);
506 |
507 |
// // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
508 |
// // [64, n_past + N, 12]
509 |
// struct ggml_tensor * K =
510 |
// ggml_permute(ctx0,
511 |
// ggml_reshape_3d(ctx0,
512 |
// ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
513 |
// n_embd/n_head, n_head, n_past + N),
514 |
// 0, 2, 1, 3);
515 |
516 |
// // GG: flash attention
517 |
// //struct ggml_tensor * V =
518 |
// // ggml_cpy(ctx0,
519 |
// // ggml_permute(ctx0,
520 |
// // ggml_reshape_3d(ctx0,
521 |
// // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
522 |
// // n_embd/n_head, n_head, n_past + N),
523 |
// // 1, 2, 0, 3),
524 |
// // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
525 |
526 |
// //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
527 |
528 |
// // K * Q
529 |
// // [n_past + N, N, 12]
530 |
// struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
531 |
532 |
// // KQ_scaled = KQ / sqrt(n_embd/n_head)
533 |
// // [n_past + N, N, 12]
534 |
// struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));
535 |
536 |
// // KQ_masked = mask_past(KQ_scaled)
537 |
// // [n_past + N, N, 12]
538 |
// struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
539 |
540 |
// // KQ = soft_max(KQ_masked)
541 |
// // [n_past + N, N, 12]
542 |
// struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
543 |
544 |
// // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
545 |
// // [n_past + N, 64, 12]
546 |
// struct ggml_tensor * V_trans =
547 |
// ggml_cpy(ctx0,
548 |
// ggml_permute(ctx0,
549 |
// ggml_reshape_3d(ctx0,
550 |
// ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
551 |
// n_embd/n_head, n_head, n_past + N),
552 |
// 1, 2, 0, 3),
553 |
// ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
554 |
555 |
// // KQV = transpose(V) * KQ_soft_max
556 |
// // [64, N, 12]
557 |
// struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
558 |
559 |
// // KQV_merged = KQV.permute(0, 2, 1, 3)
560 |
// // [64, 12, N]
561 |
// struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
562 |
563 |
// // cur = KQV_merged.contiguous().view(n_embd, N)
564 |
// // [768, N]
565 |
// cur = ggml_cpy(ctx0,
566 |
// KQV_merged,
567 |
// ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
568 |
// }
569 |
570 |
// // projection
571 |
// // [ 768, 768] - model.layers[il].c_attn_proj_w
572 |
// // [ 768, 1] - model.layers[il].c_attn_proj_b
573 |
// // [ 768, N] - cur (in)
574 |
// // [ 768, N] - cur (out)
575 |
// //
576 |
// // cur = proj_w*cur + proj_b
577 |
// // [768, N]
578 |
// {
579 |
// cur = ggml_mul_mat(ctx0,
580 |
// model.layers[il].c_attn_proj_w,
581 |
// cur);
582 |
583 |
// cur = ggml_add(ctx0,
584 |
// ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
585 |
// cur);
586 |
// }
587 |
588 |
// // add the input
589 |
// cur = ggml_add(ctx0, cur, inpL);
590 |
591 |
// struct ggml_tensor * inpFF = cur;
592 |
593 |
// // feed-forward network
594 |
// {
595 |
// // norm
596 |
// {
597 |
// cur = ggml_norm(ctx0, inpFF, hparams.eps);
598 |
599 |
// // cur = ln_2_g*cur + ln_2_b
600 |
// // [ 768, N]
601 |
// cur = ggml_add(ctx0,
602 |
// ggml_mul(ctx0,
603 |
// ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
604 |
// cur),
605 |
// ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
606 |
// }
607 |
608 |
// // fully connected
609 |
// // [3072, 768] - model.layers[il].c_mlp_fc_w
610 |
// // [3072, 1] - model.layers[il].c_mlp_fc_b
611 |
// // [ 768, N] - cur (in)
612 |
// // [3072, N] - cur (out)
613 |
// //
614 |
// // cur = fc_w*cur + fc_b
615 |
// // [3072, N]
616 |
// cur = ggml_mul_mat(ctx0,
617 |
// model.layers[il].c_mlp_fc_w,
618 |
// cur);
619 |
620 |
// cur = ggml_add(ctx0,
621 |
// ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
622 |
// cur);
623 |
624 |
// // GELU activation
625 |
// // [3072, N]
626 |
// cur = ggml_gelu(ctx0, cur);
627 |
628 |
// // projection
629 |
// // [ 768, 3072] - model.layers[il].c_mlp_proj_w
630 |
// // [ 768, 1] - model.layers[il].c_mlp_proj_b
631 |
// // [3072, N] - cur (in)
632 |
// // [ 768, N] - cur (out)
633 |
// //
634 |
// // cur = proj_w*cur + proj_b
635 |
// // [768, N]
636 |
// cur = ggml_mul_mat(ctx0,
637 |
// model.layers[il].c_mlp_proj_w,
638 |
// cur);
639 |
640 |
// cur = ggml_add(ctx0,
641 |
// ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
642 |
// cur);
643 |
// }
644 |
645 |
// // input for next layer
646 |
// inpL = ggml_add(ctx0, cur, inpFF);
647 |
// }
648 |
649 |
// // norm
650 |
// {
651 |
// // [ 768, N]
652 |
// inpL = ggml_norm(ctx0, inpL, hparams.eps);
653 |
654 |
// // inpL = ln_f_g*inpL + ln_f_b
655 |
// // [ 768, N]
656 |
// inpL = ggml_add(ctx0,
657 |
// ggml_mul(ctx0,
658 |
// ggml_repeat(ctx0, model.ln_f_g, inpL),
659 |
// inpL),
660 |
// ggml_repeat(ctx0, model.ln_f_b, inpL));
661 |
// }
662 |
663 |
// // inpL = WTE * inpL
664 |
// // [ 768, 50257] - model.lm_head
665 |
// // [ 768, N] - inpL
666 |
// inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
667 |
668 |
// // logits -> probs
669 |
// //inpL = ggml_soft_max_inplace(ctx0, inpL);
670 |
671 |
// // run the computation
672 |
// ggml_build_forward_expand(gf, inpL);
673 |
// ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
674 |
675 |
// //if (n_past%100 == 0) {
676 |
// // ggml_graph_print (&gf);
677 |
// // ggml_graph_dump_dot(&gf, NULL, "");
678 |
// //}
679 |
680 |
// //embd_w.resize(n_vocab*N);
681 |
// //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
682 |
683 |
// // return result just for the last token
684 |
// embd_w.resize(n_vocab);
685 |
// memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
686 |
687 |
// if (mem_per_token == 0) {
688 |
// mem_per_token = ggml_used_mem(ctx0)/N;
689 |
// }
690 |
// //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
691 |
692 |
// ggml_free(ctx0);
693 |
694 |
// return true;
695 |
696 |
697 |
//int main(int argc, char ** argv) {
698 |
// ggml_time_init();
699 |
700 |
// const int64_t t_main_start_us = ggml_time_us();
701 |
702 |
// gpt_params params;
703 |
// params.model = "ggml-model-gpt-2-774M.bin";
704 |
705 |
// if (gpt_params_parse(argc, argv, params) == false) {
706 |
// return 1;
707 |
// }
708 |
709 |
// if (params.seed < 0) {
710 |
// params.seed = time(NULL);
711 |
// }
712 |
713 |
// printf("%s: seed = %d\n", __func__, params.seed);
714 |
715 |
// std::mt19937 rng(params.seed);
716 |
// if (params.prompt.empty()) {
717 |
// params.prompt = gpt_random_prompt(rng);
718 |
// }
719 |
720 |
// int64_t t_load_us = 0;
721 |
722 |
// gpt_vocab vocab;
723 |
// gpt2_model model;
724 |
725 |
// // load the model
726 |
// {
727 |
// const int64_t t_start_us = ggml_time_us();
728 |
729 |
// if (!gpt2_model_load(params.model, model, vocab)) {
730 |
// fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
731 |
// return 1;
732 |
// }
733 |
734 |
// t_load_us = ggml_time_us() - t_start_us;
735 |
736 |
// test_gpt_tokenizer(vocab, params.token_test);
737 |
// }
738 |
739 |
// while(true) {
740 |
// int n_past = 0;
741 |
742 |
// int64_t t_sample_us = 0;
743 |
// int64_t t_predict_us = 0;
744 |
745 |
// std::vector<float> logits;
746 |
747 |
// // tokenize the prompt
748 |
// std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
749 |
750 |
// params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
751 |
752 |
// printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
753 |
// printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
754 |
// for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
755 |
// printf("%d ", embd_inp[i]);
756 |
// }
757 |
// printf("\n\n");
758 |
759 |
// // submit the input prompt token-by-token
760 |
// // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
761 |
// std::vector<gpt_vocab::id> embd;
762 |
763 |
// // determine the required inference memory per token:
764 |
// size_t mem_per_token = 0;
765 |
// gpt2_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
766 |
767 |
// for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
768 |
// // predict
769 |
// if (embd.size() > 0) {
770 |
// const int64_t t_start_us = ggml_time_us();
771 |
772 |
// if (!gpt2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
773 |
// printf("Failed to predict\n");
774 |
// return 1;
775 |
// }
776 |
777 |
// t_predict_us += ggml_time_us() - t_start_us;
778 |
// }
779 |
780 |
// n_past += embd.size();
781 |
// embd.clear();
782 |
783 |
// if (i >= embd_inp.size()) {
784 |
// // sample next token
785 |
// const int top_k = params.top_k;
786 |
// const float top_p = params.top_p;
787 |
// const float temp = params.temp;
788 |
789 |
// const int n_vocab = model.hparams.n_vocab;
790 |
791 |
// gpt_vocab::id id = 0;
792 |
793 |
// {
794 |
// const int64_t t_start_sample_us = ggml_time_us();
795 |
796 |
// id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
797 |
798 |
// t_sample_us += ggml_time_us() - t_start_sample_us;
799 |
// }
800 |
801 |
// // add it to the context
802 |
// embd.push_back(id);
803 |
// } else {
804 |
// // if here, it means we are still processing the input prompt
805 |
// for (size_t k = i; k < embd_inp.size(); k++) {
806 |
// embd.push_back(embd_inp[k]);
807 |
// if (int32_t(embd.size()) >= params.n_batch) {
808 |
// break;
809 |
// }
810 |
// }
811 |
// i += embd.size() - 1;
812 |
// }
813 |
814 |
// // display text
815 |
// for (auto id : embd) {
816 |
// printf("%s", vocab.id_to_token[id].c_str());
817 |
// }
818 |
// fflush(stdout);
819 |
820 |
// // end of text token
821 |
// if (embd.back() == 50256) {
822 |
// // report timing
823 |
// {
824 |
// const int64_t t_main_end_us = ggml_time_us();
825 |
826 |
// printf("\n\n");
827 |
// printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
828 |
// printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
829 |
// printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
830 |
// printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
831 |
// printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
832 |
// }
833 |
// break;
834 |
// }
835 |
// }
836 |
// }
837 |
838 |
// ggml_free(model.ctx_w);
839 |
840 |
// return 0;
841 |
@@ -0,0 +1,184 @@
1 |
#include "ggml.h"
2 |
3 |
#include "common.h"
4 |
#include "common-ggml.h"
5 |
6 |
#include <cassert>
7 |
#include <cmath>
8 |
#include <cstdio>
9 |
#include <cstring>
10 |
#include <fstream>
11 |
#include <map>
12 |
#include <string>
13 |
#include <vector>
14 |
#include <regex>
15 |
16 |
// default hparams (GPT-2 117M)
17 |
// Hyper-parameters stored in the header of a GPT-2 ggml model file.
// The initializers are only defaults; gpt2_model_quantize() overwrites every
// field with the values read from the input file.
struct gpt2_hparams {
    int32_t n_vocab = 50257; // vocabulary size
    int32_t n_ctx   = 1024;  // maximum context length
    int32_t n_embd  = 768;   // embedding dimensionality
    int32_t n_head  = 12;    // number of attention heads
    int32_t n_layer = 12;    // number of transformer layers
    int32_t ftype   = 1;     // file type tag; ftype / GGML_QNT_VERSION_FACTOR is the quantization version
};
24 |
25 |
26 |
// quantize a model
27 |
bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
28 |
gpt_vocab vocab;
29 |
30 |
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
31 |
32 |
auto finp = std::ifstream(fname_inp, std::ios::binary);
33 |
if (!finp) {
34 |
fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
35 |
return false;
36 |
37 |
38 |
auto fout = std::ofstream(fname_out, std::ios::binary);
39 |
if (!fout) {
40 |
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
41 |
return false;
42 |
43 |
44 |
// verify magic
45 |
46 |
uint32_t magic;
47 |
+ *) &magic, sizeof(magic));
48 |
if (magic != GGML_FILE_MAGIC) {
49 |
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
50 |
return false;
51 |
52 |
53 |
fout.write((char *) &magic, sizeof(magic));
54 |
55 |
56 |
gpt2_hparams hparams;
57 |
58 |
// load hparams
59 |
60 |
+ *) &hparams.n_vocab, sizeof(hparams.n_vocab));
61 |
+ *) &hparams.n_ctx, sizeof(hparams.n_ctx));
62 |
+ *) &hparams.n_embd, sizeof(hparams.n_embd));
63 |
+ *) &hparams.n_head, sizeof(hparams.n_head));
64 |
+ *) &hparams.n_layer, sizeof(hparams.n_layer));
65 |
+ *) &hparams.ftype, sizeof(hparams.ftype));
66 |
67 |
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
68 |
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
69 |
70 |
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
71 |
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
72 |
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
73 |
printf("%s: n_head = %d\n", __func__, hparams.n_head);
74 |
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
75 |
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
76 |
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
77 |
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
78 |
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
79 |
80 |
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
81 |
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
82 |
fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
83 |
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
84 |
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
85 |
fout.write((char *) &ftype_dst, sizeof(ftype_dst));
86 |
87 |
88 |
// load vocab
89 |
90 |
int32_t n_vocab = 0;
91 |
+ ((char *) &n_vocab, sizeof(n_vocab));
92 |
fout.write((char *) &n_vocab, sizeof(n_vocab));
93 |
94 |
if (n_vocab != hparams.n_vocab) {
95 |
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
96 |
__func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
97 |
return false;
98 |
99 |
100 |
std::string word;
101 |
for (int i = 0; i < n_vocab; i++) {
102 |
uint32_t len;
103 |
+ ((char *) &len, sizeof(len));
104 |
fout.write((char *) &len, sizeof(len));
105 |
106 |
107 |
+ ((char *), len);
108 |
fout.write((char *), len);
109 |
110 |
vocab.token_to_id[word] = i;
111 |
vocab.id_to_token[i] = word;
112 |
113 |
114 |
115 |
// regexes of tensor names to be quantized
116 |
const std::vector<std::string> to_quant = {
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
126 |
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
127 |
return false;
128 |
129 |
130 |
131 |
132 |
133 |
return true;
134 |
135 |
136 |
// usage:
137 |
// ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type
138 |
139 |
int main(int argc, char ** argv) {
140 |
if (argc != 4) {
141 |
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
142 |
143 |
return 1;
144 |
145 |
146 |
// needed to initialize f16 tables
147 |
148 |
struct ggml_init_params params = { 0, NULL, false };
149 |
struct ggml_context * ctx = ggml_init(params);
150 |
151 |
152 |
153 |
const std::string fname_inp = argv[1];
154 |
const std::string fname_out = argv[2];
155 |
156 |
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
157 |
158 |
const int64_t t_main_start_us = ggml_time_us();
159 |
160 |
int64_t t_quantize_us = 0;
161 |
162 |
// load the model
163 |
164 |
const int64_t t_start_us = ggml_time_us();
165 |
166 |
if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
167 |
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
168 |
return 1;
169 |
170 |
171 |
t_quantize_us = ggml_time_us() - t_start_us;
172 |
173 |
174 |
// report timing
175 |
176 |
const int64_t t_main_end_us = ggml_time_us();
177 |
178 |
179 |
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
180 |
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
181 |
182 |
183 |
return 0;
184 |