spuun committed
Commit 79015ec · 0 Parent(s)

Duplicate from spuun/lama

Files changed (5)
  1. .gitattributes +34 -0
  2. README.md +13 -0
  3. app.py +52 -0
  4. convert.py +1149 -0
  5. requirements.txt +3 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Lama
+ emoji: 🐨
+ colorFrom: red
+ colorTo: green
+ sdk: gradio
+ sdk_version: 3.24.1
+ app_file: app.py
+ pinned: false
+ duplicated_from: spuun/lama
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,52 @@
+ from llama_cpp import Llama
+ import gradio
+ import random
+ import requests
+ import os
+ import subprocess
+
+ if not os.path.exists("ggml-model-q4_0.bin"):
+     open("ggml-model-q4_0.bin", "wb").write(
+         requests.get(
+             "https://huggingface.co/hlhr202/llama-7B-ggml-int4/resolve/main/ggml-model-q4_0.bin"
+         ).content
+     )
+     open("tokenizer.model", "wb").write(
+         requests.get(
+             "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model"
+         ).content
+     )
+     print("Downloaded model files. Doing conversion.")
+     print(
+         subprocess.check_output(
+             "python convert.py ggml-model-q4_0.bin --outfile ggml-model.bin", shell=True
+         ).decode("utf-8")
+     )
+ else:
+     print("Model already exists, skipping redownload")
+
+
+ print("Loading model...")
+ llm = Llama(
+     model_path="ggml-model.bin",
+     seed=random.randint(1, 9999999),
+     n_ctx=2048,
+     n_threads=3,
+ )
+ print("Model loaded.")
+
+
+ def generate(prompt, stop):
+     output = llm(
+         bytes(prompt, "utf-8").decode("unicode_escape"),
+         max_tokens=64,
+         temperature=0.75,
+         top_p=0.7,
+         stop=[bytes(stop, "utf-8").decode("unicode_escape")],
+     )
+     print(output)
+     return output["choices"][0]["text"]
+
+
+ app = gradio.Interface(fn=generate, inputs=["text", "text"], outputs="text")
+ app.launch(enable_queue=True, show_api=True)
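
Because the Interface is launched with show_api=True, the resulting Space can also be queried programmatically. A minimal client-side sketch, assuming the gradio_client package is installed and that the Space is published as spuun/lama with Gradio's default /predict endpoint:

from gradio_client import Client

# Connect to the hosted Space; the "spuun/lama" id is assumed from duplicated_from above.
client = Client("spuun/lama")

# The two text inputs map to generate(prompt, stop); the server unescapes
# backslash sequences, so the literal "\\n" becomes a newline stop sequence.
completion = client.predict(
    "Llamas are",
    "\\n",
    api_name="/predict",
)
print(completion)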
convert.py ADDED
@@ -0,0 +1,1149 @@
1
+ # Convert.py taken from https://github.com/ggerganov/llama.cpp
2
+ import argparse
3
+ import concurrent.futures
4
+ import copy
5
+ import enum
6
+ import faulthandler
7
+ import functools
8
+ import io
9
+ import itertools
10
+ import json
11
+ import math
12
+ import mmap
13
+ import pickle
14
+ import re
15
+ import signal
16
+ import struct
17
+ import sys
18
+ import zipfile
19
+ from abc import ABCMeta, abstractmethod
20
+ from dataclasses import dataclass
21
+ from pathlib import Path
22
+ from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List,
23
+ Literal, Optional, Sequence, Tuple, TypeVar, Union)
24
+
25
+ import numpy as np
26
+ from sentencepiece import SentencePieceProcessor # type: ignore
27
+
28
+ if TYPE_CHECKING:
29
+ from typing_extensions import TypeAlias
30
+
31
+ if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
32
+ faulthandler.register(signal.SIGUSR1)
33
+
34
+ NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
35
+
36
+
37
+ @dataclass(frozen=True)
38
+ class UnquantizedDataType:
39
+ name: str
40
+
41
+
42
+ DT_F16 = UnquantizedDataType('F16')
43
+ DT_F32 = UnquantizedDataType('F32')
44
+ DT_I32 = UnquantizedDataType('I32')
45
+ DT_BF16 = UnquantizedDataType('BF16')
46
+
47
+
48
+ @dataclass(frozen=True)
49
+ class QuantizedDataType:
50
+ groupsize: int
51
+ have_addends: bool
52
+ have_g_idx: bool
53
+
54
+
55
+ DT_Q4_0 = QuantizedDataType(groupsize=32, have_addends=False, have_g_idx=False)
56
+ DT_Q4_1 = QuantizedDataType(groupsize=32, have_addends=True, have_g_idx=False)
57
+
58
+ DataType = Union[UnquantizedDataType, QuantizedDataType]
59
+
60
+ DATA_TYPE_TO_FTYPE: Dict[DataType, int] = {
61
+ DT_F32: 0,
62
+ DT_F16: 1,
63
+ DT_Q4_0: 2,
64
+ DT_Q4_1: 3,
65
+ }
66
+
67
+ FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \
68
+ {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()}
69
+
70
+ DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
71
+ DT_F16: np.dtype(np.float16),
72
+ DT_F32: np.dtype(np.float32),
73
+ DT_I32: np.dtype(np.int32),
74
+ }
75
+
76
+ NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \
77
+ {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()}
78
+
79
+
80
+ class GGMLFileType(enum.Enum):
81
+ AllF32 = 0
82
+ MostlyF16 = 1 # except 1d tensors
83
+ MostlyQ4_0 = 2 # except 1d tensors
84
+ MostlyQ4_1 = 3 # except 1d tensors
85
+ PerLayerIsQ4_1 = 4 # but tok_embeddings.weight and output.weight are F16
86
+
87
+ def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
88
+ if len(tensor.shape) == 1:
89
+ # 1D tensors are always F32.
90
+ return DT_F32
91
+ elif self == GGMLFileType.AllF32:
92
+ return DT_F32
93
+ elif self == GGMLFileType.MostlyF16:
94
+ return DT_F16
95
+ elif self == GGMLFileType.MostlyQ4_0:
96
+ return DT_Q4_0
97
+ elif self == GGMLFileType.MostlyQ4_1:
98
+ return DT_Q4_1
99
+ elif self == GGMLFileType.PerLayerIsQ4_1:
100
+ if name in ('output.weight', 'tok_embeddings.weight'):
101
+ return DT_F16
102
+ else:
103
+ return DT_Q4_1
104
+ else:
105
+ raise ValueError(self)
106
+
107
+
108
+ def make_tensors_list() -> List[str]:
109
+ ret = [
110
+ 'tok_embeddings.weight',
111
+ 'norm.weight',
112
+ 'output.weight',
113
+ ]
114
+ for i in range(80): # maximum number of layers
115
+ ret += [
116
+ f'layers.{i}.attention.wq.weight',
117
+ f'layers.{i}.attention.wk.weight',
118
+ f'layers.{i}.attention.wv.weight',
119
+ f'layers.{i}.attention.wo.weight',
120
+ f'layers.{i}.attention_norm.weight',
121
+ f'layers.{i}.feed_forward.w1.weight',
122
+ f'layers.{i}.feed_forward.w2.weight',
123
+ f'layers.{i}.feed_forward.w3.weight',
124
+ f'layers.{i}.atttention_norm.weight',
125
+ f'layers.{i}.ffn_norm.weight',
126
+ ]
127
+ return ret
128
+
129
+
130
+ TENSORS_LIST = make_tensors_list()
131
+ TENSORS_SET = set(TENSORS_LIST)
132
+
133
+
134
+ @dataclass
135
+ class Params:
136
+ n_vocab: int
137
+ n_embd: int
138
+ n_mult: int
139
+ n_head: int
140
+ n_layer: int
141
+ file_type: GGMLFileType
142
+
143
+ @staticmethod
144
+ def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
145
+ n_vocab, n_embd = model["tok_embeddings.weight"].shape
146
+
147
+ return Params(
148
+ n_vocab=n_vocab,
149
+ n_embd=n_embd,
150
+ n_mult=256,
151
+ n_head=n_embd // 128,
152
+ n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model),
153
+ file_type=file_type,
154
+ )
155
+
156
+
157
+ class SentencePieceVocab:
158
+ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
159
+ self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
160
+ added_tokens: Dict[str, int]
161
+ if fname_added_tokens is not None:
162
+ added_tokens = json.load(open(fname_added_tokens))
163
+ else:
164
+ added_tokens = {}
165
+ vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
166
+ expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
167
+ actual_ids = sorted(added_tokens.values())
168
+ if expected_ids != actual_ids:
169
+ raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")
170
+ items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
171
+ self.added_tokens_list = [text for (text, idx) in items]
172
+ self.vocab_size_base: int = vocab_size
173
+ self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
174
+ self.fname_tokenizer = fname_tokenizer
175
+ self.fname_added_tokens = fname_added_tokens
176
+
177
+ def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
178
+ tokenizer = self.sentencepiece_tokenizer
179
+ for i in range(tokenizer.vocab_size()):
180
+ text: bytes
181
+ if tokenizer.is_unknown(i):
182
+ text = " \u2047 ".encode("utf-8")
183
+ elif tokenizer.is_control(i):
184
+ text = b""
185
+ elif tokenizer.is_byte(i):
186
+ piece = tokenizer.id_to_piece(i)
187
+ if len(piece) != 6:
188
+ raise Exception(f"Invalid token: {piece}")
189
+ byte_value = int(piece[3:-1], 16)
190
+ text = struct.pack("B", byte_value)
191
+ else:
192
+ text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
193
+ score: float = tokenizer.get_score(i)
194
+ yield text, score
195
+
196
+ def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
197
+ for text in self.added_tokens_list:
198
+ score = -1000.0
199
+ yield text.encode("utf-8"), score
200
+
201
+ def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
202
+ yield from self.sentencepiece_tokens()
203
+ yield from self.added_tokens()
204
+
205
+ def __repr__(self) -> str:
206
+ return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
207
+
208
+
209
+ class GGMLVocab:
210
+ def __init__(self, tokens: List[Tuple[bytes, float]]):
211
+ self.tokens = tokens
212
+ self.vocab_size = len(tokens)
213
+
214
+ def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
215
+ return self.tokens
216
+
217
+ def __repr__(self) -> str:
218
+ return f"<GGMLVocab with {self.vocab_size} tokens>"
219
+
220
+
221
+ Vocab = Union[SentencePieceVocab, GGMLVocab]
222
+
223
+
224
+ def permute(weights: NDArray, n_head: int) -> NDArray:
225
+ return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
226
+ .swapaxes(1, 2)
227
+ .reshape(weights.shape))
228
+
229
+
230
+ def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
231
+ # First reinterpret each row from a list of int32s containing 8 values each
232
+ # to a list of uint8s containing 2 values each.
233
+ qvalues_pack8 = qvalues_pack32.view(np.uint8)
234
+
235
+ # Then split out the two values per int8 (which requires an actual
236
+ # conversion because numpy doesn't natively support int4s).
237
+ qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], dtype=np.uint8)
238
+ qvalues[:, 0::2] = qvalues_pack8 & 0xf
239
+ qvalues[:, 1::2] = qvalues_pack8 >> 4
240
+
241
+ assert addends is None or addends.shape == scales.shape
242
+ assert qvalues.shape[0] == scales.shape[0]
243
+ assert qvalues.shape[1] % scales.shape[1] == 0
244
+ if g_idx is None:
245
+ repeat_count = qvalues.shape[1] // scales.shape[1]
246
+ scales = scales[:, :, np.newaxis]
247
+ if addends is not None:
248
+ addends = addends[:, :, np.newaxis]
249
+ # Reshape so that the below computation broadcasts over scales and addends:
250
+ qvalues.shape = (qvalues.shape[0], scales.shape[1], int(repeat_count))
251
+ else:
252
+ # In this case the scale and addend is selected for each column by g_idx:
253
+ assert addends is not None
254
+ scales = scales[:, g_idx]
255
+ addends = addends[:, g_idx]
256
+ if addends is None:
257
+ # Q4_0
258
+ qvalues = qvalues.view(np.int8)
259
+ qvalues -= 8
260
+ # And do the actual 'value = scale * qvalue + addend' computation.
261
+ values = scales * qvalues
262
+ if addends is not None:
263
+ values += addends
264
+ if g_idx is None:
265
+ values.shape = (values.shape[0], values.shape[1] * values.shape[2])
266
+ return values
267
+
268
+
269
+ class Tensor(metaclass=ABCMeta):
270
+ data_type: DataType
271
+
272
+ @abstractmethod
273
+ def astype(self, data_type: DataType) -> 'Tensor': ...
274
+ @abstractmethod
275
+ def permute(self, n_head: int) -> 'Tensor': ...
276
+ @abstractmethod
277
+ def to_ggml(self) -> 'GGMLCompatibleTensor': ...
278
+
279
+
280
+ class UnquantizedTensor(Tensor):
281
+ def __init__(self, ndarray: NDArray) -> None:
282
+ assert isinstance(ndarray, np.ndarray)
283
+ self.ndarray = ndarray
284
+ self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
285
+
286
+ def astype(self, data_type: DataType) -> Tensor:
287
+ dtype = DATA_TYPE_TO_NUMPY[data_type]
288
+ return UnquantizedTensor(self.ndarray.astype(dtype))
289
+
290
+ def to_ggml(self) -> 'UnquantizedTensor':
291
+ return self
292
+
293
+ def permute(self, n_head: int) -> 'UnquantizedTensor':
294
+ return UnquantizedTensor(permute(self.ndarray, n_head))
295
+
296
+
297
+ def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
298
+ tensor = lazy_tensor.load()
299
+ assert isinstance(tensor, UnquantizedTensor)
300
+
301
+ # double-check:
302
+ actual_shape = list(tensor.ndarray.shape)
303
+ assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape)
304
+ if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype:
305
+ if convert:
306
+ tensor.ndarray = tensor.ndarray.astype(expected_dtype)
307
+ else:
308
+ raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')
309
+
310
+ return tensor.ndarray
311
+
312
+
313
+ class GGMLQuantizedTensor(Tensor):
314
+ data_type: QuantizedDataType
315
+
316
+ def __init__(self, ndarray: NDArray, shape: List[int], data_type: DataType) -> None:
317
+ rows, columns = shape
318
+ assert data_type in (DT_Q4_1, DT_Q4_0) # for now
319
+ assert isinstance(data_type, QuantizedDataType) # redundant, but mypy complains without this
320
+ assert columns % data_type.groupsize == 0
321
+ words_in_block = 6 if data_type == DT_Q4_1 else 5
322
+ self.ndarray = ndarray.view(dtype=np.uint32).reshape((rows, columns // data_type.groupsize, words_in_block))
323
+ self.shape = shape[:]
324
+ self.data_type = data_type
325
+
326
+ def astype(self, data_type: DataType) -> Tensor:
327
+ if data_type == self.data_type:
328
+ return self
329
+ scales = self.ndarray[:, :, 0].view(np.float32)
330
+ if self.data_type.have_addends:
331
+ addends = self.ndarray[:, :, 1].view(np.float32)
332
+ else:
333
+ addends = None
334
+ qweights = self.ndarray[:, :, -4:].reshape([self.shape[0], self.shape[1] // 8])
335
+
336
+ dq = dequantize_q4(qweights, scales, addends, g_idx=None)
337
+ return UnquantizedTensor(dq).astype(data_type)
338
+
339
+ def to_ggml(self) -> 'GGMLQuantizedTensor':
340
+ return self
341
+
342
+ def permute(self, n_head: int) -> 'GGMLQuantizedTensor':
343
+ return GGMLQuantizedTensor(permute(self.ndarray, n_head), self.shape, self.data_type)
344
+
345
+
346
+ GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
347
+
348
+
349
+ class DeferredPermutedTensor(Tensor):
350
+ def __init__(self, base: Tensor, n_head: int) -> None:
351
+ self.base = base
352
+ self.n_head = n_head
353
+ self.data_type = self.base.data_type
354
+
355
+ def astype(self, data_type: DataType) -> Tensor:
356
+ return self.base.astype(data_type).permute(self.n_head)
357
+
358
+ def to_ggml(self) -> GGMLCompatibleTensor:
359
+ return self.base.to_ggml().permute(self.n_head)
360
+
361
+ def permute(self, n_head: int) -> Tensor:
362
+ raise Exception("shouldn't permute twice")
363
+
364
+
365
+ class GPTQForLLaMaQuantizedTensor(Tensor):
366
+ def __init__(self, model: 'LazyModel', namebase: str) -> None:
367
+ qweight = load_unquantized(model[f"{namebase}.qweight"], np.int32)
368
+ scales = load_unquantized(model[f"{namebase}.scales"], np.float32, convert=True)
369
+
370
+ bias = model.get(f"{namebase}.bias")
371
+ if bias is not None:
372
+ # Q4_1 does not support bias; good thing the bias is always all zeros.
373
+ assert not np.any(load_unquantized(bias))
374
+
375
+ if f"{namebase}.zeros" in model:
376
+ zeros = load_unquantized(model[f"{namebase}.zeros"], np.float32)
377
+ else:
378
+ qzeros = load_unquantized(model[f"{namebase}.qzeros"], np.int32)
379
+ assert qzeros.dtype == np.int32
380
+ zeros = dequantize_q4(qzeros, scales, scales, g_idx=None)
381
+ assert zeros.dtype == np.float32
382
+
383
+ assert zeros.shape == scales.shape
384
+
385
+ # Output is transposed compared to the input, and addends have their sign flipped.
386
+ # Scales and zeros similarly must be transposed but only for newer
387
+ # versions of GPTQ-for-LLaMa; the older versions can be identified by
388
+ # having shape (n_embd, 1).
389
+ qweight = qweight.T
390
+ if scales.shape[1] != 1:
391
+ scales = scales.T
392
+ zeros = zeros.T
393
+
394
+ # Output also has signs flipped for the addends.
395
+ self.qweight = qweight
396
+ self.scales = scales
397
+ self.addends = -zeros
398
+
399
+ self.g_idx: Optional[NDArray]
400
+ if f"{namebase}.g_idx" in model:
401
+ self.g_idx = load_unquantized(model[f"{namebase}.g_idx"], np.int32)
402
+ assert self.g_idx.shape == (qweight.shape[1] * 8,)
403
+ else:
404
+ self.g_idx = None
405
+
406
+ self.shape = [self.qweight.shape[0], self.qweight.shape[1] * 8]
407
+ self.data_type = QuantizedDataType(groupsize=self.groupsize(), have_addends=True,
408
+ have_g_idx=(self.g_idx is not None))
409
+
410
+ def inspect(self, row: int, col: int) -> None:
411
+ '''For debugging.'''
412
+ qweight = (self.qweight[row, col // 8] >> (4 * (col & 7))) & 0xf
413
+ if self.g_idx is not None:
414
+ group = self.g_idx[col]
415
+ else:
416
+ group = int(col // self.groupsize())
417
+ scale = self.scales[row, group]
418
+ addend = self.addends[row, group]
419
+ with np.printoptions(precision=None, suppress=True):
420
+ print(f'scale:{scale} addend:{addend} qweight:{qweight}')
421
+ print('possible values:', np.arange(16) * scale + addend)
422
+ print('actual value:', qweight * scale + addend)
423
+
424
+ def astype(self, data_type: DataType) -> Tensor:
425
+ if isinstance(data_type, QuantizedDataType):
426
+ assert self.g_idx is None and data_type.have_addends is True and data_type.have_g_idx is False
427
+ return self.regroup(data_type.groupsize)
428
+
429
+ dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), self.scales, self.addends, self.g_idx)
430
+ return UnquantizedTensor(dequantized).astype(data_type)
431
+
432
+ def groupsize(self) -> int:
433
+ assert self.addends.shape == self.scales.shape
434
+ assert self.shape[1] % self.scales.shape[1] == 0
435
+ return self.shape[1] // self.scales.shape[1]
436
+
437
+ def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor':
438
+ # Old versions of GPTQ-for-LLaMa shared scales and addends between all the
439
+ # columns in a row. Newer versions share them between every set of N
440
+ # columns in a row, where N is the `groupsize` parameter, usually 128. The
441
+ # output format shares them between every set of 32 columns. To handle
442
+ # this, duplicate scales and addends for every smaller group.
443
+ # (In the above, 'row' and 'column' are in the sense of the output.)
444
+ assert self.g_idx is None
445
+ old_groupsize = self.groupsize()
446
+ assert old_groupsize >= new_groupsize and old_groupsize % new_groupsize == 0, old_groupsize
447
+ ret = copy.copy(self)
448
+ ret.addends = self.addends.repeat(old_groupsize // new_groupsize, axis=1)
449
+ ret.scales = self.scales.repeat(old_groupsize // new_groupsize, axis=1)
450
+ ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False)
451
+ return ret
452
+
453
+ def permute(self, n_head: int) -> Tensor:
454
+ return DeferredPermutedTensor(self, n_head)
455
+
456
+ def to_ggml(self) -> GGMLQuantizedTensor:
457
+ # The output format looks like this:
458
+ # For each row:
459
+ # For each group of 32 columns:
460
+ # - addend (float32, 4 bytes)
461
+ # - scale (float32, 4 bytes)
462
+ # - weights (int4 * 32, 16 bytes)
463
+
464
+ if self.groupsize() != 32:
465
+ raise Exception("should have been regrouped before converting to ggml")
466
+
467
+ # Since the output format is mixed between integers and floats, we have
468
+ # to hackily view the floats as int32s just so numpy will let us
469
+ # concatenate them.
470
+ addends_view = self.addends.view(dtype=np.int32)[:, :, np.newaxis]
471
+ scales_view = self.scales.view(dtype=np.int32)[:, :, np.newaxis]
472
+
473
+ # Split into groups of 4 columns (i.e. 32 columns of quantized data):
474
+ grouped = self.qweight.reshape([self.qweight.shape[0], self.qweight.shape[1] // 4, 4])
475
+
476
+ # And concatenate:
477
+ grouped = np.concatenate([scales_view, addends_view, grouped], axis=2, casting='no')
478
+
479
+ return GGMLQuantizedTensor(grouped, self.shape, DT_Q4_1)
480
+
481
+
482
+ @dataclass
483
+ class LazyTensor:
484
+ _load: Callable[[], Tensor]
485
+ shape: List[int]
486
+ data_type: DataType
487
+ description: str
488
+
489
+ def load(self) -> Tensor:
490
+ ret = self._load()
491
+ assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description)
492
+ return ret
493
+
494
+ def astype(self, data_type: DataType) -> 'LazyTensor':
495
+ self.validate_conversion_to(data_type)
496
+
497
+ def load() -> Tensor:
498
+ return self.load().astype(data_type)
499
+ return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
500
+
501
+ def validate_conversion_to(self, data_type: DataType) -> None:
502
+ if data_type == self.data_type:
503
+ return
504
+ if isinstance(data_type, QuantizedDataType):
505
+ if not isinstance(self.data_type, QuantizedDataType):
506
+ raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
507
+ if self.data_type.have_g_idx:
508
+ sys.stderr.write("Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively supported by GGML. For now you can still convert this model by passing `--outtype f16` to dequantize, but that will result in a much larger output file for no quality benefit.\n")
509
+ sys.exit(1)
510
+ assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends
511
+
512
+
513
+ LazyModel = Dict[str, LazyTensor]
514
+
515
+
516
+ @dataclass
517
+ class ModelPlus:
518
+ model: LazyModel
519
+ paths: List[Path] # Where this was read from.
520
+ format: Literal['ggml', 'torch', 'safetensors']
521
+ vocab: Optional[Vocab] # For GGML models (which have vocab built in), the vocab.
522
+
523
+
524
+ def merge_sharded(models: List[LazyModel]) -> LazyModel:
525
+ # Original LLaMA models have each file contain one part of each tensor.
526
+ # Use a dict instead of a set to preserve order.
527
+ names = {name: None for model in models for name in model}
528
+
529
+ def convert(name: str) -> LazyTensor:
530
+ lazy_tensors: List[LazyTensor] = [model[name] for model in models]
531
+ if len(lazy_tensors) == 1:
532
+ # only one file; don't go through this procedure since there might
533
+ # be quantized tensors
534
+ return lazy_tensors[0]
535
+ if len(lazy_tensors[0].shape) == 1:
536
+ # the tensor is just duplicated in every file
537
+ return lazy_tensors[0]
538
+ if name.startswith('tok_embeddings.') or \
539
+ name.endswith('.attention.wo.weight') or \
540
+ name.endswith('.feed_forward.w2.weight'):
541
+ # split by columns
542
+ axis = 1
543
+ else:
544
+ # split by rows
545
+ axis = 0
546
+ concatenated_shape = list(lazy_tensors[0].shape)
547
+ concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)
548
+
549
+ def load() -> UnquantizedTensor:
550
+ ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
551
+ concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
552
+ return UnquantizedTensor(concatenated)
553
+ description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
554
+ return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
555
+ return {name: convert(name) for name in names}
556
+
557
+
558
+ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
559
+ formats = set(mp.format for mp in models_plus)
560
+ assert len(formats) == 1, "different formats?"
561
+ format = formats.pop()
562
+ paths = [path for mp in models_plus for path in mp.paths]
563
+ # Use the first non-None vocab, if any.
564
+ try:
565
+ vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None)
566
+ except StopIteration:
567
+ vocab = None
568
+
569
+ if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
570
+ # Transformers models put different tensors in different files, but
571
+ # don't split individual tensors between files.
572
+ model: LazyModel = {}
573
+ for mp in models_plus:
574
+ model.update(mp.model)
575
+ else:
576
+ model = merge_sharded([mp.model for mp in models_plus])
577
+
578
+ return ModelPlus(model, paths, format, vocab)
579
+
580
+
581
+ def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
582
+ def load() -> Tensor:
583
+ return lazy_tensor.load().permute(n_head)
584
+ return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
585
+
586
+
587
+ def convert_transformers_to_orig(model: LazyModel) -> LazyModel:
588
+ out: LazyModel = {}
589
+ out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
590
+ out["norm.weight"] = model["model.norm.weight"]
591
+ out["output.weight"] = model["lm_head.weight"]
592
+
593
+ n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128
594
+ for i in itertools.count():
595
+ if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
596
+ break
597
+ out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head)
598
+ out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head)
599
+ out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
600
+ out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
601
+
602
+ out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"]
603
+ out[f"layers.{i}.feed_forward.w2.weight"] = model[f"model.layers.{i}.mlp.down_proj.weight"]
604
+ out[f"layers.{i}.feed_forward.w3.weight"] = model[f"model.layers.{i}.mlp.up_proj.weight"]
605
+
606
+ out[f"layers.{i}.attention_norm.weight"] = model[f"model.layers.{i}.input_layernorm.weight"]
607
+ out[f"layers.{i}.ffn_norm.weight"] = model[f"model.layers.{i}.post_attention_layernorm.weight"]
608
+ return out
609
+
610
+
611
+ def handle_quantization(model: LazyModel) -> LazyModel:
612
+ '''Convert a model with entries for 'foo.qweight', 'foo.scales', etc.
613
+ (which resolve to UnquantizedTensors with the raw data) to one with entries
614
+ for 'foo.weight' (which resolve to QuantizedTensors).
615
+ '''
616
+ def convert(name: str) -> Tuple[str, LazyTensor]:
617
+ if name.endswith(".qweight"):
618
+ namebase = name.rsplit('.', 1)[0]
619
+ orig_name = namebase + ".weight"
620
+
621
+ lazy_tensor = model[name]
622
+ assert len(lazy_tensor.shape) == 2
623
+ real_shape = [lazy_tensor.shape[1], lazy_tensor.shape[0] * 8]
624
+
625
+ # Calculate type. This replicates the logic in
626
+ # GPTQForLLaMaQuantizedTensor (which is executed when the model is
627
+ # actually loaded).
628
+ lazy_scales = model[f"{namebase}.scales"]
629
+ scales_width = 1 if lazy_scales.shape[1] == 1 else lazy_scales.shape[0]
630
+ assert real_shape[1] % scales_width == 0
631
+ groupsize = real_shape[1] // scales_width
632
+ have_g_idx = f"{namebase}.g_idx" in model
633
+ data_type = QuantizedDataType(groupsize=groupsize, have_addends=True, have_g_idx=have_g_idx)
634
+
635
+ def load() -> Tensor:
636
+ return GPTQForLLaMaQuantizedTensor(model, namebase)
637
+
638
+ return (orig_name, LazyTensor(load, real_shape, data_type, '[quantized]'))
639
+ else:
640
+ return (name, model[name])
641
+ return dict(convert(name) for name in model)
642
+
643
+ # Functionality that simulates `torch.load` but where individual tensors are
644
+ # only loaded into memory on demand, not all at once.
645
+ # PyTorch can't do this natively as of time of writing:
646
+ # - https://github.com/pytorch/pytorch/issues/64327
647
+ # This allows us to de-shard without multiplying RAM usage, and also
648
+ # conveniently drops the PyTorch dependency (though we still need numpy).
649
+
650
+
651
+ @dataclass
652
+ class LazyStorageKind:
653
+ data_type: DataType
654
+
655
+
656
+ @dataclass
657
+ class LazyStorage:
658
+ load: Callable[[int, int], NDArray]
659
+ kind: LazyStorageKind
660
+ description: str
661
+
662
+
663
+ class LazyUnpickler(pickle.Unpickler):
664
+ def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
665
+ super().__init__(fp)
666
+ self.data_base_path = data_base_path
667
+ self.zip_file = zip_file
668
+
669
+ def persistent_load(self, pid: Any) -> Any:
670
+ assert pid[0] == 'storage'
671
+ assert isinstance(pid[1], LazyStorageKind)
672
+ data_type = pid[1].data_type
673
+ filename_stem = pid[2]
674
+ filename = self.data_base_path + '/' + filename_stem
675
+ info = self.zip_file.getinfo(filename)
676
+
677
+ def load(offset: int, elm_count: int) -> NDArray:
678
+ dtype = DATA_TYPE_TO_NUMPY.get(data_type)
679
+ if dtype is None:
680
+ raise Exception("tensor stored in unsupported format")
681
+ fp = self.zip_file.open(info)
682
+ fp.seek(offset * dtype.itemsize)
683
+ size = elm_count * dtype.itemsize
684
+ data = fp.read(size)
685
+ assert len(data) == size
686
+ return np.frombuffer(data, dtype)
687
+ description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
688
+ return LazyStorage(load=load, kind=pid[1], description=description)
689
+
690
+ def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, # pyright: ignore[reportSelfClsParameterName]
691
+ requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
692
+ assert isinstance(storage, LazyStorage)
693
+
694
+ def load() -> UnquantizedTensor:
695
+ elm_count = stride[0] * size[0]
696
+ return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
697
+ description = f'pickled storage_offset={storage_offset} in {storage.description}'
698
+ return LazyTensor(load, list(size), storage.kind.data_type, description)
699
+
700
+ CLASSES: Dict[Any, Any] = {
701
+ ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
702
+ ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
703
+ ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
704
+ ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
705
+ ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
706
+ }
707
+
708
+ def find_class(self, module: str, name: str) -> Any:
709
+ if not module.startswith('torch'):
710
+ return super().find_class(module, name)
711
+ return self.CLASSES[(module, name)]
712
+
713
+
714
+ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
715
+ zf = zipfile.ZipFile(outer_fp)
716
+ pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
717
+ assert len(pickle_paths) == 1, pickle_paths
718
+ pickle_fp = zf.open(pickle_paths[0], 'r')
719
+ unpickler = LazyUnpickler(pickle_fp,
720
+ data_base_path=pickle_paths[0][:-4],
721
+ zip_file=zf)
722
+ model = unpickler.load()
723
+ as_dict = dict(model.items())
724
+ return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
725
+
726
+
727
+ SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
728
+ 'F16': DT_F16,
729
+ 'F32': DT_F32,
730
+ 'I32': DT_I32,
731
+ }
732
+
733
+
734
+ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
735
+ header_size, = struct.unpack('<Q', fp.read(8))
736
+ header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size))
737
+ # Use mmap for the actual data to avoid race conditions with the file offset.
738
+ mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
739
+ byte_buf = mapped[8 + header_size:]
740
+
741
+ def convert(info: Dict[str, Any]) -> LazyTensor:
742
+ data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
743
+ numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
744
+ shape: List[int] = info['shape']
745
+ begin, end = info['data_offsets']
746
+ assert 0 <= begin <= end <= len(byte_buf)
747
+ assert end - begin == math.prod(shape) * numpy_dtype.itemsize
748
+ buf = byte_buf[begin:end]
749
+
750
+ def load() -> UnquantizedTensor:
751
+ return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
752
+ description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
753
+ return LazyTensor(load, shape, data_type, description)
754
+ model = {name: convert(info) for (name, info) in header.items()}
755
+ return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
756
+
757
+
758
+ def must_read(fp: IO[bytes], length: int) -> bytes:
759
+ ret = fp.read(length)
760
+ if len(ret) < length:
761
+ raise Exception("unexpectedly reached end of file")
762
+ return ret
763
+
764
+
765
+ def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
766
+ magic = must_read(fp, 4)[::-1]
767
+ if magic in (b'ggmf', b'ggjt'):
768
+ version, = struct.unpack("i", must_read(fp, 4))
769
+ assert version == 1
770
+ else:
771
+ assert magic == b'ggml'
772
+ version = None
773
+ n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = struct.unpack('<7i', must_read(fp, 28))
774
+
775
+ tokens: List[Tuple[bytes, float]] = []
776
+ for i in range(n_vocab):
777
+ if i == 32000:
778
+ # HACK: GPT4All messed with the format without changing the magic
779
+ # number. Specifically, they changed the vocab section to contain
780
+ # `n_vocab - 1` tokens instead of `n_vocab` (i.e. omitting the
781
+ # extra pad token). Try to detect if we're reading a file like
782
+ # this.
783
+ orig_pos = fp.tell()
784
+ fp.seek(20, io.SEEK_CUR)
785
+ is_gpt4all = fp.read(21) == b'tok_embeddings.weight'
786
+ fp.seek(orig_pos)
787
+ if is_gpt4all:
788
+ break
789
+
790
+ length, = struct.unpack("i", must_read(fp, 4))
791
+ text = must_read(fp, length)
792
+ if magic != b'ggml':
793
+ score, = struct.unpack("f", must_read(fp, 4))
794
+ tokens.append((text, score))
795
+ vocab = GGMLVocab(tokens) if magic != b'ggml' else None
796
+
797
+ model: LazyModel = {}
798
+ # Use mmap for the actual data to avoid race conditions with the file offset.
799
+ off = fp.raw.tell()
800
+ mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
801
+ fp.raw.seek(off) # needed on Windows
802
+
803
+ def read_tensor() -> None: # this is a function so that variables captured in `load` don't change
804
+ shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
805
+ assert 0 <= shape_len <= 3
806
+ shape: List[int] = list(struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len)))
807
+ shape = shape[::-1]
808
+ name = must_read(fp, name_len).decode('utf-8')
809
+ data_type = FTYPE_TO_DATA_TYPE[ftype]
810
+
811
+ if magic == b'ggjt':
812
+ fp.seek((fp.tell() + 31) & -32)
813
+
814
+ if data_type == DT_Q4_1:
815
+ # See GPTQForLLaMaQuantizedTensor.ggml_ndarray()
816
+ size = 24 * (shape[1] // 32) * shape[0]
817
+ elif data_type == DT_Q4_0:
818
+ size = 20 * (shape[1] // 32) * shape[0]
819
+ else:
820
+ numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
821
+ elm_count = math.prod(shape)
822
+ size = elm_count * numpy_dtype.itemsize
823
+ offset = fp.tell()
824
+ buf = mapped[offset:offset+size]
825
+ fp.seek(size, io.SEEK_CUR)
826
+
827
+ def load() -> Tensor:
828
+ if isinstance(data_type, QuantizedDataType):
829
+ ndarray = np.frombuffer(buf, dtype=np.uint32)
830
+ return GGMLQuantizedTensor(ndarray, shape, data_type)
831
+ else:
832
+ return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
833
+ description = f'ggml offset={offset} type={data_type} path={path}'
834
+ model[name] = LazyTensor(load, shape, data_type, description)
835
+
836
+ while fp.read(1) != b'':
837
+ fp.seek(-1, io.SEEK_CUR)
838
+ read_tensor()
839
+
840
+ return ModelPlus(model=model, paths=[path], format='ggml', vocab=vocab)
841
+
842
+
843
+ @functools.lru_cache(maxsize=None)
844
+ def lazy_load_file(path: Path) -> ModelPlus:
845
+ fp = open(path, 'rb')
846
+ first8 = fp.read(8)
847
+ fp.seek(0)
848
+ if first8[:2] == b'PK':
849
+ # A zip file, i.e. PyTorch format
850
+ return lazy_load_torch_file(fp, path)
851
+ elif first8[2:4] == b'gg':
852
+ # GGML format
853
+ return lazy_load_ggml_file(fp, path)
854
+ elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
855
+ # Probably safetensors
856
+ return lazy_load_safetensors_file(fp, path)
857
+ else:
858
+ raise ValueError(f"unknown format: {path}")
859
+
860
+
861
+ In = TypeVar('In')
862
+ Out = TypeVar('Out')
863
+
864
+
865
+ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]:
866
+ '''Parallel map, but with backpressure. If the caller doesn't call `next`
867
+ fast enough, this will stop calling `func` at some point rather than
868
+ letting results pile up in memory. Specifically, there is a max of one
869
+ output value buffered per thread.'''
870
+ with concurrent.futures.ThreadPoolExecutor() as executor:
871
+ futures: List[concurrent.futures.Future[Out]] = []
872
+ items_rev = list(iterable)[::-1]
873
+ for i in range(min(concurrency, len(items_rev))):
874
+ futures.append(executor.submit(func, items_rev.pop()))
875
+ while futures:
876
+ result = futures.pop(0).result()
877
+ if items_rev:
878
+ futures.append(executor.submit(func, items_rev.pop()))
879
+ yield result
880
+
881
+
882
+ def check_vocab_size(params: Params, vocab: Vocab) -> None:
883
+ if params.n_vocab != vocab.vocab_size:
884
+ # GGMLVocab comes from the same file as the model so shouldn't mismatch:
885
+ assert isinstance(vocab, SentencePieceVocab)
886
+ if params.n_vocab == vocab.vocab_size_base:
887
+ print("Ignoring added_tokens.json since model matches vocab size without it.")
888
+ vocab.added_tokens_list = []
889
+ vocab.vocab_size = vocab.vocab_size_base
890
+ return
891
+ msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
892
+ if vocab.fname_added_tokens is not None:
893
+ msg += f" combined with {vocab.fname_added_tokens}"
894
+ msg += f" has {vocab.vocab_size})."
895
+ if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
896
+ msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
897
+ raise Exception(msg)
898
+
899
+
900
+ class OutputFile:
901
+ def __init__(self, fname_out: Path) -> None:
902
+ self.fout = open(fname_out, "wb")
903
+
904
+ def write_file_header(self, params: Params) -> None:
905
+ self.fout.write(b"ggjt"[::-1]) # magic
906
+ values = [
907
+ 1, # file version
908
+ params.n_vocab,
909
+ params.n_embd,
910
+ params.n_mult,
911
+ params.n_head,
912
+ params.n_layer,
913
+ params.n_embd // params.n_head, # rot (obsolete)
914
+ params.file_type.value,
915
+ ]
916
+ self.fout.write(struct.pack("i" * len(values), *values))
917
+
918
+ def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
919
+ sname = name.encode('utf-8')
920
+ self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type]))
921
+ self.fout.write(struct.pack("i" * len(shape), *shape[::-1]))
922
+ self.fout.write(sname)
923
+ self.fout.seek((self.fout.tell() + 31) & -32)
924
+
925
+ def write_vocab(self, vocab: Vocab) -> None:
926
+ for text, score in vocab.all_tokens():
927
+ self.fout.write(struct.pack("i", len(text)))
928
+ self.fout.write(text)
929
+ self.fout.write(struct.pack("f", score))
930
+
931
+ @staticmethod
932
+ def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
933
+ of = OutputFile(fname_out)
934
+ params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
935
+ n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)
936
+ of = OutputFile(fname_out)
937
+ of.write_file_header(params)
938
+ of.write_vocab(vocab)
939
+ of.fout.close()
940
+
941
+ @staticmethod
942
+ def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
943
+ check_vocab_size(params, vocab)
944
+ of = OutputFile(fname_out)
945
+ of.write_file_header(params)
946
+ print("Writing vocab...")
947
+ of.write_vocab(vocab)
948
+
949
+ def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
950
+ name, lazy_tensor = item
951
+ return lazy_tensor.load().to_ggml().ndarray
952
+
953
+ ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
954
+ for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
955
+ size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
956
+ padi = len(str(len(model)))
957
+ print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
958
+ of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type)
959
+ ndarray.tofile(of.fout)
960
+ of.fout.close()
961
+
962
+
963
+ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
964
+ wq_type = model["layers.0.attention.wq.weight"].data_type
965
+ if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
966
+ return GGMLFileType.AllF32
967
+ if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
968
+ return GGMLFileType.MostlyF16
969
+ if output_type_str == "q4_1" or (output_type_str is None and isinstance(wq_type, QuantizedDataType) and
970
+ wq_type.have_addends):
971
+ if isinstance(model["output.weight"].data_type, QuantizedDataType):
972
+ return GGMLFileType.MostlyQ4_1
973
+ else:
974
+ return GGMLFileType.PerLayerIsQ4_1
975
+ if output_type_str == "q4_0" or (output_type_str is None and isinstance(wq_type, QuantizedDataType)):
976
+ return GGMLFileType.MostlyQ4_0
977
+ name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
978
+ raise Exception(f"Unexpected combination of types: {name_to_type}")
979
+
980
+
981
+ def do_necessary_conversions(model: LazyModel) -> LazyModel:
982
+ model = handle_quantization(model)
983
+
984
+ if "lm_head.weight" in model:
985
+ model = convert_transformers_to_orig(model)
986
+ model = filter_and_sort_tensors(model)
987
+
988
+ return model
989
+
990
+
991
+ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
992
+ return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
993
+ for (name, tensor) in model.items()}
994
+
995
+
996
+ def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
997
+ '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
998
+ the nth path in the model.
999
+ '''
1000
+ # Support the following patterns:
1001
+ patterns: List[Tuple[str, str]] = [
1002
+ # - x.00.pth, x.01.pth, etc.
1003
+ (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
1004
+ # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
1005
+ (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),
1006
+ # x.bin, x.bin.1, etc.
1007
+ (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}')
1008
+ ]
1009
+ for regex, replacement in patterns:
1010
+ if re.search(regex, path.name):
1011
+ new_path = path.with_name(re.sub(regex, replacement, path.name))
1012
+ if new_path.exists():
1013
+ return new_path
1014
+ return None
1015
+
1016
+
1017
+ def find_multifile_paths(path: Path) -> List[Path]:
1018
+ '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
1019
+ the whole list of paths in the model.
1020
+ '''
1021
+ ret: List[Path] = []
1022
+ for i in itertools.count():
1023
+ nth_path = nth_multifile_path(path, i)
1024
+ if nth_path is None:
1025
+ break
1026
+ ret.append(nth_path)
1027
+ if not ret:
1028
+ # No matches. This should only happen if the file was named, e.g.,
1029
+ # foo.0, and there was no file named foo. Oh well, try to process it
1030
+ # as a single file.
1031
+ return [path]
1032
+ return ret
1033
+
1034
+
1035
+ def load_some_model(path: Path) -> ModelPlus:
1036
+ '''Load a model of any supported format.'''
1037
+ # Be extra-friendly and accept either a file or a directory:
1038
+ if path.is_dir():
1039
+ globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt"]
1040
+ files = [file for glob in globs for file in path.glob(glob)]
1041
+ if not files:
1042
+ # Try GGML too, but with lower priority, since if both a non-GGML
1043
+ # model and a GGML model exist in the same directory, we assume the
1044
+ # latter was converted from the former.
1045
+ files = list(path.glob("ggml-model*.bin*"))
1046
+ if not files:
1047
+ raise Exception(f"Can't find model in directory {path}")
1048
+ if len(files) > 1:
1049
+ raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
1050
+ path = files[0]
1051
+
1052
+ paths = find_multifile_paths(path)
1053
+ models_plus: List[ModelPlus] = []
1054
+ for path in paths:
1055
+ print(f"Loading model file {path}")
1056
+ models_plus.append(lazy_load_file(path))
1057
+
1058
+ model_plus = merge_multifile_models(models_plus)
1059
+ return model_plus
1060
+
1061
+
1062
+ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
1063
+ return {name: model[name] for name in TENSORS_LIST if name in model}
1064
+
1065
+
1066
+ def load_vocab(path: Path) -> SentencePieceVocab:
1067
+ # Be extra-friendly and accept either a file or a directory. Also, if it's
1068
+ # a directory, it might be the model directory, and tokenizer.model might
1069
+ # be in the parent of that.
1070
+ if path.is_dir():
1071
+ path2 = path / "tokenizer.model"
1072
+ # Use `.parent` instead of /.. to handle the symlink case better.
1073
+ path3 = path.parent / "tokenizer.model"
1074
+ if path2.exists():
1075
+ path = path2
1076
+ elif path3.exists():
1077
+ path = path3
1078
+ else:
1079
+ raise FileNotFoundError(f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, pass the directory as --vocab-dir")
1080
+ added_tokens_path = path.parent / "added_tokens.json"
1081
+ print(f"Loading vocab file {path}")
1082
+ return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
1083
+
1084
+
1085
+ def default_outfile(model_paths: List[Path], params: Params) -> Path:
1086
+ namestr = {
1087
+ GGMLFileType.AllF32: "f32",
1088
+ GGMLFileType.MostlyF16: "f16",
1089
+ GGMLFileType.MostlyQ4_1: "q4_1",
1090
+ GGMLFileType.PerLayerIsQ4_1: "q4_1",
1091
+ }[params.file_type]
1092
+ ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
1093
+ if ret in model_paths:
1094
+ sys.stderr.write(f"Error: Default output path ({ret}) would overwrite the input. Please explicitly specify a path using --outfile.\n")
1095
+ sys.exit(1)
1096
+ return ret
1097
+
1098
+
1099
+ def do_dump_model(model_plus: ModelPlus) -> None:
1100
+ print(f"model_plus.paths = {model_plus.paths!r}")
1101
+ print(f"model_plus.format = {model_plus.format!r}")
1102
+ print(f"model_plus.vocab = {model_plus.vocab!r}")
1103
+ for name, lazy_tensor in model_plus.model.items():
1104
+ print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
1105
+
1106
+
1107
+ def main(args_in: Optional[List[str]] = None) -> None:
1108
+ parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
1109
+ parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
1110
+ parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
1111
+ parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
1112
+ parser.add_argument("--outtype", choices=["f32", "f16", "q4_1"], help="output format (default: based on input)")
1113
+ parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
1114
+ parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
1115
+ parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
1116
+ args = parser.parse_args(args_in)
1117
+
1118
+ vocab: Vocab
1119
+ if args.dump_single:
1120
+ model_plus = lazy_load_file(args.model)
1121
+ do_dump_model(model_plus)
1122
+ elif args.vocab_only:
1123
+ vocab = load_vocab(args.vocab_dir or args.model)
1124
+ assert args.outfile, "need --outfile if using --vocab-only"
1125
+ outfile = args.outfile
1126
+ OutputFile.write_vocab_only(outfile, vocab)
1127
+ print(f"Wrote {outfile}")
1128
+ else:
1129
+ model_plus = load_some_model(args.model)
1130
+ if args.dump:
1131
+ do_dump_model(model_plus)
1132
+ return
1133
+ if model_plus.vocab is not None and args.vocab_dir is None:
1134
+ vocab = model_plus.vocab
1135
+ else:
1136
+ vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
1137
+ vocab = load_vocab(vocab_dir)
1138
+ model = model_plus.model
1139
+ model = do_necessary_conversions(model)
1140
+ output_type = pick_output_type(model, args.outtype)
1141
+ model = convert_to_output_type(model, output_type)
1142
+ params = Params.guessed(model, output_type)
1143
+ outfile = args.outfile or default_outfile(model_plus.paths, params)
1144
+ OutputFile.write_all(outfile, params, model, vocab)
1145
+ print(f"Wrote {outfile}")
1146
+
1147
+
1148
+ if __name__ == '__main__':
1149
+ main()
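
Since main() takes an optional argument list, the conversion that app.py runs via subprocess could also be driven in-process. A minimal sketch (file names follow app.py and are otherwise illustrative):

import convert

# Equivalent to: python convert.py ggml-model-q4_0.bin --outfile ggml-model.bin
# main(args_in) parses the given list instead of sys.argv.
convert.main(["ggml-model-q4_0.bin", "--outfile", "ggml-model.bin"])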
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ llama-cpp-python
+ numpy==1.24
+ sentencepiece==0.1.98