Fix the "cannot load PyTorch checkpoint" issue and fix the tokenizer
Browse files
- .gitattributes +1 -0
- SimSun.ttf +3 -0
- config.json +2 -2
- modeling_vitphi.py +2 -2
- pytorch_model.bin +2 -2
- tokenization_vitphi.py +41 -9
- vocab.tiktoken +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
SimSun.ttf filter=lfs diff=lfs merge=lfs -text
|
SimSun.ttf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ca4da082cd970f0c8abaa79f213ddcbc475f7b5afabcb81b385998f9ebfbb53f
|
3 |
+
size 10499104
|
config.json
CHANGED
@@ -35,7 +35,7 @@
|
|
35 |
"visual": {
|
36 |
"heads": 16,
|
37 |
"image_size": 448,
|
38 |
-
"image_start_id":
|
39 |
"layers": 48,
|
40 |
"mlp_ratio": 4.9231,
|
41 |
"output_dim": 4096,
|
@@ -43,4 +43,4 @@
|
|
43 |
"width": 1664
|
44 |
},
|
45 |
"vocab_size": 51200
|
46 |
-
}
|
|
|
35 |
"visual": {
|
36 |
"heads": 16,
|
37 |
"image_size": 448,
|
38 |
+
"image_start_id": 50508,
|
39 |
"layers": 48,
|
40 |
"mlp_ratio": 4.9231,
|
41 |
"output_dim": 4096,
|
|
|
43 |
"width": 1664
|
44 |
},
|
45 |
"vocab_size": 51200
|
46 |
+
}
|
modeling_vitphi.py
CHANGED
@@ -45,8 +45,8 @@ from einops import rearrange
|
|
45 |
from transformers.activations import ACT2FN
|
46 |
from transformers import PretrainedConfig, PreTrainedModel
|
47 |
from transformers.modeling_outputs import CausalLMOutputWithPast
|
48 |
-
from
|
49 |
-
from
|
50 |
# from configuration_vitphi import MixFormerVLSequentialConfig
|
51 |
# from visual import VisionTransformer
|
52 |
|
|
|
45 |
from transformers.activations import ACT2FN
|
46 |
from transformers import PretrainedConfig, PreTrainedModel
|
47 |
from transformers.modeling_outputs import CausalLMOutputWithPast
|
48 |
+
from configuration_vitphi import MixFormerVLSequentialConfig
|
49 |
+
from visual import VisionTransformer
|
50 |
# from configuration_vitphi import MixFormerVLSequentialConfig
|
51 |
# from visual import VisionTransformer
|
52 |
|
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:196df56fad9a8cda2cfd334a4c75dd21a5fc4522f2ed28ddb02e8ff50c31de4d
|
3 |
+
size 6726979333
|
tokenization_vitphi.py
CHANGED
@@ -44,12 +44,26 @@ IMEND = "<|im_end|>"
|
|
44 |
# as different as possible to minimize the impact
|
45 |
EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
|
46 |
SPECIAL_TOKENS = (
|
47 |
-
|
48 |
IMSTART,
|
49 |
IMEND,
|
50 |
) + EXTRAS
|
51 |
IMG_TOKEN_SPAN = 256
|
52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
|
55 |
with open(tiktoken_bpe_file, "rb") as f:
|
@@ -119,9 +133,22 @@ class VitPhiTokenizer(PreTrainedTokenizer):
|
|
119 |
box_end_tag='</box>',
|
120 |
quad_start_tag='<quad>',
|
121 |
quad_end_tag='</quad>',
|
|
|
|
|
|
|
|
|
122 |
**kwargs,
|
123 |
):
|
124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
self.image_start_tag = image_start_tag
|
126 |
self.image_end_tag = image_end_tag
|
127 |
self.image_pad_tag = image_pad_tag
|
@@ -140,14 +167,17 @@ class VitPhiTokenizer(PreTrainedTokenizer):
|
|
140 |
)
|
141 |
|
142 |
self.errors = errors # how to handle errors in decoding
|
|
|
143 |
|
144 |
self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int]
|
145 |
self.special_tokens = {
|
146 |
token: index
|
147 |
for index, token in enumerate(
|
148 |
-
SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks)
|
149 |
)
|
150 |
}
|
|
|
|
|
151 |
self.img_start_id = self.special_tokens[self.image_start_tag]
|
152 |
self.img_end_id = self.special_tokens[self.image_end_tag]
|
153 |
self.img_pad_id = self.special_tokens[self.image_pad_tag]
|
@@ -165,8 +195,8 @@ class VitPhiTokenizer(PreTrainedTokenizer):
|
|
165 |
special_tokens=self.special_tokens,
|
166 |
)
|
167 |
assert (
|
168 |
-
len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
|
169 |
-
), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
|
170 |
|
171 |
self.decoder = {
|
172 |
v: k for k, v in self.mergeable_ranks.items()
|
@@ -174,9 +204,9 @@ class VitPhiTokenizer(PreTrainedTokenizer):
|
|
174 |
self.decoder.update({v: k for k, v in self.special_tokens.items()})
|
175 |
|
176 |
self.tokenizer = enc # type: tiktoken.Encoding
|
177 |
-
|
178 |
-
self.eod_id = self.tokenizer.eot_token
|
179 |
self.im_start_id = self.special_tokens[IMSTART]
|
|
|
180 |
self.im_end_id = self.special_tokens[IMEND]
|
181 |
|
182 |
def __len__(self) -> int:
|
@@ -251,12 +281,14 @@ class VitPhiTokenizer(PreTrainedTokenizer):
|
|
251 |
`List[bytes|str]`: The list of tokens.
|
252 |
"""
|
253 |
tokens = []
|
|
|
|
|
254 |
text = unicodedata.normalize("NFC", text)
|
|
|
255 |
|
256 |
# this implementation takes a detour: text -> token id -> token surface forms
|
257 |
for t in self.tokenizer.encode(
|
258 |
-
|
259 |
-
):
|
260 |
tokens.append(self.decoder[t])
|
261 |
|
262 |
def _encode_imgurl(img_tokens):
|
|
|
44 |
# as different as possible to minimize the impact
|
45 |
EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
|
46 |
SPECIAL_TOKENS = (
|
47 |
+
ENDOFTEXT,
|
48 |
IMSTART,
|
49 |
IMEND,
|
50 |
) + EXTRAS
|
51 |
IMG_TOKEN_SPAN = 256
|
52 |
|
53 |
+
def bytes_to_unicode() -> Dict[int, str]:
    """Build a one-to-one table from every byte value (0-255) to a single character.

    Bytes in the ranges ``!``-``~``, ``¡``-``¬`` and ``®``-``ÿ`` map to the
    character with the same code point. Every other byte (in ascending
    order) is assigned the next unused code point starting at 256, so all
    256 bytes end up with a distinct one-character string.

    Returns:
        A dict keyed by byte value whose values are one-character strings.
        Keys are inserted with the self-mapped ranges first, followed by the
        remapped bytes in ascending order.
    """
    self_mapped = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    byte_values = self_mapped[:]
    code_points = self_mapped[:]

    # Set lookup instead of scanning the growing list for each of the 256 bytes.
    already_mapped = set(self_mapped)
    shift = 0  # number of bytes remapped so far
    for byte in range(2**8):
        if byte not in already_mapped:
            byte_values.append(byte)
            # Remapped bytes get consecutive code points just above the byte range.
            code_points.append(2**8 + shift)
            shift += 1

    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}
|
66 |
+
|
67 |
|
68 |
def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
|
69 |
with open(tiktoken_bpe_file, "rb") as f:
|
|
|
133 |
box_end_tag='</box>',
|
134 |
quad_start_tag='<quad>',
|
135 |
quad_end_tag='</quad>',
|
136 |
+
unk_token="<|endoftext|>",
|
137 |
+
bos_token="<|endoftext|>",
|
138 |
+
eos_token="<|endoftext|>",
|
139 |
+
pad_token=None,
|
140 |
**kwargs,
|
141 |
):
|
142 |
+
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
|
143 |
+
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
144 |
+
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
|
145 |
+
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
146 |
+
super().__init__(errors=errors,
|
147 |
+
unk_token=unk_token,
|
148 |
+
bos_token=bos_token,
|
149 |
+
eos_token=eos_token,
|
150 |
+
pad_token=pad_token,
|
151 |
+
**kwargs)
|
152 |
self.image_start_tag = image_start_tag
|
153 |
self.image_end_tag = image_end_tag
|
154 |
self.image_pad_tag = image_pad_tag
|
|
|
167 |
)
|
168 |
|
169 |
self.errors = errors # how to handle errors in decoding
|
170 |
+
self.byte_encoder = bytes_to_unicode()
|
171 |
|
172 |
self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int]
|
173 |
self.special_tokens = {
|
174 |
token: index
|
175 |
for index, token in enumerate(
|
176 |
+
SPECIAL_TOKENS + self.IMAGE_ST, start=len(self.mergeable_ranks)-1
|
177 |
)
|
178 |
}
|
179 |
+
self.special_tokens[ENDOFTEXT] = 50256
|
180 |
+
# print(self.special_tokens)
|
181 |
self.img_start_id = self.special_tokens[self.image_start_tag]
|
182 |
self.img_end_id = self.special_tokens[self.image_end_tag]
|
183 |
self.img_pad_id = self.special_tokens[self.image_pad_tag]
|
|
|
195 |
special_tokens=self.special_tokens,
|
196 |
)
|
197 |
assert (
|
198 |
+
len(self.mergeable_ranks) + len(self.special_tokens) - 1 == enc.n_vocab # has a common word
|
199 |
+
), f"{len(self.mergeable_ranks) + len(self.special_tokens)} - 1 != {enc.n_vocab} in encoding"
|
200 |
|
201 |
self.decoder = {
|
202 |
v: k for k, v in self.mergeable_ranks.items()
|
|
|
204 |
self.decoder.update({v: k for k, v in self.special_tokens.items()})
|
205 |
|
206 |
self.tokenizer = enc # type: tiktoken.Encoding
|
207 |
+
|
|
|
208 |
self.im_start_id = self.special_tokens[IMSTART]
|
209 |
+
self.eod_id = self.im_start_id - 1
|
210 |
self.im_end_id = self.special_tokens[IMEND]
|
211 |
|
212 |
def __len__(self) -> int:
|
|
|
281 |
`List[bytes|str]`: The list of tokens.
|
282 |
"""
|
283 |
tokens = []
|
284 |
+
text = "".join([self.byte_encoder[b] for b in text.encode("utf-8")])
|
285 |
+
#text = text.replace(" ", self.byte_encoder[" ".encode("utf-8")[0]])
|
286 |
text = unicodedata.normalize("NFC", text)
|
287 |
+
#print("----after nfc------:", text)
|
288 |
|
289 |
# this implementation takes a detour: text -> token id -> token surface forms
|
290 |
for t in self.tokenizer.encode(
|
291 |
+
text, allowed_special=allowed_special, disallowed_special=disallowed_special ):
|
|
|
292 |
tokens.append(self.decoder[t])
|
293 |
|
294 |
def _encode_imgurl(img_tokens):
|
vocab.tiktoken
CHANGED
The diff for this file is too large to render.
See raw diff
|
|