tianxie-sf
committed on
Commit
·
b5bab0d
1
Parent(s):
029ee1b
add pad token and default eos token
Browse files - tokenization_xgen.py +14 -5
tokenization_xgen.py
CHANGED
@@ -25,7 +25,7 @@ MAX_MODEL_INPUT_SIZES = {
|
|
25 |
}
|
26 |
|
27 |
|
28 |
-
def tiktoken_tokenizer(base="gpt2", add_special=True):
|
29 |
if not add_special:
|
30 |
return tiktoken.get_encoding(base)
|
31 |
|
@@ -83,6 +83,9 @@ def tiktoken_tokenizer(base="gpt2", add_special=True):
|
|
83 |
special_tokens[sp] = idx
|
84 |
idx += 1
|
85 |
|
|
|
|
|
|
|
86 |
# In production, load the arguments directly instead of accessing private attributes
|
87 |
# See openai_public.py for examples of arguments for specific encodings
|
88 |
enc = tiktoken.Encoding(
|
@@ -112,19 +115,22 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
112 |
def __init__(
|
113 |
self,
|
114 |
pad_token=None,
|
|
|
115 |
add_eos_token=False,
|
116 |
add_special_tokens=True,
|
117 |
**kwargs,
|
118 |
):
|
119 |
-
|
|
|
120 |
super().__init__(
|
121 |
-
pad_token=
|
|
|
122 |
add_eos_token=add_eos_token,
|
123 |
add_special_tokens=add_special_tokens,
|
124 |
**kwargs,
|
125 |
)
|
126 |
self.add_eos_token = add_eos_token
|
127 |
-
self.encoder = tiktoken_tokenizer(base="gpt2", add_special=add_special_tokens)
|
128 |
|
129 |
@property
|
130 |
def vocab_size(self):
|
@@ -142,6 +148,9 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
142 |
|
143 |
def _convert_token_to_id(self, token):
|
144 |
"""Converts a token (str) in an id using the vocab."""
|
|
|
|
|
|
|
145 |
return token
|
146 |
|
147 |
def _convert_id_to_token(self, index):
|
@@ -216,4 +225,4 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
216 |
if token_ids_1 is not None:
|
217 |
output += [1] * len(token_ids_1 + eos_token_id)
|
218 |
|
219 |
-
return output
|
|
|
25 |
}
|
26 |
|
27 |
|
28 |
+
def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
|
29 |
if not add_special:
|
30 |
return tiktoken.get_encoding(base)
|
31 |
|
|
|
83 |
special_tokens[sp] = idx
|
84 |
idx += 1
|
85 |
|
86 |
+
if pad_token and pad_token not in tokenizer._special_tokens and pad_token not in special_tokens:
|
87 |
+
special_tokens[pad_token] = idx
|
88 |
+
idx += 1
|
89 |
# In production, load the arguments directly instead of accessing private attributes
|
90 |
# See openai_public.py for examples of arguments for specific encodings
|
91 |
enc = tiktoken.Encoding(
|
|
|
115 |
def __init__(
|
116 |
self,
|
117 |
pad_token=None,
|
118 |
+
eos_token="<|endoftext|>",
|
119 |
add_eos_token=False,
|
120 |
add_special_tokens=True,
|
121 |
**kwargs,
|
122 |
):
|
123 |
+
pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
124 |
+
eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
125 |
super().__init__(
|
126 |
+
pad_token=pad_token_added,
|
127 |
+
eos_token=eos_token_added,
|
128 |
add_eos_token=add_eos_token,
|
129 |
add_special_tokens=add_special_tokens,
|
130 |
**kwargs,
|
131 |
)
|
132 |
self.add_eos_token = add_eos_token
|
133 |
+
self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
|
134 |
|
135 |
@property
|
136 |
def vocab_size(self):
|
|
|
148 |
|
149 |
def _convert_token_to_id(self, token):
|
150 |
"""Converts a token (str) in an id using the vocab."""
|
151 |
+
if isinstance(token, str):
|
152 |
+
ids = self._tokenize(token)
|
153 |
+
return ids[0]
|
154 |
return token
|
155 |
|
156 |
def _convert_id_to_token(self, index):
|
|
|
225 |
if token_ids_1 is not None:
|
226 |
output += [1] * len(token_ids_1 + eos_token_id)
|
227 |
|
228 |
+
return output
|