add chat_template (#4)
Browse files
- add chat_template (373b65c403f3cfc50e2d8f0a3b0b0353078cd005)
Co-authored-by: one dozon <[email protected]>
- README.md +1 -1
- tokenization_chatglm.py +32 -7
- tokenizer_config.json +19 -0
README.md
CHANGED
@@ -4,7 +4,7 @@
|
|
4 |
```python
|
5 |
>>> from transformers import AutoTokenizer, AutoModel
|
6 |
>>> tokenizer = AutoTokenizer.from_pretrained("thu-coai/CharacterGLM-6B", trust_remote_code=True)
|
7 |
-
>>> model = AutoModel.from_pretrained("
|
8 |
>>> model = model.eval()
|
9 |
>>> session_meta = {'user_info': '我是陆星辰,是一个男性,是一位知名导演,也是苏梦远的合作导演。我擅长拍摄音乐题材的电影。苏梦远对我的态度是尊敬的,并视我为良师益友。', 'bot_info': '苏梦远,本名苏远心,是一位当红的国内女歌手及演员。在参加选秀节目后,凭借独特的嗓音及出众的舞台魅力迅速成名,进入娱乐圈。她外表美丽动人,但真正的魅力在于她的才华和勤奋。苏梦远是音乐学院毕业的优秀生,善于创作,拥有多首热门原创歌曲。除了音乐方面的成就,她还热衷于慈善事业,积极参加公益活动,用实际行动传递正能量。在工作中,她对待工作非常敬业,拍戏时总是全身心投入角色,赢得了业内人士的赞誉和粉丝的喜爱。虽然在娱乐圈,但她始终保持低调、谦逊的态度,深得同行尊重。在表达时,苏梦远喜欢使用“我们”和“一起”,强调团队精神。', 'bot_name': '苏梦远', 'user_name': '陆星辰'}
|
10 |
>>> response, history = model.chat(tokenizer, session_meta, "你好", history=[])
|
|
|
4 |
```python
|
5 |
>>> from transformers import AutoTokenizer, AutoModel
|
6 |
>>> tokenizer = AutoTokenizer.from_pretrained("thu-coai/CharacterGLM-6B", trust_remote_code=True)
|
7 |
+
>>> model = AutoModel.from_pretrained("thu-coai/CharacterGLM-6B", trust_remote_code=True, device='cuda')
|
8 |
>>> model = model.eval()
|
9 |
>>> session_meta = {'user_info': '我是陆星辰,是一个男性,是一位知名导演,也是苏梦远的合作导演。我擅长拍摄音乐题材的电影。苏梦远对我的态度是尊敬的,并视我为良师益友。', 'bot_info': '苏梦远,本名苏远心,是一位当红的国内女歌手及演员。在参加选秀节目后,凭借独特的嗓音及出众的舞台魅力迅速成名,进入娱乐圈。她外表美丽动人,但真正的魅力在于她的才华和勤奋。苏梦远是音乐学院毕业的优秀生,善于创作,拥有多首热门原创歌曲。除了音乐方面的成就,她还热衷于慈善事业,积极参加公益活动,用实际行动传递正能量。在工作中,她对待工作非常敬业,拍戏时总是全身心投入角色,赢得了业内人士的赞誉和粉丝的喜爱。虽然在娱乐圈,但她始终保持低调、谦逊的态度,深得同行尊重。在表达时,苏梦远喜欢使用“我们”和“一起”,强调团队精神。', 'bot_name': '苏梦远', 'user_name': '陆星辰'}
|
10 |
>>> response, history = model.chat(tokenizer, session_meta, "你好", history=[])
|
tokenization_chatglm.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import os
|
2 |
-
import
|
3 |
from typing import List, Optional, Union, Dict
|
4 |
from sentencepiece import SentencePieceProcessor
|
5 |
from transformers import PreTrainedTokenizer
|
@@ -27,9 +27,22 @@ class SPTokenizer:
|
|
27 |
self.special_tokens[token] = self.n_words
|
28 |
self.index_special_tokens[self.n_words] = token
|
29 |
self.n_words += 1
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
|
35 |
assert type(s) is str
|
@@ -41,7 +54,18 @@ class SPTokenizer:
|
|
41 |
return t
|
42 |
|
43 |
def decode(self, t: List[int]) -> str:
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
def decode_tokens(self, tokens: List[str]) -> str:
|
47 |
text = self.sp_model.DecodePieces(tokens)
|
@@ -65,7 +89,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
65 |
|
66 |
model_input_names = ["input_ids", "attention_mask", "position_ids"]
|
67 |
|
68 |
-
def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
|
69 |
self.name = "GLMTokenizer"
|
70 |
|
71 |
self.vocab_file = vocab_file
|
@@ -75,6 +99,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
75 |
"<eos>": self.tokenizer.eos_id,
|
76 |
"<pad>": self.tokenizer.pad_id
|
77 |
}
|
|
|
78 |
super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
|
79 |
|
80 |
def get_command(self, token):
|
@@ -110,7 +135,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
110 |
return vocab
|
111 |
|
112 |
def _tokenize(self, text, **kwargs):
|
113 |
-
return self.tokenizer.tokenize(text)
|
114 |
|
115 |
def _convert_token_to_id(self, token):
|
116 |
""" Converts a token (str) in an id using the vocab. """
|
|
|
1 |
import os
|
2 |
+
import re
|
3 |
from typing import List, Optional, Union, Dict
|
4 |
from sentencepiece import SentencePieceProcessor
|
5 |
from transformers import PreTrainedTokenizer
|
|
|
27 |
self.special_tokens[token] = self.n_words
|
28 |
self.index_special_tokens[self.n_words] = token
|
29 |
self.n_words += 1
|
30 |
+
self.role_special_token_expression = "|".join([re.escape(token) for token in special_tokens]) # for apply_chat_template
|
31 |
+
|
32 |
+
def tokenize(self, s: str, encode_special_tokens=False):
    """Split ``s`` into a list of string pieces.

    Args:
        s: Text to split.
        encode_special_tokens: When true, every substring of ``s`` that
            matches ``self.role_special_token_expression`` is emitted as a
            single piece, unchanged, and only the text between matches is
            passed to ``self.sp_model.EncodeAsPieces``. When false, the
            whole string is passed to ``EncodeAsPieces`` in one call.

    Returns:
        The pieces in the order they occur in ``s``.
    """
    pattern = self.role_special_token_expression
    # The pattern is a "|".join over the special tokens, so it is the empty
    # string when there are none. An empty regex matches at every position,
    # which would split the text per character and emit "" pieces; fall back
    # to plain SentencePiece splitting in that case.
    if not encode_special_tokens or not pattern:
        return self.sp_model.EncodeAsPieces(s)

    pieces = []
    cursor = 0  # index just past the last special token consumed
    for match in re.finditer(pattern, s):
        if cursor < match.start():
            # Ordinary text between two special tokens.
            pieces.extend(self.sp_model.EncodeAsPieces(s[cursor:match.start()]))
        # The special token itself is kept verbatim as one piece.
        pieces.append(match.group())
        cursor = match.end()
    if cursor < len(s):
        # Trailing ordinary text after the last special token.
        pieces.extend(self.sp_model.EncodeAsPieces(s[cursor:]))
    return pieces
|
46 |
|
47 |
def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
|
48 |
assert type(s) is str
|
|
|
54 |
return t
|
55 |
|
56 |
def decode(self, t: List[int]) -> str:
    """Convert a sequence of token ids back into text.

    Ids that are keys of ``self.index_special_tokens`` are rendered as the
    token string they map to. Each maximal run of other ids is decoded with
    a single call to ``self.sp_model.decode``.

    Args:
        t: Token ids to decode.

    Returns:
        The decoded text; an empty string when ``t`` is empty.
    """
    parts = []   # decoded fragments, joined once at the end
    buffer = []  # pending run of ids that are not special tokens
    for token in t:
        if token in self.index_special_tokens:
            if buffer:
                # Flush the ordinary ids collected before this special token.
                parts.append(self.sp_model.decode(buffer))
                buffer = []
            parts.append(self.index_special_tokens[token])
        else:
            buffer.append(token)
    if buffer:
        parts.append(self.sp_model.decode(buffer))
    # Join once instead of growing a string with += on every iteration.
    return "".join(parts)
|
69 |
|
70 |
def decode_tokens(self, tokens: List[str]) -> str:
|
71 |
text = self.sp_model.DecodePieces(tokens)
|
|
|
89 |
|
90 |
model_input_names = ["input_ids", "attention_mask", "position_ids"]
|
91 |
|
92 |
+
def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False, **kwargs):
|
93 |
self.name = "GLMTokenizer"
|
94 |
|
95 |
self.vocab_file = vocab_file
|
|
|
99 |
"<eos>": self.tokenizer.eos_id,
|
100 |
"<pad>": self.tokenizer.pad_id
|
101 |
}
|
102 |
+
self.encode_special_tokens = encode_special_tokens
|
103 |
super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
|
104 |
|
105 |
def get_command(self, token):
|
|
|
135 |
return vocab
|
136 |
|
137 |
def _tokenize(self, text, **kwargs):
|
138 |
+
return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)
|
139 |
|
140 |
def _convert_token_to_id(self, token):
|
141 |
""" Converts a token (str) in an id using the vocab. """
|
tokenizer_config.json
CHANGED
@@ -1,10 +1,29 @@
|
|
1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
"auto_map": {
|
3 |
"AutoTokenizer": [
|
4 |
"tokenization_chatglm.ChatGLMTokenizer",
|
5 |
null
|
6 |
]
|
7 |
},
|
|
|
8 |
"clean_up_tokenization_spaces": true,
|
9 |
"do_lower_case": false,
|
10 |
"model_max_length": 1000000000000000019884624838656,
|
|
|
1 |
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"64790": {
|
4 |
+
"content": "[gMASK]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": true,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": false
|
10 |
+
},
|
11 |
+
"64792": {
|
12 |
+
"content": "sop",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": true,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": false
|
18 |
+
}
|
19 |
+
},
|
20 |
"auto_map": {
|
21 |
"AutoTokenizer": [
|
22 |
"tokenization_chatglm.ChatGLMTokenizer",
|
23 |
null
|
24 |
]
|
25 |
},
|
26 |
+
"chat_template": "{% set ns = namespace() %}[gMASK]sop{% for message in messages %}{% if loop.first %}{% set ns.bot_name = message['bot_name'] %}{% set ns.user_name = message['user_name'] %}以下是一段{{ message['bot_name'] }}和{{ message['user_name'] }}之间的对话。{%+ if message['bot_profile'] is defined and message['bot_profile']|length +%}\n关于{{ message['bot_name'] }}的信息:{{ message['bot_profile']|replace('\n', ' ') }}{% endif %}{%+ if message['user_profile'] is defined and message['user_profile']|length +%}\n关于{{ message['user_name'] }}的信息:{{ message['user_profile']|replace('\n', ' ') }}{% endif %}{%+ else +%}\n[{% if message['role'] == 'user' %}{{ ns.user_name }}{% else %}{{ ns.bot_name }}{% endif %}]{{ message['content']|replace('\n', ' ') }}{% endif %}{% endfor %}{%+ if add_generation_prompt +%}\n[{{ ns.bot_name }}]{% endif %}",
|
27 |
"clean_up_tokenization_spaces": true,
|
28 |
"do_lower_case": false,
|
29 |
"model_max_length": 1000000000000000019884624838656,
|