lewiswu1209 committed · Commit f3c6b77 · 1 Parent(s): a0ed808

Refactoring
Browse files
- .gitattributes +0 -4
- README.md +4 -4
- app.py +67 -14
- bot/chatbot.py +109 -0
- bot/config.py +21 -0
- bot/interface.py +0 -48
- bot/simctgdialogue.py +0 -177
- bot/skills/couplet.py +16 -0
- bot/skills/delete_memory.py +11 -0
- bot/skills/give_role.py +14 -0
- bot/skills/poem.py +15 -0
- bot/utlis.py +0 -174
- data/.gitkeep +0 -0
- data_parallel.py +100 -0
- dataset.py +21 -0
- preprocess.py +105 -0
- pytorchtools.py +53 -0
- requirements.txt +2 -18
- templates/chat_template.html +240 -0
- train.py +432 -0
- web.py +151 -0
.gitattributes CHANGED

@@ -9,13 +9,9 @@
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
README.md CHANGED

@@ -1,10 +1,10 @@
 ---
 title: Winnie
-emoji:
-colorFrom:
-colorTo:
+emoji: ⚡
+colorFrom: pink
+colorTo: gray
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.0.24
 app_file: app.py
 pinned: false
 license: mit
app.py CHANGED

@@ -1,25 +1,78 @@
[Only fragments of the pre-refactor side survive in the source ("from bot.…", "response = bot.chat(history)", "history.append(response)", …); the new version is reconstructed below.]

+import os
+
 import gradio as gr
 
+from bot.chatbot import ChatBot
+from bot.config import special_token_list
+
+bot:ChatBot = None
 
+def get_skill_list() -> list:
+    path:str = os.path.split( os.path.realpath(__file__) )[0]
+    file_list:list[str] = os.listdir( path + "/bot/skills/" )
+    plugin_list:list[str] = []
+    for file in file_list:
+        if file.endswith(".py"):
+            plugin_list.append( file[:-3] )
+    return plugin_list
 
+def general(input_txt:str, state:dict = {}):
     global bot
 
+    history_list:list = state.get("history", [])
+    role_card:dict[str, str] = state.get("role_card", {
+        "<NAME>": "Winnie",
+        "<GENDER>": "女",
+        "<YEAROFBIRTH>":"1995",
+        "<MONTHOFBIRTH>":"5",
+        "<DAYOFBIRTH>":"6",
+        "<ZODIAC>":"金牛座",
+        "<AGE>":"27"
+        }
+    )
 
+    output_txt:str = None
 
+    for skill_name in get_skill_list():
+        if output_txt is None:
+            plugin = __import__("bot.skills."+skill_name, fromlist=[skill_name])
+            plugin_class = getattr(plugin, "Skill")
+            p = plugin_class()
+            output_txt, history_list, role_card = p.process(input_txt, history_list, role_card)
 
+    if output_txt is None:
+        res, history_list = bot.chat(input_txt, history_list, role_card=role_card)
+        output_txt = "".join(res)
+
+    state["history"] = history_list
+    state["role_card"] = role_card
+
+    return output_txt, state
+
+def main() -> None:
+    global bot
+
+    bot = ChatBot.get_chat_bot("lewiswu1209/Winnie", special_token_list=special_token_list)
+
+    title:str = "使用中文和Winnie聊天"
+
+    description:str = "输入任意文字,Winnie会和你对话<br>"
+    description += "输入ERASE MEMORY,会清空Winnie的记忆<br>"
+    description += "输入\"<TAG>=<VALUE>\",可以修改Winnie的角色信息<br>"
+    description += "例如:<NAME>=Vicky,会修改Winnie的名字<br>"
+    description += "可以修改的角色信息有:<br>"
+    description += "<NAME>, <GENDER>, <YEAROFBIRTH>, <MONTHOFBIRTH>, <DAYOFBIRTH>, <ZODIAC>, <AGE><br>"
+    description += "输入“上联:XXXXXXX”,Winnie会和你对对联<br>"
+    description += "输入“写诗:XXXXXXX”,Winnie会以XXXXXXX为开头写诗"
+
+    gr.Interface(
+        fn = general,
+        title = title,
+        description = description,
+        inputs = ["text", "state"],
+        outputs = ["text", "state"]
 ).launch()
+
+if __name__ == "__main__":
+    main()
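The refactored app.py turns bot/skills/ into a drop-in plugin directory: get_skill_list() enumerates the modules and general() asks each one in turn, falling back to the model only when every skill returns None. A minimal, self-contained sketch of that contract (EchoSkill and dispatch are illustrative names, not part of the commit):

# Minimal sketch of the skill-dispatch contract used by general() above.
class EchoSkill:
    """A toy skill: handles input starting with 'echo:' and passes on everything else."""
    def process(self, input_txt, history_list, role_card):
        output_txt = None                      # None means "not handled, try the next skill"
        if input_txt.startswith("echo:"):
            output_txt = input_txt[5:]
        return output_txt, history_list, role_card

def dispatch(input_txt, skills):
    history_list, role_card = [], {}
    output_txt = None
    for skill in skills:                       # first skill that returns non-None wins
        if output_txt is None:
            output_txt, history_list, role_card = skill.process(input_txt, history_list, role_card)
    return output_txt                          # None here would fall through to bot.chat()

print(dispatch("echo:hello", [EchoSkill()]))     # -> hello
print(dispatch("anything else", [EchoSkill()]))  # -> None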
bot/chatbot.py ADDED

@@ -0,0 +1,109 @@

from tokenize import tokenize
import torch
import torch.nn.functional as F

from transformers import BertTokenizer, GPT2LMHeadModel

def replace_special_tokens(tokens:list, special_token_dict:dict)->list:
    replaced_tokens:list = []

    for token in tokens:
        if token in special_token_dict.keys():
            replaced_tokens.append( special_token_dict[token] )
        else:
            replaced_tokens.append( token )

    return replaced_tokens

def top_k_top_p_filtering(logits, top_k:int=0, top_p:float=0.0, filter_value:float=-float('Inf')):
    top_k = min( top_k, logits.size(-1) )

    if top_k > 0:
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value

    return logits

class ChatBot():

    def get_chat_bot(checkpoint:str, vocab_path:str = None, special_token_list:list = [])->object:
        tokenizer = ChatBot.get_tokenizer(checkpoint, vocab_path, special_token_list)
        model = GPT2LMHeadModel.from_pretrained(checkpoint)

        return ChatBot(tokenizer, model)

    def get_tokenizer(checkpoint:str, vocab_path:str = None, special_token_list:list = [])->object:
        if vocab_path is None:
            tokenizer = BertTokenizer.from_pretrained(checkpoint, sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]")
        else:
            tokenizer = BertTokenizer(vocab_path, sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]")
        tokenizer.add_special_tokens( {'additional_special_tokens':special_token_list} )

        return tokenizer

    def __init__(self, tokenizer:object, model:object)->None:
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = tokenizer
        self.model = model
        self.model.to(self.device)

    def convert_ids_to_tokens(self, ids:list):
        return self.tokenizer.convert_ids_to_tokens(ids)

    def convert_ids_to_text(self, ids):
        return "".join( self.convert_ids_to_tokens(ids) )

    def convert_text_to_ids(self, text, add_special_tokens=False):
        return self.tokenizer.encode(text, add_special_tokens=add_special_tokens)

    def get_prediction(self, input_tensor, input_ids, repetition_penalty, temperature, top_k, top_p):
        self.model.eval()

        generated_ids = []
        for _ in range(64):
            output_pt = self.model(input_tensor)

            next_token_logits = output_pt.logits[0, -1, :]
            for id in set(input_ids):
                if id != self.tokenizer.sep_token_id:
                    next_token_logits[id] /= repetition_penalty
            for id in set(generated_ids):
                next_token_logits[id] /= repetition_penalty
            next_token_logits = next_token_logits / temperature
            next_token_logits[self.tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf')
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            next_token = torch.multinomial( F.softmax(filtered_logits, dim=-1), num_samples=1 )

            if next_token == self.tokenizer.sep_token_id:
                break

            input_tensor = torch.cat( (input_tensor, next_token.unsqueeze(0)), dim=1 )
            generated_ids.append( next_token.item() )

        return generated_ids

    def chat(self:object, text:str, history:list, role_card:dict={}) -> str:
        text_ids = self.tokenizer.encode(text, add_special_tokens=False)
        history.append(text_ids)
        input_ids = [self.tokenizer.cls_token_id]
        for history_utr in history[-50:]:
            input_ids.extend(history_utr)
            input_ids.append(self.tokenizer.sep_token_id)
        input_tensor = torch.tensor(input_ids).to(self.device).unsqueeze(0)
        generated_ids = self.get_prediction(input_tensor, input_ids, repetition_penalty=1.2, temperature=0.73, top_k=10, top_p=0.7)

        history.append(generated_ids)

        generated_tokens = replace_special_tokens( self.convert_ids_to_tokens(generated_ids), role_card )

        return "".join(generated_tokens), history
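get_prediction() samples each token from logits filtered by top_k_top_p_filtering(). A tiny numeric illustration of the nucleus (top-p) branch, runnable on its own with made-up logits:

# The same top-p steps as in top_k_top_p_filtering above, unrolled on a toy vector.
import torch
import torch.nn.functional as F

logits = torch.tensor([3.0, 2.0, 1.0, 0.0])
sorted_logits, sorted_indices = torch.sort(logits, descending=True)
cum = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)  # ~[0.64, 0.88, 0.97, 1.0]
remove = cum > 0.7                                # mark tokens past the 0.7 probability mass
remove[..., 1:] = remove[..., :-1].clone()        # shift right so the boundary token stays
remove[..., 0] = 0                                # always keep the single most likely token
logits[sorted_indices[remove]] = -float('Inf')
print(logits)                                     # tensor([3., 2., -inf, -inf])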
bot/config.py ADDED

@@ -0,0 +1,21 @@
special_token_list:list = [
    '<NAME>',
    '<GENDER>',
    '<YEAROFBIRTH>',
    '<MONTHOFBIRTH>',
    '<DAYOFBIRTH>',
    '<ZODIAC>',
    '<AGE>',
    '<CMD>',
    '<NICK>',
    '<HEIGHT>',
    '<WEIGHT>',
    '<WORK>',
    '<HOBBY>',
    '<HOMETOWN>',
    '<CITY>',
    '<BUST>',
    '<WAIST>',
    '<HIP>',
    '<CUP>'
]
bot/interface.py DELETED

@@ -1,48 +0,0 @@

from random import choice
from random import randint
from random import uniform

from bot.simctgdialogue import SimCTGDialogue

class Chatbot():
    def __init__(self):
        self.model = SimCTGDialogue("cambridgeltl/simctg_lccc_dialogue", [])
        self.tokenizer = self.model.tokenizer
        self.model.eval()

    def __contrastive_search(self, context_list):
        print("__contrastive_search")
        print(context_list)
        beam_width, alpha, decoding_len = randint(1, 8), uniform(0.10, 0.40), 64
        return self.model.contrastive_search(context_list, beam_width, alpha, decoding_len)

    def __diverse_contrastive_search(self, context_list):
        print("__diverse_contrastive_search")
        print(context_list)
        sample_step, nucleus_p = 1, uniform(0.10, 0.40)
        beam_width, alpha, decoding_len = randint(1, 5), uniform(0.10, 0.40), 64
        return self.model.diverse_contrastive_search(context_list, sample_step, nucleus_p, beam_width, alpha, decoding_len)

    def __greedy_search(self, context_list):
        print("__greedy_search")
        print(context_list)
        decoding_len = 64
        return self.model.greedy_search(context_list, decoding_len)

    def __beam_search(self, context_list):
        print("__beam_search")
        print(context_list)
        beam_width, decoding_len = randint(1, 9), 64
        return self.model.beam_search(context_list, beam_width, decoding_len)

    def chat(self, prefix = []):
        methods_for_sort_dialogue = [self.__contrastive_search, self.__greedy_search]
        methods_for_long_dialogue = [self.__beam_search, self.__diverse_contrastive_search, self.__greedy_search, self.__contrastive_search]

        if ( len(prefix) < 4 ):
            response = choice(methods_for_sort_dialogue)(prefix)
        else:
            response = choice(methods_for_long_dialogue)(prefix)

        return response
bot/simctgdialogue.py DELETED

@@ -1,177 +0,0 @@

import torch

from torch import nn

class SimCTGDialogue(nn.Module):
    def __init__(self, model_name, additional_special_tokens):
        super(SimCTGDialogue, self).__init__()
        from transformers import AutoTokenizer, GPT2LMHeadModel
        eos_token = '[SEP]'
        pad_token = '[PAD]'
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, additional_special_tokens=additional_special_tokens)
        self.vocab_size = len(self.tokenizer)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.embed_dim = self.model.config.hidden_size
        if pad_token in self.tokenizer.vocab:
            print ('PAD token exists.')
        else:
            print ('Add PAD token to the tokenizer.')
            print ('Original vocabulary size is {}'.format(len(self.tokenizer)))
            self.tokenizer.add_tokens([pad_token])
            print ('Vocabulary size after extension is {}'.format(len(self.tokenizer)))
            assert len(self.tokenizer.convert_tokens_to_ids([pad_token])) == 1
            self.model.resize_token_embeddings(len(self.tokenizer))
        self.pad_token_id = self.tokenizer.convert_tokens_to_ids([pad_token])[0]
        self.vocab_size = len(self.tokenizer)
        if 'e' in eos_token:
            self.eos_token = self.tokenizer.eos_token
        else:
            self.eos_token = eos_token
        print (self.eos_token)

    def parse_dialogue_context(self, context_list, cuda_available=False, device=0):
        # context_list: a list of utterances in the dialogue session
        uttr_num = len(context_list)
        context_text = self.eos_token.join(context_list).strip(self.eos_token) + self.eos_token
        #print (context_text)
        tokens = self.tokenizer.tokenize(context_text)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_ids = input_ids
        input_ids = torch.LongTensor(input_ids).view(1,-1)
        if cuda_available:
            input_ids = input_ids.cuda(device)
        return input_ids, uttr_num

    def extract_response(self, output_ids, uttr_num):
        output_text = self.tokenizer.decode(output_ids)
        # extract response
        item_list = output_text.split(self.eos_token)
        response = item_list[uttr_num].strip()
        if self.eos_token == '<|endoftext|>': # English GPT
            response = ' '.join(response.split())
        else:
            response = ''.join(response.split())
        return response

    def contrastive_search(self, context_list, beam_width, alpha, decoding_len,
        cuda_available=False, device=0):
        input_ids, uttr_num = self.parse_dialogue_context(context_list,
            cuda_available=cuda_available, device=device)
        output = self.fast_contrastive_generation(input_ids, beam_width, alpha, decoding_len)
        return self.extract_response(output, uttr_num)

    def diverse_contrastive_search(self, context_list, sample_step, nucleus_p,
        beam_width, alpha, decoding_len, cuda_available=False, device=0):
        input_ids, uttr_num = self.parse_dialogue_context(context_list,
            cuda_available=cuda_available, device=device)
        output = self.diverse_contrastive_generation(input_ids, sample_step, nucleus_p,
            beam_width, alpha, decoding_len)
        return self.extract_response(output, uttr_num)

    def greedy_search(self, context_list, decoding_len, cuda_available=False, device=0):
        input_ids, uttr_num = self.parse_dialogue_context(context_list,
            cuda_available=cuda_available, device=device)
        output = self.greedy_generation(input_ids, decoding_len)
        return self.extract_response(output, uttr_num)

    def beam_search(self, context_list, beam_width, decoding_len,
        cuda_available=False, device=0):
        input_ids, uttr_num = self.parse_dialogue_context(context_list,
            cuda_available=cuda_available, device=device)
        output = self.beam_generation(input_ids, beam_width, decoding_len)
        return self.extract_response(output, uttr_num)

    def nucleus_sampling(self, context_list, nucleus_p, decoding_len,
        cuda_available=False, device=0):
        input_ids, uttr_num = self.parse_dialogue_context(context_list,
            cuda_available=cuda_available, device=device)
        output = self.nucleus_generation(input_ids, nucleus_p, decoding_len)
        return self.extract_response(output, uttr_num)

    def fast_contrastive_generation(self, input_ids, beam_width, alpha, decoding_len):
        '''
        input_ids: prefix input; 1 x prefix_len
        decoding_len: how many tokens to generate
        beam_width: size of candidate pool during decoding
        alpha: regulates importance of model confidence and degeneration penalty
        '''
        self.model.eval()
        from bot.utlis import ContrastiveDecodingOneStepFast
        # sanity check
        assert alpha >= 0. and alpha <= 1.0

        # fast mode
        batch_size, seqlen = input_ids.size()
        #generated = [[] for _ in range(batch_size)]
        generated = [item for item in input_ids.tolist()]
        past_key_values = None
        last_hidden_states = None
        logits = None
        for step in range(decoding_len):
            input_ids, past_key_values, last_hidden_states, logits = ContrastiveDecodingOneStepFast(
                self.model,
                input_ids,
                beam_width,
                alpha,
                past_key_values,
                last_hidden_states,
                self.tokenizer,
                logits,
                first_step=step == 0,
            )
            tokens = input_ids.squeeze(dim=-1).tolist()
            for idx, t in enumerate(tokens):
                generated[idx].append(t)
        return generated[0]

    def diverse_contrastive_generation(self, input_ids, sample_step, nucleus_p, beam_width, alpha, decoding_len):
        '''
        sample_step:
            number of steps to decode with nucleus sampling,
            for the remaining steps we use contrastive search
        decoding_len:
            the total number of generated tokens
        beam_width:
            size of candidate pool during decoding
        alpha:
            regulates importance of model confidence and degeneration penalty

        '''
        contrastive_step = decoding_len - sample_step
        _, prefix_len = input_ids.size()
        # first do sample
        input_ids = self.model.generate(
            input_ids,
            do_sample=True,
            max_length=prefix_len+sample_step,
            top_p=nucleus_p,
            top_k=0)
        # then do contrastive search
        output = self.fast_contrastive_generation(input_ids, beam_width, alpha, contrastive_step)
        return output

    def greedy_generation(self, input_ids, decoding_len):
        _, prefix_len = input_ids.size()
        output = self.model.generate(
            input_ids,
            max_length=prefix_len+decoding_len)
        return output[0]

    def beam_generation(self, input_ids, beam_width, decoding_len):
        _, prefix_len = input_ids.size()
        output = self.model.generate(
            input_ids,
            max_length=prefix_len+decoding_len,
            num_beams=beam_width)
        return output[0]

    def nucleus_generation(self, input_ids, nucleus_p, decoding_len):
        _, prefix_len = input_ids.size()
        output = self.model.generate(
            input_ids,
            do_sample=True,
            max_length=prefix_len+decoding_len,
            top_p=nucleus_p,
            top_k=0)
        return output[0]
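parse_dialogue_context() and extract_response() rely on a flat [SEP]-joined string format; a toy run of that round trip (plain string code, no model needed):

# Toy illustration of the [SEP]-joined context format the deleted class used.
eos_token = "[SEP]"
context_list = ["你好", "你好呀", "今天天气怎么样"]
context_text = eos_token.join(context_list).strip(eos_token) + eos_token
print(context_text)   # 你好[SEP]你好呀[SEP]今天天气怎么样[SEP]

# After generation, the model's continuation sits after the last [SEP]:
output_text = context_text + "挺 好 的" + eos_token
item_list = output_text.split(eos_token)
response = "".join(item_list[len(context_list)].split())  # Chinese branch: drop spaces
print(response)       # 挺好的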
bot/skills/couplet.py ADDED

@@ -0,0 +1,16 @@

import requests

class Skill:
    def __init__(self:object) -> None:
        pass

    def process(self:object, input_txt:str, history_list:list, role_card:dict):
        output_text:str = None
        if input_txt.startswith("上联:") or input_txt.startswith("上联:"):
            output_text = requests.post(
                url='https://hf.space/embed/lewiswu1209/gpt2-chinese-couplet/+/api/predict/',
                json={"data": [input_txt[3:]]}
            ).json()["data"][0]
            output_text = "我对下联:" + output_text
        return output_text, history_list, role_card
bot/skills/delete_memory.py ADDED

@@ -0,0 +1,11 @@

class Skill:
    def __init__(self:object) -> None:
        pass

    def process(self:object, input_txt:str, history_list:list, role_card:dict):
        output_txt:str = None
        if input_txt.upper()=="ERASE MEMORY":
            history_list = []
            output_txt = "我是谁?我在哪?我在干什么?"
        return output_txt, history_list, role_card
bot/skills/give_role.py ADDED

@@ -0,0 +1,14 @@

class Skill:
    def __init__(self:object) -> None:
        pass

    def process(self:object, input_txt:str, history_list:list, role_card:dict):
        output_txt:str = None
        for tag in role_card.keys():
            prefix:str = "{}=".format(tag)
            if input_txt.startswith( prefix ):
                role_card[tag]=input_txt[len(prefix):]
                output_txt = "已设置{}为{}".format(tag, role_card[tag])
                break
        return output_txt, history_list, role_card
bot/skills/poem.py ADDED

@@ -0,0 +1,15 @@

import requests

class Skill:
    def __init__(self:object) -> None:
        pass

    def process(self:object, input_txt:str, history_list:list, role_card:dict):
        output_text:str = None
        if input_txt.startswith("写诗:") or input_txt.startswith("写诗:"):
            output_text = requests.post(
                url='https://hf.space/embed/lewiswu1209/gpt2-chinese-poem/+/api/predict/',
                json={"data": [input_txt[3:]]}
            ).json()["data"][0]
        return output_text, history_list, role_card
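All four skills share one process() signature, so the dispatcher in app.py can treat them interchangeably. A hedged usage sketch with the memory-erasing skill (the couplet/poem skills would instead issue a real HTTP call to the linked Space, which may be offline):

# Run from the repo root so the bot package is importable.
from bot.skills.delete_memory import Skill

out, history, role = Skill().process("ERASE MEMORY", [[1, 2]], {})
print(out)      # 我是谁?我在哪?我在干什么?
print(history)  # [] -- the dialogue history has been cleared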
bot/utlis.py DELETED

@@ -1,174 +0,0 @@

import torch
import random
import torch.nn.functional as F

def ranking(context_hidden, next_hidden, next_top_k_ids, next_top_k_probs, alpha):
    '''
    context_hidden: beam_width x context_len x embed_dim
    next_hidden: beam_width x 1 x embed_dim
    next_top_k_ids: beam_width x 1
    '''
    beam_width, context_len, embed_dim = context_hidden.size()
    assert next_hidden.size() == torch.Size([beam_width, 1, embed_dim])
    norm_context_hidden = context_hidden / context_hidden.norm(dim=2, keepdim=True)
    norm_next_hidden = next_hidden / next_hidden.norm(dim=2, keepdim=True)
    cosine_matrix = torch.matmul(norm_context_hidden, norm_next_hidden.transpose(1,2)).squeeze(-1)
    assert cosine_matrix.size() == torch.Size([beam_width, context_len])
    scores, _ = torch.max(cosine_matrix, dim = -1)
    assert scores.size() == torch.Size([beam_width])
    next_top_k_probs = next_top_k_probs.view(-1)
    scores = (1.0 - alpha) * next_top_k_probs - alpha * scores
    _, selected_idx = torch.topk(scores, k = 1)
    assert selected_idx.size() == torch.Size([1])
    selected_idx = selected_idx.unsqueeze(0)
    assert selected_idx.size() == torch.Size([1,1])
    next_id = torch.gather(next_top_k_ids, dim = 0, index=selected_idx)
    assert next_id.size() == torch.Size([1,1])
    return next_id

def ContrastiveDecodingOneStep(model, input_ids, beam_width, alpha):
    '''
    model: the generation model, e.g., gpt2
    input_ids: 1 x seqlen
    '''
    prev_hidden_states, logits = model.compute_logits_and_hidden_states(input_ids)
    _, seqlen, embed_dim = prev_hidden_states.size()
    _, _, vocab_size = logits.size()
    p = random.uniform(0, 1)

    logit_for_next_step = logits[:,-1,:]
    assert logit_for_next_step.size() == torch.Size([1, vocab_size])

    next_probs = F.softmax(logit_for_next_step, dim = -1)
    assert next_probs.size() == logit_for_next_step.size()

    _, top_k_ids = torch.topk(logit_for_next_step, dim = -1, k = beam_width)
    assert top_k_ids.size() == torch.Size([1, beam_width])

    top_k_probs = torch.gather(next_probs, dim = 1, index=top_k_ids)

    assert top_k_probs.size() == top_k_ids.size()
    # compute new hidden
    expanded_context = [input_ids for _ in range(beam_width)]
    expanded_context = torch.cat(expanded_context, dim = 0)
    assert expanded_context.size() == torch.Size([beam_width, seqlen])
    top_k_ids = top_k_ids.view(beam_width, 1)
    next_input_ids = torch.cat([expanded_context, top_k_ids], dim = -1)
    assert next_input_ids.size() == torch.Size([beam_width, seqlen+1])
    new_hidden_states, next_logits = model.compute_logits_and_hidden_states(next_input_ids)
    assert new_hidden_states.size() == torch.Size([beam_width, seqlen+1, embed_dim])
    context_hidden = new_hidden_states[:,:seqlen,:]
    assert context_hidden.size() == torch.Size([beam_width, seqlen, embed_dim])
    next_hidden = new_hidden_states[:,seqlen:,:]
    assert next_hidden.size() == torch.Size([beam_width, 1, embed_dim])

    next_id = ranking(context_hidden, next_hidden, top_k_ids, top_k_probs, alpha)

    next_input_ids = torch.cat([input_ids, next_id], dim = -1)
    assert next_input_ids.size() == torch.Size([1, seqlen+1])
    return next_input_ids

# ========== batch version ========= #
def ranking_fast(context_hidden, next_hidden, next_top_k_probs, alpha, beam_width):
    '''
    context_hidden: bsz*beam x seqlen x embed_dim
    next_hidden: bsz*beam x 1 x embed_dim
    next_top_k_probs: bsz x beam
    '''
    _, context_len, embed_dim = context_hidden.size()
    norm_context_hidden = context_hidden / context_hidden.norm(dim=2, keepdim=True)
    norm_next_hidden = next_hidden / next_hidden.norm(dim=2, keepdim=True)
    cosine_matrix = torch.matmul(norm_context_hidden, norm_next_hidden.transpose(1,2)).squeeze(-1)  # [B*K, S]
    scores, _ = torch.max(cosine_matrix, dim=-1)  # [B*K]
    next_top_k_probs = next_top_k_probs.view(-1)  # [B*K]
    scores = (1.0 - alpha) * next_top_k_probs - alpha * scores
    scores = torch.stack(torch.split(scores, beam_width))  # [B, K]
    selected_idx = scores.max(dim=-1)[1]  # [B]
    return selected_idx

def ContrastiveDecodingOneStepFast(
    model,
    ids,
    beam_width,
    alpha,
    past_key_values,
    last_hidden_states,
    vocab,
    logit_for_next_step,
    first_step=False,
    ):
    # input_ids: [B, S]
    if first_step:
        output = model(
            input_ids=ids,
            past_key_values=past_key_values,
            use_cache=True,
            output_hidden_states=True
        )
        past_key_values = output.past_key_values
        last_hidden_states = output.hidden_states[-1]  # [B, S, E]
        logit_for_next_step = output.logits[:, -1, :]  # [B, V]
    bsz, seqlen, embed_dim = last_hidden_states.size()
    p = random.uniform(0, 1)

    next_probs = F.softmax(logit_for_next_step, dim=-1)
    _, top_k_ids = torch.topk(logit_for_next_step, dim=-1, k=beam_width)  # [B, K]
    top_k_probs = torch.gather(next_probs, dim=1, index=top_k_ids)  # [B, K]
    # compute new hidden
    past_key_values = enlarge_past_key_values(past_key_values, beam_width)
    output = model(
        input_ids=top_k_ids.view(-1, 1),
        attention_mask=torch.ones_like(top_k_ids.view(-1, 1)),
        past_key_values=past_key_values,
        output_hidden_states=True,
        use_cache=True,
    )
    past_key_values = output.past_key_values
    logits = output.logits[:, -1, :]  # [B*K, V]
    next_hidden = output.hidden_states[-1]  # [B*K, 1, E]
    context_hidden = last_hidden_states.unsqueeze(1).expand(-1, beam_width, -1, -1).reshape(bsz*beam_width, seqlen, embed_dim)  # [B*K, S, E]

    selected_idx = ranking_fast(
        context_hidden,
        next_hidden,
        top_k_probs,  # [B, K]
        alpha,
        beam_width,
    )  # [B]
    # prepare for the next step
    next_id = top_k_ids[range(len(top_k_ids)), selected_idx].unsqueeze(-1)  # [B, 1]
    next_hidden = torch.stack(torch.split(next_hidden.squeeze(dim=1), beam_width))  # [B, K, E]
    next_hidden = next_hidden[range(bsz), selected_idx, :]  # [B, E]
    last_hidden_states = torch.cat([last_hidden_states, next_hidden.unsqueeze(1)], dim=1)  # [B, S, E]
    past_key_values = select_past_key_values(past_key_values, beam_width, selected_idx)
    logits = torch.stack(torch.split(logits, beam_width))[range(bsz), selected_idx, :]  # [B, V]
    # next_id: [B, 1]
    return next_id, past_key_values, last_hidden_states, logits

def enlarge_past_key_values(past_key_values, beam_width):
    # from [B, num_head, seq_len, esz] to [B*K, num_head, seq_len, esz]
    new_key_values = []
    for layer in past_key_values:
        items = []
        for item in layer:
            # item is the key and value matrix
            bsz, num_head, seq_len, esz = item.size()
            item = item.unsqueeze(1).expand(-1, beam_width, -1, -1, -1).reshape(bsz*beam_width, num_head, seq_len, esz)  # [bsz*beam, num_head, seq_len, esz]
            items.append(item)
        new_key_values.append(items)
    return new_key_values

def select_past_key_values(past_key_values, beam_width, selected_idx):
    '''select_idx: [B]'''
    new_key_values = []
    for layer in past_key_values:
        items = []
        for item in layer:
            bsz_and_beam, num_head, seq_len, esz = item.size()
            bsz = int(bsz_and_beam//beam_width)
            item = torch.stack(torch.split(item, beam_width, dim=0))  # [B, K, num_head, seq_len, esz]
            item = item[range(bsz), selected_idx, :, :, :]  # [B, num_head, seq_len, esz]
            items.append(item)
        new_key_values.append(items)
    return new_key_values
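ranking()/ranking_fast() implement the contrastive-search trade-off score = (1 - alpha) * model_confidence - alpha * max_cosine_to_context. A self-contained toy with fabricated numbers:

# Degeneration-penalty scoring from ranking()/ranking_fast(), on two candidates.
import torch

alpha = 0.6
next_top_k_probs = torch.tensor([0.5, 0.3])  # model confidence of each candidate token
max_cos = torch.tensor([0.9, 0.2])           # max cosine similarity to the context states;
                                             # candidate 0 largely echoes the context
scores = (1.0 - alpha) * next_top_k_probs - alpha * max_cos
print(scores)            # tensor([-0.3400,  0.0000])
print(scores.argmax())   # tensor(1) -> the less repetitive token wins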
data/.gitkeep ADDED
File without changes
data_parallel.py ADDED

@@ -0,0 +1,100 @@
from torch.nn.parallel import DataParallel
import torch
from torch.nn.parallel._functions import Scatter
from torch.nn.parallel.parallel_apply import parallel_apply


def scatter(inputs, target_gpus, chunk_sizes, dim=0):
    r"""
    Slices tensors into approximately equal chunks and
    distributes them across given GPUs. Duplicates
    references to objects that are not tensors.
    """
    def scatter_map(obj):
        if isinstance(obj, torch.Tensor):
            try:
                return Scatter.apply(target_gpus, chunk_sizes, dim, obj)
            except:
                print('obj', obj.size())
                print('dim', dim)
                print('chunk_sizes', chunk_sizes)
                quit()
        if isinstance(obj, tuple) and len(obj) > 0:
            return list(zip(*map(scatter_map, obj)))
        if isinstance(obj, list) and len(obj) > 0:
            return list(map(list, zip(*map(scatter_map, obj))))
        if isinstance(obj, dict) and len(obj) > 0:
            return list(map(type(obj), zip(*map(scatter_map, obj.items()))))
        return [obj for targets in target_gpus]

    # After scatter_map is called, a scatter_map cell will exist. This cell
    # has a reference to the actual function scatter_map, which has references
    # to a closure that has a reference to the scatter_map cell (because the
    # fn is recursive). To avoid this reference cycle, we set the function to
    # None, clearing the cell
    try:
        return scatter_map(inputs)
    finally:
        scatter_map = None


def scatter_kwargs(inputs, kwargs, target_gpus, chunk_sizes, dim=0):
    r"""Scatter with support for kwargs dictionary"""
    inputs = scatter(inputs, target_gpus, chunk_sizes, dim) if inputs else []
    kwargs = scatter(kwargs, target_gpus, chunk_sizes, dim) if kwargs else []
    if len(inputs) < len(kwargs):
        inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
    elif len(kwargs) < len(inputs):
        kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])
    inputs = tuple(inputs)
    kwargs = tuple(kwargs)
    return inputs, kwargs


class BalancedDataParallel(DataParallel):
    def __init__(self, gpu0_bsz, *args, **kwargs):
        self.gpu0_bsz = gpu0_bsz
        super().__init__(*args, **kwargs)

    def forward(self, *inputs, **kwargs):
        if not self.device_ids:
            return self.module(*inputs, **kwargs)
        if self.gpu0_bsz == 0:
            device_ids = self.device_ids[1:]
        else:
            device_ids = self.device_ids
        inputs, kwargs = self.scatter(inputs, kwargs, device_ids)
        # print('len(inputs)1: ', str(len(inputs)))
        # print('self.device_ids[:len(inputs)]', str(self.device_ids[:len(inputs)]))
        if len(self.device_ids) == 1:
            return self.module(*inputs[0], **kwargs[0])
        replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
        if self.gpu0_bsz == 0:
            replicas = replicas[1:]
        outputs = self.parallel_apply(replicas, device_ids, inputs, kwargs)
        return self.gather(outputs, self.output_device)

    def parallel_apply(self, replicas, device_ids, inputs, kwargs):
        return parallel_apply(replicas, inputs, kwargs, device_ids[:len(inputs)])

    def scatter(self, inputs, kwargs, device_ids):
        bsz = inputs[0].size(self.dim)
        num_dev = len(self.device_ids)
        gpu0_bsz = self.gpu0_bsz
        bsz_unit = (bsz - gpu0_bsz) // (num_dev - 1)
        if gpu0_bsz < bsz_unit:
            chunk_sizes = [gpu0_bsz] + [bsz_unit] * (num_dev - 1)
            delta = bsz - sum(chunk_sizes)
            for i in range(delta):
                chunk_sizes[i + 1] += 1
            if gpu0_bsz == 0:
                chunk_sizes = chunk_sizes[1:]
        else:
            return super().scatter(inputs, kwargs, device_ids)

        # print('bsz: ', bsz)
        # print('num_dev: ', num_dev)
        # print('gpu0_bsz: ', gpu0_bsz)
        # print('bsz_unit: ', bsz_unit)
        # print('chunk_sizes: ', chunk_sizes)
        return scatter_kwargs(inputs, kwargs, device_ids, chunk_sizes, dim=self.dim)
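BalancedDataParallel exists so the first GPU, which also holds optimizer state, can take a smaller slice of the batch. A hedged usage sketch (assumes at least two visible GPUs):

# gpu0_bsz caps the batch share routed to device 0.
import torch
import torch.nn as nn
from data_parallel import BalancedDataParallel

model = nn.Linear(16, 4)
if torch.cuda.device_count() >= 2:
    model = BalancedDataParallel(2, model.cuda(), dim=0)  # device 0 gets 2 samples
    x = torch.randn(10, 16).cuda()
    y = model(x)        # the remaining samples are split across the other GPUs
    print(y.shape)      # torch.Size([10, 4])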
dataset.py ADDED

@@ -0,0 +1,21 @@
from torch.utils.data import Dataset
import torch


class MyDataset(Dataset):
    """

    """

    def __init__(self, input_list, max_len):
        self.input_list = input_list
        self.max_len = max_len

    def __getitem__(self, index):
        input_ids = self.input_list[index]
        input_ids = input_ids[:self.max_len]
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        return input_ids

    def __len__(self):
        return len(self.input_list)
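MyDataset returns variable-length id tensors, so a DataLoader needs a padding collate. The repo's own collate presumably lives in train.py, which this page truncates; pad_sequence is a common stand-in:

# Minimal sketch of feeding MyDataset to a DataLoader.
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from dataset import MyDataset

def collate_fn(batch):
    # right-pad variable-length dialogues with 0 ([PAD]) to a common length
    return pad_sequence(batch, batch_first=True, padding_value=0)

data = MyDataset([[101, 7, 8, 102], [101, 9, 102]], max_len=300)
loader = DataLoader(data, batch_size=2, collate_fn=collate_fn)
print(next(iter(loader)))
# tensor([[101,   7,   8, 102],
#         [101,   9, 102,   0]])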
preprocess.py ADDED

@@ -0,0 +1,105 @@
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer
from transformers import BertTokenizerFast
import argparse
import pandas as pd
import pickle
import jieba.analyse
from tqdm import tqdm
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
import logging
import numpy as np
from chatbot.config import config


def create_logger(log_path):
    """
    Write logs both to the log file and to the console.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    formatter = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s')

    # Create a handler that writes to the log file
    file_handler = logging.FileHandler(
        filename=log_path)
    file_handler.setFormatter(formatter)
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)

    # Create a handler that writes to the console
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    console.setFormatter(formatter)
    logger.addHandler(console)

    return logger


def preprocess():
    """
    Tokenize the raw corpus, turning each dialogue into the form
    "[CLS]utterance1[SEP]utterance2[SEP]utterance3[SEP]".
    """
    # Set up arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--vocab_path', default='vocab/vocab.txt', type=str, required=False,
                        help='词表路径')
    parser.add_argument('--log_path', default='data/preprocess.log', type=str, required=False, help='训练日志存放位置')
    parser.add_argument('--train_path', default='data/train.txt', type=str, required=False, help='训练日志存放位置')
    parser.add_argument('--save_path', default='data/train.pkl', type=str, required=False, help='tokenize的训练数据集')
    args = parser.parse_args()

    # Initialize the logger
    logger = create_logger(args.log_path)

    # Initialize the tokenizer
    tokenizer = BertTokenizerFast(vocab_file=args.vocab_path, sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]")
    special_tokens = []
    for key in config["mask_token"].keys():
        special_tokens.append(key)
    tokenizer.add_special_tokens( {'additional_special_tokens':special_tokens} )
    sep_id = tokenizer.sep_token_id
    cls_id = tokenizer.cls_token_id
    logger.info("preprocessing data,data path:{}, save path:{}".format(args.train_path, args.save_path))

    # Read the training dataset
    with open(args.train_path, 'rb') as f:
        data = f.read().decode("utf-8")

    # Line endings differ between Linux and Windows
    if "\r\n" in data:
        train_data = data.split("\r\n\r\n")
    else:
        train_data = data.split("\n\n")
    logger.info("there are {} dialogue in dataset".format(len(train_data)))

    # Start tokenizing
    # Save every dialogue; each record has the form "[CLS]utterance1[SEP]utterance2[SEP]utterance3[SEP]"
    dialogue_len = []  # tokenized length of every dialogue, for median/mean statistics
    dialogue_list = []
    with open(args.save_path, "w", encoding="utf-8") as f:
        for index, dialogue in enumerate(tqdm(train_data)):
            if "\r\n" in data:
                utterances = dialogue.split("\r\n")
            else:
                utterances = dialogue.split("\n")

            input_ids = [cls_id]  # every dialogue starts with [CLS]
            for utterance in utterances:
                input_ids += tokenizer.encode(utterance, add_special_tokens=False)
                input_ids.append(sep_id)  # [SEP] after each utterance marks its end
            dialogue_len.append(len(input_ids))
            dialogue_list.append(input_ids)
    len_mean = np.mean(dialogue_len)
    len_median = np.median(dialogue_len)
    len_max = np.max(dialogue_len)
    with open(args.save_path, "wb") as f:
        pickle.dump(dialogue_list, f)
    logger.info("finish preprocessing data,the result is stored in {}".format(args.save_path))
    logger.info("mean of dialogue len:{},median of dialogue len:{},max len:{}".format(len_mean, len_median, len_max))


if __name__ == '__main__':
    preprocess()
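What one preprocessed record looks like, mirroring the loop above; bert-base-chinese is used here only as a stand-in for the repo's vocab/vocab.txt, so the exact ids will differ:

# Build and decode a single "[CLS]utt1[SEP]utt2[SEP]" record.
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")  # stand-in vocab
input_ids = [tokenizer.cls_token_id]
for utterance in ["你好", "你好呀"]:
    input_ids += tokenizer.encode(utterance, add_special_tokens=False)
    input_ids.append(tokenizer.sep_token_id)
print(tokenizer.decode(input_ids))  # [CLS] 你 好 [SEP] 你 好 呀 [SEP]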
pytorchtools.py ADDED

@@ -0,0 +1,53 @@
import numpy as np
import torch
from os.path import join
import os

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, save_path="."):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.save_path = save_path

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...')
        # save_path = join(self.save_path, "best_model")
        # if not os.path.exists(save_path):
        #     os.mkdir(save_path)
        # model_to_save = model.module if hasattr(model, 'module') else model
        # model_to_save.save_pretrained(save_path)
        self.val_loss_min = val_loss
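A sketch of wiring EarlyStopping into a validation loop (losses fabricated; model=None is fine while the checkpoint-saving lines above stay commented out):

from pytorchtools import EarlyStopping

early_stopping = EarlyStopping(patience=2, verbose=True)
for epoch, val_loss in enumerate([0.9, 0.7, 0.71, 0.72, 0.73]):
    early_stopping(val_loss, model=None)  # model is unused while saving is disabled
    if early_stopping.early_stop:
        print(f"stopping at epoch {epoch}")  # triggers once the loss stalls for 2 epochs
        break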
requirements.txt CHANGED

@@ -1,18 +1,2 @@
-
-
-sacrebleu==1.4.10
-six
-wheel
-progressbar
-sklearn
-torch==1.6.0
-torchvision==0.7.0
-transformers==4.7.0
-pyyaml
-nltk
-sentencepiece
-spacy
-gdown
-seaborn
-matplotlib
-pandas
+torch
+transformers
templates/chat_template.html ADDED

@@ -0,0 +1,240 @@
<html lang="zh">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>聊天机器人</title>
<style>
body {
padding:0;
margin:0;
background:-moz-linear-gradient(-45deg,#183850 0,#183850 25%,#192C46 50%,#22254C 75%,#22254C 100%);
background:-webkit-linear-gradient(-45deg,#183850 0,#183850 25%,#192C46 50%,#22254C 75%,#22254C 100%);
background-repeat:no-repeat;
background-attachment:fixed
}
::-webkit-scrollbar {
width:10px
}
::-webkit-scrollbar-track {
border-radius:10px;
background-color:rgba(25,147,147,0.1)
}
::-webkit-scrollbar-thumb {
border-radius:10px;
background-color:rgba(25,147,147,0.2)
}
.chat-thread {
margin:24px auto 0 auto;
padding:0 20px 0 0;
list-style:none;
overflow-y:scroll;
overflow-x:hidden
}
.chat-thread li {
position:relative;
clear:both;
display:inline-block;
padding:16px 40px 16px 20px;
margin:0 0 20px 0;
font:16px/20px "Noto Sans",sans-serif;
border-radius:10px;
background-color:rgba(25,147,147,0.2)
}
.chat-thread li:before {
position:absolute;
top:0;
width:50px;
height:50px;
border-radius:50px;
content:""
}
.chat-thread li:after {
position:absolute;
top:15px;
content:"";
width:0;
height:0;
border-top:15px solid rgba(25,147,147,0.2)
}
.chat-thread li:nth-child(odd) {
animation:show-chat-odd .15s 1 ease-in;
-moz-animation:show-chat-odd .15s 1 ease-in;
-webkit-animation:show-chat-odd .15s 1 ease-in;
float:right;
margin-right:80px;
color:#0AD5C1
}
.chat-thread li:nth-child(odd):before {
right:-80px;
background-image:url(data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEAYABgAAD/4QAiRXhpZgAATU0AKgAAAAgAAQESAAMAAAABAAEAAAAAAAD/2wBDAAIBAQIBAQICAgICAgICAwUDAwMDAwYEBAMFBwYHBwcGBwcICQsJCAgKCAcHCg0KCgsMDAwMBwkODw0MDgsMDAz/2wBDAQICAgMDAwYDAwYMCAcIDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAz/wAARCAAwADADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD9WfjH8RY/gz8G/FXi6SxuNUj8K6Pd6u9pbusctyIIWlKBm+VcheWbhRkngV/Ip+1z+1Rrv7ZH7RHij4i+I5IbO98Uag96LeKRpIrRGPyRqxwz7F2qGI6IAAAAo/rc/aa8ITfED9mH4keH7eSSG413wnq2nQyIwVo3mspo1YEggEFgQSCBX8o//BPf9k9P2s/jCuj3kzW1jHD58pUcsuR/PNVicRCjTdWeyNcJh516qpQ3Z4/B4wOht5Kw2l1GpKkqpQsPUHP69ah8Q6omtx281v50jIpMxcfNx/e+nY85Hev248D/APBAf4QeJNLjElncblKecwlkEijPODuwc+4r2+5/4IhfAPwZ4Cls7HwkvmX0IjaWR2eSNQQSQzE9eM4xXz8uJqDhzwi393+Z9J/qzXUuSc1+P+R+RX/BGX9rXVv2VP26fhnq0OuSafoOvapB4b8SLI5e3nsLuZY2Eirk/u3MUqnGQyDoM5/qKv7fAYHduXIr+XH9sH9jS7+CH/BQ7Q/h38OzfC417VtMtdFB/eNHPPcIkZTPQiQqeoHBr+pbWGzcTN+7ZixyyDap57DnA9BX0GDxEa1JVobSVz5zG0XRqulPeOhx/wC0s3iOf9nHxwPB+p3Gj+Ko9EuptKvYDtlguEjLoVPZiVwCMYJ6jqPwV/Yw+F+rfCU/FvxZ4V023/tJdaaCxN5ZStDHbeTHct+5g+fav2jACf3V9MV/QxC2VwQrA9mGQR6EV+fPxM/Z/tv2GPja81q1rd+GPG2pXeo2luEZTZoRCrQtuJyU+QAg4IxwOg8XPqdRUnUjqrWa6bpr9Ue/w5Uoyl7Cekua8X11TTV/ua+Z5b+zN/wUq+JkHiqz0jXvhTZtpNxcJpsmpWcWoW/7w+V8wjvIFBA82M5SQn5wBnBx137Sf/BQ/wCNHhj4qt4L8O/De1h0vfLBLq95pV3qUilVclhHCEjjXCOQ0suMqRjOAfV9YbwX4An8OXka+H9Bs9V1CDz7maSO2RlEilYkLY+ZmIIUdcH059Q8W6x4H+Ivi7XNWsX8PeIotNvZBBeWzR3Qhl3F3hYjO11ypK5yAwPevj1Wgpe0Ufd2tf8Ar8j7SWGly+zbbff57f079fI+J/AnwN8SePf+Ci37OPjfVGSx8RW9hqF1PNFYvYCVYrixXa0UpMke+0vL0AH5g+CAo6fqzqH+rYjjivF/2Z/Ctp8SfENx48uods2l3E2l6dFsG1V2Rs8mevJOMdMoD1Ax7JqDARt9K+34fo1I4ZSn12Xlv+rPgOIq1OWI5KfTd+e2/XRL8Sp4l8U6X4E8O3Wsa5qem6Lo+nrvur/ULqO1tbZfWSWQhEHuxFflH/wVU/4K7eA/iR+0X8Nfh78O9S0HxpodrbX1/rHiDT5TcRxXLKRHaQSLhGwtu8khG5W3whWBRgfyz/am/a8+JH7VOrW99468b+KvFqwP5kS6tfl4LUn/AJ4W64ggz38tQT615J4b1a+k+L1vdW5bzNLX7TFAnGUSMb1x/wBcw/4V7uZYJLCzUne6PHy3ESjioSj0aP2U1n40eIPiT4A023tdH8H+ItBkKtJb65bXlxGMfxbbYFmBGRxyPQ549x8CfF/XvDXwW8zWLXwlovh+0jCWVnpGnXlrsXAUbjckFvTPloWJHAr86vgn+1F48+A0EM3hO4WazviJI7e4G9Iy2CWUggr26HFWP25f2/8Axlr/AIANn4g1yO81O8zst7VRHHBJtIUYHzFgCSSTwucckZ/L44KpUaoU0tX8z9YlmFKnSdSa21uffv8AwR3/AOCr+l/Fzx18Ufhz401rQ9Hh07xNczeDLq4uEtYLmzUxW72gd8K0hlQzLli0hnkA4VVr9FtSDLvVlZWUcgjBFfyGaTdtb+DZLKQS3PmKwcSlW8yR+OmSPvkH2r7S/Ym/4LS/Fz9jfwvZeHG1mDxh4VsU2QaZr4e4S2jHGyGVSJYQvZFbyx12Hqf1qhlvJSjGD2SX3H4/iMX7SrKo18Tb+8//2Q==)
}
.chat-thread li:nth-child(odd):after {
border-right:15px solid transparent;
right:-15px
}
.chat-thread li:nth-child(even) {
animation:show-chat-even .15s 1 ease-in;
-moz-animation:show-chat-even .15s 1 ease-in;
-webkit-animation:show-chat-even .15s 1 ease-in;
float:left;
margin-left:80px;
color:#0EC879
}
.chat-thread li:nth-child(even):before {
left:-80px;
background-image:url(data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEAYABgAAD/4QAiRXhpZgAATU0AKgAAAAgAAQESAAMAAAABAAEAAAAAAAD/2wBDAAIBAQIBAQICAgICAgICAwUDAwMDAwYEBAMFBwYHBwcGBwcICQsJCAgKCAcHCg0KCgsMDAwMBwkODw0MDgsMDAz/2wBDAQICAgMDAwYDAwYMCAcIDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAz/wAARCAAwADADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD9aDlnwuSW6e1fiN/wcL/8FGNW+K/xsm+A/g/VZ7Pwn4PZR4ke3nKjWNQYbvKlI5MUK4+T+JzzwBX7SeOfGtv8M/AeveJrxo47Xw7plxqUrynCqIonk59iVA/Gv5Pzreq/G/4hXWrzLLda1401WXUJVLFnllupi4BPXChgPotZ4mdloehltHmlc0L/AMM3114JMOhxXVxbuB9peBA0l4wwcKoAYbSAAc7eRwvU7PxZ+FF/8Bf2g9f0f7RdWY0vU5re3ubiEtDKiNtDFSo+RuOVxxt+tfuF/wAE2f2IvDvwM+HmnRz6XY3msSIkl1czwK5LEdBuGML/APrr1z9r/wD4J1+Cf2svhRJZ6jZ20OqeU5tL2OELJAT0yQMkZPr714/1+cnotEfRSwVOLSb1fkfmT/wTA/4KL6t+xf8AE+w0zVtUvLj4Z+IriODXNIllaaHRZXIVb62VifLwSC+0gMhyRkZr9wt6OiSRyLNHIodJE+66sMhlPoRyMckdTwK/md+Ofwy179nv4n3XgvxJD5epaPKbMsVwLuBsiNwf4g3Sv2N/4Iaftmt+07+yzN4V1a6a48UfC+VNMnaRsvdWDA/ZpSepK4MZHUbQe4r0MHir+6eTmeDSXPH5/wCZof8ABe74sXvwz/4JgeP4dOby5vEyx6NNKr7TFbu4FwVH8R2Ky49OTivxg/4J4/BtdW8V3njfVpNasNB8HkSzXOkWouLyJ8Ar5SEH1Hbp1IFfrl/wcf6ZPrf/AATU1ZYGk8631SK7kZWO7yY1fzFb+8pTgk8nPtXwb/wRF/aA0X/hO7/w3NHEl5d20U95bOg8ssn7vzVGMYYYB59awzWUuT3ex0cPxg5LmP0A/YX/AGjo/Et60tr4h+JWp6d9sbSZ9L8aaZaW9w0yvEjzWlzbkpcRBpoUJBwGlVQSQceo/tfftRX/AOz98WtL0SXUPiRNDql6limm+D9Ctbnyy+8LLcTXBUQxsysN2dq7QGK7lLX/AIreMtI8J6j8PrWWS3tdP1TW7eWWST5YkEBUpk5xkeYyqAAdrNjjOfoZ9Q0vxFq9xNHHDcT6fdT2ouFwWUhyJEDDH8QIK+g75ryaWt0vI9rEbpvzPx//AOC9PwM/4Sn4I+DfjBp7a5JeRzLZzf2ppwsdQMMnzRmWIKCHVgRyDwcjjBPk/wDwQD+KzeD/APgoEtrAzJZ+P9Ams7iHcQi3EbBs49AwZuTkfKeARX3t/wAF8vij4V8JfsjGz8Sala2Md9dl7K1aJJLjVZ0UlYYwzDHJBLD7oFflj/wQxW/vP+Cl3wzt7WQx26T6hdSxDG3yTp03mbcYGSfLz3yvpXdhLp3OHGOM6fqmfuJ/wUA/Z/j/AGnP2SvHXhXy2kurrRrs2xHVX+zuAR745Hvkda/mj/Zf+OU/7LPx30PxZNBJPZ2f+i6rAv3pIX4kx6srZYfSv6lPjvDfXfw9utLsZJrdtXcWU8yY3RQyHbIRn+EpkZ49ODX8vvxE+CGteO/F3xRt/Cfh+61TTfh9eXlzq0thG08djZJcPGrFlG0IgwuSynI4zXr4qKl7rPDy/mS5o9Hc/aVfjL4u+NvhbwBqXw1l0PWtLup0uLt72zF4UUYMbLGeCpG9WHUEj3r61+A3iXx3aeDobjx1Joq7YlKfZrVrZ4lUAfvixIYgADd1OM9a/Mz/AIJRL4s8K+CvCb6HNNHJdW+yWCVN8TFF3BuowSpPQjoO/Nfon8bNfutL+HdxfeItWiVbG0aZwxEMAYLnkD+FevJ/GvjYS5JSR+hVpqeHjScVfe/XU/Gv/guj+3bZ/tp/tJr4f0GCT/hG/he8ul2M0h2i9vJCPtE/+6OFXPoTXgv/AATw/aGX9lL9sb4d+NpvMls9B1X7PqkSnEj2kwNvcbR/eEMjuP8AajUd66z9qH9jLxx4c8Bx/GCTRb+68O/EK7vNR0kWVu8lxDaRSH/TJ4wCVhkOWR8bSoznBFV/2OP+CaHjf/go1Lcat4DktdF0/Q7iO31vUdWtJ1023YjcRbSxqTPcBcFrcYYblLPGCCfo8PJygmkfJ1qfJN32P//Z)
}
.chat-thread li:nth-child(even):after {
border-left:15px solid transparent;
left:-15px
}
.chat-window {
position:fixed;
bottom:18px
}
.chat-window-message {
width:100%;
height:48px;
font:32px/48px "Noto Sans",sans-serif;
background:0;
color:#0AD5C1;
border:0;
border-bottom:1px solid rgba(25,147,147,0.2);
outline:0
}
@media all and (max-width:767px) {
.chat-thread {
width:90%;
height:90%
}
.chat-window {
left:5%;
width:90%
}
}
@media all and (min-width:768px) {
.chat-thread {
width:50%;
height:90%
}
.chat-window {
left:25%;
width:50%
}
}
@keyframes show-chat-even {
0% {
margin-left:-480px
}
100% {
margin-left:0
}
}
@-moz-keyframes show-chat-even {
0% {
margin-left:-480px
}
100% {
margin-left:0
}
}
@-webkit-keyframes show-chat-even {
0% {
margin-left:-480px
}
100% {
margin-left:0
}
}
@keyframes show-chat-odd {
0% {
margin-right:-480px
}
100% {
margin-right:0
}
}
@-moz-keyframes show-chat-odd {
0% {
margin-right:-480px
}
100% {
margin-right:0
}
}
@-webkit-keyframes show-chat-odd {
0% {
margin-right:-480px
}
100% {
margin-right:0
}
}
</style>
</head>
<body onload="loadhistory()">
<ul class="chat-thread">

</ul>
<div class="chat-window">
<input class="chat-window-message" name="chat-window-message" type="text" autocomplete="off" autofocus="" placeholder="对我说HELP,看看我能干什么~">
</div>
<script src="https://cdn.bootcdn.net/ajax/libs/jquery/2.1.3/jquery.min.js"></script>
<script>
var chat_window = document.querySelector(".chat-window");

chat_window.onkeydown=function(event){
    var e = event || window.event || arguments.callee.caller.arguments[0];
    if (e && e.keyCode == 13 ) {
        send_data();
    }
}

function send_data() {
    var chat_thread = document.querySelector(".chat-thread");
    var chat_window_message = document.querySelector(".chat-window-message");
    chat_window_message.disabled = true;
    var text = chat_window_message.value;
    var new_li_label = document.createElement("li"), new_li_text = document.createTextNode(text);
    new_li_label.appendChild(new_li_text);
    chat_thread.appendChild(new_li_label);
    chat_thread.scrollTop = chat_thread.scrollHeight;
    chat_window_message.value = "";
    document.title = "聊天机器人 ~ 对方正在输入…"
    $.getJSON("/chitchat/chat?text="+text, function(data){
        var new_li_label = document.createElement("li");
        data.forEach(function(item){
            var new_text = document.createTextNode(item);
            var new_span = document.createElement("span");
            new_span.appendChild(new_text);
            var new_br = document.createElement("br");
            new_li_label.appendChild(new_span);
            new_li_label.appendChild(new_br);
        });
        chat_thread.appendChild(new_li_label);
        chat_thread.scrollTop = chat_thread.scrollHeight;

        document.title = "聊天机器人"
        chat_window_message.disabled = false;
    });
}

function loadhistory() {

[page truncated in source: the remainder of chat_template.html and the train.py / web.py hunks are missing]
|
222 |
+
var chat_thread = document.querySelector(".chat-thread");
|
223 |
+
var chat_window_message = document.querySelector(".chat-window-message");
|
224 |
+
chat_window_message.disabled = true;
|
225 |
+
document.title = "聊天机器人 ~ 正在回忆…"
|
226 |
+
$.getJSON("/chitchat/history", function(data){
|
227 |
+
data.forEach(function(item) {
|
228 |
+
var new_li_label = document.createElement("li"),new_li_text = document.createTextNode(item);
|
229 |
+
new_li_label.appendChild(new_li_text);
|
230 |
+
chat_thread.appendChild(new_li_label);
|
231 |
+
});
|
232 |
+
chat_thread.scrollTop = chat_thread.scrollHeight;
|
233 |
+
|
234 |
+
chat_window_message.disabled = false;
|
235 |
+
document.title = "聊天机器人"
|
236 |
+
});
|
237 |
+
}
|
238 |
+
</script>
|
239 |
+
</body>
|
240 |
+
</html>
|
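
A note on the contract the template assumes, inferred from the two handlers above: both /chitchat/chat and /chitchat/history must return a JSON array of strings. send_data() renders each element of a reply as a <span> line inside one <li>, while loadhistory() renders each element as its own <li> bubble. A minimal stand-in server for exercising the template by itself could look like the sketch below (a hypothetical stub, not part of this commit; the real routes live in web.py further down):

# Hypothetical stub server for testing chat_template.html in isolation.
# It only honors the JSON-array-of-strings contract the page expects.
from flask import Flask, jsonify, render_template, request

app = Flask(__name__)

@app.route("/chitchat")
def page():
    return render_template("chat_template.html")

@app.route("/chitchat/chat")
def chat():
    # each string in the array becomes one <span> line in the reply bubble
    return jsonify(["you said: " + request.args.get("text", "")])

@app.route("/chitchat/history")
def history():
    # each string becomes its own <li> bubble on page load
    return jsonify([])

if __name__ == "__main__":
    app.run(port=8080)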
train.py
ADDED
@@ -0,0 +1,432 @@
+import argparse
+import math
+import time
+import torch
+import torch.nn.functional as F
+import torch.optim as optim
+import logging
+from datetime import datetime
+import os
+from torch.utils.data import Dataset, DataLoader
+from os.path import join, exists
+from torch.nn import CrossEntropyLoss
+from tqdm import tqdm
+from torch.nn import DataParallel
+import transformers
+import pickle
+import sys
+from pytorchtools import EarlyStopping
+from sklearn.model_selection import train_test_split
+from data_parallel import BalancedDataParallel
+from transformers import GPT2TokenizerFast, GPT2LMHeadModel, GPT2Config
+from transformers import BertTokenizerFast
+import pandas as pd
+import torch.nn.utils.rnn as rnn_utils
+import numpy as np
+from dataset import MyDataset
+from chatbot.config import config
+
+
+def set_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--device', default='3', type=str, required=False, help='which GPUs to use')
+    parser.add_argument('--no_cuda', action='store_true', help='train without a GPU')
+    parser.add_argument('--vocab_path', default='vocab/vocab.txt', type=str, required=False,
+                        help='path to the vocabulary file')
+    parser.add_argument('--model_config', default='config/config.json', type=str, required=False,
+                        help='model configuration file')
+    parser.add_argument('--train_path', default='data/train.pkl', type=str, required=False, help='path to the training set')
+    parser.add_argument('--max_len', default=150, type=int, required=False, help='maximum input length during training')
+
+    parser.add_argument('--log_path', default='data/train.log', type=str, required=False, help='where to store the training log')
+    parser.add_argument('--log', default=True, help='whether to write a log')
+    parser.add_argument('--ignore_index', default=-100, type=int, required=False, help='label tokens equal to ignore_index contribute no gradient')
+    # parser.add_argument('--input_len', default=200, type=int, required=False, help='input length')
+    parser.add_argument('--epochs', default=100, type=int, required=False, help='maximum number of training epochs')
+    parser.add_argument('--batch_size', default=4, type=int, required=False, help='training batch size')
+    parser.add_argument('--gpu0_bsz', default=10, type=int, required=False, help='batch size on GPU 0')
+    parser.add_argument('--lr', default=2.6e-5, type=float, required=False, help='learning rate')
+    parser.add_argument('--eps', default=1.0e-09, type=float, required=False, help='decay rate (AdamW epsilon)')
+    parser.add_argument('--log_step', default=1, type=int, required=False, help='report the loss every this many steps')
+    parser.add_argument('--gradient_accumulation_steps', default=4, type=int, required=False, help='gradient accumulation steps')
+    parser.add_argument('--max_grad_norm', default=2.0, type=float, required=False)
+    parser.add_argument('--save_model_path', default='model', type=str, required=False,
+                        help='model output path')
+    parser.add_argument('--pretrained_model', default='', type=str, required=False,
+                        help='path to a pretrained model')
+    # parser.add_argument('--seed', type=int, default=None, help='random seed, so that training is reproducible')
+    parser.add_argument('--num_workers', type=int, default=0, help='number of worker threads the dataloader uses')
+    parser.add_argument('--patience', type=int, default=0, help='patience for early stopping; 0 disables it. An early-stopped model does not necessarily generate better text.')
+    parser.add_argument('--warmup_steps', type=int, default=4000, help='number of warm-up steps')
+    # parser.add_argument('--label_smoothing', default=True, action='store_true', help='whether to apply label smoothing')
+    parser.add_argument('--val_num', type=int, default=8000, help='validation set size')
+    args = parser.parse_args()
+    return args
+
+
+def create_logger(args):
+    """
+    Send log output to both a log file and the console.
+    """
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.INFO)
+
+    formatter = logging.Formatter(
+        '%(asctime)s - %(levelname)s - %(message)s')
+
+    # handler that writes to the log file
+    file_handler = logging.FileHandler(
+        filename=args.log_path)
+    file_handler.setFormatter(formatter)
+    file_handler.setLevel(logging.INFO)
+    logger.addHandler(file_handler)
+
+    # handler that writes to the console
+    console = logging.StreamHandler()
+    console.setLevel(logging.DEBUG)
+    console.setFormatter(formatter)
+    logger.addHandler(console)
+
+    return logger
+
+
+def collate_fn(batch):
+    input_ids = rnn_utils.pad_sequence(batch, batch_first=True, padding_value=0)
+    labels = rnn_utils.pad_sequence(batch, batch_first=True, padding_value=-100)
+    return input_ids, labels
+
+
+# def padding_batch(data_list, pad_id):
+#     """
+#     Pad every item of data_list with pad_id up to the longest length in data_list
+#     :param data_list:
+#     :param pad_id:
+#     :return:
+#     """
+#     # find the maximum length in data_list
+#     max_len = 0
+#     for data in data_list:
+#         max_len = max_len if max_len > len(data) else len(data)
+#
+#     # pad the data
+#     new_data_list = []
+#     for data in data_list:
+#         new_data = data + [pad_id] * (max_len - len(data))
+#         new_data_list.append(new_data)
+#     return new_data_list
+
+
+def load_dataset(logger, args):
+    """
+    Load the training and validation sets.
+    """
+    logger.info("loading training dataset and validating dataset")
+    train_path = args.train_path
+
+    with open(train_path, "rb") as f:
+        input_list = pickle.load(f)
+
+    # split into training and validation sets
+    val_num = args.val_num
+    input_list_train = input_list[val_num:]
+    input_list_val = input_list[:val_num]
+    # test
+    # input_list_train = input_list_train[:24]
+    # input_list_val = input_list_val[:24]
+
+    train_dataset = MyDataset(input_list_train, args.max_len)
+    val_dataset = MyDataset(input_list_val, args.max_len)
+
+    return train_dataset, val_dataset
+
+
+def train_epoch(model, train_dataloader, optimizer, scheduler, logger,
+                epoch, args):
+    model.train()
+    device = args.device
+    # pad_id = args.pad_id
+    # sep_id = args.sep_id
+    ignore_index = args.ignore_index
+    epoch_start_time = datetime.now()
+    total_loss = 0  # accumulated loss over the whole epoch
+
+    # epoch_correct_num: number of correctly predicted tokens in this epoch
+    # epoch_total_num: total number of predicted tokens in this epoch
+    epoch_correct_num, epoch_total_num = 0, 0
+
+    for batch_idx, (input_ids, labels) in enumerate(train_dataloader):
+        # catch CUDA out-of-memory exceptions
+        try:
+            input_ids = input_ids.to(device)
+            labels = labels.to(device)
+            outputs = model.forward(input_ids, labels=labels)
+            logits = outputs.logits
+            loss = outputs.loss
+            loss = loss.mean()
+
+            # count correct and total predicted tokens in this batch
+            batch_correct_num, batch_total_num = calculate_acc(logits, labels, ignore_index=ignore_index)
+            # accumulate the epoch-level counts
+            epoch_correct_num += batch_correct_num
+            epoch_total_num += batch_total_num
+            # accuracy of this batch
+            batch_acc = batch_correct_num / batch_total_num
+
+            total_loss += loss.item()
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            loss.backward()
+            # gradient clipping
+            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+            # update parameters after accumulating gradients for the configured number of steps
+            if (batch_idx + 1) % args.gradient_accumulation_steps == 0:
+                # update parameters
+                optimizer.step()
+                # update the learning rate
+                scheduler.step()
+                # reset gradients
+                optimizer.zero_grad()
+
+            if (batch_idx + 1) % args.log_step == 0:
+                logger.info(
+                    "batch {} of epoch {}, loss {}, batch_acc {}, lr {}".format(
+                        batch_idx + 1, epoch + 1, loss.item() * args.gradient_accumulation_steps, batch_acc, scheduler.get_lr()))
+
+            del input_ids, outputs
+
+        except RuntimeError as exception:
+            if "out of memory" in str(exception):
+                logger.info("WARNING: ran out of memory")
+                if hasattr(torch.cuda, 'empty_cache'):
+                    torch.cuda.empty_cache()
+            else:
+                logger.info(str(exception))
+                raise exception
+
+    # log the mean loss and accuracy of this epoch
+    epoch_mean_loss = total_loss / len(train_dataloader)
+    epoch_mean_acc = epoch_correct_num / epoch_total_num
+    logger.info(
+        "epoch {}: loss {}, predict_acc {}".format(epoch + 1, epoch_mean_loss, epoch_mean_acc))
+
+    # save model
+    logger.info('saving model for epoch {}'.format(epoch + 1))
+    model_path = join(args.save_model_path, 'epoch{}'.format(epoch + 1))
+    if not os.path.exists(model_path):
+        os.mkdir(model_path)
+    model_to_save = model.module if hasattr(model, 'module') else model
+    model_to_save.save_pretrained(model_path)
+    logger.info('epoch {} finished'.format(epoch + 1))
+    epoch_finish_time = datetime.now()
+    logger.info('time for one epoch: {}'.format(epoch_finish_time - epoch_start_time))
+
+    return epoch_mean_loss
+
+
+def validate_epoch(model, validate_dataloader, logger, epoch, args):
+    logger.info("start validating")
+    model.eval()
+    device = args.device
+    # pad_id = args.pad_id
+    # sep_id = args.sep_id
+    ignore_index = args.ignore_index
+    epoch_start_time = datetime.now()
+    total_loss = 0
+    # catch CUDA out-of-memory exceptions
+    try:
+        with torch.no_grad():
+            for batch_idx, (input_ids, labels) in enumerate(validate_dataloader):
+                input_ids = input_ids.to(device)
+                labels = labels.to(device)
+                outputs = model.forward(input_ids, labels=labels)
+                logits = outputs.logits
+                loss = outputs.loss
+                loss = loss.mean()
+
+                total_loss += loss.item()
+                del input_ids, outputs
+
+            # log the mean loss of this epoch
+            epoch_mean_loss = total_loss / len(validate_dataloader)
+            logger.info(
+                "validate epoch {}: loss {}".format(epoch + 1, epoch_mean_loss))
+            epoch_finish_time = datetime.now()
+            logger.info('time for validating one epoch: {}'.format(epoch_finish_time - epoch_start_time))
+            return epoch_mean_loss
+    except RuntimeError as exception:
+        if "out of memory" in str(exception):
+            logger.info("WARNING: ran out of memory")
+            if hasattr(torch.cuda, 'empty_cache'):
+                torch.cuda.empty_cache()
+        else:
+            logger.info(str(exception))
+            raise exception
+
+
+def train(model, logger, train_dataset, validate_dataset, args):
+    train_dataloader = DataLoader(
+        train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn,
+        drop_last=True
+    )
+    validate_dataloader = DataLoader(validate_dataset, batch_size=args.batch_size, shuffle=True,
+                                     num_workers=args.num_workers, collate_fn=collate_fn, drop_last=True)
+    early_stopping = EarlyStopping(args.patience, verbose=True, save_path=args.save_model_path)
+    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.epochs
+    optimizer = transformers.AdamW(model.parameters(), lr=args.lr, eps=args.eps)
+    # scheduler = transformers.WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = transformers.get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
+
+    logger.info('starting training')
+
+    # per-epoch training and validation losses
+    train_losses, validate_losses = [], []
+    # lowest validation loss so far
+    best_val_loss = 10000
+    # start training
+    for epoch in range(args.epochs):
+        # ========== train ========== #
+        train_loss = train_epoch(
+            model=model, train_dataloader=train_dataloader,
+            optimizer=optimizer, scheduler=scheduler,
+            logger=logger, epoch=epoch, args=args)
+        train_losses.append(train_loss)
+
+        # ========== validate ========== #
+        validate_loss = validate_epoch(
+            model=model, validate_dataloader=validate_dataloader,
+            logger=logger, epoch=epoch, args=args)
+        validate_losses.append(validate_loss)
+
+        # save the model with the lowest perplexity so far; lower perplexity does not guarantee better generations
+        if validate_loss < best_val_loss:
+            best_val_loss = validate_loss
+            logger.info('saving current best model for epoch {}'.format(epoch + 1))
+            model_path = join(args.save_model_path, 'min_ppl_model')
+            if not os.path.exists(model_path):
+                os.mkdir(model_path)
+            model_to_save = model.module if hasattr(model, 'module') else model
+            model_to_save.save_pretrained(model_path)
+
+        # patience == 0 disables early stopping
+        if args.patience == 0:
+            continue
+        early_stopping(validate_loss, model)
+        if early_stopping.early_stop:
+            logger.info("Early stopping")
+            break
+    logger.info('training finished')
+    logger.info("train_losses:{}".format(train_losses))
+    logger.info("validate_losses:{}".format(validate_losses))
+
+
+def calculate_loss(logit, target, pad_idx, smoothing=True):
+    if smoothing:
+        logit = logit[..., :-1, :].contiguous().view(-1, logit.size(2))
+        target = target[..., 1:].contiguous().view(-1)
+
+        eps = 0.1
+        n_class = logit.size(-1)
+
+        one_hot = torch.zeros_like(logit).scatter(1, target.view(-1, 1), 1)
+        one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
+        log_prb = F.log_softmax(logit, dim=1)
+
+        non_pad_mask = target.ne(pad_idx)
+        loss = -(one_hot * log_prb).sum(dim=1)
+        loss = loss.masked_select(non_pad_mask).mean()  # average later
+    else:
+        # loss = F.cross_entropy(predict_logit, target, ignore_index=pad_idx)
+        logit = logit[..., :-1, :].contiguous().view(-1, logit.size(-1))
+        labels = target[..., 1:].contiguous().view(-1)
+        loss = F.cross_entropy(logit, labels, ignore_index=pad_idx)
+    return loss
+
+
+def calculate_acc(logit, labels, ignore_index=-100):
+    logit = logit[..., :-1, :].contiguous().view(-1, logit.size(-1))
+    labels = labels[..., 1:].contiguous().view(-1)
+
+    _, logit = logit.max(dim=-1)  # for each position, take the index with the highest score
+    # build a mask that is 0 where labels equals ignore_index (e.g. padding) and 1 elsewhere
+    non_pad_mask = labels.ne(ignore_index)
+    n_correct = logit.eq(labels).masked_select(non_pad_mask).sum().item()
+    n_word = non_pad_mask.sum().item()
+    return n_correct, n_word
+
+
+def main():
+    # parse the arguments
+    args = set_args()
+
+    # choose which GPUs to train on
+    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
+
+    args.cuda = not args.no_cuda
+
+    if args.batch_size < 2048 and args.warmup_steps <= 4000:
+        print('[Warning] The warmup steps may be not enough.\n' \
+              '(sz_b, warmup) = (2048, 4000) is the official setting.\n' \
+              'Using smaller batch w/o longer warmup may cause ' \
+              'the warmup stage ends with only little data trained.')
+
+    # create the logger
+    logger = create_logger(args)
+    # use the GPU when the user asks for it and one is available
+    args.cuda = torch.cuda.is_available() and not args.no_cuda
+    device = 'cuda:0' if args.cuda else 'cpu'
+    args.device = device
+    logger.info('using device:{}'.format(device))
+
+    # initialize the tokenizer
+    tokenizer = BertTokenizerFast(vocab_file=args.vocab_path, sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]")
+    special_tokens = []
+    for key in config["mask_token"].keys():
+        special_tokens.append(key)
+    tokenizer.add_special_tokens( {'additional_special_tokens':special_tokens} )
+    args.sep_id = tokenizer.sep_token_id
+    args.pad_id = tokenizer.pad_token_id
+    args.cls_id = tokenizer.cls_token_id
+
+    # create the model output directory
+    if not os.path.exists(args.save_model_path):
+        os.mkdir(args.save_model_path)
+
+    # build the model
+    if args.pretrained_model:  # load a pretrained model
+        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model)
+    else:  # initialize the model from a config file
+        model_config = GPT2Config.from_json_file(args.model_config)
+        model = GPT2LMHeadModel(config=model_config)
+    model = model.to(device)
+    logger.info('model config:\n{}'.format(model.config.to_json_string()))
+    assert model.config.vocab_size == tokenizer.vocab_size
+
+    # train the model in parallel
+    if args.cuda and torch.cuda.device_count() > 1:
+        model = DataParallel(model).cuda()
+        # model = BalancedDataParallel(args.gpu0_bsz, model, dim=0).cuda()
+        logger.info("use GPU {} to train".format(args.device))
+
+    # count the model parameters
+    num_parameters = 0
+    parameters = model.parameters()
+    for parameter in parameters:
+        num_parameters += parameter.numel()
+    logger.info('number of model parameters: {}'.format(num_parameters))
+
+    # log the argument settings
+    logger.info("args:{}".format(args))
+
+    # load the training and validation sets
+    # ========= Loading Dataset ========= #
+    train_dataset, validate_dataset = load_dataset(logger, args)
+
+    train(model, logger, train_dataset, validate_dataset, args)
+
+
+if __name__ == '__main__':
+    main()
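
Two details in train.py worth spelling out. First, collate_fn pads the same batch twice on purpose: input_ids is padded with 0 ([PAD]) while labels is padded with -100, the default ignore_index of the cross-entropy loss inside GPT2LMHeadModel, so padded positions contribute neither loss nor gradient. A tiny sketch of this, with made-up token ids:

# Why collate_fn pads twice: padded label positions land on ignore_index (-100)
# and are skipped by the loss; the token ids below are invented for illustration.
import torch
import torch.nn.utils.rnn as rnn_utils

batch = [torch.tensor([101, 17, 42, 102]), torch.tensor([101, 9, 102])]
input_ids = rnn_utils.pad_sequence(batch, batch_first=True, padding_value=0)
labels = rnn_utils.pad_sequence(batch, batch_first=True, padding_value=-100)
print(input_ids)  # tensor([[101,  17,  42, 102], [101,   9, 102,    0]])
print(labels)     # tensor([[101,  17,  42, 102], [101,   9, 102, -100]])

Second, calculate_acc drops the last logit position and the first label position before comparing, because in a language model the logit at position i predicts the token at position i + 1; the accuracy is then computed only over the non-ignored positions.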
web.py
ADDED
@@ -0,0 +1,151 @@
+
+import os
+import random
+import re
+import requests
+import argparse
+import string
+
+from datetime import timedelta
+from flask import Flask, session, request, jsonify, render_template
+
+from transformers.models.bert.tokenization_bert import BertTokenizer
+
+from bot.chatbot import ChatBot
+from bot.config import special_token_list
+
+app = Flask(__name__)
+app.config["SECRET_KEY"] = os.urandom(74)
+app.config["PERMANENT_SESSION_LIFETIME"] = timedelta(days=7)
+
+tokenizer:BertTokenizer = None
+
+history_matrix:dict = {}
+
+def move_history_from_session_to_global_memory() -> None:
+    global history_matrix
+
+    if session.get( "session_hash") and session["history"]:
+        history_matrix[session["session_hash"]] = session["history"]
+
+def move_history_from_global_memory_to_session() -> None:
+    global history_matrix
+
+    if session.get( "session_hash"):
+        session["history"] = history_matrix.get( session.get( "session_hash") )
+
+def set_args() -> argparse.Namespace:
+    parser:argparse.ArgumentParser = argparse.ArgumentParser()
+    parser.add_argument("--vocab_path", default=None, type=str, required=False, help="vocabulary file to use")
+    parser.add_argument("--model_path", default="lewiswu1209/Winnie", type=str, required=False, help="path to the dialogue model")
+
+    return parser.parse_args()
+
+@app.route("/chitchat/history", methods = ["GET"])
+def get_history_list() -> str:
+    global tokenizer
+
+    move_history_from_global_memory_to_session()
+
+    history_list:list = session.get("history")
+    if history_list is None:
+        history_list = []
+
+    history:list = []
+    for history_ids in history_list:
+        tokens = tokenizer.convert_ids_to_tokens(history_ids)
+        fixed_tokens = []
+        for token in tokens:
+            if token.startswith("##"):
+                token = token[2:]
+            fixed_tokens.append(token)
+        history.append( "".join( fixed_tokens ) )
+
+    return jsonify(history)
+
+@app.route("/chitchat/chat", methods = ["GET"])
+def talk() -> str:
+    global tokenizer
+    global history_matrix
+
+    if request.args.get("hash"):
+        session["session_hash"] = request.args.get("hash")
+        move_history_from_global_memory_to_session()
+
+    if session.get("session_hash") is None:
+        session["session_hash"] = "".join( random.sample(string.ascii_lowercase + string.digits, 11) )
+
+    if request.args.get("text"):
+        input_text = request.args.get("text")
+        history_list = session.get("history")
+
+        if input_text.upper()=="HELP":
+            help_info_list = ["Type any text and Winnie will answer you",
+                              "Type ERASE MEMORY and Winnie will wipe her memory",
+                              "Type \"<TAG>=<VALUE>\" and Winnie will record your persona information",
+                              "For example, <NAME>=Vicky makes Winnie change her own name",
+                              "Persona fields that can be changed:",
+                              "<NAME>, <GENDER>, <YEAROFBIRTH>, <MONTHOFBIRTH>, <DAYOFBIRTH>, <ZODIAC>, <AGE>",
+                              "Type \"上联:XXXXXXX\" (the first line of a couplet) and Winnie will reply with the matching line",
+                              "Type \"写诗:XXXXXXX\" and Winnie will write a poem beginning with XXXXXXX"
+                              ]
+            return jsonify(help_info_list)
+
+        if history_list is None or len(history_list)==0 or input_text == "ERASE MEMORY":
+            history_list = []
+            output_text = requests.post(
+                url='https://hf.space/embed/lewiswu1209/Winnie/+/api/predict/',
+                json={"data": ["ERASE MEMORY"], "session_hash": session["session_hash"]}
+            ).json()["data"][0]
+
+        if input_text != "ERASE MEMORY":
+            if not re.match( r"^<.+>=.+$", input_text ):
+                history_list.append( tokenizer.encode(input_text, add_special_tokens=False) )
+            output_text = requests.post(
+                url='https://hf.space/embed/lewiswu1209/Winnie/+/api/predict/',
+                json={"data": [input_text], "session_hash": session["session_hash"]}
+            ).json()["data"][0]
+            if not re.match( r"^<.+>=.+$", input_text ):
+                history_list.append( tokenizer.encode(output_text, add_special_tokens=False) )
+
+        session["history"] = history_list
+        history_matrix[session["session_hash"]] = history_list
+        return jsonify([output_text])
+    else:
+        return jsonify([""])
+
+@app.route("/")
+def index() -> str:
+    return "Hello world!"
+
+@app.route("/chitchat/hash", methods = ["GET"])
+def get_hash() -> str:
+    global history_matrix
+
+    if request.args.get("hash"):
+        session["session_hash"] = request.args.get("hash")
+        move_history_from_global_memory_to_session()
+    hash = session.get("session_hash")
+    if hash:
+        return session.get("session_hash")
+    else:
+        return " "
+
+@app.route( "/chitchat", methods = ["GET"] )
+def chitchat() -> str:
+    return render_template( "chat_template.html" )
+
+def main() -> None:
+    global tokenizer
+
+    args = set_args()
+    tokenizer = ChatBot.get_tokenizer(
+        args.model_path,
+        vocab_path=args.vocab_path,
+        special_token_list = special_token_list
+    )
+
+    app.run( host = "127.0.0.1", port = 8080 )
+
+if __name__ == "__main__":
+    main()
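
Note that web.py does not run the model locally: talk() proxies every utterance to the hosted Gradio Space through its predict API, using session_hash to pin all of a visitor's requests to one server-side conversation. Isolated for clarity, the upstream call looks like the sketch below (this restates what the handler above already does, assuming the Space is up and its API shape is unchanged):

# The upstream call talk() makes per utterance; the session_hash ties
# consecutive requests to the same conversation on the Space side.
import requests

API_URL = "https://hf.space/embed/lewiswu1209/Winnie/+/api/predict/"

def predict(text: str, session_hash: str) -> str:
    payload = {"data": [text], "session_hash": session_hash}
    return requests.post(API_URL, json=payload).json()["data"][0]

# e.g. predict("你好", "demo-hash-123") should return Winnie's reply.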