Ontocord.AI committed on
Commit
9f3edac
·
1 Parent(s): 7d1de3e

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +150 -0
README.md ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+
5
+ ```
6
+ # test merged experts
7
+ # TODO: add dynamic routing, testing better expert mixtures
8
+
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM
10
+
11
+ import torch
12
+
13
+ from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM, GPTNeoXLayer
14
+ from torch import nn
15
class GPTNeoXExpertsForCasualLM(GPTNeoXForCausalLM):
    """GPT-NeoX causal LM that stores several swappable experts for layers 9 and 10.

    Each expert is an ``nn.ModuleList`` holding two ``GPTNeoXLayer`` modules.
    ``generate_with_expert`` installs the selected pair at indices 9 and 10 of
    ``self.gpt_neox.layers`` before calling ``generate``.
    """

    # Attribute names of the expert ModuleLists. The order is the order in
    # which the modules are created and registered in ``__init__``.
    _EXPERT_NAMES = (
        "orig_chat",
        "uspto_expert",
        "github_expert",
        "pubmed_abstracts_expert",
        "freelaw_expert",
        "arxiv_expert",
        "merged_chat_expert",
    )
    # Index in ``gpt_neox.layers`` of the first layer replaced by an expert.
    _FIRST_EXPERT_LAYER = 9
    # Number of consecutive layers each expert provides.
    _NUM_EXPERT_LAYERS = 2

    def __init__(self, config):
        """Build the base model plus one two-layer ``nn.ModuleList`` per expert."""
        super().__init__(config)
        self.config = config
        for name in self._EXPERT_NAMES:
            layers = [GPTNeoXLayer(config) for _ in range(self._NUM_EXPERT_LAYERS)]
            setattr(self, name, nn.ModuleList(layers))
        # Name of the expert recorded as currently installed; used to skip
        # redundant swaps in ``generate_with_expert``.
        self.curr_expert = "merged_chat_expert"

    def generate_with_expert(self, text, tokenizer, expert="merged_chat_expert", return_answer_only=False, do_self_contrastive=True, max_length=512, min_length=1, max_return_sequences=1, do_sample=True, do_beam=False, device="cuda", target_lang=None):
        """Generates using one of the experts.

        Args:
            text: A prompt string or a list of prompt strings. Each prompt is
                stripped of surrounding whitespace before tokenization.
            tokenizer: Tokenizer used to encode the prompts and decode the
                outputs. Its ``pad_token`` is set to its ``eos_token``.
            expert: Name of the expert to install; one of ``_EXPERT_NAMES``.
            return_answer_only: If true, drop the first ``len(prompt)``
                characters of each decoded output and then strip leading
                ``.``, ``?``, space, newline and tab characters.
            do_self_contrastive: If true, pass ``penalty_alpha=0.6`` to
                ``generate``; otherwise pass ``penalty_alpha=None``.
            max_length: Used both as the tokenizer truncation length and as
                ``generate``'s ``max_length``.
            min_length: Forwarded to ``generate``.
            max_return_sequences: Accepted for interface compatibility;
                currently unused.
            do_sample: Forwarded to ``generate``.
            do_beam: Accepted for interface compatibility; currently unused.
            device: Device the tokenized inputs are moved to.
            target_lang: Accepted for interface compatibility; currently
                unused.

        Returns:
            A list with one decoded string per prompt.

        Raises:
            ValueError: If ``expert`` is not a known expert name.
        """
        # Validate before touching any state, so an unknown name cannot be
        # recorded as the current expert while the old layers stay installed.
        if expert not in self._EXPERT_NAMES:
            raise ValueError(
                f"Unknown expert {expert!r}; expected one of {self._EXPERT_NAMES}"
            )

        tokenizer.pad_token = tokenizer.eos_token

        if expert != self.curr_expert:
            for offset, layer in enumerate(getattr(self, expert)):
                self.gpt_neox.layers[self._FIRST_EXPERT_LAYER + offset] = layer
            self.curr_expert = expert

        if isinstance(text, str):
            text = [text]
        text = [p.strip() for p in text]

        input_ids = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        input_ids = input_ids.to(device)
        with torch.no_grad():
            outputs = self.generate(
                **input_ids,
                max_length=max_length,
                repetition_penalty=1.1,
                min_length=min_length,
                # Honor the caller's choice; this used to be hard-coded to True.
                do_sample=do_sample,
                top_p=0.95,
                penalty_alpha=0.6 if do_self_contrastive else None,
                top_k=10,
            )

        # Decoded one sequence at a time (rather than batch_decode) so the
        # prompt can be stripped per item.
        ret = []
        for prompt, sequence in zip(text, outputs):
            out = tokenizer.decode(sequence, skip_special_tokens=True)
            if return_answer_only:
                out = out[len(prompt):].lstrip(".? \n\t")
            ret.append(out)

        return ret
74
+
75
# Demo: load the tokenizer and the multi-expert checkpoint, then print one
# chat-style and one plain-continuation generation for every expert.
tokenizer = AutoTokenizer.from_pretrained("theblackcat102/pythia-1b-deduped-sft")
tokenizer.pad_token = tokenizer.eos_token

model1 = GPTNeoXExpertsForCasualLM.from_pretrained("Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts")
# Half precision, on the GPU, in eval mode.
model1 = model1.half()
model1 = model1.cuda()
model1 = model1.eval()

for expert in [
    "orig_chat",
    "merged_chat_expert",
    "uspto_expert",
    "github_expert",
    "pubmed_abstracts_expert",
    "arxiv_expert",
    "freelaw_expert",
]:
    # Prompt in the <human>/<bot> chat format.
    print(f'## {expert}')
    chat_outputs = model1.generate_with_expert("<human> Write a patent about an electric toothbrush\n<bot>", tokenizer, expert=expert)
    print(chat_outputs[0])

    # Plain document-style prompt.
    print(f'## {expert} more')
    plain_outputs = model1.generate_with_expert("Field of the Invention.\nAn electric toothbrush\n", tokenizer, expert=expert)
    print(plain_outputs[0])
87
+ ```
88
+
89
+ ```
90
+
91
def recreate_merged_expert():
    """Rebuild the multi-expert checkpoint from its source models and publish it.

    Loads the base chat model (twice: once as the expert container ``model1``
    and once as an untouched reference ``model``), the
    "stillerman/MDEL-pubmed-feelaw-github-arxiv" model, and five per-domain
    expert models, all in float32. For layers 9 and 10 it then fills the
    expert slots of ``model1``:

    * ``orig_chat``: the reference model's layers, unchanged.
    * ``merged_chat_expert``: ``model1``'s own layers after blending their
      parameters in place as 0.6 * own + 0.3 * model2 + 0.1 * uspto.
    * each domain expert: the domain model's layers after blending their
      parameters in place as 0.6 * domain + 0.4 * reference.

    Finally the model is converted to half precision, put in eval mode, saved
    locally, and pushed to the Hub.

    Returns:
        The assembled model (half precision, eval mode).
    """
    model1 = GPTNeoXExpertsForCasualLM.from_pretrained("theblackcat102/pythia-1b-deduped-sft").float()
    model2 = AutoModelForCausalLM.from_pretrained("stillerman/MDEL-pubmed-feelaw-github-arxiv").float()
    model_uspto = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-uspto").float()
    model_github = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-github").float()
    model_pubmed_abstracts = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-pubmed_abstracts").float()
    model_freelaw = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-freelaw").float()
    model_arxiv = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-arxiv").float()
    model = AutoModelForCausalLM.from_pretrained("theblackcat102/pythia-1b-deduped-sft").float()

    swapped_layers = (9, 10)

    with torch.no_grad():
        # Original chat layers: reuse the reference model's modules directly.
        for layer_id in swapped_layers:
            model1.orig_chat[layer_id - 9] = model.gpt_neox.layers[layer_id]

        # Merged expert. This must run before the domain blends below,
        # because it reads the USPTO weights that the next loop overwrites.
        for layer_id in swapped_layers:
            own_layer = model1.gpt_neox.layers[layer_id]
            param_triples = zip(
                own_layer.parameters(),
                model2.gpt_neox.layers[layer_id].parameters(),
                model_uspto.gpt_neox.layers[layer_id].parameters(),
            )
            for own, multi, uspto in param_triples:
                own.data = own.data*.6 + multi.data*0.3 + uspto.data*0.1
            model1.merged_chat_expert[layer_id - 9] = own_layer

        # Domain experts: blend each donor layer with the reference layer in
        # place, then install the donor's module into the matching slot.
        domain_experts = (
            (model1.uspto_expert, model_uspto),
            (model1.github_expert, model_github),
            (model1.pubmed_abstracts_expert, model_pubmed_abstracts),
            (model1.freelaw_expert, model_freelaw),
            (model1.arxiv_expert, model_arxiv),
        )
        for slots, donor in domain_experts:
            for layer_id in swapped_layers:
                donor_layer = donor.gpt_neox.layers[layer_id]
                reference_layer = model.gpt_neox.layers[layer_id]
                for donor_param, ref_param in zip(donor_layer.parameters(), reference_layer.parameters()):
                    donor_param.data = donor_param.data*.6 + ref_param.data*0.4
                slots[layer_id - 9] = donor_layer

    model1 = model1.half().eval()
    model1.save_pretrained("MDEL-theblackcat-chat-5-experts", torch_dtype=torch.float16)
    model1.push_to_hub("Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts")
    return model1
150
+ ```