KoichiYasuoka
commited on
Commit
·
c98537a
1
Parent(s):
a66a53d
model improved for transformers 4.42
Browse files- config.json +1 -15
- maker.sh +3 -39
- pytorch_model-00001-of-00006.bin +2 -2
- pytorch_model-00002-of-00006.bin +2 -2
- pytorch_model-00003-of-00006.bin +1 -1
- pytorch_model-00004-of-00006.bin +2 -2
- pytorch_model-00005-of-00006.bin +1 -1
- pytorch_model-00006-of-00006.bin +1 -1
- pytorch_model.bin.index.json +3 -3
- tokenizer.json +517 -516
- tokenizer_config.json +2 -0
- upos.py +1 -40
config.json
CHANGED
@@ -2,24 +2,12 @@
|
|
2 |
"architectures": [
|
3 |
"MistralForTokenClassification"
|
4 |
],
|
5 |
-
"attention_bias": false,
|
6 |
"attention_dropout": 0.0,
|
7 |
-
"auto_map": {
|
8 |
-
"AutoModelForTokenClassification": "upos.MistralForTokenClassification"
|
9 |
-
},
|
10 |
"bos_token_id": 1,
|
11 |
"custom_pipelines": {
|
12 |
"upos": {
|
13 |
"impl": "upos.BellmanFordTokenClassificationPipeline",
|
14 |
"pt": "AutoModelForTokenClassification"
|
15 |
-
},
|
16 |
-
"token-classification": {
|
17 |
-
"impl": "upos.RawTokenClassificationPipeline",
|
18 |
-
"pt": "AutoModelForTokenClassification"
|
19 |
-
},
|
20 |
-
"ner": {
|
21 |
-
"impl": "upos.RawTokenClassificationPipeline",
|
22 |
-
"pt": "AutoModelForTokenClassification"
|
23 |
}
|
24 |
},
|
25 |
"eos_token_id": 2,
|
@@ -156,15 +144,13 @@
|
|
156 |
"num_attention_heads": 32,
|
157 |
"num_hidden_layers": 32,
|
158 |
"num_key_value_heads": 8,
|
159 |
-
"pretraining_tp": 1,
|
160 |
"rms_norm_eps": 1e-05,
|
161 |
-
"rope_scaling": null,
|
162 |
"rope_theta": 10000.0,
|
163 |
"sliding_window": 4096,
|
164 |
"tie_word_embeddings": false,
|
165 |
"tokenizer_class": "LlamaTokenizerFast",
|
166 |
"torch_dtype": "float32",
|
167 |
-
"transformers_version": "4.
|
168 |
"use_cache": true,
|
169 |
"vocab_size": 43317
|
170 |
}
|
|
|
2 |
"architectures": [
|
3 |
"MistralForTokenClassification"
|
4 |
],
|
|
|
5 |
"attention_dropout": 0.0,
|
|
|
|
|
|
|
6 |
"bos_token_id": 1,
|
7 |
"custom_pipelines": {
|
8 |
"upos": {
|
9 |
"impl": "upos.BellmanFordTokenClassificationPipeline",
|
10 |
"pt": "AutoModelForTokenClassification"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
}
|
12 |
},
|
13 |
"eos_token_id": 2,
|
|
|
144 |
"num_attention_heads": 32,
|
145 |
"num_hidden_layers": 32,
|
146 |
"num_key_value_heads": 8,
|
|
|
147 |
"rms_norm_eps": 1e-05,
|
|
|
148 |
"rope_theta": 10000.0,
|
149 |
"sliding_window": 4096,
|
150 |
"tie_word_embeddings": false,
|
151 |
"tokenizer_class": "LlamaTokenizerFast",
|
152 |
"torch_dtype": "float32",
|
153 |
+
"transformers_version": "4.42.4",
|
154 |
"use_cache": true,
|
155 |
"vocab_size": 43317
|
156 |
}
|
maker.sh
CHANGED
@@ -9,7 +9,7 @@ then TMPA=./maker$$a.py
|
|
9 |
src="tokyotech-llm/Swallow-MS-7b-v0.1"
|
10 |
tgt="exSwallow-MS-7b-v0.1"
|
11 |
import json,torch,unicodedata
|
12 |
-
from transformers import LlamaTokenizerFast,
|
13 |
with open("JapaneseCoreKanji.txt","r",encoding="utf-8") as r:
|
14 |
cjk=[chr(int(t,16)) for t in r.read().strip().split("\n") if not t.startswith("#")]
|
15 |
with open("ja_gsd_modern.conllu","r",encoding="utf-8") as r:
|
@@ -26,7 +26,7 @@ d=json.loads(tkz.backend_tokenizer.to_str())
|
|
26 |
for i,j in enumerate(c,len(tkz)):
|
27 |
d["model"]["vocab"][j]=i
|
28 |
tkz.backend_tokenizer.from_str(json.dumps(d)).save("tokenizer.json")
|
29 |
-
mdl=
|
30 |
tkz=LlamaTokenizerFast(tokenizer_file="tokenizer.json",model_max_length=mdl.config.max_position_embeddings,cls_token="<s>",sep_token="<s>",mask_token="<unk>",pad_token="</s>")
|
31 |
e=mdl.resize_token_embeddings(len(tkz))
|
32 |
f=mdl.get_output_embeddings()
|
@@ -48,45 +48,9 @@ cat << 'EOF' > $TMPB
|
|
48 |
#! /usr/bin/env deepspeed
|
49 |
src="exSwallow-MS-7b-v0.1"
|
50 |
tgt="KoichiYasuoka/Swallow-MS-7b-upos"
|
51 |
-
from transformers import LlamaTokenizerFast,
|
52 |
-
from transformers.modeling_outputs import TokenClassifierOutput
|
53 |
from tokenizers.normalizers import Replace
|
54 |
|
55 |
-
class MistralForTokenClassification(MistralPreTrainedModel):
|
56 |
-
def __init__(self,config):
|
57 |
-
from torch import nn
|
58 |
-
super().__init__(config)
|
59 |
-
self.num_labels=config.num_labels
|
60 |
-
self.model=MistralModel(config)
|
61 |
-
if hasattr(config,"classifier_dropout") and config.classifier_dropout is not None:
|
62 |
-
classifier_dropout=config.classifier_dropout
|
63 |
-
elif hasattr(config,"hidden_dropout") and config.hidden_dropout is not None:
|
64 |
-
classifier_dropout=config.hidden_dropout
|
65 |
-
else:
|
66 |
-
classifier_dropout=0.1
|
67 |
-
self.dropout=nn.Dropout(classifier_dropout)
|
68 |
-
self.classifier=nn.Linear(config.hidden_size,config.num_labels)
|
69 |
-
self.post_init()
|
70 |
-
def get_input_embeddings(self):
|
71 |
-
return self.model.embed_tokens
|
72 |
-
def set_input_embeddings(self,value):
|
73 |
-
self.model.embed_tokens=value
|
74 |
-
def forward(self,input_ids=None,past_key_values=None,attention_mask=None,position_ids=None,inputs_embeds=None,labels=None,use_cache=None,output_attentions=None,output_hidden_states=None,return_dict=None):
|
75 |
-
return_dict=return_dict if return_dict is not None else self.config.use_return_dict
|
76 |
-
transformer_outputs=self.model(input_ids,past_key_values=past_key_values,attention_mask=attention_mask,position_ids=position_ids,inputs_embeds=inputs_embeds,use_cache=use_cache,output_attentions=output_attentions,output_hidden_states=output_hidden_states,return_dict=return_dict)
|
77 |
-
hidden_states=transformer_outputs[0]
|
78 |
-
hidden_states=self.dropout(hidden_states)
|
79 |
-
logits=self.classifier(hidden_states)
|
80 |
-
loss=None
|
81 |
-
if labels is not None:
|
82 |
-
from torch import nn
|
83 |
-
loss_fct=nn.CrossEntropyLoss()
|
84 |
-
loss=loss_fct(logits.view(-1,self.num_labels),labels.view(-1))
|
85 |
-
if not return_dict:
|
86 |
-
output=(logits,)+transformer_outputs[2:]
|
87 |
-
return ((loss,)+output) if loss is not None else output
|
88 |
-
return TokenClassifierOutput(loss=loss,logits=logits,hidden_states=transformer_outputs.hidden_states,attentions=transformer_outputs.attentions)
|
89 |
-
|
90 |
class UPOSFileDataset(object):
|
91 |
def __init__(self,conllu,tokenizer):
|
92 |
self.conllu=open(conllu,"r",encoding="utf-8")
|
|
|
9 |
src="tokyotech-llm/Swallow-MS-7b-v0.1"
|
10 |
tgt="exSwallow-MS-7b-v0.1"
|
11 |
import json,torch,unicodedata
|
12 |
+
from transformers import LlamaTokenizerFast,MistralForCausalLM
|
13 |
with open("JapaneseCoreKanji.txt","r",encoding="utf-8") as r:
|
14 |
cjk=[chr(int(t,16)) for t in r.read().strip().split("\n") if not t.startswith("#")]
|
15 |
with open("ja_gsd_modern.conllu","r",encoding="utf-8") as r:
|
|
|
26 |
for i,j in enumerate(c,len(tkz)):
|
27 |
d["model"]["vocab"][j]=i
|
28 |
tkz.backend_tokenizer.from_str(json.dumps(d)).save("tokenizer.json")
|
29 |
+
mdl=MistralForCausalLM.from_pretrained(src)
|
30 |
tkz=LlamaTokenizerFast(tokenizer_file="tokenizer.json",model_max_length=mdl.config.max_position_embeddings,cls_token="<s>",sep_token="<s>",mask_token="<unk>",pad_token="</s>")
|
31 |
e=mdl.resize_token_embeddings(len(tkz))
|
32 |
f=mdl.get_output_embeddings()
|
|
|
48 |
#! /usr/bin/env deepspeed
|
49 |
src="exSwallow-MS-7b-v0.1"
|
50 |
tgt="KoichiYasuoka/Swallow-MS-7b-upos"
|
51 |
+
from transformers import LlamaTokenizerFast,MistralForTokenClassification,AutoConfig,DataCollatorForTokenClassification,TrainingArguments,Trainer
|
|
|
52 |
from tokenizers.normalizers import Replace
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
class UPOSFileDataset(object):
|
55 |
def __init__(self,conllu,tokenizer):
|
56 |
self.conllu=open(conllu,"r",encoding="utf-8")
|
pytorch_model-00001-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3bc383e2836bfade4a66c8c3c8e5871240d5b1cc86aaf912120c098dc37db88f
|
3 |
+
size 2539520
|
pytorch_model-00002-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2a123e02228dbfea708d4ff57dc9ccf7bfd5dbb992567a86d42ab14146b55d2
|
3 |
+
size 4390338560
|
pytorch_model-00003-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4999825316
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:70f9676f885d30e08789261553c6ed7a949acef9de9a67dce005cede7ec8fa73
|
3 |
size 4999825316
|
pytorch_model-00004-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7b8a16ad523ebf2fa508012f2b0d0fee6d2c2a4583daff8d4058aaf7240cc020
|
3 |
+
size 23379968
|
pytorch_model-00005-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4999825320
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2ba5e8032cd9a92cfaf0f3d30360285d03a7f32af32b0ad07f0c0b7d241d64f5
|
3 |
size 4999825320
|
pytorch_model-00006-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 3960601264
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4a82d77158930bdf9ffc566bd2be9e7600b4b4831d287679e3086cab45466ac6
|
3 |
size 3960601264
|
pytorch_model.bin.index.json
CHANGED
@@ -3,8 +3,6 @@
|
|
3 |
"total_size": 28629041392
|
4 |
},
|
5 |
"weight_map": {
|
6 |
-
"classifier.bias": "pytorch_model-00006-of-00006.bin",
|
7 |
-
"classifier.weight": "pytorch_model-00006-of-00006.bin",
|
8 |
"model.embed_tokens.weight": "pytorch_model-00001-of-00006.bin",
|
9 |
"model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
|
10 |
"model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
|
@@ -294,6 +292,8 @@
|
|
294 |
"model.layers.9.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
|
295 |
"model.layers.9.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
|
296 |
"model.layers.9.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
|
297 |
-
"model.norm.weight": "pytorch_model-00006-of-00006.bin"
|
|
|
|
|
298 |
}
|
299 |
}
|
|
|
3 |
"total_size": 28629041392
|
4 |
},
|
5 |
"weight_map": {
|
|
|
|
|
6 |
"model.embed_tokens.weight": "pytorch_model-00001-of-00006.bin",
|
7 |
"model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
|
8 |
"model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
|
|
|
292 |
"model.layers.9.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
|
293 |
"model.layers.9.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
|
294 |
"model.layers.9.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
|
295 |
+
"model.norm.weight": "pytorch_model-00006-of-00006.bin",
|
296 |
+
"score.bias": "pytorch_model-00006-of-00006.bin",
|
297 |
+
"score.weight": "pytorch_model-00006-of-00006.bin"
|
298 |
}
|
299 |
}
|
tokenizer.json
CHANGED
@@ -125,6 +125,7 @@
|
|
125 |
"end_of_word_suffix": null,
|
126 |
"fuse_unk": true,
|
127 |
"byte_fallback": false,
|
|
|
128 |
"vocab": {
|
129 |
"<unk>": 0,
|
130 |
"<s>": 1,
|
@@ -42926,523 +42927,523 @@
|
|
42926 |
"勅": 42797,
|
42927 |
"婿": 42798,
|
42928 |
"魯": 42799,
|
42929 |
-
"
|
42930 |
-
"
|
42931 |
-
"
|
42932 |
-
"
|
42933 |
-
"
|
42934 |
-
"
|
42935 |
-
"
|
42936 |
-
"
|
42937 |
-
"
|
42938 |
-
"
|
42939 |
-
"
|
42940 |
-
"
|
42941 |
-
"
|
42942 |
-
"
|
42943 |
-
"
|
42944 |
-
"
|
42945 |
-
"
|
42946 |
-
"
|
42947 |
-
"
|
42948 |
-
"
|
42949 |
-
"
|
42950 |
-
"
|
42951 |
-
"
|
42952 |
-
"
|
42953 |
-
"
|
42954 |
-
"
|
42955 |
-
"
|
42956 |
-
"
|
42957 |
-
"
|
42958 |
-
"
|
42959 |
-
"
|
42960 |
-
"
|
42961 |
-
"
|
42962 |
-
"
|
42963 |
-
"
|
42964 |
-
"
|
42965 |
-
"
|
42966 |
-
"
|
42967 |
-
"
|
42968 |
-
"
|
42969 |
-
"
|
42970 |
-
"
|
42971 |
-
"
|
42972 |
-
"
|
42973 |
-
"
|
42974 |
-
"
|
42975 |
-
"
|
42976 |
-
"
|
42977 |
-
"
|
42978 |
-
"
|
42979 |
-
"
|
42980 |
-
"
|
42981 |
-
"
|
42982 |
-
"
|
42983 |
-
"
|
42984 |
-
"
|
42985 |
-
"
|
42986 |
-
"
|
42987 |
-
"
|
42988 |
-
"
|
42989 |
-
"
|
42990 |
-
"
|
42991 |
-
"
|
42992 |
-
"
|
42993 |
-
"
|
42994 |
-
"
|
42995 |
-
"
|
42996 |
-
"
|
42997 |
-
"
|
42998 |
-
"
|
42999 |
-
"
|
43000 |
-
"
|
43001 |
-
"
|
43002 |
-
"
|
43003 |
-
"
|
43004 |
-
"
|
43005 |
-
"
|
43006 |
-
"
|
43007 |
-
"
|
43008 |
-
"
|
43009 |
-
"
|
43010 |
-
"
|
43011 |
-
"
|
43012 |
-
"
|
43013 |
-
"
|
43014 |
-
"
|
43015 |
-
"
|
43016 |
-
"
|
43017 |
-
"
|
43018 |
-
"
|
43019 |
-
"
|
43020 |
-
"
|
43021 |
-
"
|
43022 |
-
"
|
43023 |
-
"
|
43024 |
-
"
|
43025 |
-
"
|
43026 |
-
"
|
43027 |
-
"
|
43028 |
-
"
|
43029 |
-
"
|
43030 |
-
"
|
43031 |
-
"
|
43032 |
-
"
|
43033 |
-
"
|
43034 |
-
"
|
43035 |
-
"
|
43036 |
-
"
|
43037 |
-
"
|
43038 |
-
"
|
43039 |
-
"
|
43040 |
-
"
|
43041 |
-
"
|
43042 |
-
"
|
43043 |
-
"
|
43044 |
-
"
|
43045 |
-
"
|
43046 |
-
"
|
43047 |
-
"
|
43048 |
-
"
|
43049 |
-
"
|
43050 |
-
"
|
43051 |
-
"
|
43052 |
-
"
|
43053 |
-
"
|
43054 |
-
"
|
43055 |
-
"
|
43056 |
-
"
|
43057 |
-
"
|
43058 |
-
"
|
43059 |
-
"
|
43060 |
-
"
|
43061 |
-
"
|
43062 |
-
"
|
43063 |
-
"
|
43064 |
-
"
|
43065 |
-
"
|
43066 |
-
"
|
43067 |
-
"
|
43068 |
-
"
|
43069 |
-
"
|
43070 |
-
"
|
43071 |
-
"
|
43072 |
-
"
|
43073 |
-
"
|
43074 |
-
"
|
43075 |
-
"
|
43076 |
-
"
|
43077 |
-
"
|
43078 |
-
"
|
43079 |
-
"
|
43080 |
-
"
|
43081 |
-
"
|
43082 |
-
"
|
43083 |
-
"
|
43084 |
-
"
|
43085 |
-
"
|
43086 |
-
"
|
43087 |
-
"
|
43088 |
-
"
|
43089 |
-
"
|
43090 |
-
"
|
43091 |
-
"
|
43092 |
-
"
|
43093 |
-
"
|
43094 |
-
"
|
43095 |
-
"
|
43096 |
-
"
|
43097 |
-
"
|
43098 |
-
"
|
43099 |
-
"
|
43100 |
-
"
|
43101 |
-
"
|
43102 |
-
"
|
43103 |
-
"
|
43104 |
-
"
|
43105 |
-
"
|
43106 |
-
"
|
43107 |
-
"
|
43108 |
-
"
|
43109 |
-
"
|
43110 |
-
"
|
43111 |
-
"
|
43112 |
-
"
|
43113 |
-
"
|
43114 |
-
"
|
43115 |
-
"
|
43116 |
-
"
|
43117 |
-
"
|
43118 |
-
"
|
43119 |
-
"
|
43120 |
-
"
|
43121 |
-
"
|
43122 |
-
"
|
43123 |
-
"
|
43124 |
-
"
|
43125 |
-
"
|
43126 |
-
"
|
43127 |
-
"
|
43128 |
-
"
|
43129 |
-
"
|
43130 |
-
"
|
43131 |
-
"
|
43132 |
-
"
|
43133 |
-
"
|
43134 |
-
"
|
43135 |
-
"
|
43136 |
-
"
|
43137 |
-
"
|
43138 |
-
"
|
43139 |
-
"
|
43140 |
-
"
|
43141 |
-
"
|
43142 |
-
"
|
43143 |
-
"
|
43144 |
-
"
|
43145 |
-
"
|
43146 |
-
"
|
43147 |
-
"
|
43148 |
-
"
|
43149 |
-
"
|
43150 |
-
"
|
43151 |
-
"
|
43152 |
-
"
|
43153 |
-
"
|
43154 |
-
"
|
43155 |
-
"
|
43156 |
-
"
|
43157 |
-
"
|
43158 |
-
"
|
43159 |
-
"
|
43160 |
-
"
|
43161 |
-
"
|
43162 |
-
"
|
43163 |
-
"
|
43164 |
-
"
|
43165 |
-
"
|
43166 |
-
"
|
43167 |
-
"
|
43168 |
-
"
|
43169 |
-
"
|
43170 |
-
"
|
43171 |
-
"
|
43172 |
-
"
|
43173 |
-
"
|
43174 |
-
"
|
43175 |
-
"
|
43176 |
-
"
|
43177 |
-
"
|
43178 |
-
"
|
43179 |
-
"
|
43180 |
-
"
|
43181 |
-
"
|
43182 |
-
"
|
43183 |
-
"
|
43184 |
-
"
|
43185 |
-
"
|
43186 |
-
"
|
43187 |
-
"
|
43188 |
-
"
|
43189 |
-
"
|
43190 |
-
"
|
43191 |
-
"
|
43192 |
-
"
|
43193 |
-
"
|
43194 |
-
"
|
43195 |
-
"
|
43196 |
-
"
|
43197 |
-
"
|
43198 |
-
"
|
43199 |
-
"
|
43200 |
-
"
|
43201 |
-
"
|
43202 |
-
"
|
43203 |
-
"
|
43204 |
-
"
|
43205 |
-
"
|
43206 |
-
"
|
43207 |
-
"
|
43208 |
-
"
|
43209 |
-
"
|
43210 |
-
"
|
43211 |
-
"
|
43212 |
-
"
|
43213 |
-
"
|
43214 |
-
"
|
43215 |
-
"
|
43216 |
-
"
|
43217 |
-
"
|
43218 |
-
"
|
43219 |
-
"
|
43220 |
-
"
|
43221 |
-
"
|
43222 |
-
"
|
43223 |
-
"
|
43224 |
-
"
|
43225 |
-
"
|
43226 |
-
"
|
43227 |
-
"
|
43228 |
-
"
|
43229 |
-
"
|
43230 |
-
"
|
43231 |
-
"
|
43232 |
-
"
|
43233 |
-
"
|
43234 |
-
"
|
43235 |
-
"
|
43236 |
-
"
|
43237 |
-
"
|
43238 |
-
"
|
43239 |
-
"
|
43240 |
-
"
|
43241 |
-
"
|
43242 |
-
"
|
43243 |
-
"
|
43244 |
-
"
|
43245 |
-
"
|
43246 |
-
"
|
43247 |
-
"
|
43248 |
-
"
|
43249 |
-
"
|
43250 |
-
"
|
43251 |
-
"
|
43252 |
-
"
|
43253 |
-
"
|
43254 |
-
"
|
43255 |
-
"
|
43256 |
-
"
|
43257 |
-
"
|
43258 |
-
"
|
43259 |
-
"
|
43260 |
-
"
|
43261 |
-
"
|
43262 |
-
"
|
43263 |
-
"
|
43264 |
-
"
|
43265 |
-
"
|
43266 |
-
"
|
43267 |
-
"
|
43268 |
-
"
|
43269 |
-
"
|
43270 |
-
"
|
43271 |
-
"
|
43272 |
-
"
|
43273 |
-
"
|
43274 |
-
"
|
43275 |
-
"
|
43276 |
-
"
|
43277 |
-
"
|
43278 |
-
"
|
43279 |
-
"
|
43280 |
-
"
|
43281 |
-
"
|
43282 |
-
"
|
43283 |
-
"
|
43284 |
-
"
|
43285 |
-
"
|
43286 |
-
"
|
43287 |
-
"
|
43288 |
-
"
|
43289 |
-
"
|
43290 |
-
"
|
43291 |
-
"
|
43292 |
-
"
|
43293 |
-
"
|
43294 |
-
"
|
43295 |
-
"
|
43296 |
-
"
|
43297 |
-
"
|
43298 |
-
"
|
43299 |
-
"
|
43300 |
-
"
|
43301 |
-
"
|
43302 |
-
"
|
43303 |
-
"
|
43304 |
-
"
|
43305 |
-
"
|
43306 |
-
"
|
43307 |
-
"
|
43308 |
-
"
|
43309 |
-
"
|
43310 |
-
"
|
43311 |
-
"
|
43312 |
-
"
|
43313 |
-
"
|
43314 |
-
"
|
43315 |
-
"
|
43316 |
-
"
|
43317 |
-
"
|
43318 |
-
"
|
43319 |
-
"
|
43320 |
-
"
|
43321 |
-
"
|
43322 |
-
"
|
43323 |
-
"
|
43324 |
-
"
|
43325 |
-
"
|
43326 |
-
"
|
43327 |
-
"
|
43328 |
-
"
|
43329 |
-
"
|
43330 |
-
"
|
43331 |
-
"
|
43332 |
-
"
|
43333 |
-
"
|
43334 |
-
"
|
43335 |
-
"
|
43336 |
-
"
|
43337 |
-
"
|
43338 |
-
"
|
43339 |
-
"
|
43340 |
-
"
|
43341 |
-
"
|
43342 |
-
"
|
43343 |
-
"
|
43344 |
-
"
|
43345 |
-
"
|
43346 |
-
"
|
43347 |
-
"
|
43348 |
-
"
|
43349 |
-
"
|
43350 |
-
"
|
43351 |
-
"
|
43352 |
-
"
|
43353 |
-
"
|
43354 |
-
"
|
43355 |
-
"
|
43356 |
-
"
|
43357 |
-
"
|
43358 |
-
"
|
43359 |
-
"
|
43360 |
-
"
|
43361 |
-
"
|
43362 |
-
"
|
43363 |
-
"
|
43364 |
-
"
|
43365 |
"錮": 43236,
|
43366 |
-
"
|
43367 |
-
"
|
43368 |
-
"
|
43369 |
-
"
|
43370 |
-
"
|
43371 |
-
"
|
43372 |
-
"
|
43373 |
-
"
|
43374 |
-
"
|
43375 |
-
"
|
43376 |
-
"
|
43377 |
-
"
|
43378 |
-
"
|
43379 |
-
"
|
43380 |
-
"
|
43381 |
-
"
|
43382 |
-
"
|
43383 |
-
"
|
43384 |
-
"
|
43385 |
-
"
|
43386 |
-
"
|
43387 |
-
"
|
43388 |
-
"
|
43389 |
-
"
|
43390 |
-
"
|
43391 |
-
"
|
43392 |
-
"
|
43393 |
-
"
|
43394 |
-
"
|
43395 |
-
"
|
43396 |
-
"
|
43397 |
-
"
|
43398 |
-
"
|
43399 |
-
"
|
43400 |
-
"
|
43401 |
-
"
|
43402 |
-
"
|
43403 |
-
"
|
43404 |
-
"
|
43405 |
-
"
|
43406 |
-
"
|
43407 |
-
"
|
43408 |
-
"
|
43409 |
-
"
|
43410 |
-
"
|
43411 |
-
"
|
43412 |
-
"
|
43413 |
-
"
|
43414 |
-
"
|
43415 |
-
"
|
43416 |
-
"
|
43417 |
-
"
|
43418 |
-
"
|
43419 |
-
"
|
43420 |
-
"
|
43421 |
-
"
|
43422 |
-
"
|
43423 |
-
"
|
43424 |
-
"
|
43425 |
-
"
|
43426 |
-
"
|
43427 |
-
"
|
43428 |
-
"
|
43429 |
-
"
|
43430 |
-
"
|
43431 |
-
"
|
43432 |
-
"
|
43433 |
-
"
|
43434 |
-
"
|
43435 |
-
"
|
43436 |
-
"
|
43437 |
-
"
|
43438 |
-
"
|
43439 |
-
"
|
43440 |
-
"
|
43441 |
-
"
|
43442 |
-
"
|
43443 |
-
"
|
43444 |
-
"
|
43445 |
-
"
|
43446 |
},
|
43447 |
"merges": [
|
43448 |
"▁ t",
|
|
|
125 |
"end_of_word_suffix": null,
|
126 |
"fuse_unk": true,
|
127 |
"byte_fallback": false,
|
128 |
+
"ignore_merges": false,
|
129 |
"vocab": {
|
130 |
"<unk>": 0,
|
131 |
"<s>": 1,
|
|
|
42927 |
"勅": 42797,
|
42928 |
"婿": 42798,
|
42929 |
"魯": 42799,
|
42930 |
+
"禊": 42800,
|
42931 |
+
"羞": 42801,
|
42932 |
+
"瀑": 42802,
|
42933 |
+
"斂": 42803,
|
42934 |
+
"謁": 42804,
|
42935 |
+
"諠": 42805,
|
42936 |
+
"𠮟": 42806,
|
42937 |
+
"蓉": 42807,
|
42938 |
+
"悉": 42808,
|
42939 |
+
"劾": 42809,
|
42940 |
+
"逬": 42810,
|
42941 |
+
"耘": 42811,
|
42942 |
+
"裨": 42812,
|
42943 |
+
"獻": 42813,
|
42944 |
+
"逓": 42814,
|
42945 |
+
"儕": 42815,
|
42946 |
+
"聘": 42816,
|
42947 |
+
"誥": 42817,
|
42948 |
+
"悌": 42818,
|
42949 |
+
"憮": 42819,
|
42950 |
+
"盡": 42820,
|
42951 |
+
"脩": 42821,
|
42952 |
+
"詭": 42822,
|
42953 |
+
"羣": 42823,
|
42954 |
+
"賣": 42824,
|
42955 |
+
"竢": 42825,
|
42956 |
+
"痘": 42826,
|
42957 |
+
"楯": 42827,
|
42958 |
+
"畏": 42828,
|
42959 |
+
"戮": 42829,
|
42960 |
+
"飜": 42830,
|
42961 |
+
"痍": 42831,
|
42962 |
+
"恰": 42832,
|
42963 |
+
"忒": 42833,
|
42964 |
+
"縱": 42834,
|
42965 |
+
"爭": 42835,
|
42966 |
+
"悖": 42836,
|
42967 |
+
"哨": 42837,
|
42968 |
+
"苟": 42838,
|
42969 |
+
"璽": 42839,
|
42970 |
+
"攘": 42840,
|
42971 |
+
"狡": 42841,
|
42972 |
+
"恣": 42842,
|
42973 |
+
"猗": 42843,
|
42974 |
+
"黽": 42844,
|
42975 |
+
"證": 42845,
|
42976 |
+
"愈": 42846,
|
42977 |
+
"賤": 42847,
|
42978 |
+
"佩": 42848,
|
42979 |
+
"畢": 42849,
|
42980 |
+
"棍": 42850,
|
42981 |
+
"筭": 42851,
|
42982 |
+
"讓": 42852,
|
42983 |
+
"頗": 42853,
|
42984 |
+
"譬": 42854,
|
42985 |
+
"馭": 42855,
|
42986 |
+
"遁": 42856,
|
42987 |
+
"專": 42857,
|
42988 |
+
"紂": 42858,
|
42989 |
+
"險": 42859,
|
42990 |
+
"瑟": 42860,
|
42991 |
+
"盈": 42861,
|
42992 |
+
"傅": 42862,
|
42993 |
+
"亥": 42863,
|
42994 |
+
"墮": 42864,
|
42995 |
+
"纉": 42865,
|
42996 |
+
"伍": 42866,
|
42997 |
+
"緡": 42867,
|
42998 |
+
"倩": 42868,
|
42999 |
+
"虧": 42869,
|
43000 |
+
"赫": 42870,
|
43001 |
+
"鎭": 42871,
|
43002 |
+
"爲": 42872,
|
43003 |
+
"饗": 42873,
|
43004 |
+
"鳶": 42874,
|
43005 |
+
"駢": 42875,
|
43006 |
+
"駑": 42876,
|
43007 |
+
"掩": 42877,
|
43008 |
+
"洽": 42878,
|
43009 |
+
"釀": 42879,
|
43010 |
+
"逈": 42880,
|
43011 |
+
"蹊": 42881,
|
43012 |
+
"蜀": 42882,
|
43013 |
+
"醨": 42883,
|
43014 |
+
"爰": 42884,
|
43015 |
+
"鄙": 42885,
|
43016 |
+
"輕": 42886,
|
43017 |
+
"楔": 42887,
|
43018 |
+
"陋": 42888,
|
43019 |
+
"騁": 42889,
|
43020 |
+
"醗": 42890,
|
43021 |
+
"猾": 42891,
|
43022 |
+
"毘": 42892,
|
43023 |
+
"咀": 42893,
|
43024 |
+
"絜": 42894,
|
43025 |
+
"淇": 42895,
|
43026 |
+
"巌": 42896,
|
43027 |
+
"榜": 42897,
|
43028 |
+
"蛾": 42898,
|
43029 |
+
"羸": 42899,
|
43030 |
+
"敖": 42900,
|
43031 |
+
"疆": 42901,
|
43032 |
+
"驅": 42902,
|
43033 |
+
"騷": 42903,
|
43034 |
+
"趨": 42904,
|
43035 |
+
"靜": 42905,
|
43036 |
+
"營": 42906,
|
43037 |
+
"詔": 42907,
|
43038 |
+
"衷": 42908,
|
43039 |
+
"廠": 42909,
|
43040 |
+
"僨": 42910,
|
43041 |
+
"殷": 42911,
|
43042 |
+
"皈": 42912,
|
43043 |
+
"厭": 42913,
|
43044 |
+
"覺": 42914,
|
43045 |
+
"聯": 42915,
|
43046 |
+
"惟": 42916,
|
43047 |
+
"竟": 42917,
|
43048 |
+
"奧": 42918,
|
43049 |
+
"辭": 42919,
|
43050 |
+
"斥": 42920,
|
43051 |
+
"撹": 42921,
|
43052 |
+
"擾": 42922,
|
43053 |
+
"浹": 42923,
|
43054 |
+
"恂": 42924,
|
43055 |
+
"儡": 42925,
|
43056 |
+
"寔": 42926,
|
43057 |
+
"瞻": 42927,
|
43058 |
+
"舷": 42928,
|
43059 |
+
"靱": 42929,
|
43060 |
+
"鋒": 42930,
|
43061 |
+
"雜": 42931,
|
43062 |
+
"禽": 42932,
|
43063 |
+
"拔": 42933,
|
43064 |
+
"麾": 42934,
|
43065 |
+
"檣": 42935,
|
43066 |
+
"俄": 42936,
|
43067 |
+
"彭": 42937,
|
43068 |
+
"堡": 42938,
|
43069 |
+
"壯": 42939,
|
43070 |
+
"搆": 42940,
|
43071 |
+
"恆": 42941,
|
43072 |
+
"臥": 42942,
|
43073 |
+
"諺": 42943,
|
43074 |
+
"酉": 42944,
|
43075 |
+
"扈": 42945,
|
43076 |
+
"胤": 42946,
|
43077 |
+
"苅": 42947,
|
43078 |
+
"屡": 42948,
|
43079 |
+
"膠": 42949,
|
43080 |
+
"咤": 42950,
|
43081 |
+
"嚴": 42951,
|
43082 |
+
"彝": 42952,
|
43083 |
+
"巓": 42953,
|
43084 |
+
"濶": 42954,
|
43085 |
+
"糢": 42955,
|
43086 |
+
"攝": 42956,
|
43087 |
+
"歸": 42957,
|
43088 |
+
"賈": 42958,
|
43089 |
+
"黠": 42959,
|
43090 |
+
"貽": 42960,
|
43091 |
+
"諮": 42961,
|
43092 |
+
"譯": 42962,
|
43093 |
+
"飮": 42963,
|
43094 |
+
"胥": 42964,
|
43095 |
+
"矜": 42965,
|
43096 |
+
"儁": 42966,
|
43097 |
+
"饜": 42967,
|
43098 |
+
"筵": 42968,
|
43099 |
+
"塡": 42969,
|
43100 |
+
"蠻": 42970,
|
43101 |
+
"蘐": 42971,
|
43102 |
+
"臘": 42972,
|
43103 |
+
"瑣": 42973,
|
43104 |
+
"庸": 42974,
|
43105 |
+
"擕": 42975,
|
43106 |
+
"黨": 42976,
|
43107 |
+
"僩": 42977,
|
43108 |
+
"沮": 42978,
|
43109 |
+
"窩": 42979,
|
43110 |
+
"諫": 42980,
|
43111 |
+
"蜻": 42981,
|
43112 |
+
"摧": 42982,
|
43113 |
+
"蛯": 42983,
|
43114 |
+
"憾": 42984,
|
43115 |
+
"厘": 42985,
|
43116 |
+
"攬": 42986,
|
43117 |
+
"搖": 42987,
|
43118 |
+
"亂": 42988,
|
43119 |
+
"峻": 42989,
|
43120 |
+
"菑": 42990,
|
43121 |
+
"憊": 42991,
|
43122 |
+
"寃": 42992,
|
43123 |
+
"珊": 42993,
|
43124 |
+
"丞": 42994,
|
43125 |
+
"毆": 42995,
|
43126 |
+
"歡": 42996,
|
43127 |
+
"籌": 42997,
|
43128 |
+
"槓": 42998,
|
43129 |
+
"淺": 42999,
|
43130 |
+
"濟": 43000,
|
43131 |
+
"兒": 43001,
|
43132 |
+
"灼": 43002,
|
43133 |
+
"舍": 43003,
|
43134 |
+
"徽": 43004,
|
43135 |
+
"豈": 43005,
|
43136 |
+
"僇": 43006,
|
43137 |
+
"蹄": 43007,
|
43138 |
+
"壓": 43008,
|
43139 |
+
"朧": 43009,
|
43140 |
+
"儒": 43010,
|
43141 |
+
"楷": 43011,
|
43142 |
+
"禀": 43012,
|
43143 |
+
"毋": 43013,
|
43144 |
+
"廟": 43014,
|
43145 |
+
"煥": 43015,
|
43146 |
+
"岌": 43016,
|
43147 |
+
"觸": 43017,
|
43148 |
+
"簪": 43018,
|
43149 |
+
"匐": 43019,
|
43150 |
+
"薛": 43020,
|
43151 |
+
"辜": 43021,
|
43152 |
+
"巍": 43022,
|
43153 |
+
"麿": 43023,
|
43154 |
+
"皺": 43024,
|
43155 |
+
"臀": 43025,
|
43156 |
+
"頒": 43026,
|
43157 |
+
"勞": 43027,
|
43158 |
+
"虔": 43028,
|
43159 |
+
"嚼": 43029,
|
43160 |
+
"邑": 43030,
|
43161 |
+
"嬰": 43031,
|
43162 |
+
"勒": 43032,
|
43163 |
+
"鬢": 43033,
|
43164 |
+
"囮": 43034,
|
43165 |
+
"誦": 43035,
|
43166 |
+
"蓁": 43036,
|
43167 |
+
"兩": 43037,
|
43168 |
+
"篆": 43038,
|
43169 |
+
"勸": 43039,
|
43170 |
+
"隨": 43040,
|
43171 |
+
"啜": 43041,
|
43172 |
+
"舜": 43042,
|
43173 |
+
"尸": 43043,
|
43174 |
+
"茲": 43044,
|
43175 |
+
"竭": 43045,
|
43176 |
+
"岨": 43046,
|
43177 |
+
"戲": 43047,
|
43178 |
+
"陷": 43048,
|
43179 |
+
"綮": 43049,
|
43180 |
+
"歐": 43050,
|
43181 |
+
"襄": 43051,
|
43182 |
+
"廢": 43052,
|
43183 |
+
"綬": 43053,
|
43184 |
+
"戾": 43054,
|
43185 |
+
"夭": 43055,
|
43186 |
+
"遑": 43056,
|
43187 |
+
"僞": 43057,
|
43188 |
+
"駸": 43058,
|
43189 |
+
"抔": 43059,
|
43190 |
+
"耻": 43060,
|
43191 |
+
"禎": 43061,
|
43192 |
+
"畧": 43062,
|
43193 |
+
"逼": 43063,
|
43194 |
+
"圓": 43064,
|
43195 |
+
"姦": 43065,
|
43196 |
+
"憬": 43066,
|
43197 |
+
"鑒": 43067,
|
43198 |
+
"僻": 43068,
|
43199 |
+
"鐸": 43069,
|
43200 |
+
"濤": 43070,
|
43201 |
+
"隋": 43071,
|
43202 |
+
"鍾": 43072,
|
43203 |
+
"滌": 43073,
|
43204 |
+
"諛": 43074,
|
43205 |
+
"挈": 43075,
|
43206 |
+
"崗": 43076,
|
43207 |
+
"匍": 43077,
|
43208 |
+
"緝": 43078,
|
43209 |
+
"澁": 43079,
|
43210 |
+
"儼": 43080,
|
43211 |
+
"僥": 43081,
|
43212 |
+
"闢": 43082,
|
43213 |
+
"聲": 43083,
|
43214 |
+
"蛉": 43084,
|
43215 |
+
"酋": 43085,
|
43216 |
+
"恙": 43086,
|
43217 |
+
"蹈": 43087,
|
43218 |
+
"欵": 43088,
|
43219 |
+
"唖": 43089,
|
43220 |
+
"槙": 43090,
|
43221 |
+
"裴": 43091,
|
43222 |
+
"吻": 43092,
|
43223 |
+
"蕃": 43093,
|
43224 |
+
"熙": 43094,
|
43225 |
+
"糀": 43095,
|
43226 |
+
"彗": 43096,
|
43227 |
+
"臾": 43097,
|
43228 |
+
"寵": 43098,
|
43229 |
+
"勵": 43099,
|
43230 |
+
"懺": 43100,
|
43231 |
+
"屬": 43101,
|
43232 |
+
"鸞": 43102,
|
43233 |
+
"隱": 43103,
|
43234 |
+
"諂": 43104,
|
43235 |
+
"尹": 43105,
|
43236 |
+
"舒": 43106,
|
43237 |
+
"餘": 43107,
|
43238 |
+
"贖": 43108,
|
43239 |
+
"嗚": 43109,
|
43240 |
+
"闍": 43110,
|
43241 |
+
"醉": 43111,
|
43242 |
+
"縣": 43112,
|
43243 |
+
"朕": 43113,
|
43244 |
+
"彥": 43114,
|
43245 |
+
"醵": 43115,
|
43246 |
+
"殉": 43116,
|
43247 |
+
"淫": 43117,
|
43248 |
+
"恕": 43118,
|
43249 |
+
"纂": 43119,
|
43250 |
+
"錐": 43120,
|
43251 |
+
"慄": 43121,
|
43252 |
+
"玻": 43122,
|
43253 |
+
"敎": 43123,
|
43254 |
+
"袈": 43124,
|
43255 |
+
"拮": 43125,
|
43256 |
+
"裔": 43126,
|
43257 |
+
"沃": 43127,
|
43258 |
+
"雉": 43128,
|
43259 |
+
"擔": 43129,
|
43260 |
+
"闡": 43130,
|
43261 |
+
"肚": 43131,
|
43262 |
+
"陪": 43132,
|
43263 |
+
"亦": 43133,
|
43264 |
+
"渺": 43134,
|
43265 |
+
"藥": 43135,
|
43266 |
+
"鵠": 43136,
|
43267 |
+
"忿": 43137,
|
43268 |
+
"傚": 43138,
|
43269 |
+
"謭": 43139,
|
43270 |
+
"雖": 43140,
|
43271 |
+
"胖": 43141,
|
43272 |
+
"匈": 43142,
|
43273 |
+
"黃": 43143,
|
43274 |
+
"翰": 43144,
|
43275 |
+
"埸": 43145,
|
43276 |
+
"芒": 43146,
|
43277 |
+
"葢": 43147,
|
43278 |
+
"罕": 43148,
|
43279 |
+
"肆": 43149,
|
43280 |
+
"饒": 43150,
|
43281 |
+
"諷": 43151,
|
43282 |
+
"弐": 43152,
|
43283 |
+
"讀": 43153,
|
43284 |
+
"鵬": 43154,
|
43285 |
+
"闊": 43155,
|
43286 |
+
"盍": 43156,
|
43287 |
+
"歟": 43157,
|
43288 |
+
"俟": 43158,
|
43289 |
+
"懥": 43159,
|
43290 |
+
"埠": 43160,
|
43291 |
+
"斤": 43161,
|
43292 |
+
"龕": 43162,
|
43293 |
+
"恤": 43163,
|
43294 |
+
"舊": 43164,
|
43295 |
+
"倹": 43165,
|
43296 |
+
"諱": 43166,
|
43297 |
+
"雞": 43167,
|
43298 |
+
"孛": 43168,
|
43299 |
+
"尤": 43169,
|
43300 |
+
"團": 43170,
|
43301 |
+
"汀": 43171,
|
43302 |
+
"盜": 43172,
|
43303 |
+
"篩": 43173,
|
43304 |
+
"鐵": 43174,
|
43305 |
+
"皓": 43175,
|
43306 |
+
"揠": 43176,
|
43307 |
+
"觀": 43177,
|
43308 |
+
"羹": 43178,
|
43309 |
+
"豕": 43179,
|
43310 |
+
"帥": 43180,
|
43311 |
+
"隧": 43181,
|
43312 |
+
"獸": 43182,
|
43313 |
+
"碇": 43183,
|
43314 |
+
"殲": 43184,
|
43315 |
+
"斷": 43185,
|
43316 |
+
"桀": 43186,
|
43317 |
+
"尙": 43187,
|
43318 |
+
"愼": 43188,
|
43319 |
+
"蔗": 43189,
|
43320 |
+
"熾": 43190,
|
43321 |
+
"續": 43191,
|
43322 |
+
"皷": 43192,
|
43323 |
+
"畫": 43193,
|
43324 |
+
"丙": 43194,
|
43325 |
+
"茫": 43195,
|
43326 |
+
"揜": 43196,
|
43327 |
+
"靡": 43197,
|
43328 |
+
"墾": 43198,
|
43329 |
+
"燮": 43199,
|
43330 |
+
"逕": 43200,
|
43331 |
+
"媢": 43201,
|
43332 |
+
"穆": 43202,
|
43333 |
+
"覊": 43203,
|
43334 |
+
"漸": 43204,
|
43335 |
+
"樣": 43205,
|
43336 |
+
"恪": 43206,
|
43337 |
+
"頴": 43207,
|
43338 |
+
"咎": 43208,
|
43339 |
+
"宥": 43209,
|
43340 |
+
"礦": 43210,
|
43341 |
+
"貶": 43211,
|
43342 |
+
"膽": 43212,
|
43343 |
+
"慘": 43213,
|
43344 |
+
"貮": 43214,
|
43345 |
+
"傀": 43215,
|
43346 |
+
"懼": 43216,
|
43347 |
+
"籾": 43217,
|
43348 |
+
"吏": 43218,
|
43349 |
+
"榮": 43219,
|
43350 |
+
"繭": 43220,
|
43351 |
+
"撿": 43221,
|
43352 |
+
"甞": 43222,
|
43353 |
+
"遲": 43223,
|
43354 |
+
"驗": 43224,
|
43355 |
+
"跋": 43225,
|
43356 |
+
"滊": 43226,
|
43357 |
+
"糾": 43227,
|
43358 |
+
"諧": 43228,
|
43359 |
+
"剝": 43229,
|
43360 |
+
"肅": 43230,
|
43361 |
+
"柘": 43231,
|
43362 |
+
"辨": 43232,
|
43363 |
+
"據": 43233,
|
43364 |
+
"誣": 43234,
|
43365 |
+
"竄": 43235,
|
43366 |
"錮": 43236,
|
43367 |
+
"歎": 43237,
|
43368 |
+
"窘": 43238,
|
43369 |
+
"愧": 43239,
|
43370 |
+
"聽": 43240,
|
43371 |
+
"臂": 43241,
|
43372 |
+
"妾": 43242,
|
43373 |
+
"駭": 43243,
|
43374 |
+
"頽": 43244,
|
43375 |
+
"鏃": 43245,
|
43376 |
+
"亨": 43246,
|
43377 |
+
"殘": 43247,
|
43378 |
+
"壹": 43248,
|
43379 |
+
"轍": 43249,
|
43380 |
+
"嘲": 43250,
|
43381 |
+
"爨": 43251,
|
43382 |
+
"驕": 43252,
|
43383 |
+
"滿": 43253,
|
43384 |
+
"碩": 43254,
|
43385 |
+
"碍": 43255,
|
43386 |
+
"虞": 43256,
|
43387 |
+
"賂": 43257,
|
43388 |
+
"脹": 43258,
|
43389 |
+
"頰": 43259,
|
43390 |
+
"恊": 43260,
|
43391 |
+
"揆": 43261,
|
43392 |
+
"躓": 43262,
|
43393 |
+
"辟": 43263,
|
43394 |
+
"倖": 43264,
|
43395 |
+
"謨": 43265,
|
43396 |
+
"梃": 43266,
|
43397 |
+
"罷": 43267,
|
43398 |
+
"覽": 43268,
|
43399 |
+
"宍": 43269,
|
43400 |
+
"諟": 43270,
|
43401 |
+
"欷": 43271,
|
43402 |
+
"繼": 43272,
|
43403 |
+
"歔": 43273,
|
43404 |
+
"埴": 43274,
|
43405 |
+
"舅": 43275,
|
43406 |
+
"啻": 43276,
|
43407 |
+
"戊": 43277,
|
43408 |
+
"儘": 43278,
|
43409 |
+
"處": 43279,
|
43410 |
+
"巖": 43280,
|
43411 |
+
"豫": 43281,
|
43412 |
+
"擧": 43282,
|
43413 |
+
"竊": 43283,
|
43414 |
+
"狸": 43284,
|
43415 |
+
"蘊": 43285,
|
43416 |
+
"橙": 43286,
|
43417 |
+
"鰭": 43287,
|
43418 |
+
"甍": 43288,
|
43419 |
+
"菰": 43289,
|
43420 |
+
"價": 43290,
|
43421 |
+
"揖": 43291,
|
43422 |
+
"嫡": 43292,
|
43423 |
+
"耆": 43293,
|
43424 |
+
"葦": 43294,
|
43425 |
+
"堯": 43295,
|
43426 |
+
"戌": 43296,
|
43427 |
+
"獨": 43297,
|
43428 |
+
"汝": 43298,
|
43429 |
+
"澳": 43299,
|
43430 |
+
"呵": 43300,
|
43431 |
+
"憚": 43301,
|
43432 |
+
"嚮": 43302,
|
43433 |
+
"翹": 43303,
|
43434 |
+
"孟": 43304,
|
43435 |
+
"鄭": 43305,
|
43436 |
+
"壤": 43306,
|
43437 |
+
"竝": 43307,
|
43438 |
+
"菉": 43308,
|
43439 |
+
"烟": 43309,
|
43440 |
+
"閻": 43310,
|
43441 |
+
"禮": 43311,
|
43442 |
+
"犂": 43312,
|
43443 |
+
"欣": 43313,
|
43444 |
+
"惡": 43314,
|
43445 |
+
"鉤": 43315,
|
43446 |
+
"艱": 43316
|
43447 |
},
|
43448 |
"merges": [
|
43449 |
"▁ t",
|
tokenizer_config.json
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
{
|
2 |
"add_bos_token": true,
|
3 |
"add_eos_token": false,
|
|
|
4 |
"added_tokens_decoder": {
|
5 |
"0": {
|
6 |
"content": "<unk>",
|
@@ -31,6 +32,7 @@
|
|
31 |
"clean_up_tokenization_spaces": false,
|
32 |
"cls_token": "<s>",
|
33 |
"eos_token": "</s>",
|
|
|
34 |
"mask_token": "<unk>",
|
35 |
"model_max_length": 4096,
|
36 |
"pad_token": "</s>",
|
|
|
1 |
{
|
2 |
"add_bos_token": true,
|
3 |
"add_eos_token": false,
|
4 |
+
"add_prefix_space": null,
|
5 |
"added_tokens_decoder": {
|
6 |
"0": {
|
7 |
"content": "<unk>",
|
|
|
32 |
"clean_up_tokenization_spaces": false,
|
33 |
"cls_token": "<s>",
|
34 |
"eos_token": "</s>",
|
35 |
+
"legacy": true,
|
36 |
"mask_token": "<unk>",
|
37 |
"model_max_length": 4096,
|
38 |
"pad_token": "</s>",
|
upos.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
-
from transformers import TokenClassificationPipeline
|
2 |
-
from transformers.modeling_outputs import TokenClassifierOutput
|
3 |
|
4 |
class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
|
5 |
def __init__(self,**kwargs):
|
@@ -40,41 +39,3 @@ class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
|
|
40 |
t["text"]=model_outputs["sentence"][t["start"]:t["end"]]
|
41 |
return w
|
42 |
|
43 |
-
class RawTokenClassificationPipeline(TokenClassificationPipeline):
|
44 |
-
def check_model_type(self,supported_models):
|
45 |
-
pass
|
46 |
-
|
47 |
-
class MistralForTokenClassification(MistralPreTrainedModel):
|
48 |
-
def __init__(self,config):
|
49 |
-
from torch import nn
|
50 |
-
super().__init__(config)
|
51 |
-
self.num_labels=config.num_labels
|
52 |
-
self.model=MistralModel(config)
|
53 |
-
if hasattr(config,"classifier_dropout") and config.classifier_dropout is not None:
|
54 |
-
classifier_dropout=config.classifier_dropout
|
55 |
-
elif hasattr(config,"hidden_dropout") and config.hidden_dropout is not None:
|
56 |
-
classifier_dropout=config.hidden_dropout
|
57 |
-
else:
|
58 |
-
classifier_dropout=0.1
|
59 |
-
self.dropout=nn.Dropout(classifier_dropout)
|
60 |
-
self.classifier=nn.Linear(config.hidden_size,config.num_labels)
|
61 |
-
self.post_init()
|
62 |
-
def get_input_embeddings(self):
|
63 |
-
return self.model.embed_tokens
|
64 |
-
def set_input_embeddings(self,value):
|
65 |
-
self.model.embed_tokens=value
|
66 |
-
def forward(self,input_ids=None,past_key_values=None,attention_mask=None,position_ids=None,inputs_embeds=None,labels=None,use_cache=None,output_attentions=None,output_hidden_states=None,return_dict=None):
|
67 |
-
return_dict=return_dict if return_dict is not None else self.config.use_return_dict
|
68 |
-
transformer_outputs=self.model(input_ids,past_key_values=past_key_values,attention_mask=attention_mask,position_ids=position_ids,inputs_embeds=inputs_embeds,use_cache=use_cache,output_attentions=output_attentions,output_hidden_states=output_hidden_states,return_dict=return_dict)
|
69 |
-
hidden_states=transformer_outputs[0]
|
70 |
-
hidden_states=self.dropout(hidden_states)
|
71 |
-
logits=self.classifier(hidden_states)
|
72 |
-
loss=None
|
73 |
-
if labels is not None:
|
74 |
-
from torch import nn
|
75 |
-
loss_fct=nn.CrossEntropyLoss()
|
76 |
-
loss=loss_fct(logits.view(-1,self.num_labels),labels.view(-1))
|
77 |
-
if not return_dict:
|
78 |
-
output=(logits,)+transformer_outputs[2:]
|
79 |
-
return ((loss,)+output) if loss is not None else output
|
80 |
-
return TokenClassifierOutput(loss=loss,logits=logits,hidden_states=transformer_outputs.hidden_states,attentions=transformer_outputs.attentions)
|
|
|
1 |
+
from transformers import TokenClassificationPipeline
|
|
|
2 |
|
3 |
class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
|
4 |
def __init__(self,**kwargs):
|
|
|
39 |
t["text"]=model_outputs["sentence"][t["start"]:t["end"]]
|
40 |
return w
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|