diff --git a/KTeleBERT/__pycache__/config.cpython-38.pyc b/KTeleBERT/__pycache__/config.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b210093bcab8230e6034a3a0f3db034bb4d80d7 Binary files /dev/null and b/KTeleBERT/__pycache__/config.cpython-38.pyc differ diff --git a/KTeleBERT/config.py b/KTeleBERT/config.py new file mode 100644 index 0000000000000000000000000000000000000000..643d7a1270213ddd48d922d30c9a450770b290b1 --- /dev/null +++ b/KTeleBERT/config.py @@ -0,0 +1,234 @@ +import os.path as osp +import numpy as np +import random +import torch +from easydict import EasyDict as edict +import argparse + + +LAYER_MAPPING = { + 0: 'od_layer_0', + 1: 'od_layer_1', + 2: 'od_layer_2', +} + + +class cfg(): + def __init__(self): + self.this_dir = osp.dirname(__file__) + # change + self.data_root = osp.abspath(osp.join(self.this_dir, '..', '..', 'data', '')) + + # TODO: add some static variable (The frequency of change is low) + + def get_args(self): + parser = argparse.ArgumentParser() + # ------------ base ------------ + parser.add_argument('--train_strategy', default=1, type=int) + parser.add_argument('--batch_size', default=64, type=int) + parser.add_argument('--batch_size_ke', default=14, type=int) + parser.add_argument('--batch_size_od', default=8, type=int) + parser.add_argument('--batch_size_ad', default=32, type=int) + + parser.add_argument('--epoch', default=15, type=int) + parser.add_argument("--save_model", default=1, type=int, choices=[0, 1]) + # 用transformer的 save_pretrain 方式保存 + parser.add_argument("--save_pretrain", default=0, type=int, choices=[0, 1]) + parser.add_argument("--from_pretrain", default=0, type=int, choices=[0, 1]) + + # torthlight + parser.add_argument("--no_tensorboard", default=False, action="store_true") + parser.add_argument("--exp_name", default="huawei_exp", type=str, help="Experiment name") + parser.add_argument("--dump_path", default="dump/", type=str, help="Experiment dump path") + parser.add_argument("--exp_id", default="ke256_raekt_ernie2_bs20_p3_c3_5e-6", type=str, help="Experiment ID") + # or 3407 + parser.add_argument("--random_seed", default=42, type=int) + # 数据参数 + parser.add_argument("--data_path", default="huawei", type=str, help="Experiment path") + parser.add_argument('--train_ratio', default=1, type=float, help='ratio for train/test') + parser.add_argument("--seq_data_name", default='Seq_data_base', type=str, help="seq_data 名字") + parser.add_argument("--kg_data_name", default='KG_data_base_rule', type=str, help="kg_data 名字") + parser.add_argument("--order_data_name", default='event_order_data', type=str, help="order_data 名字") + # TODO: add some dynamic variable + parser.add_argument("--model_name", default="MacBert", type=str, help="model name") + + # ------------ 训练阶段 ------------ + parser.add_argument("--scheduler", default="cos", type=str, choices=["linear", "cos"]) + parser.add_argument("--optim", default="adamw", type=str) + parser.add_argument("--adam_epsilon", default=1e-8, type=float) + parser.add_argument('--workers', type=int, default=8) + parser.add_argument('--accumulation_steps', type=int, default=6) + parser.add_argument('--accumulation_steps_ke', type=int, default=6) + parser.add_argument('--accumulation_steps_ad', type=int, default=6) + parser.add_argument('--accumulation_steps_od', type=int, default=6) + parser.add_argument("--train_together", default=0, type=int) + + # 3e-5 + parser.add_argument('--lr', type=float, default=1e-5) + # 逐层学习率衰减 + parser.add_argument("--LLRD", default=0, type=int, 
choices=[0, 1]) + parser.add_argument('--weight_decay', type=float, default=0.01) + parser.add_argument('--clip', type=float, default=1., help='gradient clipping') + parser.add_argument('--scheduler_steps', type=int, default=None, + help='total number of step for the scheduler, if None then scheduler_total_step = total_step') + parser.add_argument('--eval_step', default=100, type=int, help='evaluate each n step') + + # ------------ PLM ------------ + parser.add_argument('--maxlength', type=int, default=200) + parser.add_argument('--mlm_probability', type=float, default=0.15) + parser.add_argument('--final_mlm_probability', type=float, default=0.4) + parser.add_argument('--mlm_probability_increase', type=str, default="curve", choices=["linear", "curve"]) + parser.add_argument("--mask_stratege", default="rand", type=str, choices=["rand", "wwm", "domain"]) + # 前n个epoch 用rand,后面用wwm. multi-stage knowledge masking strategy + parser.add_argument("--ernie_stratege", default=-1, type=int) + # 用mlm任务进行训练,默认使用chinese_ref且添加新的special word + parser.add_argument("--use_mlm_task", default=1, type=int, choices=[0, 1]) + # 添加新的special word + parser.add_argument("--add_special_word", default=1, type=int, choices=[0, 1]) + # freeze + parser.add_argument("--freeze_layer", default=0, type=int, choices=[0, 1, 2, 3, 4]) + # 是否mask 特殊token + parser.add_argument("--special_token_mask", default=0, type=int, choices=[0, 1]) + parser.add_argument("--emb_init", default=1, type=int, choices=[0, 1]) + parser.add_argument("--cls_head_init", default=1, type=int, choices=[0, 1]) + # 是否使用自适应权重 + parser.add_argument("--use_awl", default=1, type=int, choices=[0, 1]) + parser.add_argument("--mask_loss_scale", default=1.0, type=float) + + # ------------ KGE ------------ + parser.add_argument('--ke_norm', type=int, default=1) + parser.add_argument('--ke_dim', type=int, default=768) + parser.add_argument('--ke_margin', type=float, default=1.0) + parser.add_argument('--neg_num', type=int, default=10) + parser.add_argument('--adv_temp', type=float, default=1.0, help='The temperature of sampling in self-adversarial negative sampling.') + # 5e-4 + parser.add_argument('--ke_lr', type=float, default=3e-5) + parser.add_argument('--only_ke_loss', type=int, default=0) + + # ------------ 数值embedding相关 ------------ + parser.add_argument('--use_NumEmb', type=int, default=1) + parser.add_argument("--contrastive_loss", default=1, type=int, choices=[0, 1]) + parser.add_argument("--l_layers", default=2, type=int) + parser.add_argument('--use_kpi_loss', type=int, default=1) + + # ------------ 测试阶段 ------------ + parser.add_argument("--only_test", default=0, type=int, choices=[0, 1]) + parser.add_argument("--mask_test", default=0, type=int, choices=[0, 1]) + parser.add_argument("--embed_gen", default=0, type=int, choices=[0, 1]) + parser.add_argument("--ke_test", default=0, type=int, choices=[0, 1]) + # -1: 测全集 + parser.add_argument("--ke_test_num", default=-1, type=int) + parser.add_argument("--path_gen", default="", type=str) + + # ------------ 时序阶段 ------------ + # 1:预训练 + # 2:时序 finetune + # 3. 
异常检测 finetune + 时序, 且是迭代的 + # 是否加载od模型 + parser.add_argument("--order_load", default=0, type=int) + parser.add_argument("--order_num", default=2, type=int) + parser.add_argument("--od_type", default='linear_cat', type=str, choices=['linear_cat', 'vertical_attention']) + parser.add_argument("--eps", default=0.2, type=float, help='label smoothing..') + parser.add_argument("--num_od_layer", default=0, type=int) + parser.add_argument("--plm_emb_type", default='cls', type=str, choices=['cls', 'last_avg']) + parser.add_argument("--order_test_name", default='', type=str) + parser.add_argument("--order_threshold", default=0.5, type=float) + # ------------ 并行训练 ------------ + # 是否并行 + parser.add_argument('--rank', type=int, default=0, help='rank to dist') + parser.add_argument('--dist', type=int, default=0, help='whether to dist') + # 不要改该参数,系统会自动分配 + parser.add_argument('--device', default='cuda', help='device id (i.e. 0 or 0,1 or cpu)') + # 开启的进程数(注意不是线程),不用设置该参数,会根据nproc_per_node自动设置 + parser.add_argument('--world-size', default=4, type=int, + help='number of distributed processes') + parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training') + parser.add_argument("--local_rank", default=-1, type=int) + self.cfg = parser.parse_args() + + def update_train_configs(self): + # add some constraint for parameters + # e.g. cannot save and test at the same time + # 修正默认参数 + # TODO: 测试逻辑有问题需要修改 + if len(self.cfg.order_test_name) > 0: + self.cfg.save_model = 0 + if len(self.cfg.order_test_name) == 0: + self.cfg.train_ratio = min(0.8, self.cfg.train_ratio) + # 自适应载入文件名 + else: + print("od test ... ") + self.cfg.train_strategy == 5 + self.cfg.plm_emb_type = 'last_avg' if 'last_avg' in self.cfg.model_name else 'cls' + for key in LAYER_MAPPING.keys(): + if LAYER_MAPPING[key] in self.cfg.model_name: + self.cfg.num_od_layer = key + self.cfg.order_test_name = osp.join('downstream_task', f'{self.cfg.order_test_name}') + + if self.cfg.mask_test or self.cfg.embed_gen or self.cfg.ke_test or len(self.cfg.order_test_name) > 0: + assert len(self.cfg.model_name) > 0 + self.cfg.only_test = 1 + if self.cfg.only_test == 1: + self.save_model = 0 + self.save_pretrain = 0 + + # TODO: update some dynamic variable + self.cfg.data_root = self.data_root + self.cfg.data_path = osp.join(self.data_root, self.cfg.data_path) + self.cfg.plm_path = osp.join(self.data_root, 'transformer') + self.cfg.dump_path = osp.join(self.cfg.data_path, self.cfg.dump_path) + # bs 控制尽量在32 + + # 自适应权重的数量 + self.cfg.awl_num = 1 + # ------------ 数值embedding相关 ------------ + self.cfg.hidden_size = 768 + self.cfg.num_attention_heads = 8 + self.cfg.hidden_dropout_prob = 0.1 + self.cfg.num_kpi = 304 + self.cfg.specail_emb_path = None + if self.cfg.emb_init: + self.cfg.specail_emb_path = osp.join(self.cfg.data_path, 'added_vocab_embedding.pt') + + # ------------- 多任务学习相关 ------------- + # 四个阶段 + self.cfg.mask_epoch, self.cfg.ke_epoch, self.cfg.ad_epoch, self.cfg.od_epoch = None, None, None, None + # 触发多任务 学习 + if self.cfg.train_strategy > 1: + self.cfg.mask_epoch = [0, 1, 1, 1, 0] + self.cfg.ke_epoch = [4, 3, 2, 2, 0] + if self.cfg.only_ke_loss: + self.cfg.mask_epoch = [0, 0, 0, 0, 0] + self.cfg.epoch = sum(self.cfg.mask_epoch) + sum(self.cfg.ke_epoch) + if self.cfg.train_strategy > 2: + self.cfg.ad_epoch = [0, 6, 3, 1, 0] + self.cfg.epoch += sum(self.cfg.ad_epoch) + if self.cfg.train_strategy > 3 and not self.cfg.only_ke_loss: + self.cfg.od_epoch = [0, 0, 9, 1, 0] + # self.cfg.mask_epoch[3] = 1 + self.cfg.epoch += 
sum(self.cfg.od_epoch) + self.cfg.epoch_matrix = [] + for epochs in [self.cfg.mask_epoch, self.cfg.ke_epoch, self.cfg.ad_epoch, self.cfg.od_epoch]: + if epochs is not None: + self.cfg.epoch_matrix.append(epochs) + if self.cfg.train_together: + # loss 直接相加,训练epoch就是mask的epoch + self.cfg.epoch = sum(self.cfg.mask_epoch) + self.cfg.batch_size = int((self.cfg.batch_size - 16) / self.cfg.train_strategy) + self.cfg.batch_size_ke = int(self.cfg.batch_size_ke / self.cfg.train_strategy) - 2 + self.cfg.batch_size_ad = int(self.cfg.batch_size_ad / self.cfg.train_strategy) - 1 + self.cfg.batch_size_od = int(self.cfg.batch_size_od / self.cfg.train_strategy) - 1 + self.cfg.accumulation_steps = (self.cfg.accumulation_steps - 1) * self.cfg.train_strategy + + self.cfg.neg_num = max(min(self.cfg.neg_num, self.cfg.batch_size_ke - 3), 1) + + self.cfg.accumulation_steps_dict = {0: self.cfg.accumulation_steps, 1: self.cfg.accumulation_steps_ke, 2: self.cfg.accumulation_steps_ad, 3: self.cfg.accumulation_steps_od} + + # 使用数值embedding也必须添加新词因为位置信息和tokenizer绑定 + if self.cfg.use_mlm_task or self.cfg.use_NumEmb: + assert self.cfg.add_special_word == 1 + + if self.cfg.use_NumEmb: + self.cfg.awl_num += 1 + + return self.cfg diff --git a/KTeleBERT/data_trans.py b/KTeleBERT/data_trans.py new file mode 100644 index 0000000000000000000000000000000000000000..0273e20054017a876c57fb8752df897f83e2d04a --- /dev/null +++ b/KTeleBERT/data_trans.py @@ -0,0 +1,56 @@ +import os.path as osp +import numpy as np +import random +import torch +import argparse +import pdb +import json + +''' +把数据合并 +同时抽取一部分需要的数据出来 +''' + +this_dir = osp.dirname(__file__) + +data_root = osp.abspath(osp.join(this_dir, '..', '..', 'data', '')) + +data_path = "huawei" +data_path = osp.join(data_root, data_path) + + +with open(osp.join(data_path, 'product_corpus.json'), "r") as f: + data_doc = json.load(f) + +with open(osp.join(data_path, '831_alarm_serialize.json'), "r") as f: + data_alarm = json.load(f) +# kpi_info.json +with open(osp.join(data_path, '917_kpi_serialize_50_mn.json'), "r") as f: + data_kpi = json.load(f) + + +# 实体的序列化 +with open(osp.join(data_path, '5GC_KB/database_entity_serialize.json'), "r") as f: + data_entity = json.load(f) + +random.shuffle(data_kpi) +random.shuffle(data_doc) +random.shuffle(data_alarm) +random.shuffle(data_entity) +data = data_alarm + data_kpi + data_entity + data_doc +random.shuffle(data) + +# 241527 +pdb.set_trace() +with open(osp.join(data_path, 'Seq_data_large.json'), "w") as fp: + json.dump(data, fp, ensure_ascii=False) + + +# 三元组 +with open(osp.join(data_path, '5GC_KB/database_triples.json'), "r") as f: + data = json.load(f) +random.shuffle(data) + + +with open(osp.join(data_path, 'KG_data_base.json'), "w") as fp: + json.dump(data, fp, ensure_ascii=False) diff --git a/KTeleBERT/get_chinese_ref.py b/KTeleBERT/get_chinese_ref.py new file mode 100644 index 0000000000000000000000000000000000000000..ddb0216ea64eed071ec77a1d4d7126f7ed3fd912 --- /dev/null +++ b/KTeleBERT/get_chinese_ref.py @@ -0,0 +1,454 @@ +import os.path as osp +import numpy as np +import random +import torch +from easydict import EasyDict as edict +import argparse +import pdb +import json +from model import BertTokenizer +from collections import Counter +from ltp import LTP +from tqdm import tqdm +from src.utils import add_special_token +from functools import reduce +from time import time +from numpy import mean +import math + +from src.utils import Loss_log, time_trans +from collections import defaultdict + + +class cfg(): + def __init__(self): + 
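+        # paths are resolved relative to this file: the shared data directory sits two levels up, under data/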
self.this_dir = osp.dirname(__file__) + # change + self.data_root = osp.abspath(osp.join(self.this_dir, '..', '..', 'data', '')) + + def get_args(self): + parser = argparse.ArgumentParser() + # seq_data_name = "Seq_data_tiny_831" + parser.add_argument("--data_path", default="huawei", type=str, help="Experiment path") + # TODO: freq 可以考虑 150 + parser.add_argument("--freq", default=50, type=int, help="出现多少次的词认为是重要的") + parser.add_argument("--batch_size", default=100, type=int, help="分词的batch size") + parser.add_argument("--seq_data_name", default='Seq_data_large', type=str, help="seq_data 名字") + parser.add_argument("--deal_numeric", default=0, type=int, help="是否处理数值数据") + + parser.add_argument("--read_cws", default=0, type=int, help="是否需要读训练好的cws文件") + self.cfg = parser.parse_args() + + def update_train_configs(self): + # TODO: update some dynamic variable + self.cfg.data_root = self.data_root + self.cfg.data_path = osp.join(self.data_root, self.cfg.data_path) + + return self.cfg + + +def refresh_data(ref, freq, special_token): + ''' + 功能:在自定义的special token基础上基于最小出现频率得到更多新词分词系统的参考,作为wwm基础 + 输入: + freq: 在(37万)语义词典中的最小出现频率(空格为分词) + special_token: 前面手工定义的特殊token(可能存在交集) + 输出: + add_words:在定义的最小出现频率基础上筛选出来的新词 + ''' + # 经常出现的sub token + seq_sub_data = [line.split() for line in ref] + all_data = [] + for data in seq_sub_data: + all_data.extend(data) + sub_word_times = dict(Counter(all_data)) + asub_word_time_order = sorted(sub_word_times.items(), key=lambda x: x[1], reverse=True) + # ('LST', 1218), ('RMV', 851), ('DSP', 821), ('ADD', 820), ('MOD', 590), ('SET', 406), ('AWS', 122) + # ADD、ACT、ALM-XXX、DEL、DSP、LST + add_words = [] + + for i in asub_word_time_order: + # 把出现频率很高的词加进来 + if i[1] >= freq and len(i[0]) > 1 and len(i[0]) < 20 and not str.isdigit(i[0]): + add_words.append(i[0]) + add_words.extend(special_token) + # 卡100阈值时是935个特殊token + print(f"[{len(add_words)}] special words will be added with frequency [{freq}]!") + return add_words + + +def cws(seq_data, add_words, batch_size): + ''' + 功能:所有序列数据的输入转换成分词之后的结果 + 输入: + seq_data:所有序列数据输入 e.g.['KPI异常下降', 'KPI异常上升'] + add_words:添加的special words + batch_size:每次分多少句 + 输出: + all_segment:所有序列数据的输出 e.g. [['KPI', '异常', '下降'], ['KPI', '异常', '上升']] + data_size:输入/输出的序列数量(e.g. 
2) + ''' + # seq_data = seq_data.cuda() + print(f"loading...") + ltp = LTP("LTP/base2") # 默认加载 base2 模型 + # ltp = LTP() + print(f"begin adding words ...") + # ltp.add_words(words=add_words, max_window=5) #4.1.5 + ltp.add_words(words=add_words) # 4.2.8 + ltp.to("cuda") + # for word in add_words: + # ltp.add_word(word) + print(f"{len(add_words)} special words are added!") + + # + # for data in seq_data: + # output = ltp.pipeline([data], tasks=["cws"]) + data_size = len(seq_data) + seq_data_cws = [] + size = int(data_size / batch_size) + 1 + b = 0 + e = b + batch_size + # pdb.set_trace() + + log = Loss_log() + + with tqdm(total=size) as _tqdm: + # pdb.set_trace() + # log.time_init() + # pdb.set_trace() + error_data = [] + for i in range(size): + + output = [] + try: + _output = ltp.pipeline(seq_data[b:e], tasks=["cws"]) + for data in _output.cws: + try: + data_out = ltp.pipeline(data, tasks=["cws"]) + # data_out_ = reduce(lambda x, y: x.extend(y) or x, data_out.cws) + data_out_ = [] + for i in data_out.cws: + data_out_.extend([k.strip() for k in i]) + output.append(data_out_) + except: + print(f"二阶段分词出错!范围是:[{b}]-[{e}]") + error_data.append(data) + + # pdb.set_trace() + except: + print(f"第一阶段分词出错!范围是:[{b}]-[{e}]") + error_data.append(f"第一阶段分词出错!范围是:[{b}]-[{e}]") + # continue + seq_data_cws.extend(output) + b = e + e += batch_size + + # 时间统计 + if e >= data_size: + if b >= data_size: + break + e = data_size + _tqdm.set_description(f'from {b} to {e}:') + _tqdm.update(1) + + print(f"过滤了{data_size - len(seq_data_cws)}个句子") + + return seq_data_cws, data_size, error_data + + +def ltp_debug(ltp, op): + output = [] + for data in op: + data_out = ltp.pipeline(data, tasks=["cws"]) + # data_out_ = reduce(lambda x, y: x.extend(y) or x, data_out.cws) + data_out_ = [] + for i in data_out.cws: + # 保留空格的话需要手动去除空格 + data_out_.append(i[0].strip()) + # 之前没有空格 + # data_out_.extend(i) + output.append(data_out_) + return output + + +def deal_sub_words(subwords, special_token): + ''' + 功能:把每个word的整体内,非首字符的部分加上 '##' 前缀, special_token 不应该被mask + ''' + for i in range(len(subwords)): + if i == 0: + continue + if subwords[i] in special_token: + continue + if subwords[i].startswith("##"): + continue + + subwords[i] = "##" + subwords[i] + return subwords + + +def generate_chinese_ref(seq_data_cws, special_token, deal_numeric, kpi_dic): + ''' + 输入: + seq_data_cws:所有序列数据的输出 e.g. [['KPI', '异常', '下降'], ['KPI', '异常', '上升']] + special_token:不应该被mask ['[SEP]', '[MASK]', '[ALM]', '[KPI]', '[CLS]', '[LOC]', '[EOS]', '[ENT]', '[ATTR]', '[NUM]', '|'] + data_size:数据量 e.g. 2 + 输出: + ww_return (whole word return):打标之后的chinese ref e.g. 
[['KPI', '异','##常', '下', '##降'], ['KPI', '异', '##常', '上', '##升']] + ''' + # 定义全局set和逆字典统计哪些KPI最后没有被涉及 + data_size = len(seq_data_cws) + kpi_static_set = set() + rev_kpi_dic = dict(zip(kpi_dic.values(), kpi_dic.keys())) + max_len = 0 + sten_that_over_maxl = [] + with tqdm(total=data_size) as _tqdm: + ww_return = [] + ww_list = [] + kpi_info = [] + not_in_KPI = defaultdict(int) + for i in range(data_size): + _tqdm.set_description(f'checking...[{i}/{data_size}] max len: [{max_len}]') + orig = tokenizer.tokenize(" ".join(seq_data_cws[i])) + + if deal_numeric: + # 得到元组信息,前两位是KPI下标范围 + _kpi_info, kpi_type_list = extract_kpi(orig, kpi_dic, not_in_KPI) + kpi_info.append(_kpi_info) + kpi_static_set.update(kpi_type_list) + + sub_total = [] + ww_seq_tmp = [] + ww_tmp = [] + for sub_data in seq_data_cws[i]: + sub = tokenizer.tokenize(sub_data) + sub_total.extend(sub) + # 在whole word 里面添加#号 + # 输入: ['异', '常'] + ref_token = deal_sub_words(sub, special_token) + # 输出: ['异', '##常'] + ww_seq_tmp.extend(ref_token) + ww_tmp.append(ref_token) + + if sub_total != orig: + print("error in match... ") + if len(orig) > 512: + print("the lenth is over the max lenth") + pdb.set_trace() + + # 变成[[...],[...],[...], ...] + # ww_return.append(ww_tmp) + sz_ww_seq = len(ww_seq_tmp) + # 求最大长度 + max_len = sz_ww_seq if sz_ww_seq > max_len else max_len + if sz_ww_seq > 500: + sten_that_over_maxl.append((ww_seq_tmp, sz_ww_seq)) + + assert len(sub_total) == sz_ww_seq + ww_return.append(ww_seq_tmp) + ww_list.append(ww_tmp) + # pdb.set_trace() + _tqdm.update(1) + # pdb.set_trace() + if deal_numeric: + in_kpi = [] + # pdb.set_trace() + for key in rev_kpi_dic.keys(): + if key in kpi_static_set: + in_kpi.append(rev_kpi_dic[key]) + if len(in_kpi) < len(rev_kpi_dic): + print(f"[{len(in_kpi)}] KPI are covered by data: {in_kpi}") + print(f" [{len(not_in_KPI)}] KPI无法匹配{not_in_KPI}") + else: + print("all KPI are covered!") + return ww_return, kpi_info, sten_that_over_maxl + + +def extract_num(seq_data_cws): + ''' + 功能:把序列中的数值信息提取出来 + 同时过滤 nan 数值 + ''' + num_ref = [] + seq_data_cws_new = [] + for j in range(len(seq_data_cws)): + num_index = [i for i, x in enumerate(seq_data_cws[j]) if x == '[NUM]'] + # kpi_score = [float(seq_data_cws[i][index+1]) for index in num_index] + kpi_score = [] + flag = 1 + for index in num_index: + # if math.isnan(tmp): + # pdb.set_trace() + try: + tmp = float(seq_data_cws[j][index + 1]) + except: + # pdb.set_trace() + flag = 0 + continue + if math.isnan(tmp): + flag = 0 + else: + kpi_score.append(tmp) + + if len(num_index) > 0: + for index in reversed(num_index): + seq_data_cws[j].pop(index + 1) + if flag == 1: + num_ref.append(kpi_score) + seq_data_cws_new.append(seq_data_cws[j]) + return seq_data_cws_new, num_ref + + +def extract_kpi(token_data, kpi_dic, not_in_KPI): + ''' + 功能:把序列中的[KPI]下标范围,[NUM]下标提取出来 + 输出格式: [(1,2,4),(5,6,7)] + ''' + kpi_and_num_info = [] + kpi_type = [] + kpi_index = [i for i, x in enumerate(token_data) if x.lower() == '[kpi]'] + num_index = [i for i, x in enumerate(token_data) if x.lower() == '[num]'] + sz = len(kpi_index) + assert sz == len(num_index) + for i in range(sz): + # (kpi 开始,kpi 结束,NUM token位置) + # DONE: 添加KPI的类别 + kpi_name = ''.join(token_data[kpi_index[i] + 1: num_index[i] - 1]) + kpi_name_clear = kpi_name.replace('##', '') + + if kpi_name in kpi_dic: + kpi_id = int(kpi_dic[kpi_name]) + elif kpi_name_clear in kpi_dic: + kpi_id = int(kpi_dic[kpi_name_clear]) + elif kpi_name_clear in not_in_KPI: + kpi_id = -1 + not_in_KPI[kpi_name_clear] += 1 + else: + # 只打印一次 + 
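+            # "只打印一次" = print only once per unmatched KPI name (the print statement below is kept commented out)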
not_in_KPI[kpi_name_clear] += 1 + kpi_id = -1 + # print(f"{kpi_name_clear} not in KPI dict") + + kpi_info = [kpi_index[i] + 1, num_index[i] - 2, num_index[i], kpi_id] + kpi_and_num_info.append(kpi_info) + kpi_type.append(kpi_id) + # pdb.set_trace() + + return kpi_and_num_info, kpi_type + + +def kpi_combine(kpi_info, num_ref): + sz = len(kpi_info) + assert sz == len(num_ref) + for i in range(sz): + for j in range(len(kpi_info[i])): + kpi_info[i][j].append(num_ref[i][j]) + # pdb.set_trace() + return kpi_info + +# 所有字母小写 + + +def kpi_lower_update(kpi_dic): + new_dic = {} + for key in kpi_dic: + kk = key.lower().split() + kk = ''.join(kk).strip() + new_dic[kk] = kpi_dic[key] + return new_dic + + +if __name__ == '__main__': + ''' + 功能: 得到 chinese ref 文件,同时刷新训练/测试文件(仅针对序列的文本数据) + ''' + cfg = cfg() + cfg.get_args() + cfgs = cfg.update_train_configs() + + # 路径指定 + domain_file_path = osp.join(cfgs.data_path, 'special_vocab.txt') + with open(domain_file_path, encoding="utf-8") as f: + ref = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())] + tokenizer = BertTokenizer.from_pretrained(osp.join(cfgs.data_root, 'transformer', 'MacBert'), do_lower_case=True) + seq_data_name = cfgs.seq_data_name + with open(osp.join(cfgs.data_path, f'{seq_data_name}.json'), "r") as fp: + seq_data = json.load(fp) + kpi_dic_name = 'kpi2id' + with open(osp.join(cfgs.data_path, f'{kpi_dic_name}.json'), "r") as fp: + kpi_dic = json.load(fp) + kpi_dic = kpi_lower_update(kpi_dic) + # 供测试 + random.shuffle(seq_data) + # seq_data = seq_data[:500] + print(f"tokenizer size before: {len(tokenizer)}") + tokenizer, special_token, norm_token = add_special_token(tokenizer) + special_token = special_token + norm_token + + print(f"tokenizer size after: {len(tokenizer)}") + print('------------------------ refresh data --------------------------------') + add_words = refresh_data(ref, cfgs.freq, special_token) + + if not cfgs.read_cws: + print('------------------------ cws ----------------------------------') + seq_data_cws, data_size, error_data = cws(seq_data, add_words, cfgs.batch_size) + print(f'batch size is {cfgs.batch_size}') + if len(error_data) > 0: + with open(osp.join(cfgs.data_path, f'{seq_data_name}_error.json'), "w") as fp: + json.dump(error_data, fp, ensure_ascii=False) + save_path_cws_orig = osp.join(cfgs.data_path, f'{seq_data_name}_cws_orig.json') + print("get the new training data! 
saving...") + with open(save_path_cws_orig, 'w', ) as fp: + json.dump(seq_data_cws, fp, ensure_ascii=False) + else: + print('------------------------ read ----------------------------------') + save_path_cws = osp.join(cfgs.data_path, f'{seq_data_name}_cws_orig.json') + print("get the new training data!") + with open(save_path_cws, 'r', ) as fp: + seq_data_cws = json.load(fp) + data_size = len(seq_data_cws) + + sz_orig = len(seq_data_cws) + if cfgs.deal_numeric: + seq_data_cws, num_ref = extract_num(seq_data_cws) + print(f"过滤了{sz_orig - len(seq_data_cws)}个无效句子") + data_size = len(seq_data_cws) + + print('---------------------- generate chinese ref ------------------------------') + chinese_ref, kpi_info, sten_that_over_maxl = generate_chinese_ref(seq_data_cws, special_token, cfgs.deal_numeric, kpi_dic) + + if len(sten_that_over_maxl) > 0: + print(f"{len(sten_that_over_maxl)} over the 500 len!") + save_path_max = osp.join(cfgs.data_path, f'{seq_data_name}_max_len_500.json') + with open(save_path_max, 'w') as fp: + json.dump(sten_that_over_maxl, fp, ensure_ascii=False) + + if cfgs.deal_numeric: + print("KPI info combine") + kpi_ref = kpi_combine(kpi_info, num_ref) + # pdb.set_trace() + print('------------------------- match finished ------------------------------') + + # 输出最后训练的时候用于做wwm的分词 + save_path_ref = osp.join(cfgs.data_path, f'{seq_data_name}_chinese_ref.json') + with open(save_path_ref, 'w') as fp: + json.dump(chinese_ref, fp, ensure_ascii=False) + print(f"save chinese_ref done!") + + seq_data_cws_output = [] + for i in range(data_size): + seq = " ".join(seq_data_cws[i]) + seq_data_cws_output.append(seq) + + save_path_cws = osp.join(cfgs.data_path, f'{seq_data_name}_cws.json') + print("get the new training data!") + with open(save_path_cws, 'w', ) as fp: + json.dump(seq_data_cws_output, fp, ensure_ascii=False) + + print("save seq_data_cws done!") + + if cfgs.deal_numeric: + kpi_ref_path = osp.join(cfgs.data_path, f'{seq_data_name}_kpi_ref.json') + with open(kpi_ref_path, 'w', ) as fp: + json.dump(kpi_ref, fp, ensure_ascii=False) + print("save num and kpi done!") diff --git a/KTeleBERT/main.py b/KTeleBERT/main.py new file mode 100644 index 0000000000000000000000000000000000000000..de6605986e6256bbaa486b7fcd938acdd7c09d26 --- /dev/null +++ b/KTeleBERT/main.py @@ -0,0 +1,851 @@ +import os +import os.path as osp +import torch +from torch.utils.tensorboard import SummaryWriter +from torch.utils.data import DataLoader, RandomSampler +from torch.cuda.amp import GradScaler, autocast +from datetime import datetime +from easydict import EasyDict as edict +from tqdm import tqdm +import pdb +import pprint +import json +import pickle +from collections import defaultdict +import copy +from time import time + +from config import cfg +from torchlight import initialize_exp, set_seed, get_dump_path +from src.data import load_data, load_data_kg, Collator_base, Collator_kg, SeqDataset, KGDataset, Collator_order, load_order_data +from src.utils import set_optim, Loss_log, add_special_token, time_trans +from src.distributed_utils import init_distributed_mode, dist_pdb, is_main_process, reduce_value, cleanup +import torch.distributed as dist + +from itertools import cycle +from model import BertTokenizer, HWBert, KGEModel, OD_model, KE_model +import torch.multiprocessing +from torch.nn.parallel import DistributedDataParallel + +# 默认用cuda就行 + + +class Runner: + def __init__(self, args, writer=None, logger=None, rank=0): + self.datapath = edict() + self.datapath.log_dir = get_dump_path(args) + 
self.datapath.model_dir = os.path.join(self.datapath.log_dir, 'model') + self.rank = rank + # init code + self.mlm_probability = args.mlm_probability + self.args = args + self.writer = writer + self.logger = logger + # 模型选择 + self.model_list = [] + self.model = HWBert(self.args) + # 数据加载。添加special_token,同时把模型的embedding layer进行resize + self.data_init() + self.model.cuda() + # 模型加载 + self.od_model, self.ke_model = None, None + self.scaler = GradScaler() + + # 只要不是第一种训练策略就有新模型 + if self.args.train_strategy >= 2: + self.ke_model = KE_model(self.args) + if self.args.train_strategy >= 3: + # TODO: 异常检测 + pass + if self.args.train_strategy >= 4: + self.od_model = OD_model(self.args) + + if self.args.model_name not in ['MacBert', 'TeleBert', 'TeleBert2', 'TeleBert3'] and not self.args.from_pretrain: + # 如果不存在模型会直接返回None或者原始模型 + self.model = self._load_model(self.model, self.args.model_name) + self.od_model = self._load_model(self.od_model, f"od_{self.args.model_name}") + self.ke_model = self._load_model(self.ke_model, f"ke_{self.args.model_name}") + # TODO: 异常检测 + + # 测试的情况 + if self.args.only_test: + self.dataloader_init(self.seq_test_set) + else: + # 训练 + if self.args.ernie_stratege > 0: + self.args.mask_stratege = 'rand' + # 初始化dataloader + self.dataloader_init(self.seq_train_set, self.kg_train_set, self.order_train_set) + if self.args.dist: + # 并行训练需要权值共享 + self.model_sync() + else: + self.model_list = [model for model in [self.model, self.od_model, self.ke_model] if model is not None] + + self.optim_init(self.args) + + def model_sync(self): + checkpoint_path = osp.join(self.args.data_path, "tmp", "initial_weights.pt") + checkpoint_path_od = osp.join(self.args.data_path, "tmp", "initial_weights_od.pt") + checkpoint_path_ke = osp.join(self.args.data_path, "tmp", "initial_weights_ke.pt") + if self.rank == 0: + torch.save(self.model.state_dict(), checkpoint_path) + if self.od_model is not None: + torch.save(self.od_model.state_dict(), checkpoint_path_od) + if self.ke_model is not None: + torch.save(self.ke_model.state_dict(), checkpoint_path_ke) + dist.barrier() + + # if self.rank != 0: + # 这里注意,一定要指定map_location参数,否则会导致第一块GPU占用更多资源 + self.model = self._model_sync(self.model, checkpoint_path) + if self.od_model is not None: + self.od_model = self._model_sync(self.od_model, checkpoint_path_od) + if self.ke_model is not None: + self.ke_model = self._model_sync(self.ke_model, checkpoint_path_ke) + + def _model_sync(self, model, checkpoint_path): + model.load_state_dict(torch.load(checkpoint_path, map_location=self.args.device)) + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(self.args.device) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[self.args.gpu], find_unused_parameters=True) + self.model_list.append(model) + model = model.module + return model + + def optim_init(self, opt, total_step=None, accumulation_step=None): + step_per_epoch = len(self.train_dataloader) + # 占总step 10% 的warmup_steps + opt.total_steps = int(step_per_epoch * opt.epoch) if total_step is None else int(total_step) + opt.warmup_steps = int(opt.total_steps * 0.15) + + if self.rank == 0 and total_step is None: + self.logger.info(f"warmup_steps: {opt.warmup_steps}") + self.logger.info(f"total_steps: {opt.total_steps}") + self.logger.info(f"weight_decay: {opt.weight_decay}") + + freeze_part = ['bert.encoder.layer.1.', 'bert.encoder.layer.2.', 'bert.encoder.layer.3.', 'bert.encoder.layer.4.'][:self.args.freeze_layer] + self.optimizer, self.scheduler = set_optim(opt, self.model_list, 
freeze_part, accumulation_step) + + def data_init(self): + # 载入数据, 两部分数据包括:载入mask loss部分的数据(序列化的数据) 和 载入triple loss部分的数据(三元组) + # train_test_split: 训练集长度 + self.seq_train_set, self.seq_test_set, self.kg_train_set, self.kg_data = None, None, None, None + self.order_train_set, self.order_test_set = None, None + + if self.args.train_strategy >= 1 and self.args.train_strategy <= 4: + # 预训练 or multi task pretrain + self.seq_train_set, self.seq_test_set, train_test_split = load_data(self.logger, self.args) + if self.args.train_strategy >= 2: + self.kg_train_set, self.kg_data = load_data_kg(self.logger, self.args) + if self.args.train_strategy >= 3: + # TODO: 异常检测的数据载入 + pass + if self.args.train_strategy >= 4: + self.order_train_set, self.order_test_set, train_test_split = load_order_data(self.logger, self.args) + + if self.args.dist and not self.args.only_test: + # 测试不需要并行 + if self.args.train_strategy >= 1 and self.args.train_strategy <= 4: + self.seq_train_sampler = torch.utils.data.distributed.DistributedSampler(self.seq_train_set) + if self.args.train_strategy >= 2: + self.kg_train_sampler = torch.utils.data.distributed.DistributedSampler(self.kg_train_set) + if self.args.train_strategy >= 3: + # TODO: 异常检测的数据载入 + pass + if self.args.train_strategy >= 4: + self.order_train_sampler = torch.utils.data.distributed.DistributedSampler(self.order_train_set) + + # self.seq_train_batch_sampler = torch.utils.data.BatchSampler(self.seq_train_sampler, self.args.batch_size, drop_last=True) + # self.kg_train_batch_sampler = torch.utils.data.BatchSampler(self.kg_train_sampler, int(self.args.batch_size / 4), drop_last=True) + + # Tokenizer 载入 + model_name = self.args.model_name + if self.args.model_name in ['TeleBert', 'TeleBert2', 'TeleBert3']: + self.tokenizer = BertTokenizer.from_pretrained(osp.join(self.args.data_root, 'transformer', model_name), do_lower_case=True) + else: + if not osp.exists(osp.join(self.args.data_root, 'transformer', self.args.model_name)): + model_name = 'MacBert' + self.tokenizer = BertTokenizer.from_pretrained(osp.join(self.args.data_root, 'transformer', model_name), do_lower_case=True) + + # 添加special_token,同时把模型的embedding layer进行resize + self.special_token = None + # 单纯的telebert在测试时不需要特殊embedding + if self.args.add_special_word and not (self.args.only_test and self.args.model_name in ['MacBert', 'TeleBert', 'TeleBert2', 'TeleBert3']): + # tokenizer, special_token, norm_token + # special_token 不应该被MASK + self.tokenizer, special_token, _ = add_special_token(self.tokenizer, model=self.model.encoder, rank=self.rank, cache_path=self.args.specail_emb_path) + # pdb.set_trace() + self.special_token = [token.lower() for token in special_token] + + def _dataloader_dist(self, train_set, train_sampler, batch_size, collator): + train_dataloader = DataLoader( + train_set, + sampler=train_sampler, + pin_memory=True, + num_workers=self.args.workers, + persistent_workers=True, + drop_last=True, + batch_size=batch_size, + collate_fn=collator + ) + return train_dataloader + + def _dataloader(self, train_set, batch_size, collator): + train_dataloader = DataLoader( + train_set, + num_workers=self.args.workers, + persistent_workers=True, + shuffle=(self.args.only_test == 0), + drop_last=(self.args.only_test == 0), + batch_size=batch_size, + collate_fn=collator + ) + return train_dataloader + + def dataloader_init(self, train_set=None, kg_train_set=None, order_train_set=None): + bs = self.args.batch_size + bs_ke = self.args.batch_size_ke + bs_od = self.args.batch_size_od + bs_ad = 
self.args.batch_size_ad + # 分布式 + if self.args.dist and not self.args.only_test: + self.args.workers = min([os.cpu_count(), self.args.batch_size, self.args.workers]) + # if self.rank == 0: + # print(f'Using {self.args.workers} dataloader workers every process') + + if train_set is not None: + seq_collator = Collator_base(self.args, tokenizer=self.tokenizer, special_token=self.special_token) + self.train_dataloader = self._dataloader_dist(train_set, self.seq_train_sampler, bs, seq_collator) + if kg_train_set is not None: + kg_collator = Collator_kg(self.args, tokenizer=self.tokenizer, data=self.kg_data) + self.train_dataloader_kg = self._dataloader_dist(kg_train_set, self.kg_train_sampler, bs_ke, kg_collator) + if order_train_set is not None: + order_collator = Collator_order(self.args, tokenizer=self.tokenizer) + self.train_dataloader_order = self._dataloader_dist(order_train_set, self.order_train_sampler, bs_od, order_collator) + else: + if train_set is not None: + seq_collator = Collator_base(self.args, tokenizer=self.tokenizer, special_token=self.special_token) + self.train_dataloader = self._dataloader(train_set, bs, seq_collator) + if kg_train_set is not None: + kg_collator = Collator_kg(self.args, tokenizer=self.tokenizer, data=self.kg_data) + self.train_dataloader_kg = self._dataloader(kg_train_set, bs_ke, kg_collator) + if order_train_set is not None: + order_collator = Collator_order(self.args, tokenizer=self.tokenizer) + self.train_dataloader_order = self._dataloader(order_train_set, bs_od, order_collator) + + def dist_step(self, task=0): + # 分布式训练需要额外step + if self.args.dist: + if task == 0: + self.seq_train_sampler.set_epoch(self.dist_epoch) + if task == 1: + self.kg_train_sampler.set_epoch(self.dist_epoch) + if task == 2: + # TODO:异常检测 + pass + if task == 3: + self.order_train_sampler.set_epoch(self.dist_epoch) + self.dist_epoch += 1 + + def mask_rate_update(self, i): + # 这种策略是曲线地增加 mask rate + if self.args.mlm_probability_increase == "curve": + self.args.mlm_probability += (i + 1) * ((self.args.final_mlm_probability - self.args.mlm_probability) / self.args.epoch) + # 这种是线性的 + else: + assert self.args.mlm_probability_increase == "linear" + self.args.mlm_probability += (self.args.final_mlm_probability - self.mlm_probability) / self.args.epoch + + if self.rank == 0: + self.logger.info(f"Moving Mlm_probability in next epoch to: {self.args.mlm_probability*100}%") + + def task_switch(self, training_strategy): + # 同时训练或者策略1训练不需要切换任务,epoch也安装初始epoch就行 + if training_strategy == 1 or self.args.train_together: + return (0, 0), None + + # 4 阶段 + # self.total_epoch -= 1 + + for i in range(4): + for task in range(training_strategy): + if self.args.epoch_matrix[task][i] > 0: + self.args.epoch_matrix[task][i] -= 1 + return (task, i), self.args.epoch_matrix[task][i] + 1 + + def run(self): + self.loss_log = Loss_log() + self.curr_loss = 0. 
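+        # running statistics for the training loop; accumulated losses are flushed to tensorboard every eval_step steps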
+ self.lr = self.args.lr + self.curr_loss_dic = defaultdict(float) + self.curr_kpi_loss_dic = defaultdict(float) + self.loss_weight = [1, 1] + self.kpi_loss_weight = [1, 1] + self.step = 0 + # 不同task 的累计step + self.total_step_sum = 0 + task_last = 0 + stage_last = 0 + self.dist_epoch = 0 + # 后面可以变成混合训练模式 + # self.total_epoch = self.args.epoch + # --------- train ------------- + with tqdm(total=self.args.epoch) as _tqdm: # 使用需要的参数对tqdm进行初始化 + for i in range(self.args.epoch): + # 切换Task + (task, stage), task_epoch = self.task_switch(self.args.train_strategy) + self.dist_step(task) + dataloader = self.task_dataloader_choose(task) + # 并行 + if self.args.train_together and self.args.train_strategy > 1: + self.dataloader_list = ['#'] + # 一个list 存下所有需要的dataloader的迭代 + for t in range(1, self.args.train_strategy): + self.dist_step(t) + self.dataloader_list.append(iter(self.task_dataloader_choose(t))) + + if task != task_last or stage != stage_last: + self.step = 0 + if self.rank == 0: + print(f"switch to task [{task}] in stage [{stage}]...") + if stage != stage_last: + # 每一个阶段结束保存一次 + self._save_model(stage=f'_stg{stage_last}') + # task 转换状态时需要重新初始化优化器 + # 并行训练或者单一task (0) 训练不需要切换opti + if task_epoch is not None: + self.optim_init(self.args, total_step=len(dataloader) * task_epoch, accumulation_step=self.args.accumulation_steps_dict[task]) + task_last = task + stage_last = stage + + # 调整学习阶段 + if task == 0 and self.args.ernie_stratege > 0 and i >= self.args.ernie_stratege: + # 不会再触发第二次 + self.args.ernie_stratege = 10000000 + if self.rank == 0: + self.logger.info("switch to wwm stratege...") + self.args.mask_stratege = 'wwm' + + if self.args.mlm_probability != self.args.final_mlm_probability: + # 更新 MASK rate + # 初始化训练数据, 可以随epoch切换 + # 混合训练 + self.mask_rate_update(i) + self.dataloader_init(self.seq_train_set, self.kg_train_set, self.order_train_set) + # ------------------------------- + # 针对task 进行训练 + self.train(_tqdm, dataloader, task) + # ------------------------------- + _tqdm.update(1) + + # DONE: save or load + if self.rank == 0: + self.logger.info(f"min loss {self.loss_log.get_min_loss()}") + # DONE: save or load + if not self.args.only_test and self.args.save_model: + self._save_model() + + def task_dataloader_choose(self, task): + self.model.train() + # 同时训练就用基础dataloader就行 + if task == 0: + dataloader = self.train_dataloader + elif task == 1: + self.ke_model.train() + dataloader = self.train_dataloader_kg + elif task == 2: + pass + elif task == 3: + self.od_model.train() + dataloader = self.train_dataloader_order + return dataloader + # one time train + + def loss_output(self, batch, task): + # -------- 模型输出 loss -------- + if task == 0: + # 输出 + _output = self.model(batch) + loss = _output['loss'] + elif task == 1: + loss = self.ke_model(batch, self.model) + elif task == 2: + pass + elif task == 3: + # TODO: finetune的时候多任务 accumulation_steps 自适应 + # OD task + emb = self.model.cls_embedding(batch[0], tp=self.args.plm_emb_type) + loss, loss_dic = self.od_model(emb, batch[1].cuda()) + order_score = self.od_model.predict(emb) + token_right = self.od_model.right_caculate(order_score, batch[1], threshold=0.5) + self.loss_log.update_token(batch[1].shape[0], [token_right]) + return loss + + def train(self, _tqdm, dataloader, task=0): + # cycle train + loss_weight, kpi_loss_weight, kpi_loss_dict, _output = None, None, None, None + # dataloader = zip(self.train_dataloader, cycle(self.train_dataloader_kg)) + self.loss_log.acc_init() + # 如果self.train_dataloader比self.train_dataloader_kg长则会使得后者训练不完全 + 
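+        # i.e. when train_dataloader is longer than train_dataloader_kg, the KG data is not fully covered in one epoch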
accumulation_steps = self.args.accumulation_steps_dict[task] + torch.cuda.empty_cache() + + for batch in dataloader: + # with autocast(): + loss = self.args.mask_loss_scale * self.loss_output(batch, task) + # 如果是同时训练的话使用迭代器的方法得到另外的epoch + if self.args.train_together and self.args.train_strategy > 1: + for t in range(1, self.args.train_strategy): + try: + batch = next(self.dataloader_list[t]) + except StopIteration: + self.dist_step(t) + self.dataloader_list[t] = iter(self.task_dataloader_choose(t)) + batch = next(self.dataloader_list[t]) + # 选择对应的模型得到loss + # torch.cuda.empty_cache() + loss += self.loss_output(batch, t) + # torch.cuda.empty_cache() + loss = loss / accumulation_steps + self.scaler.scale(loss).backward() + # loss.backward() + if self.args.dist: + loss = reduce_value(loss, average=True) + # torch.cuda.empty_cache() + self.step += 1 + self.total_step_sum += 1 + + # -------- 模型统计 -------- + if not self.args.dist or is_main_process(): + self.output_statistic(loss, _output) + acc_descrip = f"Acc: {self.loss_log.get_token_acc()}" if self.loss_log.get_token_acc() > 0 else "" + _tqdm.set_description(f'Train | step [{self.step}/{self.args.total_steps}] {acc_descrip} LR [{self.lr}] Loss {self.loss_log.get_loss():.5f} ') + if self.step % self.args.eval_step == 0 and self.step > 0: + self.loss_log.update(self.curr_loss) + self.update_loss_log() + # -------- 梯度累计与模型更新 -------- + if self.step % accumulation_steps == 0 and self.step > 0: + # 更新优化器 + self.scaler.unscale_(self.optimizer) + for model in self.model_list: + torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.clip) + + # self.optimizer.step() + scale = self.scaler.get_scale() + self.scaler.step(self.optimizer) + + self.scaler.update() + skip_lr_sched = (scale > self.scaler.get_scale()) + if not skip_lr_sched: + # pdb.set_trace() + self.scheduler.step() + + if not self.args.dist or is_main_process(): + # pdb.set_trace() + self.lr = self.scheduler.get_last_lr()[-1] + self.writer.add_scalars("lr", {"lr": self.lr}, self.total_step_sum) + # 模型update + for model in self.model_list: + model.zero_grad(set_to_none=True) + + if self.args.dist: + torch.cuda.synchronize(self.args.device) + return self.curr_loss, self.curr_loss_dic + + def output_statistic(self, loss, output): + # 统计模型的各种输出 + self.curr_loss += loss.item() + if output is None: + return + for key in output['loss_dic'].keys(): + self.curr_loss_dic[key] += output['loss_dic'][key] + if 'kpi_loss_dict' in output and output['kpi_loss_dict'] is not None: + for key in output['kpi_loss_dict'].keys(): + self.curr_kpi_loss_dic[key] += output['kpi_loss_dict'][key] + if 'loss_weight' in output and output['loss_weight'] is not None: + self.loss_weight = output['loss_weight'] + # 需要用dict来判断 + if 'kpi_loss_weight' in output and output['kpi_loss_weight'] is not None: + self.kpi_loss_weight = output['kpi_loss_weight'] + + def update_loss_log(self, task=0): + # 把统计的模型各种输出存下来 + # https://zhuanlan.zhihu.com/p/382950853 + # "mask_loss": self.curr_loss_dic['mask_loss'], "ke_loss": self.curr_loss_dic['ke_loss'] + vis_dict = {"train_loss": self.curr_loss} + vis_dict.update(self.curr_loss_dic) + self.writer.add_scalars("loss", vis_dict, self.total_step_sum) + if self.loss_weight is not None: + # 预训练 + loss_weight_dic = {} + if self.args.train_strategy == 1: + loss_weight_dic["mask"] = 1 / (self.loss_weight[0]**2) + if self.args.use_NumEmb: + loss_weight_dic["kpi"] = 1 / (self.loss_weight[1]**2) + vis_kpi_dic = {"recover": 1 / (self.kpi_loss_weight[0]**2), "classifier": 1 / 
(self.kpi_loss_weight[1]**2)} + if self.args.contrastive_loss and len(self.kpi_loss_weight) > 2: + vis_kpi_dic.update({"contrastive": 1 / (self.kpi_loss_weight[2]**2)}) + self.writer.add_scalars("kpi_loss_weight", vis_kpi_dic, self.total_step_sum) + self.writer.add_scalars("kpi_loss", self.curr_kpi_loss_dic, self.total_step_sum) + self.writer.add_scalars("loss_weight", loss_weight_dic, self.total_step_sum) + # TODO: Finetune + + # init log loss + self.curr_loss = 0. + for key in self.curr_loss_dic: + self.curr_loss_dic[key] = 0. + if len(self.curr_kpi_loss_dic) > 0: + for key in self.curr_kpi_loss_dic: + self.curr_kpi_loss_dic[key] = 0. + + # TODO: Finetune 阶段 + def eval(self): + self.model.eval() + torch.cuda.empty_cache() + + def mask_test(self, test_log): + # 如果大于1 就无法mask测试 + assert self.args.train_ratio < 1 + topk = (1, 100, 500) + test_log.acc_init(topk) + # 做 mask 预测的时候需要进入训练模式,以获得随机mask的token + self.args.only_test = 0 + self.dataloader_init(self.seq_test_set) + # pdb.set_trace() + sz_test = len(self.train_dataloader) + loss_sum = 0 + with tqdm(total=sz_test) as _tqdm: # 使用需要的参数对tqdm进行初始化 + for step, batch in enumerate(self.train_dataloader): + # DONE: 写好mask_prediction实现mask预测 + with torch.no_grad(): + token_num, token_right, loss = self.model.mask_prediction(batch, len(self.tokenizer), topk) + test_log.update_token(token_num, token_right) + loss_sum += loss + # test_log.update_word(word_num, word_right) + _tqdm.update(1) + _tqdm.set_description(f'Test | step [{step}/{sz_test}] Top{topk} Token_Acc: {test_log.get_token_acc()}') + print(f"perplexity: {loss_sum}") + # 训练模式复位 + self.args.only_test = 1 + # if topk is not None: + print(f"Top{topk} acc is {test_log.get_token_acc()}") + + def emb_generate(self, path_gen): + assert len(self.args.path_gen) > 0 or path_gen is not None + data_path = self.args.data_path + if path_gen is None: + path_gen = self.args.path_gen + with open(osp.join(data_path, 'downstream_task', f'{path_gen}.json'), "r") as fp: + data = json.load(fp) + print(f"read file {path_gen} done!") + test_set = SeqDataset(data) + self.dataloader_init(test_set) + sz_test = len(self.train_dataloader) + all_emb_dic = defaultdict(list) + emb_output = {} + all_emb_ent = [] + # tps = ['cls', 'last_avg', 'last2avg', 'last3avg', 'first_last_avg'] + tps = ['cls', 'last_avg'] + # with tqdm(total=sz_test) as _tqdm: + for step, batch in enumerate(self.train_dataloader): + for tp in tps: + with torch.no_grad(): + batch_embedding = self.model.cls_embedding(batch, tp=tp) + # batch_embedding = self.model.cls_embedding(batch, tp=tp) + if tp in self.args.model_name and self.ke_model is not None: + batch_embedding_ent = self.ke_model.get_embedding(batch_embedding, is_ent=True) + # batch_embedding_ent = self.ke_model(batch, self.model) + batch_embedding_ent = batch_embedding_ent.cpu() + all_emb_ent.append(batch_embedding_ent) + + batch_embedding = batch_embedding.cpu() + all_emb_dic[tp].append(batch_embedding) + # _tqdm.update(1) + # _tqdm.set_description(f'Test | step [{step}/{sz_test}]') + torch.cuda.empty_cache() + for tp in tps: + emb_output[tp] = torch.cat(all_emb_dic[tp]) + assert emb_output[tp].shape[0] == len(data) + if len(all_emb_ent) > 0: + emb_output_ent = torch.cat(all_emb_ent) + # 后缀 + save_path = osp.join(data_path, 'downstream_task', 'output') + os.makedirs(save_path, exist_ok=True) + for tp in tps: + save_dir = osp.join(save_path, f'{path_gen}_emb_{self.args.model_name.replace("DistributedDataParallel", "")}_{tp}.pt') + torch.save(emb_output[tp], save_dir) + # 
有训练好的实体embedding可使用 + if len(all_emb_ent) > 0: + save_dir = osp.join(save_path, f'{path_gen}_emb_{self.args.model_name.replace("DistributedDataParallel", "")}_ent.pt') + torch.save(emb_output_ent, save_dir) + + def KGE_test(self): + # 直接用KG全集进行kge的测试 + sz_test = len(self.kg_train_set) + # 先转换数据 + ent_set = set() + rel_set = set() + with tqdm(total=sz_test) as _tqdm: # 使用需要的参数对tqdm进行初始化 + _tqdm.set_description('trans entity/relation ID') + for batch in self.kg_train_set: + ent_set.add(batch[0]) + ent_set.add(batch[2]) + rel_set.add(batch[1]) + _tqdm.update(1) + all_ent, all_rel = list(ent_set), list(rel_set) + nent, nrel = len(all_ent), len(all_rel) + ent_dic, rel_dic = {}, {} + for i in range(nent): + ent_dic[all_ent[i]] = i + for i in range(nrel): + rel_dic[all_rel[i]] = i + id_format_triple = [] + with tqdm(total=sz_test) as _tqdm: + _tqdm.set_description('trans triple ID') + for triple in self.kg_train_set: + id_format_triple.append((ent_dic[triple[0]], rel_dic[triple[1]], ent_dic[triple[2]])) + _tqdm.update(1) + + # pdb.set_trace() + # 生成实体embedding并且保存 + ent_dataset = KGDataset(all_ent) + rel_dataset = KGDataset(all_rel) + + ent_dataloader = DataLoader( + ent_dataset, + batch_size=self.args.batch_size * 32, + num_workers=self.args.workers, + persistent_workers=True, + shuffle=False + ) + rel_dataloader = DataLoader( + rel_dataset, + batch_size=self.args.batch_size * 32, + num_workers=self.args.workers, + persistent_workers=True, + shuffle=False + ) + + sz_test = len(ent_dataloader) + len(rel_dataloader) + with tqdm(total=sz_test) as _tqdm: + ent_emb = [] + rel_emb = [] + step = 0 + _tqdm.set_description('get the ent embedding') + with torch.no_grad(): + for batch in ent_dataloader: + batch = self.tokenizer.batch_encode_plus( + batch, + padding='max_length', + max_length=self.args.maxlength, + truncation=True, + return_tensors="pt", + return_token_type_ids=False, + return_attention_mask=True, + add_special_tokens=False + ) + + batch_emb = self.model.cls_embedding(batch, tp=self.args.plm_emb_type) + batch_emb = self.ke_model.get_embedding(batch_emb, is_ent=True) + + ent_emb.append(batch_emb.cpu()) + _tqdm.update(1) + step += 1 + torch.cuda.empty_cache() + _tqdm.set_description(f'ENT emb: [{step}/{sz_test}]') + + _tqdm.set_description('get the rel embedding') + for batch in rel_dataloader: + batch = self.tokenizer.batch_encode_plus( + batch, + padding='max_length', + max_length=self.args.maxlength, + truncation=True, + return_tensors="pt", + return_token_type_ids=False, + return_attention_mask=True, + add_special_tokens=False + ) + batch_emb = self.model.cls_embedding(batch, tp=self.args.plm_emb_type) + batch_emb = self.ke_model.get_embedding(batch_emb, is_ent=False) + # batch_emb = self.model.get_embedding(batch, is_ent=False) + rel_emb.append(batch_emb.cpu()) + _tqdm.update(1) + step += 1 + torch.cuda.empty_cache() + _tqdm.set_description(f'REL emb: [{step}/{sz_test}]') + + all_ent_emb = torch.cat(ent_emb).cuda() + all_rel_emb = torch.cat(rel_emb).cuda() + # embedding:emb_output + # dim 256 + kge_model_for_test = KGEModel(nentity=len(all_ent), nrelation=len(all_rel), hidden_dim=self.args.ke_dim, + gamma=self.args.ke_margin, entity_embedding=all_ent_emb, relation_embedding=all_rel_emb).cuda() + if self.args.ke_test_num > 0: + test_triples = id_format_triple[:self.args.ke_test_num] + else: + test_triples = id_format_triple + with torch.no_grad(): + metrics = kge_model_for_test.test_step(test_triples=test_triples, all_true_triples=id_format_triple, args=self.args, nentity=len(all_ent), 
nrelation=len(all_rel)) + # pdb.set_trace() + print(f"result:{metrics}") + + def OD_test(self): + # data_path = self.args.data_path + # with open(osp.join(data_path, f'{self.args.order_test_name}.json'), "r") as fp: + # data = json.load(fp) + self.od_model.eval() + test_log = Loss_log() + test_log.acc_init() + sz_test = len(self.train_dataloader) + all_emb_ent = [] + with tqdm(total=sz_test) as _tqdm: # 使用需要的参数对tqdm进行初始化 + for step, batch in enumerate(self.train_dataloader): + with torch.no_grad(): + emb = self.model.cls_embedding(batch[0], tp=self.args.plm_emb_type) + out_emb = self.od_model.encode(emb) + emb_cpu = out_emb.cpu() + all_emb_ent.append(emb_cpu) + order_score = self.od_model.predict(emb) + token_right = self.od_model.right_caculate(order_score, batch[1], threshold=self.args.order_threshold) + test_log.update_token(batch[1].shape[0], [token_right]) + _tqdm.update(1) + _tqdm.set_description(f'Test | step [{step}/{sz_test}] Acc: {test_log.get_token_acc()}') + + emb_output = torch.cat(all_emb_ent) + data_path = self.args.data_path + save_path = osp.join(data_path, 'downstream_task', 'output') + os.makedirs(save_path, exist_ok=True) + save_dir = osp.join(save_path, f'ratio{self.args.train_ratio}_{emb_output.shape[0]}emb_{self.args.model_name.replace("DistributedDataParallel", "")}.pt') + torch.save(emb_output, save_dir) + print(f"save {emb_output.shape[0]} embeddings done...") + + @ torch.no_grad() + def test(self, path_gen=None): + test_log = Loss_log() + self.model.eval() + if not (self.args.mask_test or self.args.embed_gen or self.args.ke_test or len(self.args.order_test_name) > 0): + return + if self.args.mask_test: + self.mask_test(test_log) + if self.args.embed_gen: + self.emb_generate(path_gen) + if self.args.ke_test: + self.KGE_test() + if len(self.args.order_test_name) > 0: + runner.OD_test() + + def _load_model(self, model, name): + if model is None: + return None + # 没有训练过 + _name = name if name[:3] not in ['od_', 'ke_'] else name[3:] + save_path = osp.join(self.args.data_path, 'save', _name) + save_name = osp.join(save_path, f'{name}.pkl') + if not osp.exists(save_path) or not osp.exists(save_name): + return model.cuda() + # 载入模型 + if 'Distribute' in self.args.model_name: + model.load_state_dict({k.replace('module.', ''): v for k, v in torch.load(os.path.join(save_name), map_location=self.args.device).items()}) + else: + model.load_state_dict(torch.load(save_name, map_location=self.args.device)) + model.cuda() + if self.rank == 0: + print(f"loading model [{name}.pkl] done!") + + return model + + def _save_model(self, stage=''): + model_name = type(self.model).__name__ + # TODO: path + save_path = osp.join(self.args.data_path, 'save') + os.makedirs(save_path, exist_ok=True) + if self.args.train_strategy == 1: + save_name = f'{self.args.exp_name}_{self.args.exp_id}_s{self.args.random_seed}{stage}' + else: + save_name = f'{self.args.exp_name}_{self.args.exp_id}_s{self.args.random_seed}_{self.args.plm_emb_type}{stage}' + save_path = osp.join(save_path, save_name) + os.makedirs(save_path, exist_ok=True) + # 预训练模型保存 + self._save(self.model, save_path, save_name) + + # 下游模型保存 + save_name_od = f'od_{save_name}' + self._save(self.od_model, save_path, save_name_od) + save_name_ke = f'ke_{save_name}' + self._save(self.ke_model, save_path, save_name_ke) + return save_path + + def _save(self, model, save_path, save_name): + if model is None: + return + if self.args.save_model: + torch.save(model.state_dict(), osp.join(save_path, f'{save_name}.pkl')) + print(f"saving {save_name} 
done!") + + if self.args.save_pretrain and not save_name.startswith('od_') and not save_name.startswith('ke_'): + self.tokenizer.save_pretrained(osp.join(self.args.plm_path, f'{save_name}')) + self.model.encoder.save_pretrained(osp.join(self.args.plm_path, f'{save_name}')) + print(f"saving [pretrained] {save_name} done!") + + +if __name__ == '__main__': + cfg = cfg() + cfg.get_args() + cfgs = cfg.update_train_configs() + set_seed(cfgs.random_seed) + # 初始化各进程环境 + # pdb.set_trace() + if cfgs.dist and not cfgs.only_test: + init_distributed_mode(args=cfgs) + # cfgs.lr *= cfgs.world_size + # cfgs.ke_lr *= cfgs.world_size + else: + # 下面这条语句在并行的时候可能内存泄漏,导致无法停止 + torch.multiprocessing.set_sharing_strategy('file_system') + rank = cfgs.rank + + writer, logger = None, None + if rank == 0: + # 如果并行则只有一种情况打印 + logger = initialize_exp(cfgs) + logger_path = get_dump_path(cfgs) + cfgs.time_stamp = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now()) + comment = f'bath_size={cfgs.batch_size} exp_id={cfgs.exp_id}' + if not cfgs.no_tensorboard and not cfgs.only_test: + writer = SummaryWriter(log_dir=os.path.join(logger_path, 'tensorboard', cfgs.time_stamp), comment=comment) + + cfgs.device = torch.device(cfgs.device) + + # ----- Begin ---------- + runner = Runner(cfgs, writer, logger, rank) + + if cfgs.only_test: + if cfgs.embed_gen: + # 不需要生成的先搞定 + if cfgs.mask_test or cfgs.ke_test: + runner.args.embed_gen = 0 + runner.test() + runner.args.embed_gen = 1 + # gen_dir = ['yht_data_merge', 'yht_data_whole5gc', 'yz_data_whole5gc', 'yz_data_merge', 'zyc_data_merge', 'zyc_data_whole5gc'] + gen_dir = ['yht_serialize_withAttribute', 'yht_serialize_withoutAttr', 'yht_name_serialize', 'zyc_serialize_withAttribute', 'zyc_serialize_withoutAttr', 'zyc_name_serialize', + 'yz_serialize_withAttribute', 'yz_serialize_withoutAttr', 'yz_name_serialize', 'yz_serialize_net'] + # gen_dir = ['zyc_serialize_withAttribute', 'zyc_normal_serialize', 'zyc_data_whole5gc', 'zyc_data_merge', 'yht_normal_serialize', + # 'yht_serialize_withAttribute', 'yz_serialize_withAttribute', 'yz_serialize_net', 'yz_normal_serialize'] + runner.args.mask_test, runner.args.ke_test = 0, 0 + for item in gen_dir: + runner.test(item) + else: + runner.test() + else: + runner.run() + + # ----- End ---------- + if not cfgs.no_tensorboard and not cfgs.only_test and rank == 0: + writer.close() + logger.info("done!") + + if cfgs.dist and not cfgs.only_test: + dist.barrier() + dist.destroy_process_group() + # print("shut down...") diff --git a/KTeleBERT/model/HWBert.py b/KTeleBERT/model/HWBert.py new file mode 100644 index 0000000000000000000000000000000000000000..093f857d453988293edbf1964b20fcecc94ec413 --- /dev/null +++ b/KTeleBERT/model/HWBert.py @@ -0,0 +1,146 @@ +import os +import os.path as osp +import pdb +import torch +import torch.nn as nn +import numpy as np +from random import * +import json +from packaging import version +import torch.distributed as dist + +from .Tool_model import AutomaticWeightedLoss +from .Numeric import AttenNumeric +from .KE_model import KE_model +# from modeling_transformer import Transformer + + +from .bert import BertModel, BertTokenizer, BertForMaskedLM, BertConfig +import torch.nn.functional as F + +from copy import deepcopy +from src.utils import torch_accuracy +# 4.21.2 + + +def debug(input, kk, begin=None): + aaa = deepcopy(input[0]) + if begin is None: + aaa.input_ids = input[0].input_ids[:kk] + aaa.attention_mask = input[0].attention_mask[:kk] + aaa.chinese_ref = input[0].chinese_ref[:kk] + aaa.kpi_ref = input[0].kpi_ref[:kk] 
+ aaa.labels = input[0].labels[:kk] + else: + aaa.input_ids = input[0].input_ids[begin:kk] + aaa.attention_mask = input[0].attention_mask[begin:kk] + aaa.chinese_ref = input[0].chinese_ref[begin:kk] + aaa.kpi_ref = input[0].kpi_ref[begin:kk] + aaa.labels = input[0].labels[begin:kk] + + return aaa + + +class HWBert(nn.Module): + def __init__(self, args): + super().__init__() + self.loss_awl = AutomaticWeightedLoss(args.awl_num, args) + self.args = args + self.config = BertConfig() + model_name = args.model_name + if args.model_name in ['TeleBert', 'TeleBert2', 'TeleBert3']: + self.encoder = BertForMaskedLM.from_pretrained(osp.join(args.data_root, 'transformer', model_name)) + # MacBert来初始化 predictions layer + if args.cls_head_init: + tmp = BertForMaskedLM.from_pretrained(osp.join(args.data_root, 'transformer', 'MacBert')) + self.encoder.cls.predictions = tmp.cls.predictions + else: + if not osp.exists(osp.join(args.data_root, 'transformer', args.model_name)): + model_name = 'MacBert' + self.encoder = BertForMaskedLM.from_pretrained(osp.join(args.data_root, 'transformer', model_name)) + self.numeric_model = AttenNumeric(self.args) + + # ----------------------- 主forward函数 ---------------------------------- + def forward(self, input): + mask_loss, kpi_loss, kpi_loss_weight, kpi_loss_dict = self.mask_forward(input) + mask_loss = mask_loss.loss + loss_dic = {} + if not self.args.use_kpi_loss: + kpi_loss = None + if kpi_loss is not None: + loss_sum = self.loss_awl(mask_loss, 0.3 * kpi_loss) + loss_dic['kpi_loss'] = kpi_loss.item() + else: + loss_sum = self.loss_awl(mask_loss) + loss_dic['mask_loss'] = mask_loss.item() + return { + 'loss': loss_sum, + 'loss_dic': loss_dic, + 'loss_weight': self.loss_awl.params.tolist(), + 'kpi_loss_weight': kpi_loss_weight, + 'kpi_loss_dict': kpi_loss_dict + } + + # loss_sum, loss_dic, self.loss_awl.params.tolist(), kpi_loss_weight, kpi_loss_dict + + # ---------------------------------------------------------------- + # 测试代码,计算mask是否正确 + def mask_prediction(self, inputs, tokenizer_sz, topk=(1,)): + token_num, token_right, word_num, word_right = None, None, None, None + outputs, kpi_loss, kpi_loss_weight, kpi_loss_dict = self.mask_forward(inputs) + inputs = inputs['labels'].view(-1) + input_list = inputs.tolist() + # 被修改的词 + change_token_index = [i for i, x in enumerate(input_list) if x != -100] + change_token = torch.tensor(change_token_index) + inputs_used = inputs[change_token] + pred = outputs.logits.view(-1, tokenizer_sz) + pred_used = pred[change_token].cpu() + # 返回的list + # 计算acc + acc, token_right = torch_accuracy(pred_used, inputs_used, topk) + # 计算混乱分数 + + token_num = inputs_used.shape[0] + # TODO: 添加word_num, word_right + # token_right:list + return token_num, token_right, outputs.loss.item() + + def mask_forward(self, inputs): + kpi_ref = None + if 'kpi_ref' in inputs: + kpi_ref = inputs['kpi_ref'] + + outputs, kpi_loss, kpi_loss_weight, kpi_loss_dict = self.encoder( + input_ids=inputs['input_ids'].cuda(), + attention_mask=inputs['attention_mask'].cuda(), + # token_type_ids=inputs.token_type_ids.cuda(), + labels=inputs['labels'].cuda(), + kpi_ref=kpi_ref, + kpi_model=self.numeric_model + ) + return outputs, kpi_loss, kpi_loss_weight, kpi_loss_dict + + # TODO: 垂直注意力考虑:https://github.com/lucidrains/axial-attention + + def cls_embedding(self, inputs, tp='cls'): + hidden_states = self.encoder( + input_ids=inputs['input_ids'].cuda(), + attention_mask=inputs['attention_mask'].cuda(), + output_hidden_states=True)[0].hidden_states + if tp == 'cls': + return 
hidden_states[-1][:, 0] + else: + index_real = torch.tensor(inputs['input_ids'].clone().detach(), dtype=torch.bool) + res = [] + for i in range(hidden_states[-1].shape[0]): + if tp == 'last_avg': + res.append(hidden_states[-1][i][index_real[i]][:-1].mean(dim=0)) + elif tp == 'last2avg': + res.append((hidden_states[-1][i][index_real[i]][:-1] + hidden_states[-2][i][index_real[i]][:-1]).mean(dim=0)) + elif tp == 'last3avg': + res.append((hidden_states[-1][i][index_real[i]][:-1] + hidden_states[-2][i][index_real[i]][:-1] + hidden_states[-3][i][index_real[i]][:-1]).mean(dim=0)) + elif tp == 'first_last_avg': + res.append((hidden_states[-1][i][index_real[i]][:-1] + hidden_states[1][i][index_real[i]][:-1]).mean(dim=0)) + + return torch.stack(res) diff --git a/KTeleBERT/model/KE_model.py b/KTeleBERT/model/KE_model.py new file mode 100644 index 0000000000000000000000000000000000000000..e3b05b3db98beed2b69499d8ec7f980131b92f4d --- /dev/null +++ b/KTeleBERT/model/KE_model.py @@ -0,0 +1,451 @@ +import torch +from torch import nn +import torch.nn.functional as F +import numpy as np +from sklearn.metrics import average_precision_score +from tqdm import tqdm +import pdb +from torch.utils.data import DataLoader +from collections import defaultdict +import os.path as osp +import json + + +class KE_model(nn.Module): + def __init__(self, args): + super().__init__() + """ + triple task: mask tail entity, total entity size-class classification + """ + """ + :param hidden: BERT model output size + """ + self.args = args + self.ke_dim = args.ke_dim + + self.linear_ent = nn.Linear(args.hidden_size, self.ke_dim) + self.linear_rel = nn.Linear(args.hidden_size, self.ke_dim) + + self.ke_margin = nn.Parameter( + torch.Tensor([args.ke_margin]), + requires_grad=False + ) + + def forward(self, batch, hw_model): + batch_triple = batch + pos_sample = batch_triple["positive_sample"] + neg_sample = batch_triple["negative_sample"] + neg_index = batch_triple["neg_index"] + + # 节省显存 + all_entity = [] + all_entity_mask = [] + for i in range(3): + all_entity.append(pos_sample[i]['input_ids']) + all_entity_mask.append(pos_sample[i]['attention_mask']) + + all_entity = torch.cat(all_entity) + all_entity_mask = torch.cat(all_entity_mask) + entity_data = {'input_ids':all_entity, 'attention_mask':all_entity_mask} + entity_emb = hw_model.cls_embedding(entity_data, tp=self.args.plm_emb_type) + + bs = pos_sample[0]['input_ids'].shape[0] + pos_sample_emb= [entity_emb[:bs], entity_emb[bs:2*bs], entity_emb[2*bs:3*bs]] + neg_sample_emb = entity_emb[neg_index] + mode = batch_triple["mode"] + # pos_score = self.get_score(pos_sample, hw_model) + # neg_score = self.get_score(pos_sample, hw_model, neg_sample, mode) + pos_score = self.get_score(pos_sample_emb, hw_model) + neg_score = self.get_score(pos_sample_emb, hw_model, neg_sample_emb, mode) + triple_loss = self.adv_loss(pos_score, neg_score, self.args) + + return triple_loss + + # pdb.set_trace() + # return emb.div_(emb.detach().norm(p=1, dim=-1, keepdim=True)) + +# KE loss + def tri2emb(self, triples, hw_model, negs=None, mode="single"): + """Get embedding of triples. + This function get the embeddings of head, relation, and tail + respectively. each embedding has three dimensions. + Args: + triples (tensor): This tensor save triples id, which dimension is + [triples number, 3]. + negs (tensor, optional): This tenosr store the id of the entity to + be replaced, which has one dimension. when negs is None, it is + in the test/eval phase. Defaults to None. 
+ mode (str, optional): This arg indicates that the negative entity + will replace the head or tail entity. when it is 'single', it + means that entity will not be replaced. Defaults to 'single'. + Returns: + head_emb: Head entity embedding. + relation_emb: Relation embedding. + tail_emb: Tail entity embedding. + """ + + if mode == "single": + head_emb = self.get_embedding(triples[0]).unsqueeze(1) # [bs, 1, dim] + relation_emb = self.get_embedding(triples[1], is_ent=False).unsqueeze(1) # [bs, 1, dim] + tail_emb = self.get_embedding(triples[2]).unsqueeze(1) # [bs, 1, dim] + + elif mode == "head-batch" or mode == "head_predict": + if negs is None: # 说明这个时候是在evluation,所以需要直接用所有的entity embedding + # TODO:暂时不考虑KGC的测试情况 + head_emb = self.ent_emb.weight.data.unsqueeze(0) # [1, num_ent, dim] + else: + head_emb = self.get_embedding(negs).reshape(-1, self.args.neg_num, self.args.ke_dim) # [bs, num_neg, dim] + relation_emb = self.get_embedding(triples[1], is_ent=False).unsqueeze(1) # [bs, 1, dim] + tail_emb = self.get_embedding(triples[2]).unsqueeze(1) # [bs, 1, dim] + + elif mode == "tail-batch" or mode == "tail_predict": + head_emb = self.get_embedding(triples[0]).unsqueeze(1) # [bs, 1, dim] + relation_emb = self.get_embedding(triples[1], is_ent=False).unsqueeze(1) # [bs, 1, dim] + if negs is None: + tail_emb = self.ent_emb.weight.data.unsqueeze(0) # [1, num_ent, dim] + else: + # pdb.set_trace() + tail_emb = self.get_embedding(negs).reshape(-1, self.args.neg_num, self.args.ke_dim) # [bs, num_neg, dim] + + return head_emb, relation_emb, tail_emb + + def get_embedding(self, inputs, is_ent=True): + # pdb.set_trace() + if is_ent: + return self.linear_ent(inputs) + else: + return self.linear_rel(inputs) + + def score_func(self, head_emb, relation_emb, tail_emb): + """Calculating the score of triples. + + The formula for calculating the score is :math:`\gamma - ||h + r - t||_F` + Args: + head_emb: The head entity embedding. + relation_emb: The relation embedding. + tail_emb: The tail entity embedding. + mode: Choose head-predict or tail-predict. + Returns: + score: The score of triples. + """ + score = (head_emb + relation_emb) - tail_emb + # pdb.set_trace() + score = self.ke_margin.item() - torch.norm(score, p=1, dim=-1) + return score + + def get_score(self, triples, hw_model, negs=None, mode='single'): + """The functions used in the training phase + + Args: + triples: The triples ids, as (h, r, t), shape:[batch_size, 3]. + negs: Negative samples, defaults to None. + mode: Choose head-predict or tail-predict, Defaults to 'single'. + + Returns: + score: The score of triples. + """ + head_emb, relation_emb, tail_emb = self.tri2emb(triples, hw_model, negs, mode) + score = self.score_func(head_emb, relation_emb, tail_emb) + + return score + + def adv_loss(self, pos_score, neg_score, args): + """Negative sampling loss with self-adversarial training. In math: + + L=-\log \sigma\left(\gamma-d_{r}(\mathbf{h}, \mathbf{t})\right)-\sum_{i=1}^{n} p\left(h_{i}^{\prime}, r, t_{i}^{\prime}\right) \log \sigma\left(d_{r}\left(\mathbf{h}_{i}^{\prime}, \mathbf{t}_{i}^{\prime}\right)-\gamma\right) + + Args: + pos_score: The score of positive samples. + neg_score: The score of negative samples. + subsampling_weight: The weight for correcting pos_score and neg_score. + + Returns: + loss: The training loss for back propagation. 
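+        Note:
+            The weights p(h'_i, r, t'_i) are realised below as a detached softmax
+            over ``neg_score * args.adv_temp`` (the self-adversarial temperature),
+            so no gradient flows through the weighting itself.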
+ """ + neg_score = (F.softmax(neg_score * args.adv_temp, dim=1).detach() + * F.logsigmoid(-neg_score)).sum(dim=1) # shape:[bs] + pos_score = F.logsigmoid(pos_score).view(neg_score.shape[0]) # shape:[bs] + positive_sample_loss = - pos_score.mean() + negative_sample_loss = - neg_score.mean() + loss = (positive_sample_loss + negative_sample_loss) / 2 + return loss + + +class KGEModel(nn.Module): + def __init__(self, nentity, nrelation, hidden_dim, gamma, entity_embedding, relation_embedding): + super(KGEModel, self).__init__() + self.nentity = nentity + self.nrelation = nrelation + self.hidden_dim = hidden_dim + + self.gamma = nn.Parameter( + torch.Tensor([gamma]), + requires_grad=False + ) + self.entity_embedding = entity_embedding + self.relation_embedding = relation_embedding + + assert self.relation_embedding.shape[0] == nrelation + assert self.entity_embedding.shape[0] == nentity + + def forward(self, sample, mode='single'): + ''' + Forward function that calculate the score of a batch of triples. + In the 'single' mode, sample is a batch of triple. + In the 'head-batch' or 'tail-batch' mode, sample consists two part. + The first part is usually the positive sample. + And the second part is the entities in the negative samples. + Because negative samples and positive samples usually share two elements + in their triple ((head, relation) or (relation, tail)). + ''' + + if mode == 'single': + batch_size, negative_sample_size = sample.size(0), 1 + + head = torch.index_select( + self.entity_embedding, + dim=0, + index=sample[:, 0] + ).unsqueeze(1) + + relation = torch.index_select( + self.relation_embedding, + dim=0, + index=sample[:, 1] + ).unsqueeze(1) + + tail = torch.index_select( + self.entity_embedding, + dim=0, + index=sample[:, 2] + ).unsqueeze(1) + + elif mode == 'head-batch': + tail_part, head_part = sample + batch_size, negative_sample_size = head_part.size(0), head_part.size(1) + + head = torch.index_select( + self.entity_embedding, + dim=0, + index=head_part.view(-1) + ).view(batch_size, negative_sample_size, -1) + + relation = torch.index_select( + self.relation_embedding, + dim=0, + index=tail_part[:, 1] + ).unsqueeze(1) + + tail = torch.index_select( + self.entity_embedding, + dim=0, + index=tail_part[:, 2] + ).unsqueeze(1) + + elif mode == 'tail-batch': + head_part, tail_part = sample + batch_size, negative_sample_size = tail_part.size(0), tail_part.size(1) + + head = torch.index_select( + self.entity_embedding, + dim=0, + index=head_part[:, 0] + ).unsqueeze(1) + + relation = torch.index_select( + self.relation_embedding, + dim=0, + index=head_part[:, 1] + ).unsqueeze(1) + + tail = torch.index_select( + self.entity_embedding, + dim=0, + index=tail_part.view(-1) + ).view(batch_size, negative_sample_size, -1) + + else: + raise ValueError('mode %s not supported' % mode) + + score = self.TransE(head, relation, tail, mode) + + return score + + def TransE(self, head, relation, tail, mode): + if mode == 'head-batch': + score = head + (relation - tail) + else: + score = (head + relation) - tail + + score = self.gamma.item() - torch.norm(score, p=1, dim=-1) + return score + + @torch.no_grad() + def test_step(self, test_triples, all_true_triples, args, nentity, nrelation): + ''' + Evaluate the model on test or valid datasets + ''' + # Otherwise use standard (filtered) MRR, MR, HITS@1, HITS@3, and HITS@10 metrics + # Prepare dataloader for evaluation + test_dataloader_head = DataLoader( + KGTestDataset( + test_triples, + all_true_triples, + nentity, + nrelation, + 'head-batch' + ), + 
batch_size=args.batch_size, + num_workers=args.workers, + persistent_workers=True, + collate_fn=KGTestDataset.collate_fn + ) + + test_dataloader_tail = DataLoader( + KGTestDataset( + test_triples, + all_true_triples, + nentity, + nrelation, + 'tail-batch' + ), + batch_size=args.batch_size, + num_workers=args.workers, + persistent_workers=True, + collate_fn=KGTestDataset.collate_fn + ) + + test_dataset_list = [test_dataloader_head, test_dataloader_tail] + + logs = [] + + step = 0 + total_steps = sum([len(dataset) for dataset in test_dataset_list]) + + # pdb.set_trace() + with tqdm(total=total_steps) as _tqdm: + _tqdm.set_description(f'eval KGC') + for test_dataset in test_dataset_list: + for positive_sample, negative_sample, filter_bias, mode in test_dataset: + + positive_sample = positive_sample.cuda() + negative_sample = negative_sample.cuda() + filter_bias = filter_bias.cuda() + + batch_size = positive_sample.size(0) + + score = self.forward((positive_sample, negative_sample), mode) + score += filter_bias + + # Explicitly sort all the entities to ensure that there is no test exposure bias + argsort = torch.argsort(score, dim=1, descending=True) + + if mode == 'head-batch': + positive_arg = positive_sample[:, 0] + elif mode == 'tail-batch': + positive_arg = positive_sample[:, 2] + else: + raise ValueError('mode %s not supported' % mode) + + for i in range(batch_size): + # Notice that argsort is not ranking + # ranking = (argsort[i, :] == positive_arg[i]).nonzero() + ranking = (argsort[i, :] == positive_arg[i]).nonzero(as_tuple=False) + assert ranking.size(0) == 1 + + # ranking + 1 is the true ranking used in evaluation metrics + ranking = 1 + ranking.item() + logs.append({ + 'MRR': 1.0 / ranking, + 'MR': float(ranking), + 'HITS@1': 1.0 if ranking <= 1 else 0.0, + 'HITS@3': 1.0 if ranking <= 3 else 0.0, + 'HITS@10': 1.0 if ranking <= 10 else 0.0, + }) + + # if step % args.test_log_steps == 0: + # logging.info('Evaluating the model... 
(%d/%d)' % (step, total_steps)) + _tqdm.update(1) + _tqdm.set_description(f'KGC Eval:') + step += 1 + + metrics = {} + for metric in logs[0].keys(): + metrics[metric] = sum([log[metric] for log in logs]) / len(logs) + + return metrics + + +# 专门为KGE的测试设计一个dataset +class KGTestDataset(torch.utils.data.Dataset): + def __init__(self, triples, all_true_triples, nentity, nrelation, mode, head4rel_tail=None, tail4head_rel=None): + self.len = len(triples) + self.triple_set = set(all_true_triples) + self.triples = triples + + # 需要统计得到 + self.nentity = nentity + self.nrelation = nrelation + self.mode = mode + + # 给定关系尾实体对应头实体 + # print("build head4rel_tail") + # self.head4rel_tail = self.find_head4rel_tail() + # print("build tail4head_rel") + # self.tail4head_rel = self.find_tail4head_rel() + + def __len__(self): + return self.len + + def find_head4rel_tail(self): + ans = defaultdict(list) + for (h, r, t) in self.triple_set: + ans[(r, t)].append(h) + return ans + + def find_tail4head_rel(self): + ans = defaultdict(list) + for (h, r, t) in self.triple_set: + ans[(h, r)].append(t) + return ans + + def __getitem__(self, idx): + head, relation, tail = self.triples[idx] + + if self.mode == 'head-batch': + tmp = [(0, rand_head) if (rand_head, relation, tail) not in self.triple_set + else (-100, head) for rand_head in range(self.nentity)] + tmp[head] = (0, head) + elif self.mode == 'tail-batch': + tmp = [(0, rand_tail) if (head, relation, rand_tail) not in self.triple_set + else (-100, tail) for rand_tail in range(self.nentity)] + tmp[tail] = (0, tail) + else: + raise ValueError('negative batch mode %s not supported' % self.mode) + # if self.mode == 'head-batch': + # + # tmp = [(0, rand_head) if rand_head not in self.head4rel_tail[(relation, tail)] + # else (-100, head) for rand_head in range(self.nentity)] + # tmp[head] = (0, head) + # elif self.mode == 'tail-batch': + # tmp = [(0, rand_tail) if rand_tail not in self.tail4head_rel[(head, relation)] + # else (-100, tail) for rand_tail in range(self.nentity)] + # tmp[tail] = (0, tail) + # else: + # raise ValueError('negative batch mode %s not supported' % self.mode) + + tmp = torch.LongTensor(tmp) + filter_bias = tmp[:, 0].float() + negative_sample = tmp[:, 1] + + positive_sample = torch.LongTensor((head, relation, tail)) + + return positive_sample, negative_sample, filter_bias, self.mode + + @staticmethod + def collate_fn(data): + positive_sample = torch.stack([_[0] for _ in data], dim=0) + negative_sample = torch.stack([_[1] for _ in data], dim=0) + filter_bias = torch.stack([_[2] for _ in data], dim=0) + mode = data[0][3] + return positive_sample, negative_sample, filter_bias, mode diff --git a/KTeleBERT/model/Numeric.py b/KTeleBERT/model/Numeric.py new file mode 100644 index 0000000000000000000000000000000000000000..a32d584d7b46f6b79b4cfe4db379660faba00634 --- /dev/null +++ b/KTeleBERT/model/Numeric.py @@ -0,0 +1,218 @@ +import types +import torch +import transformers +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss +import numpy as np +import pdb +import math +from .Tool_model import AutomaticWeightedLoss +import os.path as osp +import json + + +def ortho_penalty(t): + return ((t @ t.T - torch.eye(t.shape[0]).cuda())**2).sum() + + +class AttenNumeric(nn.Module): + def __init__(self, config): + super(AttenNumeric, self).__init__() + # ----------- 加载kpi2id -------------------- + kpi_file_path = osp.join(config.data_path, 'kpi2id.json') + + with open(kpi_file_path, 'r') as f: + # pdb.set_trace() + kpi2id = 
json.load(f) + config.num_kpi = 303 + # config.num_kpi = len(kpi2id) + # ------------------------------- + + self.config = config + self.fc = nn.Linear(1, config.hidden_size) + # self.actication = nn.ReLU() + self.actication = nn.LeakyReLU() + # self.embedding = nn.Linear(config.hidden_size, self.attention_head_size) + if config.contrastive_loss: + self.loss_awl = AutomaticWeightedLoss(3, config) + else: + self.loss_awl = AutomaticWeightedLoss(2, config) + self.encoder = AttNumEncoder(config) + self.decoder = AttNumDecoder(config) + self.classifier = NumClassifier(config) + self.ce_loss = nn.CrossEntropyLoss() + + def contrastive_loss(self, hidden, kpi): + # in batch negative + bs_tmp = hidden.shape[0] + eye = torch.eye(bs_tmp).cuda() + hidden = F.normalize(hidden, dim=1) + # [12,12] + # 减去对角矩阵目的是防止对自身的相似程度影响了判断 + hidden_sim = (torch.matmul(hidden, hidden.T) - eye) / 0.07 + kpi = kpi.expand(-1, bs_tmp) + kpi_sim = torch.abs(kpi - kpi.T) + eye + kpi_sim = torch.min(kpi_sim, 1)[1] + sc_loss = self.ce_loss(hidden_sim, kpi_sim) + return sc_loss + + def _encode(self, kpi, query): + kpi_emb = self.actication(self.fc(kpi)) + # name_emb = self.embedding(query) + hidden, en_loss, scalar_list = self.encoder(kpi_emb, query) + + # 两个及以下的对比学习没有意义 + if self.config.contrastive_loss and hidden.shape[0] > 2: + con_loss = self.contrastive_loss(hidden.squeeze(1), kpi.squeeze(1)) + else: + con_loss = None + hidden = self.actication(hidden) + assert query.shape[0] > 0 + return hidden, en_loss, scalar_list, con_loss + + def forward(self, kpi, query, kpi_id): + hidden, en_loss, scalar_list, con_loss = self._encode(kpi, query) + dec_kpi_score, de_loss = self.decoder(kpi, hidden) + cls_kpi, cls_loss = self.classifier(hidden, kpi_id) + if con_loss is not None: + # 0.001 * con_loss + loss_sum = self.loss_awl(de_loss, cls_loss, 0.1 * con_loss) + loss_all = loss_sum + en_loss + loss_dic = {'cls_loss': cls_loss.item(), 'reg_loss': de_loss.item(), 'orth_loss': en_loss.item(), 'con_loss': con_loss.item()} + # pdb.set_trace() + else: + loss_sum = self.loss_awl(de_loss, cls_loss) + loss_all = loss_sum + en_loss + loss_dic = {'cls_loss': cls_loss.item(), 'reg_loss': de_loss.item(), 'orth_loss': en_loss.item()} + + return dec_kpi_score, cls_kpi, hidden, loss_all, self.loss_awl.params.tolist(), loss_dic, scalar_list + + +class AttNumEncoder(nn.Module): + def __init__(self, config): + super(AttNumEncoder, self).__init__() + self.num_l_layers = config.l_layers + self.layer = nn.ModuleList([AttNumLayer(config) for _ in range(self.num_l_layers)]) + + def forward(self, kpi_emb, name_emb): + loss = 0. 
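+        # accumulate the orthogonality penalty and the residual scaling factor returned by every AttNumLayer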
+ scalar_list = [] + for layer_module in self.layer: + kpi_emb, orth_loss, scalar = layer_module(kpi_emb, name_emb) + loss += orth_loss + scalar_list.append(scalar) + return kpi_emb, loss, scalar_list + + +class AttNumDecoder(nn.Module): + def __init__(self, config): + super(AttNumDecoder, self).__init__() + self.dense_1 = nn.Linear(config.hidden_size, config.hidden_size) + self.dense_2 = nn.Linear(config.hidden_size, 1) + self.actication = nn.LeakyReLU() + self.loss_func = nn.MSELoss(reduction='mean') + + def forward(self, kpi_label, hidden): + # 修复异常值 + pre = self.actication(self.dense_2(self.actication(self.dense_1(hidden)))) + loss = self.loss_func(pre, kpi_label) + # pdb.set_trace() + return pre, loss + + +class NumClassifier(nn.Module): + def __init__(self, config): + super(NumClassifier, self).__init__() + self.dense_1 = nn.Linear(config.hidden_size, int(config.hidden_size / 3)) + self.dense_2 = nn.Linear(int(config.hidden_size / 3), config.num_kpi) + self.loss_func = nn.CrossEntropyLoss() + # self.actication = nn.ReLU() + self.actication = nn.LeakyReLU() + + def forward(self, hidden, kpi_id): + hidden = self.actication(self.dense_1(hidden)) + pre = self.actication(self.dense_2(hidden)).squeeze(1) + loss = self.loss_func(pre, kpi_id) + return pre, loss + + +class AttNumLayer(nn.Module): + def __init__(self, config): + super(AttNumLayer, self).__init__() + self.config = config + # 768 / 8 = 8 + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) # 96 + # self.head_size = config.hidden_size + + # scaler + self.scalar = nn.Parameter(.3 * torch.ones(1, requires_grad=True)) + self.key = nn.Parameter(torch.empty(self.num_attention_heads, self.attention_head_size)) + + self.dense_down = nn.Linear(config.hidden_size, 128) + self.dense_up = nn.Linear(128, config.hidden_size) + + # name embedding + self.embedding = nn.Linear(config.hidden_size, self.attention_head_size) + # num_attention_heads�� value���� ת������k�� + self.value = nn.Linear(config.hidden_size, config.hidden_size * self.num_attention_heads) + + # add & norm + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=1e-12) + # 0.1 + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # for m in self.modules().modules(): + # pdb.set_trace() + + nn.init.kaiming_normal_(self.key, mode='fan_out', nonlinearity='leaky_relu') + # nn.init.orthogonal_(self.key) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads, + self.config.hidden_size, + ) + x = x.view(*new_x_shape) + return x + # return x.permute(0, 2, 1, 3) + + def forward(self, kpi_emb, name_emb): + # [64, 1, 96] + name_emb = self.embedding(name_emb) + + mixed_value_layer = self.value(kpi_emb) + + # [64, 1, 8, 768] + value_layer = self.transpose_for_scores(mixed_value_layer) + + # key: [8, 96] self.key.transpose(-1, -2): [96, 8] + # name_emb: [64, 1, 96] + attention_scores = torch.matmul(name_emb, self.key.transpose(-1, -2)) + # [64, 1, 8] + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
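+        # attention_probs has shape [bs, 1, num_attention_heads]: one weight per head-specific value projection of the KPI value embedding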
+ attention_probs = self.dropout(attention_probs) + attention_probs = attention_probs.unsqueeze(1) + # ��Ȩ��value�� + # [64, 1, 1, 8] * [64, 1, 8, 768] = [64, 1, 1, 768] + context_layer = torch.matmul(attention_probs, value_layer) + # context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.config.hidden_size,) + context_layer = context_layer.view(*new_context_layer_shape) + # add & norm + output_emb = self.dense(context_layer) + output_emb = self.dropout(output_emb) + output_emb = self.LayerNorm(output_emb + self.scalar * self.dense_up(self.dense_down(kpi_emb))) + # output_emb = self.LayerNorm(self.LayerNorm(output_emb) + self.scalar * kpi_emb) + # pdb.set_trace() + wei = self.value.weight.chunk(8, dim=0) + orth_loss_value = sum([ortho_penalty(k) for k in wei]) + # 0.01 * ortho_penalty(self.key) + ortho_penalty(self.value.weight) + orth_loss = 0.0001 * orth_loss_value + 0.0001 * ortho_penalty(self.dense.weight) + 0.01 * ((self.scalar[0])**2).sum() + return output_emb, orth_loss, self.scalar.tolist()[0] diff --git a/KTeleBERT/model/OD_model.py b/KTeleBERT/model/OD_model.py new file mode 100644 index 0000000000000000000000000000000000000000..176f6a2a064f1c4fe1eb6af876ff42d39352298d --- /dev/null +++ b/KTeleBERT/model/OD_model.py @@ -0,0 +1,74 @@ +import os +import os.path as osp +import pdb +import torch +import torch.nn as nn +import numpy as np +# from transformers import BertModel, BertTokenizer, BertForMaskedLM +import json +from packaging import version +import torch.distributed as dist + + +class OD_model(nn.Module): + def __init__(self, args): + super().__init__() + self.args = args + self.order_num = args.order_num + if args.od_type == 'linear_cat': + # self.order_dense_1 = nn.Linear(args.hidden_size * self.order_num, args.hidden_size) + # self.order_dense_2 = nn.Linear(args.hidden_size, 1) + self.order_dense_1 = nn.Linear(args.hidden_size * self.order_num, args.hidden_size) + if self.args.num_od_layer > 0: + self.layer = nn.ModuleList([OD_Layer_linear(args) for _ in range(args.num_od_layer)]) + + self.order_dense_2 = nn.Linear(args.hidden_size, 1) + + self.actication = nn.LeakyReLU() + self.bn = torch.nn.BatchNorm1d(args.hidden_size) + self.dp = nn.Dropout(p=args.hidden_dropout_prob) + self.loss_func = nn.BCEWithLogitsLoss() + # self.loss_func = nn.CrossEntropyLoss() + + def forward(self, input, labels): + # input 切成两半 + # 换方向拼接 + loss_dic = {} + pre = self.predict(input) + # pdb.set_trace() + loss = self.loss_func(pre, labels.unsqueeze(1)) + loss_dic['order_loss'] = loss.item() + return loss, loss_dic + + def encode(self, input): + if self.args.num_od_layer > 0: + for layer_module in self.layer: + input = layer_module(input) + inputs = torch.chunk(input, 2, dim=0) + emb = torch.concat(inputs, dim=1) + return self.actication(self.order_dense_1(self.dp(emb))) + + def predict(self, input): + return self.order_dense_2(self.bn(self.encode(input))) + + def right_caculate(self, input, labels, threshold=0.5): + input = input.squeeze(1).tolist() + labels = labels.tolist() + right = 0 + for i in range(len(input)): + if (input[i] >= threshold and labels[i] >= 0.5) or (input[i] < threshold and labels[i] < 0.5): + right += 1 + return right + + +class OD_Layer_linear(nn.Module): + def __init__(self, args): + super().__init__() + self.args = args + self.dense = nn.Linear(args.hidden_size, args.hidden_size) + self.actication = nn.LeakyReLU() + self.bn = torch.nn.BatchNorm1d(args.hidden_size) + self.dropout = 
nn.Dropout(p=args.hidden_dropout_prob) + + def forward(self, input): + return self.actication(self.bn(self.dense(self.dropout(input)))) diff --git a/KTeleBERT/model/Tool_model.py b/KTeleBERT/model/Tool_model.py new file mode 100644 index 0000000000000000000000000000000000000000..4fcaf6658f32277454c3d7cf6099a2b9df98a554 --- /dev/null +++ b/KTeleBERT/model/Tool_model.py @@ -0,0 +1,34 @@ +# -*- coding: UTF-8 -*- + +import torch +from torch import nn + +# https://github.com/Mikoto10032/AutomaticWeightedLoss/blob/master/AutomaticWeightedLoss.py + + +class AutomaticWeightedLoss(nn.Module): + # ''' + # automatically weighted multi-task loss + # Params�� + # num: int��the number of loss + # x: multi-task loss + # Examples�� + # loss1=1 + # loss2=2 + # awl = AutomaticWeightedLoss(2) + # loss_sum = awl(loss1, loss2) + # ''' + def __init__(self, num=2, args=None): + super(AutomaticWeightedLoss, self).__init__() + if args is None or args.use_awl: + params = torch.ones(num, requires_grad=True) + self.params = torch.nn.Parameter(params) + else: + params = torch.ones(num, requires_grad=False) + self.params = torch.nn.Parameter(params, requires_grad=False) + + def forward(self, *x): + loss_sum = 0 + for i, loss in enumerate(x): + loss_sum += 0.5 / (self.params[i] ** 2) * loss + torch.log(1 + self.params[i] ** 2) + return loss_sum diff --git a/KTeleBERT/model/__init__.py b/KTeleBERT/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e1e3efb081034d9cc1c914ed84d93fcc9e6e1b25 --- /dev/null +++ b/KTeleBERT/model/__init__.py @@ -0,0 +1,26 @@ +# from .vector import Vector +# from .classifier import SimpleClassifier +# # from .updn import UpDn +# # from .ban import Ban + +from .bert import ( + BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + BertForMaskedLM, + BertForMultipleChoice, + BertForNextSentencePrediction, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + BertLayer, + BertLMHeadModel, + BertModel, + BertPreTrainedModel, + load_tf_weights_in_bert, +) + +from .bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig +from .bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer +from .HWBert import HWBert +from .KE_model import KGEModel, KE_model +from .OD_model import OD_model diff --git a/KTeleBERT/model/__pycache__/HWBert.cpython-38.pyc b/KTeleBERT/model/__pycache__/HWBert.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fcba0f6dca46b8cff544d6590deecb543340b7a5 Binary files /dev/null and b/KTeleBERT/model/__pycache__/HWBert.cpython-38.pyc differ diff --git a/KTeleBERT/model/__pycache__/KE_model.cpython-38.pyc b/KTeleBERT/model/__pycache__/KE_model.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16f09b33ca4a3ca7303e3f020e8ce2426af92a2d Binary files /dev/null and b/KTeleBERT/model/__pycache__/KE_model.cpython-38.pyc differ diff --git a/KTeleBERT/model/__pycache__/Numeric.cpython-38.pyc b/KTeleBERT/model/__pycache__/Numeric.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdb11cdc559b481f115d0f2ecff1cf8cc0f6f3aa Binary files /dev/null and b/KTeleBERT/model/__pycache__/Numeric.cpython-38.pyc differ diff --git a/KTeleBERT/model/__pycache__/OD_model.cpython-38.pyc b/KTeleBERT/model/__pycache__/OD_model.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..887c39fb7f043dfefe4cbb8ce4448540a4fd6599 Binary files /dev/null and b/KTeleBERT/model/__pycache__/OD_model.cpython-38.pyc 
differ diff --git a/KTeleBERT/model/__pycache__/Tool_model.cpython-38.pyc b/KTeleBERT/model/__pycache__/Tool_model.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7868c3f3b27b62f7d8a31322f170027444f84110 Binary files /dev/null and b/KTeleBERT/model/__pycache__/Tool_model.cpython-38.pyc differ diff --git a/KTeleBERT/model/__pycache__/__init__.cpython-38.pyc b/KTeleBERT/model/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4884141f00bde8e2333a699ac97c0ab5a61f53d2 Binary files /dev/null and b/KTeleBERT/model/__pycache__/__init__.cpython-38.pyc differ diff --git a/KTeleBERT/model/bert/__init__.py b/KTeleBERT/model/bert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ea54209fbc13bffb6b0feae9d9156ac5f64defc4 --- /dev/null +++ b/KTeleBERT/model/bert/__init__.py @@ -0,0 +1,201 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from transformers.utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_flax_available, + is_tensorflow_text_available, + is_tf_available, + is_tokenizers_available, + is_torch_available, +) + + +_import_structure = { + "configuration_bert": ["BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BertConfig", "BertOnnxConfig"], + "tokenization_bert": ["BasicTokenizer", "BertTokenizer", "WordpieceTokenizer"], +} + +try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["tokenization_bert_fast"] = ["BertTokenizerFast"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_bert"] = [ + "BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "BertForMaskedLM", + "BertForMultipleChoice", + "BertForNextSentencePrediction", + "BertForPreTraining", + "BertForQuestionAnswering", + "BertForSequenceClassification", + "BertForTokenClassification", + "BertLayer", + "BertLMHeadModel", + "BertModel", + "BertPreTrainedModel", + "load_tf_weights_in_bert", + ] + +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_tf_bert"] = [ + "TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFBertEmbeddings", + "TFBertForMaskedLM", + "TFBertForMultipleChoice", + "TFBertForNextSentencePrediction", + "TFBertForPreTraining", + "TFBertForQuestionAnswering", + "TFBertForSequenceClassification", + "TFBertForTokenClassification", + "TFBertLMHeadModel", + "TFBertMainLayer", + "TFBertModel", + "TFBertPreTrainedModel", + ] +try: + if not is_tensorflow_text_available(): + raise OptionalDependencyNotAvailable() +except 
OptionalDependencyNotAvailable: + pass +else: + _import_structure["tokenization_bert_tf"] = ["TFBertTokenizer"] + +try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_flax_bert"] = [ + "FlaxBertForCausalLM", + "FlaxBertForMaskedLM", + "FlaxBertForMultipleChoice", + "FlaxBertForNextSentencePrediction", + "FlaxBertForPreTraining", + "FlaxBertForQuestionAnswering", + "FlaxBertForSequenceClassification", + "FlaxBertForTokenClassification", + "FlaxBertModel", + "FlaxBertPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig, BertOnnxConfig + from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer + + try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .tokenization_bert_fast import BertTokenizerFast + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_bert import ( + BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + BertForMaskedLM, + BertForMultipleChoice, + BertForNextSentencePrediction, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + BertLayer, + BertLMHeadModel, + BertModel, + BertPreTrainedModel, + load_tf_weights_in_bert, + ) + + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_tf_bert import ( + TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFBertEmbeddings, + TFBertForMaskedLM, + TFBertForMultipleChoice, + TFBertForNextSentencePrediction, + TFBertForPreTraining, + TFBertForQuestionAnswering, + TFBertForSequenceClassification, + TFBertForTokenClassification, + TFBertLMHeadModel, + TFBertMainLayer, + TFBertModel, + TFBertPreTrainedModel, + ) + + try: + if not is_tensorflow_text_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .tokenization_bert_tf import TFBertTokenizer + + try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_flax_bert import ( + FlaxBertForCausalLM, + FlaxBertForMaskedLM, + FlaxBertForMultipleChoice, + FlaxBertForNextSentencePrediction, + FlaxBertForPreTraining, + FlaxBertForQuestionAnswering, + FlaxBertForSequenceClassification, + FlaxBertForTokenClassification, + FlaxBertModel, + FlaxBertPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/KTeleBERT/model/bert/__pycache__/__init__.cpython-38.pyc b/KTeleBERT/model/bert/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..131338cfc9a6b584726b242fef385039fc142623 Binary files /dev/null and b/KTeleBERT/model/bert/__pycache__/__init__.cpython-38.pyc differ diff --git a/KTeleBERT/model/bert/__pycache__/configuration_bert.cpython-38.pyc b/KTeleBERT/model/bert/__pycache__/configuration_bert.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75b36538aa98d279db67217485595fd93b9a5e45 Binary files /dev/null and b/KTeleBERT/model/bert/__pycache__/configuration_bert.cpython-38.pyc differ diff --git 
a/KTeleBERT/model/bert/__pycache__/modeling_bert.cpython-38.pyc b/KTeleBERT/model/bert/__pycache__/modeling_bert.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c52d458d2549ec35352993174cf3d8f554c595b4 Binary files /dev/null and b/KTeleBERT/model/bert/__pycache__/modeling_bert.cpython-38.pyc differ diff --git a/KTeleBERT/model/bert/__pycache__/tokenization_bert.cpython-38.pyc b/KTeleBERT/model/bert/__pycache__/tokenization_bert.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8693b20056ae5766746c82589ef4d21b49498de Binary files /dev/null and b/KTeleBERT/model/bert/__pycache__/tokenization_bert.cpython-38.pyc differ diff --git a/KTeleBERT/model/bert/configuration_bert.py b/KTeleBERT/model/bert/configuration_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..0f48a8f93e25bd747fbd9b090014476e3804162d --- /dev/null +++ b/KTeleBERT/model/bert/configuration_bert.py @@ -0,0 +1,191 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BERT model configuration""" +from collections import OrderedDict +from typing import Mapping + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/config.json", + "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/config.json", + "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/config.json", + "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/config.json", + "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json", + "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json", + "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/config.json", + "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/config.json", + "bert-large-uncased-whole-word-masking": ( + "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/config.json" + ), + "bert-large-cased-whole-word-masking": ( + "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/config.json" + ), + "bert-large-uncased-whole-word-masking-finetuned-squad": ( + "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json" + ), + "bert-large-cased-whole-word-masking-finetuned-squad": ( + "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/config.json" + ), + "bert-base-cased-finetuned-mrpc": 
"https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/config.json", + "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/config.json", + "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/config.json", + "cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/config.json", + "cl-tohoku/bert-base-japanese-whole-word-masking": ( + "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json" + ), + "cl-tohoku/bert-base-japanese-char": ( + "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/config.json" + ), + "cl-tohoku/bert-base-japanese-char-whole-word-masking": ( + "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/config.json" + ), + "TurkuNLP/bert-base-finnish-cased-v1": ( + "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json" + ), + "TurkuNLP/bert-base-finnish-uncased-v1": ( + "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/config.json" + ), + "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/config.json", + # See all BERT models at https://huggingface.co/models?filter=bert +} + + +class BertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BertModel`] or a [`TFBertModel`]. It is used to + instantiate a BERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the BERT + [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). 
+ type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Examples: + + ```python + >>> from transformers import BertModel, BertConfig + + >>> # Initializing a BERT bert-base-uncased style configuration + >>> configuration = BertConfig() + + >>> # Initializing a model from the bert-base-uncased style configuration + >>> model = BertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "bert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +class BertOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ("token_type_ids", dynamic_axis), + ] + ) diff --git a/KTeleBERT/model/bert/modeling_bert.py b/KTeleBERT/model/bert/modeling_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..92fd8b7a6384ce04cf4079f5a971bec390307f97 --- /dev/null +++ b/KTeleBERT/model/bert/modeling_bert.py @@ -0,0 +1,2010 @@ +# 
coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model.""" + +import pdb +import math +import os +import warnings +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN + +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from transformers.utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_bert import BertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "bert-base-uncased" +_CONFIG_FOR_DOC = "BertConfig" +_TOKENIZER_FOR_DOC = "BertTokenizer" + +# TokenClassification docstring +_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "dbmdz/bert-large-cased-finetuned-conll03-english" +_TOKEN_CLASS_EXPECTED_OUTPUT = ( + "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC'] " +) +_TOKEN_CLASS_EXPECTED_LOSS = 0.01 + +# QuestionAnswering docstring +_CHECKPOINT_FOR_QA = "deepset/bert-base-cased-squad2" +_QA_EXPECTED_OUTPUT = "'a nice puppet'" +_QA_EXPECTED_LOSS = 7.41 +_QA_TARGET_START_INDEX = 14 +_QA_TARGET_END_INDEX = 15 + +# SequenceClassification docstring +_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "textattack/bert-base-uncased-yelp-polarity" +_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" +_SEQ_CLASS_EXPECTED_LOSS = 0.01 + + +BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "bert-base-uncased", + "bert-large-uncased", + "bert-base-cased", + "bert-large-cased", + "bert-base-multilingual-uncased", + "bert-base-multilingual-cased", + "bert-base-chinese", + "bert-base-german-cased", + "bert-large-uncased-whole-word-masking", + "bert-large-cased-whole-word-masking", + "bert-large-uncased-whole-word-masking-finetuned-squad", + "bert-large-cased-whole-word-masking-finetuned-squad", + "bert-base-cased-finetuned-mrpc", + "bert-base-german-dbmdz-cased", + "bert-base-german-dbmdz-uncased", + "cl-tohoku/bert-base-japanese", + "cl-tohoku/bert-base-japanese-whole-word-masking", + "cl-tohoku/bert-base-japanese-char", + "cl-tohoku/bert-base-japanese-char-whole-word-masking", + "TurkuNLP/bert-base-finnish-cased-v1", + "TurkuNLP/bert-base-finnish-uncased-v1", + 
"wietsedv/bert-base-dutch-cased", + # See all BERT models at https://huggingface.co/models?filter=bert +] + + +def load_tf_weights_in_bert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + if 
version.parse(torch.__version__) > version.parse("1.6.0"): + self.register_buffer( + "token_type_ids", + torch.zeros(self.position_ids.size(), dtype=torch.long), + persistent=False, + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + kpi_ref = None, # KPI数值替换的位置,以及参考的KPI name,KPI数值,类别 + kpi_model = None, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + # pdb.set_trace() + # TODO 得到 KPI的name embedding,pooling,输入数值编码模型,得到特征替换(mask+)特定位置原向量, + # 不产生新的embedding,直接读取生成的embedding + en_loss, scalar_list, con_loss, numeric_input, kpi_input = None, None, None, None, None + if kpi_ref is not None: + max_len = inputs_embeds.shape[1] + # 生成数值embedding + numeric_list = [] + kpi_emb_list = [] + kpi_id_list = [] + for i in range(len(kpi_ref)): + if len(kpi_ref[i])>0: + for item in kpi_ref[i]: + # 可能[NUM]被截断了 + if item[2]>=max_len: + continue + numeric_list.append(item[4]) + kpi_id_list.append(item[3]) + # requires_grad=True + kpi_name_embedding = torch.mean(inputs_embeds[i][item[0]:item[1]+1], dim=0) + kpi_emb_list.append(kpi_name_embedding) + # 有可能出现没有KPI的情况 + if len(kpi_emb_list)>0: + kpi_emb = torch.stack(kpi_emb_list).unsqueeze(1) + + # , dtype=torch.float64 + numeric_input = torch.Tensor(numeric_list).unsqueeze(1).unsqueeze(1).cuda() + kpi_input = torch.tensor(kpi_id_list, dtype=torch.long).cuda() + # pdb.set_trace() + hidden, en_loss, scalar_list, con_loss = kpi_model._encode(numeric_input, kpi_emb) + # 替换 + key = 0 + for i in range(len(kpi_ref)): + if len(kpi_ref[i])>0: + for item in kpi_ref[i]: + if item[2]>=max_len: + continue + # [NUM]的(x,y)坐标位置 + inputs_embeds[i,item[2]] = hidden[key][0] + key += 1 + assert key == hidden.shape[0] + + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + # 重构输出 + return embeddings, en_loss, scalar_list, con_loss, numeric_input, kpi_input + # return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + 
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
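The KPI-aware branch added to `BertEmbeddings.forward` above replaces the word embedding at each `[NUM]` placeholder with a vector produced by `kpi_model._encode` from the raw KPI value and a mean-pooled embedding of the KPI-name tokens. The sketch below isolates that replacement step; the 5-tuple layout of each `kpi_ref` entry (`name_start, name_end, num_pos, kpi_id, value`) is inferred from the indexing in the source, and `ToyNumericEncoder` is a stand-in assumption, not the project's actual KPI model.

```python
# Minimal sketch of the [NUM]-replacement performed in BertEmbeddings.forward.
# Assumption: each kpi_ref[i] entry is (name_start, name_end, num_pos, kpi_id, value);
# ToyNumericEncoder below is a placeholder for kpi_model._encode, not the real module.
import torch
import torch.nn as nn


class ToyNumericEncoder(nn.Module):
    """Maps a scalar KPI value, conditioned on the pooled KPI-name embedding, to hidden_size."""

    def __init__(self, hidden_size: int):
        super().__init__()
        self.proj = nn.Linear(hidden_size + 1, hidden_size)

    def forward(self, values: torch.Tensor, name_emb: torch.Tensor) -> torch.Tensor:
        # values: (n_kpi, 1), name_emb: (n_kpi, hidden_size) -> (n_kpi, hidden_size)
        return self.proj(torch.cat([name_emb, values], dim=-1))


def replace_num_embeddings(inputs_embeds, kpi_ref, encoder):
    """Overwrite the embedding at every [NUM] position with a numeric-conditioned vector."""
    max_len = inputs_embeds.shape[1]
    names, values, positions = [], [], []
    for i, refs in enumerate(kpi_ref):
        for name_start, name_end, num_pos, kpi_id, value in refs:
            if num_pos >= max_len:                 # [NUM] may have been truncated away
                continue
            names.append(inputs_embeds[i, name_start:name_end + 1].mean(dim=0))
            values.append([value])
            positions.append((i, num_pos))
    if not positions:                              # sequences without any KPI are left untouched
        return inputs_embeds
    hidden = encoder(torch.tensor(values), torch.stack(names))
    for k, (i, num_pos) in enumerate(positions):
        inputs_embeds[i, num_pos] = hidden[k]
    return inputs_embeds


if __name__ == "__main__":
    emb = torch.randn(2, 8, 16)
    kpi_ref = [[(1, 2, 3, 0, 0.75)], []]           # one KPI in the first sequence, none in the second
    out = replace_num_embeddings(emb, kpi_ref, ToyNumericEncoder(16))
    print(out.shape)                               # torch.Size([2, 8, 16])
```

Unlike this CPU-only toy, the original code moves the numeric batch to `.cuda()` and also hands back the encoder's auxiliary terms (`en_loss`, `con_loss`, `scalar_list`) so they can be folded into the overall training objective downstream.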
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = BertSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super().__init__() + 
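`BertAttention.prune_heads` above slices the query/key/value projections (and the input dimension of the output projection) so the selected heads are removed entirely; `BertModel._prune_heads`, defined further below, drives it from a `{layer_index: [head_indices]}` mapping. A minimal usage sketch with a randomly initialised model follows; the import paths and the config constructor arguments are assumptions that depend on the actual package layout.

```python
# Hedged sketch: pruning attention heads through the {layer: [heads]} mapping that
# BertModel._prune_heads forwards to BertAttention.prune_heads.
import torch

from configuration_bert import BertConfig   # config class shipped alongside this file (path assumption)
from modeling_bert import BertModel         # this file; adjust the import to the real package path

config = BertConfig(vocab_size=100, hidden_size=64, num_attention_heads=4,
                    num_hidden_layers=2, intermediate_size=128)
model = BertModel(config)

model._prune_heads({0: [0, 2], 1: [1]})      # drop heads 0 and 2 in layer 0, head 1 in layer 1

attn = model.encoder.layer[0].attention.self
print(attn.num_attention_heads, attn.all_head_size)    # 2 32  (two 16-dim heads remain)

# Note: this modified BertModel.forward returns the usual model output plus the
# KPI-related extras (en_loss, scalar_list, con_loss, numeric_input, kpi_input).
outputs, en_loss, scalar_list, con_loss, numeric_input, kpi_input = model(
    input_ids=torch.randint(0, 100, (1, 8))
)
print(outputs.last_hidden_state.shape)                  # torch.Size([1, 8, 64])
```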
self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BertAttention(config, position_embedding_type="absolute") + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) 
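`BertLayer` above runs its feed-forward sub-block through `apply_chunking_to_forward`: when `config.chunk_size_feed_forward > 0`, the sequence dimension is processed in slices so the large intermediate activations never exist for the full sequence at once. Because the feed-forward is position-wise, the chunked result equals the unchunked one. A small equivalent sketch, where `feed_forward` is only a stand-in for the `BertIntermediate`/`BertOutput` pair:

```python
# Hedged sketch of the chunked feed-forward idea behind apply_chunking_to_forward:
# slice the sequence dimension, apply the position-wise feed-forward per slice,
# and concatenate -- identical output, lower peak memory, slightly more overhead.
import torch


def feed_forward(x: torch.Tensor) -> torch.Tensor:
    # stand-in for the position-wise BertIntermediate + BertOutput pair
    return torch.relu(x) * 2.0


def chunked_feed_forward(x: torch.Tensor, chunk_size: int, chunk_dim: int = 1) -> torch.Tensor:
    if chunk_size == 0:                      # chunking disabled, mirror the default behaviour
        return feed_forward(x)
    chunks = x.split(chunk_size, dim=chunk_dim)
    return torch.cat([feed_forward(c) for c in chunks], dim=chunk_dim)


hidden = torch.randn(2, 10, 16)              # (batch, seq_len, hidden)
print(torch.allclose(feed_forward(hidden), chunked_feed_forward(hidden, chunk_size=3)))  # True
```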
+ + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
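`BertEncoder.forward` above optionally wraps each layer call in `torch.utils.checkpoint.checkpoint`: activations inside the layer are not stored during the forward pass and are recomputed during backward, which lowers peak memory at the cost of roughly one extra forward per layer (and is why `use_cache` is forced off). A standalone sketch of the same pattern on toy layers:

```python
# Hedged sketch of the gradient-checkpointing pattern used in BertEncoder.forward.
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint


class ToyLayer(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.ff = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))

    def forward(self, x):
        return x + self.ff(x)


layers = nn.ModuleList(ToyLayer(32) for _ in range(4))
x = torch.randn(2, 16, 32, requires_grad=True)

hidden = x
for layer in layers:
    # recent PyTorch versions may also want an explicit use_reentrant=... argument
    hidden = checkpoint(layer, hidden)

hidden.sum().backward()
print(x.grad.shape)        # torch.Size([2, 16, 32]) -- gradients still flow despite recomputation
```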
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = BertConfig + load_tf_weights = load_tf_weights_in_bert + base_model_prefix = "bert" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BertEncoder): + module.gradient_checkpointing = value + + +@dataclass +class BertForPreTrainingOutput(ModelOutput): + """ + Output type of [`BertForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +BERT_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BertConfig`]): Model configuration class with all the parameters of the model. 
+ Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, +) +class BertModel(BertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. 
+ """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + kpi_ref = None, # KPI数值替换的位置,以及参考的KPI name,KPI数值,类别 + kpi_model = None, # 输入KPI模型 + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output, en_loss, scalar_list, con_loss, numeric_input, kpi_input = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + kpi_ref=kpi_ref, # KPI数值替换的位置,以及参考的KPI name,KPI数值,类别 + kpi_model=kpi_model, + ) + + # 输入了占位符的位置信息 + # KPI的起始,结束位置embedding 的 pooling + + # 在这里按位置替换数值embedding + # 同时用KPI的 embedding 作为监督信号 + # + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + + # 在这里对数值embedding的位置做回归loss + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ), en_loss, scalar_list, con_loss, numeric_input, kpi_input + + +@add_start_docstrings( + """ + Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. 
+ """, + BERT_START_DOCSTRING, +) +class BertForPreTraining(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + next_sentence_label: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BertForPreTrainingOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence + pair (see `input_ids` docstring) Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (`Dict[str, any]`, optional, defaults to *{}*): + Used to hide legacy arguments that have been deprecated. 
+ + Returns: + + Example: + + ```python + >>> from transformers import BertTokenizer, BertForPreTraining + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + >>> model = BertForPreTraining.from_pretrained("bert-base-uncased") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return BertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top for CLM fine-tuning.""", BERT_START_DOCSTRING +) +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + 
return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]` + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING) +class BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.bert = BertModel(config, add_pooling_layer=False) + + # CZ: 添加了pooling + # self.bert = BertModel(config) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'paris'", + expected_loss=0.88, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + kpi_ref = None, # KPI数值替换的位置,以及参考的KPI name,KPI数值,类别 + kpi_model = None, # 输入KPI模型 + ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs, en_loss, scalar_list, con_loss, numeric_input, kpi_input = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + kpi_ref=kpi_ref, # KPI数值替换的位置,以及参考的KPI name,KPI数值,类别 + kpi_model=kpi_model, + ) + + + # decode 等loss 在这里计算 + # 数值位本来就被maskl不参与loss计算,可以单独算loss + sequence_output = outputs[0] + + # 可能出现没有kpi的情况 + # if kpi_ref is None: + # kpi_loss = None + # else: + # # awl需要 + # kpi_loss = torch.tensor([0.]).cuda() + kpi_loss, kpi_loss_weight, kpi_loss_dict = None, None, None + if kpi_input is not None: + max_len = sequence_output.shape[1] + # 生成数值embedding + kpi_emb_list = [] + for i in range(len(kpi_ref)): + if len(kpi_ref[i])>0: + for item in kpi_ref[i]: + # 可能[NUM]被截断了 + if item[2]>=max_len: + continue + # requires_grad=True + kpi_emb_list.append(sequence_output[i][item[2]]) + + # TODO: 把KPI con loss 归一化,因为KPI会浮动 + kpi_emb = torch.stack(kpi_emb_list).unsqueeze(1) + + # numeric_input: 相关的数值 + # kpi_input:相关的KPI id + _dec_kpi_score, de_loss = kpi_model.decoder(numeric_input, kpi_emb) + # pdb.set_trace() + _cls_kpi, cls_loss = kpi_model.classifier(kpi_emb, kpi_input) + # pdb.set_trace() + # pdb.set_trace() + # 提前乘一个系数降低影响 + if con_loss is not None: + kpi_loss = kpi_model.loss_awl(de_loss, 0.2 * cls_loss, 0.2 * con_loss) + 0.5 * en_loss + kpi_loss_dict = 
{'de_loss':de_loss.item(), 'con_loss':con_loss.item(), 'cls_loss':cls_loss.item(), 'en_loss':en_loss.item()} + else: + kpi_loss = kpi_model.loss_awl(de_loss, 0.1 * cls_loss) + 0.5 * en_loss + kpi_loss_dict = {'de_loss':de_loss.item(), 'cls_loss':cls_loss.item(), 'en_loss':en_loss.item()} + kpi_loss_weight = kpi_model.loss_awl.params.tolist() + + + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token [ignore_index=- 100] + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + # pdb.set_trace() + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions + ), kpi_loss, kpi_loss_weight, kpi_loss_dict + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError("The PAD token should be defined for generation") + + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} + + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top.""", + BERT_START_DOCSTRING, +) +class BertForNextSentencePrediction(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see `input_ids` docstring). Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. 
+ + Returns: + + Example: + + ```python + >>> from transformers import BertTokenizer, BertForNextSentencePrediction + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + >>> model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased") + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt") + + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + ``` + """ + + if "next_sentence_label" in kwargs: + warnings.warn( + "The `next_sentence_label` argument is deprecated and will be removed in a future version, use" + " `labels` instead.", + FutureWarning, + ) + labels = kwargs.pop("next_sentence_label") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores,) + outputs[2:] + return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. 
+ """, + BERT_START_DOCSTRING, +) +class BertForSequenceClassification(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + BERT_START_DOCSTRING, +) +class BertForMultipleChoice(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + BERT_START_DOCSTRING, +) +class BertForTokenClassification(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, + expected_loss=_TOKEN_CLASS_EXPECTED_LOSS, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
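+
+        Example (an added sketch, not part of the original file; the checkpoint name, `num_labels=5` and the
+        all-zero label ids are placeholders):
+
+        ```python
+        >>> from transformers import BertTokenizer, BertForTokenClassification
+        >>> import torch
+
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        >>> model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=5)
+
+        >>> inputs = tokenizer("Alarm cleared on board 7", return_tensors="pt")
+        >>> labels = torch.zeros_like(inputs["input_ids"])  # one label id in [0, num_labels - 1] per token
+        >>> outputs = model(**inputs, labels=labels)
+        >>> # logits have shape (batch_size, sequence_length, num_labels); the loss is token-level cross-entropy
+        >>> outputs.logits.shape[-1] == model.config.num_labels
+        True
+        ```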
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BERT_START_DOCSTRING, +) +class BertForQuestionAnswering(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_QA, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + qa_target_start_index=_QA_TARGET_START_INDEX, + qa_target_end_index=_QA_TARGET_END_INDEX, + expected_output=_QA_EXPECTED_OUTPUT, + expected_loss=_QA_EXPECTED_LOSS, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
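+
+        Example (an added sketch, not part of the original file; it uses the untuned `bert-base-uncased`
+        checkpoint, so the extracted span is not expected to be meaningful):
+
+        ```python
+        >>> from transformers import BertTokenizer, BertForQuestionAnswering
+        >>> import torch
+
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        >>> model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
+
+        >>> question = "Which board reported the alarm?"
+        >>> context = "Board 7 reported a link-down alarm at 09:14."
+        >>> inputs = tokenizer(question, context, return_tensors="pt")
+
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+        >>> start = int(outputs.start_logits.argmax())
+        >>> end = int(outputs.end_logits.argmax())
+        >>> answer = tokenizer.decode(inputs["input_ids"][0, start : end + 1])
+        ```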
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/KTeleBERT/model/bert/tokenization_bert.py b/KTeleBERT/model/bert/tokenization_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..4da4ce3f234d8f41a6e3440741e31ef964a8fb46 --- /dev/null +++ b/KTeleBERT/model/bert/tokenization_bert.py @@ -0,0 +1,574 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes for Bert.""" + + +import collections +import os +import unicodedata +from typing import List, Optional, Tuple + +from transformers.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", + "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", + "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt", + "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt", + "bert-base-multilingual-uncased": ( + "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/vocab.txt" + ), + "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt", + "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt", + "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/vocab.txt", + "bert-large-uncased-whole-word-masking": ( + "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt" + ), + "bert-large-cased-whole-word-masking": ( + "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/vocab.txt" + ), + "bert-large-uncased-whole-word-masking-finetuned-squad": ( + "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt" + ), + "bert-large-cased-whole-word-masking-finetuned-squad": ( + "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt" + ), + "bert-base-cased-finetuned-mrpc": ( + "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt" + ), + "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/vocab.txt", + "bert-base-german-dbmdz-uncased": ( + "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt" + ), + "TurkuNLP/bert-base-finnish-cased-v1": ( + "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt" + ), + "TurkuNLP/bert-base-finnish-uncased-v1": ( + "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/vocab.txt" + ), + "wietsedv/bert-base-dutch-cased": ( + "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/vocab.txt" + ), + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "bert-base-uncased": 512, + "bert-large-uncased": 512, + "bert-base-cased": 512, + "bert-large-cased": 512, + "bert-base-multilingual-uncased": 512, + "bert-base-multilingual-cased": 512, + "bert-base-chinese": 512, + "bert-base-german-cased": 512, + "bert-large-uncased-whole-word-masking": 512, + "bert-large-cased-whole-word-masking": 512, + "bert-large-uncased-whole-word-masking-finetuned-squad": 512, + "bert-large-cased-whole-word-masking-finetuned-squad": 512, + "bert-base-cased-finetuned-mrpc": 512, + "bert-base-german-dbmdz-cased": 512, + "bert-base-german-dbmdz-uncased": 512, + "TurkuNLP/bert-base-finnish-cased-v1": 512, + "TurkuNLP/bert-base-finnish-uncased-v1": 512, + "wietsedv/bert-base-dutch-cased": 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + "bert-base-uncased": {"do_lower_case": 
True}, + "bert-large-uncased": {"do_lower_case": True}, + "bert-base-cased": {"do_lower_case": False}, + "bert-large-cased": {"do_lower_case": False}, + "bert-base-multilingual-uncased": {"do_lower_case": True}, + "bert-base-multilingual-cased": {"do_lower_case": False}, + "bert-base-chinese": {"do_lower_case": False}, + "bert-base-german-cased": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False}, + "bert-base-cased-finetuned-mrpc": {"do_lower_case": False}, + "bert-base-german-dbmdz-cased": {"do_lower_case": False}, + "bert-base-german-dbmdz-uncased": {"do_lower_case": True}, + "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False}, + "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True}, + "wietsedv/bert-base-dutch-cased": {"do_lower_case": False}, +} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertTokenizer(PreTrainedTokenizer): + r""" + Construct a BERT tokenizer. Based on WordPiece. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. 
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. 
A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence + pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" 
+ ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +class BasicTokenizer(object): + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents: (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + """ + + def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see + WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
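+        # Added illustration (not from the upstream comment): _tokenize_chinese_chars pads every CJK
+        # character with spaces, so a mixed string such as "告警ALM-1234 occurred" is whitespace-tokenized
+        # below into ["告", "警", "ALM-1234", "occurred"] before lower-casing and punctuation splitting.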
+ if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. 
This should have + already been passed through *BasicTokenizer*. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens diff --git a/KTeleBERT/requirements.txt b/KTeleBERT/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..65af691759d00c75fb7fba2b70deff840fec5648 --- /dev/null +++ b/KTeleBERT/requirements.txt @@ -0,0 +1,10 @@ +transformers==4.12.2 +tqdm +torch +ltp +ltp-core +ltp-extension +cycle +torch>=1.10.0 +easydict +re diff --git a/KTeleBERT/run.sh b/KTeleBERT/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..4e34ff6723aeb3d277d33da31cdfe986f8bf765b --- /dev/null +++ b/KTeleBERT/run.sh @@ -0,0 +1,35 @@ +python -m torch.distributed.launch --nproc_per_node=4 main.py --LLRD 1 \ + --eval_step 10 \ + --save_model 1 \ + --mask_stratege wwm \ + --batch_size 64 \ + --batch_size_ke 64 \ + --exp_name Fine_tune_2 \ + --exp_id v01 \ + --workers 8 \ + --use_NumEmb 1 \ + --seq_data_name Seq_data_RuAlmEntKpiTbwDoc \ + --maxlength 256 \ + --lr 4e-5 \ + --ke_lr 8e-5 \ + --train_strategy 2 \ + --model_name TeleBert2 \ + --train_ratio 1 \ + --save_pretrain 0 \ + --dist 1 \ + --accumulation_steps 8 \ + --accumulation_steps_ke 6 \ + --special_token_mask 0 \ + --freeze_layer 0 \ + --ernie_stratege -1 \ + --mlm_probability_increase curve \ + --use_kpi_loss 1 \ + --mlm_probability 0.4 \ + --use_awl 1 \ + --cls_head_init 1 \ + --emb_init 0 \ + --final_mlm_probability 0.4 \ + --ke_dim 256 \ + --plm_emb_type cls \ + --train_together 0 \ + diff --git a/KTeleBERT/run_get_ref.sh b/KTeleBERT/run_get_ref.sh new file mode 100644 index 0000000000000000000000000000000000000000..ce64fd9460ce121c3214f4aed5e5ba2e34d156c4 --- /dev/null +++ b/KTeleBERT/run_get_ref.sh @@ -0,0 +1,22 @@ +python get_chinese_ref.py --batch_size 50 \ + --deal_numeric 1 \ + --seq_data_name Seq_data_large \ + --read_cws 0 \ + # --seq_data_name Seq_data_base \ + # --read_cws 1 \ + +# python get_chinese_ref.py --batch_size 150 + +# python get_chinese_ref.py --batch_size 200 + +# python get_chinese_ref.py --batch_size 250 + +# python get_chinese_ref.py --batch_size 300 + +# python main.py --LLRD 1 \ +# --eval_step 10 \ +# --epoch 20 \ +# --save_model 1 \ +# --mask_stratege wwm \ +# --batch_size 50 \ +# --use_NumEmb 1 \ \ No newline at end of file diff --git a/KTeleBERT/special_token_pre_emb.py b/KTeleBERT/special_token_pre_emb.py new file mode 100644 index 0000000000000000000000000000000000000000..c149d3804d977c3bb5cbe005c5a11633961b7e85 --- /dev/null +++ b/KTeleBERT/special_token_pre_emb.py @@ -0,0 +1,119 @@ +from src.utils import add_special_token +import os.path as osp +import numpy as np +import random +import torch +from easydict import EasyDict as edict +import argparse +import pdb +import json +from model import BertTokenizer +from collections import Counter +from tqdm import tqdm +from time import time +from numpy 
import mean +import math + +from transformers import BertModel + + +class cfg(): + def __init__(self): + self.this_dir = osp.dirname(__file__) + # change + self.data_root = osp.abspath(osp.join(self.this_dir, '..', '..', 'data', '')) + + def get_args(self): + parser = argparse.ArgumentParser() + # seq_data_name = "Seq_data_tiny_831" + parser.add_argument("--data_path", default="huawei", type=str, help="Experiment path") + parser.add_argument("--update_model_name", default='MacBert', type=str, help="MacBert") + parser.add_argument("--pretrained_model_name", default='TeleBert', type=str, help="TeleBert") + parser.add_argument("--read_cws", default=0, type=int, help="是否需要读训练好的cws文件") + self.cfg = parser.parse_args() + + def update_train_configs(self): + # TODO: update some dynamic variable + self.cfg.data_root = self.data_root + self.cfg.data_path = osp.join(self.data_root, self.cfg.data_path) + + return self.cfg + + +if __name__ == '__main__': + ''' + 功能: 得到 chinese ref 文件,同时刷新训练/测试文件(仅针对序列的文本数据) + ''' + cfg = cfg() + cfg.get_args() + cfgs = cfg.update_train_configs() + + # 用来被更新的,需要添加token的tokenizer + path = osp.join(cfgs.data_root, 'transformer', cfgs.update_model_name) + assert osp.exists(path) + tokenizer = BertTokenizer.from_pretrained(path, do_lower_case=True) + tokenizer, special_token, norm_token = add_special_token(tokenizer) + added_vocab = tokenizer.get_added_vocab() + vocb_path = osp.join(cfgs.data_path, 'added_vocab.json') + + with open(vocb_path, 'w') as fp: + json.dump(added_vocab, fp, ensure_ascii=False) + + vocb_description = osp.join(cfgs.data_path, 'vocab_descrip.json') + vocb_descrip = None + + vocb_descrip = { + "alm": "alarm", + "ran": "ran 无线接入网", + "mml": "MML 人机语言命令", + "nf": "NF 独立网络服务", + "apn": "APN 接入点名称", + "pgw": "PGW 数据管理子系统模块", + "lst": "LST 查询命令", + "qos": "QoS 定制服务质量", + "ipv": "IPV 互联网通讯协议版本", + "ims": "IMS IP多模态子系统", + "gtp": "GTP GPRS隧道协议", + "pdp": "PDP 分组数据协议", + "hss": "HSS HTTP Smooth Stream", + "[ALM]": "alarm 告警 标记", + "[KPI]": "kpi 关键性能指标 标记", + "[LOC]": "location 事件发生位置 标记", + "[EOS]": "end of the sentence 文档结尾 标记", + "[ENT]": "实体标记", + "[ATTR]": "属性标记", + "[NUM]": "数值标记", + "[REL]": "关系标记", + "[DOC]": "文档标记" + } + + # if osp.exists(vocb_description): + # with open(vocb_description, 'r') as fp: + # vocb_descrip = json.load(added_vocab) + + # 用来进行embedding的模型 + path = osp.join(cfgs.data_root, 'transformer', cfgs.pretrained_model_name) + assert osp.exists(path) + pre_tokenizer = BertTokenizer.from_pretrained(path, do_lower_case=True) + model = BertModel.from_pretrained(path) + + print("use the vocb_description") + key_to_emb = {} + for key in added_vocab.keys(): + if vocb_description is not None: + if key in vocb_description: + # 一部分需要描述 + key_tokens = pre_tokenizer(vocb_description[key], return_tensors='pt') + else: + key_tokens = pre_tokenizer(key, return_tensors='pt') + else: + key_tokens = pre_tokenizer(key, return_tensors='pt') + + hidden_state = model(**key_tokens, output_hidden_states=True).hidden_states + pdb.set_trace() + key_to_emb[key] = hidden_state[-1][:, 1:-1, :].mean(dim=1) + + emb_path = osp.join(cfgs.data_path, 'added_vocab_embedding.pt') + + torch.save(key_to_emb, emb_path) + print(f'save to {emb_path}') diff --git a/KTeleBERT/src/__init__.py b/KTeleBERT/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/KTeleBERT/src/__init__.py @@ -0,0 +1 @@ + diff --git a/KTeleBERT/src/__pycache__/__init__.cpython-38.pyc 
b/KTeleBERT/src/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d080e6ead7ca4ae6ce9dbeb5878d73b25a109c06 Binary files /dev/null and b/KTeleBERT/src/__pycache__/__init__.cpython-38.pyc differ diff --git a/KTeleBERT/src/__pycache__/data.cpython-38.pyc b/KTeleBERT/src/__pycache__/data.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6db541ae48026f927cb9b3c488a331b74f940999 Binary files /dev/null and b/KTeleBERT/src/__pycache__/data.cpython-38.pyc differ diff --git a/KTeleBERT/src/__pycache__/distributed_utils.cpython-38.pyc b/KTeleBERT/src/__pycache__/distributed_utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57d4f87a8b2c1c70792eace612b5b1a47f864e7e Binary files /dev/null and b/KTeleBERT/src/__pycache__/distributed_utils.cpython-38.pyc differ diff --git a/KTeleBERT/src/__pycache__/utils.cpython-38.pyc b/KTeleBERT/src/__pycache__/utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c9f69aaeb4d58e64d8f942a5d5a97a71a2b6b60 Binary files /dev/null and b/KTeleBERT/src/__pycache__/utils.cpython-38.pyc differ diff --git a/KTeleBERT/src/data.py b/KTeleBERT/src/data.py new file mode 100644 index 0000000000000000000000000000000000000000..40abacf132e4b4c45f458ce3d140c0361f86ee9a --- /dev/null +++ b/KTeleBERT/src/data.py @@ -0,0 +1,651 @@ +import torch +import random +import json +import numpy as np +import pdb +import os.path as osp +from model import BertTokenizer +import torch.distributed as dist + + +class SeqDataset(torch.utils.data.Dataset): + def __init__(self, data, chi_ref=None, kpi_ref=None): + self.data = data + self.chi_ref = chi_ref + self.kpi_ref = kpi_ref + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + sample = self.data[index] + if self.chi_ref is not None: + chi_ref = self.chi_ref[index] + else: + chi_ref = None + + if self.kpi_ref is not None: + kpi_ref = self.kpi_ref[index] + else: + kpi_ref = None + + return sample, chi_ref, kpi_ref + + +class OrderDataset(torch.utils.data.Dataset): + def __init__(self, data, kpi_ref=None): + self.data = data + self.kpi_ref = kpi_ref + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + sample = self.data[index] + if self.kpi_ref is not None: + kpi_ref = self.kpi_ref[index] + else: + kpi_ref = None + + return sample, kpi_ref + + +class KGDataset(torch.utils.data.Dataset): + def __init__(self, data): + self.data = data + self.len = len(self.data) + + def __len__(self): + return self.len + + def __getitem__(self, index): + + sample = self.data[index] + return sample + +# TODO: 重构 DataCollatorForLanguageModeling + + +class Collator_base(object): + # TODO: 定义 collator,模仿Lako + # 完成mask,padding + def __init__(self, args, tokenizer, special_token=None): + self.tokenizer = tokenizer + if special_token is None: + self.special_token = ['[SEP]', '[MASK]', '[ALM]', '[KPI]', '[CLS]', '[LOC]', '[EOS]', '[ENT]', '[ATTR]', '[NUM]', '[REL]', '|', '[DOC]'] + else: + self.special_token = special_token + + self.text_maxlength = args.maxlength + self.mlm_probability = args.mlm_probability + self.args = args + if self.args.special_token_mask: + self.special_token = ['|', '[NUM]'] + + if not self.args.only_test and self.args.use_mlm_task: + if args.mask_stratege == 'rand': + self.mask_func = self.torch_mask_tokens + else: + if args.mask_stratege == 'wwm': + # 必须使用special_word, 因为这里的wwm基于分词 + if args.rank == 0: + print("use word-level Mask 
...") + assert args.add_special_word == 1 + self.mask_func = self.wwm_mask_tokens + else: # domain + if args.rank == 0: + print("use token-level Mask ...") + self.mask_func = self.domain_mask_tokens + + def __call__(self, batch): + # 把 batch 中的数值提取出,用specail token 替换 + # 把数值信息,以及数值的位置信息单独通过list传进去 + # 后面训练的阶段直接把数值插入embedding的位置 + # 数值不参与 mask + # wwm的时候可以把chinese ref 随batch一起输入 + kpi_ref = None + if self.args.use_NumEmb: + kpi_ref = [item[2] for item in batch] + # if self.args.mask_stratege != 'rand': + chinese_ref = [item[1] for item in batch] + batch = [item[0] for item in batch] + # 此时batch不止有字符串 + batch = self.tokenizer.batch_encode_plus( + batch, + padding='max_length', + max_length=self.text_maxlength, + truncation=True, + return_tensors="pt", + return_token_type_ids=False, + return_attention_mask=True, + add_special_tokens=False + ) + special_tokens_mask = batch.pop("special_tokens_mask", None) + # self.torch_mask_tokens + + # if batch["input_ids"].shape[1] != 128: + # pdb.set_trace() + if chinese_ref is not None: + batch["chinese_ref"] = chinese_ref + if kpi_ref is not None: + batch["kpi_ref"] = kpi_ref + + # 训练需要 mask + + if not self.args.only_test and self.args.use_mlm_task: + batch["input_ids"], batch["labels"] = self.mask_func( + batch, special_tokens_mask=special_tokens_mask + ) + else: + # 非训练状态 + # 且不用MLM进行训练 + labels = batch["input_ids"].clone() + if self.tokenizer.pad_token_id is not None: + labels[labels == self.tokenizer.pad_token_id] = -100 + batch["labels"] = labels + + return batch + + def torch_mask_tokens(self, inputs, special_tokens_mask=None): + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. + """ + if "input_ids" in inputs: + inputs = inputs["input_ids"] + labels = inputs.clone() + # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`) + probability_matrix = torch.full(labels.shape, self.mlm_probability) + if special_tokens_mask is None: + special_tokens_mask = [ + self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + ] + special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool) + else: + special_tokens_mask = special_tokens_mask.bool() + # pdb.set_trace() + + probability_matrix.masked_fill_(special_tokens_mask, value=0.0) + masked_indices = torch.bernoulli(probability_matrix).bool() + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices + inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) + + # 10% of the time, we replace masked input tokens with random word + indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced + random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) + inputs[indices_random] = random_words[indices_random] + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels + + def wwm_mask_tokens(self, inputs, special_tokens_mask=None): + mask_labels = [] + ref_tokens = inputs["chinese_ref"] + input_ids = inputs["input_ids"] + sz = len(input_ids) + + # 把input id 先恢复到token + for i in range(sz): + # 这里的主体是读入的ref,但是可能存在max_len不统一的情况 + mask_labels.append(self._whole_word_mask(ref_tokens[i])) + + batch_mask = 
_torch_collate_batch(mask_labels, self.tokenizer, self.text_maxlength, pad_to_multiple_of=None) + inputs, labels = self.torch_mask_tokens_4wwm(input_ids, batch_mask) + return inputs, labels + + # input_tokens: List[str] + def _whole_word_mask(self, input_tokens, max_predictions=512): + """ + Get 0/1 labels for masked tokens with whole word mask proxy + """ + assert isinstance(self.tokenizer, (BertTokenizer)) + # 输入是 [..., ..., ..., ...] 格式 + cand_indexes = [] + cand_token = [] + + for i, token in enumerate(input_tokens): + if i >= self.text_maxlength - 1: + # 不能超过最大值,截断一下 + break + if token.lower() in self.special_token: + # special token 的词不应该被mask + continue + if len(cand_indexes) >= 1 and token.startswith("##"): + cand_indexes[-1].append(i) + cand_token.append(i) + else: + cand_indexes.append([i]) + cand_token.append(i) + + random.shuffle(cand_indexes) + # 原来是:input_tokens + # 但是这里的特殊token很多,因此提前去掉了特殊token + # 这里的15%是去掉了特殊token的15%。+2的原因是把CLS SEP两个 flag的长度加上 + num_to_predict = min(max_predictions, max(1, int(round((len(cand_token) + 2) * self.mlm_probability)))) + masked_lms = [] + covered_indexes = set() + for index_set in cand_indexes: + # 到达长度了结束 + if len(masked_lms) >= num_to_predict: + break + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. + # 不能让其长度大于15%,最多等于 + if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + # 不考虑重叠的token进行mask + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + masked_lms.append(index) + + if len(covered_indexes) != len(masked_lms): + # 一般不会出现,因为过程中避免重复了 + raise ValueError("Length of covered_indexes is not equal to length of masked_lms.") + # 不能超过最大值,截断 + mask_labels = [1 if i in covered_indexes else 0 for i in range(min(len(input_tokens), self.text_maxlength))] + + return mask_labels + + # 确定这里面需要mask的:置0/1 + + # 调用 self.torch_mask_tokens + + # + pass + + def torch_mask_tokens_4wwm(self, inputs, mask_labels): + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set + 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref. + """ + # if "input_ids" in inputs: + # inputs = inputs["input_ids"] + if self.tokenizer.mask_token is None: + raise ValueError( + "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the" + " --mlm flag if you want to use this tokenizer." 
+ ) + labels = inputs.clone() + # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) + + probability_matrix = mask_labels + + special_tokens_mask = [self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()] + + if len(special_tokens_mask[0]) != probability_matrix.shape[1]: + print(f"len(special_tokens_mask[0]): {len(special_tokens_mask[0])}") + print(f"probability_matrix.shape[1]): {probability_matrix.shape[1]}") + print(f'max len {self.text_maxlength}') + print(f"pad_token_id: {self.tokenizer.pad_token_id}") + # if self.args.rank != in_rank: + if self.args.dist: + dist.barrier() + pdb.set_trace() + else: + pdb.set_trace() + + probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) + if self.tokenizer._pad_token is not None: + padding_mask = labels.eq(self.tokenizer.pad_token_id) + probability_matrix.masked_fill_(padding_mask, value=0.0) + + masked_indices = probability_matrix.bool() + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + # 这里的wwm,每次 mask/替换/不变的时候单位不是一体的,会拆开 + # 其实不太合理,但是也没办法 + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices + inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) + + # 10% of the time, we replace masked input tokens with random word + indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced + random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) + inputs[indices_random] = random_words[indices_random] + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels + + # TODO: 按区域cell 进行mask + + def domain_mask_tokens(self, inputs, special_tokens_mask=None): + pass + + +class Collator_kg(object): + # TODO: 定义 collator,模仿Lako + # 完成 随机减少一部分属性 + def __init__(self, args, tokenizer, data): + self.tokenizer = tokenizer + self.text_maxlength = args.maxlength + self.cross_sampling_flag = 0 + # ke 的bs 是正常bs的四分之一 + self.neg_num = args.neg_num + # 负样本不能在全集中 + self.data = data + self.args = args + + def __call__(self, batch): + # 先编码成可token形式避免重复编码 + outputs = self.sampling(batch) + + return outputs + + def sampling(self, data): + """Filtering out positive samples and selecting some samples randomly as negative samples. + + Args: + data: The triples used to be sampled. + + Returns: + batch_data: The training data. 
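+
+        Example:
+            Illustrative placeholders only (not real KG entries): with
+            ``data = [["NF-A", "connects", "NF-B"], ["NF-C", "reports", "ALM-1"]]`` and ``neg_num = 1``
+            in tail-batch mode, a valid negative tail for the first triple is ``"ALM-1"`` because
+            ``["NF-A", "connects", "ALM-1"]`` does not occur in ``self.data``. Head-batch and
+            tail-batch modes alternate between calls via ``self.cross_sampling_flag``.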
+ """ + batch_data = {} + neg_ent_sample = [] + + self.cross_sampling_flag = 1 - self.cross_sampling_flag + + head_list = [] + rel_list = [] + tail_list = [] + # pdb.set_trace() + if self.cross_sampling_flag == 0: + batch_data['mode'] = "head-batch" + for index, (head, relation, tail) in enumerate(data): + # in batch negative + neg_head = self.find_neghead(data, index, relation, tail) + neg_ent_sample.extend(random.sample(neg_head, self.neg_num)) + head_list.append(head) + rel_list.append(relation) + tail_list.append(tail) + else: + batch_data['mode'] = "tail-batch" + for index, (head, relation, tail) in enumerate(data): + neg_tail = self.find_negtail(data, index, relation, head) + neg_ent_sample.extend(random.sample(neg_tail, self.neg_num)) + + head_list.append(head) + rel_list.append(relation) + tail_list.append(tail) + + neg_ent_batch = self.batch_tokenizer(neg_ent_sample) + head_batch = self.batch_tokenizer(head_list) + rel_batch = self.batch_tokenizer(rel_list) + tail_batch = self.batch_tokenizer(tail_list) + + ent_list = head_list + rel_list + tail_list + ent_dict = {k: v for v, k in enumerate(ent_list)} + # 用来索引负样本 + neg_index = torch.tensor([ent_dict[i] for i in neg_ent_sample]) + # pos_head_index = torch.tensor(list(range(len(head_list))) + + batch_data["positive_sample"] = (head_batch, rel_batch, tail_batch) + batch_data['negative_sample'] = neg_ent_batch + batch_data['neg_index'] = neg_index + return batch_data + + def batch_tokenizer(self, input_list): + return self.tokenizer.batch_encode_plus( + input_list, + padding='max_length', + max_length=self.text_maxlength, + truncation=True, + return_tensors="pt", + return_token_type_ids=False, + return_attention_mask=True, + add_special_tokens=False + ) + + def find_neghead(self, data, index, rel, ta): + head_list = [] + for i, (head, relation, tail) in enumerate(data): + # 负样本不能被包含 + if i != index and [head, rel, ta] not in self.data: + head_list.append(head) + # 可能存在负样本不够的情况 + # 自补齐 + while len(head_list) < self.neg_num: + head_list.extend(random.sample(head_list, min(self.neg_num - len(head_list), len(head_list)))) + + return head_list + + def find_negtail(self, data, index, rel, he): + tail_list = [] + for i, (head, relation, tail) in enumerate(data): + if i != index and [he, rel, tail] not in self.data: + tail_list.append(tail) + # 可能存在负样本不够的情况 + # 自补齐 + while len(tail_list) < self.neg_num: + tail_list.extend(random.sample(tail_list, min(self.neg_num - len(tail_list), len(tail_list)))) + return tail_list + +# 载入mask loss部分的数据 + + +def load_data(logger, args): + + data_path = args.data_path + + data_name = args.seq_data_name + with open(osp.join(data_path, f'{data_name}_cws.json'), "r") as fp: + data = json.load(fp) + if args.rank == 0: + logger.info(f"[Start] Loading Seq dataset: [{len(data)}]...") + random.shuffle(data) + + # data = data[:10000] + # pdb.set_trace() + train_test_split = int(args.train_ratio * len(data)) + # random.shuffle(x) + # 训练/测试期间不应该打乱 + train_data = data[0: train_test_split] + test_data = data[train_test_split: len(data)] + + # 测试的时候也可能用到其实 not args.only_test + if args.use_mlm_task: + # if args.mask_stratege != 'rand': + # 读领域词汇 + if args.rank == 0: + print("using the domain words .....") + domain_file_path = osp.join(args.data_path, f'{data_name}_chinese_ref.json') + with open(domain_file_path, 'r') as f: + chinese_ref = json.load(f) + # train_test_split=len(data) + chi_ref_train = chinese_ref[:train_test_split] + chi_ref_eval = chinese_ref[train_test_split:] + else: + chi_ref_train = None + chi_ref_eval 
= None + + if args.use_NumEmb: + if args.rank == 0: + print("using the kpi and num .....") + + kpi_file_path = osp.join(args.data_path, f'{data_name}_kpi_ref.json') + with open(kpi_file_path, 'r') as f: + kpi_ref = json.load(f) + kpi_ref_train = kpi_ref[:train_test_split] + kpi_ref_eval = kpi_ref[train_test_split:] + else: + # num_ref_train = None + # num_ref_eval = None + kpi_ref_train = None + kpi_ref_eval = None + + # pdb.set_trace() + test_set = None + train_set = SeqDataset(train_data, chi_ref=chi_ref_train, kpi_ref=kpi_ref_train) + if len(test_data) > 0: + test_set = SeqDataset(test_data, chi_ref=chi_ref_eval, kpi_ref=kpi_ref_eval) + if args.rank == 0: + logger.info("[End] Loading Seq dataset...") + return train_set, test_set, train_test_split + +# 载入triple loss部分的数据 + + +def load_data_kg(logger, args): + data_path = args.data_path + if args.rank == 0: + logger.info("[Start] Loading KG dataset...") + # # 三元组 + # with open(osp.join(data_path, '5GC_KB/database_triples_831.json'), "r") as f: + # data = json.load(f) + # random.shuffle(data) + + # # # TODO: triple loss这一块还没有测试集 + # train_data = data[0:int(len(data)/args.batch_size)*args.batch_size] + + # with open(osp.join(data_path, 'KG_data_tiny_831.json'),"w") as fp: + # json.dump(data[:1000], fp) + kg_data_name = args.kg_data_name + with open(osp.join(data_path, f'{kg_data_name}.json'), "r") as fp: + train_data = json.load(fp) + # pdb.set_trace() + # 124169 + # 128482 + # train_data = train_data[:124168] + # train_data = train_data[:1000] + train_set = KGDataset(train_data) + if args.rank == 0: + logger.info("[End] Loading KG dataset...") + return train_set, train_data + + +def _torch_collate_batch(examples, tokenizer, max_length=None, pad_to_multiple_of=None): + """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary.""" + import numpy as np + import torch + + # Tensorize if necessary. + if isinstance(examples[0], (list, tuple, np.ndarray)): + examples = [torch.tensor(e, dtype=torch.long) for e in examples] + + length_of_first = examples[0].size(0) + + # Check if padding is necessary. + + # are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) + # if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0): + # return torch.stack(examples, dim=0) + + # If yes, check if we have a `pad_token`. + if tokenizer._pad_token is None: + raise ValueError( + "You are attempting to pad samples but the tokenizer you are using" + f" ({tokenizer.__class__.__name__}) does not have a pad token." + ) + + # Creating the full tensor and filling it with our data. 
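+    # Added illustration: with max_length=8, mask-label lists of lengths 5 and 3 become a (2, 8) long
+    # tensor pre-filled with tokenizer.pad_token_id; each example is written at the start of its row when
+    # tokenizer.padding_side == "right", otherwise at the end.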
+ + if max_length is None: + pdb.set_trace() + max_length = max(x.size(0) for x in examples) + + if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id) + for i, example in enumerate(examples): + if tokenizer.padding_side == "right": + result[i, : example.shape[0]] = example + else: + result[i, -example.shape[0]:] = example + + return result + + +def load_order_data(logger, args): + if args.rank == 0: + logger.info("[Start] Loading Order dataset...") + + data_path = args.data_path + if len(args.order_test_name) > 0: + data_name = args.order_test_name + else: + data_name = args.order_data_name + tmp = osp.join(data_path, f'{data_name}.json') + if osp.exists(tmp): + dp = tmp + else: + dp = osp.join(data_path, 'downstream_task', f'{data_name}.json') + assert osp.exists(dp) + with open(dp, "r") as fp: + data = json.load(fp) + # data = data[:2000] + # pdb.set_trace() + train_test_split = int(args.train_ratio * len(data)) + + mid_split = int(train_test_split / 2) + mid = int(len(data) / 2) + # random.shuffle(x) + # 训练/测试期间不应该打乱 + # train_data = data[0: train_test_split] + # test_data = data[train_test_split: len(data)] + + # test_data = data[0: train_test_split] + # train_data = data[train_test_split: len(data)] + + # 特殊分类 默认前一半和后一半对称 + test_data = data[0: mid_split] + data[mid: mid + mid_split] + train_data = data[mid_split: mid] + data[mid + mid_split: len(data)] + + # pdb.set_trace() + test_set = None + train_set = OrderDataset(train_data) + if len(test_data) > 0: + test_set = OrderDataset(test_data) + if args.rank == 0: + logger.info("[End] Loading Order dataset...") + return train_set, test_set, train_test_split + + +class Collator_order(object): + # 输入一个batch的数据,合并order后面再解耦 + def __init__(self, args, tokenizer): + self.tokenizer = tokenizer + self.text_maxlength = args.maxlength + self.args = args + # 每一个pair中包含的数据数量 + self.order_num = args.order_num + self.p_label, self.n_label = smooth_BCE(args.eps) + + def __call__(self, batch): + # 输入数据按顺序堆叠, 间隔拆分 + # + # 编码然后输出 + output = [] + for item in range(self.order_num): + output.extend([dat[0][0][item] for dat in batch]) + # label smoothing + + labels = [1 if dat[0][1][0] == 2 else self.p_label if dat[0][1][0] == 1 else self.n_label for dat in batch] + batch = self.tokenizer.batch_encode_plus( + output, + padding='max_length', + max_length=self.text_maxlength, + truncation=True, + return_tensors="pt", + return_token_type_ids=False, + return_attention_mask=True, + add_special_tokens=False + ) + # torch.tensor() + return batch, torch.FloatTensor(labels) + + +def smooth_BCE(eps=0.1): # eps 平滑系数 [0, 1] => [0.95, 0.05] + # return positive, negative label smoothing BCE targets + # positive label= y_true * (1.0 - label_smoothing) + 0.5 * label_smoothing + # y_true=1 label_smoothing=eps=0.1 + return 1.0 - 0.5 * eps, 0.5 * eps diff --git a/KTeleBERT/src/distributed_utils.py b/KTeleBERT/src/distributed_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..aa25d4732f7bbe299301ea429d85fc7fbcb2dcad --- /dev/null +++ b/KTeleBERT/src/distributed_utils.py @@ -0,0 +1,79 @@ +import os + +import torch +import torch.distributed as dist +import pdb + + +def dist_pdb(rank, in_rank=0): + if rank != in_rank: + dist.barrier() + else: + pdb.set_trace() + dist.barrier() + + +def init_distributed_mode(args): + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: 
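+ # RANK / WORLD_SIZE / LOCAL_RANK are exported automatically by torchrun (or torch.distributed.launch)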
+ args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.gpu = int(os.environ['LOCAL_RANK']) + elif 'SLURM_PROCID' in os.environ: + args.rank = int(os.environ['SLURM_PROCID']) + args.gpu = args.rank % torch.cuda.device_count() + else: + print('Not using distributed mode') + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' # 通信后端,nvidia GPU推荐使用NCCL + print('| distributed init (rank {}): {}'.format( + args.rank, args.dist_url), flush=True) + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + dist.barrier() + + +def cleanup(): + dist.destroy_process_group() + + +def is_dist_avail_and_initialized(): + """检查是否支持分布式环境""" + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def reduce_value(value, average=True): + world_size = get_world_size() + if world_size < 2: # 单GPU的情况 + return value + + with torch.no_grad(): + dist.all_reduce(value) + if average: + value /= world_size + + return value diff --git a/KTeleBERT/src/utils.py b/KTeleBERT/src/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9b7d9a95d425a0901b087e71650bead4e0b073b0 --- /dev/null +++ b/KTeleBERT/src/utils.py @@ -0,0 +1,374 @@ + +import os +import errno +import torch +import sys +import logging +import json +from pathlib import Path +import torch.distributed as dist +import csv +import os.path as osp +from time import time +from numpy import mean +import re +from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup +import pdb +from torch import nn + + + +# Huggingface的实现中,自带多种warmup策略 +def set_optim(opt, model_list, freeze_part=[], accumulation_step=None): + # Bert optim + optimizer_list, scheduler_list, named_parameters = [], [], [] + # cur_model = model.module if hasattr(model, 'module') else model + for model in model_list: + model_para = list(model.named_parameters()) + model_para_train, freeze_layer = [], [] + for n, p in model_para: + if not any(nd in n for nd in freeze_part): + model_para_train.append((n, p)) + else: + p.requires_grad = False + freeze_layer.append((n, p)) + named_parameters.extend(model_para_train) + + # for name, param in model_list[0].named_parameters(): + # if not param.requires_grad: + # print(name, param.size()) + + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + # numeric_model 也包括到这个部分中 + ke_part = ['ke_model', 'loss_awl', 'numeric_model', 'order'] + if opt.LLRD: + # 按层次衰减的学习率 + all_name_orig = [n for n, p in named_parameters if not any(nd in n for nd in ke_part)] + + opt_parameters, all_name = LLRD(opt, named_parameters, no_decay, ke_part) + remain = list(set(all_name_orig) - set(all_name)) + remain_parameters = [ + {'params': [p for n, p in named_parameters if not any(nd in n for nd in no_decay) and n in remain], "lr": opt.lr, 'weight_decay': opt.weight_decay}, + {'params': [p for n, p in named_parameters if any(nd in n for nd in no_decay) and n in remain], "lr": opt.lr, 'weight_decay': 0.0} + ] + opt_parameters.extend(remain_parameters) + else: + opt_parameters = [ + {'params': [p for n, p in named_parameters if not 
any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)], "lr": opt.lr, 'weight_decay': opt.weight_decay}, + {'params': [p for n, p in named_parameters if any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)], "lr": opt.lr, 'weight_decay': 0.0} + ] + + ke_parameters = [ + {'params': [p for n, p in named_parameters if not any(nd in n for nd in no_decay) and any(nd in n for nd in ke_part)], "lr": opt.ke_lr, 'weight_decay': opt.weight_decay}, + {'params': [p for n, p in named_parameters if any(nd in n for nd in no_decay) and any(nd in n for nd in ke_part)], "lr": opt.ke_lr, 'weight_decay': 0.0} + ] + opt_parameters.extend(ke_parameters) + optimizer = AdamW(opt_parameters, lr=opt.lr, eps=opt.adam_epsilon) + if accumulation_step is None: + accumulation_step = opt.accumulation_steps + if opt.scheduler == 'linear': + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(opt.warmup_steps/accumulation_step), num_training_steps=int(opt.total_steps/accumulation_step)) + else: + scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(opt.warmup_steps/accumulation_step), num_training_steps=int(opt.total_steps/accumulation_step)) + + # ---- 判定所有参数是否被全部优化 ---- + all_para_num = 0 + for paras in opt_parameters: + all_para_num += len(paras['params']) + # pdb.set_trace() + assert len(named_parameters) == all_para_num + return optimizer, scheduler + +# LLRD 学习率逐层衰减但 + +def LLRD(opt, named_parameters, no_decay, ke_part =[]): + opt_parameters = [] + all_name = [] + head_lr = opt.lr * 1.05 + init_lr = opt.lr + lr = init_lr + + # === Pooler and regressor ====================================================== + params_0 = [p for n,p in named_parameters if ("pooler" in n or "regressor" in n or "predictions" in n) + and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + params_1 = [p for n,p in named_parameters if ("pooler" in n or "regressor" in n or "predictions" in n) + and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + + name_0 = [n for n,p in named_parameters if ("pooler" in n or "regressor" in n or "predictions" in n) + and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + name_1 = [n for n,p in named_parameters if ("pooler" in n or "regressor" in n or "predictions" in n) + and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + + all_name.extend(name_0) + all_name.extend(name_1) + + head_params = {"params": params_0, "lr": head_lr, "weight_decay": 0.0} + opt_parameters.append(head_params) + + head_params = {"params": params_1, "lr": head_lr, "weight_decay": 0.01} + opt_parameters.append(head_params) + + # === 12 Hidden layers ========================================================== + for layer in range(11,-1,-1): + params_0 = [p for n,p in named_parameters if f"encoder.layer.{layer}." in n + and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + params_1 = [p for n,p in named_parameters if f"encoder.layer.{layer}." in n + and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + + layer_params = {"params": params_0, "lr": lr, "weight_decay": 0.0} + opt_parameters.append(layer_params) + + layer_params = {"params": params_1, "lr": lr, "weight_decay": 0.01} + opt_parameters.append(layer_params) + + name_0 = [n for n,p in named_parameters if f"encoder.layer.{layer}." 
in n + and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + name_1 = [n for n,p in named_parameters if f"encoder.layer.{layer}." in n + and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + all_name.extend(name_0) + all_name.extend(name_1) + + lr *= 0.95 + # === Embeddings layer ========================================================== + + params_0 = [p for n,p in named_parameters if ("embeddings" in n ) + and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + params_1 = [p for n,p in named_parameters if ("embeddings" in n ) + and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + + embed_params = {"params": params_0, "lr": lr, "weight_decay": 0.0} + opt_parameters.append(embed_params) + + embed_params = {"params": params_1, "lr": lr, "weight_decay": 0.01} + opt_parameters.append(embed_params) + + name_0 = [n for n,p in named_parameters if ("embeddings" in n ) + and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + name_1 = [n for n,p in named_parameters if ("embeddings" in n ) + and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + all_name.extend(name_0) + all_name.extend(name_1) + return opt_parameters, all_name + +class FixedScheduler(torch.optim.lr_scheduler.LambdaLR): + def __init__(self, optimizer, last_epoch=-1): + super(FixedScheduler, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) + + def lr_lambda(self, step): + return 1.0 + + +class WarmupLinearScheduler(torch.optim.lr_scheduler.LambdaLR): + def __init__(self, optimizer, warmup_steps, scheduler_steps, min_ratio, last_epoch=-1): + self.warmup_steps = warmup_steps + self.scheduler_steps = scheduler_steps + self.min_ratio = min_ratio + # self.fixed_lr = fixed_lr + super(WarmupLinearScheduler, self).__init__( + optimizer, self.lr_lambda, last_epoch=last_epoch + ) + + def lr_lambda(self, step): + if step < self.warmup_steps: + return (1 - self.min_ratio) * step / float(max(1, self.warmup_steps)) + self.min_ratio + + # if self.fixed_lr: + # return 1.0 + + return max(0.0, + 1.0 + (self.min_ratio - 1) * (step - self.warmup_steps) / float(max(1.0, self.scheduler_steps - self.warmup_steps)), + ) + + +class Loss_log(): + def __init__(self): + self.loss = [] + self.acc = [0.] + self.flag = 0 + self.token_right_num = [] + self.token_all_num = [] + self.word_right_num = [] + self.word_all_num = [] + # 默认不使用top_k acc + self.use_top_k_acc = 0 + + def acc_init(self, topn=[1]): + self.loss = [] + self.token_right_num = [] + self.token_all_num = [] + self.topn = topn + self.use_top_k_acc = 1 + self.top_k_word_right = {} + for n in topn: + self.top_k_word_right[n] = [] + + def time_init(self): + self.start = time() + self.last = self.start + self.time_used_epoch = [] + + def time_cpt(self, step, total_step): + # 时间统计 + time_used_last_epoch = time() - self.last + self.time_used_epoch.append(time_used_last_epoch) + time_used = time() - self.start + self.last = time() + h, m, s = time_trans(time_used) + time_remain = int(total_step - step) * mean(self.time_used_epoch) + h_r, m_r, s_r = time_trans(time_remain) + + return h, m, s, h_r, m_r, s_r + + def get_token_acc(self): + # 返回list + if len(self.token_all_num) == 0: + return 0. 
+ elif self.use_top_k_acc == 1: + res = [] + for n in self.topn: + res.append(round((sum(self.top_k_word_right[n]) / sum(self.token_all_num)) * 100 , 3)) + return res + else: + return [sum(self.token_right_num)/sum(self.token_all_num)] + + + def update_token(self, token_num, token_right): + # 输入是list文件 + self.token_all_num.append(token_num) + if isinstance(token_right, list): + for i, n in enumerate(self.topn): + self.top_k_word_right[n].append(token_right[i]) + self.token_right_num.append(token_right) + + def update(self, case): + self.loss.append(case) + + def update_acc(self, case): + self.acc.append(case) + + def get_loss(self): + if len(self.loss) == 0: + return 500. + return mean(self.loss) + + def get_acc(self): + return self.acc[-1] + + def get_min_loss(self): + return min(self.loss) + + def early_stop(self): + # min_loss = min(self.loss) + if self.loss[-1] > min(self.loss): + self.flag += 1 + else: + self.flag = 0 + + if self.flag > 1000: + return True + else: + return False + + +def add_special_token(tokenizer, model=None, rank=0, cache_path = None): + # model: bert layer + # 每次更新这个,所有模型需要重新训练,get_chinese_ref.py需要重新运行 + # 主函数调用该函数的位置需要在载入模型之前 + # --------------------------------------- + # 不会被mask的 token, 不参与 任何时候的MASK + special_token = ['[SEP]', '[MASK]', '[ALM]', '[KPI]', '[CLS]', '[LOC]', '[EOS]', '[ENT]', '[ATTR]', '[NUM]', '[REL]', '|', '[DOC]'] + + # --------------------------------------- + # 会被mask的但是---#不加入#---tokenizer的内容 + # 出现次数多(>10000)但是长度较长(>=4符) + # 或者是一些难以理解的名词 + # WWM 的主体 + # TODO: 专家检查 + # To Add: 'SGSN', '3GPP', 'Bearer', 'sbim', 'FusionSphere', 'IMSI', 'GGSN', 'RETCODE', 'PCRF', 'PDP', 'GTP', 'OCS', 'HLR', 'FFFF', 'VLR', 'DNN', 'PID', 'CSCF', 'PDN', 'SCTP', 'SPGW', 'TAU', 'PCEF', 'NSA', 'ACL', 'BGP', 'USCDB', 'VoLTE', 'RNC', 'GPRS', 'DRA', 'MOC' + # 拆分:配置原则,本端规划 + norm_token = ['网元实例', '事件类型', '告警级别', '告警名称', '告警源', '通讯系统', '默认值', '链路故障', '取值范围', '可选必选说明', '数据来源', '用户平面', '配置', '原则', '该参数', '失败次数', '可选参数', 'S1模式', '必选参数', 'IP地址', '响应消息', '成功次数', '测量指标', '用于', '统计周期', '该命令', '上下文', '请求次数', '本端', 'pod', 'amf', 'smf', 'nrf', 'ausf', 'upcf', 'upf', 'udm', 'PDU', 'alias', 'PLMN', 'MML', 'Info_Measure', 'icase', 'Diameter', 'MSISDN', 'RAT', 'RMV', 'PFCP', 'NSSAI', 'CCR', 'HDBNJjs', 'HNGZgd', 'SGSN', '3GPP', 'Bearer', 'sbim', 'FusionSphere', 'IMSI', 'GGSN', 'RETCODE', 'PCRF', 'PDP', 'GTP', 'OCS', 'HLR', 'FFFF', 'VLR', 'DNN', 'PID', 'CSCF', 'PDN', 'SCTP', 'SPGW', 'TAU', 'PCEF', 'NSA', 'ACL', 'BGP', 'USCDB', 'VoLTE', 'RNC', 'GPRS', 'DRA', 'MOC', '告警', '网元', '对端', '信令', '话单', '操作', '风险', '等级', '下发', '流控', '运营商', '寻呼', '漫游', '切片', '报文', '号段', '承载', '批量', '导致', '原因是', '影响', '造成', '引起', '随之', '情况下', '根因', 'trigger'] + # --------------------------------------- + # , '', '', '', '', '', '', '', '', '', '', '' + # 会被mask的但是---#加入#---tokenizer的内容 + # 长度小于等于3,缩写/专有名词 大于10000次 + # 严谨性要求大于norm_token + # 出现次数多时有足够的影响力可以进行分离 + norm_token_tobe_added = ['pod', 'amf', 'smf', 'nrf', 'ausf', 'upcf', 'upf', 'udm', 'ALM', '告警', '网元', '对端', '信令', '话单', 'RAN', 'MML', 'PGW', 'MME', 'SGW', 'NF', 'APN', 'LST', 'GW', 'QoS', 'IPv', 'PDU', 'IMS', 'EPS', 'GTP', 'PDP', 'LTE', 'HSS'] + + token_tobe_added = [] + # all_token = special_token + norm_token_tobe_added + all_token = norm_token_tobe_added + for i in all_token: + if i not in tokenizer.vocab.keys() and i.lower() not in tokenizer.vocab.keys(): + token_tobe_added.append(i) + + # tokenizer.add_tokens(special_token, special_tokens=False) + # tokenizer.add_tokens(norm_token, special_tokens=False) + tokenizer.add_tokens(token_tobe_added, 
special_tokens=False) + special_tokens_dict = {"additional_special_tokens": special_token} + special_token_ = tokenizer.add_special_tokens(special_tokens_dict) + if rank == 0: + print("Added tokens:") + print(tokenizer.get_added_vocab()) + + # pdb.set_trace() + + if model is not None: + # TODO: 用预训练好的TeleBert进行这部分embedding(所有添加的embedding)的初始化 + if rank == 0: + print(f"--------------------------------") + print(f"-------- orig word embedding shape: {model.get_input_embeddings().weight.shape}") + sz = model.resize_token_embeddings(len(tokenizer)) + if cache_path is not None: + # model.cpu() + token_2_emb = torch.load(cache_path) + # 在这里加入embedding 初始化之后需要tie一下 + token_dic = tokenizer.get_added_vocab() + id_2_token = {v:k for k,v in token_dic.items()} + with torch.no_grad(): + for key in id_2_token.keys(): + model.bert.embeddings.word_embeddings.weight[key,:] = nn.Parameter(token_2_emb[id_2_token[key]][0]).cuda() + # model.get_input_embeddings().weight[key,:] = nn.Parameter(token_2_emb[id_2_token[key]][0]).cuda() + # model.embedding + model.bert.tie_weights() + if rank == 0: + print(f"-------- resize_token_embeddings into {sz} done!") + print(f"--------------------------------") + # 这里替换embedding + + norm_token = list(set(norm_token).union(set(norm_token_tobe_added))) + return tokenizer, special_token, norm_token + + +def time_trans(sec): + m, s = divmod(sec, 60) + h, m = divmod(m, 60) + return int(h), int(m), int(s) + +def torch_accuracy(output, target, topk=(1,)): + ''' + param output, target: should be torch Variable + ''' + # assert isinstance(output, torch.cuda.Tensor), 'expecting Torch Tensor' + # assert isinstance(target, torch.Tensor), 'expecting Torch Tensor' + # print(type(output)) + + topn = max(topk) + batch_size = output.size(0) + + _, pred = output.topk(topn, 1, True, True) # 返回(values,indices)其中indices就是预测类别的值,0为第一类 + pred = pred.t() # torch.t()转置,既可得到每一行为batch最好的一个预测序列 + + is_correct = pred.eq(target.view(1, -1).expand_as(pred)) + + ans = [] + ans_num = [] + for i in topk: + # is_correct_i = is_correct[:i].view(-1).float().sum(0, keepdim=True) + is_correct_i = is_correct[:i].contiguous().view(-1).float().sum(0, keepdim=True) + ans_num.append(int(is_correct_i.item())) + ans.append(is_correct_i.mul_(100.0 / batch_size)) + + return ans, ans_num + + \ No newline at end of file diff --git a/KTeleBERT/test.sh b/KTeleBERT/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..c05a101359782a5074b996d3db47f8a9091accf8 --- /dev/null +++ b/KTeleBERT/test.sh @@ -0,0 +1,12 @@ +python main.py --only_test 1 \ + --batch_size 150 \ + --use_NumEmb 1 \ + --mask_test 0 \ + --mask_stratege wwm \ + --model_name model_name_vXX \ + --ke_test 0 \ + --embed_gen 1 \ + --train_ratio 0 \ + --ke_dim 256 \ + --plm_emb_type cls \ + diff --git a/KTeleBERT/torchlight/__init__.py b/KTeleBERT/torchlight/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8570f8f816a1ff556b5c52401d6527eb0a3e39ed --- /dev/null +++ b/KTeleBERT/torchlight/__init__.py @@ -0,0 +1,20 @@ +from .logger import initialize_exp, get_dump_path +from .metric import Metric, Top_K_Metric +from .module import LSTM4VarLenSeq +from .vocab import (PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN, + DefaultLookupDict, + Vocabulary) +from .utils import (invert_dict, + personal_display_settings, + set_seed, + normalize, + snapshot, + show_params, + longest_substring, + pad, + to_cuda, + get_code_version, + cat_ragged_tensors, + topk_accuracy, + get_total_trainable_params) + diff --git 
a/KTeleBERT/torchlight/__pycache__/__init__.cpython-38.pyc b/KTeleBERT/torchlight/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c406e0323ffe22855fe7c3fb8caf9f63eccd6228 Binary files /dev/null and b/KTeleBERT/torchlight/__pycache__/__init__.cpython-38.pyc differ diff --git a/KTeleBERT/torchlight/__pycache__/logger.cpython-38.pyc b/KTeleBERT/torchlight/__pycache__/logger.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be2fbae4af89bfdc256820c009cf5d93a86cc994 Binary files /dev/null and b/KTeleBERT/torchlight/__pycache__/logger.cpython-38.pyc differ diff --git a/KTeleBERT/torchlight/__pycache__/metric.cpython-38.pyc b/KTeleBERT/torchlight/__pycache__/metric.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..44c3660fe6af93a78993bc165fd1b4a5e890a090 Binary files /dev/null and b/KTeleBERT/torchlight/__pycache__/metric.cpython-38.pyc differ diff --git a/KTeleBERT/torchlight/__pycache__/module.cpython-38.pyc b/KTeleBERT/torchlight/__pycache__/module.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae88a49ca3ec06ad543ccf6fee1a9dffbfddb06a Binary files /dev/null and b/KTeleBERT/torchlight/__pycache__/module.cpython-38.pyc differ diff --git a/KTeleBERT/torchlight/__pycache__/utils.cpython-38.pyc b/KTeleBERT/torchlight/__pycache__/utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c02137cd62e2cbab15b8248fdf9a7842fc7beeb Binary files /dev/null and b/KTeleBERT/torchlight/__pycache__/utils.cpython-38.pyc differ diff --git a/KTeleBERT/torchlight/__pycache__/vocab.cpython-38.pyc b/KTeleBERT/torchlight/__pycache__/vocab.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f445638c7a8e34bef9e2731ff7e58ae0ff4c884 Binary files /dev/null and b/KTeleBERT/torchlight/__pycache__/vocab.cpython-38.pyc differ diff --git a/KTeleBERT/torchlight/logger.py b/KTeleBERT/torchlight/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..30d6a96d65ca455bdeb8c66f95a6859af427514a --- /dev/null +++ b/KTeleBERT/torchlight/logger.py @@ -0,0 +1,147 @@ +import os +import re +import sys +import time +import json +import torch +import pickle +import random +import getpass +import logging +import argparse +import subprocess +import numpy as np +from datetime import timedelta, date +from .utils import get_code_version + + +class LogFormatter(): + + def __init__(self): + self.start_time = time.time() + + def format(self, record): + elapsed_seconds = round(record.created - self.start_time) + + prefix = "%s - %s - %s" % ( + record.levelname, + time.strftime('%x %X'), + timedelta(seconds=elapsed_seconds) + ) + message = record.getMessage() + message = message.replace('\n', '\n' + ' ' * (len(prefix) + 3)) + return "%s - %s" % (prefix, message) if message else '' + + +def create_logger(filepath, rank): + """ + Create a logger. + Use a different log file for each process. 
+ """ + # create log formatter + log_formatter = LogFormatter() + + # create file handler and set level to debug + if filepath is not None: + if rank > 0: + filepath = '%s-%i' % (filepath, rank) + file_handler = logging.FileHandler(filepath, "a", encoding='utf-8') + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(log_formatter) + + # create console handler and set level to info + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + console_handler.setFormatter(log_formatter) + + # create logger and set level to debug + logger = logging.getLogger() + logger.handlers = [] + logger.setLevel(logging.DEBUG) + logger.propagate = False + if filepath is not None: + logger.addHandler(file_handler) + logger.addHandler(console_handler) + + # reset logger elapsed time + def reset_time(): + log_formatter.start_time = time.time() + logger.reset_time = reset_time + + return logger + + +def initialize_exp(params): + """ + Initialize the experiment: + - dump parameters + - create a logger + """ + # dump parameters + exp_folder = get_dump_path(params) + json.dump(vars(params), open(os.path.join(exp_folder, 'params.pkl'), 'w'), indent=4) + + # get running command + command = ["python", sys.argv[0]] + for x in sys.argv[1:]: + if x.startswith('--'): + assert '"' not in x and "'" not in x + command.append(x) + else: + assert "'" not in x + if re.match('^[a-zA-Z0-9_]+$', x): + command.append("%s" % x) + else: + command.append("'%s'" % x) + command = ' '.join(command) + params.command = command + ' --exp_id "%s"' % params.exp_id + + # check experiment name + assert len(params.exp_name.strip()) > 0 + + # create a logger + logger = create_logger(os.path.join(exp_folder, 'train.log'), rank=getattr(params, 'global_rank', 0)) + logger.info("============ Initialized logger ============") + # logger.info("\n".join("%s: %s" % (k, str(v)) + # for k, v in sorted(dict(vars(params)).items()))) + # text = f'# Git Version: {get_code_version()} #' + # logger.info("\n".join(['=' * 24, text, '=' * 24])) + logger.info("The experiment will be stored in %s\n" % exp_folder) + logger.info("Running command: %s" % command) + logger.info("") + return logger + + +def get_dump_path(params): + """ + Create a directory to store the experiment. + """ + assert len(params.exp_name) > 0 + assert not params.dump_path in ('', None), \ + 'Please choose your favorite destination for dump.' + dump_path = params.dump_path + + # create the sweep path if it does not exist + when = date.today().strftime('%m%d-') + sweep_path = os.path.join(dump_path, when + params.exp_name) + if not os.path.exists(sweep_path): + subprocess.Popen("mkdir -p %s" % sweep_path, shell=True).wait() + + # create an random ID for the job if it is not given in the parameters. 
+ if params.exp_id == '': + chars = 'abcdefghijklmnopqrstuvwxyz0123456789' + while True: + exp_id = ''.join(random.choice(chars) for _ in range(10)) + if not os.path.isdir(os.path.join(sweep_path, exp_id)): + break + params.exp_id = exp_id + + # create the dump folder / update parameters + exp_folder = os.path.join(sweep_path, params.exp_id) + if not os.path.isdir(exp_folder): + subprocess.Popen("mkdir -p %s" % exp_folder, shell=True).wait() + return exp_folder + + +if __name__ == '__main__': + pass diff --git a/KTeleBERT/torchlight/metric.py b/KTeleBERT/torchlight/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..68cbfd798502c6f30e44c9ce10ddc2c8e0942b8a --- /dev/null +++ b/KTeleBERT/torchlight/metric.py @@ -0,0 +1,121 @@ +# from abc import ABC, ABCMeta, abstractclassmethod +import torch +import numpy as np +from abc import ABC, abstractmethod, ABCMeta + +class Metric(metaclass=ABCMeta): + """ + - reset() in the begining of every epoch. + - update_per_batch() after every batch. + - update_per_epoch() after every epoch. + """ + + @abstractmethod + def __init__(self): + pass + + @abstractmethod + def reset(self): + pass + + @abstractmethod + def update_per_batch(self, output): + pass + + @abstractmethod + def update_per_epoch(self): + pass + +class Top_K_Metric(Metric): + """ + Stores accuracy (score), loss and timing info + """ + def __init__(self, topnum=[1,3,10]): + super().__init__() + # assert len(topnum) == 3 + self.topnum = topnum + self.k_num = len(self.topnum) + self.reset() + + def reset(self): + self.total_loss = 0 + self.correct_list = [0] * self.k_num + self.acc_list = [0] * self.k_num + self.acc_all = 0 + self.num_examples = 0 + self.num_epoch = 0 + + self.mrr = 0 + self.mr = 0 + self.mrr_all = 0 + self.mr_all = 0 + + def update_per_batch(self, loss, ans, pred): + self.total_loss += loss + self.num_epoch += 1 + self.top_k_list = self.batch_accuracy(pred, ans) + self.num_examples += self.top_k_list[0].shape[0] + for i in range(self.k_num): + self.correct_list[i] += self.top_k_list[i].sum().item() + + # mrr + mrr_tmp, mr_tmp = self.batch_mr_mrr(pred, ans) + self.mrr_all += mrr_tmp.sum().item() + self.mr_all += mr_tmp.sum().item() + + + + def update_per_epoch(self): + for i in range(self.k_num): + self.acc_list[i] = 100 * (self.correct_list[i] / self.num_examples) + + self.mr = self.mr_all / self.num_examples + self.mrr = self.mrr_all / self.num_examples + self.total_loss = self.total_loss / self.num_epoch + self.acc_all = sum(self.acc_list) + + + def batch_accuracy(self, predicted, true): + """ Compute the accuracies for a batch of predictions and answers """ + if len(true.shape) == 3: + true = true[0] + _, ok = predicted.topk(max(self.topnum), dim=1) + agreeing_all = torch.zeros([predicted.shape[0], 1], dtype=torch.float).cuda() + top_k_list = [0]*self.topnum + for i in range(max(self.topnum)): + tmp = ok[:, i].reshape(-1, 1) + agreeing_all += true.gather(dim=1, index=tmp) + for k in range(self.k_num): + if i == self.topnum[k] - 1: + top_k_list[k] = (agreeing_all * 0.3).clamp(max=1) + break + + return top_k_list + + + + def batch_mr_mrr(self, predicted, true): + if len(true.shape) == 3: + true = true[0] + + # 计算 + top_rank = predicted.shape[1] + batch_size = predicted.shape[0] + _, predict_ans_rank = predicted.topk(top_rank, dim=1) # 答案排名的坐标 batchsize * 500 + _, real_ans = true.topk(1, dim=1) # 真正的答案:batchsize * 1 + + # 扩充维度 + real_ans = real_ans.expand(batch_size, top_rank) + ans_different = torch.abs(predict_ans_rank - real_ans) + # 
此时为0的位置就是预测正确的位置 + _, real_ans_list = ans_different.topk(top_rank, dim=1) #此时最后一位的数值就是正确答案在预测答案里面的位置,为 0 + real_ans_list = real_ans_list + 1.0 + mr = real_ans_list[:,-1].reshape(-1,1).to(torch.float64) + mrr = 1.0 / mr + # pdb.set_trace() + + return mrr,mr + + +if __name__ == '__main__': + pass \ No newline at end of file diff --git a/KTeleBERT/torchlight/module.py b/KTeleBERT/torchlight/module.py new file mode 100644 index 0000000000000000000000000000000000000000..2e055a9e5bb66951d2f6fa0ecfac5a6705a49ae3 --- /dev/null +++ b/KTeleBERT/torchlight/module.py @@ -0,0 +1,133 @@ +import math +from typing import Sequence, Union, Callable +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence + +torch.manual_seed(10086) +# typing, everything in Python is Object. +tensor_activation = Callable[[torch.Tensor], torch.Tensor] + + +class LSTM4VarLenSeq(nn.Module): + def __init__(self, input_size, hidden_size, + num_layers=1, bias=True, bidirectional=False, init='orthogonal', take_last=True): + """ + no dropout support + batch_first support deprecated, the input and output tensors are + provided as (batch, seq_len, feature). + + Args: + input_size: + hidden_size: + num_layers: + bias: + bidirectional: + init: ways to init the torch.nn.LSTM parameters, + supports 'orthogonal' and 'uniform' + take_last: 'True' if you only want the final hidden state + otherwise 'False' + """ + super(LSTM4VarLenSeq, self).__init__() + self.lstm = nn.LSTM(input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + bias=bias, + bidirectional=bidirectional) + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.bidirectional = bidirectional + self.init = init + self.take_last = take_last + self.batch_first = True # Please don't modify this + + self.init_parameters() + + def init_parameters(self): + """orthogonal init yields generally good results than uniform init""" + if self.init == 'orthogonal': + gain = 1 # use default value + for nth in range(self.num_layers * self.bidirectional): + # w_ih, (4 * hidden_size x input_size) + nn.init.orthogonal_(self.lstm.all_weights[nth][0], gain=gain) + # w_hh, (4 * hidden_size x hidden_size) + nn.init.orthogonal_(self.lstm.all_weights[nth][1], gain=gain) + # b_ih, (4 * hidden_size) + nn.init.zeros_(self.lstm.all_weights[nth][2]) + # b_hh, (4 * hidden_size) + nn.init.zeros_(self.lstm.all_weights[nth][3]) + elif self.init == 'uniform': + k = math.sqrt(1 / self.hidden_size) + for nth in range(self.num_layers * self.bidirectional): + nn.init.uniform_(self.lstm.all_weights[nth][0], -k, k) + nn.init.uniform_(self.lstm.all_weights[nth][1], -k, k) + nn.init.zeros_(self.lstm.all_weights[nth][2]) + nn.init.zeros_(self.lstm.all_weights[nth][3]) + else: + raise NotImplemented('Unsupported Initialization') + + def forward(self, x, x_len, hx=None): + # 1. Sort x and its corresponding length + sorted_x_len, sorted_x_idx = torch.sort(x_len, descending=True) + sorted_x = x[sorted_x_idx] + # 2. Ready to unsort after LSTM forward pass + # Note that PyTorch 0.4 has no argsort, but PyTorch 1.0 does. + _, unsort_x_idx = torch.sort(sorted_x_idx, descending=False) + + # 3. Pack the sorted version of x and x_len, as required by the API. + x_emb = pack_padded_sequence(sorted_x, sorted_x_len, + batch_first=self.batch_first) + + # 4. Forward lstm + # output_packed.data.shape is (valid_seq, num_directions * hidden_dim). 
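+ # Note: newer PyTorch versions expect the lengths passed to pack_padded_sequence (step 3 above) to live on the CPU.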
+ # See doc of torch.nn.LSTM for details. + out_packed, (hn, cn) = self.lstm(x_emb) + + # 5. unsort h + # (num_layers * num_directions, batch, hidden_size) -> (batch, ...) + hn = hn.permute(1, 0, 2)[unsort_x_idx] # swap the first two dim + hn = hn.permute(1, 0, 2) # swap the first two again to recover + if self.take_last: + return hn.squeeze(0) + else: + # unpack: out + # (batch, max_seq_len, num_directions * hidden_size) + out, _ = pad_packed_sequence(out_packed, + batch_first=self.batch_first) + out = out[unsort_x_idx] + # unpack: c + # (num_layers * num_directions, batch, hidden_size) -> (batch, ...) + cn = cn.permute(1, 0, 2)[unsort_x_idx] # swap the first two dim + cn = cn.permute(1, 0, 2) # swap the first two again to recover + return out, (hn, cn) + + +if __name__ == '__main__': + # Note that in the future we will import unittest + # and port the following examples to test folder. + + # Unit test for LSTM variable length sequences + # ================ + net = LSTM4VarLenSeq(200, 100, + num_layers=3, bias=True, bidirectional=True, init='orthogonal', take_last=False) + + inputs = torch.tensor([[1, 2, 3, 0], + [2, 3, 0, 0], + [2, 4, 3, 0], + [1, 4, 3, 0], + [1, 2, 3, 4]]) + embedding = nn.Embedding(num_embeddings=5, embedding_dim=200, padding_idx=0) + lens = torch.LongTensor([3, 2, 3, 3, 4]) + + input_embed = embedding(inputs) + output, (h, c) = net(input_embed, lens) + # 5, 4, 200, batch, seq length, hidden_size * 2 (only last layer) + print(output.shape) + # 6, 5, 100, num_layers * num_directions, batch, hidden_size + print(h.shape) + # 6, 5, 100, num_layers * num_directions, batch, hidden_size + print(c.shape) diff --git a/KTeleBERT/torchlight/utils.py b/KTeleBERT/torchlight/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..df9342829e94f6a9d6ab12526e1af203b97ddd2e --- /dev/null +++ b/KTeleBERT/torchlight/utils.py @@ -0,0 +1,195 @@ +""" +Utilizations for common usages. +""" +import os +import random +import torch +import numpy as np +from difflib import SequenceMatcher +from unidecode import unidecode +from datetime import datetime +from torch.nn.parallel import DataParallel, DistributedDataParallel + + +def invert_dict(d): + return {v: k for k, v in d.items()} + +def personal_display_settings(): + """ + Pandas Doc + https://pandas.pydata.org/pandas-docs/stable/generated/pandas.set_option.html + NumPy Doc + - + """ + from pandas import set_option + set_option('display.max_rows', 500) + set_option('display.max_columns', 500) + set_option('display.width', 2000) + set_option('display.max_colwidth', 1000) + from numpy import set_printoptions + set_printoptions(suppress=True) + + +def set_seed(seed): + """ + Freeze every seed for reproducibility. + torch.cuda.manual_seed_all is useful when using random generation on GPUs. + e.g. torch.cuda.FloatTensor(100).uniform_() + """ + os.environ['PYTHONHASHSEED'] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + + +def normalize(s): + """ + German and Frence have different vowels than English. + This utilization removes all the non-unicode characters. + Example: + āáǎà --> aaaa + ōóǒò --> oooo + ēéěè --> eeee + īíǐì --> iiii + ūúǔù --> uuuu + ǖǘǚǜ --> uuuu + + :param s: unicode string + :return: unicode string with regular English characters. + """ + s = s.strip().lower() + s = unidecode(s) + return s + + +def snapshot(model, epoch, save_path): + """ + Saving models w/ its params. + Get rid of the ONNX Protocal. 
+ F-string feature new in Python 3.6+ is used. + """ + os.makedirs(save_path, exist_ok=True) + # timestamp = datetime.now().strftime('%m%d_%H%M') + save_path = os.path.join(save_path, f'{type(model).__name__}_{epoch}_epoch.pkl') + if isinstance(model, (DataParallel, DistributedDataParallel)): + torch.save(model.module.state_dict(), save_path) + else: + torch.save(model.state_dict(), save_path) + return save_path + + +def save_checkpoint(model, optimizer, epoch, path): + torch.save({ + 'epoch': epoch, + 'models': model.state_dict(), + 'optimizer': optimizer.state_dict(), + }, path) + + +def load_checkpoint(path, map_location): + checkpoint = torch.load(path, map_location=map_location) + return checkpoint + + +def show_params(model): + """ + Show models parameters for logging. + """ + for name, param in model.named_parameters(): + print('%-16s' % name, param.size()) + + +def longest_substring(str1, str2): + # initialize SequenceMatcher object with input string + seqMatch = SequenceMatcher(None, str1, str2) + + # find match of longest sub-string + # output will be like Match(a=0, b=0, size=5) + match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) + + # print longest substring + return str1[match.a: match.a + match.size] if match.size != 0 else "" + + +def pad(sent, max_len): + """ + syntax "[0] * int" only works properly for Python 3.5+ + Note that in testing time, the length of a sentence + might exceed the pre-defined max_len (of training data). + """ + length = len(sent) + return (sent + [0] * (max_len - length))[:max_len] if length < max_len else sent[:max_len] + + +def to_cuda(*args, device=None): + """ + Move Tensors to CUDA. + If no device provided, default to the first card in CUDA_VISIBLE_DEVICES. + """ + assert all(torch.is_tensor(t) for t in args), \ + 'Only support for tensors, please check if any nn.Module exists.' 
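+ # None entries are passed through unchanged; when no device is given, fall back to cuda:0 below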
+ if device is None: + device = torch.device('cuda:0') + return [None if x is None else x.to(device) for x in args] + + +def get_code_version(short_sha=True): + from subprocess import check_output, STDOUT, CalledProcessError + try: + sha = check_output('git rev-parse HEAD', stderr=STDOUT, + shell=True, encoding='utf-8') + if short_sha: + sha = sha[:7] + return sha + except CalledProcessError: + # There was an error - command exited with non-zero code + pwd = check_output('pwd', stderr=STDOUT, shell=True, encoding='utf-8') + pwd = os.path.abspath(pwd).strip() + print(f'Working dir {pwd} is not a git repo.') + + +def cat_ragged_tensors(left, right): + assert left.size(0) == right.size(0) + batch_size = left.size(0) + max_len = left.size(1) + right.size(1) + + len_left = (left != 0).sum(dim=1) + len_right = (right != 0).sum(dim=1) + + left_seq = left.unbind() + right_seq = right.unbind() + # handle zero padding + output = torch.zeros((batch_size, max_len), dtype=torch.long, device=left.device) + for i, row_left, row_right, l1, l2 in zip(range(batch_size), + left_seq, right_seq, + len_left, len_right): + l1 = l1.item() + l2 = l2.item() + j = l1 + l2 + # concatenate rows of ragged tensors + row_cat = torch.cat((row_left[:l1], row_right[:l2])) + # copy to empty tensor + output[i, :j] = row_cat + return output + + +def topk_accuracy(inputs, labels, k=1, largest=True): + assert len(inputs.size()) == 2 + assert len(labels.size()) == 2 + _, indices = inputs.topk(k=k, largest=largest) + result = indices - labels # boardcast + nonzero_count = (result != 0).sum(dim=1, keepdim=True) + num_correct = (nonzero_count != result.size(1)).sum().item() + num_example = inputs.size(0) + return num_correct, num_example + + +def get_total_trainable_params(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +if __name__ == '__main__': + print(normalize('ǖǘǚǜ')) diff --git a/KTeleBERT/torchlight/vocab.py b/KTeleBERT/torchlight/vocab.py new file mode 100644 index 0000000000000000000000000000000000000000..c776c4b489c16866359457fb006d215ae873ac05 --- /dev/null +++ b/KTeleBERT/torchlight/vocab.py @@ -0,0 +1,137 @@ +# coding: utf-8 +""" +Every NLP task needs a Vocabulary +Every Vocabulary is built from Instances +Every Instance is a collection of Fields +""" + +__all__ = ['DefaultLookupDict', 'Vocabulary'] + +PAD_TOKEN = '' +UNK_TOKEN = '' +BOS_TOKEN = '' +EOS_TOKEN = '' +PAD_IDX = 0 +UNK_IDX = 1 + + +class DefaultLookupDict(dict): + def __init__(self, default): + super(DefaultLookupDict, self).__init__() + self._default = default + + def __getitem__(self, item): + return self.get(item, self._default) + + +class Vocabulary: + """ + Define a vocabulary object that will be used to numericalize a field. + Attributes: + token2id: A collections.defaultdict instance mapping token strings to + numerical identifiers. + id2token: A list of token strings indexed by their numerical + identifiers. + embedding: pretrained vectors. + + Examples: + >>> from torchlight.vocab import Vocabulary + >>> from collections import Counter + >>> text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world'] + >>> vocab = Vocabulary(Counter(text_data)) + """ + def __init__(self, counter, max_size=None, min_freq=1, specials=None): + """ + Create a Vocabulary given Counter. + Args: + counter: collections.Counter object holding the frequencies of + each value found in the data. + max_size: The maximum size of the vocabulary, or None for no + maximum. Default: None. 
+ min_freq: The minimum frequency needed to include a token in the + vocabulary. Values less than 1 will be set to 1. Default: 1. + specials: The list of special tokens except ['', '']. + Possible choices: [CLS] [MASK] [SEP] in BERT or + in Machine Translation. + """ + min_freq = max(min_freq, 1) # must be positive + + if specials is None: + self.specials = [PAD_TOKEN, UNK_TOKEN] + else: + assert isinstance(specials, list), "'specials' is of type list" + self.specials = [PAD_TOKEN, UNK_TOKEN] + specials + + assert len(set(self.specials)) == len(self.specials), \ + "specials can not contain duplicates." + + if max_size is not None: + max_size = len(self.specials) + max_size + + self.id2token = self.specials[:] + self.token2id = DefaultLookupDict(UNK_IDX) + self.token2id.update({tok: i for i, tok in enumerate(self.id2token)}) + + # sort by frequency, then alphabetically + token_freqs = sorted(counter.items(), key=lambda tup: tup[0]) + token_freqs.sort(key=lambda tup: tup[1], reverse=True) + + for token, freq in token_freqs: + if freq < min_freq or len(self.id2token) == max_size: + break + if token not in self.specials: + self.id2token.append(token) + self.token2id[token] = len(self.id2token) - 1 + + # TODO + self.embedding = None + + def __len__(self): + return len(self.id2token) + + def __repr__(self): + return 'Vocab(size={}, specials="{}")'.format(len(self), self.specials) + + def __getitem__(self, tokens): + """Looks up indices of text tokens according to the vocabulary. + If `unknown_token` of the vocabulary is None, looking up unknown tokens + results in KeyError. + Parameters + ---------- + tokens : str or list of strs + A source token or tokens to be converted. + Returns + ------- + int or list of ints + A token index or a list of token indices according to the vocabulary. + """ + + if not isinstance(tokens, (list, tuple)): + return self.token2id[tokens] + else: + return [self.token2id[token] for token in tokens] + + def __call__(self, tokens): + """Looks up indices of text tokens according to the vocabulary. + Parameters + ---------- + tokens : str or list of strs + A source token or tokens to be converted. + Returns + ------- + int or list of ints + A token index or a list of token indices according to the + vocabulary. 
+ """ + + return self[tokens] + + @classmethod + def from_json(cls, json_str): + pass + + def to_json(self): + pass + + def set_embedding(self): + pass diff --git a/config.py b/config.py new file mode 100644 index 0000000000000000000000000000000000000000..643d7a1270213ddd48d922d30c9a450770b290b1 --- /dev/null +++ b/config.py @@ -0,0 +1,234 @@ +import os.path as osp +import numpy as np +import random +import torch +from easydict import EasyDict as edict +import argparse + + +LAYER_MAPPING = { + 0: 'od_layer_0', + 1: 'od_layer_1', + 2: 'od_layer_2', +} + + +class cfg(): + def __init__(self): + self.this_dir = osp.dirname(__file__) + # change + self.data_root = osp.abspath(osp.join(self.this_dir, '..', '..', 'data', '')) + + # TODO: add some static variable (The frequency of change is low) + + def get_args(self): + parser = argparse.ArgumentParser() + # ------------ base ------------ + parser.add_argument('--train_strategy', default=1, type=int) + parser.add_argument('--batch_size', default=64, type=int) + parser.add_argument('--batch_size_ke', default=14, type=int) + parser.add_argument('--batch_size_od', default=8, type=int) + parser.add_argument('--batch_size_ad', default=32, type=int) + + parser.add_argument('--epoch', default=15, type=int) + parser.add_argument("--save_model", default=1, type=int, choices=[0, 1]) + # 用transformer的 save_pretrain 方式保存 + parser.add_argument("--save_pretrain", default=0, type=int, choices=[0, 1]) + parser.add_argument("--from_pretrain", default=0, type=int, choices=[0, 1]) + + # torthlight + parser.add_argument("--no_tensorboard", default=False, action="store_true") + parser.add_argument("--exp_name", default="huawei_exp", type=str, help="Experiment name") + parser.add_argument("--dump_path", default="dump/", type=str, help="Experiment dump path") + parser.add_argument("--exp_id", default="ke256_raekt_ernie2_bs20_p3_c3_5e-6", type=str, help="Experiment ID") + # or 3407 + parser.add_argument("--random_seed", default=42, type=int) + # 数据参数 + parser.add_argument("--data_path", default="huawei", type=str, help="Experiment path") + parser.add_argument('--train_ratio', default=1, type=float, help='ratio for train/test') + parser.add_argument("--seq_data_name", default='Seq_data_base', type=str, help="seq_data 名字") + parser.add_argument("--kg_data_name", default='KG_data_base_rule', type=str, help="kg_data 名字") + parser.add_argument("--order_data_name", default='event_order_data', type=str, help="order_data 名字") + # TODO: add some dynamic variable + parser.add_argument("--model_name", default="MacBert", type=str, help="model name") + + # ------------ 训练阶段 ------------ + parser.add_argument("--scheduler", default="cos", type=str, choices=["linear", "cos"]) + parser.add_argument("--optim", default="adamw", type=str) + parser.add_argument("--adam_epsilon", default=1e-8, type=float) + parser.add_argument('--workers', type=int, default=8) + parser.add_argument('--accumulation_steps', type=int, default=6) + parser.add_argument('--accumulation_steps_ke', type=int, default=6) + parser.add_argument('--accumulation_steps_ad', type=int, default=6) + parser.add_argument('--accumulation_steps_od', type=int, default=6) + parser.add_argument("--train_together", default=0, type=int) + + # 3e-5 + parser.add_argument('--lr', type=float, default=1e-5) + # 逐层学习率衰减 + parser.add_argument("--LLRD", default=0, type=int, choices=[0, 1]) + parser.add_argument('--weight_decay', type=float, default=0.01) + parser.add_argument('--clip', type=float, default=1., help='gradient clipping') + 
parser.add_argument('--scheduler_steps', type=int, default=None, + help='total number of step for the scheduler, if None then scheduler_total_step = total_step') + parser.add_argument('--eval_step', default=100, type=int, help='evaluate each n step') + + # ------------ PLM ------------ + parser.add_argument('--maxlength', type=int, default=200) + parser.add_argument('--mlm_probability', type=float, default=0.15) + parser.add_argument('--final_mlm_probability', type=float, default=0.4) + parser.add_argument('--mlm_probability_increase', type=str, default="curve", choices=["linear", "curve"]) + parser.add_argument("--mask_stratege", default="rand", type=str, choices=["rand", "wwm", "domain"]) + # 前n个epoch 用rand,后面用wwm. multi-stage knowledge masking strategy + parser.add_argument("--ernie_stratege", default=-1, type=int) + # 用mlm任务进行训练,默认使用chinese_ref且添加新的special word + parser.add_argument("--use_mlm_task", default=1, type=int, choices=[0, 1]) + # 添加新的special word + parser.add_argument("--add_special_word", default=1, type=int, choices=[0, 1]) + # freeze + parser.add_argument("--freeze_layer", default=0, type=int, choices=[0, 1, 2, 3, 4]) + # 是否mask 特殊token + parser.add_argument("--special_token_mask", default=0, type=int, choices=[0, 1]) + parser.add_argument("--emb_init", default=1, type=int, choices=[0, 1]) + parser.add_argument("--cls_head_init", default=1, type=int, choices=[0, 1]) + # 是否使用自适应权重 + parser.add_argument("--use_awl", default=1, type=int, choices=[0, 1]) + parser.add_argument("--mask_loss_scale", default=1.0, type=float) + + # ------------ KGE ------------ + parser.add_argument('--ke_norm', type=int, default=1) + parser.add_argument('--ke_dim', type=int, default=768) + parser.add_argument('--ke_margin', type=float, default=1.0) + parser.add_argument('--neg_num', type=int, default=10) + parser.add_argument('--adv_temp', type=float, default=1.0, help='The temperature of sampling in self-adversarial negative sampling.') + # 5e-4 + parser.add_argument('--ke_lr', type=float, default=3e-5) + parser.add_argument('--only_ke_loss', type=int, default=0) + + # ------------ 数值embedding相关 ------------ + parser.add_argument('--use_NumEmb', type=int, default=1) + parser.add_argument("--contrastive_loss", default=1, type=int, choices=[0, 1]) + parser.add_argument("--l_layers", default=2, type=int) + parser.add_argument('--use_kpi_loss', type=int, default=1) + + # ------------ 测试阶段 ------------ + parser.add_argument("--only_test", default=0, type=int, choices=[0, 1]) + parser.add_argument("--mask_test", default=0, type=int, choices=[0, 1]) + parser.add_argument("--embed_gen", default=0, type=int, choices=[0, 1]) + parser.add_argument("--ke_test", default=0, type=int, choices=[0, 1]) + # -1: 测全集 + parser.add_argument("--ke_test_num", default=-1, type=int) + parser.add_argument("--path_gen", default="", type=str) + + # ------------ 时序阶段 ------------ + # 1:预训练 + # 2:时序 finetune + # 3. 
anomaly detection finetune + ordering, and the process is iterative + # whether to load the od model + parser.add_argument("--order_load", default=0, type=int) + parser.add_argument("--order_num", default=2, type=int) + parser.add_argument("--od_type", default='linear_cat', type=str, choices=['linear_cat', 'vertical_attention']) + parser.add_argument("--eps", default=0.2, type=float, help='label smoothing') + parser.add_argument("--num_od_layer", default=0, type=int) + parser.add_argument("--plm_emb_type", default='cls', type=str, choices=['cls', 'last_avg']) + parser.add_argument("--order_test_name", default='', type=str) + parser.add_argument("--order_threshold", default=0.5, type=float) + # ------------ distributed training ------------ + # whether to run distributed + parser.add_argument('--rank', type=int, default=0, help='process rank for distributed training') + parser.add_argument('--dist', type=int, default=0, help='whether to use distributed training') + # do not change this parameter; it is assigned automatically + parser.add_argument('--device', default='cuda', help='device id (i.e. 0 or 0,1 or cpu)') + # number of processes (not threads) to start; no need to set it, it is derived from nproc_per_node automatically + parser.add_argument('--world-size', default=4, type=int, + help='number of distributed processes') + parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training') + parser.add_argument("--local_rank", default=-1, type=int) + self.cfg = parser.parse_args() + + def update_train_configs(self): + # add some constraints for the parameters + # e.g. cannot save and test at the same time + # fix up the default parameters + # TODO: the test logic is flawed and needs to be revised + if len(self.cfg.order_test_name) > 0: + self.cfg.save_model = 0 + if len(self.cfg.order_test_name) == 0: + self.cfg.train_ratio = min(0.8, self.cfg.train_ratio) + # otherwise the file name to load is derived adaptively + else: + print("od test ... ") + self.cfg.train_strategy = 5 + self.cfg.plm_emb_type = 'last_avg' if 'last_avg' in self.cfg.model_name else 'cls' + for key in LAYER_MAPPING.keys(): + if LAYER_MAPPING[key] in self.cfg.model_name: + self.cfg.num_od_layer = key + self.cfg.order_test_name = osp.join('downstream_task', f'{self.cfg.order_test_name}') + + if self.cfg.mask_test or self.cfg.embed_gen or self.cfg.ke_test or len(self.cfg.order_test_name) > 0: + assert len(self.cfg.model_name) > 0 + self.cfg.only_test = 1 + if self.cfg.only_test == 1: + self.cfg.save_model = 0 + self.cfg.save_pretrain = 0 + + # TODO: update some dynamic variable + self.cfg.data_root = self.data_root + self.cfg.data_path = osp.join(self.data_root, self.cfg.data_path) + self.cfg.plm_path = osp.join(self.data_root, 'transformer') + self.cfg.dump_path = osp.join(self.cfg.data_path, self.cfg.dump_path) + # keep the batch size around 32 where possible + + # number of adaptive loss weights + self.cfg.awl_num = 1 + # ------------ numeric embedding related ------------ + self.cfg.hidden_size = 768 + self.cfg.num_attention_heads = 8 + self.cfg.hidden_dropout_prob = 0.1 + self.cfg.num_kpi = 304 + self.cfg.specail_emb_path = None + if self.cfg.emb_init: + self.cfg.specail_emb_path = osp.join(self.cfg.data_path, 'added_vocab_embedding.pt') + + # ------------- multi-task learning related ------------- + # four stages + self.cfg.mask_epoch, self.cfg.ke_epoch, self.cfg.ad_epoch, self.cfg.od_epoch = None, None, None, None + # trigger multi-task learning + if self.cfg.train_strategy > 1: + self.cfg.mask_epoch = [0, 1, 1, 1, 0] + self.cfg.ke_epoch = [4, 3, 2, 2, 0] + if self.cfg.only_ke_loss: + self.cfg.mask_epoch = [0, 0, 0, 0, 0] + self.cfg.epoch = sum(self.cfg.mask_epoch) + sum(self.cfg.ke_epoch) + if self.cfg.train_strategy > 2: + self.cfg.ad_epoch = [0, 6, 3, 1, 0] + self.cfg.epoch += sum(self.cfg.ad_epoch) + if self.cfg.train_strategy > 3 and not self.cfg.only_ke_loss: + self.cfg.od_epoch = [0, 0, 9, 1, 0] + # self.cfg.mask_epoch[3] = 1 + self.cfg.epoch += sum(self.cfg.od_epoch) + self.cfg.epoch_matrix = [] + for epochs in [self.cfg.mask_epoch, self.cfg.ke_epoch, self.cfg.ad_epoch, self.cfg.od_epoch]: + if epochs is not None: + self.cfg.epoch_matrix.append(epochs) + if self.cfg.train_together: + # the losses are simply summed, so the number of training epochs equals the mask epochs + self.cfg.epoch = sum(self.cfg.mask_epoch) + self.cfg.batch_size = int((self.cfg.batch_size - 16) / self.cfg.train_strategy) + self.cfg.batch_size_ke = int(self.cfg.batch_size_ke / self.cfg.train_strategy) - 2 + self.cfg.batch_size_ad = int(self.cfg.batch_size_ad / self.cfg.train_strategy) - 1 + self.cfg.batch_size_od = int(self.cfg.batch_size_od / self.cfg.train_strategy) - 1 + self.cfg.accumulation_steps = (self.cfg.accumulation_steps - 1) * self.cfg.train_strategy + + self.cfg.neg_num = max(min(self.cfg.neg_num, self.cfg.batch_size_ke - 3), 1) + + self.cfg.accumulation_steps_dict = {0: self.cfg.accumulation_steps, 1: self.cfg.accumulation_steps_ke, 2: self.cfg.accumulation_steps_ad, 3: self.cfg.accumulation_steps_od} + + # using the numeric embedding also requires adding the new special words, because their position information is bound to the tokenizer + if self.cfg.use_mlm_task or self.cfg.use_NumEmb: + assert self.cfg.add_special_word == 1 + + if self.cfg.use_NumEmb: + self.cfg.awl_num += 1 + + return self.cfg
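
A minimal usage sketch, not part of the patch itself, showing how the configuration class and the helpers added above could be wired together. It assumes the working directory is KTeleBERT/ (where test.sh lives); the real entry point main.py is not included in this diff, and build_training_context is a hypothetical helper name.

from config import cfg
from src.distributed_utils import init_distributed_mode
from torchlight import initialize_exp, set_seed


def build_training_context():
    # parse the command line, then apply the constraints from update_train_configs()
    parser_wrapper = cfg()
    parser_wrapper.get_args()
    args = parser_wrapper.update_train_configs()

    set_seed(args.random_seed)
    # becomes a no-op (args.distributed = False) unless RANK / WORLD_SIZE are set,
    # e.g. when the script is launched through torchrun
    init_distributed_mode(args)

    # creates dump/<MMDD>-<exp_name>/<exp_id>/ with params.pkl and train.log
    logger = initialize_exp(args)
    return args, logger


# Typical single-process launch (flags as in test.sh, purely illustrative):
#   python main.py --batch_size 150 --use_NumEmb 1 --mask_stratege wwm
# Distributed launch; torchrun exports RANK / WORLD_SIZE / LOCAL_RANK, which
# init_distributed_mode reads:
#   torchrun --nproc_per_node=4 main.py --dist 1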