diff --git a/KTeleBERT/__pycache__/config.cpython-38.pyc b/KTeleBERT/__pycache__/config.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b210093bcab8230e6034a3a0f3db034bb4d80d7 Binary files /dev/null and b/KTeleBERT/__pycache__/config.cpython-38.pyc differ diff --git a/KTeleBERT/config.py b/KTeleBERT/config.py new file mode 100644 index 0000000000000000000000000000000000000000..643d7a1270213ddd48d922d30c9a450770b290b1 --- /dev/null +++ b/KTeleBERT/config.py @@ -0,0 +1,234 @@ +import os.path as osp +import numpy as np +import random +import torch +from easydict import EasyDict as edict +import argparse + + +LAYER_MAPPING = { + 0: 'od_layer_0', + 1: 'od_layer_1', + 2: 'od_layer_2', +} + + +class cfg(): + def __init__(self): + self.this_dir = osp.dirname(__file__) + # change + self.data_root = osp.abspath(osp.join(self.this_dir, '..', '..', 'data', '')) + + # TODO: add some static variable (The frequency of change is low) + + def get_args(self): + parser = argparse.ArgumentParser() + # ------------ base ------------ + parser.add_argument('--train_strategy', default=1, type=int) + parser.add_argument('--batch_size', default=64, type=int) + parser.add_argument('--batch_size_ke', default=14, type=int) + parser.add_argument('--batch_size_od', default=8, type=int) + parser.add_argument('--batch_size_ad', default=32, type=int) + + parser.add_argument('--epoch', default=15, type=int) + parser.add_argument("--save_model", default=1, type=int, choices=[0, 1]) + # 用transformer的 save_pretrain 方式保存 + parser.add_argument("--save_pretrain", default=0, type=int, choices=[0, 1]) + parser.add_argument("--from_pretrain", default=0, type=int, choices=[0, 1]) + + # torthlight + parser.add_argument("--no_tensorboard", default=False, action="store_true") + parser.add_argument("--exp_name", default="huawei_exp", type=str, help="Experiment name") + parser.add_argument("--dump_path", default="dump/", type=str, help="Experiment dump path") + parser.add_argument("--exp_id", default="ke256_raekt_ernie2_bs20_p3_c3_5e-6", type=str, help="Experiment ID") + # or 3407 + parser.add_argument("--random_seed", default=42, type=int) + # 数据参数 + parser.add_argument("--data_path", default="huawei", type=str, help="Experiment path") + parser.add_argument('--train_ratio', default=1, type=float, help='ratio for train/test') + parser.add_argument("--seq_data_name", default='Seq_data_base', type=str, help="seq_data 名字") + parser.add_argument("--kg_data_name", default='KG_data_base_rule', type=str, help="kg_data 名字") + parser.add_argument("--order_data_name", default='event_order_data', type=str, help="order_data 名字") + # TODO: add some dynamic variable + parser.add_argument("--model_name", default="MacBert", type=str, help="model name") + + # ------------ 训练阶段 ------------ + parser.add_argument("--scheduler", default="cos", type=str, choices=["linear", "cos"]) + parser.add_argument("--optim", default="adamw", type=str) + parser.add_argument("--adam_epsilon", default=1e-8, type=float) + parser.add_argument('--workers', type=int, default=8) + parser.add_argument('--accumulation_steps', type=int, default=6) + parser.add_argument('--accumulation_steps_ke', type=int, default=6) + parser.add_argument('--accumulation_steps_ad', type=int, default=6) + parser.add_argument('--accumulation_steps_od', type=int, default=6) + parser.add_argument("--train_together", default=0, type=int) + + # 3e-5 + parser.add_argument('--lr', type=float, default=1e-5) + # 逐层学习率衰减 + parser.add_argument("--LLRD", default=0, type=int, 
choices=[0, 1]) + parser.add_argument('--weight_decay', type=float, default=0.01) + parser.add_argument('--clip', type=float, default=1., help='gradient clipping') + parser.add_argument('--scheduler_steps', type=int, default=None, + help='total number of step for the scheduler, if None then scheduler_total_step = total_step') + parser.add_argument('--eval_step', default=100, type=int, help='evaluate each n step') + + # ------------ PLM ------------ + parser.add_argument('--maxlength', type=int, default=200) + parser.add_argument('--mlm_probability', type=float, default=0.15) + parser.add_argument('--final_mlm_probability', type=float, default=0.4) + parser.add_argument('--mlm_probability_increase', type=str, default="curve", choices=["linear", "curve"]) + parser.add_argument("--mask_stratege", default="rand", type=str, choices=["rand", "wwm", "domain"]) + # 前n个epoch 用rand,后面用wwm. multi-stage knowledge masking strategy + parser.add_argument("--ernie_stratege", default=-1, type=int) + # 用mlm任务进行训练,默认使用chinese_ref且添加新的special word + parser.add_argument("--use_mlm_task", default=1, type=int, choices=[0, 1]) + # 添加新的special word + parser.add_argument("--add_special_word", default=1, type=int, choices=[0, 1]) + # freeze + parser.add_argument("--freeze_layer", default=0, type=int, choices=[0, 1, 2, 3, 4]) + # 是否mask 特殊token + parser.add_argument("--special_token_mask", default=0, type=int, choices=[0, 1]) + parser.add_argument("--emb_init", default=1, type=int, choices=[0, 1]) + parser.add_argument("--cls_head_init", default=1, type=int, choices=[0, 1]) + # 是否使用自适应权重 + parser.add_argument("--use_awl", default=1, type=int, choices=[0, 1]) + parser.add_argument("--mask_loss_scale", default=1.0, type=float) + + # ------------ KGE ------------ + parser.add_argument('--ke_norm', type=int, default=1) + parser.add_argument('--ke_dim', type=int, default=768) + parser.add_argument('--ke_margin', type=float, default=1.0) + parser.add_argument('--neg_num', type=int, default=10) + parser.add_argument('--adv_temp', type=float, default=1.0, help='The temperature of sampling in self-adversarial negative sampling.') + # 5e-4 + parser.add_argument('--ke_lr', type=float, default=3e-5) + parser.add_argument('--only_ke_loss', type=int, default=0) + + # ------------ 数值embedding相关 ------------ + parser.add_argument('--use_NumEmb', type=int, default=1) + parser.add_argument("--contrastive_loss", default=1, type=int, choices=[0, 1]) + parser.add_argument("--l_layers", default=2, type=int) + parser.add_argument('--use_kpi_loss', type=int, default=1) + + # ------------ 测试阶段 ------------ + parser.add_argument("--only_test", default=0, type=int, choices=[0, 1]) + parser.add_argument("--mask_test", default=0, type=int, choices=[0, 1]) + parser.add_argument("--embed_gen", default=0, type=int, choices=[0, 1]) + parser.add_argument("--ke_test", default=0, type=int, choices=[0, 1]) + # -1: 测全集 + parser.add_argument("--ke_test_num", default=-1, type=int) + parser.add_argument("--path_gen", default="", type=str) + + # ------------ 时序阶段 ------------ + # 1:预训练 + # 2:时序 finetune + # 3. 
异常检测 finetune + 时序, 且是迭代的 + # 是否加载od模型 + parser.add_argument("--order_load", default=0, type=int) + parser.add_argument("--order_num", default=2, type=int) + parser.add_argument("--od_type", default='linear_cat', type=str, choices=['linear_cat', 'vertical_attention']) + parser.add_argument("--eps", default=0.2, type=float, help='label smoothing..') + parser.add_argument("--num_od_layer", default=0, type=int) + parser.add_argument("--plm_emb_type", default='cls', type=str, choices=['cls', 'last_avg']) + parser.add_argument("--order_test_name", default='', type=str) + parser.add_argument("--order_threshold", default=0.5, type=float) + # ------------ 并行训练 ------------ + # 是否并行 + parser.add_argument('--rank', type=int, default=0, help='rank to dist') + parser.add_argument('--dist', type=int, default=0, help='whether to dist') + # 不要改该参数,系统会自动分配 + parser.add_argument('--device', default='cuda', help='device id (i.e. 0 or 0,1 or cpu)') + # 开启的进程数(注意不是线程),不用设置该参数,会根据nproc_per_node自动设置 + parser.add_argument('--world-size', default=4, type=int, + help='number of distributed processes') + parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training') + parser.add_argument("--local_rank", default=-1, type=int) + self.cfg = parser.parse_args() + + def update_train_configs(self): + # add some constraint for parameters + # e.g. cannot save and test at the same time + # 修正默认参数 + # TODO: 测试逻辑有问题需要修改 + if len(self.cfg.order_test_name) > 0: + self.cfg.save_model = 0 + if len(self.cfg.order_test_name) == 0: + self.cfg.train_ratio = min(0.8, self.cfg.train_ratio) + # 自适应载入文件名 + else: + print("od test ... ") + self.cfg.train_strategy == 5 + self.cfg.plm_emb_type = 'last_avg' if 'last_avg' in self.cfg.model_name else 'cls' + for key in LAYER_MAPPING.keys(): + if LAYER_MAPPING[key] in self.cfg.model_name: + self.cfg.num_od_layer = key + self.cfg.order_test_name = osp.join('downstream_task', f'{self.cfg.order_test_name}') + + if self.cfg.mask_test or self.cfg.embed_gen or self.cfg.ke_test or len(self.cfg.order_test_name) > 0: + assert len(self.cfg.model_name) > 0 + self.cfg.only_test = 1 + if self.cfg.only_test == 1: + self.save_model = 0 + self.save_pretrain = 0 + + # TODO: update some dynamic variable + self.cfg.data_root = self.data_root + self.cfg.data_path = osp.join(self.data_root, self.cfg.data_path) + self.cfg.plm_path = osp.join(self.data_root, 'transformer') + self.cfg.dump_path = osp.join(self.cfg.data_path, self.cfg.dump_path) + # bs 控制尽量在32 + + # 自适应权重的数量 + self.cfg.awl_num = 1 + # ------------ 数值embedding相关 ------------ + self.cfg.hidden_size = 768 + self.cfg.num_attention_heads = 8 + self.cfg.hidden_dropout_prob = 0.1 + self.cfg.num_kpi = 304 + self.cfg.specail_emb_path = None + if self.cfg.emb_init: + self.cfg.specail_emb_path = osp.join(self.cfg.data_path, 'added_vocab_embedding.pt') + + # ------------- 多任务学习相关 ------------- + # 四个阶段 + self.cfg.mask_epoch, self.cfg.ke_epoch, self.cfg.ad_epoch, self.cfg.od_epoch = None, None, None, None + # 触发多任务 学习 + if self.cfg.train_strategy > 1: + self.cfg.mask_epoch = [0, 1, 1, 1, 0] + self.cfg.ke_epoch = [4, 3, 2, 2, 0] + if self.cfg.only_ke_loss: + self.cfg.mask_epoch = [0, 0, 0, 0, 0] + self.cfg.epoch = sum(self.cfg.mask_epoch) + sum(self.cfg.ke_epoch) + if self.cfg.train_strategy > 2: + self.cfg.ad_epoch = [0, 6, 3, 1, 0] + self.cfg.epoch += sum(self.cfg.ad_epoch) + if self.cfg.train_strategy > 3 and not self.cfg.only_ke_loss: + self.cfg.od_epoch = [0, 0, 9, 1, 0] + # self.cfg.mask_epoch[3] = 1 + self.cfg.epoch += 
sum(self.cfg.od_epoch) + self.cfg.epoch_matrix = [] + for epochs in [self.cfg.mask_epoch, self.cfg.ke_epoch, self.cfg.ad_epoch, self.cfg.od_epoch]: + if epochs is not None: + self.cfg.epoch_matrix.append(epochs) + if self.cfg.train_together: + # loss 直接相加,训练epoch就是mask的epoch + self.cfg.epoch = sum(self.cfg.mask_epoch) + self.cfg.batch_size = int((self.cfg.batch_size - 16) / self.cfg.train_strategy) + self.cfg.batch_size_ke = int(self.cfg.batch_size_ke / self.cfg.train_strategy) - 2 + self.cfg.batch_size_ad = int(self.cfg.batch_size_ad / self.cfg.train_strategy) - 1 + self.cfg.batch_size_od = int(self.cfg.batch_size_od / self.cfg.train_strategy) - 1 + self.cfg.accumulation_steps = (self.cfg.accumulation_steps - 1) * self.cfg.train_strategy + + self.cfg.neg_num = max(min(self.cfg.neg_num, self.cfg.batch_size_ke - 3), 1) + + self.cfg.accumulation_steps_dict = {0: self.cfg.accumulation_steps, 1: self.cfg.accumulation_steps_ke, 2: self.cfg.accumulation_steps_ad, 3: self.cfg.accumulation_steps_od} + + # 使用数值embedding也必须添加新词因为位置信息和tokenizer绑定 + if self.cfg.use_mlm_task or self.cfg.use_NumEmb: + assert self.cfg.add_special_word == 1 + + if self.cfg.use_NumEmb: + self.cfg.awl_num += 1 + + return self.cfg diff --git a/KTeleBERT/data_trans.py b/KTeleBERT/data_trans.py new file mode 100644 index 0000000000000000000000000000000000000000..0273e20054017a876c57fb8752df897f83e2d04a --- /dev/null +++ b/KTeleBERT/data_trans.py @@ -0,0 +1,56 @@ +import os.path as osp +import numpy as np +import random +import torch +import argparse +import pdb +import json + +''' +把数据合并 +同时抽取一部分需要的数据出来 +''' + +this_dir = osp.dirname(__file__) + +data_root = osp.abspath(osp.join(this_dir, '..', '..', 'data', '')) + +data_path = "huawei" +data_path = osp.join(data_root, data_path) + + +with open(osp.join(data_path, 'product_corpus.json'), "r") as f: + data_doc = json.load(f) + +with open(osp.join(data_path, '831_alarm_serialize.json'), "r") as f: + data_alarm = json.load(f) +# kpi_info.json +with open(osp.join(data_path, '917_kpi_serialize_50_mn.json'), "r") as f: + data_kpi = json.load(f) + + +# 实体的序列化 +with open(osp.join(data_path, '5GC_KB/database_entity_serialize.json'), "r") as f: + data_entity = json.load(f) + +random.shuffle(data_kpi) +random.shuffle(data_doc) +random.shuffle(data_alarm) +random.shuffle(data_entity) +data = data_alarm + data_kpi + data_entity + data_doc +random.shuffle(data) + +# 241527 +pdb.set_trace() +with open(osp.join(data_path, 'Seq_data_large.json'), "w") as fp: + json.dump(data, fp, ensure_ascii=False) + + +# 三元组 +with open(osp.join(data_path, '5GC_KB/database_triples.json'), "r") as f: + data = json.load(f) +random.shuffle(data) + + +with open(osp.join(data_path, 'KG_data_base.json'), "w") as fp: + json.dump(data, fp, ensure_ascii=False) diff --git a/KTeleBERT/get_chinese_ref.py b/KTeleBERT/get_chinese_ref.py new file mode 100644 index 0000000000000000000000000000000000000000..ddb0216ea64eed071ec77a1d4d7126f7ed3fd912 --- /dev/null +++ b/KTeleBERT/get_chinese_ref.py @@ -0,0 +1,454 @@ +import os.path as osp +import numpy as np +import random +import torch +from easydict import EasyDict as edict +import argparse +import pdb +import json +from model import BertTokenizer +from collections import Counter +from ltp import LTP +from tqdm import tqdm +from src.utils import add_special_token +from functools import reduce +from time import time +from numpy import mean +import math + +from src.utils import Loss_log, time_trans +from collections import defaultdict + + +class cfg(): + def __init__(self): + 
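+        # paths are resolved relative to this file: the shared data directory sits two levels up, under data/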
self.this_dir = osp.dirname(__file__) + # change + self.data_root = osp.abspath(osp.join(self.this_dir, '..', '..', 'data', '')) + + def get_args(self): + parser = argparse.ArgumentParser() + # seq_data_name = "Seq_data_tiny_831" + parser.add_argument("--data_path", default="huawei", type=str, help="Experiment path") + # TODO: freq 可以考虑 150 + parser.add_argument("--freq", default=50, type=int, help="出现多少次的词认为是重要的") + parser.add_argument("--batch_size", default=100, type=int, help="分词的batch size") + parser.add_argument("--seq_data_name", default='Seq_data_large', type=str, help="seq_data 名字") + parser.add_argument("--deal_numeric", default=0, type=int, help="是否处理数值数据") + + parser.add_argument("--read_cws", default=0, type=int, help="是否需要读训练好的cws文件") + self.cfg = parser.parse_args() + + def update_train_configs(self): + # TODO: update some dynamic variable + self.cfg.data_root = self.data_root + self.cfg.data_path = osp.join(self.data_root, self.cfg.data_path) + + return self.cfg + + +def refresh_data(ref, freq, special_token): + ''' + 功能:在自定义的special token基础上基于最小出现频率得到更多新词分词系统的参考,作为wwm基础 + 输入: + freq: 在(37万)语义词典中的最小出现频率(空格为分词) + special_token: 前面手工定义的特殊token(可能存在交集) + 输出: + add_words:在定义的最小出现频率基础上筛选出来的新词 + ''' + # 经常出现的sub token + seq_sub_data = [line.split() for line in ref] + all_data = [] + for data in seq_sub_data: + all_data.extend(data) + sub_word_times = dict(Counter(all_data)) + asub_word_time_order = sorted(sub_word_times.items(), key=lambda x: x[1], reverse=True) + # ('LST', 1218), ('RMV', 851), ('DSP', 821), ('ADD', 820), ('MOD', 590), ('SET', 406), ('AWS', 122) + # ADD、ACT、ALM-XXX、DEL、DSP、LST + add_words = [] + + for i in asub_word_time_order: + # 把出现频率很高的词加进来 + if i[1] >= freq and len(i[0]) > 1 and len(i[0]) < 20 and not str.isdigit(i[0]): + add_words.append(i[0]) + add_words.extend(special_token) + # 卡100阈值时是935个特殊token + print(f"[{len(add_words)}] special words will be added with frequency [{freq}]!") + return add_words + + +def cws(seq_data, add_words, batch_size): + ''' + 功能:所有序列数据的输入转换成分词之后的结果 + 输入: + seq_data:所有序列数据输入 e.g.['KPI异常下降', 'KPI异常上升'] + add_words:添加的special words + batch_size:每次分多少句 + 输出: + all_segment:所有序列数据的输出 e.g. [['KPI', '异常', '下降'], ['KPI', '异常', '上升']] + data_size:输入/输出的序列数量(e.g. 
2) + ''' + # seq_data = seq_data.cuda() + print(f"loading...") + ltp = LTP("LTP/base2") # 默认加载 base2 模型 + # ltp = LTP() + print(f"begin adding words ...") + # ltp.add_words(words=add_words, max_window=5) #4.1.5 + ltp.add_words(words=add_words) # 4.2.8 + ltp.to("cuda") + # for word in add_words: + # ltp.add_word(word) + print(f"{len(add_words)} special words are added!") + + # + # for data in seq_data: + # output = ltp.pipeline([data], tasks=["cws"]) + data_size = len(seq_data) + seq_data_cws = [] + size = int(data_size / batch_size) + 1 + b = 0 + e = b + batch_size + # pdb.set_trace() + + log = Loss_log() + + with tqdm(total=size) as _tqdm: + # pdb.set_trace() + # log.time_init() + # pdb.set_trace() + error_data = [] + for i in range(size): + + output = [] + try: + _output = ltp.pipeline(seq_data[b:e], tasks=["cws"]) + for data in _output.cws: + try: + data_out = ltp.pipeline(data, tasks=["cws"]) + # data_out_ = reduce(lambda x, y: x.extend(y) or x, data_out.cws) + data_out_ = [] + for i in data_out.cws: + data_out_.extend([k.strip() for k in i]) + output.append(data_out_) + except: + print(f"二阶段分词出错!范围是:[{b}]-[{e}]") + error_data.append(data) + + # pdb.set_trace() + except: + print(f"第一阶段分词出错!范围是:[{b}]-[{e}]") + error_data.append(f"第一阶段分词出错!范围是:[{b}]-[{e}]") + # continue + seq_data_cws.extend(output) + b = e + e += batch_size + + # 时间统计 + if e >= data_size: + if b >= data_size: + break + e = data_size + _tqdm.set_description(f'from {b} to {e}:') + _tqdm.update(1) + + print(f"过滤了{data_size - len(seq_data_cws)}个句子") + + return seq_data_cws, data_size, error_data + + +def ltp_debug(ltp, op): + output = [] + for data in op: + data_out = ltp.pipeline(data, tasks=["cws"]) + # data_out_ = reduce(lambda x, y: x.extend(y) or x, data_out.cws) + data_out_ = [] + for i in data_out.cws: + # 保留空格的话需要手动去除空格 + data_out_.append(i[0].strip()) + # 之前没有空格 + # data_out_.extend(i) + output.append(data_out_) + return output + + +def deal_sub_words(subwords, special_token): + ''' + 功能:把每个word的整体内,非首字符的部分加上 '##' 前缀, special_token 不应该被mask + ''' + for i in range(len(subwords)): + if i == 0: + continue + if subwords[i] in special_token: + continue + if subwords[i].startswith("##"): + continue + + subwords[i] = "##" + subwords[i] + return subwords + + +def generate_chinese_ref(seq_data_cws, special_token, deal_numeric, kpi_dic): + ''' + 输入: + seq_data_cws:所有序列数据的输出 e.g. [['KPI', '异常', '下降'], ['KPI', '异常', '上升']] + special_token:不应该被mask ['[SEP]', '[MASK]', '[ALM]', '[KPI]', '[CLS]', '[LOC]', '[EOS]', '[ENT]', '[ATTR]', '[NUM]', '|'] + data_size:数据量 e.g. 2 + 输出: + ww_return (whole word return):打标之后的chinese ref e.g. 
[['KPI', '异','##常', '下', '##降'], ['KPI', '异', '##常', '上', '##升']] + ''' + # 定义全局set和逆字典统计哪些KPI最后没有被涉及 + data_size = len(seq_data_cws) + kpi_static_set = set() + rev_kpi_dic = dict(zip(kpi_dic.values(), kpi_dic.keys())) + max_len = 0 + sten_that_over_maxl = [] + with tqdm(total=data_size) as _tqdm: + ww_return = [] + ww_list = [] + kpi_info = [] + not_in_KPI = defaultdict(int) + for i in range(data_size): + _tqdm.set_description(f'checking...[{i}/{data_size}] max len: [{max_len}]') + orig = tokenizer.tokenize(" ".join(seq_data_cws[i])) + + if deal_numeric: + # 得到元组信息,前两位是KPI下标范围 + _kpi_info, kpi_type_list = extract_kpi(orig, kpi_dic, not_in_KPI) + kpi_info.append(_kpi_info) + kpi_static_set.update(kpi_type_list) + + sub_total = [] + ww_seq_tmp = [] + ww_tmp = [] + for sub_data in seq_data_cws[i]: + sub = tokenizer.tokenize(sub_data) + sub_total.extend(sub) + # 在whole word 里面添加#号 + # 输入: ['异', '常'] + ref_token = deal_sub_words(sub, special_token) + # 输出: ['异', '##常'] + ww_seq_tmp.extend(ref_token) + ww_tmp.append(ref_token) + + if sub_total != orig: + print("error in match... ") + if len(orig) > 512: + print("the lenth is over the max lenth") + pdb.set_trace() + + # 变成[[...],[...],[...], ...] + # ww_return.append(ww_tmp) + sz_ww_seq = len(ww_seq_tmp) + # 求最大长度 + max_len = sz_ww_seq if sz_ww_seq > max_len else max_len + if sz_ww_seq > 500: + sten_that_over_maxl.append((ww_seq_tmp, sz_ww_seq)) + + assert len(sub_total) == sz_ww_seq + ww_return.append(ww_seq_tmp) + ww_list.append(ww_tmp) + # pdb.set_trace() + _tqdm.update(1) + # pdb.set_trace() + if deal_numeric: + in_kpi = [] + # pdb.set_trace() + for key in rev_kpi_dic.keys(): + if key in kpi_static_set: + in_kpi.append(rev_kpi_dic[key]) + if len(in_kpi) < len(rev_kpi_dic): + print(f"[{len(in_kpi)}] KPI are covered by data: {in_kpi}") + print(f" [{len(not_in_KPI)}] KPI无法匹配{not_in_KPI}") + else: + print("all KPI are covered!") + return ww_return, kpi_info, sten_that_over_maxl + + +def extract_num(seq_data_cws): + ''' + 功能:把序列中的数值信息提取出来 + 同时过滤 nan 数值 + ''' + num_ref = [] + seq_data_cws_new = [] + for j in range(len(seq_data_cws)): + num_index = [i for i, x in enumerate(seq_data_cws[j]) if x == '[NUM]'] + # kpi_score = [float(seq_data_cws[i][index+1]) for index in num_index] + kpi_score = [] + flag = 1 + for index in num_index: + # if math.isnan(tmp): + # pdb.set_trace() + try: + tmp = float(seq_data_cws[j][index + 1]) + except: + # pdb.set_trace() + flag = 0 + continue + if math.isnan(tmp): + flag = 0 + else: + kpi_score.append(tmp) + + if len(num_index) > 0: + for index in reversed(num_index): + seq_data_cws[j].pop(index + 1) + if flag == 1: + num_ref.append(kpi_score) + seq_data_cws_new.append(seq_data_cws[j]) + return seq_data_cws_new, num_ref + + +def extract_kpi(token_data, kpi_dic, not_in_KPI): + ''' + 功能:把序列中的[KPI]下标范围,[NUM]下标提取出来 + 输出格式: [(1,2,4),(5,6,7)] + ''' + kpi_and_num_info = [] + kpi_type = [] + kpi_index = [i for i, x in enumerate(token_data) if x.lower() == '[kpi]'] + num_index = [i for i, x in enumerate(token_data) if x.lower() == '[num]'] + sz = len(kpi_index) + assert sz == len(num_index) + for i in range(sz): + # (kpi 开始,kpi 结束,NUM token位置) + # DONE: 添加KPI的类别 + kpi_name = ''.join(token_data[kpi_index[i] + 1: num_index[i] - 1]) + kpi_name_clear = kpi_name.replace('##', '') + + if kpi_name in kpi_dic: + kpi_id = int(kpi_dic[kpi_name]) + elif kpi_name_clear in kpi_dic: + kpi_id = int(kpi_dic[kpi_name_clear]) + elif kpi_name_clear in not_in_KPI: + kpi_id = -1 + not_in_KPI[kpi_name_clear] += 1 + else: + # 只打印一次 + 
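+            # "只打印一次" = print only once per unmatched KPI name (the print statement below is kept commented out)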
not_in_KPI[kpi_name_clear] += 1 + kpi_id = -1 + # print(f"{kpi_name_clear} not in KPI dict") + + kpi_info = [kpi_index[i] + 1, num_index[i] - 2, num_index[i], kpi_id] + kpi_and_num_info.append(kpi_info) + kpi_type.append(kpi_id) + # pdb.set_trace() + + return kpi_and_num_info, kpi_type + + +def kpi_combine(kpi_info, num_ref): + sz = len(kpi_info) + assert sz == len(num_ref) + for i in range(sz): + for j in range(len(kpi_info[i])): + kpi_info[i][j].append(num_ref[i][j]) + # pdb.set_trace() + return kpi_info + +# 所有字母小写 + + +def kpi_lower_update(kpi_dic): + new_dic = {} + for key in kpi_dic: + kk = key.lower().split() + kk = ''.join(kk).strip() + new_dic[kk] = kpi_dic[key] + return new_dic + + +if __name__ == '__main__': + ''' + 功能: 得到 chinese ref 文件,同时刷新训练/测试文件(仅针对序列的文本数据) + ''' + cfg = cfg() + cfg.get_args() + cfgs = cfg.update_train_configs() + + # 路径指定 + domain_file_path = osp.join(cfgs.data_path, 'special_vocab.txt') + with open(domain_file_path, encoding="utf-8") as f: + ref = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())] + tokenizer = BertTokenizer.from_pretrained(osp.join(cfgs.data_root, 'transformer', 'MacBert'), do_lower_case=True) + seq_data_name = cfgs.seq_data_name + with open(osp.join(cfgs.data_path, f'{seq_data_name}.json'), "r") as fp: + seq_data = json.load(fp) + kpi_dic_name = 'kpi2id' + with open(osp.join(cfgs.data_path, f'{kpi_dic_name}.json'), "r") as fp: + kpi_dic = json.load(fp) + kpi_dic = kpi_lower_update(kpi_dic) + # 供测试 + random.shuffle(seq_data) + # seq_data = seq_data[:500] + print(f"tokenizer size before: {len(tokenizer)}") + tokenizer, special_token, norm_token = add_special_token(tokenizer) + special_token = special_token + norm_token + + print(f"tokenizer size after: {len(tokenizer)}") + print('------------------------ refresh data --------------------------------') + add_words = refresh_data(ref, cfgs.freq, special_token) + + if not cfgs.read_cws: + print('------------------------ cws ----------------------------------') + seq_data_cws, data_size, error_data = cws(seq_data, add_words, cfgs.batch_size) + print(f'batch size is {cfgs.batch_size}') + if len(error_data) > 0: + with open(osp.join(cfgs.data_path, f'{seq_data_name}_error.json'), "w") as fp: + json.dump(error_data, fp, ensure_ascii=False) + save_path_cws_orig = osp.join(cfgs.data_path, f'{seq_data_name}_cws_orig.json') + print("get the new training data! 
saving...") + with open(save_path_cws_orig, 'w', ) as fp: + json.dump(seq_data_cws, fp, ensure_ascii=False) + else: + print('------------------------ read ----------------------------------') + save_path_cws = osp.join(cfgs.data_path, f'{seq_data_name}_cws_orig.json') + print("get the new training data!") + with open(save_path_cws, 'r', ) as fp: + seq_data_cws = json.load(fp) + data_size = len(seq_data_cws) + + sz_orig = len(seq_data_cws) + if cfgs.deal_numeric: + seq_data_cws, num_ref = extract_num(seq_data_cws) + print(f"过滤了{sz_orig - len(seq_data_cws)}个无效句子") + data_size = len(seq_data_cws) + + print('---------------------- generate chinese ref ------------------------------') + chinese_ref, kpi_info, sten_that_over_maxl = generate_chinese_ref(seq_data_cws, special_token, cfgs.deal_numeric, kpi_dic) + + if len(sten_that_over_maxl) > 0: + print(f"{len(sten_that_over_maxl)} over the 500 len!") + save_path_max = osp.join(cfgs.data_path, f'{seq_data_name}_max_len_500.json') + with open(save_path_max, 'w') as fp: + json.dump(sten_that_over_maxl, fp, ensure_ascii=False) + + if cfgs.deal_numeric: + print("KPI info combine") + kpi_ref = kpi_combine(kpi_info, num_ref) + # pdb.set_trace() + print('------------------------- match finished ------------------------------') + + # 输出最后训练的时候用于做wwm的分词 + save_path_ref = osp.join(cfgs.data_path, f'{seq_data_name}_chinese_ref.json') + with open(save_path_ref, 'w') as fp: + json.dump(chinese_ref, fp, ensure_ascii=False) + print(f"save chinese_ref done!") + + seq_data_cws_output = [] + for i in range(data_size): + seq = " ".join(seq_data_cws[i]) + seq_data_cws_output.append(seq) + + save_path_cws = osp.join(cfgs.data_path, f'{seq_data_name}_cws.json') + print("get the new training data!") + with open(save_path_cws, 'w', ) as fp: + json.dump(seq_data_cws_output, fp, ensure_ascii=False) + + print("save seq_data_cws done!") + + if cfgs.deal_numeric: + kpi_ref_path = osp.join(cfgs.data_path, f'{seq_data_name}_kpi_ref.json') + with open(kpi_ref_path, 'w', ) as fp: + json.dump(kpi_ref, fp, ensure_ascii=False) + print("save num and kpi done!") diff --git a/KTeleBERT/main.py b/KTeleBERT/main.py new file mode 100644 index 0000000000000000000000000000000000000000..de6605986e6256bbaa486b7fcd938acdd7c09d26 --- /dev/null +++ b/KTeleBERT/main.py @@ -0,0 +1,851 @@ +import os +import os.path as osp +import torch +from torch.utils.tensorboard import SummaryWriter +from torch.utils.data import DataLoader, RandomSampler +from torch.cuda.amp import GradScaler, autocast +from datetime import datetime +from easydict import EasyDict as edict +from tqdm import tqdm +import pdb +import pprint +import json +import pickle +from collections import defaultdict +import copy +from time import time + +from config import cfg +from torchlight import initialize_exp, set_seed, get_dump_path +from src.data import load_data, load_data_kg, Collator_base, Collator_kg, SeqDataset, KGDataset, Collator_order, load_order_data +from src.utils import set_optim, Loss_log, add_special_token, time_trans +from src.distributed_utils import init_distributed_mode, dist_pdb, is_main_process, reduce_value, cleanup +import torch.distributed as dist + +from itertools import cycle +from model import BertTokenizer, HWBert, KGEModel, OD_model, KE_model +import torch.multiprocessing +from torch.nn.parallel import DistributedDataParallel + +# 默认用cuda就行 + + +class Runner: + def __init__(self, args, writer=None, logger=None, rank=0): + self.datapath = edict() + self.datapath.log_dir = get_dump_path(args) + 
self.datapath.model_dir = os.path.join(self.datapath.log_dir, 'model') + self.rank = rank + # init code + self.mlm_probability = args.mlm_probability + self.args = args + self.writer = writer + self.logger = logger + # 模型选择 + self.model_list = [] + self.model = HWBert(self.args) + # 数据加载。添加special_token,同时把模型的embedding layer进行resize + self.data_init() + self.model.cuda() + # 模型加载 + self.od_model, self.ke_model = None, None + self.scaler = GradScaler() + + # 只要不是第一种训练策略就有新模型 + if self.args.train_strategy >= 2: + self.ke_model = KE_model(self.args) + if self.args.train_strategy >= 3: + # TODO: 异常检测 + pass + if self.args.train_strategy >= 4: + self.od_model = OD_model(self.args) + + if self.args.model_name not in ['MacBert', 'TeleBert', 'TeleBert2', 'TeleBert3'] and not self.args.from_pretrain: + # 如果不存在模型会直接返回None或者原始模型 + self.model = self._load_model(self.model, self.args.model_name) + self.od_model = self._load_model(self.od_model, f"od_{self.args.model_name}") + self.ke_model = self._load_model(self.ke_model, f"ke_{self.args.model_name}") + # TODO: 异常检测 + + # 测试的情况 + if self.args.only_test: + self.dataloader_init(self.seq_test_set) + else: + # 训练 + if self.args.ernie_stratege > 0: + self.args.mask_stratege = 'rand' + # 初始化dataloader + self.dataloader_init(self.seq_train_set, self.kg_train_set, self.order_train_set) + if self.args.dist: + # 并行训练需要权值共享 + self.model_sync() + else: + self.model_list = [model for model in [self.model, self.od_model, self.ke_model] if model is not None] + + self.optim_init(self.args) + + def model_sync(self): + checkpoint_path = osp.join(self.args.data_path, "tmp", "initial_weights.pt") + checkpoint_path_od = osp.join(self.args.data_path, "tmp", "initial_weights_od.pt") + checkpoint_path_ke = osp.join(self.args.data_path, "tmp", "initial_weights_ke.pt") + if self.rank == 0: + torch.save(self.model.state_dict(), checkpoint_path) + if self.od_model is not None: + torch.save(self.od_model.state_dict(), checkpoint_path_od) + if self.ke_model is not None: + torch.save(self.ke_model.state_dict(), checkpoint_path_ke) + dist.barrier() + + # if self.rank != 0: + # 这里注意,一定要指定map_location参数,否则会导致第一块GPU占用更多资源 + self.model = self._model_sync(self.model, checkpoint_path) + if self.od_model is not None: + self.od_model = self._model_sync(self.od_model, checkpoint_path_od) + if self.ke_model is not None: + self.ke_model = self._model_sync(self.ke_model, checkpoint_path_ke) + + def _model_sync(self, model, checkpoint_path): + model.load_state_dict(torch.load(checkpoint_path, map_location=self.args.device)) + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(self.args.device) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[self.args.gpu], find_unused_parameters=True) + self.model_list.append(model) + model = model.module + return model + + def optim_init(self, opt, total_step=None, accumulation_step=None): + step_per_epoch = len(self.train_dataloader) + # 占总step 10% 的warmup_steps + opt.total_steps = int(step_per_epoch * opt.epoch) if total_step is None else int(total_step) + opt.warmup_steps = int(opt.total_steps * 0.15) + + if self.rank == 0 and total_step is None: + self.logger.info(f"warmup_steps: {opt.warmup_steps}") + self.logger.info(f"total_steps: {opt.total_steps}") + self.logger.info(f"weight_decay: {opt.weight_decay}") + + freeze_part = ['bert.encoder.layer.1.', 'bert.encoder.layer.2.', 'bert.encoder.layer.3.', 'bert.encoder.layer.4.'][:self.args.freeze_layer] + self.optimizer, self.scheduler = set_optim(opt, self.model_list, 
freeze_part, accumulation_step) + + def data_init(self): + # 载入数据, 两部分数据包括:载入mask loss部分的数据(序列化的数据) 和 载入triple loss部分的数据(三元组) + # train_test_split: 训练集长度 + self.seq_train_set, self.seq_test_set, self.kg_train_set, self.kg_data = None, None, None, None + self.order_train_set, self.order_test_set = None, None + + if self.args.train_strategy >= 1 and self.args.train_strategy <= 4: + # 预训练 or multi task pretrain + self.seq_train_set, self.seq_test_set, train_test_split = load_data(self.logger, self.args) + if self.args.train_strategy >= 2: + self.kg_train_set, self.kg_data = load_data_kg(self.logger, self.args) + if self.args.train_strategy >= 3: + # TODO: 异常检测的数据载入 + pass + if self.args.train_strategy >= 4: + self.order_train_set, self.order_test_set, train_test_split = load_order_data(self.logger, self.args) + + if self.args.dist and not self.args.only_test: + # 测试不需要并行 + if self.args.train_strategy >= 1 and self.args.train_strategy <= 4: + self.seq_train_sampler = torch.utils.data.distributed.DistributedSampler(self.seq_train_set) + if self.args.train_strategy >= 2: + self.kg_train_sampler = torch.utils.data.distributed.DistributedSampler(self.kg_train_set) + if self.args.train_strategy >= 3: + # TODO: 异常检测的数据载入 + pass + if self.args.train_strategy >= 4: + self.order_train_sampler = torch.utils.data.distributed.DistributedSampler(self.order_train_set) + + # self.seq_train_batch_sampler = torch.utils.data.BatchSampler(self.seq_train_sampler, self.args.batch_size, drop_last=True) + # self.kg_train_batch_sampler = torch.utils.data.BatchSampler(self.kg_train_sampler, int(self.args.batch_size / 4), drop_last=True) + + # Tokenizer 载入 + model_name = self.args.model_name + if self.args.model_name in ['TeleBert', 'TeleBert2', 'TeleBert3']: + self.tokenizer = BertTokenizer.from_pretrained(osp.join(self.args.data_root, 'transformer', model_name), do_lower_case=True) + else: + if not osp.exists(osp.join(self.args.data_root, 'transformer', self.args.model_name)): + model_name = 'MacBert' + self.tokenizer = BertTokenizer.from_pretrained(osp.join(self.args.data_root, 'transformer', model_name), do_lower_case=True) + + # 添加special_token,同时把模型的embedding layer进行resize + self.special_token = None + # 单纯的telebert在测试时不需要特殊embedding + if self.args.add_special_word and not (self.args.only_test and self.args.model_name in ['MacBert', 'TeleBert', 'TeleBert2', 'TeleBert3']): + # tokenizer, special_token, norm_token + # special_token 不应该被MASK + self.tokenizer, special_token, _ = add_special_token(self.tokenizer, model=self.model.encoder, rank=self.rank, cache_path=self.args.specail_emb_path) + # pdb.set_trace() + self.special_token = [token.lower() for token in special_token] + + def _dataloader_dist(self, train_set, train_sampler, batch_size, collator): + train_dataloader = DataLoader( + train_set, + sampler=train_sampler, + pin_memory=True, + num_workers=self.args.workers, + persistent_workers=True, + drop_last=True, + batch_size=batch_size, + collate_fn=collator + ) + return train_dataloader + + def _dataloader(self, train_set, batch_size, collator): + train_dataloader = DataLoader( + train_set, + num_workers=self.args.workers, + persistent_workers=True, + shuffle=(self.args.only_test == 0), + drop_last=(self.args.only_test == 0), + batch_size=batch_size, + collate_fn=collator + ) + return train_dataloader + + def dataloader_init(self, train_set=None, kg_train_set=None, order_train_set=None): + bs = self.args.batch_size + bs_ke = self.args.batch_size_ke + bs_od = self.args.batch_size_od + bs_ad = 
self.args.batch_size_ad + # 分布式 + if self.args.dist and not self.args.only_test: + self.args.workers = min([os.cpu_count(), self.args.batch_size, self.args.workers]) + # if self.rank == 0: + # print(f'Using {self.args.workers} dataloader workers every process') + + if train_set is not None: + seq_collator = Collator_base(self.args, tokenizer=self.tokenizer, special_token=self.special_token) + self.train_dataloader = self._dataloader_dist(train_set, self.seq_train_sampler, bs, seq_collator) + if kg_train_set is not None: + kg_collator = Collator_kg(self.args, tokenizer=self.tokenizer, data=self.kg_data) + self.train_dataloader_kg = self._dataloader_dist(kg_train_set, self.kg_train_sampler, bs_ke, kg_collator) + if order_train_set is not None: + order_collator = Collator_order(self.args, tokenizer=self.tokenizer) + self.train_dataloader_order = self._dataloader_dist(order_train_set, self.order_train_sampler, bs_od, order_collator) + else: + if train_set is not None: + seq_collator = Collator_base(self.args, tokenizer=self.tokenizer, special_token=self.special_token) + self.train_dataloader = self._dataloader(train_set, bs, seq_collator) + if kg_train_set is not None: + kg_collator = Collator_kg(self.args, tokenizer=self.tokenizer, data=self.kg_data) + self.train_dataloader_kg = self._dataloader(kg_train_set, bs_ke, kg_collator) + if order_train_set is not None: + order_collator = Collator_order(self.args, tokenizer=self.tokenizer) + self.train_dataloader_order = self._dataloader(order_train_set, bs_od, order_collator) + + def dist_step(self, task=0): + # 分布式训练需要额外step + if self.args.dist: + if task == 0: + self.seq_train_sampler.set_epoch(self.dist_epoch) + if task == 1: + self.kg_train_sampler.set_epoch(self.dist_epoch) + if task == 2: + # TODO:异常检测 + pass + if task == 3: + self.order_train_sampler.set_epoch(self.dist_epoch) + self.dist_epoch += 1 + + def mask_rate_update(self, i): + # 这种策略是曲线地增加 mask rate + if self.args.mlm_probability_increase == "curve": + self.args.mlm_probability += (i + 1) * ((self.args.final_mlm_probability - self.args.mlm_probability) / self.args.epoch) + # 这种是线性的 + else: + assert self.args.mlm_probability_increase == "linear" + self.args.mlm_probability += (self.args.final_mlm_probability - self.mlm_probability) / self.args.epoch + + if self.rank == 0: + self.logger.info(f"Moving Mlm_probability in next epoch to: {self.args.mlm_probability*100}%") + + def task_switch(self, training_strategy): + # 同时训练或者策略1训练不需要切换任务,epoch也安装初始epoch就行 + if training_strategy == 1 or self.args.train_together: + return (0, 0), None + + # 4 阶段 + # self.total_epoch -= 1 + + for i in range(4): + for task in range(training_strategy): + if self.args.epoch_matrix[task][i] > 0: + self.args.epoch_matrix[task][i] -= 1 + return (task, i), self.args.epoch_matrix[task][i] + 1 + + def run(self): + self.loss_log = Loss_log() + self.curr_loss = 0. 
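+        # running statistics for the training loop; accumulated losses are flushed to tensorboard every eval_step steps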
+ self.lr = self.args.lr + self.curr_loss_dic = defaultdict(float) + self.curr_kpi_loss_dic = defaultdict(float) + self.loss_weight = [1, 1] + self.kpi_loss_weight = [1, 1] + self.step = 0 + # 不同task 的累计step + self.total_step_sum = 0 + task_last = 0 + stage_last = 0 + self.dist_epoch = 0 + # 后面可以变成混合训练模式 + # self.total_epoch = self.args.epoch + # --------- train ------------- + with tqdm(total=self.args.epoch) as _tqdm: # 使用需要的参数对tqdm进行初始化 + for i in range(self.args.epoch): + # 切换Task + (task, stage), task_epoch = self.task_switch(self.args.train_strategy) + self.dist_step(task) + dataloader = self.task_dataloader_choose(task) + # 并行 + if self.args.train_together and self.args.train_strategy > 1: + self.dataloader_list = ['#'] + # 一个list 存下所有需要的dataloader的迭代 + for t in range(1, self.args.train_strategy): + self.dist_step(t) + self.dataloader_list.append(iter(self.task_dataloader_choose(t))) + + if task != task_last or stage != stage_last: + self.step = 0 + if self.rank == 0: + print(f"switch to task [{task}] in stage [{stage}]...") + if stage != stage_last: + # 每一个阶段结束保存一次 + self._save_model(stage=f'_stg{stage_last}') + # task 转换状态时需要重新初始化优化器 + # 并行训练或者单一task (0) 训练不需要切换opti + if task_epoch is not None: + self.optim_init(self.args, total_step=len(dataloader) * task_epoch, accumulation_step=self.args.accumulation_steps_dict[task]) + task_last = task + stage_last = stage + + # 调整学习阶段 + if task == 0 and self.args.ernie_stratege > 0 and i >= self.args.ernie_stratege: + # 不会再触发第二次 + self.args.ernie_stratege = 10000000 + if self.rank == 0: + self.logger.info("switch to wwm stratege...") + self.args.mask_stratege = 'wwm' + + if self.args.mlm_probability != self.args.final_mlm_probability: + # 更新 MASK rate + # 初始化训练数据, 可以随epoch切换 + # 混合训练 + self.mask_rate_update(i) + self.dataloader_init(self.seq_train_set, self.kg_train_set, self.order_train_set) + # ------------------------------- + # 针对task 进行训练 + self.train(_tqdm, dataloader, task) + # ------------------------------- + _tqdm.update(1) + + # DONE: save or load + if self.rank == 0: + self.logger.info(f"min loss {self.loss_log.get_min_loss()}") + # DONE: save or load + if not self.args.only_test and self.args.save_model: + self._save_model() + + def task_dataloader_choose(self, task): + self.model.train() + # 同时训练就用基础dataloader就行 + if task == 0: + dataloader = self.train_dataloader + elif task == 1: + self.ke_model.train() + dataloader = self.train_dataloader_kg + elif task == 2: + pass + elif task == 3: + self.od_model.train() + dataloader = self.train_dataloader_order + return dataloader + # one time train + + def loss_output(self, batch, task): + # -------- 模型输出 loss -------- + if task == 0: + # 输出 + _output = self.model(batch) + loss = _output['loss'] + elif task == 1: + loss = self.ke_model(batch, self.model) + elif task == 2: + pass + elif task == 3: + # TODO: finetune的时候多任务 accumulation_steps 自适应 + # OD task + emb = self.model.cls_embedding(batch[0], tp=self.args.plm_emb_type) + loss, loss_dic = self.od_model(emb, batch[1].cuda()) + order_score = self.od_model.predict(emb) + token_right = self.od_model.right_caculate(order_score, batch[1], threshold=0.5) + self.loss_log.update_token(batch[1].shape[0], [token_right]) + return loss + + def train(self, _tqdm, dataloader, task=0): + # cycle train + loss_weight, kpi_loss_weight, kpi_loss_dict, _output = None, None, None, None + # dataloader = zip(self.train_dataloader, cycle(self.train_dataloader_kg)) + self.loss_log.acc_init() + # 如果self.train_dataloader比self.train_dataloader_kg长则会使得后者训练不完全 + 
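+        # i.e. when train_dataloader is longer than train_dataloader_kg, the KG data is not fully covered in one epoch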
accumulation_steps = self.args.accumulation_steps_dict[task] + torch.cuda.empty_cache() + + for batch in dataloader: + # with autocast(): + loss = self.args.mask_loss_scale * self.loss_output(batch, task) + # 如果是同时训练的话使用迭代器的方法得到另外的epoch + if self.args.train_together and self.args.train_strategy > 1: + for t in range(1, self.args.train_strategy): + try: + batch = next(self.dataloader_list[t]) + except StopIteration: + self.dist_step(t) + self.dataloader_list[t] = iter(self.task_dataloader_choose(t)) + batch = next(self.dataloader_list[t]) + # 选择对应的模型得到loss + # torch.cuda.empty_cache() + loss += self.loss_output(batch, t) + # torch.cuda.empty_cache() + loss = loss / accumulation_steps + self.scaler.scale(loss).backward() + # loss.backward() + if self.args.dist: + loss = reduce_value(loss, average=True) + # torch.cuda.empty_cache() + self.step += 1 + self.total_step_sum += 1 + + # -------- 模型统计 -------- + if not self.args.dist or is_main_process(): + self.output_statistic(loss, _output) + acc_descrip = f"Acc: {self.loss_log.get_token_acc()}" if self.loss_log.get_token_acc() > 0 else "" + _tqdm.set_description(f'Train | step [{self.step}/{self.args.total_steps}] {acc_descrip} LR [{self.lr}] Loss {self.loss_log.get_loss():.5f} ') + if self.step % self.args.eval_step == 0 and self.step > 0: + self.loss_log.update(self.curr_loss) + self.update_loss_log() + # -------- 梯度累计与模型更新 -------- + if self.step % accumulation_steps == 0 and self.step > 0: + # 更新优化器 + self.scaler.unscale_(self.optimizer) + for model in self.model_list: + torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.clip) + + # self.optimizer.step() + scale = self.scaler.get_scale() + self.scaler.step(self.optimizer) + + self.scaler.update() + skip_lr_sched = (scale > self.scaler.get_scale()) + if not skip_lr_sched: + # pdb.set_trace() + self.scheduler.step() + + if not self.args.dist or is_main_process(): + # pdb.set_trace() + self.lr = self.scheduler.get_last_lr()[-1] + self.writer.add_scalars("lr", {"lr": self.lr}, self.total_step_sum) + # 模型update + for model in self.model_list: + model.zero_grad(set_to_none=True) + + if self.args.dist: + torch.cuda.synchronize(self.args.device) + return self.curr_loss, self.curr_loss_dic + + def output_statistic(self, loss, output): + # 统计模型的各种输出 + self.curr_loss += loss.item() + if output is None: + return + for key in output['loss_dic'].keys(): + self.curr_loss_dic[key] += output['loss_dic'][key] + if 'kpi_loss_dict' in output and output['kpi_loss_dict'] is not None: + for key in output['kpi_loss_dict'].keys(): + self.curr_kpi_loss_dic[key] += output['kpi_loss_dict'][key] + if 'loss_weight' in output and output['loss_weight'] is not None: + self.loss_weight = output['loss_weight'] + # 需要用dict来判断 + if 'kpi_loss_weight' in output and output['kpi_loss_weight'] is not None: + self.kpi_loss_weight = output['kpi_loss_weight'] + + def update_loss_log(self, task=0): + # 把统计的模型各种输出存下来 + # https://zhuanlan.zhihu.com/p/382950853 + # "mask_loss": self.curr_loss_dic['mask_loss'], "ke_loss": self.curr_loss_dic['ke_loss'] + vis_dict = {"train_loss": self.curr_loss} + vis_dict.update(self.curr_loss_dic) + self.writer.add_scalars("loss", vis_dict, self.total_step_sum) + if self.loss_weight is not None: + # 预训练 + loss_weight_dic = {} + if self.args.train_strategy == 1: + loss_weight_dic["mask"] = 1 / (self.loss_weight[0]**2) + if self.args.use_NumEmb: + loss_weight_dic["kpi"] = 1 / (self.loss_weight[1]**2) + vis_kpi_dic = {"recover": 1 / (self.kpi_loss_weight[0]**2), "classifier": 1 / 
(self.kpi_loss_weight[1]**2)} + if self.args.contrastive_loss and len(self.kpi_loss_weight) > 2: + vis_kpi_dic.update({"contrastive": 1 / (self.kpi_loss_weight[2]**2)}) + self.writer.add_scalars("kpi_loss_weight", vis_kpi_dic, self.total_step_sum) + self.writer.add_scalars("kpi_loss", self.curr_kpi_loss_dic, self.total_step_sum) + self.writer.add_scalars("loss_weight", loss_weight_dic, self.total_step_sum) + # TODO: Finetune + + # init log loss + self.curr_loss = 0. + for key in self.curr_loss_dic: + self.curr_loss_dic[key] = 0. + if len(self.curr_kpi_loss_dic) > 0: + for key in self.curr_kpi_loss_dic: + self.curr_kpi_loss_dic[key] = 0. + + # TODO: Finetune 阶段 + def eval(self): + self.model.eval() + torch.cuda.empty_cache() + + def mask_test(self, test_log): + # 如果大于1 就无法mask测试 + assert self.args.train_ratio < 1 + topk = (1, 100, 500) + test_log.acc_init(topk) + # 做 mask 预测的时候需要进入训练模式,以获得随机mask的token + self.args.only_test = 0 + self.dataloader_init(self.seq_test_set) + # pdb.set_trace() + sz_test = len(self.train_dataloader) + loss_sum = 0 + with tqdm(total=sz_test) as _tqdm: # 使用需要的参数对tqdm进行初始化 + for step, batch in enumerate(self.train_dataloader): + # DONE: 写好mask_prediction实现mask预测 + with torch.no_grad(): + token_num, token_right, loss = self.model.mask_prediction(batch, len(self.tokenizer), topk) + test_log.update_token(token_num, token_right) + loss_sum += loss + # test_log.update_word(word_num, word_right) + _tqdm.update(1) + _tqdm.set_description(f'Test | step [{step}/{sz_test}] Top{topk} Token_Acc: {test_log.get_token_acc()}') + print(f"perplexity: {loss_sum}") + # 训练模式复位 + self.args.only_test = 1 + # if topk is not None: + print(f"Top{topk} acc is {test_log.get_token_acc()}") + + def emb_generate(self, path_gen): + assert len(self.args.path_gen) > 0 or path_gen is not None + data_path = self.args.data_path + if path_gen is None: + path_gen = self.args.path_gen + with open(osp.join(data_path, 'downstream_task', f'{path_gen}.json'), "r") as fp: + data = json.load(fp) + print(f"read file {path_gen} done!") + test_set = SeqDataset(data) + self.dataloader_init(test_set) + sz_test = len(self.train_dataloader) + all_emb_dic = defaultdict(list) + emb_output = {} + all_emb_ent = [] + # tps = ['cls', 'last_avg', 'last2avg', 'last3avg', 'first_last_avg'] + tps = ['cls', 'last_avg'] + # with tqdm(total=sz_test) as _tqdm: + for step, batch in enumerate(self.train_dataloader): + for tp in tps: + with torch.no_grad(): + batch_embedding = self.model.cls_embedding(batch, tp=tp) + # batch_embedding = self.model.cls_embedding(batch, tp=tp) + if tp in self.args.model_name and self.ke_model is not None: + batch_embedding_ent = self.ke_model.get_embedding(batch_embedding, is_ent=True) + # batch_embedding_ent = self.ke_model(batch, self.model) + batch_embedding_ent = batch_embedding_ent.cpu() + all_emb_ent.append(batch_embedding_ent) + + batch_embedding = batch_embedding.cpu() + all_emb_dic[tp].append(batch_embedding) + # _tqdm.update(1) + # _tqdm.set_description(f'Test | step [{step}/{sz_test}]') + torch.cuda.empty_cache() + for tp in tps: + emb_output[tp] = torch.cat(all_emb_dic[tp]) + assert emb_output[tp].shape[0] == len(data) + if len(all_emb_ent) > 0: + emb_output_ent = torch.cat(all_emb_ent) + # 后缀 + save_path = osp.join(data_path, 'downstream_task', 'output') + os.makedirs(save_path, exist_ok=True) + for tp in tps: + save_dir = osp.join(save_path, f'{path_gen}_emb_{self.args.model_name.replace("DistributedDataParallel", "")}_{tp}.pt') + torch.save(emb_output[tp], save_dir) + # 
有训练好的实体embedding可使用 + if len(all_emb_ent) > 0: + save_dir = osp.join(save_path, f'{path_gen}_emb_{self.args.model_name.replace("DistributedDataParallel", "")}_ent.pt') + torch.save(emb_output_ent, save_dir) + + def KGE_test(self): + # 直接用KG全集进行kge的测试 + sz_test = len(self.kg_train_set) + # 先转换数据 + ent_set = set() + rel_set = set() + with tqdm(total=sz_test) as _tqdm: # 使用需要的参数对tqdm进行初始化 + _tqdm.set_description('trans entity/relation ID') + for batch in self.kg_train_set: + ent_set.add(batch[0]) + ent_set.add(batch[2]) + rel_set.add(batch[1]) + _tqdm.update(1) + all_ent, all_rel = list(ent_set), list(rel_set) + nent, nrel = len(all_ent), len(all_rel) + ent_dic, rel_dic = {}, {} + for i in range(nent): + ent_dic[all_ent[i]] = i + for i in range(nrel): + rel_dic[all_rel[i]] = i + id_format_triple = [] + with tqdm(total=sz_test) as _tqdm: + _tqdm.set_description('trans triple ID') + for triple in self.kg_train_set: + id_format_triple.append((ent_dic[triple[0]], rel_dic[triple[1]], ent_dic[triple[2]])) + _tqdm.update(1) + + # pdb.set_trace() + # 生成实体embedding并且保存 + ent_dataset = KGDataset(all_ent) + rel_dataset = KGDataset(all_rel) + + ent_dataloader = DataLoader( + ent_dataset, + batch_size=self.args.batch_size * 32, + num_workers=self.args.workers, + persistent_workers=True, + shuffle=False + ) + rel_dataloader = DataLoader( + rel_dataset, + batch_size=self.args.batch_size * 32, + num_workers=self.args.workers, + persistent_workers=True, + shuffle=False + ) + + sz_test = len(ent_dataloader) + len(rel_dataloader) + with tqdm(total=sz_test) as _tqdm: + ent_emb = [] + rel_emb = [] + step = 0 + _tqdm.set_description('get the ent embedding') + with torch.no_grad(): + for batch in ent_dataloader: + batch = self.tokenizer.batch_encode_plus( + batch, + padding='max_length', + max_length=self.args.maxlength, + truncation=True, + return_tensors="pt", + return_token_type_ids=False, + return_attention_mask=True, + add_special_tokens=False + ) + + batch_emb = self.model.cls_embedding(batch, tp=self.args.plm_emb_type) + batch_emb = self.ke_model.get_embedding(batch_emb, is_ent=True) + + ent_emb.append(batch_emb.cpu()) + _tqdm.update(1) + step += 1 + torch.cuda.empty_cache() + _tqdm.set_description(f'ENT emb: [{step}/{sz_test}]') + + _tqdm.set_description('get the rel embedding') + for batch in rel_dataloader: + batch = self.tokenizer.batch_encode_plus( + batch, + padding='max_length', + max_length=self.args.maxlength, + truncation=True, + return_tensors="pt", + return_token_type_ids=False, + return_attention_mask=True, + add_special_tokens=False + ) + batch_emb = self.model.cls_embedding(batch, tp=self.args.plm_emb_type) + batch_emb = self.ke_model.get_embedding(batch_emb, is_ent=False) + # batch_emb = self.model.get_embedding(batch, is_ent=False) + rel_emb.append(batch_emb.cpu()) + _tqdm.update(1) + step += 1 + torch.cuda.empty_cache() + _tqdm.set_description(f'REL emb: [{step}/{sz_test}]') + + all_ent_emb = torch.cat(ent_emb).cuda() + all_rel_emb = torch.cat(rel_emb).cuda() + # embedding:emb_output + # dim 256 + kge_model_for_test = KGEModel(nentity=len(all_ent), nrelation=len(all_rel), hidden_dim=self.args.ke_dim, + gamma=self.args.ke_margin, entity_embedding=all_ent_emb, relation_embedding=all_rel_emb).cuda() + if self.args.ke_test_num > 0: + test_triples = id_format_triple[:self.args.ke_test_num] + else: + test_triples = id_format_triple + with torch.no_grad(): + metrics = kge_model_for_test.test_step(test_triples=test_triples, all_true_triples=id_format_triple, args=self.args, nentity=len(all_ent), 
nrelation=len(all_rel)) + # pdb.set_trace() + print(f"result:{metrics}") + + def OD_test(self): + # data_path = self.args.data_path + # with open(osp.join(data_path, f'{self.args.order_test_name}.json'), "r") as fp: + # data = json.load(fp) + self.od_model.eval() + test_log = Loss_log() + test_log.acc_init() + sz_test = len(self.train_dataloader) + all_emb_ent = [] + with tqdm(total=sz_test) as _tqdm: # 使用需要的参数对tqdm进行初始化 + for step, batch in enumerate(self.train_dataloader): + with torch.no_grad(): + emb = self.model.cls_embedding(batch[0], tp=self.args.plm_emb_type) + out_emb = self.od_model.encode(emb) + emb_cpu = out_emb.cpu() + all_emb_ent.append(emb_cpu) + order_score = self.od_model.predict(emb) + token_right = self.od_model.right_caculate(order_score, batch[1], threshold=self.args.order_threshold) + test_log.update_token(batch[1].shape[0], [token_right]) + _tqdm.update(1) + _tqdm.set_description(f'Test | step [{step}/{sz_test}] Acc: {test_log.get_token_acc()}') + + emb_output = torch.cat(all_emb_ent) + data_path = self.args.data_path + save_path = osp.join(data_path, 'downstream_task', 'output') + os.makedirs(save_path, exist_ok=True) + save_dir = osp.join(save_path, f'ratio{self.args.train_ratio}_{emb_output.shape[0]}emb_{self.args.model_name.replace("DistributedDataParallel", "")}.pt') + torch.save(emb_output, save_dir) + print(f"save {emb_output.shape[0]} embeddings done...") + + @ torch.no_grad() + def test(self, path_gen=None): + test_log = Loss_log() + self.model.eval() + if not (self.args.mask_test or self.args.embed_gen or self.args.ke_test or len(self.args.order_test_name) > 0): + return + if self.args.mask_test: + self.mask_test(test_log) + if self.args.embed_gen: + self.emb_generate(path_gen) + if self.args.ke_test: + self.KGE_test() + if len(self.args.order_test_name) > 0: + runner.OD_test() + + def _load_model(self, model, name): + if model is None: + return None + # 没有训练过 + _name = name if name[:3] not in ['od_', 'ke_'] else name[3:] + save_path = osp.join(self.args.data_path, 'save', _name) + save_name = osp.join(save_path, f'{name}.pkl') + if not osp.exists(save_path) or not osp.exists(save_name): + return model.cuda() + # 载入模型 + if 'Distribute' in self.args.model_name: + model.load_state_dict({k.replace('module.', ''): v for k, v in torch.load(os.path.join(save_name), map_location=self.args.device).items()}) + else: + model.load_state_dict(torch.load(save_name, map_location=self.args.device)) + model.cuda() + if self.rank == 0: + print(f"loading model [{name}.pkl] done!") + + return model + + def _save_model(self, stage=''): + model_name = type(self.model).__name__ + # TODO: path + save_path = osp.join(self.args.data_path, 'save') + os.makedirs(save_path, exist_ok=True) + if self.args.train_strategy == 1: + save_name = f'{self.args.exp_name}_{self.args.exp_id}_s{self.args.random_seed}{stage}' + else: + save_name = f'{self.args.exp_name}_{self.args.exp_id}_s{self.args.random_seed}_{self.args.plm_emb_type}{stage}' + save_path = osp.join(save_path, save_name) + os.makedirs(save_path, exist_ok=True) + # 预训练模型保存 + self._save(self.model, save_path, save_name) + + # 下游模型保存 + save_name_od = f'od_{save_name}' + self._save(self.od_model, save_path, save_name_od) + save_name_ke = f'ke_{save_name}' + self._save(self.ke_model, save_path, save_name_ke) + return save_path + + def _save(self, model, save_path, save_name): + if model is None: + return + if self.args.save_model: + torch.save(model.state_dict(), osp.join(save_path, f'{save_name}.pkl')) + print(f"saving {save_name} 
done!") + + if self.args.save_pretrain and not save_name.startswith('od_') and not save_name.startswith('ke_'): + self.tokenizer.save_pretrained(osp.join(self.args.plm_path, f'{save_name}')) + self.model.encoder.save_pretrained(osp.join(self.args.plm_path, f'{save_name}')) + print(f"saving [pretrained] {save_name} done!") + + +if __name__ == '__main__': + cfg = cfg() + cfg.get_args() + cfgs = cfg.update_train_configs() + set_seed(cfgs.random_seed) + # 初始化各进程环境 + # pdb.set_trace() + if cfgs.dist and not cfgs.only_test: + init_distributed_mode(args=cfgs) + # cfgs.lr *= cfgs.world_size + # cfgs.ke_lr *= cfgs.world_size + else: + # 下面这条语句在并行的时候可能内存泄漏,导致无法停止 + torch.multiprocessing.set_sharing_strategy('file_system') + rank = cfgs.rank + + writer, logger = None, None + if rank == 0: + # 如果并行则只有一种情况打印 + logger = initialize_exp(cfgs) + logger_path = get_dump_path(cfgs) + cfgs.time_stamp = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now()) + comment = f'bath_size={cfgs.batch_size} exp_id={cfgs.exp_id}' + if not cfgs.no_tensorboard and not cfgs.only_test: + writer = SummaryWriter(log_dir=os.path.join(logger_path, 'tensorboard', cfgs.time_stamp), comment=comment) + + cfgs.device = torch.device(cfgs.device) + + # ----- Begin ---------- + runner = Runner(cfgs, writer, logger, rank) + + if cfgs.only_test: + if cfgs.embed_gen: + # 不需要生成的先搞定 + if cfgs.mask_test or cfgs.ke_test: + runner.args.embed_gen = 0 + runner.test() + runner.args.embed_gen = 1 + # gen_dir = ['yht_data_merge', 'yht_data_whole5gc', 'yz_data_whole5gc', 'yz_data_merge', 'zyc_data_merge', 'zyc_data_whole5gc'] + gen_dir = ['yht_serialize_withAttribute', 'yht_serialize_withoutAttr', 'yht_name_serialize', 'zyc_serialize_withAttribute', 'zyc_serialize_withoutAttr', 'zyc_name_serialize', + 'yz_serialize_withAttribute', 'yz_serialize_withoutAttr', 'yz_name_serialize', 'yz_serialize_net'] + # gen_dir = ['zyc_serialize_withAttribute', 'zyc_normal_serialize', 'zyc_data_whole5gc', 'zyc_data_merge', 'yht_normal_serialize', + # 'yht_serialize_withAttribute', 'yz_serialize_withAttribute', 'yz_serialize_net', 'yz_normal_serialize'] + runner.args.mask_test, runner.args.ke_test = 0, 0 + for item in gen_dir: + runner.test(item) + else: + runner.test() + else: + runner.run() + + # ----- End ---------- + if not cfgs.no_tensorboard and not cfgs.only_test and rank == 0: + writer.close() + logger.info("done!") + + if cfgs.dist and not cfgs.only_test: + dist.barrier() + dist.destroy_process_group() + # print("shut down...") diff --git a/KTeleBERT/model/HWBert.py b/KTeleBERT/model/HWBert.py new file mode 100644 index 0000000000000000000000000000000000000000..093f857d453988293edbf1964b20fcecc94ec413 --- /dev/null +++ b/KTeleBERT/model/HWBert.py @@ -0,0 +1,146 @@ +import os +import os.path as osp +import pdb +import torch +import torch.nn as nn +import numpy as np +from random import * +import json +from packaging import version +import torch.distributed as dist + +from .Tool_model import AutomaticWeightedLoss +from .Numeric import AttenNumeric +from .KE_model import KE_model +# from modeling_transformer import Transformer + + +from .bert import BertModel, BertTokenizer, BertForMaskedLM, BertConfig +import torch.nn.functional as F + +from copy import deepcopy +from src.utils import torch_accuracy +# 4.21.2 + + +def debug(input, kk, begin=None): + aaa = deepcopy(input[0]) + if begin is None: + aaa.input_ids = input[0].input_ids[:kk] + aaa.attention_mask = input[0].attention_mask[:kk] + aaa.chinese_ref = input[0].chinese_ref[:kk] + aaa.kpi_ref = input[0].kpi_ref[:kk] 
+ aaa.labels = input[0].labels[:kk] + else: + aaa.input_ids = input[0].input_ids[begin:kk] + aaa.attention_mask = input[0].attention_mask[begin:kk] + aaa.chinese_ref = input[0].chinese_ref[begin:kk] + aaa.kpi_ref = input[0].kpi_ref[begin:kk] + aaa.labels = input[0].labels[begin:kk] + + return aaa + + +class HWBert(nn.Module): + def __init__(self, args): + super().__init__() + self.loss_awl = AutomaticWeightedLoss(args.awl_num, args) + self.args = args + self.config = BertConfig() + model_name = args.model_name + if args.model_name in ['TeleBert', 'TeleBert2', 'TeleBert3']: + self.encoder = BertForMaskedLM.from_pretrained(osp.join(args.data_root, 'transformer', model_name)) + # MacBert来初始化 predictions layer + if args.cls_head_init: + tmp = BertForMaskedLM.from_pretrained(osp.join(args.data_root, 'transformer', 'MacBert')) + self.encoder.cls.predictions = tmp.cls.predictions + else: + if not osp.exists(osp.join(args.data_root, 'transformer', args.model_name)): + model_name = 'MacBert' + self.encoder = BertForMaskedLM.from_pretrained(osp.join(args.data_root, 'transformer', model_name)) + self.numeric_model = AttenNumeric(self.args) + + # ----------------------- 主forward函数 ---------------------------------- + def forward(self, input): + mask_loss, kpi_loss, kpi_loss_weight, kpi_loss_dict = self.mask_forward(input) + mask_loss = mask_loss.loss + loss_dic = {} + if not self.args.use_kpi_loss: + kpi_loss = None + if kpi_loss is not None: + loss_sum = self.loss_awl(mask_loss, 0.3 * kpi_loss) + loss_dic['kpi_loss'] = kpi_loss.item() + else: + loss_sum = self.loss_awl(mask_loss) + loss_dic['mask_loss'] = mask_loss.item() + return { + 'loss': loss_sum, + 'loss_dic': loss_dic, + 'loss_weight': self.loss_awl.params.tolist(), + 'kpi_loss_weight': kpi_loss_weight, + 'kpi_loss_dict': kpi_loss_dict + } + + # loss_sum, loss_dic, self.loss_awl.params.tolist(), kpi_loss_weight, kpi_loss_dict + + # ---------------------------------------------------------------- + # 测试代码,计算mask是否正确 + def mask_prediction(self, inputs, tokenizer_sz, topk=(1,)): + token_num, token_right, word_num, word_right = None, None, None, None + outputs, kpi_loss, kpi_loss_weight, kpi_loss_dict = self.mask_forward(inputs) + inputs = inputs['labels'].view(-1) + input_list = inputs.tolist() + # 被修改的词 + change_token_index = [i for i, x in enumerate(input_list) if x != -100] + change_token = torch.tensor(change_token_index) + inputs_used = inputs[change_token] + pred = outputs.logits.view(-1, tokenizer_sz) + pred_used = pred[change_token].cpu() + # 返回的list + # 计算acc + acc, token_right = torch_accuracy(pred_used, inputs_used, topk) + # 计算混乱分数 + + token_num = inputs_used.shape[0] + # TODO: 添加word_num, word_right + # token_right:list + return token_num, token_right, outputs.loss.item() + + def mask_forward(self, inputs): + kpi_ref = None + if 'kpi_ref' in inputs: + kpi_ref = inputs['kpi_ref'] + + outputs, kpi_loss, kpi_loss_weight, kpi_loss_dict = self.encoder( + input_ids=inputs['input_ids'].cuda(), + attention_mask=inputs['attention_mask'].cuda(), + # token_type_ids=inputs.token_type_ids.cuda(), + labels=inputs['labels'].cuda(), + kpi_ref=kpi_ref, + kpi_model=self.numeric_model + ) + return outputs, kpi_loss, kpi_loss_weight, kpi_loss_dict + + # TODO: 垂直注意力考虑:https://github.com/lucidrains/axial-attention + + def cls_embedding(self, inputs, tp='cls'): + hidden_states = self.encoder( + input_ids=inputs['input_ids'].cuda(), + attention_mask=inputs['attention_mask'].cuda(), + output_hidden_states=True)[0].hidden_states + if tp == 'cls': + return 
hidden_states[-1][:, 0] + else: + index_real = torch.tensor(inputs['input_ids'].clone().detach(), dtype=torch.bool) + res = [] + for i in range(hidden_states[-1].shape[0]): + if tp == 'last_avg': + res.append(hidden_states[-1][i][index_real[i]][:-1].mean(dim=0)) + elif tp == 'last2avg': + res.append((hidden_states[-1][i][index_real[i]][:-1] + hidden_states[-2][i][index_real[i]][:-1]).mean(dim=0)) + elif tp == 'last3avg': + res.append((hidden_states[-1][i][index_real[i]][:-1] + hidden_states[-2][i][index_real[i]][:-1] + hidden_states[-3][i][index_real[i]][:-1]).mean(dim=0)) + elif tp == 'first_last_avg': + res.append((hidden_states[-1][i][index_real[i]][:-1] + hidden_states[1][i][index_real[i]][:-1]).mean(dim=0)) + + return torch.stack(res) diff --git a/KTeleBERT/model/KE_model.py b/KTeleBERT/model/KE_model.py new file mode 100644 index 0000000000000000000000000000000000000000..e3b05b3db98beed2b69499d8ec7f980131b92f4d --- /dev/null +++ b/KTeleBERT/model/KE_model.py @@ -0,0 +1,451 @@ +import torch +from torch import nn +import torch.nn.functional as F +import numpy as np +from sklearn.metrics import average_precision_score +from tqdm import tqdm +import pdb +from torch.utils.data import DataLoader +from collections import defaultdict +import os.path as osp +import json + + +class KE_model(nn.Module): + def __init__(self, args): + super().__init__() + """ + triple task: mask tail entity, total entity size-class classification + """ + """ + :param hidden: BERT model output size + """ + self.args = args + self.ke_dim = args.ke_dim + + self.linear_ent = nn.Linear(args.hidden_size, self.ke_dim) + self.linear_rel = nn.Linear(args.hidden_size, self.ke_dim) + + self.ke_margin = nn.Parameter( + torch.Tensor([args.ke_margin]), + requires_grad=False + ) + + def forward(self, batch, hw_model): + batch_triple = batch + pos_sample = batch_triple["positive_sample"] + neg_sample = batch_triple["negative_sample"] + neg_index = batch_triple["neg_index"] + + # 节省显存 + all_entity = [] + all_entity_mask = [] + for i in range(3): + all_entity.append(pos_sample[i]['input_ids']) + all_entity_mask.append(pos_sample[i]['attention_mask']) + + all_entity = torch.cat(all_entity) + all_entity_mask = torch.cat(all_entity_mask) + entity_data = {'input_ids':all_entity, 'attention_mask':all_entity_mask} + entity_emb = hw_model.cls_embedding(entity_data, tp=self.args.plm_emb_type) + + bs = pos_sample[0]['input_ids'].shape[0] + pos_sample_emb= [entity_emb[:bs], entity_emb[bs:2*bs], entity_emb[2*bs:3*bs]] + neg_sample_emb = entity_emb[neg_index] + mode = batch_triple["mode"] + # pos_score = self.get_score(pos_sample, hw_model) + # neg_score = self.get_score(pos_sample, hw_model, neg_sample, mode) + pos_score = self.get_score(pos_sample_emb, hw_model) + neg_score = self.get_score(pos_sample_emb, hw_model, neg_sample_emb, mode) + triple_loss = self.adv_loss(pos_score, neg_score, self.args) + + return triple_loss + + # pdb.set_trace() + # return emb.div_(emb.detach().norm(p=1, dim=-1, keepdim=True)) + +# KE loss + def tri2emb(self, triples, hw_model, negs=None, mode="single"): + """Get embedding of triples. + This function get the embeddings of head, relation, and tail + respectively. each embedding has three dimensions. + Args: + triples (tensor): This tensor save triples id, which dimension is + [triples number, 3]. + negs (tensor, optional): This tenosr store the id of the entity to + be replaced, which has one dimension. when negs is None, it is + in the test/eval phase. Defaults to None. 
+ mode (str, optional): This arg indicates that the negative entity + will replace the head or tail entity. when it is 'single', it + means that entity will not be replaced. Defaults to 'single'. + Returns: + head_emb: Head entity embedding. + relation_emb: Relation embedding. + tail_emb: Tail entity embedding. + """ + + if mode == "single": + head_emb = self.get_embedding(triples[0]).unsqueeze(1) # [bs, 1, dim] + relation_emb = self.get_embedding(triples[1], is_ent=False).unsqueeze(1) # [bs, 1, dim] + tail_emb = self.get_embedding(triples[2]).unsqueeze(1) # [bs, 1, dim] + + elif mode == "head-batch" or mode == "head_predict": + if negs is None: # 说明这个时候是在evluation,所以需要直接用所有的entity embedding + # TODO:暂时不考虑KGC的测试情况 + head_emb = self.ent_emb.weight.data.unsqueeze(0) # [1, num_ent, dim] + else: + head_emb = self.get_embedding(negs).reshape(-1, self.args.neg_num, self.args.ke_dim) # [bs, num_neg, dim] + relation_emb = self.get_embedding(triples[1], is_ent=False).unsqueeze(1) # [bs, 1, dim] + tail_emb = self.get_embedding(triples[2]).unsqueeze(1) # [bs, 1, dim] + + elif mode == "tail-batch" or mode == "tail_predict": + head_emb = self.get_embedding(triples[0]).unsqueeze(1) # [bs, 1, dim] + relation_emb = self.get_embedding(triples[1], is_ent=False).unsqueeze(1) # [bs, 1, dim] + if negs is None: + tail_emb = self.ent_emb.weight.data.unsqueeze(0) # [1, num_ent, dim] + else: + # pdb.set_trace() + tail_emb = self.get_embedding(negs).reshape(-1, self.args.neg_num, self.args.ke_dim) # [bs, num_neg, dim] + + return head_emb, relation_emb, tail_emb + + def get_embedding(self, inputs, is_ent=True): + # pdb.set_trace() + if is_ent: + return self.linear_ent(inputs) + else: + return self.linear_rel(inputs) + + def score_func(self, head_emb, relation_emb, tail_emb): + """Calculating the score of triples. + + The formula for calculating the score is :math:`\gamma - ||h + r - t||_F` + Args: + head_emb: The head entity embedding. + relation_emb: The relation embedding. + tail_emb: The tail entity embedding. + mode: Choose head-predict or tail-predict. + Returns: + score: The score of triples. + """ + score = (head_emb + relation_emb) - tail_emb + # pdb.set_trace() + score = self.ke_margin.item() - torch.norm(score, p=1, dim=-1) + return score + + def get_score(self, triples, hw_model, negs=None, mode='single'): + """The functions used in the training phase + + Args: + triples: The triples ids, as (h, r, t), shape:[batch_size, 3]. + negs: Negative samples, defaults to None. + mode: Choose head-predict or tail-predict, Defaults to 'single'. + + Returns: + score: The score of triples. + """ + head_emb, relation_emb, tail_emb = self.tri2emb(triples, hw_model, negs, mode) + score = self.score_func(head_emb, relation_emb, tail_emb) + + return score + + def adv_loss(self, pos_score, neg_score, args): + """Negative sampling loss with self-adversarial training. In math: + + L=-\log \sigma\left(\gamma-d_{r}(\mathbf{h}, \mathbf{t})\right)-\sum_{i=1}^{n} p\left(h_{i}^{\prime}, r, t_{i}^{\prime}\right) \log \sigma\left(d_{r}\left(\mathbf{h}_{i}^{\prime}, \mathbf{t}_{i}^{\prime}\right)-\gamma\right) + + Args: + pos_score: The score of positive samples. + neg_score: The score of negative samples. + subsampling_weight: The weight for correcting pos_score and neg_score. + + Returns: + loss: The training loss for back propagation. 
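+        Note:
+            The weights p(h'_i, r, t'_i) are realised below as a detached softmax
+            over ``neg_score * args.adv_temp`` (the self-adversarial temperature),
+            so no gradient flows through the weighting itself.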
+ """ + neg_score = (F.softmax(neg_score * args.adv_temp, dim=1).detach() + * F.logsigmoid(-neg_score)).sum(dim=1) # shape:[bs] + pos_score = F.logsigmoid(pos_score).view(neg_score.shape[0]) # shape:[bs] + positive_sample_loss = - pos_score.mean() + negative_sample_loss = - neg_score.mean() + loss = (positive_sample_loss + negative_sample_loss) / 2 + return loss + + +class KGEModel(nn.Module): + def __init__(self, nentity, nrelation, hidden_dim, gamma, entity_embedding, relation_embedding): + super(KGEModel, self).__init__() + self.nentity = nentity + self.nrelation = nrelation + self.hidden_dim = hidden_dim + + self.gamma = nn.Parameter( + torch.Tensor([gamma]), + requires_grad=False + ) + self.entity_embedding = entity_embedding + self.relation_embedding = relation_embedding + + assert self.relation_embedding.shape[0] == nrelation + assert self.entity_embedding.shape[0] == nentity + + def forward(self, sample, mode='single'): + ''' + Forward function that calculate the score of a batch of triples. + In the 'single' mode, sample is a batch of triple. + In the 'head-batch' or 'tail-batch' mode, sample consists two part. + The first part is usually the positive sample. + And the second part is the entities in the negative samples. + Because negative samples and positive samples usually share two elements + in their triple ((head, relation) or (relation, tail)). + ''' + + if mode == 'single': + batch_size, negative_sample_size = sample.size(0), 1 + + head = torch.index_select( + self.entity_embedding, + dim=0, + index=sample[:, 0] + ).unsqueeze(1) + + relation = torch.index_select( + self.relation_embedding, + dim=0, + index=sample[:, 1] + ).unsqueeze(1) + + tail = torch.index_select( + self.entity_embedding, + dim=0, + index=sample[:, 2] + ).unsqueeze(1) + + elif mode == 'head-batch': + tail_part, head_part = sample + batch_size, negative_sample_size = head_part.size(0), head_part.size(1) + + head = torch.index_select( + self.entity_embedding, + dim=0, + index=head_part.view(-1) + ).view(batch_size, negative_sample_size, -1) + + relation = torch.index_select( + self.relation_embedding, + dim=0, + index=tail_part[:, 1] + ).unsqueeze(1) + + tail = torch.index_select( + self.entity_embedding, + dim=0, + index=tail_part[:, 2] + ).unsqueeze(1) + + elif mode == 'tail-batch': + head_part, tail_part = sample + batch_size, negative_sample_size = tail_part.size(0), tail_part.size(1) + + head = torch.index_select( + self.entity_embedding, + dim=0, + index=head_part[:, 0] + ).unsqueeze(1) + + relation = torch.index_select( + self.relation_embedding, + dim=0, + index=head_part[:, 1] + ).unsqueeze(1) + + tail = torch.index_select( + self.entity_embedding, + dim=0, + index=tail_part.view(-1) + ).view(batch_size, negative_sample_size, -1) + + else: + raise ValueError('mode %s not supported' % mode) + + score = self.TransE(head, relation, tail, mode) + + return score + + def TransE(self, head, relation, tail, mode): + if mode == 'head-batch': + score = head + (relation - tail) + else: + score = (head + relation) - tail + + score = self.gamma.item() - torch.norm(score, p=1, dim=-1) + return score + + @torch.no_grad() + def test_step(self, test_triples, all_true_triples, args, nentity, nrelation): + ''' + Evaluate the model on test or valid datasets + ''' + # Otherwise use standard (filtered) MRR, MR, HITS@1, HITS@3, and HITS@10 metrics + # Prepare dataloader for evaluation + test_dataloader_head = DataLoader( + KGTestDataset( + test_triples, + all_true_triples, + nentity, + nrelation, + 'head-batch' + ), + 
batch_size=args.batch_size, + num_workers=args.workers, + persistent_workers=True, + collate_fn=KGTestDataset.collate_fn + ) + + test_dataloader_tail = DataLoader( + KGTestDataset( + test_triples, + all_true_triples, + nentity, + nrelation, + 'tail-batch' + ), + batch_size=args.batch_size, + num_workers=args.workers, + persistent_workers=True, + collate_fn=KGTestDataset.collate_fn + ) + + test_dataset_list = [test_dataloader_head, test_dataloader_tail] + + logs = [] + + step = 0 + total_steps = sum([len(dataset) for dataset in test_dataset_list]) + + # pdb.set_trace() + with tqdm(total=total_steps) as _tqdm: + _tqdm.set_description(f'eval KGC') + for test_dataset in test_dataset_list: + for positive_sample, negative_sample, filter_bias, mode in test_dataset: + + positive_sample = positive_sample.cuda() + negative_sample = negative_sample.cuda() + filter_bias = filter_bias.cuda() + + batch_size = positive_sample.size(0) + + score = self.forward((positive_sample, negative_sample), mode) + score += filter_bias + + # Explicitly sort all the entities to ensure that there is no test exposure bias + argsort = torch.argsort(score, dim=1, descending=True) + + if mode == 'head-batch': + positive_arg = positive_sample[:, 0] + elif mode == 'tail-batch': + positive_arg = positive_sample[:, 2] + else: + raise ValueError('mode %s not supported' % mode) + + for i in range(batch_size): + # Notice that argsort is not ranking + # ranking = (argsort[i, :] == positive_arg[i]).nonzero() + ranking = (argsort[i, :] == positive_arg[i]).nonzero(as_tuple=False) + assert ranking.size(0) == 1 + + # ranking + 1 is the true ranking used in evaluation metrics + ranking = 1 + ranking.item() + logs.append({ + 'MRR': 1.0 / ranking, + 'MR': float(ranking), + 'HITS@1': 1.0 if ranking <= 1 else 0.0, + 'HITS@3': 1.0 if ranking <= 3 else 0.0, + 'HITS@10': 1.0 if ranking <= 10 else 0.0, + }) + + # if step % args.test_log_steps == 0: + # logging.info('Evaluating the model... 
(%d/%d)' % (step, total_steps)) + _tqdm.update(1) + _tqdm.set_description(f'KGC Eval:') + step += 1 + + metrics = {} + for metric in logs[0].keys(): + metrics[metric] = sum([log[metric] for log in logs]) / len(logs) + + return metrics + + +# 专门为KGE的测试设计一个dataset +class KGTestDataset(torch.utils.data.Dataset): + def __init__(self, triples, all_true_triples, nentity, nrelation, mode, head4rel_tail=None, tail4head_rel=None): + self.len = len(triples) + self.triple_set = set(all_true_triples) + self.triples = triples + + # 需要统计得到 + self.nentity = nentity + self.nrelation = nrelation + self.mode = mode + + # 给定关系尾实体对应头实体 + # print("build head4rel_tail") + # self.head4rel_tail = self.find_head4rel_tail() + # print("build tail4head_rel") + # self.tail4head_rel = self.find_tail4head_rel() + + def __len__(self): + return self.len + + def find_head4rel_tail(self): + ans = defaultdict(list) + for (h, r, t) in self.triple_set: + ans[(r, t)].append(h) + return ans + + def find_tail4head_rel(self): + ans = defaultdict(list) + for (h, r, t) in self.triple_set: + ans[(h, r)].append(t) + return ans + + def __getitem__(self, idx): + head, relation, tail = self.triples[idx] + + if self.mode == 'head-batch': + tmp = [(0, rand_head) if (rand_head, relation, tail) not in self.triple_set + else (-100, head) for rand_head in range(self.nentity)] + tmp[head] = (0, head) + elif self.mode == 'tail-batch': + tmp = [(0, rand_tail) if (head, relation, rand_tail) not in self.triple_set + else (-100, tail) for rand_tail in range(self.nentity)] + tmp[tail] = (0, tail) + else: + raise ValueError('negative batch mode %s not supported' % self.mode) + # if self.mode == 'head-batch': + # + # tmp = [(0, rand_head) if rand_head not in self.head4rel_tail[(relation, tail)] + # else (-100, head) for rand_head in range(self.nentity)] + # tmp[head] = (0, head) + # elif self.mode == 'tail-batch': + # tmp = [(0, rand_tail) if rand_tail not in self.tail4head_rel[(head, relation)] + # else (-100, tail) for rand_tail in range(self.nentity)] + # tmp[tail] = (0, tail) + # else: + # raise ValueError('negative batch mode %s not supported' % self.mode) + + tmp = torch.LongTensor(tmp) + filter_bias = tmp[:, 0].float() + negative_sample = tmp[:, 1] + + positive_sample = torch.LongTensor((head, relation, tail)) + + return positive_sample, negative_sample, filter_bias, self.mode + + @staticmethod + def collate_fn(data): + positive_sample = torch.stack([_[0] for _ in data], dim=0) + negative_sample = torch.stack([_[1] for _ in data], dim=0) + filter_bias = torch.stack([_[2] for _ in data], dim=0) + mode = data[0][3] + return positive_sample, negative_sample, filter_bias, mode diff --git a/KTeleBERT/model/Numeric.py b/KTeleBERT/model/Numeric.py new file mode 100644 index 0000000000000000000000000000000000000000..a32d584d7b46f6b79b4cfe4db379660faba00634 --- /dev/null +++ b/KTeleBERT/model/Numeric.py @@ -0,0 +1,218 @@ +import types +import torch +import transformers +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss +import numpy as np +import pdb +import math +from .Tool_model import AutomaticWeightedLoss +import os.path as osp +import json + + +def ortho_penalty(t): + return ((t @ t.T - torch.eye(t.shape[0]).cuda())**2).sum() + + +class AttenNumeric(nn.Module): + def __init__(self, config): + super(AttenNumeric, self).__init__() + # ----------- 加载kpi2id -------------------- + kpi_file_path = osp.join(config.data_path, 'kpi2id.json') + + with open(kpi_file_path, 'r') as f: + # pdb.set_trace() + kpi2id = 
json.load(f) + config.num_kpi = 303 + # config.num_kpi = len(kpi2id) + # ------------------------------- + + self.config = config + self.fc = nn.Linear(1, config.hidden_size) + # self.actication = nn.ReLU() + self.actication = nn.LeakyReLU() + # self.embedding = nn.Linear(config.hidden_size, self.attention_head_size) + if config.contrastive_loss: + self.loss_awl = AutomaticWeightedLoss(3, config) + else: + self.loss_awl = AutomaticWeightedLoss(2, config) + self.encoder = AttNumEncoder(config) + self.decoder = AttNumDecoder(config) + self.classifier = NumClassifier(config) + self.ce_loss = nn.CrossEntropyLoss() + + def contrastive_loss(self, hidden, kpi): + # in batch negative + bs_tmp = hidden.shape[0] + eye = torch.eye(bs_tmp).cuda() + hidden = F.normalize(hidden, dim=1) + # [12,12] + # 减去对角矩阵目的是防止对自身的相似程度影响了判断 + hidden_sim = (torch.matmul(hidden, hidden.T) - eye) / 0.07 + kpi = kpi.expand(-1, bs_tmp) + kpi_sim = torch.abs(kpi - kpi.T) + eye + kpi_sim = torch.min(kpi_sim, 1)[1] + sc_loss = self.ce_loss(hidden_sim, kpi_sim) + return sc_loss + + def _encode(self, kpi, query): + kpi_emb = self.actication(self.fc(kpi)) + # name_emb = self.embedding(query) + hidden, en_loss, scalar_list = self.encoder(kpi_emb, query) + + # 两个及以下的对比学习没有意义 + if self.config.contrastive_loss and hidden.shape[0] > 2: + con_loss = self.contrastive_loss(hidden.squeeze(1), kpi.squeeze(1)) + else: + con_loss = None + hidden = self.actication(hidden) + assert query.shape[0] > 0 + return hidden, en_loss, scalar_list, con_loss + + def forward(self, kpi, query, kpi_id): + hidden, en_loss, scalar_list, con_loss = self._encode(kpi, query) + dec_kpi_score, de_loss = self.decoder(kpi, hidden) + cls_kpi, cls_loss = self.classifier(hidden, kpi_id) + if con_loss is not None: + # 0.001 * con_loss + loss_sum = self.loss_awl(de_loss, cls_loss, 0.1 * con_loss) + loss_all = loss_sum + en_loss + loss_dic = {'cls_loss': cls_loss.item(), 'reg_loss': de_loss.item(), 'orth_loss': en_loss.item(), 'con_loss': con_loss.item()} + # pdb.set_trace() + else: + loss_sum = self.loss_awl(de_loss, cls_loss) + loss_all = loss_sum + en_loss + loss_dic = {'cls_loss': cls_loss.item(), 'reg_loss': de_loss.item(), 'orth_loss': en_loss.item()} + + return dec_kpi_score, cls_kpi, hidden, loss_all, self.loss_awl.params.tolist(), loss_dic, scalar_list + + +class AttNumEncoder(nn.Module): + def __init__(self, config): + super(AttNumEncoder, self).__init__() + self.num_l_layers = config.l_layers + self.layer = nn.ModuleList([AttNumLayer(config) for _ in range(self.num_l_layers)]) + + def forward(self, kpi_emb, name_emb): + loss = 0. 
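+        # accumulate the orthogonality penalty and the residual scaling factor returned by every AttNumLayer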
+ scalar_list = [] + for layer_module in self.layer: + kpi_emb, orth_loss, scalar = layer_module(kpi_emb, name_emb) + loss += orth_loss + scalar_list.append(scalar) + return kpi_emb, loss, scalar_list + + +class AttNumDecoder(nn.Module): + def __init__(self, config): + super(AttNumDecoder, self).__init__() + self.dense_1 = nn.Linear(config.hidden_size, config.hidden_size) + self.dense_2 = nn.Linear(config.hidden_size, 1) + self.actication = nn.LeakyReLU() + self.loss_func = nn.MSELoss(reduction='mean') + + def forward(self, kpi_label, hidden): + # 修复异常值 + pre = self.actication(self.dense_2(self.actication(self.dense_1(hidden)))) + loss = self.loss_func(pre, kpi_label) + # pdb.set_trace() + return pre, loss + + +class NumClassifier(nn.Module): + def __init__(self, config): + super(NumClassifier, self).__init__() + self.dense_1 = nn.Linear(config.hidden_size, int(config.hidden_size / 3)) + self.dense_2 = nn.Linear(int(config.hidden_size / 3), config.num_kpi) + self.loss_func = nn.CrossEntropyLoss() + # self.actication = nn.ReLU() + self.actication = nn.LeakyReLU() + + def forward(self, hidden, kpi_id): + hidden = self.actication(self.dense_1(hidden)) + pre = self.actication(self.dense_2(hidden)).squeeze(1) + loss = self.loss_func(pre, kpi_id) + return pre, loss + + +class AttNumLayer(nn.Module): + def __init__(self, config): + super(AttNumLayer, self).__init__() + self.config = config + # 768 / 8 = 8 + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) # 96 + # self.head_size = config.hidden_size + + # scaler + self.scalar = nn.Parameter(.3 * torch.ones(1, requires_grad=True)) + self.key = nn.Parameter(torch.empty(self.num_attention_heads, self.attention_head_size)) + + self.dense_down = nn.Linear(config.hidden_size, 128) + self.dense_up = nn.Linear(128, config.hidden_size) + + # name embedding + self.embedding = nn.Linear(config.hidden_size, self.attention_head_size) + # num_attention_heads�� value���� ת������k�� + self.value = nn.Linear(config.hidden_size, config.hidden_size * self.num_attention_heads) + + # add & norm + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=1e-12) + # 0.1 + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # for m in self.modules().modules(): + # pdb.set_trace() + + nn.init.kaiming_normal_(self.key, mode='fan_out', nonlinearity='leaky_relu') + # nn.init.orthogonal_(self.key) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads, + self.config.hidden_size, + ) + x = x.view(*new_x_shape) + return x + # return x.permute(0, 2, 1, 3) + + def forward(self, kpi_emb, name_emb): + # [64, 1, 96] + name_emb = self.embedding(name_emb) + + mixed_value_layer = self.value(kpi_emb) + + # [64, 1, 8, 768] + value_layer = self.transpose_for_scores(mixed_value_layer) + + # key: [8, 96] self.key.transpose(-1, -2): [96, 8] + # name_emb: [64, 1, 96] + attention_scores = torch.matmul(name_emb, self.key.transpose(-1, -2)) + # [64, 1, 8] + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
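+        # attention_probs has shape [bs, 1, num_attention_heads]: one weight per head-specific value projection of the KPI value embedding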
+ attention_probs = self.dropout(attention_probs) + attention_probs = attention_probs.unsqueeze(1) + # ��Ȩ��value�� + # [64, 1, 1, 8] * [64, 1, 8, 768] = [64, 1, 1, 768] + context_layer = torch.matmul(attention_probs, value_layer) + # context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.config.hidden_size,) + context_layer = context_layer.view(*new_context_layer_shape) + # add & norm + output_emb = self.dense(context_layer) + output_emb = self.dropout(output_emb) + output_emb = self.LayerNorm(output_emb + self.scalar * self.dense_up(self.dense_down(kpi_emb))) + # output_emb = self.LayerNorm(self.LayerNorm(output_emb) + self.scalar * kpi_emb) + # pdb.set_trace() + wei = self.value.weight.chunk(8, dim=0) + orth_loss_value = sum([ortho_penalty(k) for k in wei]) + # 0.01 * ortho_penalty(self.key) + ortho_penalty(self.value.weight) + orth_loss = 0.0001 * orth_loss_value + 0.0001 * ortho_penalty(self.dense.weight) + 0.01 * ((self.scalar[0])**2).sum() + return output_emb, orth_loss, self.scalar.tolist()[0] diff --git a/KTeleBERT/model/OD_model.py b/KTeleBERT/model/OD_model.py new file mode 100644 index 0000000000000000000000000000000000000000..176f6a2a064f1c4fe1eb6af876ff42d39352298d --- /dev/null +++ b/KTeleBERT/model/OD_model.py @@ -0,0 +1,74 @@ +import os +import os.path as osp +import pdb +import torch +import torch.nn as nn +import numpy as np +# from transformers import BertModel, BertTokenizer, BertForMaskedLM +import json +from packaging import version +import torch.distributed as dist + + +class OD_model(nn.Module): + def __init__(self, args): + super().__init__() + self.args = args + self.order_num = args.order_num + if args.od_type == 'linear_cat': + # self.order_dense_1 = nn.Linear(args.hidden_size * self.order_num, args.hidden_size) + # self.order_dense_2 = nn.Linear(args.hidden_size, 1) + self.order_dense_1 = nn.Linear(args.hidden_size * self.order_num, args.hidden_size) + if self.args.num_od_layer > 0: + self.layer = nn.ModuleList([OD_Layer_linear(args) for _ in range(args.num_od_layer)]) + + self.order_dense_2 = nn.Linear(args.hidden_size, 1) + + self.actication = nn.LeakyReLU() + self.bn = torch.nn.BatchNorm1d(args.hidden_size) + self.dp = nn.Dropout(p=args.hidden_dropout_prob) + self.loss_func = nn.BCEWithLogitsLoss() + # self.loss_func = nn.CrossEntropyLoss() + + def forward(self, input, labels): + # input 切成两半 + # 换方向拼接 + loss_dic = {} + pre = self.predict(input) + # pdb.set_trace() + loss = self.loss_func(pre, labels.unsqueeze(1)) + loss_dic['order_loss'] = loss.item() + return loss, loss_dic + + def encode(self, input): + if self.args.num_od_layer > 0: + for layer_module in self.layer: + input = layer_module(input) + inputs = torch.chunk(input, 2, dim=0) + emb = torch.concat(inputs, dim=1) + return self.actication(self.order_dense_1(self.dp(emb))) + + def predict(self, input): + return self.order_dense_2(self.bn(self.encode(input))) + + def right_caculate(self, input, labels, threshold=0.5): + input = input.squeeze(1).tolist() + labels = labels.tolist() + right = 0 + for i in range(len(input)): + if (input[i] >= threshold and labels[i] >= 0.5) or (input[i] < threshold and labels[i] < 0.5): + right += 1 + return right + + +class OD_Layer_linear(nn.Module): + def __init__(self, args): + super().__init__() + self.args = args + self.dense = nn.Linear(args.hidden_size, args.hidden_size) + self.actication = nn.LeakyReLU() + self.bn = torch.nn.BatchNorm1d(args.hidden_size) + self.dropout = 
nn.Dropout(p=args.hidden_dropout_prob) + + def forward(self, input): + return self.actication(self.bn(self.dense(self.dropout(input)))) diff --git a/KTeleBERT/model/Tool_model.py b/KTeleBERT/model/Tool_model.py new file mode 100644 index 0000000000000000000000000000000000000000..4fcaf6658f32277454c3d7cf6099a2b9df98a554 --- /dev/null +++ b/KTeleBERT/model/Tool_model.py @@ -0,0 +1,34 @@ +# -*- coding: UTF-8 -*- + +import torch +from torch import nn + +# https://github.com/Mikoto10032/AutomaticWeightedLoss/blob/master/AutomaticWeightedLoss.py + + +class AutomaticWeightedLoss(nn.Module): + # ''' + # automatically weighted multi-task loss + # Params�� + # num: int��the number of loss + # x: multi-task loss + # Examples�� + # loss1=1 + # loss2=2 + # awl = AutomaticWeightedLoss(2) + # loss_sum = awl(loss1, loss2) + # ''' + def __init__(self, num=2, args=None): + super(AutomaticWeightedLoss, self).__init__() + if args is None or args.use_awl: + params = torch.ones(num, requires_grad=True) + self.params = torch.nn.Parameter(params) + else: + params = torch.ones(num, requires_grad=False) + self.params = torch.nn.Parameter(params, requires_grad=False) + + def forward(self, *x): + loss_sum = 0 + for i, loss in enumerate(x): + loss_sum += 0.5 / (self.params[i] ** 2) * loss + torch.log(1 + self.params[i] ** 2) + return loss_sum diff --git a/KTeleBERT/model/__init__.py b/KTeleBERT/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e1e3efb081034d9cc1c914ed84d93fcc9e6e1b25 --- /dev/null +++ b/KTeleBERT/model/__init__.py @@ -0,0 +1,26 @@ +# from .vector import Vector +# from .classifier import SimpleClassifier +# # from .updn import UpDn +# # from .ban import Ban + +from .bert import ( + BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + BertForMaskedLM, + BertForMultipleChoice, + BertForNextSentencePrediction, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + BertLayer, + BertLMHeadModel, + BertModel, + BertPreTrainedModel, + load_tf_weights_in_bert, +) + +from .bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig +from .bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer +from .HWBert import HWBert +from .KE_model import KGEModel, KE_model +from .OD_model import OD_model diff --git a/KTeleBERT/model/__pycache__/HWBert.cpython-38.pyc b/KTeleBERT/model/__pycache__/HWBert.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fcba0f6dca46b8cff544d6590deecb543340b7a5 Binary files /dev/null and b/KTeleBERT/model/__pycache__/HWBert.cpython-38.pyc differ diff --git a/KTeleBERT/model/__pycache__/KE_model.cpython-38.pyc b/KTeleBERT/model/__pycache__/KE_model.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16f09b33ca4a3ca7303e3f020e8ce2426af92a2d Binary files /dev/null and b/KTeleBERT/model/__pycache__/KE_model.cpython-38.pyc differ diff --git a/KTeleBERT/model/__pycache__/Numeric.cpython-38.pyc b/KTeleBERT/model/__pycache__/Numeric.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdb11cdc559b481f115d0f2ecff1cf8cc0f6f3aa Binary files /dev/null and b/KTeleBERT/model/__pycache__/Numeric.cpython-38.pyc differ diff --git a/KTeleBERT/model/__pycache__/OD_model.cpython-38.pyc b/KTeleBERT/model/__pycache__/OD_model.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..887c39fb7f043dfefe4cbb8ce4448540a4fd6599 Binary files /dev/null and b/KTeleBERT/model/__pycache__/OD_model.cpython-38.pyc 
differ diff --git a/KTeleBERT/model/__pycache__/Tool_model.cpython-38.pyc b/KTeleBERT/model/__pycache__/Tool_model.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7868c3f3b27b62f7d8a31322f170027444f84110 Binary files /dev/null and b/KTeleBERT/model/__pycache__/Tool_model.cpython-38.pyc differ diff --git a/KTeleBERT/model/__pycache__/__init__.cpython-38.pyc b/KTeleBERT/model/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4884141f00bde8e2333a699ac97c0ab5a61f53d2 Binary files /dev/null and b/KTeleBERT/model/__pycache__/__init__.cpython-38.pyc differ diff --git a/KTeleBERT/model/bert/__init__.py b/KTeleBERT/model/bert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ea54209fbc13bffb6b0feae9d9156ac5f64defc4 --- /dev/null +++ b/KTeleBERT/model/bert/__init__.py @@ -0,0 +1,201 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from transformers.utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_flax_available, + is_tensorflow_text_available, + is_tf_available, + is_tokenizers_available, + is_torch_available, +) + + +_import_structure = { + "configuration_bert": ["BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BertConfig", "BertOnnxConfig"], + "tokenization_bert": ["BasicTokenizer", "BertTokenizer", "WordpieceTokenizer"], +} + +try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["tokenization_bert_fast"] = ["BertTokenizerFast"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_bert"] = [ + "BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "BertForMaskedLM", + "BertForMultipleChoice", + "BertForNextSentencePrediction", + "BertForPreTraining", + "BertForQuestionAnswering", + "BertForSequenceClassification", + "BertForTokenClassification", + "BertLayer", + "BertLMHeadModel", + "BertModel", + "BertPreTrainedModel", + "load_tf_weights_in_bert", + ] + +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_tf_bert"] = [ + "TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFBertEmbeddings", + "TFBertForMaskedLM", + "TFBertForMultipleChoice", + "TFBertForNextSentencePrediction", + "TFBertForPreTraining", + "TFBertForQuestionAnswering", + "TFBertForSequenceClassification", + "TFBertForTokenClassification", + "TFBertLMHeadModel", + "TFBertMainLayer", + "TFBertModel", + "TFBertPreTrainedModel", + ] +try: + if not is_tensorflow_text_available(): + raise OptionalDependencyNotAvailable() +except 
OptionalDependencyNotAvailable: + pass +else: + _import_structure["tokenization_bert_tf"] = ["TFBertTokenizer"] + +try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_flax_bert"] = [ + "FlaxBertForCausalLM", + "FlaxBertForMaskedLM", + "FlaxBertForMultipleChoice", + "FlaxBertForNextSentencePrediction", + "FlaxBertForPreTraining", + "FlaxBertForQuestionAnswering", + "FlaxBertForSequenceClassification", + "FlaxBertForTokenClassification", + "FlaxBertModel", + "FlaxBertPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig, BertOnnxConfig + from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer + + try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .tokenization_bert_fast import BertTokenizerFast + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_bert import ( + BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + BertForMaskedLM, + BertForMultipleChoice, + BertForNextSentencePrediction, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + BertLayer, + BertLMHeadModel, + BertModel, + BertPreTrainedModel, + load_tf_weights_in_bert, + ) + + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_tf_bert import ( + TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFBertEmbeddings, + TFBertForMaskedLM, + TFBertForMultipleChoice, + TFBertForNextSentencePrediction, + TFBertForPreTraining, + TFBertForQuestionAnswering, + TFBertForSequenceClassification, + TFBertForTokenClassification, + TFBertLMHeadModel, + TFBertMainLayer, + TFBertModel, + TFBertPreTrainedModel, + ) + + try: + if not is_tensorflow_text_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .tokenization_bert_tf import TFBertTokenizer + + try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_flax_bert import ( + FlaxBertForCausalLM, + FlaxBertForMaskedLM, + FlaxBertForMultipleChoice, + FlaxBertForNextSentencePrediction, + FlaxBertForPreTraining, + FlaxBertForQuestionAnswering, + FlaxBertForSequenceClassification, + FlaxBertForTokenClassification, + FlaxBertModel, + FlaxBertPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/KTeleBERT/model/bert/__pycache__/__init__.cpython-38.pyc b/KTeleBERT/model/bert/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..131338cfc9a6b584726b242fef385039fc142623 Binary files /dev/null and b/KTeleBERT/model/bert/__pycache__/__init__.cpython-38.pyc differ diff --git a/KTeleBERT/model/bert/__pycache__/configuration_bert.cpython-38.pyc b/KTeleBERT/model/bert/__pycache__/configuration_bert.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75b36538aa98d279db67217485595fd93b9a5e45 Binary files /dev/null and b/KTeleBERT/model/bert/__pycache__/configuration_bert.cpython-38.pyc differ diff --git 
a/KTeleBERT/model/bert/__pycache__/modeling_bert.cpython-38.pyc b/KTeleBERT/model/bert/__pycache__/modeling_bert.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c52d458d2549ec35352993174cf3d8f554c595b4 Binary files /dev/null and b/KTeleBERT/model/bert/__pycache__/modeling_bert.cpython-38.pyc differ diff --git a/KTeleBERT/model/bert/__pycache__/tokenization_bert.cpython-38.pyc b/KTeleBERT/model/bert/__pycache__/tokenization_bert.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8693b20056ae5766746c82589ef4d21b49498de Binary files /dev/null and b/KTeleBERT/model/bert/__pycache__/tokenization_bert.cpython-38.pyc differ diff --git a/KTeleBERT/model/bert/configuration_bert.py b/KTeleBERT/model/bert/configuration_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..0f48a8f93e25bd747fbd9b090014476e3804162d --- /dev/null +++ b/KTeleBERT/model/bert/configuration_bert.py @@ -0,0 +1,191 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BERT model configuration""" +from collections import OrderedDict +from typing import Mapping + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/config.json", + "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/config.json", + "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/config.json", + "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/config.json", + "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json", + "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json", + "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/config.json", + "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/config.json", + "bert-large-uncased-whole-word-masking": ( + "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/config.json" + ), + "bert-large-cased-whole-word-masking": ( + "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/config.json" + ), + "bert-large-uncased-whole-word-masking-finetuned-squad": ( + "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json" + ), + "bert-large-cased-whole-word-masking-finetuned-squad": ( + "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/config.json" + ), + "bert-base-cased-finetuned-mrpc": 
"https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/config.json", + "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/config.json", + "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/config.json", + "cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/config.json", + "cl-tohoku/bert-base-japanese-whole-word-masking": ( + "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json" + ), + "cl-tohoku/bert-base-japanese-char": ( + "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/config.json" + ), + "cl-tohoku/bert-base-japanese-char-whole-word-masking": ( + "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/config.json" + ), + "TurkuNLP/bert-base-finnish-cased-v1": ( + "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json" + ), + "TurkuNLP/bert-base-finnish-uncased-v1": ( + "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/config.json" + ), + "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/config.json", + # See all BERT models at https://huggingface.co/models?filter=bert +} + + +class BertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BertModel`] or a [`TFBertModel`]. It is used to + instantiate a BERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the BERT + [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). 
+ type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Examples: + + ```python + >>> from transformers import BertModel, BertConfig + + >>> # Initializing a BERT bert-base-uncased style configuration + >>> configuration = BertConfig() + + >>> # Initializing a model from the bert-base-uncased style configuration + >>> model = BertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "bert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +class BertOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ("token_type_ids", dynamic_axis), + ] + ) diff --git a/KTeleBERT/model/bert/modeling_bert.py b/KTeleBERT/model/bert/modeling_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..92fd8b7a6384ce04cf4079f5a971bec390307f97 --- /dev/null +++ b/KTeleBERT/model/bert/modeling_bert.py @@ -0,0 +1,2010 @@ +# 
coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model.""" + +import pdb +import math +import os +import warnings +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN + +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from transformers.utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_bert import BertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "bert-base-uncased" +_CONFIG_FOR_DOC = "BertConfig" +_TOKENIZER_FOR_DOC = "BertTokenizer" + +# TokenClassification docstring +_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "dbmdz/bert-large-cased-finetuned-conll03-english" +_TOKEN_CLASS_EXPECTED_OUTPUT = ( + "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC'] " +) +_TOKEN_CLASS_EXPECTED_LOSS = 0.01 + +# QuestionAnswering docstring +_CHECKPOINT_FOR_QA = "deepset/bert-base-cased-squad2" +_QA_EXPECTED_OUTPUT = "'a nice puppet'" +_QA_EXPECTED_LOSS = 7.41 +_QA_TARGET_START_INDEX = 14 +_QA_TARGET_END_INDEX = 15 + +# SequenceClassification docstring +_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "textattack/bert-base-uncased-yelp-polarity" +_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" +_SEQ_CLASS_EXPECTED_LOSS = 0.01 + + +BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "bert-base-uncased", + "bert-large-uncased", + "bert-base-cased", + "bert-large-cased", + "bert-base-multilingual-uncased", + "bert-base-multilingual-cased", + "bert-base-chinese", + "bert-base-german-cased", + "bert-large-uncased-whole-word-masking", + "bert-large-cased-whole-word-masking", + "bert-large-uncased-whole-word-masking-finetuned-squad", + "bert-large-cased-whole-word-masking-finetuned-squad", + "bert-base-cased-finetuned-mrpc", + "bert-base-german-dbmdz-cased", + "bert-base-german-dbmdz-uncased", + "cl-tohoku/bert-base-japanese", + "cl-tohoku/bert-base-japanese-whole-word-masking", + "cl-tohoku/bert-base-japanese-char", + "cl-tohoku/bert-base-japanese-char-whole-word-masking", + "TurkuNLP/bert-base-finnish-cased-v1", + "TurkuNLP/bert-base-finnish-uncased-v1", + 
"wietsedv/bert-base-dutch-cased", + # See all BERT models at https://huggingface.co/models?filter=bert +] + + +def load_tf_weights_in_bert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + if 
version.parse(torch.__version__) > version.parse("1.6.0"): + self.register_buffer( + "token_type_ids", + torch.zeros(self.position_ids.size(), dtype=torch.long), + persistent=False, + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + kpi_ref = None, # KPI数值替换的位置,以及参考的KPI name,KPI数值,类别 + kpi_model = None, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + # pdb.set_trace() + # TODO 得到 KPI的name embedding,pooling,输入数值编码模型,得到特征替换(mask+)特定位置原向量, + # 不产生新的embedding,直接读取生成的embedding + en_loss, scalar_list, con_loss, numeric_input, kpi_input = None, None, None, None, None + if kpi_ref is not None: + max_len = inputs_embeds.shape[1] + # 生成数值embedding + numeric_list = [] + kpi_emb_list = [] + kpi_id_list = [] + for i in range(len(kpi_ref)): + if len(kpi_ref[i])>0: + for item in kpi_ref[i]: + # 可能[NUM]被截断了 + if item[2]>=max_len: + continue + numeric_list.append(item[4]) + kpi_id_list.append(item[3]) + # requires_grad=True + kpi_name_embedding = torch.mean(inputs_embeds[i][item[0]:item[1]+1], dim=0) + kpi_emb_list.append(kpi_name_embedding) + # 有可能出现没有KPI的情况 + if len(kpi_emb_list)>0: + kpi_emb = torch.stack(kpi_emb_list).unsqueeze(1) + + # , dtype=torch.float64 + numeric_input = torch.Tensor(numeric_list).unsqueeze(1).unsqueeze(1).cuda() + kpi_input = torch.tensor(kpi_id_list, dtype=torch.long).cuda() + # pdb.set_trace() + hidden, en_loss, scalar_list, con_loss = kpi_model._encode(numeric_input, kpi_emb) + # 替换 + key = 0 + for i in range(len(kpi_ref)): + if len(kpi_ref[i])>0: + for item in kpi_ref[i]: + if item[2]>=max_len: + continue + # [NUM]的(x,y)坐标位置 + inputs_embeds[i,item[2]] = hidden[key][0] + key += 1 + assert key == hidden.shape[0] + + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + # 重构输出 + return embeddings, en_loss, scalar_list, con_loss, numeric_input, kpi_input + # return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + 
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
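The KPI-aware branch added to `BertEmbeddings.forward` above replaces the word embedding at each `[NUM]` placeholder with a vector produced by `kpi_model._encode` from the raw KPI value and a mean-pooled embedding of the KPI-name tokens. The sketch below isolates that replacement step; the 5-tuple layout of each `kpi_ref` entry (`name_start, name_end, num_pos, kpi_id, value`) is inferred from the indexing in the source, and `ToyNumericEncoder` is a stand-in assumption, not the project's actual KPI model.

```python
# Minimal sketch of the [NUM]-replacement performed in BertEmbeddings.forward.
# Assumption: each kpi_ref[i] entry is (name_start, name_end, num_pos, kpi_id, value);
# ToyNumericEncoder below is a placeholder for kpi_model._encode, not the real module.
import torch
import torch.nn as nn


class ToyNumericEncoder(nn.Module):
    """Maps a scalar KPI value, conditioned on the pooled KPI-name embedding, to hidden_size."""

    def __init__(self, hidden_size: int):
        super().__init__()
        self.proj = nn.Linear(hidden_size + 1, hidden_size)

    def forward(self, values: torch.Tensor, name_emb: torch.Tensor) -> torch.Tensor:
        # values: (n_kpi, 1), name_emb: (n_kpi, hidden_size) -> (n_kpi, hidden_size)
        return self.proj(torch.cat([name_emb, values], dim=-1))


def replace_num_embeddings(inputs_embeds, kpi_ref, encoder):
    """Overwrite the embedding at every [NUM] position with a numeric-conditioned vector."""
    max_len = inputs_embeds.shape[1]
    names, values, positions = [], [], []
    for i, refs in enumerate(kpi_ref):
        for name_start, name_end, num_pos, kpi_id, value in refs:
            if num_pos >= max_len:                 # [NUM] may have been truncated away
                continue
            names.append(inputs_embeds[i, name_start:name_end + 1].mean(dim=0))
            values.append([value])
            positions.append((i, num_pos))
    if not positions:                              # sequences without any KPI are left untouched
        return inputs_embeds
    hidden = encoder(torch.tensor(values), torch.stack(names))
    for k, (i, num_pos) in enumerate(positions):
        inputs_embeds[i, num_pos] = hidden[k]
    return inputs_embeds


if __name__ == "__main__":
    emb = torch.randn(2, 8, 16)
    kpi_ref = [[(1, 2, 3, 0, 0.75)], []]           # one KPI in the first sequence, none in the second
    out = replace_num_embeddings(emb, kpi_ref, ToyNumericEncoder(16))
    print(out.shape)                               # torch.Size([2, 8, 16])
```

Unlike this CPU-only toy, the original code moves the numeric batch to `.cuda()` and also hands back the encoder's auxiliary terms (`en_loss`, `con_loss`, `scalar_list`) so they can be folded into the overall training objective downstream.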
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = BertSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super().__init__() + 
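`BertAttention.prune_heads` above slices the query/key/value projections (and the input dimension of the output projection) so the selected heads are removed entirely; `BertModel._prune_heads`, defined further below, drives it from a `{layer_index: [head_indices]}` mapping. A minimal usage sketch with a randomly initialised model follows; the import paths and the config constructor arguments are assumptions that depend on the actual package layout.

```python
# Hedged sketch: pruning attention heads through the {layer: [heads]} mapping that
# BertModel._prune_heads forwards to BertAttention.prune_heads.
import torch

from configuration_bert import BertConfig   # config class shipped alongside this file (path assumption)
from modeling_bert import BertModel         # this file; adjust the import to the real package path

config = BertConfig(vocab_size=100, hidden_size=64, num_attention_heads=4,
                    num_hidden_layers=2, intermediate_size=128)
model = BertModel(config)

model._prune_heads({0: [0, 2], 1: [1]})      # drop heads 0 and 2 in layer 0, head 1 in layer 1

attn = model.encoder.layer[0].attention.self
print(attn.num_attention_heads, attn.all_head_size)    # 2 32  (two 16-dim heads remain)

# Note: this modified BertModel.forward returns the usual model output plus the
# KPI-related extras (en_loss, scalar_list, con_loss, numeric_input, kpi_input).
outputs, en_loss, scalar_list, con_loss, numeric_input, kpi_input = model(
    input_ids=torch.randint(0, 100, (1, 8))
)
print(outputs.last_hidden_state.shape)                  # torch.Size([1, 8, 64])
```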
self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BertAttention(config, position_embedding_type="absolute") + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) 
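`BertLayer` above runs its feed-forward sub-block through `apply_chunking_to_forward`: when `config.chunk_size_feed_forward > 0`, the sequence dimension is processed in slices so the large intermediate activations never exist for the full sequence at once. Because the feed-forward is position-wise, the chunked result equals the unchunked one. A small equivalent sketch, where `feed_forward` is only a stand-in for the `BertIntermediate`/`BertOutput` pair:

```python
# Hedged sketch of the chunked feed-forward idea behind apply_chunking_to_forward:
# slice the sequence dimension, apply the position-wise feed-forward per slice,
# and concatenate -- identical output, lower peak memory, slightly more overhead.
import torch


def feed_forward(x: torch.Tensor) -> torch.Tensor:
    # stand-in for the position-wise BertIntermediate + BertOutput pair
    return torch.relu(x) * 2.0


def chunked_feed_forward(x: torch.Tensor, chunk_size: int, chunk_dim: int = 1) -> torch.Tensor:
    if chunk_size == 0:                      # chunking disabled, mirror the default behaviour
        return feed_forward(x)
    chunks = x.split(chunk_size, dim=chunk_dim)
    return torch.cat([feed_forward(c) for c in chunks], dim=chunk_dim)


hidden = torch.randn(2, 10, 16)              # (batch, seq_len, hidden)
print(torch.allclose(feed_forward(hidden), chunked_feed_forward(hidden, chunk_size=3)))  # True
```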
+ + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
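`BertEncoder.forward` above optionally wraps each layer call in `torch.utils.checkpoint.checkpoint`: activations inside the layer are not stored during the forward pass and are recomputed during backward, which lowers peak memory at the cost of roughly one extra forward per layer (and is why `use_cache` is forced off). A standalone sketch of the same pattern on toy layers:

```python
# Hedged sketch of the gradient-checkpointing pattern used in BertEncoder.forward.
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint


class ToyLayer(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.ff = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))

    def forward(self, x):
        return x + self.ff(x)


layers = nn.ModuleList(ToyLayer(32) for _ in range(4))
x = torch.randn(2, 16, 32, requires_grad=True)

hidden = x
for layer in layers:
    # recent PyTorch versions may also want an explicit use_reentrant=... argument
    hidden = checkpoint(layer, hidden)

hidden.sum().backward()
print(x.grad.shape)        # torch.Size([2, 16, 32]) -- gradients still flow despite recomputation
```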
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = BertConfig + load_tf_weights = load_tf_weights_in_bert + base_model_prefix = "bert" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BertEncoder): + module.gradient_checkpointing = value + + +@dataclass +class BertForPreTrainingOutput(ModelOutput): + """ + Output type of [`BertForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +BERT_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BertConfig`]): Model configuration class with all the parameters of the model. 
+ Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, +) +class BertModel(BertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. 
+ """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + kpi_ref = None, # KPI数值替换的位置,以及参考的KPI name,KPI数值,类别 + kpi_model = None, # 输入KPI模型 + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output, en_loss, scalar_list, con_loss, numeric_input, kpi_input = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + kpi_ref=kpi_ref, # KPI数值替换的位置,以及参考的KPI name,KPI数值,类别 + kpi_model=kpi_model, + ) + + # 输入了占位符的位置信息 + # KPI的起始,结束位置embedding 的 pooling + + # 在这里按位置替换数值embedding + # 同时用KPI的 embedding 作为监督信号 + # + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + + # 在这里对数值embedding的位置做回归loss + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ), en_loss, scalar_list, con_loss, numeric_input, kpi_input + + +@add_start_docstrings( + """ + Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. 
+ """, + BERT_START_DOCSTRING, +) +class BertForPreTraining(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + next_sentence_label: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BertForPreTrainingOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence + pair (see `input_ids` docstring) Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (`Dict[str, any]`, optional, defaults to *{}*): + Used to hide legacy arguments that have been deprecated. 
+ + Returns: + + Example: + + ```python + >>> from transformers import BertTokenizer, BertForPreTraining + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + >>> model = BertForPreTraining.from_pretrained("bert-base-uncased") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return BertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top for CLM fine-tuning.""", BERT_START_DOCSTRING +) +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + 
return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]` + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING) +class BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.bert = BertModel(config, add_pooling_layer=False) + + # CZ: 添加了pooling + # self.bert = BertModel(config) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'paris'", + expected_loss=0.88, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + kpi_ref = None, # KPI数值替换的位置,以及参考的KPI name,KPI数值,类别 + kpi_model = None, # 输入KPI模型 + ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs, en_loss, scalar_list, con_loss, numeric_input, kpi_input = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + kpi_ref=kpi_ref, # KPI数值替换的位置,以及参考的KPI name,KPI数值,类别 + kpi_model=kpi_model, + ) + + + # decode 等loss 在这里计算 + # 数值位本来就被maskl不参与loss计算,可以单独算loss + sequence_output = outputs[0] + + # 可能出现没有kpi的情况 + # if kpi_ref is None: + # kpi_loss = None + # else: + # # awl需要 + # kpi_loss = torch.tensor([0.]).cuda() + kpi_loss, kpi_loss_weight, kpi_loss_dict = None, None, None + if kpi_input is not None: + max_len = sequence_output.shape[1] + # 生成数值embedding + kpi_emb_list = [] + for i in range(len(kpi_ref)): + if len(kpi_ref[i])>0: + for item in kpi_ref[i]: + # 可能[NUM]被截断了 + if item[2]>=max_len: + continue + # requires_grad=True + kpi_emb_list.append(sequence_output[i][item[2]]) + + # TODO: 把KPI con loss 归一化,因为KPI会浮动 + kpi_emb = torch.stack(kpi_emb_list).unsqueeze(1) + + # numeric_input: 相关的数值 + # kpi_input:相关的KPI id + _dec_kpi_score, de_loss = kpi_model.decoder(numeric_input, kpi_emb) + # pdb.set_trace() + _cls_kpi, cls_loss = kpi_model.classifier(kpi_emb, kpi_input) + # pdb.set_trace() + # pdb.set_trace() + # 提前乘一个系数降低影响 + if con_loss is not None: + kpi_loss = kpi_model.loss_awl(de_loss, 0.2 * cls_loss, 0.2 * con_loss) + 0.5 * en_loss + kpi_loss_dict = 
{'de_loss':de_loss.item(), 'con_loss':con_loss.item(), 'cls_loss':cls_loss.item(), 'en_loss':en_loss.item()} + else: + kpi_loss = kpi_model.loss_awl(de_loss, 0.1 * cls_loss) + 0.5 * en_loss + kpi_loss_dict = {'de_loss':de_loss.item(), 'cls_loss':cls_loss.item(), 'en_loss':en_loss.item()} + kpi_loss_weight = kpi_model.loss_awl.params.tolist() + + + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token [ignore_index=- 100] + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + # pdb.set_trace() + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions + ), kpi_loss, kpi_loss_weight, kpi_loss_dict + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError("The PAD token should be defined for generation") + + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} + + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top.""", + BERT_START_DOCSTRING, +) +class BertForNextSentencePrediction(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see `input_ids` docstring). Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. 
+ + Returns: + + Example: + + ```python + >>> from transformers import BertTokenizer, BertForNextSentencePrediction + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + >>> model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased") + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt") + + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + ``` + """ + + if "next_sentence_label" in kwargs: + warnings.warn( + "The `next_sentence_label` argument is deprecated and will be removed in a future version, use" + " `labels` instead.", + FutureWarning, + ) + labels = kwargs.pop("next_sentence_label") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores,) + outputs[2:] + return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. 
+ """, + BERT_START_DOCSTRING, +) +class BertForSequenceClassification(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + BERT_START_DOCSTRING, +) +class BertForMultipleChoice(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + BERT_START_DOCSTRING, +) +class BertForTokenClassification(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, + expected_loss=_TOKEN_CLASS_EXPECTED_LOSS, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
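+
+        Example (an added sketch, not part of the original file; the checkpoint name, `num_labels=5` and the
+        all-zero label ids are placeholders):
+
+        ```python
+        >>> from transformers import BertTokenizer, BertForTokenClassification
+        >>> import torch
+
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        >>> model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=5)
+
+        >>> inputs = tokenizer("Alarm cleared on board 7", return_tensors="pt")
+        >>> labels = torch.zeros_like(inputs["input_ids"])  # one label id in [0, num_labels - 1] per token
+        >>> outputs = model(**inputs, labels=labels)
+        >>> # logits have shape (batch_size, sequence_length, num_labels); the loss is token-level cross-entropy
+        >>> outputs.logits.shape[-1] == model.config.num_labels
+        True
+        ```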
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BERT_START_DOCSTRING, +) +class BertForQuestionAnswering(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_QA, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + qa_target_start_index=_QA_TARGET_START_INDEX, + qa_target_end_index=_QA_TARGET_END_INDEX, + expected_output=_QA_EXPECTED_OUTPUT, + expected_loss=_QA_EXPECTED_LOSS, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
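+
+        Example (an added sketch, not part of the original file; it uses the untuned `bert-base-uncased`
+        checkpoint, so the extracted span is not expected to be meaningful):
+
+        ```python
+        >>> from transformers import BertTokenizer, BertForQuestionAnswering
+        >>> import torch
+
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        >>> model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
+
+        >>> question = "Which board reported the alarm?"
+        >>> context = "Board 7 reported a link-down alarm at 09:14."
+        >>> inputs = tokenizer(question, context, return_tensors="pt")
+
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+        >>> start = int(outputs.start_logits.argmax())
+        >>> end = int(outputs.end_logits.argmax())
+        >>> answer = tokenizer.decode(inputs["input_ids"][0, start : end + 1])
+        ```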
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/KTeleBERT/model/bert/tokenization_bert.py b/KTeleBERT/model/bert/tokenization_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..4da4ce3f234d8f41a6e3440741e31ef964a8fb46 --- /dev/null +++ b/KTeleBERT/model/bert/tokenization_bert.py @@ -0,0 +1,574 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes for Bert.""" + + +import collections +import os +import unicodedata +from typing import List, Optional, Tuple + +from transformers.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", + "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", + "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt", + "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt", + "bert-base-multilingual-uncased": ( + "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/vocab.txt" + ), + "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt", + "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt", + "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/vocab.txt", + "bert-large-uncased-whole-word-masking": ( + "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt" + ), + "bert-large-cased-whole-word-masking": ( + "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/vocab.txt" + ), + "bert-large-uncased-whole-word-masking-finetuned-squad": ( + "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt" + ), + "bert-large-cased-whole-word-masking-finetuned-squad": ( + "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt" + ), + "bert-base-cased-finetuned-mrpc": ( + "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt" + ), + "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/vocab.txt", + "bert-base-german-dbmdz-uncased": ( + "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt" + ), + "TurkuNLP/bert-base-finnish-cased-v1": ( + "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt" + ), + "TurkuNLP/bert-base-finnish-uncased-v1": ( + "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/vocab.txt" + ), + "wietsedv/bert-base-dutch-cased": ( + "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/vocab.txt" + ), + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "bert-base-uncased": 512, + "bert-large-uncased": 512, + "bert-base-cased": 512, + "bert-large-cased": 512, + "bert-base-multilingual-uncased": 512, + "bert-base-multilingual-cased": 512, + "bert-base-chinese": 512, + "bert-base-german-cased": 512, + "bert-large-uncased-whole-word-masking": 512, + "bert-large-cased-whole-word-masking": 512, + "bert-large-uncased-whole-word-masking-finetuned-squad": 512, + "bert-large-cased-whole-word-masking-finetuned-squad": 512, + "bert-base-cased-finetuned-mrpc": 512, + "bert-base-german-dbmdz-cased": 512, + "bert-base-german-dbmdz-uncased": 512, + "TurkuNLP/bert-base-finnish-cased-v1": 512, + "TurkuNLP/bert-base-finnish-uncased-v1": 512, + "wietsedv/bert-base-dutch-cased": 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + "bert-base-uncased": {"do_lower_case": 
True}, + "bert-large-uncased": {"do_lower_case": True}, + "bert-base-cased": {"do_lower_case": False}, + "bert-large-cased": {"do_lower_case": False}, + "bert-base-multilingual-uncased": {"do_lower_case": True}, + "bert-base-multilingual-cased": {"do_lower_case": False}, + "bert-base-chinese": {"do_lower_case": False}, + "bert-base-german-cased": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False}, + "bert-base-cased-finetuned-mrpc": {"do_lower_case": False}, + "bert-base-german-dbmdz-cased": {"do_lower_case": False}, + "bert-base-german-dbmdz-uncased": {"do_lower_case": True}, + "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False}, + "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True}, + "wietsedv/bert-base-dutch-cased": {"do_lower_case": False}, +} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertTokenizer(PreTrainedTokenizer): + r""" + Construct a BERT tokenizer. Based on WordPiece. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. 
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. 
A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence + pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" 
+ ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +class BasicTokenizer(object): + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents: (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + """ + + def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see + WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
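+        # Added illustration (not from the upstream comment): _tokenize_chinese_chars pads every CJK
+        # character with spaces, so a mixed string such as "告警ALM-1234 occurred" is whitespace-tokenized
+        # below into ["告", "警", "ALM-1234", "occurred"] before lower-casing and punctuation splitting.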
+ if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. 
This should have + already been passed through *BasicTokenizer*. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens diff --git a/KTeleBERT/requirements.txt b/KTeleBERT/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..65af691759d00c75fb7fba2b70deff840fec5648 --- /dev/null +++ b/KTeleBERT/requirements.txt @@ -0,0 +1,10 @@ +transformers==4.12.2 +tqdm +torch +ltp +ltp-core +ltp-extension +cycle +torch>=1.10.0 +easydict +re diff --git a/KTeleBERT/run.sh b/KTeleBERT/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..4e34ff6723aeb3d277d33da31cdfe986f8bf765b --- /dev/null +++ b/KTeleBERT/run.sh @@ -0,0 +1,35 @@ +python -m torch.distributed.launch --nproc_per_node=4 main.py --LLRD 1 \ + --eval_step 10 \ + --save_model 1 \ + --mask_stratege wwm \ + --batch_size 64 \ + --batch_size_ke 64 \ + --exp_name Fine_tune_2 \ + --exp_id v01 \ + --workers 8 \ + --use_NumEmb 1 \ + --seq_data_name Seq_data_RuAlmEntKpiTbwDoc \ + --maxlength 256 \ + --lr 4e-5 \ + --ke_lr 8e-5 \ + --train_strategy 2 \ + --model_name TeleBert2 \ + --train_ratio 1 \ + --save_pretrain 0 \ + --dist 1 \ + --accumulation_steps 8 \ + --accumulation_steps_ke 6 \ + --special_token_mask 0 \ + --freeze_layer 0 \ + --ernie_stratege -1 \ + --mlm_probability_increase curve \ + --use_kpi_loss 1 \ + --mlm_probability 0.4 \ + --use_awl 1 \ + --cls_head_init 1 \ + --emb_init 0 \ + --final_mlm_probability 0.4 \ + --ke_dim 256 \ + --plm_emb_type cls \ + --train_together 0 \ + diff --git a/KTeleBERT/run_get_ref.sh b/KTeleBERT/run_get_ref.sh new file mode 100644 index 0000000000000000000000000000000000000000..ce64fd9460ce121c3214f4aed5e5ba2e34d156c4 --- /dev/null +++ b/KTeleBERT/run_get_ref.sh @@ -0,0 +1,22 @@ +python get_chinese_ref.py --batch_size 50 \ + --deal_numeric 1 \ + --seq_data_name Seq_data_large \ + --read_cws 0 \ + # --seq_data_name Seq_data_base \ + # --read_cws 1 \ + +# python get_chinese_ref.py --batch_size 150 + +# python get_chinese_ref.py --batch_size 200 + +# python get_chinese_ref.py --batch_size 250 + +# python get_chinese_ref.py --batch_size 300 + +# python main.py --LLRD 1 \ +# --eval_step 10 \ +# --epoch 20 \ +# --save_model 1 \ +# --mask_stratege wwm \ +# --batch_size 50 \ +# --use_NumEmb 1 \ \ No newline at end of file diff --git a/KTeleBERT/special_token_pre_emb.py b/KTeleBERT/special_token_pre_emb.py new file mode 100644 index 0000000000000000000000000000000000000000..c149d3804d977c3bb5cbe005c5a11633961b7e85 --- /dev/null +++ b/KTeleBERT/special_token_pre_emb.py @@ -0,0 +1,119 @@ +from src.utils import add_special_token +import os.path as osp +import numpy as np +import random +import torch +from easydict import EasyDict as edict +import argparse +import pdb +import json +from model import BertTokenizer +from collections import Counter +from tqdm import tqdm +from time import time +from numpy 
import mean +import math + +from transformers import BertModel + + +class cfg(): + def __init__(self): + self.this_dir = osp.dirname(__file__) + # change + self.data_root = osp.abspath(osp.join(self.this_dir, '..', '..', 'data', '')) + + def get_args(self): + parser = argparse.ArgumentParser() + # seq_data_name = "Seq_data_tiny_831" + parser.add_argument("--data_path", default="huawei", type=str, help="Experiment path") + parser.add_argument("--update_model_name", default='MacBert', type=str, help="MacBert") + parser.add_argument("--pretrained_model_name", default='TeleBert', type=str, help="TeleBert") + parser.add_argument("--read_cws", default=0, type=int, help="是否需要读训练好的cws文件") + self.cfg = parser.parse_args() + + def update_train_configs(self): + # TODO: update some dynamic variable + self.cfg.data_root = self.data_root + self.cfg.data_path = osp.join(self.data_root, self.cfg.data_path) + + return self.cfg + + +if __name__ == '__main__': + ''' + 功能: 得到 chinese ref 文件,同时刷新训练/测试文件(仅针对序列的文本数据) + ''' + cfg = cfg() + cfg.get_args() + cfgs = cfg.update_train_configs() + + # 用来被更新的,需要添加token的tokenizer + path = osp.join(cfgs.data_root, 'transformer', cfgs.update_model_name) + assert osp.exists(path) + tokenizer = BertTokenizer.from_pretrained(path, do_lower_case=True) + tokenizer, special_token, norm_token = add_special_token(tokenizer) + added_vocab = tokenizer.get_added_vocab() + vocb_path = osp.join(cfgs.data_path, 'added_vocab.json') + + with open(vocb_path, 'w') as fp: + json.dump(added_vocab, fp, ensure_ascii=False) + + vocb_description = osp.join(cfgs.data_path, 'vocab_descrip.json') + vocb_descrip = None + + vocb_descrip = { + "alm": "alarm", + "ran": "ran 无线接入网", + "mml": "MML 人机语言命令", + "nf": "NF 独立网络服务", + "apn": "APN 接入点名称", + "pgw": "PGW 数据管理子系统模块", + "lst": "LST 查询命令", + "qos": "QoS 定制服务质量", + "ipv": "IPV 互联网通讯协议版本", + "ims": "IMS IP多模态子系统", + "gtp": "GTP GPRS隧道协议", + "pdp": "PDP 分组数据协议", + "hss": "HSS HTTP Smooth Stream", + "[ALM]": "alarm 告警 标记", + "[KPI]": "kpi 关键性能指标 标记", + "[LOC]": "location 事件发生位置 标记", + "[EOS]": "end of the sentence 文档结尾 标记", + "[ENT]": "实体标记", + "[ATTR]": "属性标记", + "[NUM]": "数值标记", + "[REL]": "关系标记", + "[DOC]": "文档标记" + } + + # if osp.exists(vocb_description): + # with open(vocb_description, 'r') as fp: + # vocb_descrip = json.load(added_vocab) + + # 用来进行embedding的模型 + path = osp.join(cfgs.data_root, 'transformer', cfgs.pretrained_model_name) + assert osp.exists(path) + pre_tokenizer = BertTokenizer.from_pretrained(path, do_lower_case=True) + model = BertModel.from_pretrained(path) + + print("use the vocb_description") + key_to_emb = {} + for key in added_vocab.keys(): + if vocb_description is not None: + if key in vocb_description: + # 一部分需要描述 + key_tokens = pre_tokenizer(vocb_description[key], return_tensors='pt') + else: + key_tokens = pre_tokenizer(key, return_tensors='pt') + else: + key_tokens = pre_tokenizer(key, return_tensors='pt') + + hidden_state = model(**key_tokens, output_hidden_states=True).hidden_states + pdb.set_trace() + key_to_emb[key] = hidden_state[-1][:, 1:-1, :].mean(dim=1) + + emb_path = osp.join(cfgs.data_path, 'added_vocab_embedding.pt') + + torch.save(key_to_emb, emb_path) + print(f'save to {emb_path}') diff --git a/KTeleBERT/src/__init__.py b/KTeleBERT/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/KTeleBERT/src/__init__.py @@ -0,0 +1 @@ + diff --git a/KTeleBERT/src/__pycache__/__init__.cpython-38.pyc 
b/KTeleBERT/src/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d080e6ead7ca4ae6ce9dbeb5878d73b25a109c06 Binary files /dev/null and b/KTeleBERT/src/__pycache__/__init__.cpython-38.pyc differ diff --git a/KTeleBERT/src/__pycache__/data.cpython-38.pyc b/KTeleBERT/src/__pycache__/data.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6db541ae48026f927cb9b3c488a331b74f940999 Binary files /dev/null and b/KTeleBERT/src/__pycache__/data.cpython-38.pyc differ diff --git a/KTeleBERT/src/__pycache__/distributed_utils.cpython-38.pyc b/KTeleBERT/src/__pycache__/distributed_utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57d4f87a8b2c1c70792eace612b5b1a47f864e7e Binary files /dev/null and b/KTeleBERT/src/__pycache__/distributed_utils.cpython-38.pyc differ diff --git a/KTeleBERT/src/__pycache__/utils.cpython-38.pyc b/KTeleBERT/src/__pycache__/utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c9f69aaeb4d58e64d8f942a5d5a97a71a2b6b60 Binary files /dev/null and b/KTeleBERT/src/__pycache__/utils.cpython-38.pyc differ diff --git a/KTeleBERT/src/data.py b/KTeleBERT/src/data.py new file mode 100644 index 0000000000000000000000000000000000000000..40abacf132e4b4c45f458ce3d140c0361f86ee9a --- /dev/null +++ b/KTeleBERT/src/data.py @@ -0,0 +1,651 @@ +import torch +import random +import json +import numpy as np +import pdb +import os.path as osp +from model import BertTokenizer +import torch.distributed as dist + + +class SeqDataset(torch.utils.data.Dataset): + def __init__(self, data, chi_ref=None, kpi_ref=None): + self.data = data + self.chi_ref = chi_ref + self.kpi_ref = kpi_ref + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + sample = self.data[index] + if self.chi_ref is not None: + chi_ref = self.chi_ref[index] + else: + chi_ref = None + + if self.kpi_ref is not None: + kpi_ref = self.kpi_ref[index] + else: + kpi_ref = None + + return sample, chi_ref, kpi_ref + + +class OrderDataset(torch.utils.data.Dataset): + def __init__(self, data, kpi_ref=None): + self.data = data + self.kpi_ref = kpi_ref + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + sample = self.data[index] + if self.kpi_ref is not None: + kpi_ref = self.kpi_ref[index] + else: + kpi_ref = None + + return sample, kpi_ref + + +class KGDataset(torch.utils.data.Dataset): + def __init__(self, data): + self.data = data + self.len = len(self.data) + + def __len__(self): + return self.len + + def __getitem__(self, index): + + sample = self.data[index] + return sample + +# TODO: 重构 DataCollatorForLanguageModeling + + +class Collator_base(object): + # TODO: 定义 collator,模仿Lako + # 完成mask,padding + def __init__(self, args, tokenizer, special_token=None): + self.tokenizer = tokenizer + if special_token is None: + self.special_token = ['[SEP]', '[MASK]', '[ALM]', '[KPI]', '[CLS]', '[LOC]', '[EOS]', '[ENT]', '[ATTR]', '[NUM]', '[REL]', '|', '[DOC]'] + else: + self.special_token = special_token + + self.text_maxlength = args.maxlength + self.mlm_probability = args.mlm_probability + self.args = args + if self.args.special_token_mask: + self.special_token = ['|', '[NUM]'] + + if not self.args.only_test and self.args.use_mlm_task: + if args.mask_stratege == 'rand': + self.mask_func = self.torch_mask_tokens + else: + if args.mask_stratege == 'wwm': + # 必须使用special_word, 因为这里的wwm基于分词 + if args.rank == 0: + print("use word-level Mask 
...") + assert args.add_special_word == 1 + self.mask_func = self.wwm_mask_tokens + else: # domain + if args.rank == 0: + print("use token-level Mask ...") + self.mask_func = self.domain_mask_tokens + + def __call__(self, batch): + # 把 batch 中的数值提取出,用specail token 替换 + # 把数值信息,以及数值的位置信息单独通过list传进去 + # 后面训练的阶段直接把数值插入embedding的位置 + # 数值不参与 mask + # wwm的时候可以把chinese ref 随batch一起输入 + kpi_ref = None + if self.args.use_NumEmb: + kpi_ref = [item[2] for item in batch] + # if self.args.mask_stratege != 'rand': + chinese_ref = [item[1] for item in batch] + batch = [item[0] for item in batch] + # 此时batch不止有字符串 + batch = self.tokenizer.batch_encode_plus( + batch, + padding='max_length', + max_length=self.text_maxlength, + truncation=True, + return_tensors="pt", + return_token_type_ids=False, + return_attention_mask=True, + add_special_tokens=False + ) + special_tokens_mask = batch.pop("special_tokens_mask", None) + # self.torch_mask_tokens + + # if batch["input_ids"].shape[1] != 128: + # pdb.set_trace() + if chinese_ref is not None: + batch["chinese_ref"] = chinese_ref + if kpi_ref is not None: + batch["kpi_ref"] = kpi_ref + + # 训练需要 mask + + if not self.args.only_test and self.args.use_mlm_task: + batch["input_ids"], batch["labels"] = self.mask_func( + batch, special_tokens_mask=special_tokens_mask + ) + else: + # 非训练状态 + # 且不用MLM进行训练 + labels = batch["input_ids"].clone() + if self.tokenizer.pad_token_id is not None: + labels[labels == self.tokenizer.pad_token_id] = -100 + batch["labels"] = labels + + return batch + + def torch_mask_tokens(self, inputs, special_tokens_mask=None): + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. + """ + if "input_ids" in inputs: + inputs = inputs["input_ids"] + labels = inputs.clone() + # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`) + probability_matrix = torch.full(labels.shape, self.mlm_probability) + if special_tokens_mask is None: + special_tokens_mask = [ + self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + ] + special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool) + else: + special_tokens_mask = special_tokens_mask.bool() + # pdb.set_trace() + + probability_matrix.masked_fill_(special_tokens_mask, value=0.0) + masked_indices = torch.bernoulli(probability_matrix).bool() + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices + inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) + + # 10% of the time, we replace masked input tokens with random word + indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced + random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) + inputs[indices_random] = random_words[indices_random] + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels + + def wwm_mask_tokens(self, inputs, special_tokens_mask=None): + mask_labels = [] + ref_tokens = inputs["chinese_ref"] + input_ids = inputs["input_ids"] + sz = len(input_ids) + + # 把input id 先恢复到token + for i in range(sz): + # 这里的主体是读入的ref,但是可能存在max_len不统一的情况 + mask_labels.append(self._whole_word_mask(ref_tokens[i])) + + batch_mask = 
_torch_collate_batch(mask_labels, self.tokenizer, self.text_maxlength, pad_to_multiple_of=None) + inputs, labels = self.torch_mask_tokens_4wwm(input_ids, batch_mask) + return inputs, labels + + # input_tokens: List[str] + def _whole_word_mask(self, input_tokens, max_predictions=512): + """ + Get 0/1 labels for masked tokens with whole word mask proxy + """ + assert isinstance(self.tokenizer, (BertTokenizer)) + # 输入是 [..., ..., ..., ...] 格式 + cand_indexes = [] + cand_token = [] + + for i, token in enumerate(input_tokens): + if i >= self.text_maxlength - 1: + # 不能超过最大值,截断一下 + break + if token.lower() in self.special_token: + # special token 的词不应该被mask + continue + if len(cand_indexes) >= 1 and token.startswith("##"): + cand_indexes[-1].append(i) + cand_token.append(i) + else: + cand_indexes.append([i]) + cand_token.append(i) + + random.shuffle(cand_indexes) + # 原来是:input_tokens + # 但是这里的特殊token很多,因此提前去掉了特殊token + # 这里的15%是去掉了特殊token的15%。+2的原因是把CLS SEP两个 flag的长度加上 + num_to_predict = min(max_predictions, max(1, int(round((len(cand_token) + 2) * self.mlm_probability)))) + masked_lms = [] + covered_indexes = set() + for index_set in cand_indexes: + # 到达长度了结束 + if len(masked_lms) >= num_to_predict: + break + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. + # 不能让其长度大于15%,最多等于 + if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + # 不考虑重叠的token进行mask + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + masked_lms.append(index) + + if len(covered_indexes) != len(masked_lms): + # 一般不会出现,因为过程中避免重复了 + raise ValueError("Length of covered_indexes is not equal to length of masked_lms.") + # 不能超过最大值,截断 + mask_labels = [1 if i in covered_indexes else 0 for i in range(min(len(input_tokens), self.text_maxlength))] + + return mask_labels + + # 确定这里面需要mask的:置0/1 + + # 调用 self.torch_mask_tokens + + # + pass + + def torch_mask_tokens_4wwm(self, inputs, mask_labels): + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set + 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref. + """ + # if "input_ids" in inputs: + # inputs = inputs["input_ids"] + if self.tokenizer.mask_token is None: + raise ValueError( + "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the" + " --mlm flag if you want to use this tokenizer." 
+ ) + labels = inputs.clone() + # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) + + probability_matrix = mask_labels + + special_tokens_mask = [self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()] + + if len(special_tokens_mask[0]) != probability_matrix.shape[1]: + print(f"len(special_tokens_mask[0]): {len(special_tokens_mask[0])}") + print(f"probability_matrix.shape[1]): {probability_matrix.shape[1]}") + print(f'max len {self.text_maxlength}') + print(f"pad_token_id: {self.tokenizer.pad_token_id}") + # if self.args.rank != in_rank: + if self.args.dist: + dist.barrier() + pdb.set_trace() + else: + pdb.set_trace() + + probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) + if self.tokenizer._pad_token is not None: + padding_mask = labels.eq(self.tokenizer.pad_token_id) + probability_matrix.masked_fill_(padding_mask, value=0.0) + + masked_indices = probability_matrix.bool() + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + # 这里的wwm,每次 mask/替换/不变的时候单位不是一体的,会拆开 + # 其实不太合理,但是也没办法 + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices + inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) + + # 10% of the time, we replace masked input tokens with random word + indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced + random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) + inputs[indices_random] = random_words[indices_random] + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels + + # TODO: 按区域cell 进行mask + + def domain_mask_tokens(self, inputs, special_tokens_mask=None): + pass + + +class Collator_kg(object): + # TODO: 定义 collator,模仿Lako + # 完成 随机减少一部分属性 + def __init__(self, args, tokenizer, data): + self.tokenizer = tokenizer + self.text_maxlength = args.maxlength + self.cross_sampling_flag = 0 + # ke 的bs 是正常bs的四分之一 + self.neg_num = args.neg_num + # 负样本不能在全集中 + self.data = data + self.args = args + + def __call__(self, batch): + # 先编码成可token形式避免重复编码 + outputs = self.sampling(batch) + + return outputs + + def sampling(self, data): + """Filtering out positive samples and selecting some samples randomly as negative samples. + + Args: + data: The triples used to be sampled. + + Returns: + batch_data: The training data. 
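+
+        Example:
+            Illustrative placeholders only (not real KG entries): with
+            ``data = [["NF-A", "connects", "NF-B"], ["NF-C", "reports", "ALM-1"]]`` and ``neg_num = 1``
+            in tail-batch mode, a valid negative tail for the first triple is ``"ALM-1"`` because
+            ``["NF-A", "connects", "ALM-1"]`` does not occur in ``self.data``. Head-batch and
+            tail-batch modes alternate between calls via ``self.cross_sampling_flag``.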
+ """ + batch_data = {} + neg_ent_sample = [] + + self.cross_sampling_flag = 1 - self.cross_sampling_flag + + head_list = [] + rel_list = [] + tail_list = [] + # pdb.set_trace() + if self.cross_sampling_flag == 0: + batch_data['mode'] = "head-batch" + for index, (head, relation, tail) in enumerate(data): + # in batch negative + neg_head = self.find_neghead(data, index, relation, tail) + neg_ent_sample.extend(random.sample(neg_head, self.neg_num)) + head_list.append(head) + rel_list.append(relation) + tail_list.append(tail) + else: + batch_data['mode'] = "tail-batch" + for index, (head, relation, tail) in enumerate(data): + neg_tail = self.find_negtail(data, index, relation, head) + neg_ent_sample.extend(random.sample(neg_tail, self.neg_num)) + + head_list.append(head) + rel_list.append(relation) + tail_list.append(tail) + + neg_ent_batch = self.batch_tokenizer(neg_ent_sample) + head_batch = self.batch_tokenizer(head_list) + rel_batch = self.batch_tokenizer(rel_list) + tail_batch = self.batch_tokenizer(tail_list) + + ent_list = head_list + rel_list + tail_list + ent_dict = {k: v for v, k in enumerate(ent_list)} + # 用来索引负样本 + neg_index = torch.tensor([ent_dict[i] for i in neg_ent_sample]) + # pos_head_index = torch.tensor(list(range(len(head_list))) + + batch_data["positive_sample"] = (head_batch, rel_batch, tail_batch) + batch_data['negative_sample'] = neg_ent_batch + batch_data['neg_index'] = neg_index + return batch_data + + def batch_tokenizer(self, input_list): + return self.tokenizer.batch_encode_plus( + input_list, + padding='max_length', + max_length=self.text_maxlength, + truncation=True, + return_tensors="pt", + return_token_type_ids=False, + return_attention_mask=True, + add_special_tokens=False + ) + + def find_neghead(self, data, index, rel, ta): + head_list = [] + for i, (head, relation, tail) in enumerate(data): + # 负样本不能被包含 + if i != index and [head, rel, ta] not in self.data: + head_list.append(head) + # 可能存在负样本不够的情况 + # 自补齐 + while len(head_list) < self.neg_num: + head_list.extend(random.sample(head_list, min(self.neg_num - len(head_list), len(head_list)))) + + return head_list + + def find_negtail(self, data, index, rel, he): + tail_list = [] + for i, (head, relation, tail) in enumerate(data): + if i != index and [he, rel, tail] not in self.data: + tail_list.append(tail) + # 可能存在负样本不够的情况 + # 自补齐 + while len(tail_list) < self.neg_num: + tail_list.extend(random.sample(tail_list, min(self.neg_num - len(tail_list), len(tail_list)))) + return tail_list + +# 载入mask loss部分的数据 + + +def load_data(logger, args): + + data_path = args.data_path + + data_name = args.seq_data_name + with open(osp.join(data_path, f'{data_name}_cws.json'), "r") as fp: + data = json.load(fp) + if args.rank == 0: + logger.info(f"[Start] Loading Seq dataset: [{len(data)}]...") + random.shuffle(data) + + # data = data[:10000] + # pdb.set_trace() + train_test_split = int(args.train_ratio * len(data)) + # random.shuffle(x) + # 训练/测试期间不应该打乱 + train_data = data[0: train_test_split] + test_data = data[train_test_split: len(data)] + + # 测试的时候也可能用到其实 not args.only_test + if args.use_mlm_task: + # if args.mask_stratege != 'rand': + # 读领域词汇 + if args.rank == 0: + print("using the domain words .....") + domain_file_path = osp.join(args.data_path, f'{data_name}_chinese_ref.json') + with open(domain_file_path, 'r') as f: + chinese_ref = json.load(f) + # train_test_split=len(data) + chi_ref_train = chinese_ref[:train_test_split] + chi_ref_eval = chinese_ref[train_test_split:] + else: + chi_ref_train = None + chi_ref_eval 
= None + + if args.use_NumEmb: + if args.rank == 0: + print("using the kpi and num .....") + + kpi_file_path = osp.join(args.data_path, f'{data_name}_kpi_ref.json') + with open(kpi_file_path, 'r') as f: + kpi_ref = json.load(f) + kpi_ref_train = kpi_ref[:train_test_split] + kpi_ref_eval = kpi_ref[train_test_split:] + else: + # num_ref_train = None + # num_ref_eval = None + kpi_ref_train = None + kpi_ref_eval = None + + # pdb.set_trace() + test_set = None + train_set = SeqDataset(train_data, chi_ref=chi_ref_train, kpi_ref=kpi_ref_train) + if len(test_data) > 0: + test_set = SeqDataset(test_data, chi_ref=chi_ref_eval, kpi_ref=kpi_ref_eval) + if args.rank == 0: + logger.info("[End] Loading Seq dataset...") + return train_set, test_set, train_test_split + +# 载入triple loss部分的数据 + + +def load_data_kg(logger, args): + data_path = args.data_path + if args.rank == 0: + logger.info("[Start] Loading KG dataset...") + # # 三元组 + # with open(osp.join(data_path, '5GC_KB/database_triples_831.json'), "r") as f: + # data = json.load(f) + # random.shuffle(data) + + # # # TODO: triple loss这一块还没有测试集 + # train_data = data[0:int(len(data)/args.batch_size)*args.batch_size] + + # with open(osp.join(data_path, 'KG_data_tiny_831.json'),"w") as fp: + # json.dump(data[:1000], fp) + kg_data_name = args.kg_data_name + with open(osp.join(data_path, f'{kg_data_name}.json'), "r") as fp: + train_data = json.load(fp) + # pdb.set_trace() + # 124169 + # 128482 + # train_data = train_data[:124168] + # train_data = train_data[:1000] + train_set = KGDataset(train_data) + if args.rank == 0: + logger.info("[End] Loading KG dataset...") + return train_set, train_data + + +def _torch_collate_batch(examples, tokenizer, max_length=None, pad_to_multiple_of=None): + """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary.""" + import numpy as np + import torch + + # Tensorize if necessary. + if isinstance(examples[0], (list, tuple, np.ndarray)): + examples = [torch.tensor(e, dtype=torch.long) for e in examples] + + length_of_first = examples[0].size(0) + + # Check if padding is necessary. + + # are_tensors_same_length = all(x.size(0) == length_of_first for x in examples) + # if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0): + # return torch.stack(examples, dim=0) + + # If yes, check if we have a `pad_token`. + if tokenizer._pad_token is None: + raise ValueError( + "You are attempting to pad samples but the tokenizer you are using" + f" ({tokenizer.__class__.__name__}) does not have a pad token." + ) + + # Creating the full tensor and filling it with our data. 
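+    # Added illustration: with max_length=8, mask-label lists of lengths 5 and 3 become a (2, 8) long
+    # tensor pre-filled with tokenizer.pad_token_id; each example is written at the start of its row when
+    # tokenizer.padding_side == "right", otherwise at the end.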
+ + if max_length is None: + pdb.set_trace() + max_length = max(x.size(0) for x in examples) + + if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id) + for i, example in enumerate(examples): + if tokenizer.padding_side == "right": + result[i, : example.shape[0]] = example + else: + result[i, -example.shape[0]:] = example + + return result + + +def load_order_data(logger, args): + if args.rank == 0: + logger.info("[Start] Loading Order dataset...") + + data_path = args.data_path + if len(args.order_test_name) > 0: + data_name = args.order_test_name + else: + data_name = args.order_data_name + tmp = osp.join(data_path, f'{data_name}.json') + if osp.exists(tmp): + dp = tmp + else: + dp = osp.join(data_path, 'downstream_task', f'{data_name}.json') + assert osp.exists(dp) + with open(dp, "r") as fp: + data = json.load(fp) + # data = data[:2000] + # pdb.set_trace() + train_test_split = int(args.train_ratio * len(data)) + + mid_split = int(train_test_split / 2) + mid = int(len(data) / 2) + # random.shuffle(x) + # 训练/测试期间不应该打乱 + # train_data = data[0: train_test_split] + # test_data = data[train_test_split: len(data)] + + # test_data = data[0: train_test_split] + # train_data = data[train_test_split: len(data)] + + # 特殊分类 默认前一半和后一半对称 + test_data = data[0: mid_split] + data[mid: mid + mid_split] + train_data = data[mid_split: mid] + data[mid + mid_split: len(data)] + + # pdb.set_trace() + test_set = None + train_set = OrderDataset(train_data) + if len(test_data) > 0: + test_set = OrderDataset(test_data) + if args.rank == 0: + logger.info("[End] Loading Order dataset...") + return train_set, test_set, train_test_split + + +class Collator_order(object): + # 输入一个batch的数据,合并order后面再解耦 + def __init__(self, args, tokenizer): + self.tokenizer = tokenizer + self.text_maxlength = args.maxlength + self.args = args + # 每一个pair中包含的数据数量 + self.order_num = args.order_num + self.p_label, self.n_label = smooth_BCE(args.eps) + + def __call__(self, batch): + # 输入数据按顺序堆叠, 间隔拆分 + # + # 编码然后输出 + output = [] + for item in range(self.order_num): + output.extend([dat[0][0][item] for dat in batch]) + # label smoothing + + labels = [1 if dat[0][1][0] == 2 else self.p_label if dat[0][1][0] == 1 else self.n_label for dat in batch] + batch = self.tokenizer.batch_encode_plus( + output, + padding='max_length', + max_length=self.text_maxlength, + truncation=True, + return_tensors="pt", + return_token_type_ids=False, + return_attention_mask=True, + add_special_tokens=False + ) + # torch.tensor() + return batch, torch.FloatTensor(labels) + + +def smooth_BCE(eps=0.1): # eps 平滑系数 [0, 1] => [0.95, 0.05] + # return positive, negative label smoothing BCE targets + # positive label= y_true * (1.0 - label_smoothing) + 0.5 * label_smoothing + # y_true=1 label_smoothing=eps=0.1 + return 1.0 - 0.5 * eps, 0.5 * eps diff --git a/KTeleBERT/src/distributed_utils.py b/KTeleBERT/src/distributed_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..aa25d4732f7bbe299301ea429d85fc7fbcb2dcad --- /dev/null +++ b/KTeleBERT/src/distributed_utils.py @@ -0,0 +1,79 @@ +import os + +import torch +import torch.distributed as dist +import pdb + + +def dist_pdb(rank, in_rank=0): + if rank != in_rank: + dist.barrier() + else: + pdb.set_trace() + dist.barrier() + + +def init_distributed_mode(args): + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: 
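+ # RANK / WORLD_SIZE / LOCAL_RANK are exported automatically by torchrun (or torch.distributed.launch)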
+ args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.gpu = int(os.environ['LOCAL_RANK']) + elif 'SLURM_PROCID' in os.environ: + args.rank = int(os.environ['SLURM_PROCID']) + args.gpu = args.rank % torch.cuda.device_count() + else: + print('Not using distributed mode') + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' # 通信后端,nvidia GPU推荐使用NCCL + print('| distributed init (rank {}): {}'.format( + args.rank, args.dist_url), flush=True) + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + dist.barrier() + + +def cleanup(): + dist.destroy_process_group() + + +def is_dist_avail_and_initialized(): + """检查是否支持分布式环境""" + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def reduce_value(value, average=True): + world_size = get_world_size() + if world_size < 2: # 单GPU的情况 + return value + + with torch.no_grad(): + dist.all_reduce(value) + if average: + value /= world_size + + return value diff --git a/KTeleBERT/src/utils.py b/KTeleBERT/src/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9b7d9a95d425a0901b087e71650bead4e0b073b0 --- /dev/null +++ b/KTeleBERT/src/utils.py @@ -0,0 +1,374 @@ + +import os +import errno +import torch +import sys +import logging +import json +from pathlib import Path +import torch.distributed as dist +import csv +import os.path as osp +from time import time +from numpy import mean +import re +from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup +import pdb +from torch import nn + + + +# Huggingface的实现中,自带多种warmup策略 +def set_optim(opt, model_list, freeze_part=[], accumulation_step=None): + # Bert optim + optimizer_list, scheduler_list, named_parameters = [], [], [] + # cur_model = model.module if hasattr(model, 'module') else model + for model in model_list: + model_para = list(model.named_parameters()) + model_para_train, freeze_layer = [], [] + for n, p in model_para: + if not any(nd in n for nd in freeze_part): + model_para_train.append((n, p)) + else: + p.requires_grad = False + freeze_layer.append((n, p)) + named_parameters.extend(model_para_train) + + # for name, param in model_list[0].named_parameters(): + # if not param.requires_grad: + # print(name, param.size()) + + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + # numeric_model 也包括到这个部分中 + ke_part = ['ke_model', 'loss_awl', 'numeric_model', 'order'] + if opt.LLRD: + # 按层次衰减的学习率 + all_name_orig = [n for n, p in named_parameters if not any(nd in n for nd in ke_part)] + + opt_parameters, all_name = LLRD(opt, named_parameters, no_decay, ke_part) + remain = list(set(all_name_orig) - set(all_name)) + remain_parameters = [ + {'params': [p for n, p in named_parameters if not any(nd in n for nd in no_decay) and n in remain], "lr": opt.lr, 'weight_decay': opt.weight_decay}, + {'params': [p for n, p in named_parameters if any(nd in n for nd in no_decay) and n in remain], "lr": opt.lr, 'weight_decay': 0.0} + ] + opt_parameters.extend(remain_parameters) + else: + opt_parameters = [ + {'params': [p for n, p in named_parameters if not 
any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)], "lr": opt.lr, 'weight_decay': opt.weight_decay}, + {'params': [p for n, p in named_parameters if any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)], "lr": opt.lr, 'weight_decay': 0.0} + ] + + ke_parameters = [ + {'params': [p for n, p in named_parameters if not any(nd in n for nd in no_decay) and any(nd in n for nd in ke_part)], "lr": opt.ke_lr, 'weight_decay': opt.weight_decay}, + {'params': [p for n, p in named_parameters if any(nd in n for nd in no_decay) and any(nd in n for nd in ke_part)], "lr": opt.ke_lr, 'weight_decay': 0.0} + ] + opt_parameters.extend(ke_parameters) + optimizer = AdamW(opt_parameters, lr=opt.lr, eps=opt.adam_epsilon) + if accumulation_step is None: + accumulation_step = opt.accumulation_steps + if opt.scheduler == 'linear': + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(opt.warmup_steps/accumulation_step), num_training_steps=int(opt.total_steps/accumulation_step)) + else: + scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(opt.warmup_steps/accumulation_step), num_training_steps=int(opt.total_steps/accumulation_step)) + + # ---- 判定所有参数是否被全部优化 ---- + all_para_num = 0 + for paras in opt_parameters: + all_para_num += len(paras['params']) + # pdb.set_trace() + assert len(named_parameters) == all_para_num + return optimizer, scheduler + +# LLRD 学习率逐层衰减但 + +def LLRD(opt, named_parameters, no_decay, ke_part =[]): + opt_parameters = [] + all_name = [] + head_lr = opt.lr * 1.05 + init_lr = opt.lr + lr = init_lr + + # === Pooler and regressor ====================================================== + params_0 = [p for n,p in named_parameters if ("pooler" in n or "regressor" in n or "predictions" in n) + and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + params_1 = [p for n,p in named_parameters if ("pooler" in n or "regressor" in n or "predictions" in n) + and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + + name_0 = [n for n,p in named_parameters if ("pooler" in n or "regressor" in n or "predictions" in n) + and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + name_1 = [n for n,p in named_parameters if ("pooler" in n or "regressor" in n or "predictions" in n) + and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + + all_name.extend(name_0) + all_name.extend(name_1) + + head_params = {"params": params_0, "lr": head_lr, "weight_decay": 0.0} + opt_parameters.append(head_params) + + head_params = {"params": params_1, "lr": head_lr, "weight_decay": 0.01} + opt_parameters.append(head_params) + + # === 12 Hidden layers ========================================================== + for layer in range(11,-1,-1): + params_0 = [p for n,p in named_parameters if f"encoder.layer.{layer}." in n + and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + params_1 = [p for n,p in named_parameters if f"encoder.layer.{layer}." in n + and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + + layer_params = {"params": params_0, "lr": lr, "weight_decay": 0.0} + opt_parameters.append(layer_params) + + layer_params = {"params": params_1, "lr": lr, "weight_decay": 0.01} + opt_parameters.append(layer_params) + + name_0 = [n for n,p in named_parameters if f"encoder.layer.{layer}." 
in n + and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + name_1 = [n for n,p in named_parameters if f"encoder.layer.{layer}." in n + and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + all_name.extend(name_0) + all_name.extend(name_1) + + lr *= 0.95 + # === Embeddings layer ========================================================== + + params_0 = [p for n,p in named_parameters if ("embeddings" in n ) + and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + params_1 = [p for n,p in named_parameters if ("embeddings" in n ) + and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + + embed_params = {"params": params_0, "lr": lr, "weight_decay": 0.0} + opt_parameters.append(embed_params) + + embed_params = {"params": params_1, "lr": lr, "weight_decay": 0.01} + opt_parameters.append(embed_params) + + name_0 = [n for n,p in named_parameters if ("embeddings" in n ) + and any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + name_1 = [n for n,p in named_parameters if ("embeddings" in n ) + and not any(nd in n for nd in no_decay) and not any(nd in n for nd in ke_part)] + all_name.extend(name_0) + all_name.extend(name_1) + return opt_parameters, all_name + +class FixedScheduler(torch.optim.lr_scheduler.LambdaLR): + def __init__(self, optimizer, last_epoch=-1): + super(FixedScheduler, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) + + def lr_lambda(self, step): + return 1.0 + + +class WarmupLinearScheduler(torch.optim.lr_scheduler.LambdaLR): + def __init__(self, optimizer, warmup_steps, scheduler_steps, min_ratio, last_epoch=-1): + self.warmup_steps = warmup_steps + self.scheduler_steps = scheduler_steps + self.min_ratio = min_ratio + # self.fixed_lr = fixed_lr + super(WarmupLinearScheduler, self).__init__( + optimizer, self.lr_lambda, last_epoch=last_epoch + ) + + def lr_lambda(self, step): + if step < self.warmup_steps: + return (1 - self.min_ratio) * step / float(max(1, self.warmup_steps)) + self.min_ratio + + # if self.fixed_lr: + # return 1.0 + + return max(0.0, + 1.0 + (self.min_ratio - 1) * (step - self.warmup_steps) / float(max(1.0, self.scheduler_steps - self.warmup_steps)), + ) + + +class Loss_log(): + def __init__(self): + self.loss = [] + self.acc = [0.] + self.flag = 0 + self.token_right_num = [] + self.token_all_num = [] + self.word_right_num = [] + self.word_all_num = [] + # 默认不使用top_k acc + self.use_top_k_acc = 0 + + def acc_init(self, topn=[1]): + self.loss = [] + self.token_right_num = [] + self.token_all_num = [] + self.topn = topn + self.use_top_k_acc = 1 + self.top_k_word_right = {} + for n in topn: + self.top_k_word_right[n] = [] + + def time_init(self): + self.start = time() + self.last = self.start + self.time_used_epoch = [] + + def time_cpt(self, step, total_step): + # 时间统计 + time_used_last_epoch = time() - self.last + self.time_used_epoch.append(time_used_last_epoch) + time_used = time() - self.start + self.last = time() + h, m, s = time_trans(time_used) + time_remain = int(total_step - step) * mean(self.time_used_epoch) + h_r, m_r, s_r = time_trans(time_remain) + + return h, m, s, h_r, m_r, s_r + + def get_token_acc(self): + # 返回list + if len(self.token_all_num) == 0: + return 0. 
+ elif self.use_top_k_acc == 1: + res = [] + for n in self.topn: + res.append(round((sum(self.top_k_word_right[n]) / sum(self.token_all_num)) * 100 , 3)) + return res + else: + return [sum(self.token_right_num)/sum(self.token_all_num)] + + + def update_token(self, token_num, token_right): + # 输入是list文件 + self.token_all_num.append(token_num) + if isinstance(token_right, list): + for i, n in enumerate(self.topn): + self.top_k_word_right[n].append(token_right[i]) + self.token_right_num.append(token_right) + + def update(self, case): + self.loss.append(case) + + def update_acc(self, case): + self.acc.append(case) + + def get_loss(self): + if len(self.loss) == 0: + return 500. + return mean(self.loss) + + def get_acc(self): + return self.acc[-1] + + def get_min_loss(self): + return min(self.loss) + + def early_stop(self): + # min_loss = min(self.loss) + if self.loss[-1] > min(self.loss): + self.flag += 1 + else: + self.flag = 0 + + if self.flag > 1000: + return True + else: + return False + + +def add_special_token(tokenizer, model=None, rank=0, cache_path = None): + # model: bert layer + # 每次更新这个,所有模型需要重新训练,get_chinese_ref.py需要重新运行 + # 主函数调用该函数的位置需要在载入模型之前 + # --------------------------------------- + # 不会被mask的 token, 不参与 任何时候的MASK + special_token = ['[SEP]', '[MASK]', '[ALM]', '[KPI]', '[CLS]', '[LOC]', '[EOS]', '[ENT]', '[ATTR]', '[NUM]', '[REL]', '|', '[DOC]'] + + # --------------------------------------- + # 会被mask的但是---#不加入#---tokenizer的内容 + # 出现次数多(>10000)但是长度较长(>=4符) + # 或者是一些难以理解的名词 + # WWM 的主体 + # TODO: 专家检查 + # To Add: 'SGSN', '3GPP', 'Bearer', 'sbim', 'FusionSphere', 'IMSI', 'GGSN', 'RETCODE', 'PCRF', 'PDP', 'GTP', 'OCS', 'HLR', 'FFFF', 'VLR', 'DNN', 'PID', 'CSCF', 'PDN', 'SCTP', 'SPGW', 'TAU', 'PCEF', 'NSA', 'ACL', 'BGP', 'USCDB', 'VoLTE', 'RNC', 'GPRS', 'DRA', 'MOC' + # 拆分:配置原则,本端规划 + norm_token = ['网元实例', '事件类型', '告警级别', '告警名称', '告警源', '通讯系统', '默认值', '链路故障', '取值范围', '可选必选说明', '数据来源', '用户平面', '配置', '原则', '该参数', '失败次数', '可选参数', 'S1模式', '必选参数', 'IP地址', '响应消息', '成功次数', '测量指标', '用于', '统计周期', '该命令', '上下文', '请求次数', '本端', 'pod', 'amf', 'smf', 'nrf', 'ausf', 'upcf', 'upf', 'udm', 'PDU', 'alias', 'PLMN', 'MML', 'Info_Measure', 'icase', 'Diameter', 'MSISDN', 'RAT', 'RMV', 'PFCP', 'NSSAI', 'CCR', 'HDBNJjs', 'HNGZgd', 'SGSN', '3GPP', 'Bearer', 'sbim', 'FusionSphere', 'IMSI', 'GGSN', 'RETCODE', 'PCRF', 'PDP', 'GTP', 'OCS', 'HLR', 'FFFF', 'VLR', 'DNN', 'PID', 'CSCF', 'PDN', 'SCTP', 'SPGW', 'TAU', 'PCEF', 'NSA', 'ACL', 'BGP', 'USCDB', 'VoLTE', 'RNC', 'GPRS', 'DRA', 'MOC', '告警', '网元', '对端', '信令', '话单', '操作', '风险', '等级', '下发', '流控', '运营商', '寻呼', '漫游', '切片', '报文', '号段', '承载', '批量', '导致', '原因是', '影响', '造成', '引起', '随之', '情况下', '根因', 'trigger'] + # --------------------------------------- + # , '', '', '', '', '', '', '', '', '', '', '' + # 会被mask的但是---#加入#---tokenizer的内容 + # 长度小于等于3,缩写/专有名词 大于10000次 + # 严谨性要求大于norm_token + # 出现次数多时有足够的影响力可以进行分离 + norm_token_tobe_added = ['pod', 'amf', 'smf', 'nrf', 'ausf', 'upcf', 'upf', 'udm', 'ALM', '告警', '网元', '对端', '信令', '话单', 'RAN', 'MML', 'PGW', 'MME', 'SGW', 'NF', 'APN', 'LST', 'GW', 'QoS', 'IPv', 'PDU', 'IMS', 'EPS', 'GTP', 'PDP', 'LTE', 'HSS'] + + token_tobe_added = [] + # all_token = special_token + norm_token_tobe_added + all_token = norm_token_tobe_added + for i in all_token: + if i not in tokenizer.vocab.keys() and i.lower() not in tokenizer.vocab.keys(): + token_tobe_added.append(i) + + # tokenizer.add_tokens(special_token, special_tokens=False) + # tokenizer.add_tokens(norm_token, special_tokens=False) + tokenizer.add_tokens(token_tobe_added, 
special_tokens=False) + special_tokens_dict = {"additional_special_tokens": special_token} + special_token_ = tokenizer.add_special_tokens(special_tokens_dict) + if rank == 0: + print("Added tokens:") + print(tokenizer.get_added_vocab()) + + # pdb.set_trace() + + if model is not None: + # TODO: 用预训练好的TeleBert进行这部分embedding(所有添加的embedding)的初始化 + if rank == 0: + print(f"--------------------------------") + print(f"-------- orig word embedding shape: {model.get_input_embeddings().weight.shape}") + sz = model.resize_token_embeddings(len(tokenizer)) + if cache_path is not None: + # model.cpu() + token_2_emb = torch.load(cache_path) + # 在这里加入embedding 初始化之后需要tie一下 + token_dic = tokenizer.get_added_vocab() + id_2_token = {v:k for k,v in token_dic.items()} + with torch.no_grad(): + for key in id_2_token.keys(): + model.bert.embeddings.word_embeddings.weight[key,:] = nn.Parameter(token_2_emb[id_2_token[key]][0]).cuda() + # model.get_input_embeddings().weight[key,:] = nn.Parameter(token_2_emb[id_2_token[key]][0]).cuda() + # model.embedding + model.bert.tie_weights() + if rank == 0: + print(f"-------- resize_token_embeddings into {sz} done!") + print(f"--------------------------------") + # 这里替换embedding + + norm_token = list(set(norm_token).union(set(norm_token_tobe_added))) + return tokenizer, special_token, norm_token + + +def time_trans(sec): + m, s = divmod(sec, 60) + h, m = divmod(m, 60) + return int(h), int(m), int(s) + +def torch_accuracy(output, target, topk=(1,)): + ''' + param output, target: should be torch Variable + ''' + # assert isinstance(output, torch.cuda.Tensor), 'expecting Torch Tensor' + # assert isinstance(target, torch.Tensor), 'expecting Torch Tensor' + # print(type(output)) + + topn = max(topk) + batch_size = output.size(0) + + _, pred = output.topk(topn, 1, True, True) # 返回(values,indices)其中indices就是预测类别的值,0为第一类 + pred = pred.t() # torch.t()转置,既可得到每一行为batch最好的一个预测序列 + + is_correct = pred.eq(target.view(1, -1).expand_as(pred)) + + ans = [] + ans_num = [] + for i in topk: + # is_correct_i = is_correct[:i].view(-1).float().sum(0, keepdim=True) + is_correct_i = is_correct[:i].contiguous().view(-1).float().sum(0, keepdim=True) + ans_num.append(int(is_correct_i.item())) + ans.append(is_correct_i.mul_(100.0 / batch_size)) + + return ans, ans_num + + \ No newline at end of file diff --git a/KTeleBERT/test.sh b/KTeleBERT/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..c05a101359782a5074b996d3db47f8a9091accf8 --- /dev/null +++ b/KTeleBERT/test.sh @@ -0,0 +1,12 @@ +python main.py --only_test 1 \ + --batch_size 150 \ + --use_NumEmb 1 \ + --mask_test 0 \ + --mask_stratege wwm \ + --model_name model_name_vXX \ + --ke_test 0 \ + --embed_gen 1 \ + --train_ratio 0 \ + --ke_dim 256 \ + --plm_emb_type cls \ + diff --git a/KTeleBERT/torchlight/__init__.py b/KTeleBERT/torchlight/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8570f8f816a1ff556b5c52401d6527eb0a3e39ed --- /dev/null +++ b/KTeleBERT/torchlight/__init__.py @@ -0,0 +1,20 @@ +from .logger import initialize_exp, get_dump_path +from .metric import Metric, Top_K_Metric +from .module import LSTM4VarLenSeq +from .vocab import (PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN, + DefaultLookupDict, + Vocabulary) +from .utils import (invert_dict, + personal_display_settings, + set_seed, + normalize, + snapshot, + show_params, + longest_substring, + pad, + to_cuda, + get_code_version, + cat_ragged_tensors, + topk_accuracy, + get_total_trainable_params) + diff --git 
a/KTeleBERT/torchlight/__pycache__/__init__.cpython-38.pyc b/KTeleBERT/torchlight/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c406e0323ffe22855fe7c3fb8caf9f63eccd6228 Binary files /dev/null and b/KTeleBERT/torchlight/__pycache__/__init__.cpython-38.pyc differ diff --git a/KTeleBERT/torchlight/__pycache__/logger.cpython-38.pyc b/KTeleBERT/torchlight/__pycache__/logger.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be2fbae4af89bfdc256820c009cf5d93a86cc994 Binary files /dev/null and b/KTeleBERT/torchlight/__pycache__/logger.cpython-38.pyc differ diff --git a/KTeleBERT/torchlight/__pycache__/metric.cpython-38.pyc b/KTeleBERT/torchlight/__pycache__/metric.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..44c3660fe6af93a78993bc165fd1b4a5e890a090 Binary files /dev/null and b/KTeleBERT/torchlight/__pycache__/metric.cpython-38.pyc differ diff --git a/KTeleBERT/torchlight/__pycache__/module.cpython-38.pyc b/KTeleBERT/torchlight/__pycache__/module.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae88a49ca3ec06ad543ccf6fee1a9dffbfddb06a Binary files /dev/null and b/KTeleBERT/torchlight/__pycache__/module.cpython-38.pyc differ diff --git a/KTeleBERT/torchlight/__pycache__/utils.cpython-38.pyc b/KTeleBERT/torchlight/__pycache__/utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c02137cd62e2cbab15b8248fdf9a7842fc7beeb Binary files /dev/null and b/KTeleBERT/torchlight/__pycache__/utils.cpython-38.pyc differ diff --git a/KTeleBERT/torchlight/__pycache__/vocab.cpython-38.pyc b/KTeleBERT/torchlight/__pycache__/vocab.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f445638c7a8e34bef9e2731ff7e58ae0ff4c884 Binary files /dev/null and b/KTeleBERT/torchlight/__pycache__/vocab.cpython-38.pyc differ diff --git a/KTeleBERT/torchlight/logger.py b/KTeleBERT/torchlight/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..30d6a96d65ca455bdeb8c66f95a6859af427514a --- /dev/null +++ b/KTeleBERT/torchlight/logger.py @@ -0,0 +1,147 @@ +import os +import re +import sys +import time +import json +import torch +import pickle +import random +import getpass +import logging +import argparse +import subprocess +import numpy as np +from datetime import timedelta, date +from .utils import get_code_version + + +class LogFormatter(): + + def __init__(self): + self.start_time = time.time() + + def format(self, record): + elapsed_seconds = round(record.created - self.start_time) + + prefix = "%s - %s - %s" % ( + record.levelname, + time.strftime('%x %X'), + timedelta(seconds=elapsed_seconds) + ) + message = record.getMessage() + message = message.replace('\n', '\n' + ' ' * (len(prefix) + 3)) + return "%s - %s" % (prefix, message) if message else '' + + +def create_logger(filepath, rank): + """ + Create a logger. + Use a different log file for each process. 
+ """ + # create log formatter + log_formatter = LogFormatter() + + # create file handler and set level to debug + if filepath is not None: + if rank > 0: + filepath = '%s-%i' % (filepath, rank) + file_handler = logging.FileHandler(filepath, "a", encoding='utf-8') + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(log_formatter) + + # create console handler and set level to info + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + console_handler.setFormatter(log_formatter) + + # create logger and set level to debug + logger = logging.getLogger() + logger.handlers = [] + logger.setLevel(logging.DEBUG) + logger.propagate = False + if filepath is not None: + logger.addHandler(file_handler) + logger.addHandler(console_handler) + + # reset logger elapsed time + def reset_time(): + log_formatter.start_time = time.time() + logger.reset_time = reset_time + + return logger + + +def initialize_exp(params): + """ + Initialize the experiment: + - dump parameters + - create a logger + """ + # dump parameters + exp_folder = get_dump_path(params) + json.dump(vars(params), open(os.path.join(exp_folder, 'params.pkl'), 'w'), indent=4) + + # get running command + command = ["python", sys.argv[0]] + for x in sys.argv[1:]: + if x.startswith('--'): + assert '"' not in x and "'" not in x + command.append(x) + else: + assert "'" not in x + if re.match('^[a-zA-Z0-9_]+$', x): + command.append("%s" % x) + else: + command.append("'%s'" % x) + command = ' '.join(command) + params.command = command + ' --exp_id "%s"' % params.exp_id + + # check experiment name + assert len(params.exp_name.strip()) > 0 + + # create a logger + logger = create_logger(os.path.join(exp_folder, 'train.log'), rank=getattr(params, 'global_rank', 0)) + logger.info("============ Initialized logger ============") + # logger.info("\n".join("%s: %s" % (k, str(v)) + # for k, v in sorted(dict(vars(params)).items()))) + # text = f'# Git Version: {get_code_version()} #' + # logger.info("\n".join(['=' * 24, text, '=' * 24])) + logger.info("The experiment will be stored in %s\n" % exp_folder) + logger.info("Running command: %s" % command) + logger.info("") + return logger + + +def get_dump_path(params): + """ + Create a directory to store the experiment. + """ + assert len(params.exp_name) > 0 + assert not params.dump_path in ('', None), \ + 'Please choose your favorite destination for dump.' + dump_path = params.dump_path + + # create the sweep path if it does not exist + when = date.today().strftime('%m%d-') + sweep_path = os.path.join(dump_path, when + params.exp_name) + if not os.path.exists(sweep_path): + subprocess.Popen("mkdir -p %s" % sweep_path, shell=True).wait() + + # create an random ID for the job if it is not given in the parameters. 
+ if params.exp_id == '': + chars = 'abcdefghijklmnopqrstuvwxyz0123456789' + while True: + exp_id = ''.join(random.choice(chars) for _ in range(10)) + if not os.path.isdir(os.path.join(sweep_path, exp_id)): + break + params.exp_id = exp_id + + # create the dump folder / update parameters + exp_folder = os.path.join(sweep_path, params.exp_id) + if not os.path.isdir(exp_folder): + subprocess.Popen("mkdir -p %s" % exp_folder, shell=True).wait() + return exp_folder + + +if __name__ == '__main__': + pass diff --git a/KTeleBERT/torchlight/metric.py b/KTeleBERT/torchlight/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..68cbfd798502c6f30e44c9ce10ddc2c8e0942b8a --- /dev/null +++ b/KTeleBERT/torchlight/metric.py @@ -0,0 +1,121 @@ +# from abc import ABC, ABCMeta, abstractclassmethod +import torch +import numpy as np +from abc import ABC, abstractmethod, ABCMeta + +class Metric(metaclass=ABCMeta): + """ + - reset() in the begining of every epoch. + - update_per_batch() after every batch. + - update_per_epoch() after every epoch. + """ + + @abstractmethod + def __init__(self): + pass + + @abstractmethod + def reset(self): + pass + + @abstractmethod + def update_per_batch(self, output): + pass + + @abstractmethod + def update_per_epoch(self): + pass + +class Top_K_Metric(Metric): + """ + Stores accuracy (score), loss and timing info + """ + def __init__(self, topnum=[1,3,10]): + super().__init__() + # assert len(topnum) == 3 + self.topnum = topnum + self.k_num = len(self.topnum) + self.reset() + + def reset(self): + self.total_loss = 0 + self.correct_list = [0] * self.k_num + self.acc_list = [0] * self.k_num + self.acc_all = 0 + self.num_examples = 0 + self.num_epoch = 0 + + self.mrr = 0 + self.mr = 0 + self.mrr_all = 0 + self.mr_all = 0 + + def update_per_batch(self, loss, ans, pred): + self.total_loss += loss + self.num_epoch += 1 + self.top_k_list = self.batch_accuracy(pred, ans) + self.num_examples += self.top_k_list[0].shape[0] + for i in range(self.k_num): + self.correct_list[i] += self.top_k_list[i].sum().item() + + # mrr + mrr_tmp, mr_tmp = self.batch_mr_mrr(pred, ans) + self.mrr_all += mrr_tmp.sum().item() + self.mr_all += mr_tmp.sum().item() + + + + def update_per_epoch(self): + for i in range(self.k_num): + self.acc_list[i] = 100 * (self.correct_list[i] / self.num_examples) + + self.mr = self.mr_all / self.num_examples + self.mrr = self.mrr_all / self.num_examples + self.total_loss = self.total_loss / self.num_epoch + self.acc_all = sum(self.acc_list) + + + def batch_accuracy(self, predicted, true): + """ Compute the accuracies for a batch of predictions and answers """ + if len(true.shape) == 3: + true = true[0] + _, ok = predicted.topk(max(self.topnum), dim=1) + agreeing_all = torch.zeros([predicted.shape[0], 1], dtype=torch.float).cuda() + top_k_list = [0]*self.topnum + for i in range(max(self.topnum)): + tmp = ok[:, i].reshape(-1, 1) + agreeing_all += true.gather(dim=1, index=tmp) + for k in range(self.k_num): + if i == self.topnum[k] - 1: + top_k_list[k] = (agreeing_all * 0.3).clamp(max=1) + break + + return top_k_list + + + + def batch_mr_mrr(self, predicted, true): + if len(true.shape) == 3: + true = true[0] + + # 计算 + top_rank = predicted.shape[1] + batch_size = predicted.shape[0] + _, predict_ans_rank = predicted.topk(top_rank, dim=1) # 答案排名的坐标 batchsize * 500 + _, real_ans = true.topk(1, dim=1) # 真正的答案:batchsize * 1 + + # 扩充维度 + real_ans = real_ans.expand(batch_size, top_rank) + ans_different = torch.abs(predict_ans_rank - real_ans) + # 
此时为0的位置就是预测正确的位置 + _, real_ans_list = ans_different.topk(top_rank, dim=1) #此时最后一位的数值就是正确答案在预测答案里面的位置,为 0 + real_ans_list = real_ans_list + 1.0 + mr = real_ans_list[:,-1].reshape(-1,1).to(torch.float64) + mrr = 1.0 / mr + # pdb.set_trace() + + return mrr,mr + + +if __name__ == '__main__': + pass \ No newline at end of file diff --git a/KTeleBERT/torchlight/module.py b/KTeleBERT/torchlight/module.py new file mode 100644 index 0000000000000000000000000000000000000000..2e055a9e5bb66951d2f6fa0ecfac5a6705a49ae3 --- /dev/null +++ b/KTeleBERT/torchlight/module.py @@ -0,0 +1,133 @@ +import math +from typing import Sequence, Union, Callable +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence + +torch.manual_seed(10086) +# typing, everything in Python is Object. +tensor_activation = Callable[[torch.Tensor], torch.Tensor] + + +class LSTM4VarLenSeq(nn.Module): + def __init__(self, input_size, hidden_size, + num_layers=1, bias=True, bidirectional=False, init='orthogonal', take_last=True): + """ + no dropout support + batch_first support deprecated, the input and output tensors are + provided as (batch, seq_len, feature). + + Args: + input_size: + hidden_size: + num_layers: + bias: + bidirectional: + init: ways to init the torch.nn.LSTM parameters, + supports 'orthogonal' and 'uniform' + take_last: 'True' if you only want the final hidden state + otherwise 'False' + """ + super(LSTM4VarLenSeq, self).__init__() + self.lstm = nn.LSTM(input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + bias=bias, + bidirectional=bidirectional) + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.bidirectional = bidirectional + self.init = init + self.take_last = take_last + self.batch_first = True # Please don't modify this + + self.init_parameters() + + def init_parameters(self): + """orthogonal init yields generally good results than uniform init""" + if self.init == 'orthogonal': + gain = 1 # use default value + for nth in range(self.num_layers * self.bidirectional): + # w_ih, (4 * hidden_size x input_size) + nn.init.orthogonal_(self.lstm.all_weights[nth][0], gain=gain) + # w_hh, (4 * hidden_size x hidden_size) + nn.init.orthogonal_(self.lstm.all_weights[nth][1], gain=gain) + # b_ih, (4 * hidden_size) + nn.init.zeros_(self.lstm.all_weights[nth][2]) + # b_hh, (4 * hidden_size) + nn.init.zeros_(self.lstm.all_weights[nth][3]) + elif self.init == 'uniform': + k = math.sqrt(1 / self.hidden_size) + for nth in range(self.num_layers * self.bidirectional): + nn.init.uniform_(self.lstm.all_weights[nth][0], -k, k) + nn.init.uniform_(self.lstm.all_weights[nth][1], -k, k) + nn.init.zeros_(self.lstm.all_weights[nth][2]) + nn.init.zeros_(self.lstm.all_weights[nth][3]) + else: + raise NotImplemented('Unsupported Initialization') + + def forward(self, x, x_len, hx=None): + # 1. Sort x and its corresponding length + sorted_x_len, sorted_x_idx = torch.sort(x_len, descending=True) + sorted_x = x[sorted_x_idx] + # 2. Ready to unsort after LSTM forward pass + # Note that PyTorch 0.4 has no argsort, but PyTorch 1.0 does. + _, unsort_x_idx = torch.sort(sorted_x_idx, descending=False) + + # 3. Pack the sorted version of x and x_len, as required by the API. + x_emb = pack_padded_sequence(sorted_x, sorted_x_len, + batch_first=self.batch_first) + + # 4. Forward lstm + # output_packed.data.shape is (valid_seq, num_directions * hidden_dim). 
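+ # Note: newer PyTorch versions expect the lengths passed to pack_padded_sequence (step 3 above) to live on the CPU.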
+ # See doc of torch.nn.LSTM for details. + out_packed, (hn, cn) = self.lstm(x_emb) + + # 5. unsort h + # (num_layers * num_directions, batch, hidden_size) -> (batch, ...) + hn = hn.permute(1, 0, 2)[unsort_x_idx] # swap the first two dim + hn = hn.permute(1, 0, 2) # swap the first two again to recover + if self.take_last: + return hn.squeeze(0) + else: + # unpack: out + # (batch, max_seq_len, num_directions * hidden_size) + out, _ = pad_packed_sequence(out_packed, + batch_first=self.batch_first) + out = out[unsort_x_idx] + # unpack: c + # (num_layers * num_directions, batch, hidden_size) -> (batch, ...) + cn = cn.permute(1, 0, 2)[unsort_x_idx] # swap the first two dim + cn = cn.permute(1, 0, 2) # swap the first two again to recover + return out, (hn, cn) + + +if __name__ == '__main__': + # Note that in the future we will import unittest + # and port the following examples to test folder. + + # Unit test for LSTM variable length sequences + # ================ + net = LSTM4VarLenSeq(200, 100, + num_layers=3, bias=True, bidirectional=True, init='orthogonal', take_last=False) + + inputs = torch.tensor([[1, 2, 3, 0], + [2, 3, 0, 0], + [2, 4, 3, 0], + [1, 4, 3, 0], + [1, 2, 3, 4]]) + embedding = nn.Embedding(num_embeddings=5, embedding_dim=200, padding_idx=0) + lens = torch.LongTensor([3, 2, 3, 3, 4]) + + input_embed = embedding(inputs) + output, (h, c) = net(input_embed, lens) + # 5, 4, 200, batch, seq length, hidden_size * 2 (only last layer) + print(output.shape) + # 6, 5, 100, num_layers * num_directions, batch, hidden_size + print(h.shape) + # 6, 5, 100, num_layers * num_directions, batch, hidden_size + print(c.shape) diff --git a/KTeleBERT/torchlight/utils.py b/KTeleBERT/torchlight/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..df9342829e94f6a9d6ab12526e1af203b97ddd2e --- /dev/null +++ b/KTeleBERT/torchlight/utils.py @@ -0,0 +1,195 @@ +""" +Utilizations for common usages. +""" +import os +import random +import torch +import numpy as np +from difflib import SequenceMatcher +from unidecode import unidecode +from datetime import datetime +from torch.nn.parallel import DataParallel, DistributedDataParallel + + +def invert_dict(d): + return {v: k for k, v in d.items()} + +def personal_display_settings(): + """ + Pandas Doc + https://pandas.pydata.org/pandas-docs/stable/generated/pandas.set_option.html + NumPy Doc + - + """ + from pandas import set_option + set_option('display.max_rows', 500) + set_option('display.max_columns', 500) + set_option('display.width', 2000) + set_option('display.max_colwidth', 1000) + from numpy import set_printoptions + set_printoptions(suppress=True) + + +def set_seed(seed): + """ + Freeze every seed for reproducibility. + torch.cuda.manual_seed_all is useful when using random generation on GPUs. + e.g. torch.cuda.FloatTensor(100).uniform_() + """ + os.environ['PYTHONHASHSEED'] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + + +def normalize(s): + """ + German and Frence have different vowels than English. + This utilization removes all the non-unicode characters. + Example: + āáǎà --> aaaa + ōóǒò --> oooo + ēéěè --> eeee + īíǐì --> iiii + ūúǔù --> uuuu + ǖǘǚǜ --> uuuu + + :param s: unicode string + :return: unicode string with regular English characters. + """ + s = s.strip().lower() + s = unidecode(s) + return s + + +def snapshot(model, epoch, save_path): + """ + Saving models w/ its params. + Get rid of the ONNX Protocal. 
+ F-string feature new in Python 3.6+ is used. + """ + os.makedirs(save_path, exist_ok=True) + # timestamp = datetime.now().strftime('%m%d_%H%M') + save_path = os.path.join(save_path, f'{type(model).__name__}_{epoch}_epoch.pkl') + if isinstance(model, (DataParallel, DistributedDataParallel)): + torch.save(model.module.state_dict(), save_path) + else: + torch.save(model.state_dict(), save_path) + return save_path + + +def save_checkpoint(model, optimizer, epoch, path): + torch.save({ + 'epoch': epoch, + 'models': model.state_dict(), + 'optimizer': optimizer.state_dict(), + }, path) + + +def load_checkpoint(path, map_location): + checkpoint = torch.load(path, map_location=map_location) + return checkpoint + + +def show_params(model): + """ + Show models parameters for logging. + """ + for name, param in model.named_parameters(): + print('%-16s' % name, param.size()) + + +def longest_substring(str1, str2): + # initialize SequenceMatcher object with input string + seqMatch = SequenceMatcher(None, str1, str2) + + # find match of longest sub-string + # output will be like Match(a=0, b=0, size=5) + match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) + + # print longest substring + return str1[match.a: match.a + match.size] if match.size != 0 else "" + + +def pad(sent, max_len): + """ + syntax "[0] * int" only works properly for Python 3.5+ + Note that in testing time, the length of a sentence + might exceed the pre-defined max_len (of training data). + """ + length = len(sent) + return (sent + [0] * (max_len - length))[:max_len] if length < max_len else sent[:max_len] + + +def to_cuda(*args, device=None): + """ + Move Tensors to CUDA. + If no device provided, default to the first card in CUDA_VISIBLE_DEVICES. + """ + assert all(torch.is_tensor(t) for t in args), \ + 'Only support for tensors, please check if any nn.Module exists.' 
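+ # None entries are passed through unchanged; when no device is given, fall back to cuda:0 below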
+ if device is None: + device = torch.device('cuda:0') + return [None if x is None else x.to(device) for x in args] + + +def get_code_version(short_sha=True): + from subprocess import check_output, STDOUT, CalledProcessError + try: + sha = check_output('git rev-parse HEAD', stderr=STDOUT, + shell=True, encoding='utf-8') + if short_sha: + sha = sha[:7] + return sha + except CalledProcessError: + # There was an error - command exited with non-zero code + pwd = check_output('pwd', stderr=STDOUT, shell=True, encoding='utf-8') + pwd = os.path.abspath(pwd).strip() + print(f'Working dir {pwd} is not a git repo.') + + +def cat_ragged_tensors(left, right): + assert left.size(0) == right.size(0) + batch_size = left.size(0) + max_len = left.size(1) + right.size(1) + + len_left = (left != 0).sum(dim=1) + len_right = (right != 0).sum(dim=1) + + left_seq = left.unbind() + right_seq = right.unbind() + # handle zero padding + output = torch.zeros((batch_size, max_len), dtype=torch.long, device=left.device) + for i, row_left, row_right, l1, l2 in zip(range(batch_size), + left_seq, right_seq, + len_left, len_right): + l1 = l1.item() + l2 = l2.item() + j = l1 + l2 + # concatenate rows of ragged tensors + row_cat = torch.cat((row_left[:l1], row_right[:l2])) + # copy to empty tensor + output[i, :j] = row_cat + return output + + +def topk_accuracy(inputs, labels, k=1, largest=True): + assert len(inputs.size()) == 2 + assert len(labels.size()) == 2 + _, indices = inputs.topk(k=k, largest=largest) + result = indices - labels # boardcast + nonzero_count = (result != 0).sum(dim=1, keepdim=True) + num_correct = (nonzero_count != result.size(1)).sum().item() + num_example = inputs.size(0) + return num_correct, num_example + + +def get_total_trainable_params(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +if __name__ == '__main__': + print(normalize('ǖǘǚǜ')) diff --git a/KTeleBERT/torchlight/vocab.py b/KTeleBERT/torchlight/vocab.py new file mode 100644 index 0000000000000000000000000000000000000000..c776c4b489c16866359457fb006d215ae873ac05 --- /dev/null +++ b/KTeleBERT/torchlight/vocab.py @@ -0,0 +1,137 @@ +# coding: utf-8 +""" +Every NLP task needs a Vocabulary +Every Vocabulary is built from Instances +Every Instance is a collection of Fields +""" + +__all__ = ['DefaultLookupDict', 'Vocabulary'] + +PAD_TOKEN = '' +UNK_TOKEN = '' +BOS_TOKEN = '' +EOS_TOKEN = '' +PAD_IDX = 0 +UNK_IDX = 1 + + +class DefaultLookupDict(dict): + def __init__(self, default): + super(DefaultLookupDict, self).__init__() + self._default = default + + def __getitem__(self, item): + return self.get(item, self._default) + + +class Vocabulary: + """ + Define a vocabulary object that will be used to numericalize a field. + Attributes: + token2id: A collections.defaultdict instance mapping token strings to + numerical identifiers. + id2token: A list of token strings indexed by their numerical + identifiers. + embedding: pretrained vectors. + + Examples: + >>> from torchlight.vocab import Vocabulary + >>> from collections import Counter + >>> text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world'] + >>> vocab = Vocabulary(Counter(text_data)) + """ + def __init__(self, counter, max_size=None, min_freq=1, specials=None): + """ + Create a Vocabulary given Counter. + Args: + counter: collections.Counter object holding the frequencies of + each value found in the data. + max_size: The maximum size of the vocabulary, or None for no + maximum. Default: None. 
+ min_freq: The minimum frequency needed to include a token in the + vocabulary. Values less than 1 will be set to 1. Default: 1. + specials: The list of special tokens except ['', '']. + Possible choices: [CLS] [MASK] [SEP] in BERT or + in Machine Translation. + """ + min_freq = max(min_freq, 1) # must be positive + + if specials is None: + self.specials = [PAD_TOKEN, UNK_TOKEN] + else: + assert isinstance(specials, list), "'specials' is of type list" + self.specials = [PAD_TOKEN, UNK_TOKEN] + specials + + assert len(set(self.specials)) == len(self.specials), \ + "specials can not contain duplicates." + + if max_size is not None: + max_size = len(self.specials) + max_size + + self.id2token = self.specials[:] + self.token2id = DefaultLookupDict(UNK_IDX) + self.token2id.update({tok: i for i, tok in enumerate(self.id2token)}) + + # sort by frequency, then alphabetically + token_freqs = sorted(counter.items(), key=lambda tup: tup[0]) + token_freqs.sort(key=lambda tup: tup[1], reverse=True) + + for token, freq in token_freqs: + if freq < min_freq or len(self.id2token) == max_size: + break + if token not in self.specials: + self.id2token.append(token) + self.token2id[token] = len(self.id2token) - 1 + + # TODO + self.embedding = None + + def __len__(self): + return len(self.id2token) + + def __repr__(self): + return 'Vocab(size={}, specials="{}")'.format(len(self), self.specials) + + def __getitem__(self, tokens): + """Looks up indices of text tokens according to the vocabulary. + If `unknown_token` of the vocabulary is None, looking up unknown tokens + results in KeyError. + Parameters + ---------- + tokens : str or list of strs + A source token or tokens to be converted. + Returns + ------- + int or list of ints + A token index or a list of token indices according to the vocabulary. + """ + + if not isinstance(tokens, (list, tuple)): + return self.token2id[tokens] + else: + return [self.token2id[token] for token in tokens] + + def __call__(self, tokens): + """Looks up indices of text tokens according to the vocabulary. + Parameters + ---------- + tokens : str or list of strs + A source token or tokens to be converted. + Returns + ------- + int or list of ints + A token index or a list of token indices according to the + vocabulary. 
+ """ + + return self[tokens] + + @classmethod + def from_json(cls, json_str): + pass + + def to_json(self): + pass + + def set_embedding(self): + pass diff --git a/config.py b/config.py new file mode 100644 index 0000000000000000000000000000000000000000..643d7a1270213ddd48d922d30c9a450770b290b1 --- /dev/null +++ b/config.py @@ -0,0 +1,234 @@ +import os.path as osp +import numpy as np +import random +import torch +from easydict import EasyDict as edict +import argparse + + +LAYER_MAPPING = { + 0: 'od_layer_0', + 1: 'od_layer_1', + 2: 'od_layer_2', +} + + +class cfg(): + def __init__(self): + self.this_dir = osp.dirname(__file__) + # change + self.data_root = osp.abspath(osp.join(self.this_dir, '..', '..', 'data', '')) + + # TODO: add some static variable (The frequency of change is low) + + def get_args(self): + parser = argparse.ArgumentParser() + # ------------ base ------------ + parser.add_argument('--train_strategy', default=1, type=int) + parser.add_argument('--batch_size', default=64, type=int) + parser.add_argument('--batch_size_ke', default=14, type=int) + parser.add_argument('--batch_size_od', default=8, type=int) + parser.add_argument('--batch_size_ad', default=32, type=int) + + parser.add_argument('--epoch', default=15, type=int) + parser.add_argument("--save_model", default=1, type=int, choices=[0, 1]) + # 用transformer的 save_pretrain 方式保存 + parser.add_argument("--save_pretrain", default=0, type=int, choices=[0, 1]) + parser.add_argument("--from_pretrain", default=0, type=int, choices=[0, 1]) + + # torthlight + parser.add_argument("--no_tensorboard", default=False, action="store_true") + parser.add_argument("--exp_name", default="huawei_exp", type=str, help="Experiment name") + parser.add_argument("--dump_path", default="dump/", type=str, help="Experiment dump path") + parser.add_argument("--exp_id", default="ke256_raekt_ernie2_bs20_p3_c3_5e-6", type=str, help="Experiment ID") + # or 3407 + parser.add_argument("--random_seed", default=42, type=int) + # 数据参数 + parser.add_argument("--data_path", default="huawei", type=str, help="Experiment path") + parser.add_argument('--train_ratio', default=1, type=float, help='ratio for train/test') + parser.add_argument("--seq_data_name", default='Seq_data_base', type=str, help="seq_data 名字") + parser.add_argument("--kg_data_name", default='KG_data_base_rule', type=str, help="kg_data 名字") + parser.add_argument("--order_data_name", default='event_order_data', type=str, help="order_data 名字") + # TODO: add some dynamic variable + parser.add_argument("--model_name", default="MacBert", type=str, help="model name") + + # ------------ 训练阶段 ------------ + parser.add_argument("--scheduler", default="cos", type=str, choices=["linear", "cos"]) + parser.add_argument("--optim", default="adamw", type=str) + parser.add_argument("--adam_epsilon", default=1e-8, type=float) + parser.add_argument('--workers', type=int, default=8) + parser.add_argument('--accumulation_steps', type=int, default=6) + parser.add_argument('--accumulation_steps_ke', type=int, default=6) + parser.add_argument('--accumulation_steps_ad', type=int, default=6) + parser.add_argument('--accumulation_steps_od', type=int, default=6) + parser.add_argument("--train_together", default=0, type=int) + + # 3e-5 + parser.add_argument('--lr', type=float, default=1e-5) + # 逐层学习率衰减 + parser.add_argument("--LLRD", default=0, type=int, choices=[0, 1]) + parser.add_argument('--weight_decay', type=float, default=0.01) + parser.add_argument('--clip', type=float, default=1., help='gradient clipping') + 
parser.add_argument('--scheduler_steps', type=int, default=None, + help='total number of step for the scheduler, if None then scheduler_total_step = total_step') + parser.add_argument('--eval_step', default=100, type=int, help='evaluate each n step') + + # ------------ PLM ------------ + parser.add_argument('--maxlength', type=int, default=200) + parser.add_argument('--mlm_probability', type=float, default=0.15) + parser.add_argument('--final_mlm_probability', type=float, default=0.4) + parser.add_argument('--mlm_probability_increase', type=str, default="curve", choices=["linear", "curve"]) + parser.add_argument("--mask_stratege", default="rand", type=str, choices=["rand", "wwm", "domain"]) + # 前n个epoch 用rand,后面用wwm. multi-stage knowledge masking strategy + parser.add_argument("--ernie_stratege", default=-1, type=int) + # 用mlm任务进行训练,默认使用chinese_ref且添加新的special word + parser.add_argument("--use_mlm_task", default=1, type=int, choices=[0, 1]) + # 添加新的special word + parser.add_argument("--add_special_word", default=1, type=int, choices=[0, 1]) + # freeze + parser.add_argument("--freeze_layer", default=0, type=int, choices=[0, 1, 2, 3, 4]) + # 是否mask 特殊token + parser.add_argument("--special_token_mask", default=0, type=int, choices=[0, 1]) + parser.add_argument("--emb_init", default=1, type=int, choices=[0, 1]) + parser.add_argument("--cls_head_init", default=1, type=int, choices=[0, 1]) + # 是否使用自适应权重 + parser.add_argument("--use_awl", default=1, type=int, choices=[0, 1]) + parser.add_argument("--mask_loss_scale", default=1.0, type=float) + + # ------------ KGE ------------ + parser.add_argument('--ke_norm', type=int, default=1) + parser.add_argument('--ke_dim', type=int, default=768) + parser.add_argument('--ke_margin', type=float, default=1.0) + parser.add_argument('--neg_num', type=int, default=10) + parser.add_argument('--adv_temp', type=float, default=1.0, help='The temperature of sampling in self-adversarial negative sampling.') + # 5e-4 + parser.add_argument('--ke_lr', type=float, default=3e-5) + parser.add_argument('--only_ke_loss', type=int, default=0) + + # ------------ 数值embedding相关 ------------ + parser.add_argument('--use_NumEmb', type=int, default=1) + parser.add_argument("--contrastive_loss", default=1, type=int, choices=[0, 1]) + parser.add_argument("--l_layers", default=2, type=int) + parser.add_argument('--use_kpi_loss', type=int, default=1) + + # ------------ 测试阶段 ------------ + parser.add_argument("--only_test", default=0, type=int, choices=[0, 1]) + parser.add_argument("--mask_test", default=0, type=int, choices=[0, 1]) + parser.add_argument("--embed_gen", default=0, type=int, choices=[0, 1]) + parser.add_argument("--ke_test", default=0, type=int, choices=[0, 1]) + # -1: 测全集 + parser.add_argument("--ke_test_num", default=-1, type=int) + parser.add_argument("--path_gen", default="", type=str) + + # ------------ 时序阶段 ------------ + # 1:预训练 + # 2:时序 finetune + # 3. 
anomaly detection finetune + ordering, and the process is iterative + # whether to load the od model + parser.add_argument("--order_load", default=0, type=int) + parser.add_argument("--order_num", default=2, type=int) + parser.add_argument("--od_type", default='linear_cat', type=str, choices=['linear_cat', 'vertical_attention']) + parser.add_argument("--eps", default=0.2, type=float, help='label smoothing') + parser.add_argument("--num_od_layer", default=0, type=int) + parser.add_argument("--plm_emb_type", default='cls', type=str, choices=['cls', 'last_avg']) + parser.add_argument("--order_test_name", default='', type=str) + parser.add_argument("--order_threshold", default=0.5, type=float) + # ------------ distributed training ------------ + # whether to run distributed + parser.add_argument('--rank', type=int, default=0, help='process rank for distributed training') + parser.add_argument('--dist', type=int, default=0, help='whether to use distributed training') + # do not change this parameter; it is assigned automatically + parser.add_argument('--device', default='cuda', help='device id (i.e. 0 or 0,1 or cpu)') + # number of processes (not threads) to start; no need to set it, it is derived from nproc_per_node automatically + parser.add_argument('--world-size', default=4, type=int, + help='number of distributed processes') + parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training') + parser.add_argument("--local_rank", default=-1, type=int) + self.cfg = parser.parse_args() + + def update_train_configs(self): + # add some constraints for the parameters + # e.g. cannot save and test at the same time + # fix up the default parameters + # TODO: the test logic is flawed and needs to be revised + if len(self.cfg.order_test_name) > 0: + self.cfg.save_model = 0 + if len(self.cfg.order_test_name) == 0: + self.cfg.train_ratio = min(0.8, self.cfg.train_ratio) + # otherwise the file name to load is derived adaptively + else: + print("od test ... ") + self.cfg.train_strategy = 5 + self.cfg.plm_emb_type = 'last_avg' if 'last_avg' in self.cfg.model_name else 'cls' + for key in LAYER_MAPPING.keys(): + if LAYER_MAPPING[key] in self.cfg.model_name: + self.cfg.num_od_layer = key + self.cfg.order_test_name = osp.join('downstream_task', f'{self.cfg.order_test_name}') + + if self.cfg.mask_test or self.cfg.embed_gen or self.cfg.ke_test or len(self.cfg.order_test_name) > 0: + assert len(self.cfg.model_name) > 0 + self.cfg.only_test = 1 + if self.cfg.only_test == 1: + self.cfg.save_model = 0 + self.cfg.save_pretrain = 0 + + # TODO: update some dynamic variable + self.cfg.data_root = self.data_root + self.cfg.data_path = osp.join(self.data_root, self.cfg.data_path) + self.cfg.plm_path = osp.join(self.data_root, 'transformer') + self.cfg.dump_path = osp.join(self.cfg.data_path, self.cfg.dump_path) + # keep the batch size around 32 where possible + + # number of adaptive loss weights + self.cfg.awl_num = 1 + # ------------ numeric embedding related ------------ + self.cfg.hidden_size = 768 + self.cfg.num_attention_heads = 8 + self.cfg.hidden_dropout_prob = 0.1 + self.cfg.num_kpi = 304 + self.cfg.specail_emb_path = None + if self.cfg.emb_init: + self.cfg.specail_emb_path = osp.join(self.cfg.data_path, 'added_vocab_embedding.pt') + + # ------------- multi-task learning related ------------- + # four stages + self.cfg.mask_epoch, self.cfg.ke_epoch, self.cfg.ad_epoch, self.cfg.od_epoch = None, None, None, None + # trigger multi-task learning + if self.cfg.train_strategy > 1: + self.cfg.mask_epoch = [0, 1, 1, 1, 0] + self.cfg.ke_epoch = [4, 3, 2, 2, 0] + if self.cfg.only_ke_loss: + self.cfg.mask_epoch = [0, 0, 0, 0, 0] + self.cfg.epoch = sum(self.cfg.mask_epoch) + sum(self.cfg.ke_epoch) + if self.cfg.train_strategy > 2: + self.cfg.ad_epoch = [0, 6, 3, 1, 0] + self.cfg.epoch += sum(self.cfg.ad_epoch) + if self.cfg.train_strategy > 3 and not self.cfg.only_ke_loss: + self.cfg.od_epoch = [0, 0, 9, 1, 0] + # self.cfg.mask_epoch[3] = 1 + self.cfg.epoch += sum(self.cfg.od_epoch) + self.cfg.epoch_matrix = [] + for epochs in [self.cfg.mask_epoch, self.cfg.ke_epoch, self.cfg.ad_epoch, self.cfg.od_epoch]: + if epochs is not None: + self.cfg.epoch_matrix.append(epochs) + if self.cfg.train_together: + # the losses are simply summed, so the number of training epochs equals the mask epochs + self.cfg.epoch = sum(self.cfg.mask_epoch) + self.cfg.batch_size = int((self.cfg.batch_size - 16) / self.cfg.train_strategy) + self.cfg.batch_size_ke = int(self.cfg.batch_size_ke / self.cfg.train_strategy) - 2 + self.cfg.batch_size_ad = int(self.cfg.batch_size_ad / self.cfg.train_strategy) - 1 + self.cfg.batch_size_od = int(self.cfg.batch_size_od / self.cfg.train_strategy) - 1 + self.cfg.accumulation_steps = (self.cfg.accumulation_steps - 1) * self.cfg.train_strategy + + self.cfg.neg_num = max(min(self.cfg.neg_num, self.cfg.batch_size_ke - 3), 1) + + self.cfg.accumulation_steps_dict = {0: self.cfg.accumulation_steps, 1: self.cfg.accumulation_steps_ke, 2: self.cfg.accumulation_steps_ad, 3: self.cfg.accumulation_steps_od} + + # using the numeric embedding also requires adding the new special words, because their position information is bound to the tokenizer + if self.cfg.use_mlm_task or self.cfg.use_NumEmb: + assert self.cfg.add_special_word == 1 + + if self.cfg.use_NumEmb: + self.cfg.awl_num += 1 + + return self.cfg
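
A minimal usage sketch, not part of the patch itself, showing how the configuration class and the helpers added above could be wired together. It assumes the working directory is KTeleBERT/ (where test.sh lives); the real entry point main.py is not included in this diff, and build_training_context is a hypothetical helper name.

from config import cfg
from src.distributed_utils import init_distributed_mode
from torchlight import initialize_exp, set_seed


def build_training_context():
    # parse the command line, then apply the constraints from update_train_configs()
    parser_wrapper = cfg()
    parser_wrapper.get_args()
    args = parser_wrapper.update_train_configs()

    set_seed(args.random_seed)
    # becomes a no-op (args.distributed = False) unless RANK / WORLD_SIZE are set,
    # e.g. when the script is launched through torchrun
    init_distributed_mode(args)

    # creates dump/<MMDD>-<exp_name>/<exp_id>/ with params.pkl and train.log
    logger = initialize_exp(args)
    return args, logger


# Typical single-process launch (flags as in test.sh, purely illustrative):
#   python main.py --batch_size 150 --use_NumEmb 1 --mask_stratege wwm
# Distributed launch; torchrun exports RANK / WORLD_SIZE / LOCAL_RANK, which
# init_distributed_mode reads:
#   torchrun --nproc_per_node=4 main.py --dist 1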