import os

import py_vncorenlp
import torch
from transformers import AutoTokenizer, Pipeline, pipeline

from model.keyword_extraction_utils import *
from model.process_text import process_text_pipeline

dir_path = os.path.dirname(os.path.realpath(__file__))

class KeywordExtractorPipeline(Pipeline):
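    """Vietnamese keyword-extraction pipeline built on the Hugging Face Pipeline API.

    PhoBERT supplies the embeddings, VnCoreNLP handles word segmentation and POS
    tagging, and an ELECTRA-based NER model contributes named entities; candidate
    ngrams are ranked by the similarity of their embeddings to the document's.
    """
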
def __init__(self, model, ner_model, **kwargs):
super().__init__(model, **kwargs)
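        # VnCoreNLP word segmenter / POS tagger; this assumes the VnCoreNLP models
        # were already downloaded into save_dir (e.g. with py_vncorenlp.download_model)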
self.annotator = py_vncorenlp.VnCoreNLP(annotators=["wseg", "pos"],
save_dir=f'{dir_path}/pretrained-models/vncorenlp')
print("Loading PhoBERT tokenizer")
self.phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
self.phobert = model
print("Loading NER tokenizer")
ner_tokenizer = AutoTokenizer.from_pretrained("NlpHUST/ner-vietnamese-electra-base")
self.ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)
        stopwords_file_path = f'{dir_path}/vietnamese-stopwords-dash.txt'
        with open(stopwords_file_path, encoding='utf-8') as f:
            self.stopwords = [w.strip() for w in f]

def _sanitize_parameters(self, **kwargs):
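        """Route caller kwargs to the preprocess, _forward and postprocess stages,
        following the Hugging Face Pipeline contract."""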
preprocess_kwargs = {}
forward_kwargs = {}
postprocess_kwargs = {}
for possible_preprocess_kwarg in ["text", "title"]:
if possible_preprocess_kwarg in kwargs:
preprocess_kwargs[possible_preprocess_kwarg] = kwargs[possible_preprocess_kwarg]
for possible_forward_kwarg in ["ngram_n", "min_freq"]:
if possible_forward_kwarg in kwargs:
forward_kwargs[possible_forward_kwarg] = kwargs[possible_forward_kwarg]
for possible_postprocess_kwarg in ["top_n", "diversify_result"]:
if possible_postprocess_kwarg in kwargs:
postprocess_kwargs[possible_postprocess_kwarg] = kwargs[possible_postprocess_kwarg]
return preprocess_kwargs, forward_kwargs, postprocess_kwargs

    def preprocess(self, inputs):
        # `inputs` is the dict the pipeline is called with: {"text": ..., "title": ...}
        title = process_text_pipeline(inputs['title']) if inputs.get('title') else None
        text = process_text_pipeline(inputs['text'])
        return {"text": text, "title": title}

    def _forward(self, model_inputs, ngram_n=(1, 3), min_freq=1):  # defaults match the example below
        text = model_inputs['text']
        title = model_inputs['title']

        # Word-segment the document and collect its named entities
        ne_ls, doc_segmentised = get_segmentised_doc(self.ner_pipeline, self.annotator, title, text)
        filtered_doc_segmentised = compute_filtered_text(self.annotator, title, text)

        # Embed the document with PhoBERT
        doc_embedding = get_doc_embeddings(filtered_doc_segmentised, self.phobert_tokenizer, self.phobert,
                                           self.stopwords)

        # Build the candidate ngram list and embed each candidate
        ngram_list = self.generate_ngram_list(doc_segmentised, filtered_doc_segmentised, ne_ls, ngram_n, min_freq)
        print("Final ngram list")
        print(sorted(ngram_list))
        ngram_embeddings = compute_ngram_embeddings(self.phobert_tokenizer, self.phobert, ngram_list)

        return {"ngram_list": ngram_list, "ngram_embeddings": ngram_embeddings, "doc_embedding": doc_embedding}

    def postprocess(self, model_outputs, top_n=5, diversify_result=False):
        ngram_list = model_outputs['ngram_list']
        ngram_embeddings = model_outputs['ngram_embeddings']
        doc_embedding = model_outputs['doc_embedding']

        # Score every candidate against the document embedding
        ngram_result = self.extract_keywords(doc_embedding, ngram_list, ngram_embeddings)

        if diversify_result:
            # Spread the result across k-means clusters of the candidate embeddings
            return diversify_result_kmeans(ngram_result, ngram_embeddings, top_n=top_n)
        return sorted(ngram_result.items(), key=lambda x: x[1], reverse=True)[:top_n]

    def generate_ngram_list(self, doc_segmentised, filtered_doc_segmentised, ne_ls, ngram_n, min_freq):
        ngram_low, ngram_high = ngram_n

        # Collect candidate ngrams for every n in the requested range
        ngram_list = set()
        for n in range(ngram_low, ngram_high + 1):
            ngram_list.update(get_candidate_ngrams(doc_segmentised, filtered_doc_segmentised, n, self.stopwords))

        # Add the word-segmented named entities to the candidate list
        ne_ls_segmented = [self.annotator.word_segment(ne)[0] for ne in ne_ls]
        print("Named Entities list")
        print(ne_ls_segmented)
        ngram_list.update(ne_ls_segmented)

        # Remove overlapping ngrams
        ngram_list = remove_overlapping_ngrams(ngram_list)

        # Drop candidates that occur fewer than min_freq times
        if min_freq > 1:
            return limit_minimum_frequency(doc_segmentised, ngram_list, min_freq=min_freq).keys()
        return ngram_list

    def extract_keywords(self, doc_embedding, ngram_list, ngram_embeddings):
        # Rank candidates by embedding similarity to the document, then drop duplicates
        ngram_result = compute_ngram_similarity(ngram_list, ngram_embeddings, doc_embedding)
        ngram_result = remove_duplicates(ngram_result)
        return ngram_result


if __name__ == "__main__":
    # Example run: the models are expected to have been serialized beforehand
    # with torch.save under pretrained-models/
    phobert = torch.load(f'{dir_path}/pretrained-models/phobert.pt')
    phobert.eval()
    ner_model = torch.load(f'{dir_path}/pretrained-models/ner-vietnamese-electra-base.pt')
    ner_model.eval()

    kw_pipeline = KeywordExtractorPipeline(phobert, ner_model)

    text_file_path = f'{dir_path}/test_file.txt'
    with open(text_file_path, encoding='utf-8') as f:
        text = ' '.join(ln.strip() for ln in f)

    inp = {"text": text, "title": None}
    kws = kw_pipeline(inputs=inp, min_freq=1, ngram_n=(1, 3), top_n=5, diversify_result=False)
    print(kws)
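
    # The same call with diversification enabled; diversify_result_kmeans is
    # expected to spread the top_n keywords across embedding clusters:
    # kws_diverse = kw_pipeline(inputs=inp, min_freq=1, ngram_n=(1, 3), top_n=5, diversify_result=True)
    # print(kws_diverse)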