Thao Pham committed
Commit 664c81e · 1 Parent(s): 97bb53e

Adding app.py and pipeline.py, changed code structure

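The commit moves the extraction logic behind a Hugging Face Pipeline subclass (pipeline.py) that the new Gradio front end (app.py) drives. A minimal usage sketch of the new layout (not part of the commit itself): it mirrors the __main__ block added in pipeline.py, assumes the repository's pretrained-models/ directory and dependencies are available locally, and uses a made-up sample string.

import torch
from pipeline import KeywordExtractorPipeline

# Load the serialized models that the pipeline wraps (paths as used in this repo).
phobert = torch.load('pretrained-models/phobert.pt')
phobert.eval()
ner_model = torch.load('pretrained-models/ner-vietnamese-electra-base.pt')
ner_model.eval()

kw_pipeline = KeywordExtractorPipeline(phobert, ner_model)

# Keyword arguments are routed to preprocess/_forward/postprocess by _sanitize_parameters.
inp = {"text": "Nội dung bài báo cần trích xuất từ khoá.", "title": None}  # toy input
kws = kw_pipeline(inputs=inp, min_freq=1, ngram_n=(1, 3), top_n=5, diversify_result=False)
print(kws)  # list of (keyword, score) pairs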
.gitignore CHANGED
@@ -1,2 +1,4 @@
 venv
 test_file.txt
+scrap.py
+
app.py ADDED
@@ -0,0 +1,70 @@
+import gradio as gr
+import torch
+import os
+
+from pipeline import KeywordExtractorPipeline
+
+DIR_PATH = os.path.dirname(os.path.realpath(__file__))
+
+
+def extract_keyword(title, text, top_n, ngram_low_range, ngram_high_range, min_freq, diversify_result):
+    inp = {"text": text, "title": title}
+    keyword_ls = kw_pipeline(inputs=inp, min_freq=min_freq, ngram_n=(ngram_low_range, ngram_high_range),
+                             top_n=top_n, diversify_result=diversify_result)
+    result = ''
+    for kw, score in keyword_ls:
+        result += f'{kw}: {score}\n'
+    return result
+
+
+if gr.NO_RELOAD:
+    print("Loading PhoBERT model")
+    phobert = torch.load(f'{DIR_PATH}/pretrained-models/phobert.pt')
+    phobert.eval()
+
+    print("Loading NER model")
+    ner_model = torch.load(f'{DIR_PATH}/pretrained-models/ner-vietnamese-electra-base.pt')
+    ner_model.eval()
+    kw_pipeline = KeywordExtractorPipeline(phobert, ner_model)
+
+if __name__ == "__main__":
+    demo = gr.Interface(fn=extract_keyword,
+                        inputs=[
+                            gr.Text(
+                                label="Title",
+                                lines=1,
+                                value="Enter title here",
+                            ),
+                            gr.Textbox(
+                                label="Text",
+                                lines=5,
+                                value="Enter text here",
+                            ),
+                            gr.Number(
+                                label="Top N keywords",
+                                info="Number of keywords retrieved",
+                                value=10
+                            ),
+                            gr.Number(
+                                label="Ngram low range",
+                                value=1
+                            ),
+                            gr.Number(
+                                label="Ngram high range",
+                                value=3
+                            ),
+                            gr.Number(
+                                label="Ngram minimum frequency",
+                                value=1
+                            ),
+                            gr.Checkbox(
+                                label="Diversify result"
+                            )
+                        ],
+                        # inputs=["text", "textbox", "number", "number", "number", "number", "checkbox"],
+                        outputs=gr.Textbox(
+                            label="Keywords Extracted",
+                        )
+                        )
+
+    demo.launch(share=True)  # Share your demo with just 1 extra parameter 🚀
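Note on the gr.NO_RELOAD guard used above: in Gradio's reload mode (gradio app.py) the script is re-executed on every file change, but code inside an "if gr.NO_RELOAD:" block runs only once, so the expensive torch.load calls are not repeated. A minimal sketch of the same pattern, with a hypothetical load_models helper standing in for the loading code in app.py:

import gradio as gr

def load_models():
    # hypothetical stand-in for the torch.load(...) calls in app.py
    return "phobert", "ner_model"

if gr.NO_RELOAD:
    # executed when the script first starts, skipped on hot reloads in `gradio app.py`
    phobert, ner_model = load_models()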
keybertvi_model.py ADDED
@@ -0,0 +1,72 @@
+# import py_vncorenlp
+# from transformers import AutoTokenizer, pipeline
+# import torch
+# import os
+# from model.keyword_extraction_utils import extract_keywords
+#
+#
+# class KeyBERTVi:
+#
+#     def __init__(self, stopwords_file_path=None):
+#         self.annotator = py_vncorenlp.VnCoreNLP(annotators=["wseg", "pos"],
+#                                                 save_dir=f'{dir_path}/pretrained-models/vncorenlp')
+#         # model = py_vncorenlp.VnCoreNLP(save_dir='/absolute/path/to/vncorenlp')
+#         print("Loading PhoBERT model")
+#         self.phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
+#
+#         # use absolute path because torch is cached
+#         self.phobert = torch.load(f'{dir_path}/pretrained-models/phobert.pt')
+#         self.phobert.eval()
+#
+#         print("Loading NER model")
+#         ner_tokenizer = AutoTokenizer.from_pretrained("NlpHUST/ner-vietnamese-electra-base")
+#         ner_model = torch.load(f'{dir_path}/pretrained-models/ner-vietnamese-electra-base.pt')
+#         ner_model.eval()
+#         self.ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)
+#
+#         if stopwords_file_path is None:
+#             stopwords_file_path = f'{dir_path}/vietnamese-stopwords-dash.txt'
+#         with open(stopwords_file_path) as f:
+#             self.stopwords = [w.strip() for w in f.readlines()]
+#
+#     def extract_keywords(self, title, text, ngram_range=(1, 3), top_n=5, use_kmeans=False, use_mmr=False, min_freq=1):
+#         keyword_ls = extract_keywords(text, title,
+#                                       self.ner_pipeline,
+#                                       self.annotator,
+#                                       self.phobert_tokenizer,
+#                                       self.phobert,
+#                                       self.stopwords,
+#                                       ngram_n=ngram_range,
+#                                       top_n=top_n,
+#                                       use_kmeans=use_kmeans,
+#                                       use_mmr=use_mmr,
+#                                       min_freq=min_freq)
+#         return keyword_ls
+#
+#     def highlight(self, text, keywords):
+#         kw_ls = [' '.join(kw.split('_')) for kw, score in keywords]
+#         for key in kw_ls:
+#             text = text.replace(f" {key}", f" <mark>{key}</mark>")
+#         return text
+#
+#
+# dir_path = os.path.dirname(os.path.realpath(__file__))
+# if __name__ == "__main__":
+#     # args
+#     # print(dir_path)
+#
+#     stopwords_file_path = f'{dir_path}/vietnamese-stopwords-dash.txt'
+#
+#     # text_file_path = sys.argv[1]
+#     # with open(f'{dir_path}/{text_file_path}', 'r') as f:
+#     #     text = ' '.join([ln.strip() for ln in f.readlines()])
+#     # print(text)
+#
+#     # kw_model = KeyBERTVi()
+#     # model_name_on_hub = "KeyBERTVi"
+#     # kw_model.save_pretrained(model_name_on_hub)
+#     # kw_model.phobert_tokenizer.save_pretrained(model_name_on_hub)
+#
+#     # title = None
+#     # keyword_ls = kw_model.extract_keywords(title, text, ngram_range=(1, 3), top_n=5)
+#     # print(keyword_ls)
model.py DELETED
@@ -1,58 +0,0 @@
-import py_vncorenlp
-from transformers import AutoTokenizer, pipeline
-import torch
-import os
-from keyword_extraction import extract_keywords
-import sys
-
-
-class KeyBERTVi:
-
-    def __init__(self, stopwords_file_path):
-        self.annotator = py_vncorenlp.VnCoreNLP(annotators=["wseg", "pos"],
-                                                save_dir=f'{dir_path}/pretrained-models/vncorenlp')
-        # model = py_vncorenlp.VnCoreNLP(save_dir='/absolute/path/to/vncorenlp')
-        print("Loading PhoBERT model")
-        self.phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
-
-        # use absolute path because torch is cached
-        self.phobert = torch.load(f'{dir_path}/pretrained-models/phobert.pt')
-        self.phobert.eval()
-
-        print("Loading NER model")
-        ner_tokenizer = AutoTokenizer.from_pretrained("NlpHUST/ner-vietnamese-electra-base")
-        ner_model = torch.load(f'{dir_path}/pretrained-models/ner-vietnamese-electra-base.pt')
-        ner_model.eval()
-        self.ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)
-
-        with open(stopwords_file_path) as f:
-            self.stopwords = [w.strip() for w in f.readlines()]
-
-    def extract_keywords(self, title, text, ngram_range=(1, 3), top_n=5):
-        keyword_ls = extract_keywords(text, title,
-                                      self.ner_pipeline,
-                                      self.annotator,
-                                      self.phobert_tokenizer,
-                                      self.phobert,
-                                      self.stopwords,
-                                      ngram_n=ngram_range,
-                                      top_n=top_n)
-        return keyword_ls
-
-
-dir_path = os.path.dirname(os.path.realpath(__file__))
-if __name__ == "__main__":
-    # args
-    # print(dir_path)
-
-    stopwords_file_path = f'{dir_path}/vietnamese-stopwords-dash.txt'
-
-    text_file_path = sys.argv[1]
-    with open(f'{dir_path}/{text_file_path}', 'r') as f:
-        text = ' '.join([ln.strip() for ln in f.readlines()])
-    # print(text)
-
-    kw_model = KeyBERTVi(stopwords_file_path)
-    title = None
-    keyword_ls = kw_model.extract_keywords(title, text, ngram_range=(1, 3), top_n=5)
-    print(keyword_ls)
keyword_extraction.py → model/keyword_extraction_utils.py RENAMED
@@ -2,13 +2,11 @@ from string import punctuation
 import numpy as np
 import torch
 from sklearn.cluster import KMeans
-from named_entities import get_named_entities
+from model.named_entities import get_named_entities
 
 punctuation = [c for c in punctuation if c != "_"]
 punctuation += ["“", "–", ",", "…", "”", "–"]
 
-# with open('/Users/thaopham/bao-lao-dong-data/utils/vietnamese-stopwords-dash.txt') as f:
-#     stopwords = [w.strip() for w in f.readlines()]
 
 ethnicity_dict_map = {"H'Mông": "HMông",
                       "H'mông": "HMông",
@@ -69,8 +67,8 @@ def compute_ngram_list(segmentised_doc, ngram_n, stopwords_ls, subsentences=True
     ngram_list = []
     for sentence in output_sub_sentences:
         sent = sentence.split()
-        for i in range(len(sent) - (ngram_n) + 1):
-            ngram = ' '.join(sent[i:i + (ngram_n)])
+        for i in range(len(sent) - ngram_n + 1):
+            ngram = ' '.join(sent[i:i + ngram_n])
             if ngram not in ngram_list and not check_for_stopwords(ngram, stopwords_ls):
                 ngram_list.append(ngram)
 
@@ -116,15 +114,11 @@ def get_segmentised_doc(nlp, rdrsegmenter, title, doc):
         title = title.replace(i, j)
         doc = doc.replace(i, j)
 
-    print(title)
-    # print(doc)
     segmentised_doc = rdrsegmenter.word_segment(doc)
 
     if title is not None:
         segmentised_doc = rdrsegmenter.word_segment(title) + rdrsegmenter.word_segment(doc)
-    print(segmentised_doc)
     ne_ls = set(get_named_entities(nlp, doc))
-    print(sorted(ne_ls))
 
     segmentised_doc_ne = []
     for sent in segmentised_doc:
@@ -150,13 +144,6 @@ def compute_ngram_embeddings(tokenizer, phobert, ngram_list):
     return ngram_embeddings
 
 
-def normalised_cosine_similarity(ngram_embedding, document_embedding):
-    similarity_score = cosine_similarity(ngram_embedding, document_embedding)
-    magnitude_ngram = np.linalg.norm(ngram_embedding)
-    magnitude_doc = np.linalg.norm(document_embedding)
-    return similarity_score / np.sqrt(magnitude_ngram * magnitude_doc)
-
-
 def compute_ngram_similarity(ngram_list, ngram_embeddings, doc_embedding):
     ngram_similarity_dict = {}
 
@@ -168,20 +155,7 @@ def compute_ngram_similarity(ngram_list, ngram_embeddings, doc_embedding):
     return ngram_similarity_dict
 
 
-# def compute_ngram_similarity(ngram_list, ngram_embeddings, doc_embedding):
-#     ngram_similarity_dict = {}
-#
-#     for ngram in ngram_list:
-#         score = [cosine_similarity(ngram_embeddings[ngram], doc_embedding.T).flatten()[0]]
-#         for w in ngram.split():
-#             similarity_score = cosine_similarity(ngram_embeddings[w], doc_embedding.T).flatten()[0]
-#             score.append(similarity_score)
-#         ngram_similarity_dict[ngram] = np.mean(score)
-#
-#     return ngram_similarity_dict
-
-
-def diversify_result(ngram_result, ngram_embeddings, top_n=5):
+def diversify_result_kmeans(ngram_result, ngram_embeddings, top_n=5):
     best_ngrams = sorted(ngram_result, key=ngram_result.get, reverse=True)[:top_n * 4]
     best_ngram_embeddings = np.array([ngram_embeddings[ngram] for ngram in best_ngrams]).squeeze()
     vote = {}
@@ -208,36 +182,8 @@ def diversify_result(ngram_result, ngram_embeddings, top_n=5):
     return diversify_result_ls[:top_n]
 
 
-def mmr(ngram_result, ngram_embeddings, lambda_=0.7, top_n=5):
-    ngram_result = {key: ngram_result[key] for key in
-                    sorted(ngram_result, key=ngram_result.get, reverse=True)[:top_n * 4]}
-
-    mmr_result = {}
-    for ngram1 in ngram_result:
-        similary_score_to_doc = ngram_result[ngram1]
-        max_sim = -1
-        # most_similar_ngram = None
-        for ngram2 in ngram_result:
-            if ngram2 != ngram1:
-                similarity_score_to_ngram = \
-                    cosine_similarity(ngram_embeddings[ngram1], ngram_embeddings[ngram2].T).flatten()[0]
-
-                if ngram2.lower() == ngram1.lower():
-                    similarity_score_to_ngram = 1
-
-                if similarity_score_to_ngram > max_sim:
-                    max_sim = similarity_score_to_ngram
-                    # most_similar_ngram = ngram2
-        # print(ngram1, most_similar_ngram, max_sim)
-        mmr_result[ngram1] = lambda_ * similary_score_to_doc - (1 - lambda_) * max_sim
-
-    mmr_result_ls = [(key, mmr_result[key]) for key in mmr_result]
-    mmr_result_ls = sorted(mmr_result_ls, key=lambda x: x[1], reverse=True)
-    return mmr_result_ls[:top_n]
-
-
 def remove_duplicates(ngram_result):
-    to_remove = []
+    to_remove = set()
     for ngram in ngram_result:
 
         for ngram2 in ngram_result:
@@ -245,7 +191,7 @@ def remove_duplicates(ngram_result):
                 new_score = np.mean([ngram_result[ngram], ngram_result[ngram2]])
 
                 ngram_result[ngram] = new_score
-                to_remove.append(ngram2)
+                to_remove.add(ngram2)
 
     for ngram in to_remove:
         ngram_result.pop(ngram)
@@ -257,9 +203,9 @@ def compute_filtered_text(annotator, title, text):
     if title is not None:
         annotated = annotator.annotate_text(title + '. ' + text)
     filtered_sentences = []
-    # keep_tags = ['N', 'Np', 'V', 'A']
-    keep_tags = ['N', 'Np', 'V']
+    keep_tags = ['N', 'Np', 'V', 'Nc']
     for key in annotated.keys():
+        # print(key,annotated[key])
        sent = ' '.join([dict_['wordForm'] for dict_ in annotated[key] if dict_['posTag'] in keep_tags])
         filtered_sentences.append(sent)
     return filtered_sentences
@@ -267,11 +213,9 @@ def compute_filtered_text(annotator, title, text):
 
 def get_candidate_ngrams(segmentised_doc, filtered_segmentised_doc, ngram_n, stopwords_ls):
     # get actual ngrams
-    # segmentised_doc = get_segmentised_doc(nlp, annotator, title, text)
     actual_ngram_list = compute_ngram_list(segmentised_doc, ngram_n, stopwords_ls, subsentences=True)
 
     # get filtered ngrams
-    # filtered_segmentised_doc = compute_filtered_text(annotator, title, text)
     filtered_ngram_list = compute_ngram_list(filtered_segmentised_doc, ngram_n, stopwords_ls,
                                              subsentences=False)
 
@@ -280,71 +224,37 @@ def get_candidate_ngrams(segmentised_doc, filtered_segmentised_doc, ngram_n, sto
     return candidate_ngram
 
 
-def get_ngram_frequencies(doc_segmentised, ngram_list):
+def limit_minimum_frequency(doc_segmentised, ngram_list, min_freq=1):
     ngram_dict_freq = {}
     for ngram in ngram_list:
         ngram_n = len(ngram.split())
         count = 0
         for sentence in doc_segmentised:
             sent = sentence.split()
+            # print(sent)
             for i in range(len(sent) - ngram_n + 1):
                 pair = ' '.join(sent[i:i + ngram_n])
+                # print(pair, ngram)
                 if pair == ngram:
                     count += 1
-        if count > 1:
+        # print(ngram, count)
+        if count >= min_freq:
            ngram_dict_freq[ngram] = count
 
     return ngram_dict_freq
 
 
-def extract_keywords(text, title, nlp, annotator, tokenizer, phobert, stopwords_ls, ngram_n=(2, 2), top_n=5,
-                     use_kmeans=False, use_mmr=False):
-    # ROOT_DIR = '/Users/thaopham/bao-lao-dong-data'
-    # with open(f'{ROOT_DIR}/utils/vietnamese-stopwords-dash.txt') as f:
-    #     stopwords = [w.strip() for w in f.readlines()]
-
-    ngram_low, ngram_high = ngram_n
-
-    ne_ls, doc_segmentised = get_segmentised_doc(nlp, annotator, title, text)
-    filtered_doc_segmentised = compute_filtered_text(annotator, title, text)
-    # print(doc_segmentised)
-
-    print("Generate ngram list")
-    ngram_list = set()
-    for n in range(ngram_low, ngram_high + 1):
-        # ngram_list += compute_ngram_list(segmentised_doc=doc_segmentised, ngram_n=n, stopwords_ls=stopwords)
-        # ngram_list.update(compute_ngram_list(segmentised_doc=doc_segmentised, ngram_n=n, stopwords_ls=stopwords))
-        ngram_list.update(get_candidate_ngrams(doc_segmentised, filtered_doc_segmentised, n, stopwords_ls))
-    ngram_list.update([annotator.word_segment(ne)[0] for ne in ne_ls])
-
-    print(len(ngram_list))
-    ngram_list = get_ngram_frequencies(doc_segmentised, ngram_list)
-    print(len(ngram_list))
-    # print(sorted(ngram_list))
-
-    print("Generate ngram embeddings")
-    ngram_embeddings = compute_ngram_embeddings(tokenizer, phobert, ngram_list)
-
-    print("Generate document embeddings")
-    doc_embedding = get_doc_embeddings(filtered_doc_segmentised, tokenizer, phobert, stopwords_ls)
-
-    ngram_result = compute_ngram_similarity(ngram_list, ngram_embeddings, doc_embedding)
-    ngram_result = remove_duplicates(ngram_result)
-    non_diversified = sorted([(ngram, ngram_result[ngram]) for ngram in ngram_result],
-                             key=lambda x: x[1], reverse=True)[:top_n]
-
-    # Diversify result
-    if use_kmeans:
-        diversified_kw_kmeans = diversify_result(ngram_result, ngram_embeddings, top_n=top_n)
-        return diversified_kw_kmeans
-
-    if use_mmr:
-        diversified_kw_mmr = mmr(ngram_result, ngram_embeddings, lambda_=0.85, top_n=top_n)
-        return diversified_kw_mmr
-    return non_diversified
-
-
-def highlight(text, keywords):
-    for key in keywords:
-        text = text.replace(f" {key}", f" <mark>{key}</mark>")
-    return text
+def remove_overlapping_ngrams(ngram_list):
+    to_remove = set()
+    for ngram1 in ngram_list:
+        for ngram2 in ngram_list:
+            if len(ngram1.split()) > len(ngram2.split()) and (ngram1.startswith(ngram2) or ngram1.endswith(ngram2)):
+                # print(ngram1, ngram2)
+                # print()
+                to_remove.add(ngram2)
+
+    # print("To removed")
+    # print(to_remove)
+    for kw in to_remove:
+        ngram_list.remove(kw)
+    return ngram_list
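To illustrate the two helpers introduced by this rename, a toy example run against the functions exactly as defined above (not part of the commit; it assumes the repository and its dependencies are importable, and the Vietnamese strings are made-up sample data):

from model.keyword_extraction_utils import limit_minimum_frequency, remove_overlapping_ngrams

# "học_sinh" is contained in the longer "học_sinh giỏi", so the shorter n-gram is dropped
print(remove_overlapping_ngrams(["học_sinh", "học_sinh giỏi", "trường"]))
# ['học_sinh giỏi', 'trường']

# Keeps only n-grams appearing at least min_freq times, together with their counts
doc = ["học_sinh giỏi đến trường", "trường đón học_sinh giỏi"]
print(limit_minimum_frequency(doc, ["học_sinh giỏi", "trường"], min_freq=2))
# {'học_sinh giỏi': 2, 'trường': 2}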
named_entities.py → model/named_entities.py RENAMED
@@ -1,6 +1,4 @@
-# from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
 from underthesea import sent_tokenize
-import torch
 
 
 def substring(w, ls):
process_text.py → model/process_text.py RENAMED
@@ -1,8 +1,7 @@
 from string import punctuation
-import re
 
 
-def process_article_content(text):
+def process_text_pipeline(text):
     full_text_processed = replace_all(text.strip())
 
     while '\n\n' in full_text_processed:
@@ -10,12 +9,6 @@ def process_article_content(text):
 
     full_text_processed = process_sticking_sentences(full_text_processed)
 
-    pattern = "Ảnh: [A-ZĐÀÁÂÃÈÉÊẾÌÍÒÓÔÕÙÚĂĐĨŨƠƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỂỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪỬỮỰỲỴÝỶỸa-zđ][a-zàáâãèéêếìíòóôõùúăđĩũơưăạảấầẩẫậắằẳẵặẹẻẽềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹ]+ [A-ZĐÀÁÂÃÈÉÊẾÌÍÒÓÔÕÙÚĂĐĨŨƠƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỂỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪỬỮỰỲỴÝỶỸa-zđ][a-zàáâãèéêếìíòóôõùúăđĩũơưăạảấầẩẫậắằẳẵặẹẻẽềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹ]+\.?"
-    full_text_processed = re.sub(pattern, '', full_text_processed)
-
-    pattern = "Ảnh: [A-ZĐÀÁÂÃÈÉÊẾÌÍÒÓÔÕÙÚĂĐĨŨƠƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỂỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪỬỮỰỲỴÝỶỸ]+\.?"
-    full_text_processed = re.sub(pattern, '', full_text_processed)
-
     while '  ' in full_text_processed:
         full_text_processed = full_text_processed.replace('  ', ' ')
     return full_text_processed
pipeline.py ADDED
@@ -0,0 +1,134 @@
+import py_vncorenlp
+from transformers import AutoTokenizer, Pipeline, pipeline
+import os
+
+from model.keyword_extraction_utils import *
+from model.process_text import process_text_pipeline
+
+dir_path = os.path.dirname(os.path.realpath(__file__))
+
+
+class KeywordExtractorPipeline(Pipeline):
+    def __init__(self, model, ner_model, **kwargs):
+        super().__init__(model, **kwargs)
+        self.annotator = py_vncorenlp.VnCoreNLP(annotators=["wseg", "pos"],
+                                                save_dir=f'{dir_path}/pretrained-models/vncorenlp')
+
+        print("Loading PhoBERT tokenizer")
+        self.phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
+        self.phobert = model
+
+        print("Loading NER tokenizer")
+        ner_tokenizer = AutoTokenizer.from_pretrained("NlpHUST/ner-vietnamese-electra-base")
+        self.ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)
+
+        stopwords_file_path = f'{dir_path}/vietnamese-stopwords-dash.txt'
+        with open(stopwords_file_path) as f:
+            self.stopwords = [w.strip() for w in f.readlines()]
+
+    def _sanitize_parameters(self, **kwargs):
+        preprocess_kwargs = {}
+        forward_kwargs = {}
+        postprocess_kwargs = {}
+
+        for possible_preprocess_kwarg in ["text", "title"]:
+            if possible_preprocess_kwarg in kwargs:
+                preprocess_kwargs[possible_preprocess_kwarg] = kwargs[possible_preprocess_kwarg]
+
+        for possible_forward_kwarg in ["ngram_n", "min_freq"]:
+            if possible_forward_kwarg in kwargs:
+                forward_kwargs[possible_forward_kwarg] = kwargs[possible_forward_kwarg]
+
+        for possible_postprocess_kwarg in ["top_n", "diversify_result"]:
+            if possible_postprocess_kwarg in kwargs:
+                postprocess_kwargs[possible_postprocess_kwarg] = kwargs[possible_postprocess_kwarg]
+
+        return preprocess_kwargs, forward_kwargs, postprocess_kwargs
+
+    def preprocess(self, inputs):
+        title = None
+        if inputs['title']:
+            title = process_text_pipeline(inputs['title'])
+        text = process_text_pipeline(inputs['text'])
+        return {"text": text, "title": title}
+
+    def _forward(self, model_inputs, ngram_n, min_freq):
+        text = model_inputs['text']
+        title = model_inputs['title']
+
+        # Getting segmentised document
+        ne_ls, doc_segmentised = get_segmentised_doc(self.ner_pipeline, self.annotator, title, text)
+        filtered_doc_segmentised = compute_filtered_text(self.annotator, title, text)
+
+        doc_embedding = get_doc_embeddings(filtered_doc_segmentised, self.phobert_tokenizer, self.phobert,
+                                           self.stopwords)
+
+        ngram_list = self.generate_ngram_list(doc_segmentised, filtered_doc_segmentised, ne_ls, ngram_n, min_freq)
+        print("Final ngram list")
+        print(sorted(ngram_list))
+
+        ngram_embeddings = compute_ngram_embeddings(self.phobert_tokenizer, self.phobert, ngram_list)
+
+        return {"ngram_list": ngram_list, "ngram_embeddings": ngram_embeddings, "doc_embedding": doc_embedding}
+
+    def postprocess(self, model_outputs, top_n, diversify_result):
+        ngram_list = model_outputs['ngram_list']
+        ngram_embeddings = model_outputs['ngram_embeddings']
+        doc_embedding = model_outputs['doc_embedding']
+
+        ngram_result = self.extract_keywords(doc_embedding, ngram_list, ngram_embeddings)
+        non_diversified = sorted([(ngram, ngram_result[ngram]) for ngram in ngram_result],
+                                 key=lambda x: x[1], reverse=True)[:top_n]
+
+        if diversify_result:
+            return diversify_result_kmeans(ngram_result, ngram_embeddings, top_n=top_n)
+        return non_diversified
+
+    def generate_ngram_list(self, doc_segmentised, filtered_doc_segmentised, ne_ls, ngram_n, min_freq):
+        ngram_low, ngram_high = ngram_n
+
+        # Adding ngram
+        ngram_list = set()
+        for n in range(ngram_low, ngram_high + 1):
+            ngram_list.update(get_candidate_ngrams(doc_segmentised, filtered_doc_segmentised, n, self.stopwords))
+
+        # print(sorted(ngram_list))
+        # Adding named entities ngram list
+        ne_ls_segmented = [self.annotator.word_segment(ne)[0] for ne in ne_ls]
+        print("Named Entities list")
+        print(ne_ls_segmented)
+        ngram_list.update(ne_ls_segmented)
+
+        # print(sorted(ngram_list))
+        # Removing overlapping ngrams
+        ngram_list = remove_overlapping_ngrams(ngram_list)
+        # print("Removed overlapping ngrams")
+        # print(sorted(ngram_list))
+
+        # Limit ngrams by minimum frequency
+        if min_freq > 1:
+            ngram_list = limit_minimum_frequency(doc_segmentised, ngram_list, min_freq=min_freq)
+            return ngram_list.keys()
+
+        return ngram_list
+
+    def extract_keywords(self, doc_embedding, ngram_list, ngram_embeddings):
+        ngram_result = compute_ngram_similarity(ngram_list, ngram_embeddings, doc_embedding)
+        ngram_result = remove_duplicates(ngram_result)
+        return ngram_result
+
+
+if __name__ == "__main__":
+    phobert = torch.load(f'{dir_path}/pretrained-models/phobert.pt')
+    phobert.eval()
+    ner_model = torch.load(f'{dir_path}/pretrained-models/ner-vietnamese-electra-base.pt')
+    ner_model.eval()
+    kw_pipeline = KeywordExtractorPipeline(phobert, ner_model)
+
+    text_file_path = f'{dir_path}/test_file.txt'
+    with open(text_file_path, 'r') as f:
+        text = ' '.join([ln.strip() for ln in f.readlines()])
+
+    inp = {"text": text, "title": None}
+    kws = kw_pipeline(inputs=inp, min_freq=1, ngram_n=(1, 3), top_n=5, diversify_result=False)
+    print(kws)
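For readers unfamiliar with custom transformers pipelines: calling kw_pipeline(...) goes through Pipeline.__call__, which uses _sanitize_parameters to split the keyword arguments into three dicts handed to preprocess, _forward and postprocess. A rough, illustration-only equivalent of the call in the __main__ block above (the real __call__ additionally handles device placement and runs the forward step under no_grad):

# Continuing from the __main__ example above (kw_pipeline and text already defined).
inp = {"text": text, "title": None}
pre_kwargs, fwd_kwargs, post_kwargs = kw_pipeline._sanitize_parameters(
    min_freq=1, ngram_n=(1, 3), top_n=5, diversify_result=False)

model_inputs = kw_pipeline.preprocess(inp, **pre_kwargs)          # text normalisation
model_outputs = kw_pipeline._forward(model_inputs, **fwd_kwargs)  # segmentation, n-grams, embeddings
keywords = kw_pipeline.postprocess(model_outputs, **post_kwargs)  # similarity ranking / diversification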
requirements.txt ADDED
@@ -0,0 +1,3 @@
+torch
+py_vncorenlp
+transformers
vietnamese-stopwords-dash.txt CHANGED
@@ -868,7 +868,6 @@ ngày_xửa
 ngày_đến
 ngày_ấy
 ngôi
-ngôi_nhà
 ngôi_thứ
 ngõ_hầu
 ngăn_ngắt
@@ -1031,7 +1030,6 @@ năm
 năm_tháng
 nơi
 nơi_nơi
-nước
 nước_bài
 nước_cùng
 nước_lên
@@ -1987,4 +1985,14 @@ Nhờ
 Hiện
 Hiện_tại
 hiện
-cạnh
+cạnh
+Xung_quanh
+tóm_lại
+Cho
+Sau_này
+Vào
+ngày_nay
+chung_quanh
+Qua
+Thông_qua
+bao_gồm