Harsh502s committed on
Commit
f24359a
·
1 Parent(s): e1f4ca7

Remove unused KeyBERT model and update BERTopic

Browse files
Models/{stackexchange_topic_model.pkl → topic_key_model_130.pkl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f58e481e2bf3282ad0a39bd203dbad662ff3d1c70ae787a953c799fcc7159dbf
3
- size 597236536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:807e4facbc8beded07885eb54a9a7cd85871feb329828ec23d17cb45566d5133
3
+ size 601417294
Pages/Models.py CHANGED
@@ -2,7 +2,6 @@ import streamlit as st
2
  from streamlit_extras.tags import tagger_component
3
  import re
4
  import pickle
5
- from keybert import KeyBERT
6
  from bertopic import BERTopic
7
  from keras.models import load_model
8
  from keras.preprocessing.sequence import pad_sequences
@@ -12,8 +11,7 @@ from keras.preprocessing.sequence import pad_sequences
12
  @st.cache_resource
13
  def load_models():
14
  return (
15
- BERTopic.load(r"Models/stackexchange_topic_model.pkl"),
16
- KeyBERT("all-MiniLM-L6-v2"),
17
  load_model(r"Models/tag_model.h5"),
18
  pickle.load(open(r"Models/token.pkl", "rb")),
19
  pickle.load(open(r"Models/bin.pkl", "rb")),
@@ -21,7 +19,7 @@ def load_models():
21
 
22
 
23
  # Load the model into memory
24
- bertopic_model, keybert_model, cnn_model, tokenizer, binarizer = load_models()
25
 
26
 
27
  # Clean the input text
@@ -43,72 +41,29 @@ def tag_cnn_model(text):
43
 
44
 
45
  # Retrieve the keyphrases from the input text using the KeyBERT model
46
- def retrieve_keyphrases(text, n, ngram_range):
47
- keywords = keybert_model.extract_keywords(
48
- text,
49
- keyphrase_ngram_range=ngram_range,
50
- top_n=n,
51
- diversity=0.5,
52
- use_maxsum=True,
53
- use_mmr=True,
54
- seed_keywords=[
55
- "machine-learning",
56
- "r",
57
- "regression",
58
- "deep-learning",
59
- "neural-networks",
60
- "data-request",
61
- "python",
62
- "reinforcement-learning",
63
- "classification",
64
- "time-series",
65
- "probability",
66
- "neural-network",
67
- "distributions",
68
- "bayesian",
69
- "hypothesis-testing",
70
- "keras",
71
- "mathematical-statistics",
72
- "scikit-learn",
73
- "logistic",
74
- "convolutional-neural-networks",
75
- "clustering",
76
- "tensorflow",
77
- "terminology",
78
- "nlp",
79
- "correlation",
80
- "self-study",
81
- "normal-distribution",
82
- "geospatial",
83
- "cross-validation",
84
- "optimization",
85
- "random-forest",
86
- "mixed-model",
87
- "data-mining",
88
- "feature-selection",
89
- "pca",
90
- "references",
91
- "computer-vision",
92
- "data-visualization",
93
- "confidence-interval",
94
- "generalized-linear-model",
95
- "variance",
96
- "natural-language-processing",
97
- "dataset",
98
- "svm",
99
- "training",
100
- "maximum-likelihood",
101
- "statistical-significance",
102
- "gradient-descent",
103
- "multiple-regression",
104
- "estimation",
105
- ],
106
- )
107
- return sorted(keywords, key=lambda x: x[1], reverse=True)
108
 
109
 
110
  # Find the most similar topics for the input text using the BERTopic model
111
- def output_unsupervised(text, n):
112
  new_review = text
113
  similar_topics, similarity = bertopic_model.find_topics(new_review, top_n=n)
114
  similar_topics = sorted(similar_topics)
@@ -139,38 +94,34 @@ def unsupervised_page_bertopic():
139
  "Enter number of tags to assign", value=5, key="unsupervised_n_bertopic"
140
  )
141
  if st.button("Assign tags", key="unsupervised_button_bertopic"):
142
- output_unsupervised(text, n)
143
 
144
 
145
- # Display the unsupervised model using keybert page of the app
146
- def semi_unsupervised_page_keybert():
147
- st.header("Unsupervised Model Using KeyBERT Model")
148
  text = st.text_area(
149
  "Enter text to assign tags", height=200, key="unsupervised_text_keybert"
150
  )
151
  text = clean_text(text)
152
  n = st.number_input(
153
- "Enter number of tags to assign", value=10, key="unsupervised_n_keybert"
154
  )
155
- ngram_range_lower = st.number_input(
156
- "Enter lower limit of ngram range",
157
- value=1,
158
- min_value=1,
159
- max_value=6,
160
- key="unsupervised_ngram_lower",
 
 
 
161
  )
162
- ngram_range_upper = st.number_input(
163
- "Enter upper limit of ngram range",
164
- value=3,
165
- min_value=1,
166
- max_value=6,
167
- key="unsupervised_ngram_upper",
168
  )
169
- ngram_range = (ngram_range_lower, ngram_range_upper)
170
- if st.button("Assign tags", key="unsupervised_button_keybert"):
171
- topics = retrieve_keyphrases(text, n, ngram_range)
172
- topics = [topic[0] for topic in topics]
173
- tagger_component("Tags:", topics)
174
 
175
 
176
  # Display the model page of the app
@@ -187,14 +138,21 @@ def model_page():
187
 
188
  st.title("Select a model to use:")
189
  with st.container():
190
- tab1, tab2, tab3 = st.tabs(
191
- ["Supervised Using CNN", "Semi-Supervised-KeyBERT", "UnSupervised-BERTopic"]
 
 
 
 
 
192
  )
193
  with tab1:
194
  supervised_page()
195
  with tab2:
196
- semi_unsupervised_page_keybert()
197
  with tab3:
 
 
198
  unsupervised_page_bertopic()
199
  with st.container():
200
  with st.expander("Example Texts", expanded=False):
 
2
  from streamlit_extras.tags import tagger_component
3
  import re
4
  import pickle
 
5
  from bertopic import BERTopic
6
  from keras.models import load_model
7
  from keras.preprocessing.sequence import pad_sequences
 
11
  @st.cache_resource
12
  def load_models():
13
  return (
14
+ BERTopic.load(r"Models/topic_key_model_130.pkl"),
 
15
  load_model(r"Models/tag_model.h5"),
16
  pickle.load(open(r"Models/token.pkl", "rb")),
17
  pickle.load(open(r"Models/bin.pkl", "rb")),
 
19
 
20
 
21
  # Load the model into memory
22
+ bertopic_model, cnn_model, tokenizer, binarizer = load_models()
23
 
24
 
25
  # Clean the input text
 
41
 
42
 
43
  # Retrieve the keyphrases from the input text using the KeyBERT model
44
def output_keybert(text, n):
    """Show the "KeyBERT" tags of the topics most similar to ``text``.

    Asks the module-level ``bertopic_model`` for the ``n`` topics closest to
    ``text`` and renders one row of tags per topic with ``tagger_component``.

    Args:
        text: Input text to match against the model's topics.
        n: Number of topics (clusters) to display.
    """
    # Nothing to display, and it avoids passing a non-positive top_n along.
    if n <= 0:
        return

    similar_topics, _ = bertopic_model.find_topics(text, top_n=n)

    # Clusters are shown in ascending topic-id order, not by similarity.
    # Iterate over the topics that actually came back instead of range(n):
    # if fewer than n are returned, indexing by position raises IndexError.
    for rank, topic_id in enumerate(sorted(similar_topics)[:n], start=1):
        representation = bertopic_model.get_topic(topic_id, full=True)["KeyBERT"]
        # Each entry is indexable; its first element is used as the tag text.
        tags = [entry[0] for entry in representation]
        tagger_component(f"Tags from cluster {rank}:", tags)
52
+
53
+
54
+ # Retrieve the keyphrases from the input text using the BERTopic model's MMR representation
55
def output_mmr(text, n):
    """Show the "MMR" tags of the topics most similar to ``text``.

    Asks the module-level ``bertopic_model`` for the ``n`` topics closest to
    ``text`` and renders one row of tags per topic with ``tagger_component``.

    Args:
        text: Input text to match against the model's topics.
        n: Number of topics (clusters) to display.
    """
    # Nothing to display, and it avoids passing a non-positive top_n along.
    if n <= 0:
        return

    similar_topics, _ = bertopic_model.find_topics(text, top_n=n)

    # Clusters are shown in ascending topic-id order, not by similarity.
    # Iterate over the topics that actually came back instead of range(n):
    # if fewer than n are returned, indexing by position raises IndexError.
    for rank, topic_id in enumerate(sorted(similar_topics)[:n], start=1):
        representation = bertopic_model.get_topic(topic_id, full=True)["MMR"]
        # Each entry is indexable; its first element is used as the tag text.
        tags = [entry[0] for entry in representation]
        tagger_component(f"Tags from cluster {rank}:", tags)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
 
65
  # Find the most similar topics for the input text using the BERTopic model
66
+ def output_bertopic(text, n):
67
  new_review = text
68
  similar_topics, similarity = bertopic_model.find_topics(new_review, top_n=n)
69
  similar_topics = sorted(similar_topics)
 
94
  "Enter number of tags to assign", value=5, key="unsupervised_n_bertopic"
95
  )
96
  if st.button("Assign tags", key="unsupervised_button_bertopic"):
97
+ output_bertopic(text, n)
98
 
99
 
100
def unsupervised_page_keybert():
    """Render the tab that tags input text through ``output_keybert``.

    Draws a header, a text area and a tag-count input. Once the
    "Assign tags" button is pressed, the cleaned text and the chosen count
    are passed to ``output_keybert``.
    """
    st.header("Unsupervised Model Using BERTopic Model")
    raw_text = st.text_area(
        "Enter text to assign tags",
        height=200,
        key="unsupervised_text_keybert",
    )
    cleaned_text = clean_text(raw_text)
    tag_count = st.number_input(
        "Enter number of tags to assign",
        value=5,
        key="unsupervised_n_keybert",
    )
    assign_clicked = st.button("Assign tags", key="unsupervised_button_keybert")
    if not assign_clicked:
        return
    output_keybert(cleaned_text, tag_count)
111
+
112
+
113
+ # Display the page of the app for the unsupervised model using the MMR representation
114
def unsupervised_page_mmr():
    """Render the tab that tags input text through ``output_mmr``.

    Draws a header, a text area and a tag-count input. Once the
    "Assign tags" button is pressed, the cleaned text and the chosen count
    are passed to ``output_mmr``.
    """
    st.header("Unsupervised Model Using BERTopic Model")
    raw_text = st.text_area(
        "Enter text to assign tags",
        height=200,
        key="unsupervised_text_mmr",
    )
    cleaned_text = clean_text(raw_text)
    tag_count = st.number_input(
        "Enter number of tags to assign",
        value=5,
        key="unsupervised_n_mmr",
    )
    assign_clicked = st.button("Assign tags", key="unsupervised_button_mmr")
    if not assign_clicked:
        return
    output_mmr(cleaned_text, tag_count)
 
 
 
125
 
126
 
127
  # Display the model page of the app
 
138
 
139
  st.title("Select a model to use:")
140
  with st.container():
141
+ tab1, tab2, tab3, tab4 = st.tabs(
142
+ [
143
+ "Supervised Using CNN",
144
+ "UnSupervised-KeyBERT",
145
+ "UnSupervised-MMR",
146
+ "UnSupervised-BERTopic",
147
+ ]
148
  )
149
  with tab1:
150
  supervised_page()
151
  with tab2:
152
+ unsupervised_page_keybert()
153
  with tab3:
154
+ unsupervised_page_mmr()
155
+ with tab4:
156
  unsupervised_page_bertopic()
157
  with st.container():
158
  with st.expander("Example Texts", expanded=False):
Pages/Topic Model Results.py CHANGED
@@ -4,7 +4,7 @@ from bertopic import BERTopic
4
 
5
  @st.cache_resource
6
  def load_model():
7
- return BERTopic.load(r"Models/stackexchange_topic_model.pkl")
8
 
9
 
10
  bertopic_model = load_model()
 
4
 
5
@st.cache_resource
def load_model():
    """Load the BERTopic model from ``Models/topic_key_model_130.pkl``.

    Wrapped in ``st.cache_resource``, as in the original definition.
    """
    topic_model = BERTopic.load(r"Models/topic_key_model_130.pkl")
    return topic_model
8
 
9
 
10
  bertopic_model = load_model()