Sonnyjim committed on
Commit
e09dd3b
·
1 Parent(s): 43ac0d8

Added option to reduce outliers based on closest topic

Browse files
Files changed (2) hide show
  1. app.py +59 -23
  2. funcs/anonymiser.py +17 -10
app.py CHANGED
@@ -80,7 +80,7 @@ hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1
80
  hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
81
 
82
 
83
- def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics):
84
 
85
  all_tic = time.perf_counter()
86
 
@@ -99,12 +99,17 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
99
  in_label_list_first = in_colnames_list_first
100
 
101
  if anonymise_drop == "Yes":
 
 
102
  in_files_anon_col, anonymisation_success = anon.anonymise_script(in_files, in_colnames_list_first, anon_strat="replace")
103
  in_files[in_colnames_list_first] = in_files_anon_col[in_colnames_list_first]
104
  anonymise_data_name = "anonymised_data.csv"
105
  in_files.to_csv(anonymise_data_name)
106
  output_list.append(anonymise_data_name)
107
 
 
 
 
108
  docs = list(in_files[in_colnames_list_first].str.lower())
109
  label_col = in_files[in_label_list_first]
110
 
@@ -115,7 +120,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
115
  print("Low resource mode: ", low_resource_mode)
116
 
117
  if low_resource_mode == "No":
118
- print("Choosing high resource Jina transformer model")
119
  try:
120
  embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True,local_files_only=True, device_map="auto")
121
  except:
@@ -125,7 +130,8 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
125
 
126
  embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
127
 
128
- umap_model = UMAP(n_neighbors=15, n_components=5, random_state=random_seed)
 
129
 
130
  elif low_resource_mode == "Yes":
131
  print("Choosing low resource TF-IDF model.")
@@ -140,8 +146,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
140
 
141
  embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
142
 
143
-
144
- vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.05, max_df=0.9)
145
 
146
  from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
147
  from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
@@ -152,13 +157,22 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
152
 
153
  if not candidate_topics:
154
 
155
- topic_model = BERTopic( embedding_model=embedding_model_pipe,
156
- vectorizer_model=vectoriser_model,
157
- umap_model=umap_model,
158
- min_topic_size= min_docs_slider,
159
- nr_topics = max_topics_slider,
160
- representation_model=representation_model,
161
- verbose = True)
 
 
 
 
 
 
 
 
 
162
 
163
  topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
164
 
@@ -174,15 +188,26 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
174
  zero_shot_topics = read_file(candidate_topics.name)
175
  zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
176
 
177
- topic_model = BERTopic( embedding_model=embedding_model_pipe,
178
- vectorizer_model=vectoriser_model,
179
- umap_model=umap_model,
180
- min_topic_size = min_docs_slider,
181
- nr_topics = max_topics_slider,
182
- zeroshot_topic_list = zero_shot_topics_lower,
183
- zeroshot_min_similarity = 0.5,#0.7,
184
- representation_model=representation_model,
185
- verbose = True)
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
188
 
@@ -192,6 +217,15 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
192
  else:
193
  print("Preparing topic model outputs.")
194
 
 
 
 
 
 
 
 
 
 
195
  topic_dets = topic_model.get_topic_info()
196
  #print(topic_dets.columns)
197
 
@@ -299,7 +333,7 @@ with block:
299
 
300
  with gr.Row():
301
  min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of similar documents needed to make a topic.")
302
- max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 3, step = 1, label = "Maximum number of topics")
303
 
304
  with gr.Row():
305
  topics_btn = gr.Button("Extract topics")
@@ -319,6 +353,8 @@ with block:
319
  with gr.Row():
320
  low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
321
  create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
 
 
322
  save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
323
  visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="No", choices=["Yes", "No"])
324
 
@@ -326,7 +362,7 @@ with block:
326
  in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
327
  in_colnames.change(dummy_function, in_colnames, None)
328
 
329
- topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics], outputs=[output_single_text, output_file, plot], api_name="topics")
330
 
331
  block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
332
 
 
80
  hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
81
 
82
 
83
+ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers):
84
 
85
  all_tic = time.perf_counter()
86
 
 
99
  in_label_list_first = in_colnames_list_first
100
 
101
  if anonymise_drop == "Yes":
102
+ anon_tic = time.perf_counter()
103
+ time_out = f"Creating visualisation took {all_toc - vis_tic:0.1f} seconds"
104
  in_files_anon_col, anonymisation_success = anon.anonymise_script(in_files, in_colnames_list_first, anon_strat="replace")
105
  in_files[in_colnames_list_first] = in_files_anon_col[in_colnames_list_first]
106
  anonymise_data_name = "anonymised_data.csv"
107
  in_files.to_csv(anonymise_data_name)
108
  output_list.append(anonymise_data_name)
109
 
110
+ anon_toc = time.perf_counter()
111
+ time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
112
+
113
  docs = list(in_files[in_colnames_list_first].str.lower())
114
  label_col = in_files[in_label_list_first]
115
 
 
120
  print("Low resource mode: ", low_resource_mode)
121
 
122
  if low_resource_mode == "No":
123
+ print("Using high resource Jina transformer model")
124
  try:
125
  embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True,local_files_only=True, device_map="auto")
126
  except:
 
130
 
131
  embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
132
 
133
+ # UMAP model uses Bertopic defaults
134
+ umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=random_seed)
135
 
136
  elif low_resource_mode == "Yes":
137
  print("Choosing low resource TF-IDF model.")
 
146
 
147
  embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
148
 
149
+ vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
 
150
 
151
  from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
152
  from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
 
157
 
158
  if not candidate_topics:
159
 
160
+ # Generate representation model here if topics won't be changed later
161
+ if reduce_outliers == "No":
162
+ topic_model = BERTopic( embedding_model=embedding_model_pipe,
163
+ vectorizer_model=vectoriser_model,
164
+ umap_model=umap_model,
165
+ min_topic_size = min_docs_slider,
166
+ nr_topics = max_topics_slider,
167
+ representation_model=representation_model,
168
+ verbose = True)
169
+ else:
170
+ topic_model = BERTopic( embedding_model=embedding_model_pipe,
171
+ vectorizer_model=vectoriser_model,
172
+ umap_model=umap_model,
173
+ min_topic_size = min_docs_slider,
174
+ nr_topics = max_topics_slider,
175
+ verbose = True)
176
 
177
  topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
178
 
 
188
  zero_shot_topics = read_file(candidate_topics.name)
189
  zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
190
 
191
+ # Generate representation model here if topics won't be changed later
192
+ if reduce_outliers == "No":
193
+ topic_model = BERTopic( embedding_model=embedding_model_pipe,
194
+ vectorizer_model=vectoriser_model,
195
+ umap_model=umap_model,
196
+ min_topic_size = min_docs_slider,
197
+ nr_topics = max_topics_slider,
198
+ zeroshot_topic_list = zero_shot_topics_lower,
199
+ zeroshot_min_similarity = 0.5,#0.7,
200
+ representation_model=representation_model,
201
+ verbose = True)
202
+ else:
203
+ topic_model = BERTopic( embedding_model=embedding_model_pipe,
204
+ vectorizer_model=vectoriser_model,
205
+ umap_model=umap_model,
206
+ min_topic_size = min_docs_slider,
207
+ nr_topics = max_topics_slider,
208
+ zeroshot_topic_list = zero_shot_topics_lower,
209
+ zeroshot_min_similarity = 0.5,#0.7,
210
+ verbose = True)
211
 
212
  topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
213
 
 
217
  else:
218
  print("Preparing topic model outputs.")
219
 
220
+ # Reduce outliers if required
221
+ if reduce_outliers == "Yes":
222
+ print("Reducing outliers.")
223
+ # Using the embedding of each outlier document, find the best matching topic embedding using cosine similarity (strategy="embeddings").
224
+ topics_text = topic_model.reduce_outliers(docs, topics_text, strategy="embeddings")
225
+ # Then update the topic representations so that they reflect the reassigned outlier documents
226
+ topic_model.update_topics(docs, topics=topics_text, vectorizer_model=vectoriser_model, representation_model=representation_model)
227
+ print("Finished reducing outliers.")
228
+
229
  topic_dets = topic_model.get_topic_info()
230
  #print(topic_dets.columns)
231
 
 
333
 
334
  with gr.Row():
335
  min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of similar documents needed to make a topic.")
336
+ max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 10, step = 1, label = "Maximum number of topics")
337
 
338
  with gr.Row():
339
  topics_btn = gr.Button("Extract topics")
 
353
  with gr.Row():
354
  low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
355
  create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
356
+ reduce_outliers = gr.Dropdown(label = "Reduce outliers by selecting closest topic.", value="No", choices=["Yes", "No"])
357
+ with gr.Row():
358
  save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
359
  visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="No", choices=["Yes", "No"])
360
 
 
362
  in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
363
  in_colnames.change(dummy_function, in_colnames, None)
364
 
365
+ topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers], outputs=[output_single_text, output_file, plot], api_name="topics")
366
 
367
  block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
368
 
funcs/anonymiser.py CHANGED
@@ -1,26 +1,33 @@
 
1
  import spacy
 
2
  import os
3
 
4
- def is_model_installed(model_name):
5
  try:
6
- # Try to load the model
 
 
 
 
 
 
7
  spacy.load(model_name)
8
- return True
9
- except OSError:
10
- return False
11
 
12
- model_name = "en_core_web_sm"
13
- if not is_model_installed(model_name):
14
- os.system(f"python -m spacy download {model_name}")
15
 
 
 
 
 
16
 
 
17
  # Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
18
  #os.system("pip uninstall -y gradio")
19
  #os.system("pip install gradio==3.50.0")
20
  #os.system("python -m spacy download en_core_web_lg")
21
 
22
- spacy.load(model_name)
23
-
24
  import re
25
  import secrets
26
  import base64
 
1
+ from spacy.cli import download
2
  import spacy
3
+ spacy.prefer_gpu()
4
  import os
5
 
6
+ def spacy_model_installed(model_name):
7
  try:
8
+ import en_core_web_sm
9
+ en_core_web_sm.load()
10
+ print("Successfully imported spaCy model")
11
+ #nlp = spacy.load("en_core_web_sm")
12
+ #print(nlp._path)
13
+ except:
14
+ download(model_name)
15
  spacy.load(model_name)
16
+ print("Successfully imported spaCy model")
17
+ #print(nlp._path)
 
18
 
 
 
 
19
 
20
+ #if not is_model_installed(model_name):
21
+ # os.system(f"python -m spacy download {model_name}")
22
+ model_name = "en_core_web_sm"
23
+ spacy_model_installed(model_name)
24
 
25
+ spacy.load(model_name)
26
  # Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
27
  #os.system("pip uninstall -y gradio")
28
  #os.system("pip install gradio==3.50.0")
29
  #os.system("python -m spacy download en_core_web_lg")
30
 
 
 
31
  import re
32
  import secrets
33
  import base64