Added option to reduce outliers based on closest topic
- app.py +59 -23
- funcs/anonymiser.py +17 -10
app.py
CHANGED
@@ -80,7 +80,7 @@ hf_model_name = 'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1
 hf_model_file = 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf' # 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf'
 
 
-def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics):
+def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers):
 
     all_tic = time.perf_counter()
 
@@ -99,12 +99,17 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     in_label_list_first = in_colnames_list_first
 
     if anonymise_drop == "Yes":
+        anon_tic = time.perf_counter()
+
         in_files_anon_col, anonymisation_success = anon.anonymise_script(in_files, in_colnames_list_first, anon_strat="replace")
         in_files[in_colnames_list_first] = in_files_anon_col[in_colnames_list_first]
         anonymise_data_name = "anonymised_data.csv"
         in_files.to_csv(anonymise_data_name)
         output_list.append(anonymise_data_name)
 
+        anon_toc = time.perf_counter()
+        time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
+
     docs = list(in_files[in_colnames_list_first].str.lower())
     label_col = in_files[in_label_list_first]
 
@@ -115,7 +120,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     print("Low resource mode: ", low_resource_mode)
 
     if low_resource_mode == "No":
-        print("
+        print("Using high resource Jina transformer model")
         try:
             embedding_model = AutoModel.from_pretrained(local_embeddings_location, revision = revision_choice, trust_remote_code=True, local_files_only=True, device_map="auto")
         except:
@@ -125,7 +130,8 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
 
         embedding_model_pipe = pipeline("feature-extraction", model=embedding_model, tokenizer=tokenizer)
 
-
+        # UMAP model uses BERTopic defaults
+        umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=random_seed)
 
     elif low_resource_mode == "Yes":
         print("Choosing low resource TF-IDF model.")
@@ -140,8 +146,7 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
 
     embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
 
-
-    vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.05, max_df=0.9)
+    vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)
 
     from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
     from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
@@ -152,13 +157,22 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
 
     if not candidate_topics:
 
-
-
-
-
-
-
-
+        # Generate representation model here if topics won't be changed later
+        if reduce_outliers == "No":
+            topic_model = BERTopic(embedding_model=embedding_model_pipe,
+                                   vectorizer_model=vectoriser_model,
+                                   umap_model=umap_model,
+                                   min_topic_size=min_docs_slider,
+                                   nr_topics=max_topics_slider,
+                                   representation_model=representation_model,
+                                   verbose=True)
+        else:
+            topic_model = BERTopic(embedding_model=embedding_model_pipe,
+                                   vectorizer_model=vectoriser_model,
+                                   umap_model=umap_model,
+                                   min_topic_size=min_docs_slider,
+                                   nr_topics=max_topics_slider,
+                                   verbose=True)
 
         topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
 
@@ -174,15 +188,26 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
         zero_shot_topics = read_file(candidate_topics.name)
         zero_shot_topics_lower = list(zero_shot_topics.iloc[:, 0].str.lower())
 
-
-
-
-
-
-
-
-
-
+        # Generate representation model here if topics won't be changed later
+        if reduce_outliers == "No":
+            topic_model = BERTopic(embedding_model=embedding_model_pipe,
+                                   vectorizer_model=vectoriser_model,
+                                   umap_model=umap_model,
+                                   min_topic_size=min_docs_slider,
+                                   nr_topics=max_topics_slider,
+                                   zeroshot_topic_list=zero_shot_topics_lower,
+                                   zeroshot_min_similarity=0.5, #0.7
+                                   representation_model=representation_model,
+                                   verbose=True)
+        else:
+            topic_model = BERTopic(embedding_model=embedding_model_pipe,
+                                   vectorizer_model=vectoriser_model,
+                                   umap_model=umap_model,
+                                   min_topic_size=min_docs_slider,
+                                   nr_topics=max_topics_slider,
+                                   zeroshot_topic_list=zero_shot_topics_lower,
+                                   zeroshot_min_similarity=0.5, #0.7
+                                   verbose=True)
 
         topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
 
@@ -192,6 +217,15 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
     else:
         print("Preparing topic model outputs.")
 
+        # Reduce outliers if required
+        if reduce_outliers == "Yes":
+            print("Reducing outliers.")
+            # Assign each outlier document to its closest topic by embedding cosine similarity
+            topics_text = topic_model.reduce_outliers(docs, topics_text, strategy="embeddings")
+            # Then update the topic representations to account for the reassigned documents
+            topic_model.update_topics(docs, topics=topics_text, vectorizer_model=vectoriser_model, representation_model=representation_model)
+            print("Finished reducing outliers.")
+
         topic_dets = topic_model.get_topic_info()
         #print(topic_dets.columns)
 
@@ -299,7 +333,7 @@ with block:
 
     with gr.Row():
         min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of similar documents needed to make a topic.")
-        max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value =
+        max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 10, step = 1, label = "Maximum number of topics")
 
     with gr.Row():
        topics_btn = gr.Button("Extract topics")
@@ -319,6 +353,8 @@ with block:
    with gr.Row():
        low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
        create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
+       reduce_outliers = gr.Dropdown(label = "Reduce outliers by selecting closest topic.", value="No", choices=["Yes", "No"])
+   with gr.Row():
        save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
        visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="No", choices=["Yes", "No"])
 
@@ -326,7 +362,7 @@ with block:
    in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
    in_colnames.change(dummy_function, in_colnames, None)
 
-   topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics], outputs=[output_single_text, output_file, plot], api_name="topics")
+   topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, create_llm_topic_labels, save_topic_model, visualise_topics, reduce_outliers], outputs=[output_single_text, output_file, plot], api_name="topics")
 
 block.queue().launch(debug=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
 
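The outlier-reduction flow added above is easier to see in isolation. The following is a minimal sketch, not the app's exact code: it assumes BERTopic's default sentence-transformers embedder in place of the app's Jina/TF-IDF options, a literal seed in place of random_seed, and a 20 Newsgroups sample as a stand-in corpus.

    # Minimal sketch of the new reduce_outliers flow (assumptions noted above)
    from bertopic import BERTopic
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import CountVectorizer
    from umap import UMAP

    # Stand-in corpus; any reasonably large list of strings works
    docs = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes")).data[:1000]

    # Component settings matching the commit: BERTopic's UMAP defaults plus a fixed
    # seed, and a vectoriser that keeps terms appearing in at least 10% of documents
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", low_memory=False, random_state=42)
    vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.1)

    topic_model = BERTopic(umap_model=umap_model, vectorizer_model=vectoriser_model, min_topic_size=15, verbose=True)
    topics, probs = topic_model.fit_transform(docs)

    # Reassign outlier documents (topic -1) to their closest topic by embedding
    # cosine similarity, then rebuild the c-TF-IDF topic representations so the
    # keywords reflect the reassigned documents
    new_topics = topic_model.reduce_outliers(docs, topics, strategy="embeddings")
    topic_model.update_topics(docs, topics=new_topics, vectorizer_model=vectoriser_model)

As the diff's own comments note, the representation model is deliberately left out of the BERTopic constructor when outliers will be reduced and re-applied in update_topics afterwards, so the potentially expensive topic representations are computed only once, on the final topic assignments.
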
funcs/anonymiser.py
CHANGED
@@ -1,26 +1,33 @@
+from spacy.cli import download
 import spacy
+spacy.prefer_gpu()
 import os
 
-def is_model_installed(model_name):
+def spacy_model_installed(model_name):
     try:
-
+        import en_core_web_sm
+        en_core_web_sm.load()
+        print("Successfully imported spaCy model")
+        #nlp = spacy.load("en_core_web_sm")
+        #print(nlp._path)
+    except:
+        download(model_name)
         spacy.load(model_name)
-
-
-        return False
+        print("Successfully imported spaCy model")
+        #print(nlp._path)
 
-model_name = "en_core_web_sm"
-if not is_model_installed(model_name):
-    os.system(f"python -m spacy download {model_name}")
 
+#if not is_model_installed(model_name):
+#    os.system(f"python -m spacy download {model_name}")
+model_name = "en_core_web_sm"
+spacy_model_installed(model_name)
 
+spacy.load(model_name)
 # Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
 #os.system("pip uninstall -y gradio")
 #os.system("pip install gradio==3.50.0")
 #os.system("python -m spacy download en_core_web_lg")
 
-spacy.load(model_name)
-
 import re
 import secrets
 import base64
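The rewritten model check above replaces the shell call (os.system("python -m spacy download ...")) with spaCy's Python API. A compact variant of the same load-or-download pattern is sketched below; ensure_spacy_model is a hypothetical helper name, not a function in this repo, and unlike the repo's version it returns the loaded pipeline instead of printing.

    import spacy
    from spacy.cli import download

    def ensure_spacy_model(model_name: str = "en_core_web_sm"):
        # Hypothetical helper: load a spaCy pipeline, downloading the model
        # package first if it is not installed
        try:
            return spacy.load(model_name)
        except OSError:  # spacy.load raises OSError when the model is missing
            download(model_name)
            return spacy.load(model_name)

    nlp = ensure_spacy_model()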