Allowed uploading a custom regex file for cleaning. Fixed "calculate all probabilities" and "reduce outliers". Added a text topic tree output for hierarchical modelling.
Files changed:
- app.py +10 -5
- funcs/clean_funcs.py +26 -9
- funcs/helper_functions.py +27 -0
- funcs/topic_core_funcs.py +39 -11
app.py
CHANGED

@@ -7,7 +7,7 @@ import pandas as pd
 import numpy as np
 
 from funcs.topic_core_funcs import pre_clean, extract_topics, reduce_outliers, represent_topics, visualise_topics, save_as_pytorch_model
-from funcs.helper_functions import dummy_function, initial_file_load
+from funcs.helper_functions import dummy_function, initial_file_load, custom_regex_load
 from sklearn.feature_extraction.text import CountVectorizer
 
 # Gradio app

@@ -19,6 +19,7 @@ with block:
     data_state = gr.State(pd.DataFrame())
     embeddings_state = gr.State(np.array([]))
     topic_model_state = gr.State()
+    custom_regex_state = gr.State(pd.DataFrame())
     docs_state = gr.State()
     data_file_name_no_ext_state = gr.State()
     label_list_state = gr.State(pd.DataFrame())

@@ -42,9 +43,12 @@ with block:
 
     with gr.Accordion("Clean data", open = False):
         with gr.Row():
-            clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with >
-            drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop <
-            anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective. This is slow!")
+            clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with > 1 digits, emails, postcodes (UK).")
+            drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 char strings. May make old embedding files incompatible due to differing lengths.")
+            anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective. This is slow!")
+        with gr.Row():
+            gr.Markdown("""Import custom regex - csv table with one column of raw text regex patterns with header. Example pattern: r'example'""")
+            custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
         clean_btn = gr.Button("Clean data")
 
     with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):

@@ -101,7 +105,8 @@ with block:
     in_colnames.change(dummy_function, in_colnames, None)
 
     # Clean data
-
+    custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_state])
+    clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state], api_name="clean")
 
     # Extract topics
     topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, zero_shot_similarity, seed_number, calc_probs, vectoriser_state], outputs=[output_single_text, output_file, embeddings_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state], api_name="topics")
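The gr.Markdown text above describes the expected input for the new custom_regex UploadButton: a csv with a single column of raw regex patterns under a header row, one pattern per row. A minimal sketch of preparing such a file with pandas (the column name, file name and patterns here are illustrative assumptions, not anything the app requires):

import pandas as pd

# One raw regex pattern per row; a header row is expected, but its name is arbitrary.
patterns = pd.DataFrame({"pattern": [r"(?i)\bconfidential\b", r"\b[0-9]{2,}\b"]})
patterns.to_csv("custom_regex.csv", index=False)

Each value in the first column is later passed straight to polars' str.replace_all during cleaning, so every row should be a valid regular expression.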
funcs/clean_funcs.py
CHANGED

@@ -8,15 +8,17 @@ custom_words = []
 my_stop_words = custom_words
 
 # #### Some of my cleaning functions
-email_start_pattern_regex = r'.*importance:|.*subject:'
-email_end_pattern_regex = r'kind regards.*|many thanks.*|sincerely.*'
+email_start_pattern_regex = r'.*(?i)importance:|.*(?i)subject:'
+email_end_pattern_regex = r'(?i)kind regards.*|(?i)many thanks.*|(?i)sincerely.*'
 html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
 email_pattern_regex = r'\S*@\S*\s?'
 num_pattern_regex = r'[0-9]+'
-
+nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
 postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
-warning_pattern_regex = r'caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
+warning_pattern_regex = r'(?i)caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
+egress_pattern_regex = r'(?i)has been securely delivered by egress switch and was securely decoded on'
 nbsp_pattern_regex = r'&nbsp;'
+multiple_spaces_regex = r'\s{2,}'
 
 # Pre-compiling the regular expressions for efficiency (not actually used)
 # email_start_pattern = re.compile(email_start_pattern_regex)

@@ -24,18 +26,33 @@ nbsp_pattern_regex = r'&nbsp;'
 # html_pattern = re.compile(html_pattern_regex)
 # email_pattern = re.compile(email_end_pattern_regex)
 # num_pattern = re.compile(num_pattern_regex)
-#
+# nums_two_more_regex_pattern = re.compile(nums_two_more_regex)
 # postcode_pattern = re.compile(postcode_pattern_regex)
 # warning_pattern = re.compile(warning_pattern_regex)
 # nbsp_pattern = re.compile(nbsp_pattern_regex)
 
-def initial_clean(texts , progress=gr.Progress()):
-    texts = pl.Series(texts)
+def initial_clean(texts, custom_regex, progress=gr.Progress()):
+    texts = pl.Series(texts).str.strip_chars()
     text = texts.str.replace_all(html_pattern_regex, '')
     text = text.str.replace_all(email_pattern_regex, '')
-    text = text.str.replace_all(
+    text = text.str.replace_all(nums_two_more_regex, '')
     text = text.str.replace_all(postcode_pattern_regex, '')
-
+    text = text.str.replace_all(multiple_spaces_regex, '')
+
+    # Allow for custom regex patterns to be removed
+    if len(custom_regex) > 0:
+        for pattern in custom_regex:
+            text = text.str.replace_all(pattern, '')
+
+    #text = text.str.replace_all(warning_pattern_regex, '') # This one is quite particular to Lambeth emails
+    #text = text.str.replace_all(egress_pattern_regex, '')
+    #text = text.str.replace_all(r'(?i)2nd floor civic centre', '')
+    #text = text.str.replace_all(r'(?i)6 brixton hill', '')
+    #text = text.str.replace_all(r'(?i)\bsocial care\b', '')
+    #text = text.str.replace_all(r'(?i)\basc\b', '')
+    #text = text.str.replace_all(r'(?i)\bcsc\b', '')
+    #text = text.str.replace_all(r'(?i)\blambeth\b', '')
+
     text = text.to_list()
 
     return text
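As a rough, self-contained sketch of what the rewritten initial_clean now does to a document, using the same polars string calls as above; the sample text, the simplified html pattern and the final "custom" pattern are illustrative assumptions rather than part of the commit:

import polars as pl

docs = pl.Series(["  Case 12345: <b>hello</b> contact someone@example.com  "])

cleaned = (
    docs.str.strip_chars()                                          # trim leading/trailing whitespace
        .str.replace_all(r'<.*?>', '')                              # simplified stand-in for html_pattern_regex
        .str.replace_all(r'\S*@\S*\s?', '')                         # email_pattern_regex
        .str.replace_all(r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b', '')   # nums_two_more_regex
        .str.replace_all(r'\s{2,}', '')                             # multiple_spaces_regex
        .str.replace_all(r'(?i)\bcase\b', '')                       # a pattern as it would arrive from an uploaded csv
)
print(cleaned.to_list())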
funcs/helper_functions.py
CHANGED

@@ -132,6 +132,33 @@ def initial_file_load(in_file):
     #The np.array([]) at the end is for clearing the embedding state when a new file is loaded
     return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, output_text, topic_model, embeddings, data_file_name_no_ext, custom_labels
 
+def custom_regex_load(in_file):
+    '''
+    When file is loaded, update the column dropdown choices and write to relevant data states.
+    '''
+
+    custom_regex = pd.DataFrame()
+
+    file_list = [string.name for string in in_file]
+
+    regex_file_names = [string for string in file_list if "csv" in string.lower()]
+    if regex_file_names:
+        regex_file_name = regex_file_names[0]
+        custom_regex = read_file(regex_file_name)
+        #regex_file_name_no_ext = get_file_path_end(regex_file_name)
+
+        output_text = "Data file loaded."
+        print(output_text)
+    else:
+        error = "No regex file provided."
+        print(error)
+        output_text = error
+        return custom_regex
+
+    return custom_regex
+
+
+
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
     basename = os.path.basename(file_path)
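A rough illustration of how the new custom_regex_load is driven: Gradio's UploadButton hands it a list of file objects whose .name attribute is a path on disk, and the function returns a pandas DataFrame of patterns (empty if no csv was supplied). The SimpleNamespace stand-in and the file name below are assumptions made so the call can be shown outside Gradio; read_file is the existing helper in this module that the function relies on:

from types import SimpleNamespace

# Mimic the object gr.UploadButton passes for each uploaded file.
uploaded = [SimpleNamespace(name="custom_regex.csv")]  # assumes this csv exists locally

custom_regex_df = custom_regex_load(uploaded)
print(custom_regex_df)  # one column of regex patterns, or an empty DataFrame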
funcs/topic_core_funcs.py
CHANGED

@@ -51,7 +51,7 @@ embeddings_name = "BAAI/bge-small-en-v1.5" #"jinaai/jina-embeddings-v2-base-en"
 hf_model_name = 'second-state/stablelm-2-zephyr-1.6b-GGUF' #'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF'
 hf_model_file = 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf' # 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf'
 
-def pre_clean(data, in_colnames, data_file_name_no_ext, clean_text, drop_duplicate_text, anonymise_drop, progress=gr.Progress(track_tqdm=True)):
+def pre_clean(data, in_colnames, data_file_name_no_ext, custom_regex, clean_text, drop_duplicate_text, anonymise_drop, progress=gr.Progress(track_tqdm=True)):
 
     output_text = ""
     output_list = []

@@ -76,7 +76,10 @@ def pre_clean(data, in_colnames, data_file_name_no_ext, clean_text, drop_duplica
 
         data_file_name_no_ext = data_file_name_no_ext + "_clean"
 
-
+        if not custom_regex.empty:
+            data[in_colnames_list_first] = initial_clean(data[in_colnames_list_first], custom_regex.iloc[:, 0].to_list())
+        else:
+            data[in_colnames_list_first] = initial_clean(data[in_colnames_list_first], [])
 
         clean_toc = time.perf_counter()
         clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."

@@ -90,7 +93,7 @@ def pre_clean(data, in_colnames, data_file_name_no_ext, clean_text, drop_duplica
         #print("Removing duplicates and short entries from data")
         #print("Data shape before: ", data.shape)
         data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
-        data = data[data[in_colnames_list_first].str.len() >=
+        data = data[data[in_colnames_list_first].str.len() >= 50]
         data = data.drop_duplicates(subset = in_colnames_list_first).dropna(subset= in_colnames_list_first).reset_index()
 
         #print("Data shape after duplicate/null removal: ", data.shape)

@@ -197,6 +200,12 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
 
             assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
 
+            if calc_probs == True:
+                topics_probs_out = pd.DataFrame(topic_model.probabilities_)
+                topics_probs_out_name = "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+                topics_probs_out.to_csv(topics_probs_out_name)
+                output_list.append(topics_probs_out_name)
+
         except:
             print(fail_error_message)
 

@@ -228,6 +237,12 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
 
             assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
 
+            if calc_probs == True:
+                topics_probs_out = pd.DataFrame(topic_model.probabilities_)
+                topics_probs_out_name = "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+                topics_probs_out.to_csv(topics_probs_out_name)
+                output_list.append(topics_probs_out_name)
+
         except:
             print(fail_error_message)
 

@@ -312,18 +327,21 @@ def reduce_outliers(topic_model, docs, embeddings_out, data_file_name_no_ext, sa
     assigned_topics = topic_model.reduce_outliers(docs, assigned_topics, strategy="embeddings")
     # Then, update the topics to the ones that considered the new data
 
+    progress(0.6, desc= "Updating original model")
+    topic_model.update_topics(docs, topics=assigned_topics)
+
     print("Finished reducing outliers.")
 
-    progress(0.7, desc= "Replacing topic names with LLMs if necessary")
+    #progress(0.7, desc= "Replacing topic names with LLMs if necessary")
 
-    topic_dets = topic_model.get_topic_info()
+    #topic_dets = topic_model.get_topic_info()
 
-    # Replace original labels with LLM labels
-    if "LLM" in topic_model.get_topic_info().columns:
-
-
-    else:
-
+    # # Replace original labels with LLM labels
+    # if "LLM" in topic_model.get_topic_info().columns:
+    #     llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["LLM"].values()]
+    #     topic_model.set_topic_labels(llm_labels)
+    # else:
+    #     topic_model.set_topic_labels(list(topic_dets["Name"]))
 
     # Outputs
     progress(0.9, desc= "Saving to file")

@@ -448,6 +466,16 @@ def visualise_topics(topic_model, data, data_file_name_no_ext, low_resource_mode
 
         hierarchical_topics = topic_model.hierarchical_topics(docs)
 
+        # Print topic tree
+        tree = topic_model.get_topic_tree(hierarchical_topics, tight_layout = True)
+        tree_name = data_file_name_no_ext + '_' + 'vis_hierarchy_tree_' + today_rev + '.txt'
+
+        with open(tree_name, "w") as file:
+            # Write the string to the file
+            file.write(tree)
+
+        output_list.append(tree_name)
+
         # Save new hierarchical topic model to file
         hierarchical_topics_name = data_file_name_no_ext + '_' + 'vis_hierarchy_topics_' + today_rev + '.csv'
         hierarchical_topics.to_csv(hierarchical_topics_name)
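The topic tree added above uses BERTopic's own hierarchy utilities. A minimal sketch of the same idea in isolation, assuming a topic_model has already been fitted on docs elsewhere; the output file name here is a simplified stand-in for the dated name built in visualise_topics:

from bertopic import BERTopic

# topic_model = BERTopic().fit(docs)  # fitted earlier, as in extract_topics
hierarchical_topics = topic_model.hierarchical_topics(docs)

# Plain-text tree showing how topics merge, written alongside the csv of hierarchical topics
tree = topic_model.get_topic_tree(hierarchical_topics, tight_layout=True)
with open("vis_hierarchy_tree.txt", "w") as file:
    file.write(tree)

The calc_probs branches above rely on the same fitted model: when BERTopic is created with calculate_probabilities=True, topic_model.probabilities_ holds the per-document topic probability matrix that gets written to csv.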