Sonnyjim committed
Commit 381f959 · Parent: 0a177ca

Allowed uploading of a custom regex file for cleaning. Fixed calculating all probabilities and reducing outliers. Added a text tree output for hierarchical topic modelling.

app.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
 import numpy as np
 
 from funcs.topic_core_funcs import pre_clean, extract_topics, reduce_outliers, represent_topics, visualise_topics, save_as_pytorch_model
-from funcs.helper_functions import dummy_function, initial_file_load
+from funcs.helper_functions import dummy_function, initial_file_load, custom_regex_load
 from sklearn.feature_extraction.text import CountVectorizer
 
 # Gradio app
@@ -19,6 +19,7 @@ with block:
     data_state = gr.State(pd.DataFrame())
     embeddings_state = gr.State(np.array([]))
     topic_model_state = gr.State()
+    custom_regex_state = gr.State(pd.DataFrame())
     docs_state = gr.State()
     data_file_name_no_ext_state = gr.State()
     label_list_state = gr.State(pd.DataFrame())
@@ -42,9 +43,12 @@ with block:
 
     with gr.Accordion("Clean data", open = False):
         with gr.Row():
-            clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with > 2 digits, emails, postcodes (UK).")
-            drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 10 char strings. May make old embedding files incompatible due to differing lengths.")
-            anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective. This is slow!")
+            clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with > 1 digits, emails, postcodes (UK).")
+            drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 char strings. May make old embedding files incompatible due to differing lengths.")
+            anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective. This is slow!")
+        with gr.Row():
+            gr.Markdown("""Import custom regex - csv table with one column of raw text regex patterns with header. Example pattern: r'example'""")
+            custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
         clean_btn = gr.Button("Clean data")
 
     with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
@@ -101,7 +105,8 @@ with block:
     in_colnames.change(dummy_function, in_colnames, None)
 
     # Clean data
-    clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, clean_text, drop_duplicate_text, anonymise_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state], api_name="clean")
+    custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_state])
+    clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state], api_name="clean")
 
     # Extract topics
     topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, zero_shot_similarity, seed_number, calc_probs, vectoriser_state], outputs=[output_single_text, output_file, embeddings_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state], api_name="topics")
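
Note: the custom regex upload added above expects a CSV with a single column of raw regex patterns under a header row, as the Markdown label describes. A minimal sketch (not from the repo) of such a file and of how its first column reaches pre_clean via custom_regex_state; the column name patterns_to_remove and the example patterns are illustrative only:

# Sketch only - the shape of a custom regex CSV and how its first column is consumed.
import pandas as pd

# A file like custom_regex.csv would hold one pattern per row under a single header:
#   patterns_to_remove
#   (?i)\bcase reference\b
#   \bref:\s*[0-9]+\b
custom_regex = pd.DataFrame({"patterns_to_remove": [r"(?i)\bcase reference\b", r"\bref:\s*[0-9]+\b"]})

# pre_clean reads whichever column comes first and hands it to initial_clean as a plain list
patterns = custom_regex.iloc[:, 0].to_list()
print(patterns)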
funcs/clean_funcs.py CHANGED
@@ -8,15 +8,17 @@ custom_words = []
 my_stop_words = custom_words
 
 # #### Some of my cleaning functions
-email_start_pattern_regex = r'.*importance:|.*subject:'
-email_end_pattern_regex = r'kind regards.*|many thanks.*|sincerely.*'
+email_start_pattern_regex = r'.*(?i)importance:|.*(?i)subject:'
+email_end_pattern_regex = r'(?i)kind regards.*|(?i)many thanks.*|(?i)sincerely.*'
 html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
 email_pattern_regex = r'\S*@\S*\s?'
 num_pattern_regex = r'[0-9]+'
-nums_three_more_regex = r'\b[0-9]{3,}\b|\b[0-9]+\s[0-9]+\b'
+nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
 postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
-warning_pattern_regex = r'caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
+warning_pattern_regex = r'(?i)caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
+egress_pattern_regex = r'(?i)has been securely delivered by egress switch and was securely decoded on'
 nbsp_pattern_regex = r'&nbsp;'
+multiple_spaces_regex = r'\s{2,}'
 
 # Pre-compiling the regular expressions for efficiency (not actually used)
 # email_start_pattern = re.compile(email_start_pattern_regex)
@@ -24,18 +26,33 @@ nbsp_pattern_regex = '&nbsp;'
 # html_pattern = re.compile(html_pattern_regex)
 # email_pattern = re.compile(email_end_pattern_regex)
 # num_pattern = re.compile(num_pattern_regex)
-# nums_three_more_regex_pattern = re.compile(nums_three_more_regex)
+# nums_two_more_regex_pattern = re.compile(nums_two_more_regex)
 # postcode_pattern = re.compile(postcode_pattern_regex)
 # warning_pattern = re.compile(warning_pattern_regex)
 # nbsp_pattern = re.compile(nbsp_pattern_regex)
 
-def initial_clean(texts , progress=gr.Progress()):
-    texts = pl.Series(texts)
+def initial_clean(texts, custom_regex, progress=gr.Progress()):
+    texts = pl.Series(texts).str.strip_chars()
     text = texts.str.replace_all(html_pattern_regex, '')
    text = text.str.replace_all(email_pattern_regex, '')
-    text = text.str.replace_all(nums_three_more_regex, '')
+    text = text.str.replace_all(nums_two_more_regex, '')
     text = text.str.replace_all(postcode_pattern_regex, '')
-
+    text = text.str.replace_all(multiple_spaces_regex, '')
+
+    # Allow for custom regex patterns to be removed
+    if len(custom_regex) > 0:
+        for pattern in custom_regex:
+            text = text.str.replace_all(pattern, '')
+
+    #text = text.str.replace_all(warning_pattern_regex, '') # This one is quite particular to Lambeth emails
+    #text = text.str.replace_all(egress_pattern_regex, '')
+    #text = text.str.replace_all(r'(?i)2nd floor civic centre', '')
+    #text = text.str.replace_all(r'(?i)6 brixton hill', '')
+    #text = text.str.replace_all(r'(?i)\bsocial care\b', '')
+    #text = text.str.replace_all(r'(?i)\basc\b', '')
+    #text = text.str.replace_all(r'(?i)\bcsc\b', '')
+    #text = text.str.replace_all(r'(?i)\blambeth\b', '')
+
     text = text.to_list()
 
     return text
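
Note: a minimal sketch (not part of the commit) of what the updated cleaning does to a polars Series: the new nums_two_more_regex strips numbers of two or more digits, runs of whitespace are deleted, and any patterns passed in from the uploaded CSV are removed last. The example string and custom pattern below are invented for illustration:

# Sketch only - mirrors the order of operations in the updated initial_clean.
import polars as pl

nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
multiple_spaces_regex = r'\s{2,}'
custom_patterns = [r'(?i)\bkind regards\b']            # e.g. the first column of the uploaded CSV

text = pl.Series(["Call me on 0207 123456. Kind regards"])
text = text.str.replace_all(nums_two_more_regex, '')   # drops numbers with two or more digits
text = text.str.replace_all(multiple_spaces_regex, '') # deletes runs of whitespace outright
for pattern in custom_patterns:
    text = text.str.replace_all(pattern, '')           # custom patterns are applied last
print(text.to_list())                                  # roughly ['Call me on. ']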
funcs/helper_functions.py CHANGED
@@ -132,6 +132,33 @@ def initial_file_load(in_file):
     #The np.array([]) at the end is for clearing the embedding state when a new file is loaded
     return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, output_text, topic_model, embeddings, data_file_name_no_ext, custom_labels
 
+def custom_regex_load(in_file):
+    '''
+    When file is loaded, update the column dropdown choices and write to relevant data states.
+    '''
+
+    custom_regex = pd.DataFrame()
+
+    file_list = [string.name for string in in_file]
+
+    regex_file_names = [string for string in file_list if "csv" in string.lower()]
+    if regex_file_names:
+        regex_file_name = regex_file_names[0]
+        custom_regex = read_file(regex_file_name)
+        #regex_file_name_no_ext = get_file_path_end(regex_file_name)
+
+        output_text = "Data file loaded."
+        print(output_text)
+    else:
+        error = "No regex file provided."
+        print(error)
+        output_text = error
+        return custom_regex
+
+    return custom_regex
+
+
+
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
     basename = os.path.basename(file_path)
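
Note: custom_regex_load picks the first uploaded file whose name contains "csv" and reads it into a DataFrame; anything else returns an empty DataFrame. A hedged sketch of both paths, using SimpleNamespace as a stand-in for the file objects Gradio hands to an UploadButton handler (the file names are invented, and patterns.csv would need to exist on disk for read_file to succeed):

# Sketch only - the two paths through custom_regex_load.
from types import SimpleNamespace
from funcs.helper_functions import custom_regex_load

uploads = [SimpleNamespace(name="patterns.csv"), SimpleNamespace(name="notes.txt")]
loaded = custom_regex_load(uploads)      # reads patterns.csv, prints "Data file loaded."

no_csv = [SimpleNamespace(name="notes.txt")]
empty = custom_regex_load(no_csv)        # prints "No regex file provided.", returns an empty DataFrame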
funcs/topic_core_funcs.py CHANGED
@@ -51,7 +51,7 @@ embeddings_name = "BAAI/bge-small-en-v1.5" #"jinaai/jina-embeddings-v2-base-en"
 hf_model_name = 'second-state/stablelm-2-zephyr-1.6b-GGUF' #'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF'
 hf_model_file = 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf' # 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf'
 
-def pre_clean(data, in_colnames, data_file_name_no_ext, clean_text, drop_duplicate_text, anonymise_drop, progress=gr.Progress(track_tqdm=True)):
+def pre_clean(data, in_colnames, data_file_name_no_ext, custom_regex, clean_text, drop_duplicate_text, anonymise_drop, progress=gr.Progress(track_tqdm=True)):
 
     output_text = ""
     output_list = []
@@ -76,7 +76,10 @@ def pre_clean(data, in_colnames, data_file_name_no_ext, clean_text, drop_duplica
 
     data_file_name_no_ext = data_file_name_no_ext + "_clean"
 
-    data[in_colnames_list_first] = initial_clean(data[in_colnames_list_first])
+    if not custom_regex.empty:
+        data[in_colnames_list_first] = initial_clean(data[in_colnames_list_first], custom_regex.iloc[:, 0].to_list())
+    else:
+        data[in_colnames_list_first] = initial_clean(data[in_colnames_list_first], [])
 
     clean_toc = time.perf_counter()
     clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
@@ -90,7 +93,7 @@ def pre_clean(data, in_colnames, data_file_name_no_ext, clean_text, drop_duplica
     #print("Removing duplicates and short entries from data")
     #print("Data shape before: ", data.shape)
     data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
-    data = data[data[in_colnames_list_first].str.len() >= 10]
+    data = data[data[in_colnames_list_first].str.len() >= 50]
     data = data.drop_duplicates(subset = in_colnames_list_first).dropna(subset= in_colnames_list_first).reset_index()
 
     #print("Data shape after duplicate/null removal: ", data.shape)
@@ -197,6 +200,12 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
 
             assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
 
+            if calc_probs == True:
+                topics_probs_out = pd.DataFrame(topic_model.probabilities_)
+                topics_probs_out_name = "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+                topics_probs_out.to_csv(topics_probs_out_name)
+                output_list.append(topics_probs_out_name)
+
         except:
             print(fail_error_message)
 
@@ -228,6 +237,12 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
 
             assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
 
+            if calc_probs == True:
+                topics_probs_out = pd.DataFrame(topic_model.probabilities_)
+                topics_probs_out_name = "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+                topics_probs_out.to_csv(topics_probs_out_name)
+                output_list.append(topics_probs_out_name)
+
         except:
             print(fail_error_message)
 
@@ -312,18 +327,21 @@ def reduce_outliers(topic_model, docs, embeddings_out, data_file_name_no_ext, sa
     assigned_topics = topic_model.reduce_outliers(docs, assigned_topics, strategy="embeddings")
     # Then, update the topics to the ones that considered the new data
 
+    progress(0.6, desc= "Updating original model")
+    topic_model.update_topics(docs, topics=assigned_topics)
+
     print("Finished reducing outliers.")
 
-    progress(0.7, desc= "Replacing topic names with LLMs if necessary")
+    #progress(0.7, desc= "Replacing topic names with LLMs if necessary")
 
-    topic_dets = topic_model.get_topic_info()
+    #topic_dets = topic_model.get_topic_info()
 
-    # Replace original labels with LLM labels
-    if "LLM" in topic_model.get_topic_info().columns:
-        llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["LLM"].values()]
-        topic_model.set_topic_labels(llm_labels)
-    else:
-        topic_model.set_topic_labels(list(topic_dets["Name"]))
+    # # Replace original labels with LLM labels
+    # if "LLM" in topic_model.get_topic_info().columns:
+    #     llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["LLM"].values()]
+    #     topic_model.set_topic_labels(llm_labels)
+    # else:
+    #     topic_model.set_topic_labels(list(topic_dets["Name"]))
 
     # Outputs
     progress(0.9, desc= "Saving to file")
@@ -448,6 +466,16 @@ def visualise_topics(topic_model, data, data_file_name_no_ext, low_resource_mode
 
     hierarchical_topics = topic_model.hierarchical_topics(docs)
 
+    # Print topic tree
+    tree = topic_model.get_topic_tree(hierarchical_topics, tight_layout = True)
+    tree_name = data_file_name_no_ext + '_' + 'vis_hierarchy_tree_' + today_rev + '.txt'
+
+    with open(tree_name, "w") as file:
+        # Write the string to the file
+        file.write(tree)
+
+    output_list.append(tree_name)
+
     # Save new hierarchical topic model to file
     hierarchical_topics_name = data_file_name_no_ext + '_' + 'vis_hierarchy_topics_' + today_rev + '.csv'
     hierarchical_topics.to_csv(hierarchical_topics_name)
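
Note: two new artefacts come out of this commit: a full document-by-topic probability matrix (written when "calculate all probabilities" is on) and a plain-text topic tree from get_topic_tree during visualisation. A minimal sketch of reading them back; the file names follow the patterns in the diff, but the dataset name and date stamp here are invented:

# Sketch only - consuming the two new output files.
import pandas as pd

# topic_full_probs_<dataset>_<date>.csv: one row per document, one column per topic
probs = pd.read_csv("topic_full_probs_mydata_clean_20240210.csv", index_col=0)
most_likely_topic = probs.idxmax(axis=1)               # highest-probability topic per document
print(most_likely_topic.head())

# <dataset>_vis_hierarchy_tree_<date>.txt: ASCII tree of the hierarchical topics
with open("mydata_clean_vis_hierarchy_tree_20240210.txt") as f:
    print(f.read())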