Sonnyjim committed
Commit 381f959 · Parent: 0a177ca

Allowed uploading of a custom regex file for cleaning. Fixed calculating all probabilities and reducing outliers. Added a text tree output for hierarchical topic modelling.

app.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
 import numpy as np
 
 from funcs.topic_core_funcs import pre_clean, extract_topics, reduce_outliers, represent_topics, visualise_topics, save_as_pytorch_model
-from funcs.helper_functions import dummy_function, initial_file_load
+from funcs.helper_functions import dummy_function, initial_file_load, custom_regex_load
 from sklearn.feature_extraction.text import CountVectorizer
 
 # Gradio app
@@ -19,6 +19,7 @@ with block:
     data_state = gr.State(pd.DataFrame())
     embeddings_state = gr.State(np.array([]))
     topic_model_state = gr.State()
+    custom_regex_state = gr.State(pd.DataFrame())
     docs_state = gr.State()
     data_file_name_no_ext_state = gr.State()
     label_list_state = gr.State(pd.DataFrame())
@@ -42,9 +43,12 @@ with block:
 
     with gr.Accordion("Clean data", open = False):
         with gr.Row():
-            clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with > 2 digits, emails, postcodes (UK).")
-            drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 10 char strings. May make old embedding files incompatible due to differing lengths.")
-            anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective. This is slow!")
+            clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with > 1 digits, emails, postcodes (UK).")
+            drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 char strings. May make old embedding files incompatible due to differing lengths.")
+            anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective. This is slow!")
+        with gr.Row():
+            gr.Markdown("""Import custom regex - csv table with one column of raw text regex patterns with header. Example pattern: r'example'""")
+            custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
         clean_btn = gr.Button("Clean data")
 
     with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
@@ -101,7 +105,8 @@ with block:
     in_colnames.change(dummy_function, in_colnames, None)
 
     # Clean data
-    clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, clean_text, drop_duplicate_text, anonymise_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state], api_name="clean")
+    custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_state])
+    clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state], api_name="clean")
 
     # Extract topics
     topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, zero_shot_similarity, seed_number, calc_probs, vectoriser_state], outputs=[output_single_text, output_file, embeddings_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state], api_name="topics")
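
Note: the custom regex upload added above expects a CSV with a single column of raw regex patterns under a header row, as the Markdown label describes. A minimal sketch (not from the repo) of such a file and of how its first column reaches pre_clean via custom_regex_state; the column name patterns_to_remove and the example patterns are illustrative only:

# Sketch only - the shape of a custom regex CSV and how its first column is consumed.
import pandas as pd

# A file like custom_regex.csv would hold one pattern per row under a single header:
#   patterns_to_remove
#   (?i)\bcase reference\b
#   \bref:\s*[0-9]+\b
custom_regex = pd.DataFrame({"patterns_to_remove": [r"(?i)\bcase reference\b", r"\bref:\s*[0-9]+\b"]})

# pre_clean reads whichever column comes first and hands it to initial_clean as a plain list
patterns = custom_regex.iloc[:, 0].to_list()
print(patterns)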
funcs/clean_funcs.py CHANGED
@@ -8,15 +8,17 @@ custom_words = []
 my_stop_words = custom_words
 
 # #### Some of my cleaning functions
-email_start_pattern_regex = r'.*importance:|.*subject:'
-email_end_pattern_regex = r'kind regards.*|many thanks.*|sincerely.*'
+email_start_pattern_regex = r'.*(?i)importance:|.*(?i)subject:'
+email_end_pattern_regex = r'(?i)kind regards.*|(?i)many thanks.*|(?i)sincerely.*'
 html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
 email_pattern_regex = r'\S*@\S*\s?'
 num_pattern_regex = r'[0-9]+'
-nums_three_more_regex = r'\b[0-9]{3,}\b|\b[0-9]+\s[0-9]+\b'
+nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
 postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
-warning_pattern_regex = r'caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
+warning_pattern_regex = r'(?i)caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
+egress_pattern_regex = r'(?i)has been securely delivered by egress switch and was securely decoded on'
 nbsp_pattern_regex = r'&nbsp;'
+multiple_spaces_regex = r'\s{2,}'
 
 # Pre-compiling the regular expressions for efficiency (not actually used)
 # email_start_pattern = re.compile(email_start_pattern_regex)
@@ -24,18 +26,33 @@ nbsp_pattern_regex = '&nbsp;'
 # html_pattern = re.compile(html_pattern_regex)
 # email_pattern = re.compile(email_end_pattern_regex)
 # num_pattern = re.compile(num_pattern_regex)
-# nums_three_more_regex_pattern = re.compile(nums_three_more_regex)
+# nums_two_more_regex_pattern = re.compile(nums_two_more_regex)
 # postcode_pattern = re.compile(postcode_pattern_regex)
 # warning_pattern = re.compile(warning_pattern_regex)
 # nbsp_pattern = re.compile(nbsp_pattern_regex)
 
-def initial_clean(texts , progress=gr.Progress()):
-    texts = pl.Series(texts)
+def initial_clean(texts, custom_regex, progress=gr.Progress()):
+    texts = pl.Series(texts).str.strip_chars()
     text = texts.str.replace_all(html_pattern_regex, '')
    text = text.str.replace_all(email_pattern_regex, '')
-    text = text.str.replace_all(nums_three_more_regex, '')
+    text = text.str.replace_all(nums_two_more_regex, '')
     text = text.str.replace_all(postcode_pattern_regex, '')
-
+    text = text.str.replace_all(multiple_spaces_regex, '')
+
+    # Allow for custom regex patterns to be removed
+    if len(custom_regex) > 0:
+        for pattern in custom_regex:
+            text = text.str.replace_all(pattern, '')
+
+    #text = text.str.replace_all(warning_pattern_regex, '') # This one is quite particular to Lambeth emails
+    #text = text.str.replace_all(egress_pattern_regex, '')
+    #text = text.str.replace_all(r'(?i)2nd floor civic centre', '')
+    #text = text.str.replace_all(r'(?i)6 brixton hill', '')
+    #text = text.str.replace_all(r'(?i)\bsocial care\b', '')
+    #text = text.str.replace_all(r'(?i)\basc\b', '')
+    #text = text.str.replace_all(r'(?i)\bcsc\b', '')
+    #text = text.str.replace_all(r'(?i)\blambeth\b', '')
+
     text = text.to_list()
 
     return text
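
Note: a minimal sketch (not part of the commit) of what the updated cleaning does to a polars Series: the new nums_two_more_regex strips numbers of two or more digits, runs of whitespace are deleted, and any patterns passed in from the uploaded CSV are removed last. The example string and custom pattern below are invented for illustration:

# Sketch only - mirrors the order of operations in the updated initial_clean.
import polars as pl

nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
multiple_spaces_regex = r'\s{2,}'
custom_patterns = [r'(?i)\bkind regards\b']            # e.g. the first column of the uploaded CSV

text = pl.Series(["Call me on 0207 123456. Kind regards"])
text = text.str.replace_all(nums_two_more_regex, '')   # drops numbers with two or more digits
text = text.str.replace_all(multiple_spaces_regex, '') # deletes runs of whitespace outright
for pattern in custom_patterns:
    text = text.str.replace_all(pattern, '')           # custom patterns are applied last
print(text.to_list())                                  # roughly ['Call me on. ']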
funcs/helper_functions.py CHANGED
@@ -132,6 +132,33 @@ def initial_file_load(in_file):
     #The np.array([]) at the end is for clearing the embedding state when a new file is loaded
     return gr.Dropdown(choices=concat_choices), gr.Dropdown(choices=concat_choices), df, output_text, topic_model, embeddings, data_file_name_no_ext, custom_labels
 
+def custom_regex_load(in_file):
+    '''
+    When file is loaded, update the column dropdown choices and write to relevant data states.
+    '''
+
+    custom_regex = pd.DataFrame()
+
+    file_list = [string.name for string in in_file]
+
+    regex_file_names = [string for string in file_list if "csv" in string.lower()]
+    if regex_file_names:
+        regex_file_name = regex_file_names[0]
+        custom_regex = read_file(regex_file_name)
+        #regex_file_name_no_ext = get_file_path_end(regex_file_name)
+
+        output_text = "Data file loaded."
+        print(output_text)
+    else:
+        error = "No regex file provided."
+        print(error)
+        output_text = error
+        return custom_regex
+
+    return custom_regex
+
+
+
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
     basename = os.path.basename(file_path)
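
Note: custom_regex_load picks the first uploaded file whose name contains "csv" and reads it into a DataFrame; anything else returns an empty DataFrame. A hedged sketch of both paths, using SimpleNamespace as a stand-in for the file objects Gradio hands to an UploadButton handler (the file names are invented, and patterns.csv would need to exist on disk for read_file to succeed):

# Sketch only - the two paths through custom_regex_load.
from types import SimpleNamespace
from funcs.helper_functions import custom_regex_load

uploads = [SimpleNamespace(name="patterns.csv"), SimpleNamespace(name="notes.txt")]
loaded = custom_regex_load(uploads)      # reads patterns.csv, prints "Data file loaded."

no_csv = [SimpleNamespace(name="notes.txt")]
empty = custom_regex_load(no_csv)        # prints "No regex file provided.", returns an empty DataFrame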
funcs/topic_core_funcs.py CHANGED
@@ -51,7 +51,7 @@ embeddings_name = "BAAI/bge-small-en-v1.5" #"jinaai/jina-embeddings-v2-base-en"
 hf_model_name = 'second-state/stablelm-2-zephyr-1.6b-GGUF' #'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF'
 hf_model_file = 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf' # 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf'
 
-def pre_clean(data, in_colnames, data_file_name_no_ext, clean_text, drop_duplicate_text, anonymise_drop, progress=gr.Progress(track_tqdm=True)):
+def pre_clean(data, in_colnames, data_file_name_no_ext, custom_regex, clean_text, drop_duplicate_text, anonymise_drop, progress=gr.Progress(track_tqdm=True)):
 
     output_text = ""
     output_list = []
@@ -76,7 +76,10 @@ def pre_clean(data, in_colnames, data_file_name_no_ext, clean_text, drop_duplica
 
     data_file_name_no_ext = data_file_name_no_ext + "_clean"
 
-    data[in_colnames_list_first] = initial_clean(data[in_colnames_list_first])
+    if not custom_regex.empty:
+        data[in_colnames_list_first] = initial_clean(data[in_colnames_list_first], custom_regex.iloc[:, 0].to_list())
+    else:
+        data[in_colnames_list_first] = initial_clean(data[in_colnames_list_first], [])
 
     clean_toc = time.perf_counter()
     clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
@@ -90,7 +93,7 @@ def pre_clean(data, in_colnames, data_file_name_no_ext, clean_text, drop_duplica
     #print("Removing duplicates and short entries from data")
     #print("Data shape before: ", data.shape)
     data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
-    data = data[data[in_colnames_list_first].str.len() >= 10]
+    data = data[data[in_colnames_list_first].str.len() >= 50]
     data = data.drop_duplicates(subset = in_colnames_list_first).dropna(subset= in_colnames_list_first).reset_index()
 
     #print("Data shape after duplicate/null removal: ", data.shape)
@@ -197,6 +200,12 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
 
             assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
 
+            if calc_probs == True:
+                topics_probs_out = pd.DataFrame(topic_model.probabilities_)
+                topics_probs_out_name = "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+                topics_probs_out.to_csv(topics_probs_out_name)
+                output_list.append(topics_probs_out_name)
+
         except:
             print(fail_error_message)
 
@@ -228,6 +237,12 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
 
             assigned_topics, probs = topic_model.fit_transform(docs, embeddings_out)
 
+            if calc_probs == True:
+                topics_probs_out = pd.DataFrame(topic_model.probabilities_)
+                topics_probs_out_name = "topic_full_probs_" + data_file_name_no_ext + "_" + today_rev + ".csv"
+                topics_probs_out.to_csv(topics_probs_out_name)
+                output_list.append(topics_probs_out_name)
+
         except:
             print(fail_error_message)
 
@@ -312,18 +327,21 @@ def reduce_outliers(topic_model, docs, embeddings_out, data_file_name_no_ext, sa
     assigned_topics = topic_model.reduce_outliers(docs, assigned_topics, strategy="embeddings")
     # Then, update the topics to the ones that considered the new data
 
+    progress(0.6, desc= "Updating original model")
+    topic_model.update_topics(docs, topics=assigned_topics)
+
     print("Finished reducing outliers.")
 
-    progress(0.7, desc= "Replacing topic names with LLMs if necessary")
+    #progress(0.7, desc= "Replacing topic names with LLMs if necessary")
 
-    topic_dets = topic_model.get_topic_info()
+    #topic_dets = topic_model.get_topic_info()
 
-    # Replace original labels with LLM labels
-    if "LLM" in topic_model.get_topic_info().columns:
-        llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["LLM"].values()]
-        topic_model.set_topic_labels(llm_labels)
-    else:
-        topic_model.set_topic_labels(list(topic_dets["Name"]))
+    # # Replace original labels with LLM labels
+    # if "LLM" in topic_model.get_topic_info().columns:
+    #     llm_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["LLM"].values()]
+    #     topic_model.set_topic_labels(llm_labels)
+    # else:
+    #     topic_model.set_topic_labels(list(topic_dets["Name"]))
 
     # Outputs
     progress(0.9, desc= "Saving to file")
@@ -448,6 +466,16 @@ def visualise_topics(topic_model, data, data_file_name_no_ext, low_resource_mode
 
     hierarchical_topics = topic_model.hierarchical_topics(docs)
 
+    # Print topic tree
+    tree = topic_model.get_topic_tree(hierarchical_topics, tight_layout = True)
+    tree_name = data_file_name_no_ext + '_' + 'vis_hierarchy_tree_' + today_rev + '.txt'
+
+    with open(tree_name, "w") as file:
+        # Write the string to the file
+        file.write(tree)
+
+    output_list.append(tree_name)
+
     # Save new hierarchical topic model to file
     hierarchical_topics_name = data_file_name_no_ext + '_' + 'vis_hierarchy_topics_' + today_rev + '.csv'
     hierarchical_topics.to_csv(hierarchical_topics_name)
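
Note: two new artefacts come out of this commit: a full document-by-topic probability matrix (written when "calculate all probabilities" is on) and a plain-text topic tree from get_topic_tree during visualisation. A minimal sketch of reading them back; the file names follow the patterns in the diff, but the dataset name and date stamp here are invented:

# Sketch only - consuming the two new output files.
import pandas as pd

# topic_full_probs_<dataset>_<date>.csv: one row per document, one column per topic
probs = pd.read_csv("topic_full_probs_mydata_clean_20240210.csv", index_col=0)
most_likely_topic = probs.idxmax(axis=1)               # highest-probability topic per document
print(most_likely_topic.head())

# <dataset>_vis_hierarchy_tree_<date>.txt: ASCII tree of the hierarchical topics
with open("mydata_clean_vis_hierarchy_tree_20240210.txt") as f:
    print(f.read())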