seanpedrickcase committed
Commit 89c4d20
Parent(s): 593153e

Improved initial clean options. Now has option to return embeddings only.

Files changed:
- app.py +5 -2
- funcs/clean_funcs.py +54 -6
- funcs/topic_core_funcs.py +25 -15
- requirements.txt +4 -4
- requirements_aws.txt +5 -5
- requirements_gpu.txt +4 -4
app.py CHANGED

@@ -40,7 +40,7 @@ with block:
 # Topic modeller
 Generate topics from open text in tabular data, based on [BERTopic](https://maartengr.github.io/BERTopic/). Upload a data file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics. Click 'Extract topics' after you have selected the minimum similar documents per topic and maximum total topics. Duplicate this space, or clone to your computer to avoid queues here!
 
-Uses fast TF-IDF-based embeddings by default, which are fast but does not lead to high quality clusering. Change to higher quality [mxbai-embed-large-v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) model embeddings (
+Uses fast TF-IDF-based embeddings by default, which are fast but does not lead to high quality clusering. Change to higher quality [mxbai-embed-large-v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) model embeddings (1024 dimensions) for better results but slower processing time. If you have an embeddings .npz file previously made using this model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics for zero-shot modelling, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available such as maximum topics allowed, minimum documents per topic etc.. Topic representation with LLMs currently based on [Phi-3.1-mini-128k-instruct-GGUF](https://huggingface.co/bartowski/Phi-3.1-mini-128k-instruct-GGUF), which is quite slow on CPU, so use a GPU-enabled computer if possible, building from the requirements_gpu.txt file in the base folder.
 
 For small datasets, consider breaking up your text into sentences under 'Clean data' -> 'Split open text...' before topic modelling.
 

@@ -124,8 +124,11 @@ with block:
 calc_probs = gr.Dropdown(label="Calculate all topic probabilities", value="No", choices=["Yes", "No"])
 with gr.Row():
 embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp: smaller files but lower quality.", value="No", choices=["Yes", "No"])
+return_only_embeddings_drop = gr.Dropdown(label="Return only embeddings", value="No", choices=["Yes", "No"])
+with gr.Row():
 return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation.", value="Yes", choices=["Yes", "No"])
 save_topic_model = gr.Dropdown(label = "Save topic model to BERTopic format pkl file.", value="No", choices=["Yes", "No"])
+
 
 # Load in data. Update column names dropdown when file uploaded
 in_files.upload(fn=initial_file_load, inputs=[in_files], outputs=[in_colnames, in_label, data_state, output_single_text, topic_model_state, embeddings_state, data_file_name_no_ext_state, label_list_state, original_data_state])

@@ -141,7 +144,7 @@ with block:
 zero_shot_optimiser_btn.click(fn=optimise_zero_shot, outputs=[quality_mode_drop, min_docs_slider, max_topics_slider, min_word_occurence_slider, max_word_occurence_slider, zero_shot_similarity])
 
 # Extract topics
-topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, quality_mode_drop, save_topic_model, embeddings_state, embeddings_type_state, zero_shot_similarity, calc_probs, vectoriser_state, min_word_occurence_slider, max_word_occurence_slider, split_sentence_drop, seed_number], outputs=[output_single_text, output_file, embeddings_state, embeddings_type_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state, assigned_topics_state], api_name="topics")
+topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, quality_mode_drop, save_topic_model, embeddings_state, embeddings_type_state, zero_shot_similarity, calc_probs, vectoriser_state, min_word_occurence_slider, max_word_occurence_slider, split_sentence_drop, seed_number, return_only_embeddings_drop], outputs=[output_single_text, output_file, embeddings_state, embeddings_type_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state, assigned_topics_state], api_name="topics")
 
 # Reduce outliers
 reduce_outliers_btn.click(fn=reduce_outliers, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, assigned_topics_state, vectoriser_state, save_topic_model, split_sentence_drop, data_state], outputs=[output_single_text, output_file, topic_model_state], api_name="reduce_outliers")
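The wiring for the new option follows the existing Gradio pattern: the return_only_embeddings_drop dropdown is appended to the inputs list of the topics_btn.click event, and extract_topics receives its value as an extra argument. A minimal self-contained sketch of that pattern; the stub function and textbox names are illustrative, only return_only_embeddings_drop and its dropdown settings come from the diff above.

import gradio as gr

def extract_topics_stub(text, return_only_embeddings_drop="No"):
    # Stand-in for the real extract_topics: branch on the dropdown value
    if return_only_embeddings_drop == "Yes":
        return "Embeddings output returned"
    return f"Topics extracted from: {text[:40]}"

with gr.Blocks() as demo:
    in_text = gr.Textbox(label="Open text")
    return_only_embeddings_drop = gr.Dropdown(label="Return only embeddings", value="No", choices=["Yes", "No"])
    out_text = gr.Textbox(label="Output")
    topics_btn = gr.Button("Extract topics")
    # The dropdown is passed as an extra input, mirroring the change above
    topics_btn.click(fn=extract_topics_stub, inputs=[in_text, return_only_embeddings_drop], outputs=[out_text])

demo.launch()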
funcs/clean_funcs.py CHANGED

@@ -1,5 +1,6 @@
 import re
 import string
+import unicodedata
 import polars as pl
 import gradio as gr
 

@@ -17,13 +18,36 @@ num_pattern_regex = r'[0-9]+'
 nums_two_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b' # Should match two digit numbers or more, and also if there are full stops or commas in between
 postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
 multiple_spaces_regex = r'\s{2,}'
+multiple_new_lines_regex = r'(\r\n|\n)+'
 
 def initial_clean(texts, custom_regex, progress=gr.Progress()):
+
+for text in texts:
+if not text:
+text = ""
+
+# Normalize unicode characters to decompose any special forms
+normalized_text = unicodedata.normalize('NFKC', text)
+
+# Replace smart quotes and special punctuation with standard ASCII equivalents
+replacements = {
+'‘': "'", '’': "'", '“': '"', '”': '"',
+'–': '-', '—': '-', '…': '...', '•': '*',
+}
+
+# Perform replacements
+for old_char, new_char in replacements.items():
+normalised_text = normalized_text.replace(old_char, new_char)
+
+text = normalised_text
+
 # Convert to polars Series
 texts = pl.Series(texts).str.strip_chars()
 
 # Define a list of patterns and their replacements
 patterns = [
+(multiple_new_lines_regex, ' '),
+(r'\r', ''),
 (url_pattern, ' '),
 (html_pattern_regex, ' '),
 (html_start_pattern_end_dots_regex, ' '),

@@ -31,7 +55,8 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
 (email_pattern_regex, ' '),
 (nums_two_more_regex, ' '),
 (postcode_pattern_regex, ' '),
-(multiple_spaces_regex, ' ')
+(multiple_spaces_regex, ' '),
+(r"(\p{P})\p{P}+", "${1}")
 ]
 
 # Apply each regex replacement

@@ -43,22 +68,45 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
 
 return texts
 
+# def regex_clean(texts, custom_regex, progress=gr.Progress()):
+# texts = pl.Series(texts).str.strip_chars()
+
+# # Allow for custom regex patterns to be removed
+# if len(custom_regex) > 0:
+# for pattern in custom_regex:
+# raw_string_pattern = r'{}'.format(pattern)
+# print("Removing regex pattern: ", raw_string_pattern)
+# texts = texts.str.replace_all(raw_string_pattern, ' ')
+
+# texts = texts.str.replace_all(multiple_spaces_regex, ' ')
+
+# texts = texts.to_list()
+
+# return texts
+
 def regex_clean(texts, custom_regex, progress=gr.Progress()):
 texts = pl.Series(texts).str.strip_chars()
 
 # Allow for custom regex patterns to be removed
 if len(custom_regex) > 0:
 for pattern in custom_regex:
-raw_string_pattern = r'{}'.format(pattern)
-print("Removing regex pattern: ", raw_string_pattern)
-texts = texts.str.replace_all(raw_string_pattern, ' ')
-
+print("Removing regex pattern:", pattern)
+# Method 1: Using polars with regex flags
+texts = texts.str.replace_all(pattern, ' ')
+
+# Alternative Method 2: Using Python re directly if needed
+#texts = pl.Series([re.sub(pattern, ' ', text, flags=re.DOTALL)
+# for text in texts])
+
+# Replace multiple spaces with a single space
 texts = texts.str.replace_all(multiple_spaces_regex, ' ')
-
+
+# Convert series back to a list
 texts = texts.to_list()
 
 return texts
 
+
 def remove_hyphens(text_text):
 return re.sub(r'(\w+)-(\w+)-?(\w)?', r'\1 \2 \3', text_text)
 
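For reference, a standalone sketch of the two new cleaning behaviours above: NFKC normalisation with smart-quote replacement (from initial_clean) and custom regex removal via Polars replace_all (from regex_clean). This restates the logic for illustration rather than copying the repository code; here the character replacements are applied cumulatively and written back into the returned list.

import unicodedata
import polars as pl

REPLACEMENTS = {
    '‘': "'", '’': "'", '“': '"', '”': '"',
    '–': '-', '—': '-', '…': '...', '•': '*',
}

def normalise_texts(texts):
    cleaned = []
    for text in texts:
        text = text or ""
        # Normalise unicode characters to decompose any special forms
        text = unicodedata.normalize('NFKC', text)
        # Replace smart quotes and special punctuation with ASCII equivalents
        for old_char, new_char in REPLACEMENTS.items():
            text = text.replace(old_char, new_char)
        cleaned.append(text)
    return cleaned

def remove_custom_patterns(texts, custom_regex):
    # Strip whitespace, then remove each user-supplied regex pattern
    series = pl.Series(texts).str.strip_chars()
    for pattern in custom_regex:
        series = series.str.replace_all(pattern, ' ')
    # Collapse the runs of spaces left behind by the removals
    series = series.str.replace_all(r'\s{2,}', ' ')
    return series.to_list()

print(normalise_texts(['“Smart quotes” and ellipses…']))
print(remove_custom_patterns(['Case ref ABC-123 closed'], [r'ABC-\d+']))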
funcs/topic_core_funcs.py CHANGED

@@ -168,6 +168,8 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
 
 print(time_out)
 
+data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
+
 out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev + ".csv"
 data.to_csv(out_data_name)
 output_list.append(out_data_name)

@@ -208,7 +210,8 @@ def extract_topics(
 min_word_occurence_slider: float,
 max_word_occurence_slider: float,
 split_sentence_drop: str,
-random_seed: int = random_seed,
+random_seed: int = random_seed,
+return_only_embeddings_drop: str = "No",
 output_folder: str = output_folder,
 umap_n_neighbours:int = umap_n_neighbours,
 umap_min_dist:float = umap_min_dist,

@@ -235,6 +238,7 @@ def extract_topics(
 embeddings_type_state (str): State of the embeddings type.
 zero_shot_similarity (float): Zero-shot similarity threshold.
 random_seed (int): Random seed for reproducibility.
+return_only_embeddings_drop (str): If you only want to output embeddings.
 calc_probs (str): Whether to calculate all topic probabilities.
 vectoriser_state (CountVectorizer): Vectorizer state.
 min_word_occurence_slider (float): Minimum word occurrence slider value.

@@ -337,6 +341,25 @@ def extract_topics(
 
 embeddings_out = make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, embeddings_super_compress, high_quality_mode)
 
+# If you want to save your embedding files
+if return_intermediate_files == "Yes":
+print("Saving embeddings to file")
+if high_quality_mode == "No":
+embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
+else:
+if embeddings_super_compress == "No":
+embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'large_embeddings.npz'
+else:
+embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'large_embeddings_compress.npz'
+
+np.savez_compressed(embeddings_file_name, embeddings_out)
+
+output_list.append(embeddings_file_name)
+
+if return_only_embeddings_drop == "Yes":
+
+return "Embeddings output returned", output_list, embeddings_out, embeddings_type_state, data_file_name_no_ext, None, docs, vectoriser_state, []
+
 # This is saved as a Gradio state object
 vectoriser_model = vectoriser_state
 

@@ -466,20 +489,7 @@ def extract_topics(
 # Outputs
 output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model, data, split_sentence_drop)
 
-
-if return_intermediate_files == "Yes":
-print("Saving embeddings to file")
-if high_quality_mode == "No":
-embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
-else:
-if embeddings_super_compress == "No":
-embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'large_embeddings.npz'
-else:
-embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'large_embeddings_compress.npz'
-
-np.savez_compressed(embeddings_file_name, embeddings_out)
-
-output_list.append(embeddings_file_name)
+
 
 all_toc = time.perf_counter()
 time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
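The reordering above moves the embedding save ahead of topic modelling, so when 'Return only embeddings' is set the function writes the .npz file and returns early without fitting BERTopic. A minimal sketch of that save-and-early-return flow, keeping the file-naming convention from the diff; the wrapper function itself is illustrative rather than the repository code.

import numpy as np

def save_embeddings_and_maybe_return(embeddings_out, data_file_name_no_ext, high_quality_mode="No",
                                     embeddings_super_compress="No", return_only_embeddings_drop="No",
                                     output_folder=""):
    output_list = []

    # File name records how the embeddings were produced, as in extract_topics
    if high_quality_mode == "No":
        embeddings_file_name = output_folder + data_file_name_no_ext + '_tfidf_embeddings.npz'
    elif embeddings_super_compress == "No":
        embeddings_file_name = output_folder + data_file_name_no_ext + '_large_embeddings.npz'
    else:
        embeddings_file_name = output_folder + data_file_name_no_ext + '_large_embeddings_compress.npz'

    # Save compressed embeddings and add the file to the outputs shown in the UI
    np.savez_compressed(embeddings_file_name, embeddings_out)
    output_list.append(embeddings_file_name)

    # Early exit: skip topic modelling entirely if only embeddings are wanted
    if return_only_embeddings_drop == "Yes":
        return "Embeddings output returned", output_list

    return None, output_list

status, files = save_embeddings_and_maybe_return(np.random.rand(10, 5), "example_data",
                                                 return_only_embeddings_drop="Yes")
print(status, files)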
requirements.txt CHANGED

@@ -1,12 +1,12 @@
-gradio
+gradio==4.44.1
 boto3
 transformers==4.41.2
 accelerate==0.26.1
 torch==2.4.0
 bertopic==0.16.2
-spacy==3.
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.
-pyarrow==
+spacy==3.8.0
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+pyarrow==17.0.0
 openpyxl==3.1.2
 Faker==22.2.0
 presidio_analyzer==2.2.354
requirements_aws.txt CHANGED

@@ -3,11 +3,11 @@ pandas==2.2.2
 plotly==5.23.0
 scikit-learn==1.5.1
 umap-learn==0.5.6
-boto3
-spacy==3.
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.
-gradio
-pyarrow==
+boto3
+spacy==3.8.0
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+gradio==4.44.1
+pyarrow==17.0.0
 openpyxl==3.1.2
 Faker==22.2.0
 presidio_analyzer==2.2.354
requirements_gpu.txt CHANGED

@@ -1,11 +1,11 @@
-gradio
+gradio==4.44.1
 boto3
 transformers==4.41.2
 accelerate==0.26.1
 bertopic==0.16.2
-spacy==3.
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.
-pyarrow==
+spacy==3.8.0
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+pyarrow==17.0.0
 openpyxl==3.1.3
 Faker==22.2.0
 presidio_analyzer==2.2.354