seanpedrickcase committed
Commit 89c4d20 · Parent: 593153e

Improved initial clean options. Now has option to return embeddings only.

app.py CHANGED

```diff
@@ -40,7 +40,7 @@ with block:
     # Topic modeller
     Generate topics from open text in tabular data, based on [BERTopic](https://maartengr.github.io/BERTopic/). Upload a data file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics. Click 'Extract topics' after you have selected the minimum similar documents per topic and maximum total topics. Duplicate this space, or clone to your computer to avoid queues here!
 
-    Uses fast TF-IDF-based embeddings by default, which are fast but does not lead to high quality clusering. Change to higher quality [mxbai-embed-large-v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) model embeddings (512 dimensions) for better results but slower processing time. If you have an embeddings .npz file previously made using this model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics for zero-shot modelling, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available such as maximum topics allowed, minimum documents per topic etc.. Topic representation with LLMs currently based on [Phi-3.1-mini-128k-instruct-GGUF](https://huggingface.co/bartowski/Phi-3.1-mini-128k-instruct-GGUF), which is quite slow on CPU, so use a GPU-enabled computer if possible, building from the requirements_gpu.txt file in the base folder.
+    Uses TF-IDF-based embeddings by default, which are fast but do not produce high-quality clustering. Change to the higher-quality [mxbai-embed-large-v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) model embeddings (1024 dimensions) for better results at the cost of slower processing. If you have an embeddings .npz file previously made using this model, you can load it in at the same time to skip the first modelling step. If you have a pre-defined list of topics for zero-shot modelling, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available, such as the maximum number of topics allowed and the minimum documents per topic. Topic representation with LLMs is currently based on [Phi-3.1-mini-128k-instruct-GGUF](https://huggingface.co/bartowski/Phi-3.1-mini-128k-instruct-GGUF), which is quite slow on CPU, so use a GPU-enabled computer if possible, building from the requirements_gpu.txt file in the base folder.
 
     For small datasets, consider breaking up your text into sentences under 'Clean data' -> 'Split open text...' before topic modelling.
 
@@ -124,8 +124,11 @@ with block:
                 calc_probs = gr.Dropdown(label="Calculate all topic probabilities", value="No", choices=["Yes", "No"])
             with gr.Row():
                 embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp: smaller files but lower quality.", value="No", choices=["Yes", "No"])
+                return_only_embeddings_drop = gr.Dropdown(label="Return only embeddings", value="No", choices=["Yes", "No"])
+            with gr.Row():
                 return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation.", value="Yes", choices=["Yes", "No"])
                 save_topic_model = gr.Dropdown(label = "Save topic model to BERTopic format pkl file.", value="No", choices=["Yes", "No"])
+
 
     # Load in data. Update column names dropdown when file uploaded
     in_files.upload(fn=initial_file_load, inputs=[in_files], outputs=[in_colnames, in_label, data_state, output_single_text, topic_model_state, embeddings_state, data_file_name_no_ext_state, label_list_state, original_data_state])
 
@@ -141,7 +144,7 @@ with block:
     zero_shot_optimiser_btn.click(fn=optimise_zero_shot, outputs=[quality_mode_drop, min_docs_slider, max_topics_slider, min_word_occurence_slider, max_word_occurence_slider, zero_shot_similarity])
 
     # Extract topics
-    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, quality_mode_drop, save_topic_model, embeddings_state, embeddings_type_state, zero_shot_similarity, calc_probs, vectoriser_state, min_word_occurence_slider, max_word_occurence_slider, split_sentence_drop, seed_number], outputs=[output_single_text, output_file, embeddings_state, embeddings_type_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state, assigned_topics_state], api_name="topics")
+    topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, quality_mode_drop, save_topic_model, embeddings_state, embeddings_type_state, zero_shot_similarity, calc_probs, vectoriser_state, min_word_occurence_slider, max_word_occurence_slider, split_sentence_drop, seed_number, return_only_embeddings_drop], outputs=[output_single_text, output_file, embeddings_state, embeddings_type_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state, assigned_topics_state], api_name="topics")
 
     # Reduce outliers
     reduce_outliers_btn.click(fn=reduce_outliers, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, assigned_topics_state, vectoriser_state, save_topic_model, split_sentence_drop, data_state], outputs=[output_single_text, output_file, topic_model_state], api_name="reduce_outliers")
```
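Gradio passes the `inputs` list to a callback positionally, which is why the new `return_only_embeddings_drop` component is appended to the `topics_btn.click` inputs at the same position as the new keyword parameter on `extract_topics`. A minimal sketch of that wiring rule, with hypothetical component and function names:

```python
import gradio as gr

# Sketch only (hypothetical names): Gradio hands the `inputs` list to the
# callback in order, so a newly added component must sit at the position
# matching the new parameter in the function signature.
def extract(text, seed, return_only_embeddings="No"):
    return f"seed={seed}, embeddings_only={return_only_embeddings}"

with gr.Blocks() as demo:
    text_in = gr.Textbox(label="Open text")
    seed = gr.Number(label="Random seed", value=42)
    emb_only = gr.Dropdown(label="Return only embeddings", value="No", choices=["Yes", "No"])
    status = gr.Textbox(label="Status")
    gr.Button("Extract topics").click(fn=extract, inputs=[text_in, seed, emb_only], outputs=[status])
```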
funcs/clean_funcs.py CHANGED

```diff
@@ -1,5 +1,6 @@
 import re
 import string
+import unicodedata
 import polars as pl
 import gradio as gr
 
@@ -17,13 +18,36 @@ num_pattern_regex = r'[0-9]+'
 nums_two_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b' # Should match two digit numbers or more, and also if there are full stops or commas in between
 postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
 multiple_spaces_regex = r'\s{2,}'
+multiple_new_lines_regex = r'(\r\n|\n)+'
 
 def initial_clean(texts, custom_regex, progress=gr.Progress()):
+
+    # Normalise unicode and swap smart punctuation for ASCII equivalents
+    cleaned_texts = []
+    for text in texts:
+        if not text:
+            text = ""
+
+        # Normalize unicode characters to decompose any special forms
+        normalized_text = unicodedata.normalize('NFKC', text)
+
+        # Replace smart quotes and special punctuation with standard ASCII equivalents
+        replacements = {
+            '‘': "'", '’': "'", '“': '"', '”': '"',
+            '–': '-', '—': '-', '…': '...', '•': '*',
+        }
+
+        # Apply each replacement cumulatively, so each builds on the last
+        for old_char, new_char in replacements.items():
+            normalized_text = normalized_text.replace(old_char, new_char)
+
+        cleaned_texts.append(normalized_text)
+
+    texts = cleaned_texts
+
     # Convert to polars Series
     texts = pl.Series(texts).str.strip_chars()
 
     # Define a list of patterns and their replacements
     patterns = [
+        (multiple_new_lines_regex, ' '),
+        (r'\r', ''),
         (url_pattern, ' '),
         (html_pattern_regex, ' '),
         (html_start_pattern_end_dots_regex, ' '),
@@ -31,7 +55,8 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
         (email_pattern_regex, ' '),
         (nums_two_more_regex, ' '),
         (postcode_pattern_regex, ' '),
-        (multiple_spaces_regex, ' ')
+        (multiple_spaces_regex, ' '),
+        (r"(\p{P})\p{P}+", "${1}")
     ]
 
     # Apply each regex replacement
@@ -43,22 +68,45 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
 
     return texts
 
+# def regex_clean(texts, custom_regex, progress=gr.Progress()):
+#     texts = pl.Series(texts).str.strip_chars()
+#
+#     # Allow for custom regex patterns to be removed
+#     if len(custom_regex) > 0:
+#         for pattern in custom_regex:
+#             raw_string_pattern = r'{}'.format(pattern)
+#             print("Removing regex pattern: ", raw_string_pattern)
+#             texts = texts.str.replace_all(raw_string_pattern, ' ')
+#
+#     texts = texts.str.replace_all(multiple_spaces_regex, ' ')
+#
+#     texts = texts.to_list()
+#
+#     return texts
+
 def regex_clean(texts, custom_regex, progress=gr.Progress()):
     texts = pl.Series(texts).str.strip_chars()
 
     # Allow for custom regex patterns to be removed
     if len(custom_regex) > 0:
         for pattern in custom_regex:
-            raw_string_pattern = r'{}'.format(pattern)
-            print("Removing regex pattern: ", raw_string_pattern)
-            texts = texts.str.replace_all(raw_string_pattern, ' ')
-
+            print("Removing regex pattern:", pattern)
+            # Method 1: Using polars with regex flags
+            texts = texts.str.replace_all(pattern, ' ')
+
+            # Alternative Method 2: Using Python re directly if needed
+            #texts = pl.Series([re.sub(pattern, ' ', text, flags=re.DOTALL)
+            #                   for text in texts])
+
+    # Replace multiple spaces with a single space
     texts = texts.str.replace_all(multiple_spaces_regex, ' ')
+
+    # Convert series back to a list
     texts = texts.to_list()
 
     return texts
 
+
 def remove_hyphens(text_text):
     return re.sub(r'(\w+)-(\w+)-?(\w)?', r'\1 \2 \3', text_text)
```
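The new `(r"(\p{P})\p{P}+", "${1}")` pattern pair relies on the Rust regex engine behind polars: `\p{P}` matches any Unicode punctuation character and `${1}` is a capture-group reference in the replacement, so runs of repeated punctuation collapse to a single character. An illustrative sketch:

```python
import polars as pl

# Collapse punctuation runs ("!!!", "??", "...") to one character.
# \p{P} (any Unicode punctuation) and ${1} group references are Rust
# regex syntax, which polars uses for str.replace_all.
s = pl.Series(["Wait!!!", "Really?? Maybe...", "fine"])
print(s.str.replace_all(r"(\p{P})\p{P}+", "${1}").to_list())
# ['Wait!', 'Really? Maybe.', 'fine']
```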
funcs/topic_core_funcs.py CHANGED

```diff
@@ -168,6 +168,8 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
     print(time_out)
 
+    data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
+
     out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev + ".csv"
     data.to_csv(out_data_name)
     output_list.append(out_data_name)
@@ -208,7 +210,8 @@ def extract_topics(
     min_word_occurence_slider: float,
     max_word_occurence_slider: float,
     split_sentence_drop: str,
     random_seed: int = random_seed,
+    return_only_embeddings_drop: str = "No",
     output_folder: str = output_folder,
     umap_n_neighbours:int = umap_n_neighbours,
     umap_min_dist:float = umap_min_dist,
@@ -235,6 +238,7 @@ def extract_topics(
     embeddings_type_state (str): State of the embeddings type.
     zero_shot_similarity (float): Zero-shot similarity threshold.
     random_seed (int): Random seed for reproducibility.
+    return_only_embeddings_drop (str): Whether to return only the document embeddings ("Yes"/"No").
     calc_probs (str): Whether to calculate all topic probabilities.
     vectoriser_state (CountVectorizer): Vectorizer state.
     min_word_occurence_slider (float): Minimum word occurrence slider value.
@@ -337,6 +341,25 @@ def extract_topics(
     embeddings_out = make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, embeddings_super_compress, high_quality_mode)
 
+    # If you want to save your embedding files
+    if return_intermediate_files == "Yes":
+        print("Saving embeddings to file")
+        if high_quality_mode == "No":
+            embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
+        else:
+            if embeddings_super_compress == "No":
+                embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'large_embeddings.npz'
+            else:
+                embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'large_embeddings_compress.npz'
+
+        np.savez_compressed(embeddings_file_name, embeddings_out)
+
+        output_list.append(embeddings_file_name)
+
+    # Stop here if only the embeddings output is wanted
+    if return_only_embeddings_drop == "Yes":
+        return "Embeddings output returned", output_list, embeddings_out, embeddings_type_state, data_file_name_no_ext, None, docs, vectoriser_state, []
+
     # This is saved as a Gradio state object
     vectoriser_model = vectoriser_state
 
@@ -466,20 +489,7 @@ def extract_topics(
     # Outputs
     output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model, data, split_sentence_drop)
 
-    # If you want to save your embedding files
-    if return_intermediate_files == "Yes":
-        print("Saving embeddings to file")
-        if high_quality_mode == "No":
-            embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
-        else:
-            if embeddings_super_compress == "No":
-                embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'large_embeddings.npz'
-            else:
-                embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'large_embeddings_compress.npz'
-
-        np.savez_compressed(embeddings_file_name, embeddings_out)
-
-        output_list.append(embeddings_file_name)
+
 
     all_toc = time.perf_counter()
     time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
```
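With the embeddings now written out before the topic modelling step, a run with 'Return only embeddings' set to 'Yes' stops after saving the .npz, and that file can be loaded on a later run to skip the embedding step entirely. A minimal sketch of reading one back, assuming it was written with `np.savez_compressed(path, embeddings_out)` as above (the file name here is hypothetical):

```python
import numpy as np

# np.savez_compressed(path, arr) stores a positional array under the
# default key 'arr_0'.
embeddings = np.load("output/mydata_large_embeddings.npz")["arr_0"]
print(embeddings.shape)  # e.g. (n_documents, 1024) for mxbai-embed-large-v1
```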
requirements.txt CHANGED

```diff
@@ -1,12 +1,12 @@
-gradio # Not specified version due to interaction with spacy - reinstall latest version after requirements.txt load
+gradio==4.44.1
 boto3
 transformers==4.41.2
 accelerate==0.26.1
 torch==2.4.0
 bertopic==0.16.2
-spacy==3.7.5
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
-pyarrow==14.0.2
+spacy==3.8.0
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+pyarrow==17.0.0
 openpyxl==3.1.2
 Faker==22.2.0
 presidio_analyzer==2.2.354
```
requirements_aws.txt CHANGED

```diff
@@ -3,11 +3,11 @@ pandas==2.2.2
 plotly==5.23.0
 scikit-learn==1.5.1
 umap-learn==0.5.6
-boto3==1.34.158
-spacy==3.7.5
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
-gradio>=4.26.0
-pyarrow==14.0.2
+boto3
+spacy==3.8.0
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+gradio==4.44.1
+pyarrow==17.0.0
 openpyxl==3.1.2
 Faker==22.2.0
 presidio_analyzer==2.2.354
```
requirements_gpu.txt CHANGED

```diff
@@ -1,11 +1,11 @@
-gradio # Not specified version due to interaction with spacy - reinstall latest version after requirements.txt load
+gradio==4.44.1
 boto3
 transformers==4.41.2
 accelerate==0.26.1
 bertopic==0.16.2
-spacy==3.7.4
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
-pyarrow==14.0.2
+spacy==3.8.0
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+pyarrow==17.0.0
 openpyxl==3.1.3
 Faker==22.2.0
 presidio_analyzer==2.2.354
```