seanpedrickcase committed
Commit 89c4d20
Parent(s): 593153e

Improved initial clean options. Now has option to return embeddings only.

Files changed:
- app.py +5 -2
- funcs/clean_funcs.py +54 -6
- funcs/topic_core_funcs.py +25 -15
- requirements.txt +4 -4
- requirements_aws.txt +5 -5
- requirements_gpu.txt +4 -4
app.py CHANGED

@@ -40,7 +40,7 @@ with block:
 # Topic modeller
 Generate topics from open text in tabular data, based on [BERTopic](https://maartengr.github.io/BERTopic/). Upload a data file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics. Click 'Extract topics' after you have selected the minimum similar documents per topic and maximum total topics. Duplicate this space, or clone to your computer to avoid queues here!
 
-Uses fast TF-IDF-based embeddings by default, which are fast but does not lead to high quality clusering. Change to higher quality [mxbai-embed-large-v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) model embeddings (
+Uses fast TF-IDF-based embeddings by default, which are fast but does not lead to high quality clusering. Change to higher quality [mxbai-embed-large-v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) model embeddings (1024 dimensions) for better results but slower processing time. If you have an embeddings .npz file previously made using this model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics for zero-shot modelling, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available such as maximum topics allowed, minimum documents per topic etc.. Topic representation with LLMs currently based on [Phi-3.1-mini-128k-instruct-GGUF](https://huggingface.co/bartowski/Phi-3.1-mini-128k-instruct-GGUF), which is quite slow on CPU, so use a GPU-enabled computer if possible, building from the requirements_gpu.txt file in the base folder.
 
 For small datasets, consider breaking up your text into sentences under 'Clean data' -> 'Split open text...' before topic modelling.
 

@@ -124,8 +124,11 @@ with block:
 calc_probs = gr.Dropdown(label="Calculate all topic probabilities", value="No", choices=["Yes", "No"])
 with gr.Row():
 embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp: smaller files but lower quality.", value="No", choices=["Yes", "No"])
+return_only_embeddings_drop = gr.Dropdown(label="Return only embeddings", value="No", choices=["Yes", "No"])
+with gr.Row():
 return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation.", value="Yes", choices=["Yes", "No"])
 save_topic_model = gr.Dropdown(label = "Save topic model to BERTopic format pkl file.", value="No", choices=["Yes", "No"])
+
 
 # Load in data. Update column names dropdown when file uploaded
 in_files.upload(fn=initial_file_load, inputs=[in_files], outputs=[in_colnames, in_label, data_state, output_single_text, topic_model_state, embeddings_state, data_file_name_no_ext_state, label_list_state, original_data_state])

@@ -141,7 +144,7 @@ with block:
 zero_shot_optimiser_btn.click(fn=optimise_zero_shot, outputs=[quality_mode_drop, min_docs_slider, max_topics_slider, min_word_occurence_slider, max_word_occurence_slider, zero_shot_similarity])
 
 # Extract topics
-topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, quality_mode_drop, save_topic_model, embeddings_state, embeddings_type_state, zero_shot_similarity, calc_probs, vectoriser_state, min_word_occurence_slider, max_word_occurence_slider, split_sentence_drop, seed_number], outputs=[output_single_text, output_file, embeddings_state, embeddings_type_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state, assigned_topics_state], api_name="topics")
+topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, quality_mode_drop, save_topic_model, embeddings_state, embeddings_type_state, zero_shot_similarity, calc_probs, vectoriser_state, min_word_occurence_slider, max_word_occurence_slider, split_sentence_drop, seed_number, return_only_embeddings_drop], outputs=[output_single_text, output_file, embeddings_state, embeddings_type_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state, assigned_topics_state], api_name="topics")
 
 # Reduce outliers
 reduce_outliers_btn.click(fn=reduce_outliers, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, assigned_topics_state, vectoriser_state, save_topic_model, split_sentence_drop, data_state], outputs=[output_single_text, output_file, topic_model_state], api_name="reduce_outliers")
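The wiring for the new option follows the existing Gradio pattern: the return_only_embeddings_drop dropdown is appended to the inputs list of the topics_btn.click event, and extract_topics receives its value as an extra argument. A minimal self-contained sketch of that pattern; the stub function and textbox names are illustrative, only return_only_embeddings_drop and its dropdown settings come from the diff above.

import gradio as gr

def extract_topics_stub(text, return_only_embeddings_drop="No"):
    # Stand-in for the real extract_topics: branch on the dropdown value
    if return_only_embeddings_drop == "Yes":
        return "Embeddings output returned"
    return f"Topics extracted from: {text[:40]}"

with gr.Blocks() as demo:
    in_text = gr.Textbox(label="Open text")
    return_only_embeddings_drop = gr.Dropdown(label="Return only embeddings", value="No", choices=["Yes", "No"])
    out_text = gr.Textbox(label="Output")
    topics_btn = gr.Button("Extract topics")
    # The dropdown is passed as an extra input, mirroring the change above
    topics_btn.click(fn=extract_topics_stub, inputs=[in_text, return_only_embeddings_drop], outputs=[out_text])

demo.launch()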
funcs/clean_funcs.py CHANGED

@@ -1,5 +1,6 @@
 import re
 import string
+import unicodedata
 import polars as pl
 import gradio as gr
 

@@ -17,13 +18,36 @@ num_pattern_regex = r'[0-9]+'
 nums_two_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b' # Should match two digit numbers or more, and also if there are full stops or commas in between
 postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
 multiple_spaces_regex = r'\s{2,}'
+multiple_new_lines_regex = r'(\r\n|\n)+'
 
 def initial_clean(texts, custom_regex, progress=gr.Progress()):
+
+for text in texts:
+if not text:
+text = ""
+
+# Normalize unicode characters to decompose any special forms
+normalized_text = unicodedata.normalize('NFKC', text)
+
+# Replace smart quotes and special punctuation with standard ASCII equivalents
+replacements = {
+'‘': "'", '’': "'", '“': '"', '”': '"',
+'–': '-', '—': '-', '…': '...', '•': '*',
+}
+
+# Perform replacements
+for old_char, new_char in replacements.items():
+normalised_text = normalized_text.replace(old_char, new_char)
+
+text = normalised_text
+
 # Convert to polars Series
 texts = pl.Series(texts).str.strip_chars()
 
 # Define a list of patterns and their replacements
 patterns = [
+(multiple_new_lines_regex, ' '),
+(r'\r', ''),
 (url_pattern, ' '),
 (html_pattern_regex, ' '),
 (html_start_pattern_end_dots_regex, ' '),

@@ -31,7 +55,8 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
 (email_pattern_regex, ' '),
 (nums_two_more_regex, ' '),
 (postcode_pattern_regex, ' '),
-(multiple_spaces_regex, ' ')
+(multiple_spaces_regex, ' '),
+(r"(\p{P})\p{P}+", "${1}")
 ]
 
 # Apply each regex replacement

@@ -43,22 +68,45 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
 
 return texts
 
+# def regex_clean(texts, custom_regex, progress=gr.Progress()):
+# texts = pl.Series(texts).str.strip_chars()
+
+# # Allow for custom regex patterns to be removed
+# if len(custom_regex) > 0:
+# for pattern in custom_regex:
+# raw_string_pattern = r'{}'.format(pattern)
+# print("Removing regex pattern: ", raw_string_pattern)
+# texts = texts.str.replace_all(raw_string_pattern, ' ')
+
+# texts = texts.str.replace_all(multiple_spaces_regex, ' ')
+
+# texts = texts.to_list()
+
+# return texts
+
 def regex_clean(texts, custom_regex, progress=gr.Progress()):
 texts = pl.Series(texts).str.strip_chars()
 
 # Allow for custom regex patterns to be removed
 if len(custom_regex) > 0:
 for pattern in custom_regex:
-raw_string_pattern = r'{}'.format(pattern)
-print("Removing regex pattern: ", raw_string_pattern)
-texts = texts.str.replace_all(raw_string_pattern, ' ')
-
+print("Removing regex pattern:", pattern)
+# Method 1: Using polars with regex flags
+texts = texts.str.replace_all(pattern, ' ')
+
+# Alternative Method 2: Using Python re directly if needed
+#texts = pl.Series([re.sub(pattern, ' ', text, flags=re.DOTALL)
+# for text in texts])
+
+# Replace multiple spaces with a single space
 texts = texts.str.replace_all(multiple_spaces_regex, ' ')
-
+
+# Convert series back to a list
 texts = texts.to_list()
 
 return texts
 
+
 def remove_hyphens(text_text):
 return re.sub(r'(\w+)-(\w+)-?(\w)?', r'\1 \2 \3', text_text)
 
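For reference, a standalone sketch of the two new cleaning behaviours above: NFKC normalisation with smart-quote replacement (from initial_clean) and custom regex removal via Polars replace_all (from regex_clean). This restates the logic for illustration rather than copying the repository code; here the character replacements are applied cumulatively and written back into the returned list.

import unicodedata
import polars as pl

REPLACEMENTS = {
    '‘': "'", '’': "'", '“': '"', '”': '"',
    '–': '-', '—': '-', '…': '...', '•': '*',
}

def normalise_texts(texts):
    cleaned = []
    for text in texts:
        text = text or ""
        # Normalise unicode characters to decompose any special forms
        text = unicodedata.normalize('NFKC', text)
        # Replace smart quotes and special punctuation with ASCII equivalents
        for old_char, new_char in REPLACEMENTS.items():
            text = text.replace(old_char, new_char)
        cleaned.append(text)
    return cleaned

def remove_custom_patterns(texts, custom_regex):
    # Strip whitespace, then remove each user-supplied regex pattern
    series = pl.Series(texts).str.strip_chars()
    for pattern in custom_regex:
        series = series.str.replace_all(pattern, ' ')
    # Collapse the runs of spaces left behind by the removals
    series = series.str.replace_all(r'\s{2,}', ' ')
    return series.to_list()

print(normalise_texts(['“Smart quotes” and ellipses…']))
print(remove_custom_patterns(['Case ref ABC-123 closed'], [r'ABC-\d+']))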
funcs/topic_core_funcs.py CHANGED

@@ -168,6 +168,8 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
 
 print(time_out)
 
+data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
+
 out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev + ".csv"
 data.to_csv(out_data_name)
 output_list.append(out_data_name)

@@ -208,7 +210,8 @@ def extract_topics(
 min_word_occurence_slider: float,
 max_word_occurence_slider: float,
 split_sentence_drop: str,
-random_seed: int = random_seed,
+random_seed: int = random_seed,
+return_only_embeddings_drop: str = "No",
 output_folder: str = output_folder,
 umap_n_neighbours:int = umap_n_neighbours,
 umap_min_dist:float = umap_min_dist,

@@ -235,6 +238,7 @@ def extract_topics(
 embeddings_type_state (str): State of the embeddings type.
 zero_shot_similarity (float): Zero-shot similarity threshold.
 random_seed (int): Random seed for reproducibility.
+return_only_embeddings_drop (str): If you only want to output embeddings.
 calc_probs (str): Whether to calculate all topic probabilities.
 vectoriser_state (CountVectorizer): Vectorizer state.
 min_word_occurence_slider (float): Minimum word occurrence slider value.

@@ -337,6 +341,25 @@ def extract_topics(
 
 embeddings_out = make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, embeddings_super_compress, high_quality_mode)
 
+# If you want to save your embedding files
+if return_intermediate_files == "Yes":
+print("Saving embeddings to file")
+if high_quality_mode == "No":
+embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
+else:
+if embeddings_super_compress == "No":
+embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'large_embeddings.npz'
+else:
+embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'large_embeddings_compress.npz'
+
+np.savez_compressed(embeddings_file_name, embeddings_out)
+
+output_list.append(embeddings_file_name)
+
+if return_only_embeddings_drop == "Yes":
+
+return "Embeddings output returned", output_list, embeddings_out, embeddings_type_state, data_file_name_no_ext, None, docs, vectoriser_state, []
+
 # This is saved as a Gradio state object
 vectoriser_model = vectoriser_state
 

@@ -466,20 +489,7 @@ def extract_topics(
 # Outputs
 output_list, output_text = save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, save_topic_model, data, split_sentence_drop)
 
-
-if return_intermediate_files == "Yes":
-print("Saving embeddings to file")
-if high_quality_mode == "No":
-embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
-else:
-if embeddings_super_compress == "No":
-embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'large_embeddings.npz'
-else:
-embeddings_file_name = output_folder + data_file_name_no_ext + '_' + 'large_embeddings_compress.npz'
-
-np.savez_compressed(embeddings_file_name, embeddings_out)
-
-output_list.append(embeddings_file_name)
+
 
 all_toc = time.perf_counter()
 time_out = f"All processes took {all_toc - all_tic:0.1f} seconds."
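The reordering above moves the embedding save ahead of topic modelling, so when 'Return only embeddings' is set the function writes the .npz file and returns early without fitting BERTopic. A minimal sketch of that save-and-early-return flow, keeping the file-naming convention from the diff; the wrapper function itself is illustrative rather than the repository code.

import numpy as np

def save_embeddings_and_maybe_return(embeddings_out, data_file_name_no_ext, high_quality_mode="No",
                                     embeddings_super_compress="No", return_only_embeddings_drop="No",
                                     output_folder=""):
    output_list = []

    # File name records how the embeddings were produced, as in extract_topics
    if high_quality_mode == "No":
        embeddings_file_name = output_folder + data_file_name_no_ext + '_tfidf_embeddings.npz'
    elif embeddings_super_compress == "No":
        embeddings_file_name = output_folder + data_file_name_no_ext + '_large_embeddings.npz'
    else:
        embeddings_file_name = output_folder + data_file_name_no_ext + '_large_embeddings_compress.npz'

    # Save compressed embeddings and add the file to the outputs shown in the UI
    np.savez_compressed(embeddings_file_name, embeddings_out)
    output_list.append(embeddings_file_name)

    # Early exit: skip topic modelling entirely if only embeddings are wanted
    if return_only_embeddings_drop == "Yes":
        return "Embeddings output returned", output_list

    return None, output_list

status, files = save_embeddings_and_maybe_return(np.random.rand(10, 5), "example_data",
                                                 return_only_embeddings_drop="Yes")
print(status, files)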
requirements.txt CHANGED

@@ -1,12 +1,12 @@
-gradio
+gradio==4.44.1
 boto3
 transformers==4.41.2
 accelerate==0.26.1
 torch==2.4.0
 bertopic==0.16.2
-spacy==3.
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.
-pyarrow==
+spacy==3.8.0
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+pyarrow==17.0.0
 openpyxl==3.1.2
 Faker==22.2.0
 presidio_analyzer==2.2.354
requirements_aws.txt CHANGED

@@ -3,11 +3,11 @@ pandas==2.2.2
 plotly==5.23.0
 scikit-learn==1.5.1
 umap-learn==0.5.6
-boto3
-spacy==3.
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.
-gradio
-pyarrow==
+boto3
+spacy==3.8.0
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+gradio==4.44.1
+pyarrow==17.0.0
 openpyxl==3.1.2
 Faker==22.2.0
 presidio_analyzer==2.2.354
requirements_gpu.txt CHANGED

@@ -1,11 +1,11 @@
-gradio
+gradio==4.44.1
 boto3
 transformers==4.41.2
 accelerate==0.26.1
 bertopic==0.16.2
-spacy==3.
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.
-pyarrow==
+spacy==3.8.0
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+pyarrow==17.0.0
 openpyxl==3.1.3
 Faker==22.2.0
 presidio_analyzer==2.2.354