Spaces:
Running
Running
Sean-Case
committed on
Commit
·
43ac0d8
1
Parent(s):
fac3624
Returned TruncatedSVD components to 100 - higher values don't seem to help
Browse files
- app.py +11 -8
- funcs/embeddings.py +2 -1
app.py
CHANGED
@@ -128,21 +128,20 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
128 |
umap_model = UMAP(n_neighbors=15, n_components=5, random_state=random_seed)
|
129 |
|
130 |
elif low_resource_mode == "Yes":
|
131 |
-
print("Choosing low resource TF-IDF model")
|
|
|
132 |
embedding_model_pipe = make_pipeline(
|
133 |
TfidfVectorizer(),
|
134 |
TruncatedSVD(100) # 100 # To be compatible with zero shot, this needs to be lower than number of suggested topics
|
135 |
)
|
136 |
embedding_model = embedding_model_pipe
|
137 |
|
138 |
-
umap_model = TruncatedSVD(n_components=
|
139 |
-
|
140 |
-
|
141 |
|
142 |
embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
|
143 |
|
144 |
|
145 |
-
vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.
|
146 |
|
147 |
from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
|
148 |
from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
|
@@ -241,10 +240,14 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
241 |
|
242 |
if return_intermediate_files == "Yes":
|
243 |
print("Saving embeddings to file")
|
244 |
-
|
245 |
-
|
|
|
|
|
|
|
|
|
246 |
|
247 |
-
output_list.append(
|
248 |
|
249 |
if visualise_topics == "Yes":
|
250 |
# Visualise the topics:
|
|
|
128 |
umap_model = UMAP(n_neighbors=15, n_components=5, random_state=random_seed)
|
129 |
|
130 |
elif low_resource_mode == "Yes":
|
131 |
+
print("Choosing low resource TF-IDF model.")
|
132 |
+
|
133 |
embedding_model_pipe = make_pipeline(
|
134 |
TfidfVectorizer(),
|
135 |
TruncatedSVD(100) # 100 # To be compatible with zero shot, this needs to be lower than number of suggested topics
|
136 |
)
|
137 |
embedding_model = embedding_model_pipe
|
138 |
|
139 |
+
umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
|
|
|
|
|
140 |
|
141 |
embeddings_out, reduced_embeddings = make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_model, return_intermediate_files, embeddings_super_compress, low_resource_mode, create_llm_topic_labels)
|
142 |
|
143 |
|
144 |
+
vectoriser_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.05, max_df=0.9)
|
145 |
|
146 |
from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
|
147 |
from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag
|
|
|
240 |
|
241 |
if return_intermediate_files == "Yes":
|
242 |
print("Saving embeddings to file")
|
243 |
+
if low_resource_mode == "Yes":
|
244 |
+
embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
|
245 |
+
else:
|
246 |
+
embeddings_file_name = data_file_name_no_ext + '_' + 'ai_embeddings.npz'
|
247 |
+
|
248 |
+
np.savez_compressed(embeddings_file_name, embeddings_out)
|
249 |
|
250 |
+
output_list.append(embeddings_file_name)
|
251 |
|
252 |
if visualise_topics == "Yes":
|
253 |
# Visualise the topics:
|
funcs/embeddings.py
CHANGED
@@ -33,9 +33,10 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
|
|
33 |
# If on CPU, don't resort to embedding models
|
34 |
if low_resource_mode_opt == "Yes":
|
35 |
print("Creating simplified 'sparse' embeddings based on TfIDF")
|
|
|
36 |
embedding_model = make_pipeline(
|
37 |
TfidfVectorizer(),
|
38 |
-
TruncatedSVD(
|
39 |
)
|
40 |
|
41 |
# Fit the pipeline to the text data
|
|
|
33 |
# If on CPU, don't resort to embedding models
|
34 |
if low_resource_mode_opt == "Yes":
|
35 |
print("Creating simplified 'sparse' embeddings based on TfIDF")
|
36 |
+
|
37 |
embedding_model = make_pipeline(
|
38 |
TfidfVectorizer(),
|
39 |
+
TruncatedSVD(100, random_state=random_seed)
|
40 |
)
|
41 |
|
42 |
# Fit the pipeline to the text data
|