Spaces:
Running
Running
Sean-Case
committed on
Commit
·
fac3624
1
Parent(s):
aa3df37
Greatly increased low resource process dimensions for higher quality. Visualisations disabled by default to increase speed.
Browse files
- app.py +3 -3
- funcs/embeddings.py +1 -1
- funcs/representation_model.py +1 -1
app.py
CHANGED
@@ -295,7 +295,7 @@ with block:
|
|
295 |
candidate_topics = gr.File(label="Input topics from file (csv). File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file. Currently not compatible with low-resource embeddings.")
|
296 |
|
297 |
with gr.Row():
|
298 |
-
min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of documents
|
299 |
max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 3, step = 1, label = "Maximum number of topics")
|
300 |
|
301 |
with gr.Row():
|
@@ -305,7 +305,7 @@ with block:
|
|
305 |
output_single_text = gr.Textbox(label="Output example (first example in dataset)")
|
306 |
output_file = gr.File(label="Output file")
|
307 |
|
308 |
-
plot = gr.Plot(label="Visualise your topics here
|
309 |
|
310 |
with gr.Tab("Options"):
|
311 |
with gr.Accordion("Data load and processing options", open = True):
|
@@ -317,7 +317,7 @@ with block:
|
|
317 |
low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
|
318 |
create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
|
319 |
save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
|
320 |
-
visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="Yes", choices=["Yes", "No"])
|
321 |
|
322 |
# Update column names dropdown when file uploaded
|
323 |
in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
|
|
|
295 |
candidate_topics = gr.File(label="Input topics from file (csv). File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file. Currently not compatible with low-resource embeddings.")
|
296 |
|
297 |
with gr.Row():
|
298 |
+
min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of similar documents needed to make a topic.")
|
299 |
max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 3, step = 1, label = "Maximum number of topics")
|
300 |
|
301 |
with gr.Row():
|
|
|
305 |
output_single_text = gr.Textbox(label="Output example (first example in dataset)")
|
306 |
output_file = gr.File(label="Output file")
|
307 |
|
308 |
+
plot = gr.Plot(label="Visualise your topics here. Go to the 'Options' tab to enable.")
|
309 |
|
310 |
with gr.Tab("Options"):
|
311 |
with gr.Accordion("Data load and processing options", open = True):
|
|
|
317 |
low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings and processing.", value="No", choices=["Yes", "No"])
|
318 |
create_llm_topic_labels = gr.Dropdown(label = "Create LLM-generated topic labels.", value="No", choices=["Yes", "No"])
|
319 |
save_topic_model = gr.Dropdown(label = "Save topic model to file.", value="Yes", choices=["Yes", "No"])
|
320 |
+
visualise_topics = gr.Dropdown(label = "Create a visualisation to map topics.", value="No", choices=["Yes", "No"])
|
321 |
|
322 |
# Update column names dropdown when file uploaded
|
323 |
in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state])
|
funcs/embeddings.py
CHANGED
@@ -35,7 +35,7 @@ def make_or_load_embeddings(docs, file_list, data_file_name_no_ext, embedding_mo
|
|
35 |
print("Creating simplified 'sparse' embeddings based on TfIDF")
|
36 |
embedding_model = make_pipeline(
|
37 |
TfidfVectorizer(),
|
38 |
-
TruncatedSVD(
|
39 |
)
|
40 |
|
41 |
# Fit the pipeline to the text data
|
|
|
35 |
print("Creating simplified 'sparse' embeddings based on TfIDF")
|
36 |
embedding_model = make_pipeline(
|
37 |
TfidfVectorizer(),
|
38 |
+
TruncatedSVD(2000, random_state=random_seed)
|
39 |
)
|
40 |
|
41 |
# Fit the pipeline to the text data
|
funcs/representation_model.py
CHANGED
@@ -119,7 +119,7 @@ llm_config = LLamacppInitConfigGpu(last_n_tokens_size=last_n_tokens_size,
|
|
119 |
# KeyBERT
|
120 |
keybert = KeyBERTInspired(random_state=random_seed)
|
121 |
# MMR
|
122 |
-
mmr = MaximalMarginalRelevance(diversity=0.
|
123 |
|
124 |
def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode):
|
125 |
|
|
|
119 |
# KeyBERT
|
120 |
keybert = KeyBERTInspired(random_state=random_seed)
|
121 |
# MMR
|
122 |
+
mmr = MaximalMarginalRelevance(diversity=0.2)
|
123 |
|
124 |
def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode):
|
125 |
|