Spaces:
Running
Running
Sean-Case
committed on
Commit
·
0fe5421
1
Parent(s):
6622531
Added minimum similarity slider for zero shot topic modelling
Browse files
app.py
CHANGED
@@ -130,7 +130,7 @@ def save_topic_outputs(topic_model, data_file_name_no_ext, output_list, docs, sa
|
|
130 |
|
131 |
return output_list, output_text
|
132 |
|
133 |
-
def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, save_topic_model, embeddings_out, progress=gr.Progress()):
|
134 |
|
135 |
progress(0, desc= "Loading data")
|
136 |
|
@@ -243,7 +243,7 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
|
|
243 |
min_topic_size = min_docs_slider,
|
244 |
nr_topics = max_topics_slider,
|
245 |
zeroshot_topic_list = zero_shot_topics_lower,
|
246 |
-
zeroshot_min_similarity =
|
247 |
verbose = True)
|
248 |
|
249 |
topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
|
@@ -478,7 +478,8 @@ with block:
|
|
478 |
|
479 |
with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
|
480 |
candidate_topics = gr.File(label="Input topics from file (csv). File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file. Currently not compatible with low-resource embeddings.")
|
481 |
-
|
|
|
482 |
with gr.Row():
|
483 |
min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of similar documents needed to make a topic.")
|
484 |
max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 10, step = 1, label = "Maximum number of topics")
|
@@ -522,7 +523,7 @@ with block:
|
|
522 |
in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state, embeddings_state, output_single_text, topic_model_state])
|
523 |
in_colnames.change(dummy_function, in_colnames, None)
|
524 |
|
525 |
-
topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state], outputs=[output_single_text, output_file, plot, embeddings_state, data_file_name_no_ext_state, topic_model_state, docs_state, label_list_state], api_name="topics")
|
526 |
|
527 |
reduce_outliers_btn.click(fn=reduce_outliers, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, low_resource_mode_opt], outputs=[output_single_text, output_file, embeddings_state], api_name="reduce_outliers")
|
528 |
|
|
|
130 |
|
131 |
return output_list, output_text
|
132 |
|
133 |
+
def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embeddings_super_compress, low_resource_mode, save_topic_model, embeddings_out, zero_shot_similarity, progress=gr.Progress()):
|
134 |
|
135 |
progress(0, desc= "Loading data")
|
136 |
|
|
|
243 |
min_topic_size = min_docs_slider,
|
244 |
nr_topics = max_topics_slider,
|
245 |
zeroshot_topic_list = zero_shot_topics_lower,
|
246 |
+
zeroshot_min_similarity = zero_shot_similarity, # 0.7
|
247 |
verbose = True)
|
248 |
|
249 |
topics_text, probs = topic_model.fit_transform(docs, embeddings_out)
|
|
|
478 |
|
479 |
with gr.Accordion("I have my own list of topics (zero shot topic modelling).", open = False):
|
480 |
candidate_topics = gr.File(label="Input topics from file (csv). File should have at least one column with a header and topic keywords in cells below. Topics will be taken from the first column of the file. Currently not compatible with low-resource embeddings.")
|
481 |
+
zero_shot_similarity = gr.Slider(minimum = 0.5, maximum = 1, value = 0.65, step = 0.001, label = "Minimum similarity value for document to be assigned to zero-shot topic.")
|
482 |
+
|
483 |
with gr.Row():
|
484 |
min_docs_slider = gr.Slider(minimum = 2, maximum = 1000, value = 15, step = 1, label = "Minimum number of similar documents needed to make a topic.")
|
485 |
max_topics_slider = gr.Slider(minimum = 2, maximum = 500, value = 10, step = 1, label = "Maximum number of topics")
|
|
|
523 |
in_files.upload(fn=put_columns_in_df, inputs=[in_files], outputs=[in_colnames, in_label, data_state, embeddings_state, output_single_text, topic_model_state])
|
524 |
in_colnames.change(dummy_function, in_colnames, None)
|
525 |
|
526 |
+
topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, in_label, anonymise_drop, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, zero_shot_similarity], outputs=[output_single_text, output_file, plot, embeddings_state, data_file_name_no_ext_state, topic_model_state, docs_state, label_list_state], api_name="topics")
|
527 |
|
528 |
reduce_outliers_btn.click(fn=reduce_outliers, inputs=[topic_model_state, docs_state, embeddings_state, data_file_name_no_ext_state, low_resource_mode_opt], outputs=[output_single_text, output_file, embeddings_state], api_name="reduce_outliers")
|
529 |
|