Spaces:
Running
Running
seanpedrickcase
commited on
Commit
·
04a15c5
1
Parent(s):
d80c8f5
Updated packages. Improve hierarchy vis. Better models - mixedbread and phi3. Now option to split texts into sentences before modelling.
Browse files- Dockerfile +45 -1
- README.md +10 -2
- app.py +10 -4
- funcs/anonymiser.py +14 -4
- funcs/embeddings.py +1 -1
- funcs/helper_functions.py +1 -1
- funcs/prompts.py +33 -1
- funcs/representation_model.py +34 -41
- funcs/topic_core_funcs.py +47 -12
- requirements.txt +9 -8
- requirements_gpu.txt +17 -0
Dockerfile
CHANGED
@@ -1,4 +1,16 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
WORKDIR /src
|
4 |
|
@@ -6,10 +18,36 @@ COPY requirements.txt .
|
|
6 |
|
7 |
RUN pip install --no-cache-dir -r requirements.txt
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
# Set up a new user named "user" with user ID 1000
|
10 |
RUN useradd -m -u 1000 user
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
# Switch to the "user" user
|
12 |
USER user
|
|
|
13 |
# Set home to the user's home directory
|
14 |
ENV HOME=/home/user \
|
15 |
PATH=/home/user/.local/bin:$PATH \
|
@@ -18,7 +56,11 @@ ENV HOME=/home/user \
|
|
18 |
GRADIO_ALLOW_FLAGGING=never \
|
19 |
GRADIO_NUM_PORTS=1 \
|
20 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
|
|
21 |
GRADIO_THEME=huggingface \
|
|
|
|
|
|
|
22 |
SYSTEM=spaces
|
23 |
|
24 |
# Set the working directory to the user's home directory
|
@@ -26,5 +68,7 @@ WORKDIR $HOME/app
|
|
26 |
|
27 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
28 |
COPY --chown=user . $HOME/app
|
|
|
|
|
29 |
|
30 |
CMD ["python", "app.py"]
|
|
|
1 |
+
# First stage: build dependencies
|
2 |
+
FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
|
3 |
+
|
4 |
+
# Install the Lambda web adapter in case you want to run with an AWS Lambda function URL
|
5 |
+
COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter
|
6 |
+
|
7 |
+
# Install wget and curl
|
8 |
+
RUN apt-get update && apt-get install -y \
|
9 |
+
wget \
|
10 |
+
curl
|
11 |
+
|
12 |
+
# Create a directory for the model
|
13 |
+
RUN mkdir /model
|
14 |
|
15 |
WORKDIR /src
|
16 |
|
|
|
18 |
|
19 |
RUN pip install --no-cache-dir -r requirements.txt
|
20 |
|
21 |
+
# Gradio needs to be installed after due to conflict with spacy in requirements
|
22 |
+
RUN pip install --no-cache-dir gradio==4.36.1
|
23 |
+
|
24 |
+
# Download the quantised phi model directly with curl
|
25 |
+
# Use the /resolve/ endpoint: the /tree/ URL serves the HTML file-browser page, not the model file itself.
# --fail makes the build stop on an HTTP error instead of saving the error page as the model.
RUN curl -L --fail -o Phi-3-mini-128k-instruct.Q4_K_M.gguf https://huggingface.co/QuantFactory/Phi-3-mini-128k-instruct-GGUF/resolve/main/Phi-3-mini-128k-instruct.Q4_K_M.gguf
|
26 |
+
|
27 |
+
# If needed, move the file to your desired directory in the Docker image
|
28 |
+
# Only /model is created above, so make the /model/rep sub-folder before moving the file into it
RUN mkdir -p /model/rep && mv Phi-3-mini-128k-instruct.Q4_K_M.gguf /model/rep/
|
29 |
+
|
30 |
+
# Download the Mixed bread embedding model during the build process
|
31 |
+
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
|
32 |
+
RUN apt-get install git-lfs -y
|
33 |
+
RUN git lfs install
|
34 |
+
RUN git clone https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1 /model/embed
|
35 |
+
RUN rm -rf /model/embed/.git
|
36 |
+
|
37 |
# Set up a new user named "user" with user ID 1000
|
38 |
RUN useradd -m -u 1000 user
|
39 |
+
|
40 |
+
# Change ownership of /home/user directory
|
41 |
+
RUN chown -R user:user /home/user
|
42 |
+
|
43 |
+
# Make output folder
|
44 |
+
RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
|
45 |
+
RUN mkdir -p /home/user/.cache/huggingface/hub && chown -R user:user /home/user/.cache/huggingface/hub
|
46 |
+
RUN mkdir -p /home/user/.cache/matplotlib && chown -R user:user /home/user/.cache/matplotlib
|
47 |
+
|
48 |
# Switch to the "user" user
|
49 |
USER user
|
50 |
+
|
51 |
# Set home to the user's home directory
|
52 |
ENV HOME=/home/user \
|
53 |
PATH=/home/user/.local/bin:$PATH \
|
|
|
56 |
GRADIO_ALLOW_FLAGGING=never \
|
57 |
GRADIO_NUM_PORTS=1 \
|
58 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
59 |
+
GRADIO_SERVER_PORT=7860 \
|
60 |
GRADIO_THEME=huggingface \
|
61 |
+
AWS_STS_REGIONAL_ENDPOINT=regional \
|
62 |
+
GRADIO_OUTPUT_FOLDER='output/' \
|
63 |
+
#GRADIO_ROOT_PATH=/data-text-search \
|
64 |
SYSTEM=spaces
|
65 |
|
66 |
# Set the working directory to the user's home directory
|
|
|
68 |
|
69 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
70 |
COPY --chown=user . $HOME/app
|
71 |
+
#COPY . $HOME/app
|
72 |
+
|
73 |
|
74 |
CMD ["python", "app.py"]
|
README.md
CHANGED
@@ -4,10 +4,18 @@ emoji: 🚀
|
|
4 |
colorFrom: red
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 4.
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: apache-2.0
|
11 |
---
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
colorFrom: red
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 4.36.1
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: apache-2.0
|
11 |
---
|
12 |
|
13 |
+
# Topic modeller
|
14 |
+
|
15 |
+
Generate topics from open text in tabular data, based on [BERTopic](https://maartengr.github.io/BERTopic/). Upload a data file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics. Click 'Extract topics' after you have selected the minimum similar documents per topic and maximum total topics. Duplicate this space, or clone to your computer to avoid queues here!
|
16 |
+
|
17 |
+
Uses TF-IDF-based embeddings by default, which are fast but not very performant in terms of cluster quality. Change to [Mixedbread large v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) model embeddings (512 dimensions, 8 bit quantisation) on the options page for topics of much higher quality, but slower processing time. If you have an embeddings .npz file previously made using this model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics for zero-shot modelling, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available under the 'Options' tab. Topic representation with LLMs currently based on [Phi-3-mini-128k-instruct-GGUF](https://huggingface.co/QuantFactory/Phi-3-mini-128k-instruct-GGUF), which is quite slow on CPU, so use a GPU-enabled computer if possible, building from the requirements_gpu.txt file in the base folder.
|
18 |
+
|
19 |
+
For small datasets, consider breaking up your text into sentences under 'Clean data' -> 'Split open text...' before topic modelling.
|
20 |
+
|
21 |
+
I suggest [Wikipedia mini dataset](https://huggingface.co/datasets/rag-datasets/mini_wikipedia/tree/main/data) for testing the tool here, choose passages.parquet.
|
app.py
CHANGED
@@ -10,6 +10,7 @@ from funcs.topic_core_funcs import pre_clean, extract_topics, reduce_outliers, r
|
|
10 |
from funcs.helper_functions import initial_file_load, custom_regex_load
|
11 |
from sklearn.feature_extraction.text import CountVectorizer
|
12 |
|
|
|
13 |
# Gradio app
|
14 |
|
15 |
block = gr.Blocks(theme = gr.themes.Base())
|
@@ -32,7 +33,9 @@ with block:
|
|
32 |
# Topic modeller
|
33 |
Generate topics from open text in tabular data, based on [BERTopic](https://maartengr.github.io/BERTopic/). Upload a data file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics. Click 'Extract topics' after you have selected the minimum similar documents per topic and maximum total topics. Duplicate this space, or clone to your computer to avoid queues here!
|
34 |
|
35 |
-
Uses fast TF-IDF-based embeddings by default, which are fast but not very performant in terms of cluster. Change to [
|
|
|
|
|
36 |
|
37 |
I suggest [Wikipedia mini dataset](https://huggingface.co/datasets/rag-datasets/mini_wikipedia/tree/main/data) for testing the tool here, choose passages.parquet.
|
38 |
""")
|
@@ -48,9 +51,10 @@ with block:
|
|
48 |
clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with > 1 digits, emails, postcodes (UK), custom regex.")
|
49 |
drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 char strings. May make old embedding files incompatible due to differing lengths.")
|
50 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective. This is slow!")
|
|
|
51 |
with gr.Row():
|
52 |
custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
|
53 |
-
gr.Markdown("""Import custom regex - csv table with one column of regex patterns with header. Example pattern: (?i)roosevelt for case insensitive removal of this term.""")
|
54 |
custom_regex_text = gr.Textbox(label="Custom regex load status")
|
55 |
clean_btn = gr.Button("Clean data")
|
56 |
|
@@ -108,7 +112,7 @@ with block:
|
|
108 |
|
109 |
# Clean data
|
110 |
custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_text, custom_regex_state])
|
111 |
-
clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state], api_name="clean")
|
112 |
|
113 |
# Extract topics
|
114 |
topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, embeddings_type_state, zero_shot_similarity, seed_number, calc_probs, vectoriser_state], outputs=[output_single_text, output_file, embeddings_state, embeddings_type_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state, assigned_topics_state], api_name="topics")
|
@@ -125,4 +129,6 @@ with block:
|
|
125 |
# Visualise topics
|
126 |
plot_btn.click(fn=visualise_topics, inputs=[topic_model_state, data_state, data_file_name_no_ext_state, low_resource_mode_opt, embeddings_state, in_label, in_colnames, legend_label, sample_slide, visualisation_type_radio, seed_number], outputs=[vis_output_single_text, out_plot_file, plot, plot_2], api_name="plot")
|
127 |
|
128 |
-
|
|
|
|
|
|
10 |
from funcs.helper_functions import initial_file_load, custom_regex_load
|
11 |
from sklearn.feature_extraction.text import CountVectorizer
|
12 |
|
13 |
+
|
14 |
# Gradio app
|
15 |
|
16 |
block = gr.Blocks(theme = gr.themes.Base())
|
|
|
33 |
# Topic modeller
|
34 |
Generate topics from open text in tabular data, based on [BERTopic](https://maartengr.github.io/BERTopic/). Upload a data file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics. Click 'Extract topics' after you have selected the minimum similar documents per topic and maximum total topics. Duplicate this space, or clone to your computer to avoid queues here!
|
35 |
|
36 |
+
Uses fast TF-IDF-based embeddings by default, which are fast but not very performant in terms of cluster. Change to [Mixedbread large v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) model embeddings (512 dimensions, 8 bit quantisation) on the options page for topics of much higher quality, but slower processing time. If you have an embeddings .npz file previously made using this model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics for zero-shot modelling, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available under the 'Options' tab. Topic representation with LLMs currently based on [Phi-3-mini-128k-instruct-GGUF](https://huggingface.co/QuantFactory/Phi-3-mini-128k-instruct-GGUF), which is quite slow on CPU, so use a GPU-enabled computer if possible, building from the requirements_gpu.txt file in the base folder.
|
37 |
+
|
38 |
+
For small datasets, consider breaking up your text into sentences under 'Clean data' -> 'Split open text...' before topic modelling.
|
39 |
|
40 |
I suggest [Wikipedia mini dataset](https://huggingface.co/datasets/rag-datasets/mini_wikipedia/tree/main/data) for testing the tool here, choose passages.parquet.
|
41 |
""")
|
|
|
51 |
clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Clean data - remove html, numbers with > 1 digits, emails, postcodes (UK), custom regex.")
|
52 |
drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 char strings. May make old embedding files incompatible due to differing lengths.")
|
53 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Personal details are redacted - not 100% effective. This is slow!")
|
54 |
+
split_sentence_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Split open text into sentences. Useful for small datasets.")
|
55 |
with gr.Row():
|
56 |
custom_regex = gr.UploadButton(label="Import custom regex file", file_count="multiple")
|
57 |
+
gr.Markdown("""Import custom regex - csv table with one column of regex patterns with no header. Example pattern: (?i)roosevelt for case insensitive removal of this term.""")
|
58 |
custom_regex_text = gr.Textbox(label="Custom regex load status")
|
59 |
clean_btn = gr.Button("Clean data")
|
60 |
|
|
|
112 |
|
113 |
# Clean data
|
114 |
custom_regex.upload(fn=custom_regex_load, inputs=[custom_regex], outputs=[custom_regex_text, custom_regex_state])
|
115 |
+
clean_btn.click(fn=pre_clean, inputs=[data_state, in_colnames, data_file_name_no_ext_state, custom_regex_state, clean_text, drop_duplicate_text, anonymise_drop, split_sentence_drop], outputs=[output_single_text, output_file, data_state, data_file_name_no_ext_state], api_name="clean")
|
116 |
|
117 |
# Extract topics
|
118 |
topics_btn.click(fn=extract_topics, inputs=[data_state, in_files, min_docs_slider, in_colnames, max_topics_slider, candidate_topics, data_file_name_no_ext_state, label_list_state, return_intermediate_files, embedding_super_compress, low_resource_mode_opt, save_topic_model, embeddings_state, embeddings_type_state, zero_shot_similarity, seed_number, calc_probs, vectoriser_state], outputs=[output_single_text, output_file, embeddings_state, embeddings_type_state, data_file_name_no_ext_state, topic_model_state, docs_state, vectoriser_state, assigned_topics_state], api_name="topics")
|
|
|
129 |
# Visualise topics
|
130 |
plot_btn.click(fn=visualise_topics, inputs=[topic_model_state, data_state, data_file_name_no_ext_state, low_resource_mode_opt, embeddings_state, in_label, in_colnames, legend_label, sample_slide, visualisation_type_radio, seed_number], outputs=[vis_output_single_text, out_plot_file, plot, plot_2], api_name="plot")
|
131 |
|
132 |
+
# Launch the Gradio app
|
133 |
+
if __name__ == "__main__":
|
134 |
+
block.queue().launch(show_error=True)#, server_name="0.0.0.0", ssl_verify=False, server_port=7860)
|
funcs/anonymiser.py
CHANGED
@@ -8,19 +8,21 @@ def spacy_model_installed(model_name):
|
|
8 |
import en_core_web_sm
|
9 |
en_core_web_sm.load()
|
10 |
print("Successfully imported spaCy model")
|
11 |
-
|
12 |
#print(nlp._path)
|
13 |
except:
|
14 |
download(model_name)
|
15 |
-
spacy.load(model_name)
|
16 |
print("Successfully imported spaCy model")
|
17 |
#print(nlp._path)
|
18 |
|
|
|
|
|
19 |
|
20 |
#if not is_model_installed(model_name):
|
21 |
# os.system(f"python -m spacy download {model_name}")
|
22 |
model_name = "en_core_web_sm"
|
23 |
-
spacy_model_installed(model_name)
|
24 |
|
25 |
#spacy.load(model_name)
|
26 |
# Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
|
@@ -41,7 +43,15 @@ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, PatternRecogn
|
|
41 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
42 |
from presidio_anonymizer.entities import OperatorConfig
|
43 |
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
def anon_consistent_names(df):
|
47 |
# ## Pick out common names and replace them with the same person value
|
|
|
8 |
import en_core_web_sm
|
9 |
en_core_web_sm.load()
|
10 |
print("Successfully imported spaCy model")
|
11 |
+
nlp = spacy.load("en_core_web_sm")
|
12 |
#print(nlp._path)
|
13 |
except:
|
14 |
download(model_name)
|
15 |
+
nlp = spacy.load(model_name)
|
16 |
print("Successfully imported spaCy model")
|
17 |
#print(nlp._path)
|
18 |
|
19 |
+
return nlp
|
20 |
+
|
21 |
|
22 |
#if not is_model_installed(model_name):
|
23 |
# os.system(f"python -m spacy download {model_name}")
|
24 |
model_name = "en_core_web_sm"
|
25 |
+
nlp = spacy_model_installed(model_name)
|
26 |
|
27 |
#spacy.load(model_name)
|
28 |
# Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
|
|
|
43 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
44 |
from presidio_anonymizer.entities import OperatorConfig
|
45 |
|
46 |
+
# Function to Split Text and Create DataFrame using SpaCy
|
47 |
+
def expand_sentences_spacy(df, colname, nlp=nlp):
|
48 |
+
expanded_data = []
|
49 |
+
df = df.reset_index(names='index')
|
50 |
+
for index, row in df.iterrows():
|
51 |
+
doc = nlp(row[colname])
|
52 |
+
for sent in doc.sents:
|
53 |
+
expanded_data.append({'document_index': row['index'], colname: sent.text})
|
54 |
+
return pd.DataFrame(expanded_data)
|
55 |
|
56 |
def anon_consistent_names(df):
|
57 |
# ## Pick out common names and replace them with the same person value
|
funcs/embeddings.py
CHANGED
@@ -47,7 +47,7 @@ def make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, em
|
|
47 |
print("Creating dense embeddings based on transformers model")
|
48 |
|
49 |
#embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
|
50 |
-
embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32) # For
|
51 |
|
52 |
toc = time.perf_counter()
|
53 |
time_out = f"The embedding took {toc - tic:0.1f} seconds"
|
|
|
47 |
print("Creating dense embeddings based on transformers model")
|
48 |
|
49 |
#embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar = True, batch_size = 32) # For Jina # #
|
50 |
+
embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar = True, batch_size = 32, precision="int8") # For large
|
51 |
|
52 |
toc = time.perf_counter()
|
53 |
time_out = f"The embedding took {toc - tic:0.1f} seconds"
|
funcs/helper_functions.py
CHANGED
@@ -144,7 +144,7 @@ def custom_regex_load(in_file):
|
|
144 |
regex_file_names = [string for string in file_list if "csv" in string.lower()]
|
145 |
if regex_file_names:
|
146 |
regex_file_name = regex_file_names[0]
|
147 |
-
custom_regex =
|
148 |
#regex_file_name_no_ext = get_file_path_end(regex_file_name)
|
149 |
|
150 |
output_text = "Data file loaded."
|
|
|
144 |
regex_file_names = [string for string in file_list if "csv" in string.lower()]
|
145 |
if regex_file_names:
|
146 |
regex_file_name = regex_file_names[0]
|
147 |
+
custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
|
148 |
#regex_file_name_no_ext = get_file_path_end(regex_file_name)
|
149 |
|
150 |
output_text = "Data file loaded."
|
funcs/prompts.py
CHANGED
@@ -103,4 +103,36 @@ Topic label:"""
|
|
103 |
|
104 |
stablelm_prompt = stablelm_example_prompt + stablelm_main_prompt
|
105 |
|
106 |
-
#print("StableLM prompt: ", stablelm_prompt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
stablelm_prompt = stablelm_example_prompt + stablelm_main_prompt
|
105 |
|
106 |
+
#print("StableLM prompt: ", stablelm_prompt)
|
107 |
+
|
108 |
+
|
109 |
+
phi3_start = "<|user|>"
|
110 |
+
phi3_example_prompt = """<|user|>
|
111 |
+
I have a topic that contains the following documents:
|
112 |
+
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
|
113 |
+
- Meat, but especially beef, is the word food in terms of emissions.
|
114 |
+
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
|
115 |
+
|
116 |
+
The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
|
117 |
+
|
118 |
+
Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
|
119 |
+
|
120 |
+
Topic label: Environmental impacts of eating meat
|
121 |
+
"""
|
122 |
+
|
123 |
+
# Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
|
124 |
+
phi3_main_prompt = """
|
125 |
+
Now, create a new topic label given the following information.
|
126 |
+
|
127 |
+
I have a topic that contains the following documents:
|
128 |
+
[DOCUMENTS]
|
129 |
+
|
130 |
+
The topic is described by the following keywords: '[KEYWORDS]'.
|
131 |
+
|
132 |
+
Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.<|end|>
|
133 |
+
<|assistant|>
|
134 |
+
Topic label:"""
|
135 |
+
|
136 |
+
phi3_prompt = phi3_example_prompt + phi3_main_prompt
|
137 |
+
|
138 |
+
#print("phi3 prompt: ", phi3_prompt)
|
funcs/representation_model.py
CHANGED
@@ -6,12 +6,12 @@ import torch.cuda
|
|
6 |
from huggingface_hub import hf_hub_download, snapshot_download
|
7 |
|
8 |
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, BaseRepresentation
|
9 |
-
from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
|
10 |
|
11 |
random_seed = 42
|
12 |
|
13 |
-
chosen_prompt = open_hermes_prompt # stablelm_prompt
|
14 |
-
chosen_start_tag = open_hermes_start # stablelm_start
|
15 |
|
16 |
|
17 |
# Currently set n_gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
|
@@ -91,13 +91,14 @@ mmr = MaximalMarginalRelevance(diversity=0.5)
|
|
91 |
base_rep = BaseRepresentation()
|
92 |
|
93 |
# Find model file
|
94 |
-
def find_model_file(hf_model_name, hf_model_file, search_folder):
|
95 |
hf_loc = search_folder #os.environ["HF_HOME"]
|
96 |
-
hf_sub_loc = search_folder +
|
97 |
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
101 |
|
102 |
def find_file(root_folder, file_name):
|
103 |
for root, dirs, files in os.walk(root_folder):
|
@@ -109,36 +110,11 @@ def find_model_file(hf_model_name, hf_model_file, search_folder):
|
|
109 |
folder_path = hf_model_name_path # Replace with your folder path
|
110 |
file_to_find = hf_model_file # Replace with the file name you're looking for
|
111 |
|
112 |
-
|
113 |
-
if found_file:
|
114 |
-
print(f"Model file found: {found_file}")
|
115 |
-
return found_file
|
116 |
-
else:
|
117 |
-
error = "File not found."
|
118 |
-
print(error, " Downloading model from hub")
|
119 |
-
|
120 |
-
# Specify your custom directory
|
121 |
-
# Get HF_HOME environment variable or default to "~/.cache/huggingface/hub"
|
122 |
-
#hf_home_value = search_folder
|
123 |
-
|
124 |
-
# Check if the directory exists, create it if it doesn't
|
125 |
-
#if not os.path.exists(hf_home_value):
|
126 |
-
# os.makedirs(hf_home_value)
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
found_file = hf_hub_download(repo_id=hf_model_name, filename=hf_model_file)#, local_dir=hf_home_value) # cache_dir
|
131 |
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
# local_files_only=False
|
136 |
-
#)
|
137 |
-
|
138 |
-
print("Downloaded model to: ", found_file)
|
139 |
-
|
140 |
-
#found_file = find_file(path, file_to_find)
|
141 |
-
return found_file
|
142 |
|
143 |
|
144 |
def create_representation_model(representation_type, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode):
|
@@ -151,7 +127,7 @@ def create_representation_model(representation_type, llm_config, hf_model_name,
|
|
151 |
|
152 |
# Check for HF_HOME environment variable and supply a default value if it's not found (typical location for huggingface models)
|
153 |
# Get HF_HOME environment variable or default to "~/.cache/huggingface/hub"
|
154 |
-
base_folder = "
|
155 |
hf_home_value = os.getenv("HF_HOME", base_folder)
|
156 |
|
157 |
# Expand the user symbol '~' to the full home directory path
|
@@ -162,12 +138,29 @@ def create_representation_model(representation_type, llm_config, hf_model_name,
|
|
162 |
if not os.path.exists(hf_home_value):
|
163 |
os.makedirs(hf_home_value)
|
164 |
|
165 |
-
print(hf_home_value)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
|
167 |
-
|
168 |
|
169 |
-
llm = Llama(model_path=found_file, stop=chosen_start_tag, n_gpu_layers=llm_config.n_gpu_layers, n_ctx=llm_config.n_ctx,
|
170 |
#print(llm.n_gpu_layers)
|
|
|
171 |
llm_model = LlamaCPP(llm, prompt=chosen_prompt)#, **gen_config.model_dump())
|
172 |
|
173 |
# All representation models
|
|
|
6 |
from huggingface_hub import hf_hub_download, snapshot_download
|
7 |
|
8 |
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, BaseRepresentation
|
9 |
+
from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start, phi3_prompt, phi3_start
|
10 |
|
11 |
random_seed = 42
|
12 |
|
13 |
+
chosen_prompt = phi3_prompt #open_hermes_prompt # stablelm_prompt
|
14 |
+
chosen_start_tag = phi3_start #open_hermes_start # stablelm_start
|
15 |
|
16 |
|
17 |
# Currently set n_gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
|
|
|
91 |
base_rep = BaseRepresentation()
|
92 |
|
93 |
# Find model file
|
94 |
+
def find_model_file(hf_model_name, hf_model_file, search_folder, sub_folder):
|
95 |
hf_loc = search_folder #os.environ["HF_HOME"]
|
96 |
+
hf_sub_loc = search_folder + sub_folder #os.environ["HF_HOME"]
|
97 |
|
98 |
+
if sub_folder == "/hub/":
|
99 |
+
hf_model_name_path = hf_sub_loc + 'models--' + hf_model_name.replace("/","--")
|
100 |
+
else:
|
101 |
+
hf_model_name_path = hf_sub_loc
|
102 |
|
103 |
def find_file(root_folder, file_name):
|
104 |
for root, dirs, files in os.walk(root_folder):
|
|
|
110 |
folder_path = hf_model_name_path # Replace with your folder path
|
111 |
file_to_find = hf_model_file # Replace with the file name you're looking for
|
112 |
|
113 |
+
print("Searching for model file", hf_model_file, "in:", hf_model_name_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
+
found_file = find_file(folder_path, file_to_find) # os.environ["HF_HOME"]
|
116 |
+
|
117 |
+
return found_file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
|
119 |
|
120 |
def create_representation_model(representation_type, llm_config, hf_model_name, hf_model_file, chosen_start_tag, low_resource_mode):
|
|
|
127 |
|
128 |
# Check for HF_HOME environment variable and supply a default value if it's not found (typical location for huggingface models)
|
129 |
# Get HF_HOME environment variable or default to the local "model" folder
|
130 |
+
base_folder = "model" #"~/.cache/huggingface/hub"
|
131 |
hf_home_value = os.getenv("HF_HOME", base_folder)
|
132 |
|
133 |
# Expand the user symbol '~' to the full home directory path
|
|
|
138 |
if not os.path.exists(hf_home_value):
|
139 |
os.makedirs(hf_home_value)
|
140 |
|
141 |
+
print("Searching base folder for model:", hf_home_value)
|
142 |
+
|
143 |
+
found_file = find_model_file(hf_model_name, hf_model_file, hf_home_value, "/rep/")
|
144 |
+
|
145 |
+
if found_file:
|
146 |
+
print(f"Model file found in model folder: {found_file}")
|
147 |
+
|
148 |
+
else:
|
149 |
+
found_file = find_model_file(hf_model_name, hf_model_file, hf_home_value, "/hub/")
|
150 |
+
|
151 |
+
if not found_file:
|
152 |
+
error = "File not found in HF hub directory or in local model file."
|
153 |
+
print(error, " Downloading model from hub")
|
154 |
+
|
155 |
+
found_file = hf_hub_download(repo_id=hf_model_name, filename=hf_model_file)#, local_dir=hf_home_value) # cache_dir
|
156 |
+
|
157 |
+
print("Downloaded model from Huggingface Hub to: ", found_file)
|
158 |
|
159 |
+
print("Loading representation model with", llm_config.n_gpu_layers, "layers allocated to GPU.")
|
160 |
|
161 |
+
llm = Llama(model_path=found_file, stop=chosen_start_tag, n_gpu_layers=llm_config.n_gpu_layers, n_ctx=llm_config.n_ctx,seed=seed) #**llm_config.model_dump())# rope_freq_scale=0.5,
|
162 |
#print(llm.n_gpu_layers)
|
163 |
+
print("Chosen prompt:", chosen_prompt)
|
164 |
llm_model = LlamaCPP(llm, prompt=chosen_prompt)#, **gen_config.model_dump())
|
165 |
|
166 |
# All representation models
|
funcs/topic_core_funcs.py
CHANGED
@@ -9,6 +9,7 @@ import time
|
|
9 |
from bertopic import BERTopic
|
10 |
|
11 |
from funcs.clean_funcs import initial_clean
|
|
|
12 |
from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder, save_topic_outputs
|
13 |
from funcs.embeddings import make_or_load_embeddings
|
14 |
from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
|
@@ -47,13 +48,13 @@ today = datetime.now().strftime("%d%m%Y")
|
|
47 |
today_rev = datetime.now().strftime("%Y%m%d")
|
48 |
|
49 |
# Load embeddings
|
50 |
-
embeddings_name = "BAAI/
|
51 |
|
52 |
# LLM model used for representing topics
|
53 |
-
hf_model_name = 'second-state/stablelm-2-zephyr-1.6b-GGUF' #'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF'
|
54 |
-
hf_model_file = 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf' # 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf'
|
55 |
|
56 |
-
def pre_clean(data, in_colnames, data_file_name_no_ext, custom_regex, clean_text, drop_duplicate_text, anonymise_drop, progress=gr.Progress(track_tqdm=True)):
|
57 |
|
58 |
output_text = ""
|
59 |
output_list = []
|
@@ -116,6 +117,19 @@ def pre_clean(data, in_colnames, data_file_name_no_ext, custom_regex, clean_text
|
|
116 |
anon_toc = time.perf_counter()
|
117 |
time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
|
118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
out_data_name = data_file_name_no_ext + "_" + today_rev + ".csv"
|
120 |
data.to_csv(out_data_name)
|
121 |
output_list.append(out_data_name)
|
@@ -159,15 +173,36 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
|
|
159 |
print("Low resource mode: ", low_resource_mode)
|
160 |
|
161 |
if low_resource_mode == "No":
|
162 |
-
print("Using high resource
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
|
164 |
-
embedding_model = SentenceTransformer(embeddings_name)
|
165 |
|
166 |
# If tfidf embeddings currently exist, wipe these empty
|
167 |
if embeddings_type_state == "tfidf":
|
168 |
embeddings_out = np.array([])
|
169 |
|
170 |
-
embeddings_type_state = "
|
171 |
|
172 |
# UMAP model uses Bertopic defaults
|
173 |
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=random_seed)
|
@@ -180,8 +215,8 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
|
|
180 |
TruncatedSVD(100, random_state=random_seed)
|
181 |
)
|
182 |
|
183 |
-
# If
|
184 |
-
if embeddings_type_state == "
|
185 |
embeddings_out = np.array([])
|
186 |
|
187 |
embeddings_type_state = "tfidf"
|
@@ -316,9 +351,9 @@ def extract_topics(data, in_files, min_docs_slider, in_colnames, max_topics_slid
|
|
316 |
embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
|
317 |
else:
|
318 |
if embeddings_super_compress == "No":
|
319 |
-
embeddings_file_name = data_file_name_no_ext + '_' + '
|
320 |
else:
|
321 |
-
embeddings_file_name = data_file_name_no_ext + '_' + '
|
322 |
|
323 |
np.savez_compressed(embeddings_file_name, embeddings_out)
|
324 |
|
@@ -516,7 +551,7 @@ def visualise_topics(topic_model, data, data_file_name_no_ext, low_resource_mode
|
|
516 |
|
517 |
|
518 |
#try:
|
519 |
-
topics_vis, hierarchy_df, hierarchy_topic_names = visualize_hierarchical_documents_custom(topic_model, docs, label_list, hierarchical_topics, reduced_embeddings=reduced_embeddings, sample = sample_prop, hide_document_hover= False, custom_labels=True, width= 1200, height = 750)
|
520 |
topics_vis_2 = visualize_hierarchy_custom(topic_model, hierarchical_topics=hierarchical_topics, width= 1200, height = 750)
|
521 |
|
522 |
# Write hierarchical topics levels to df
|
|
|
9 |
from bertopic import BERTopic
|
10 |
|
11 |
from funcs.clean_funcs import initial_clean
|
12 |
+
from funcs.anonymiser import expand_sentences_spacy
|
13 |
from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder, save_topic_outputs
|
14 |
from funcs.embeddings import make_or_load_embeddings
|
15 |
from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
|
|
|
48 |
today_rev = datetime.now().strftime("%Y%m%d")
|
49 |
|
50 |
# Load embeddings
|
51 |
+
embeddings_name = "mixedbread-ai/mxbai-embed-large-v1" #"BAAI/large-small-en-v1.5" #"jinaai/jina-embeddings-v2-base-en"
|
52 |
|
53 |
# LLM model used for representing topics
|
54 |
+
hf_model_name = "QuantFactory/Phi-3-mini-128k-instruct-GGUF"#'second-state/stablelm-2-zephyr-1.6b-GGUF' #'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF'
|
55 |
+
hf_model_file = "Phi-3-mini-128k-instruct.Q4_K_M.gguf"#'stablelm-2-zephyr-1_6b-Q5_K_M.gguf' # 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf'
|
56 |
|
57 |
+
def pre_clean(data, in_colnames, data_file_name_no_ext, custom_regex, clean_text, drop_duplicate_text, anonymise_drop, sentence_split_drop, progress=gr.Progress(track_tqdm=True)):
|
58 |
|
59 |
output_text = ""
|
60 |
output_list = []
|
|
|
117 |
anon_toc = time.perf_counter()
|
118 |
time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
|
119 |
|
120 |
+
if sentence_split_drop == "Yes":
|
121 |
+
progress(0.6, desc= "Splitting text into sentences")
|
122 |
+
|
123 |
+
data_file_name_no_ext = data_file_name_no_ext + "_split"
|
124 |
+
|
125 |
+
anon_tic = time.perf_counter()
|
126 |
+
|
127 |
+
data = expand_sentences_spacy(data, in_colnames_list_first)
|
128 |
+
data = data[data[in_colnames_list_first].str.len() >= 5] # Keep only rows with at least 5 characters
|
129 |
+
|
130 |
+
anon_toc = time.perf_counter()
|
131 |
+
time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
|
132 |
+
|
133 |
out_data_name = data_file_name_no_ext + "_" + today_rev + ".csv"
|
134 |
data.to_csv(out_data_name)
|
135 |
output_list.append(out_data_name)
|
|
|
173 |
print("Low resource mode: ", low_resource_mode)
|
174 |
|
175 |
if low_resource_mode == "No":
|
176 |
+
print("Using high resource embedding model")
|
177 |
+
|
178 |
+
# Define a list of possible local locations to search for the model
|
179 |
+
local_embeddings_locations = [
|
180 |
+
"model/embed/", # Potential local location
|
181 |
+
"/model/embed/", # Potential location in Docker container
|
182 |
+
"/home/user/app/model/embed/" # This is inside a Docker container
|
183 |
+
]
|
184 |
+
|
185 |
+
# Attempt to load the model from each local location
|
186 |
+
for location in local_embeddings_locations:
|
187 |
+
try:
|
188 |
+
embedding_model = SentenceTransformer(location, truncate_dim=512)
|
189 |
+
print(f"Found local model installation at: {location}")
|
190 |
+
break # Exit the loop if the model is found
|
191 |
+
except Exception as e:
|
192 |
+
print(f"Failed to load model from {location}: {e}")
|
193 |
+
continue
|
194 |
+
else:
|
195 |
+
# If the loop completes without finding the model in any local location
|
196 |
+
embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
|
197 |
+
print("Could not find local model installation. Downloading from Huggingface")
|
198 |
|
199 |
+
#embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
|
200 |
|
201 |
# If tfidf embeddings currently exist, clear them before switching to the large embedding model
|
202 |
if embeddings_type_state == "tfidf":
|
203 |
embeddings_out = np.array([])
|
204 |
|
205 |
+
embeddings_type_state = "large"
|
206 |
|
207 |
# UMAP model uses Bertopic defaults
|
208 |
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', low_memory=False, random_state=random_seed)
|
|
|
215 |
TruncatedSVD(100, random_state=random_seed)
|
216 |
)
|
217 |
|
218 |
+
# If large-model embeddings currently exist, clear them, then set the embeddings type to tfidf
|
219 |
+
if embeddings_type_state == "large":
|
220 |
embeddings_out = np.array([])
|
221 |
|
222 |
embeddings_type_state = "tfidf"
|
|
|
351 |
embeddings_file_name = data_file_name_no_ext + '_' + 'tfidf_embeddings.npz'
|
352 |
else:
|
353 |
if embeddings_super_compress == "No":
|
354 |
+
embeddings_file_name = data_file_name_no_ext + '_' + 'large_embeddings.npz'
|
355 |
else:
|
356 |
+
embeddings_file_name = data_file_name_no_ext + '_' + 'large_embeddings_compress.npz'
|
357 |
|
358 |
np.savez_compressed(embeddings_file_name, embeddings_out)
|
359 |
|
|
|
551 |
|
552 |
|
553 |
#try:
|
554 |
+
topics_vis, hierarchy_df, hierarchy_topic_names = visualize_hierarchical_documents_custom(topic_model, docs, label_list, hierarchical_topics, hide_annotations=True, reduced_embeddings=reduced_embeddings, sample = sample_prop, hide_document_hover= False, custom_labels=True, width= 1200, height = 750)
|
555 |
topics_vis_2 = visualize_hierarchy_custom(topic_model, hierarchical_topics=hierarchical_topics, width= 1200, height = 750)
|
556 |
|
557 |
# Write hierarchical topics levels to df
|
requirements.txt
CHANGED
@@ -1,15 +1,16 @@
|
|
1 |
-
gradio
|
2 |
-
transformers==4.
|
3 |
accelerate==0.26.1
|
4 |
-
torch==2.1
|
5 |
-
llama-cpp-python==0.2.
|
6 |
-
bertopic==0.16.
|
7 |
-
spacy==3.7.
|
8 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
9 |
pyarrow==14.0.2
|
10 |
openpyxl==3.1.2
|
11 |
Faker==22.2.0
|
12 |
-
presidio_analyzer==2.2.
|
13 |
-
presidio_anonymizer==2.2.
|
14 |
scipy==1.11.4
|
15 |
polars==0.20.6
|
|
|
|
1 |
+
gradio
|
2 |
+
transformers==4.41.2
|
3 |
accelerate==0.26.1
|
4 |
+
torch==2.3.1
|
5 |
+
llama-cpp-python==0.2.79
|
6 |
+
bertopic==0.16.2
|
7 |
+
spacy==3.7.4
|
8 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
9 |
pyarrow==14.0.2
|
10 |
openpyxl==3.1.2
|
11 |
Faker==22.2.0
|
12 |
+
presidio_analyzer==2.2.354
|
13 |
+
presidio_anonymizer==2.2.354
|
14 |
scipy==1.11.4
|
15 |
polars==0.20.6
|
16 |
+
numpy==1.26.4
|
requirements_gpu.txt
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
transformers==4.41.2
|
3 |
+
accelerate==0.26.1
|
4 |
+
torch==2.3.1
|
5 |
+
bertopic==0.16.2
|
6 |
+
spacy==3.7.4
|
7 |
+
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
|
8 |
+
pyarrow==14.0.2
|
9 |
+
openpyxl==3.1.2
|
10 |
+
Faker==22.2.0
|
11 |
+
presidio_analyzer==2.2.354
|
12 |
+
presidio_anonymizer==2.2.354
|
13 |
+
scipy==1.11.4
|
14 |
+
polars==0.20.6
|
15 |
+
torch --index-url https://download.pytorch.org/whl/cu121
|
16 |
+
llama-cpp-python==0.2.77 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
|
17 |
+
numpy==1.26.4
|