seanpedrickcase committed on
Commit 22ca76e · 1 Parent(s): 3c1c3de

Allowed the app running on AWS to use a smaller embedding model and not to load the representation LLM (due to size restrictions).

Dockerfile CHANGED
@@ -35,15 +35,19 @@ RUN mkdir -p /home/user/.cache/matplotlib && chown -R user:user /home/user/.cach
 RUN mkdir -p /home/user/app/model/rep && chown -R user:user /home/user/app/model/rep
 RUN mkdir -p /home/user/app/model/embed && chown -R user:user /home/user/app/model/embed
 
-# Download the quantised phi model directly with curl
-RUN curl -L -o /home/user/app/model/rep/Phi-3-mini-128k-instruct.Q4_K_M.gguf https://huggingface.co/QuantFactory/Phi-3-mini-128k-instruct-GGUF/tree/main/Phi-3-mini-128k-instruct.Q4_K_M.gguf
-
-# Download the Mixed bread embedding model during the build process
-RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
-RUN apt-get install git-lfs -y
-RUN git lfs install
-RUN git clone https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1 /home/user/app/model/embed
-RUN rm -rf /home/user/app/model/embed/.git
+# Download the quantised phi model directly with curl. Commented out as the model is too big - not loaded.
+#RUN curl -L -o /home/user/app/model/rep/Phi-3.1-mini-128k-instruct-Q4_K_M.gguf https://huggingface.co/bartowski/Phi-3.1-mini-128k-instruct-GGUF/tree/main/Phi-3.1-mini-128k-instruct-Q4_K_M.gguf
+
+# Download the Mixedbread embedding model during the build process. Commented out as it is too big for AWS - not loaded.
+#RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
+#RUN apt-get install git-lfs -y
+#RUN git lfs install
+#RUN git clone https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1 /home/user/app/model/embed
+#RUN rm -rf /home/user/app/model/embed/.git
+
+# Download the embedding model during the build process: fetch the specific model files into the model directory using huggingface_hub
+COPY download_model.py /src/download_model.py
+RUN python /src/download_model.py
 
 # Switch to the "user" user
 USER user
app.py CHANGED
@@ -39,7 +39,7 @@ with block:
 # Topic modeller
 Generate topics from open text in tabular data, based on [BERTopic](https://maartengr.github.io/BERTopic/). Upload a data file (csv, xlsx, or parquet), then specify the open text column that you want to use to generate topics. Click 'Extract topics' after you have selected the minimum similar documents per topic and maximum total topics. Duplicate this space, or clone to your computer to avoid queues here!
 
-Uses fast TF-IDF-based embeddings by default, which are fast but does not lead to high quality clusering. Change to higher quality [mxbai-embed-large-v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) model embeddings (512 dimensions) for better results but slower processing time. If you have an embeddings .npz file previously made using this model, you can load this in at the same time to skip the first modelling step. If you have a pre-defined list of topics for zero-shot modelling, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available such as maximum topics allowed, minimum documents per topic etc.. Topic representation with LLMs currently based on [Phi-3-mini-128k-instruct-GGUF](https://huggingface.co/QuantFactory/Phi-3-mini-128k-instruct-GGUF), which is quite slow on CPU, so use a GPU-enabled computer if possible, building from the requirements_gpu.txt file in the base folder.
+Uses TF-IDF-based embeddings by default, which are fast but do not lead to high-quality clustering. Change to the higher quality [mxbai-embed-large-v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) model embeddings (512 dimensions) for better results but slower processing time. If you have an embeddings .npz file previously made using this model, you can load it in at the same time to skip the first modelling step. If you have a pre-defined list of topics for zero-shot modelling, you can upload this as a csv file under 'I have my own list of topics...'. Further configuration options are available, such as maximum topics allowed, minimum documents per topic etc. Topic representation with LLMs is currently based on [Phi-3.1-mini-128k-instruct-GGUF](https://huggingface.co/bartowski/Phi-3.1-mini-128k-instruct-GGUF), which is quite slow on CPU, so use a GPU-enabled computer if possible, building from the requirements_gpu.txt file in the base folder.
 
 For small datasets, consider breaking up your text into sentences under 'Clean data' -> 'Split open text...' before topic modelling.
 
download_model.py ADDED
@@ -0,0 +1,16 @@
+from huggingface_hub import hf_hub_download
+
+# Define the repository and files to download
+repo_id = "sentence-transformers/all-MiniLM-L6-v2" #"BAAI/bge-small-en-v1.5"
+files_to_download = [
+    "config.json",
+    "config_sentence_transformers.json",
+    "model.safetensors",
+    "tokenizer_config.json",
+    "vocab.txt"
+]
+
+# Download each file and save it to the /model/embed directory
+for file_name in files_to_download:
+    print("Checking for file", file_name)
+    hf_hub_download(repo_id=repo_id, filename=file_name, local_dir="/model/embed") #"/model/bge"
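
As a quick sanity check of the downloaded files, the local directory can be loaded directly with sentence-transformers. This is a minimal sketch, not part of this commit, and it assumes the files listed above are sufficient for SentenceTransformer to rebuild the model from /model/embed:

from sentence_transformers import SentenceTransformer

# Load the model from the locally downloaded files instead of the Hugging Face Hub
model = SentenceTransformer("/model/embed")

# Embed two short documents; all-MiniLM-L6-v2 produces 384-dimensional vectors
embeddings = model.encode(["first document", "second document"])
print(embeddings.shape)  # expected: (2, 384)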
funcs/representation_model.py CHANGED
@@ -4,16 +4,21 @@ from llama_cpp import Llama
 from pydantic import BaseModel
 import torch.cuda
 from huggingface_hub import hf_hub_download
+from gradio import Warning
 
 from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, BaseRepresentation
 from funcs.embeddings import torch_device
 from funcs.prompts import phi3_prompt, phi3_start
+from funcs.helper_functions import get_or_create_env_var
 
 chosen_prompt = phi3_prompt #open_hermes_prompt # stablelm_prompt
 chosen_start_tag = phi3_start #open_hermes_start # stablelm_start
 
 random_seed = 42
 
+RUNNING_ON_AWS = get_or_create_env_var('RUNNING_ON_AWS', '0')
+print(f'The value of RUNNING_ON_AWS is {RUNNING_ON_AWS}')
+
 # Currently set n_gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
 print("torch device for representation functions:", torch_device)
 if torch_device == "gpu":
@@ -140,6 +145,14 @@ def create_representation_model(representation_type: str, llm_config: dict, hf_m
     """
 
     if representation_type == "LLM":
+        print("RUNNING_ON_AWS:", RUNNING_ON_AWS)
+        if RUNNING_ON_AWS=="1":
+            error_message = "LLM representation not available on AWS due to model size restrictions. Returning base representation"
+            Warning(error_message, duration=5)
+            print(error_message)
+            representation_model = {"LLM":base_rep}
+            return representation_model
+
         print("Generating LLM representation")
         # Use llama.cpp to load in model
 
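The get_or_create_env_var helper imported above is not included in this diff; the following is a minimal sketch of the assumed behaviour (read an environment variable, falling back to a default), for illustration only:

import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Return the environment variable if set; otherwise set it to the default and return that
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value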
 
funcs/topic_core_funcs.py CHANGED
@@ -13,10 +13,10 @@ PandasDataFrame = Type[pd.DataFrame]
 
 from funcs.clean_funcs import initial_clean, regex_clean
 from funcs.anonymiser import expand_sentences_spacy
-from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder, save_topic_outputs, output_folder
+from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder, save_topic_outputs, output_folder, get_or_create_env_var
 from funcs.embeddings import make_or_load_embeddings, torch_device
 from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
-from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag, random_seed
+from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag, random_seed, RUNNING_ON_AWS
 
 from sklearn.feature_extraction.text import CountVectorizer
 
@@ -36,11 +36,14 @@ today = datetime.now().strftime("%d%m%Y")
 today_rev = datetime.now().strftime("%Y%m%d")
 
 # Load embeddings
-embeddings_name = "mixedbread-ai/mxbai-embed-large-v1" #"BAAI/large-small-en-v1.5" #"jinaai/jina-embeddings-v2-base-en"
+if RUNNING_ON_AWS=="0":
+    embeddings_name = "mixedbread-ai/mxbai-embed-large-v1" #"BAAI/large-small-en-v1.5" #"jinaai/jina-embeddings-v2-base-en"
+else:
+    embeddings_name = "sentence-transformers/all-MiniLM-L6-v2"
 
 # LLM model used for representing topics
-hf_model_name = "QuantFactory/Phi-3-mini-128k-instruct-GGUF"#'second-state/stablelm-2-zephyr-1.6b-GGUF' #'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF'
-hf_model_file = "Phi-3-mini-128k-instruct.Q4_K_M.gguf"#'stablelm-2-zephyr-1_6b-Q5_K_M.gguf' # 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf'
+hf_model_name = "bartowski/Phi-3.1-mini-128k-instruct-GGUF"#'second-state/stablelm-2-zephyr-1.6b-GGUF' #'TheBloke/phi-2-orange-GGUF' #'NousResearch/Nous-Capybara-7B-V1.9-GGUF'
+hf_model_file = "Phi-3.1-mini-128k-instruct-Q4_K_M.gguf"#'stablelm-2-zephyr-1_6b-Q5_K_M.gguf' # 'phi-2-orange.Q5_K_M.gguf' #'Capybara-7B-V1.9-Q5_K_M.gguf'
 
 # When topic modelling column is chosen, change the default visualisation column to the same
 def change_default_vis_col(in_colnames:List[str]):
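
For context, a minimal sketch (not part of this commit) of how the hf_model_name / hf_model_file pair defined above is typically downloaded and loaded with llama-cpp-python; the context size is illustrative and this assumes the usual hf_hub_download + Llama pattern used in representation_model.py:

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Fetch the quantised GGUF file from the Hugging Face Hub (cached after the first call)
model_path = hf_hub_download(repo_id=hf_model_name, filename=hf_model_file)

# Load the model on CPU; n_gpu_layers stays at 0, as noted in representation_model.py
llm = Llama(model_path=model_path, n_ctx=4096, n_gpu_layers=0, seed=random_seed)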
requirements.txt CHANGED
@@ -2,7 +2,7 @@ gradio # Not specified version due to interaction with spacy - reinstall latest
 boto3
 transformers==4.41.2
 accelerate==0.26.1
-torch==2.3.1
+torch==2.4.0
 bertopic==0.16.2
 spacy==3.7.4
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz
@@ -14,5 +14,5 @@ presidio_anonymizer==2.2.354
 scipy==1.11.4
 polars==0.20.6
 sentence-transformers==3.0.1
-llama-cpp-python==0.2.79 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+llama-cpp-python==0.2.87 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
 numpy==1.26.4
requirements_gpu.txt CHANGED
@@ -12,7 +12,8 @@ presidio_analyzer==2.2.354
 presidio_anonymizer==2.2.354
 scipy==1.11.4
 polars==0.20.6
+llama-cpp-python==0.2.87 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
 torch --index-url https://download.pytorch.org/whl/cu121
-llama-cpp-python==0.2.77 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
-numpy==1.26.4
 sentence-transformers==3.0.1
+numpy==1.26.4
+