Commit 6e43359 ("first") by Bimal Bhattarai
Browse files:
- .env +7 -0
- .gitattributes +3 -0
- __pycache__/constants.cpython-311.pyc +0 -0
- __pycache__/constants.cpython-39.pyc +0 -0
- app.py +105 -0
- app_v2.py +118 -0
- constants.py +14 -0
- db/687b62b3-4364-499d-9927-b084c01b0b0b/data_level0.bin +3 -0
- db/687b62b3-4364-499d-9927-b084c01b0b0b/header.bin +3 -0
- db/687b62b3-4364-499d-9927-b084c01b0b0b/index_metadata.pickle +3 -0
- db/687b62b3-4364-499d-9927-b084c01b0b0b/length.bin +3 -0
- db/687b62b3-4364-499d-9927-b084c01b0b0b/link_lists.bin +3 -0
- db/chroma.sqlite3 +3 -0
- ingest.py +147 -0
- requirements.txt +15 -0
.env
ADDED
@@ -0,0 +1,7 @@
PERSIST_DIRECTORY=db
MODEL_TYPE=GPT4All
MODEL_PATH=models/ggml-gpt4all-j-v1.3-groovy.bin
EMBEDDINGS_MODEL_NAME=all-MiniLM-L6-v2
MODEL_N_CTX=1000
MODEL_N_BATCH=8
TARGET_SOURCE_CHUNKS=5
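
Both apps and the ingestion script read these keys at startup through python-dotenv. A minimal sketch of how they are consumed (mirroring app.py; note that TARGET_SOURCE_CHUNKS falls back to 4 in code even though the file sets 5):

import os
from dotenv import load_dotenv

# load_dotenv() returns False when no .env is found or it is empty,
# which the apps treat as a fatal error.
if not load_dotenv():
    raise SystemExit("Could not load .env file")

persist_directory = os.environ.get('PERSIST_DIRECTORY')          # "db"
embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')  # "all-MiniLM-L6-v2"
target_source_chunks = int(os.environ.get('TARGET_SOURCE_CHUNKS', 4))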
.gitattributes
ADDED
@@ -0,0 +1,3 @@
*.sqlite3 filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
__pycache__/constants.cpython-311.pyc
ADDED
Binary file (669 Bytes).
__pycache__/constants.cpython-39.pyc
ADDED
Binary file (471 Bytes).
app.py
ADDED
@@ -0,0 +1,105 @@
from langchain import PromptTemplate, LLMChain
from langchain.llms import CTransformers
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings
from io import BytesIO
from langchain.document_loaders import PyPDFLoader
import gradio as gr
import chromadb
from dotenv import load_dotenv
from constants import CHROMA_SETTINGS

# Load the environment before reading any variables from it
if not load_dotenv():
    print("Could not load .env file or it is empty. Please check if it exists and is readable.")
    exit(1)

local_llm = "mistralai/Mistral-7B-v0.1"

# Generation settings for the local model
config = {
    'max_new_tokens': 1024,
    'repetition_penalty': 1.1,
    'temperature': 0.1,
    'top_k': 50,
    'top_p': 0.9,
    'stream': True,
    'threads': int(os.cpu_count() / 2)
}

llm = CTransformers(
    model=local_llm,
    model_type="mistral",
    lib="avx2",  # for CPU use
    **config
)

embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')
persist_directory = os.environ.get('PERSIST_DIRECTORY')
target_source_chunks = int(os.environ.get('TARGET_SOURCE_CHUNKS', 4))

print("Loading embeddings model...")
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS, path=persist_directory)
db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)

prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})

# Deactivated debug block: run a one-off query against the retriever and chain
'''
query = "What is state ownership report"
semantic_search_results = retriever.get_relevant_documents(query)
print(semantic_search_results)
chain_type_kwargs = {"prompt": prompt}
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=False, chain_type_kwargs=chain_type_kwargs, verbose=True)
response = qa(query)
print(response)
'''

chain_type_kwargs = {"prompt": prompt}

input_gradio = gr.Text(
    label="Prompt",
    show_label=False,
    max_lines=2,
    placeholder="Enter your question here",
    container=False,
)


def get_response(input_gradio):
    query = input_gradio
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=False, chain_type_kwargs=chain_type_kwargs, verbose=True)
    response = qa(query)
    return response['result']


iface = gr.Interface(
    fn=get_response,
    inputs=input_gradio,
    outputs="text",
    title="Stimline Chatbot",
    description="A chatbot that uses the LLM to answer anything regarding Stimline",
    allow_flagging='never'
)

# Interactive questions and answers
iface.launch()
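
The retrieval path inside get_response can also be exercised without the Gradio UI. A minimal sketch under the same setup (llm, retriever, and prompt as built above; the question string is a placeholder):

# Query the RetrievalQA chain directly, bypassing Gradio.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",               # concatenate retrieved chunks into one prompt
    retriever=retriever,              # top-k semantic search over the Chroma store
    return_source_documents=True,     # also return the chunks used as context
    chain_type_kwargs={"prompt": prompt},
)
result = qa("What is in the ingested documents?")  # placeholder question
print(result['result'])                            # the model's answer
for doc in result['source_documents']:
    print(doc.metadata.get('source'))              # files the context came from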
app_v2.py
ADDED
@@ -0,0 +1,118 @@
from langchain import PromptTemplate, LLMChain
from langchain.llms import CTransformers
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings
from io import BytesIO
from langchain.document_loaders import PyPDFLoader
import gradio as gr
import chromadb
from dotenv import load_dotenv
from constants import CHROMA_SETTINGS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

# Load the environment before reading any variables from it
if not load_dotenv():
    print("Could not load .env file or it is empty. Please check if it exists and is readable.")
    exit(1)

local_llm = "TheBloke/zephyr-7B-beta-GGUF"

# Generation settings for the local model
config = {
    'max_new_tokens': 1024,
    'repetition_penalty': 1.1,
    'temperature': 0.1,
    'top_k': 50,
    'top_p': 0.9,
    'stream': True,
    'threads': int(os.cpu_count() / 2)
}

llm = CTransformers(
    model=local_llm,
    model_type="mistral",
    lib="avx2",  # for CPU use
    **config
)

embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')
persist_directory = os.environ.get('PERSIST_DIRECTORY')
target_source_chunks = int(os.environ.get('TARGET_SOURCE_CHUNKS', 4))

print("Loading embeddings model...")
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS, path=persist_directory)
db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)

prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})

# Conversation history shared by all requests in this process
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=False
)

# Deactivated debug block: run a one-off query against the retriever and chain
'''
query = "What is state ownership report"
semantic_search_results = retriever.get_relevant_documents(query)
print(semantic_search_results)
chain_type_kwargs = {"prompt": prompt}
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=False, chain_type_kwargs=chain_type_kwargs, verbose=True)
response = qa(query)
print(response)
'''

# Note: the custom prompt above is not passed to the conversational chain
# below, so from_llm falls back to its default QA prompt.
chain_type_kwargs = {"prompt": prompt}

input_gradio = gr.Text(
    label="Prompt",
    show_label=False,
    max_lines=2,
    placeholder="Enter your question here",
    container=False,
)


def get_response(input_gradio):
    query = input_gradio
    qa = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        memory=memory,
        verbose=True)
    response = qa(query)
    return response['answer']


iface = gr.Interface(
    fn=get_response,
    inputs=input_gradio,
    outputs="text",
    title="Stimline Chatbot",
    description="A chatbot that uses the LLM to answer anything regarding Stimline",
    allow_flagging='never'
)

# Interactive questions and answers
iface.launch()
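
The practical difference from app.py is the memory: ConversationBufferMemory accumulates the dialogue under the chat_history key, so follow-up questions are resolved in context. Because the memory object is module-level, all users of the Gradio app share one history. A minimal sketch of the behaviour (question strings are placeholders; the sketch uses return_messages=True, since handing the chain a plain string history, as return_messages=False does, can raise a format error on the second turn in some langchain versions):

# Two consecutive calls share history through the memory object;
# assumes llm and retriever are built as in app_v2.py above.
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
qa = ConversationalRetrievalChain.from_llm(
    llm=llm, chain_type="stuff", retriever=retriever, memory=memory)

first = qa("What does the report cover?")        # placeholder question
followup = qa("Summarize that in one sentence")  # "that" resolves via chat_history
print(memory.buffer)                             # accumulated Human/AI turns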
constants.py
ADDED
@@ -0,0 +1,14 @@
import os
from dotenv import load_dotenv
from chromadb.config import Settings

load_dotenv()

PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY')
if PERSIST_DIRECTORY is None:
    raise Exception("PERSIST_DIRECTORY is not set; please point it at the Chroma DB directory in .env")

# Shared Chroma configuration used by ingest.py and the apps
CHROMA_SETTINGS = Settings(
    persist_directory=PERSIST_DIRECTORY,
    anonymized_telemetry=False
)
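
Every component opens the store through these shared settings, which keeps ingest.py and both apps pointed at the same on-disk database. A quick sanity-check sketch (the collection name depends on what LangChain created; by default it is "langchain"):

# Open the persisted store with the shared settings and list its contents.
import chromadb
from constants import CHROMA_SETTINGS, PERSIST_DIRECTORY

client = chromadb.PersistentClient(path=PERSIST_DIRECTORY, settings=CHROMA_SETTINGS)
for collection in client.list_collections():
    print(collection.name, collection.count())  # e.g. "langchain" and its chunk count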
db/687b62b3-4364-499d-9927-b084c01b0b0b/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6e4322b89a3b8555e5d66d0504381d6435056f578e55e68df672987d35657745
size 3352000
db/687b62b3-4364-499d-9927-b084c01b0b0b/header.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dc0a39849941cfdf9e77f39fd384516fcd6a5a66278336eea1d145b1934f5777
size 100
db/687b62b3-4364-499d-9927-b084c01b0b0b/index_metadata.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:42e803042f98dd389cac8f0b7599312cc5c45bb6619962fc9b58792cdcad353d
size 113989
db/687b62b3-4364-499d-9927-b084c01b0b0b/length.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5ca72c4a6b3756f03937fc5c776e895b57eba80b5296dab003dc9874a5fc4c96
size 8000
db/687b62b3-4364-499d-9927-b084c01b0b0b/link_lists.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0777500aa1b9b54c7d1f523eb7ffc49120fcd0d00ad9ea117d349e3303fadb59
size 18268
db/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:24fbf72e0acbdd7bc265370c3ae32a78ab58bbae0ba628e02755170019d1e14a
size 18460672
ingest.py
ADDED
@@ -0,0 +1,147 @@
#!/usr/bin/env python3
import os
import glob
from typing import List
from dotenv import load_dotenv
from multiprocessing import Pool
from tqdm import tqdm

from langchain.document_loaders import (
    CSVLoader,
    EverNoteLoader,
    PyMuPDFLoader,
    TextLoader,
    UnstructuredEmailLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredODTLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
    PyPDFLoader
)

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document

if not load_dotenv():
    print("Could not load .env file or it is empty. Please check if it exists and is readable.")
    exit(1)

from constants import CHROMA_SETTINGS
import chromadb

# Load environment variables
persist_directory = os.environ.get('PERSIST_DIRECTORY')
source_directory = os.environ.get('SOURCE_DIRECTORY', 'source_documents')
embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')
chunk_size = 500
chunk_overlap = 100

# Map file extensions to document loaders and their arguments
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    # ".docx": (Docx2txtLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    # ".eml": (MyElmLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    # ".pdf": (PyMuPDFLoader, {}),
    ".pdf": (PyPDFLoader, {}),
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}


def load_single_document(file_path: str) -> List[Document]:
    ext = "." + file_path.rsplit(".", 1)[-1].lower()
    if ext in LOADER_MAPPING:
        loader_class, loader_args = LOADER_MAPPING[ext]
        loader = loader_class(file_path, **loader_args)
        return loader.load()

    raise ValueError(f"Unsupported file extension '{ext}'")


def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
    """
    Loads all documents from the source documents directory, ignoring specified files
    """
    all_files = []
    for ext in LOADER_MAPPING:
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext.lower()}"), recursive=True)
        )
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext.upper()}"), recursive=True)
        )
    filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]

    with Pool(processes=os.cpu_count()) as pool:
        results = []
        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
            for docs in pool.imap_unordered(load_single_document, filtered_files):
                results.extend(docs)
                pbar.update()

    return results


def process_documents(ignored_files: List[str] = []) -> List[Document]:
    """
    Load documents and split them into chunks
    """
    print(f"Loading documents from {source_directory}")
    documents = load_documents(source_directory, ignored_files)
    if not documents:
        print("No new documents to load")
        exit(0)
    print(f"Loaded {len(documents)} new documents from {source_directory}")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} characters each)")
    return texts


def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool:
    """
    Checks if a vectorstore already exists
    """
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    if not db.get()['documents']:
        return False
    return True


def main():
    # Create embeddings
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
    # Chroma client
    chroma_client = chromadb.PersistentClient(settings=CHROMA_SETTINGS, path=persist_directory)

    if does_vectorstore_exist(persist_directory, embeddings):
        # Update the existing local vectorstore
        print(f"Appending to existing vectorstore at {persist_directory}")
        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS, client=chroma_client)
        collection = db.get()
        texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
        print("Creating embeddings. May take some minutes...")
        db.add_documents(texts)
    else:
        # Create a new local vectorstore
        print("Creating new vectorstore")
        texts = process_documents()
        print("Creating embeddings. May take some minutes...")
        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS, client=chroma_client)
    db.persist()
    db = None

    print("Ingestion complete! You can now run app.py to query your documents")


if __name__ == "__main__":
    main()
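
The chunking parameters near the top (chunk_size=500, chunk_overlap=100, both counted in characters) control how documents are cut before embedding. A small self-contained sketch of what the splitter produces:

# How RecursiveCharacterTextSplitter cuts a document with the
# chunk_size/chunk_overlap values used in ingest.py.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

doc = Document(page_content="lorem ipsum " * 200, metadata={"source": "example.txt"})
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents([doc])
print(len(chunks))                  # roughly 6 chunks for ~2400 characters
print(len(chunks[0].page_content))  # each chunk is at most 500 characters
print(chunks[0].metadata)           # source metadata is carried through to each chunk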
requirements.txt
ADDED
@@ -0,0 +1,15 @@
langchain==0.0.274
gradio==3.50.2
gpt4all==1.0.8
chromadb==0.4.7
urllib3==2.0.4
PyMuPDF==1.23.1
python-dotenv==1.0.0
unstructured==0.10.8
extract-msg==0.45.0
tabulate==0.9.0
pandoc==2.3
pypandoc==1.11
tqdm==4.66.1
sentence_transformers==2.2.2
pypdf