# thesizer/vector_store.py
# author: sakuexe
# commit 0b367ea: "tweaked the code a bit to make answering faster"
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_community.document_loaders import BSHTMLLoader, UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS, VectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from pypdf.errors import PyPdfError
# stdlib
from glob import glob
import pathlib
def load_text(file_path: str) -> list[Document] | None:
    """Load a plain-text document (.txt) from the given file path.

    Returns the loaded documents, or None when the file could not be read.
    """
    assert file_path != ""
    assert pathlib.Path(file_path).suffix == ".txt"
    try:
        loader = TextLoader(file_path)
        return loader.load()
    # BUG FIX: `except UnicodeError or RuntimeError` evaluated the `or`
    # first and only ever caught UnicodeError; a tuple catches both.
    except (UnicodeError, RuntimeError) as err:
        print(f"could not load file: {file_path}")
        print(f"error: {err}")
        return None
# https://python.langchain.com/docs/how_to/document_loader_markdown/
def load_markdown(file_path: str) -> list[Document] | None:
    """Load a markdown document (.md) from the given file path.

    Returns the loaded documents, or None when the file could not be read.
    """
    assert file_path != ""
    assert pathlib.Path(file_path).suffix == ".md"
    try:
        # mode="elements" keeps metadata about whether each piece is a
        # paragraph, link or a heading, for example
        loader = UnstructuredMarkdownLoader(file_path, mode="elements")
        return loader.load()
    # BUG FIX: `except UnicodeError or RuntimeError` evaluated the `or`
    # first and only ever caught UnicodeError; a tuple catches both.
    except (UnicodeError, RuntimeError) as err:
        print(f"could not load file: {file_path}")
        print(f"error: {err}")
        return None
# https://python.langchain.com/docs/how_to/document_loader_pdf/
def load_pdf(file_path: str) -> list[Document] | None:
    """Load a PDF document (.pdf) from the given file path.

    Returns the loaded documents, or None when the file could not be read.
    """
    assert file_path != ""
    assert pathlib.Path(file_path).suffix == ".pdf"
    try:
        # construct the loader inside the try block for consistency with
        # the other loader helpers in this module
        loader = PyPDFLoader(file_path)
        return loader.load()
    except PyPdfError as err:
        print(f"could not read file: {file_path}")
        print(f"error: {err}")
        return None
def load_html(file_path: str) -> list[Document]:
    """Load an HTML document (.html / .htm) from the given file path."""
    assert file_path != ""
    # BUG FIX: `suffix == ".html" or ".htm"` was always true, because the
    # non-empty string ".htm" is truthy on its own — the assert never fired.
    assert pathlib.Path(file_path).suffix in (".html", ".htm")
    loader = BSHTMLLoader(file_path)
    return loader.load()
# map each supported file extension to its loader function for O(1) dispatch
LOADER_MAP = {
    ".pdf": load_pdf,
    ".html": load_html,
    ".htm": load_html,
    ".txt": load_text,
    ".md": load_markdown,
}
# https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/
def get_document_database(
    data_folder="learning_material/*/*/*",
    embedding_model="BAAI/bge-base-en-v1.5",
    chunk_size=1028, chunk_overlap=0,
) -> VectorStore:
    """Build a FAISS vector store from every supported file in data_folder.

    Params:
        data_folder: glob pattern used to discover learning-material files.
        embedding_model: HuggingFace model name used to embed the chunks.
        chunk_size: maximum character length of each split chunk.
        chunk_overlap: character overlap between consecutive chunks.

    Raises:
        ValueError: when the glob matches no loadable documents — FAISS
            cannot build an index from an empty document list, and failing
            early gives a clearer error than the crash deep inside FAISS.
    """
    # get all the filepaths of the learning materials
    files = glob(data_folder)
    all_docs = []
    for file_path in files:
        # normalize case so ".PDF" / ".HTML" resolve a loader too
        extension = pathlib.Path(file_path).suffix.lower()
        if not extension:
            print(f"{file_path} is a folder, skipping")
            continue
        load_fn = LOADER_MAP.get(extension)
        if not load_fn:
            print(f"no document loader for file extension '{extension}'")
            print(f"file {file_path} will be skipped")
            continue
        # load the document with a filetype specific loader
        result_documents = load_fn(file_path)
        if not result_documents:
            print(f"file {file_path} does not include any content, skipping")
            continue
        all_docs.extend(result_documents)
    if not all_docs:
        raise ValueError(f"no loadable documents found under '{data_folder}'")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunked_docs = splitter.split_documents(all_docs)
    return FAISS.from_documents(
        chunked_docs,
        HuggingFaceEmbeddings(model_name=embedding_model)
    )