from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_community.document_loaders import BSHTMLLoader, UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.vectorstores import VectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from pypdf.errors import PyPdfError

# stdlib
from glob import glob
import pathlib


def load_text(file_path: str) -> list[Document] | None:
    """Loads text documents (.txt) from the passed file_path."""
    assert file_path != ""
    assert pathlib.Path(file_path).suffix == ".txt"
    try:
        loader = TextLoader(file_path)
        return loader.load()
    except (UnicodeError, RuntimeError) as err:
        print(f"could not load file: {file_path}")
        print(f"error: {err}")


# https://python.langchain.com/docs/how_to/document_loader_markdown/
def load_markdown(file_path: str) -> list[Document] | None:
    """Loads markdown documents (.md) from the passed file_path."""
    assert file_path != ""
    assert pathlib.Path(file_path).suffix == ".md"
    try:
        # use "elements" mode to keep metadata about whether each piece of
        # content is, for example, a paragraph, a link, or a heading
        loader = UnstructuredMarkdownLoader(file_path, mode="elements")
        return loader.load()
    except (UnicodeError, RuntimeError) as err:
        print(f"could not load file: {file_path}")
        print(f"error: {err}")


# https://python.langchain.com/docs/how_to/document_loader_pdf/
def load_pdf(file_path: str) -> list[Document] | None:
    """Loads PDF documents (.pdf) from the passed file_path."""
    assert file_path != ""
    assert pathlib.Path(file_path).suffix == ".pdf"
    loader = PyPDFLoader(file_path)
    try:
        return loader.load()
    except PyPdfError as err:
        print(f"could not read file: {file_path}")
        print(f"error: {err}")


def load_html(file_path: str) -> list[Document]:
    """Loads HTML documents (.html/.htm) from the passed file_path."""
    assert file_path != ""
    assert pathlib.Path(file_path).suffix in (".html", ".htm")
    loader = BSHTMLLoader(file_path)
    return loader.load()


# map file extensions to loader functions for easy O(1) fetching
LOADER_MAP = {
    ".pdf": load_pdf,
    ".html": load_html,
    ".htm": load_html,
    ".txt": load_text,
    ".md": load_markdown,
}


# https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/
def get_document_database(
    data_folder="learning_material/*/*/*",
    embedding_model="BAAI/bge-base-en-v1.5",
    chunk_size=1028,
    chunk_overlap=0,
) -> VectorStore:
    # get all the file paths of the learning materials
    files = glob(data_folder)

    all_docs = []
    for file_path in files:
        extension = pathlib.Path(file_path).suffix
        if not extension:
            print(f"{file_path} is a folder, skipping")
            continue

        load_fn = LOADER_MAP.get(extension)
        if not load_fn:
            print(f"no document loader for file extension '{extension}'")
            print(f"file {file_path} will be skipped")
            continue

        # load the document with a filetype-specific loader
        result_documents = load_fn(file_path)
        if not result_documents:
            print(f"file {file_path} does not include any content, skipping")
            continue

        all_docs.extend(result_documents)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunked_docs = splitter.split_documents(all_docs)

    return FAISS.from_documents(
        chunked_docs, HuggingFaceEmbeddings(model_name=embedding_model)
    )
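

# --- usage sketch ----------------------------------------------------------
# A minimal illustration of how the vector store built above might be queried.
# The glob pattern and the query string are hypothetical placeholders; the
# similarity_search call uses the standard LangChain VectorStore interface.
if __name__ == "__main__":
    db = get_document_database(data_folder="learning_material/*/*/*")
    # retrieve the four chunks most similar to the (hypothetical) query
    hits = db.similarity_search("What is retrieval-augmented generation?", k=4)
    for doc in hits:
        print(doc.metadata.get("source"), doc.page_content[:120])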