from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_community.document_loaders import BSHTMLLoader, UnstructuredMarkdownLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.vectorstores import VectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from pypdf.errors import PyPdfError
# stdlib
from glob import glob
import pathlib


def load_text(file_path: str) -> list[Document] | None:
    """Loads text documents (.txt) asynchronously from a passed file_path."""
    assert file_path != ""
    assert pathlib.Path(file_path).suffix == ".txt"

    try:
        loader = TextLoader(file_path)
        return loader.load()
    except (UnicodeError, RuntimeError) as err:
        print(f"could not load file: {file_path}")
        print(f"error: {err}")


# https://python.langchain.com/docs/how_to/document_loader_markdown/
def load_markdown(file_path: str) -> list[Document] | None:
    """Loads markdown files asynchronously from a passed file_path."""
    assert file_path != ""
    assert pathlib.Path(file_path).suffix == ".md"

    try:
        # mode="elements" keeps metadata about whether each element is,
        # for example, a paragraph, a link, or a heading
        loader = UnstructuredMarkdownLoader(file_path, mode="elements")
        return loader.load()
    except (UnicodeError, RuntimeError) as err:
        print(f"could not load file: {file_path}")
        print(f"error: {err}")


# https://python.langchain.com/docs/how_to/document_loader_pdf/
def load_pdf(file_path: str) -> list[Document] | None:
    """Loads pdf documents (.pdf) asynchronously from a passed file_path."""
    assert file_path != ""
    assert pathlib.Path(file_path).suffix == ".pdf"

    loader = PyPDFLoader(file_path)
    try:
        return loader.load()
    except PyPdfError as err:
        print(f"could not read file: {file_path}")
        print(f"error: {err}")


def load_html(file_path: str) -> list[Document]:
    """Loads html documents (.html) asynchronously from a passed file_path."""
    assert file_path != ""
    assert pathlib.Path(file_path).suffix in (".html", ".htm")

    loader = BSHTMLLoader(file_path)
    return loader.load()


# map file extensions to loader functions for O(1) dispatch
LOADER_MAP = {
    ".pdf": load_pdf,
    ".html": load_html,
    ".htm": load_html,
    ".txt": load_text,
    ".md": load_markdown,
}


# https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/
def get_document_database(
    data_folder="learning_material/*/*/*",
    embedding_model="BAAI/bge-base-en-v1.5",
    chunk_size=1028, chunk_overlap=0,
) -> VectorStore:
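    """Load all supported documents under data_folder, split them into
    chunks, and return a FAISS vector store built with the given
    embedding model."""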

    # get all the filepaths of the learning materials
    files = glob(data_folder)

    all_docs = []
    for file_path in files:
        extension = pathlib.Path(file_path).suffix
        if not extension:
            print(f"{file_path} is a folder, skipping")
            continue

        load_fn = LOADER_MAP.get(extension)
        if not load_fn:
            print(f"no document loader for file extension '{extension}'")
            print(f"file {file_path} will be skipped")
            continue

        # load the document with a filetype specific loader
        result_documents = load_fn(file_path)

        if not result_documents:
            print(f"file {file_path} does not include any content, skipping")
            continue

        all_docs.extend(result_documents)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    chunked_docs = splitter.split_documents(all_docs)

    return FAISS.from_documents(
        chunked_docs,
        HuggingFaceEmbeddings(model_name=embedding_model)
    )
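

# A minimal usage sketch (an assumption, not part of the original module:
# the query string, k value, and __main__ guard are illustrative). It builds
# the index with the defaults above, then queries it through a retriever.
if __name__ == "__main__":
    db = get_document_database()
    retriever = db.as_retriever(search_kwargs={"k": 4})
    for doc in retriever.invoke("example query about the learning material"):
        print(doc.metadata.get("source"), doc.page_content[:80])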