from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_community.document_loaders import BSHTMLLoader, UnstructuredMarkdownLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.vectorstores import VectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from pypdf.errors import PyPdfError
# stdlib
from glob import glob
import pathlib


def load_text(file_path: str) -> list[Document] | None:
    """Loads text documents (.txt) asynchronously from a passed file_path."""
    assert file_path != ""
    assert pathlib.Path(file_path).suffix == ".txt"

    try:
        loader = TextLoader(file_path)
        return loader.load()
    except (UnicodeError, RuntimeError) as err:
        print(f"could not load file: {file_path}")
        print(f"error: {err}")


# https://python.langchain.com/docs/how_to/document_loader_markdown/
def load_markdown(file_path: str) -> list[Document] | None:
    """Loads markdown files asynchronously from a passed file_path."""
    assert file_path != ""
    assert pathlib.Path(file_path).suffix == ".md"

    try:
        # mode="elements" keeps metadata about whether each element is,
        # for example, a paragraph, a link, or a heading
        loader = UnstructuredMarkdownLoader(file_path, mode="elements")
        return loader.load()
    except (UnicodeError, RuntimeError) as err:
        print(f"could not load file: {file_path}")
        print(f"error: {err}")


# https://python.langchain.com/docs/how_to/document_loader_pdf/
def load_pdf(file_path: str) -> list[Document] | None:
    """Loads pdf documents (.pdf) asynchronously from a passed file_path."""
    assert file_path != ""
    assert pathlib.Path(file_path).suffix == ".pdf"

    loader = PyPDFLoader(file_path)
    try:
        return loader.load()
    except PyPdfError as err:
        print(f"could not read file: {file_path}")
        print(f"error: {err}")


def load_html(file_path: str) -> list[Document]:
    """Loads html documents (.html) asynchronously from a passed file_path."""
    assert file_path != ""
    assert pathlib.Path(file_path).suffix in (".html", ".htm")

    loader = BSHTMLLoader(file_path)
    return loader.load()


# map file extensions to loader functions for O(1) dispatch
LOADER_MAP = {
    ".pdf": load_pdf,
    ".html": load_html,
    ".htm": load_html,
    ".txt": load_text,
    ".md": load_markdown,
}


# https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/
def get_document_database(
    data_folder="learning_material/*/*/*",
    embedding_model="BAAI/bge-base-en-v1.5",
    chunk_size=1028, chunk_overlap=0,
) -> VectorStore:
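    """Load all supported documents under data_folder, split them into
    chunks, and return a FAISS vector store built with the given
    embedding model."""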

    # get all the filepaths of the learning materials
    files = glob(data_folder)

    all_docs = []
    for file_path in files:
        extension = pathlib.Path(file_path).suffix
        if not extension:
            print(f"{file_path} is a folder, skipping")
            continue

        load_fn = LOADER_MAP.get(extension)
        if not load_fn:
            print(f"no document loader for file extension '{extension}'")
            print(f"file {file_path} will be skipped")
            continue

        # load the document with a filetype specific loader
        result_documents = load_fn(file_path)

        if not result_documents:
            print(f"file {file_path} does not include any content, skipping")
            continue

        all_docs.extend(result_documents)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    chunked_docs = splitter.split_documents(all_docs)

    return FAISS.from_documents(
        chunked_docs,
        HuggingFaceEmbeddings(model_name=embedding_model)
    )
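

# A minimal usage sketch (an assumption, not part of the original module:
# the query string, k value, and __main__ guard are illustrative). It builds
# the index with the defaults above, then queries it through a retriever.
if __name__ == "__main__":
    db = get_document_database()
    retriever = db.as_retriever(search_kwargs={"k": 4})
    for doc in retriever.invoke("example query about the learning material"):
        print(doc.metadata.get("source"), doc.page_content[:80])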