File size: 3,753 Bytes
99afe26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f4bbb8
 
99afe26
 
 
 
 
 
 
 
1f4bbb8
 
99afe26
 
 
 
 
1f4bbb8
99afe26
 
 
 
 
1f4bbb8
99afe26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f4bbb8
99afe26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import (
    OpenAIEmbeddings,
    HuggingFaceBgeEmbeddings,
    HuggingFaceEmbeddings,
    HuggingFaceInstructEmbeddings,
)


class Ingest:
    """Build Chroma vector stores from an English and a Czech document folder.

    Each ``ingest_*`` method loads every file found by ``DirectoryLoader`` in
    its data directory, splits the documents with
    ``RecursiveCharacterTextSplitter``, embeds the chunks, and writes them to
    a Chroma collection created with ``persist_directory`` set to the
    corresponding store path.
    """

    def __init__(
        self,
        openai_api_key=None,
        chunk: int = 512,
        overlap: int = 256,
        czech_store: str = "stores/czech_512",
        english_store: str = "stores/english_512",
        data_czech: str = "data/czech",
        data_english: str = "data/english",
        english_embedding_model: str = "text-embedding-3-large",
        czech_embedding_model: str = "Seznam/simcse-dist-mpnet-paracrawl-cs-en",
    ) -> None:
        """Store the ingestion settings; no I/O happens here.

        Args:
            openai_api_key: Key handed to ``OpenAIEmbeddings`` for the
                English pipeline.
            chunk: ``chunk_size`` given to the text splitter.
            overlap: ``chunk_overlap`` given to the text splitter.
            czech_store: Persist directory of the Czech Chroma store.
            english_store: Persist directory of the English Chroma store.
            data_czech: Directory scanned for Czech source documents.
            data_english: Directory scanned for English source documents.
            english_embedding_model: Model name passed to
                ``OpenAIEmbeddings``.
            czech_embedding_model: Model name passed to
                ``HuggingFaceEmbeddings``.
        """
        self.openai_api_key = openai_api_key
        self.chunk = chunk
        self.overlap = overlap
        self.czech_store = czech_store
        self.english_store = english_store
        self.data_czech = data_czech
        self.data_english = data_english
        self.english_embedding_model = english_embedding_model
        self.czech_embedding_model = czech_embedding_model

    def _build_store(self, loader, embedding, store_dir) -> None:
        """Load, split, embed and store documents in a Chroma collection.

        Shared by both language pipelines so that chunking and collection
        settings cannot drift apart between them.

        Args:
            loader: Document loader whose ``load()`` yields the documents.
            embedding: Embedding object handed to ``Chroma.from_documents``.
            store_dir: Value passed as Chroma's ``persist_directory``.
        """
        documents = loader.load()
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk,
            chunk_overlap=self.overlap,
        )
        texts = splitter.split_documents(documents)

        # The returned store object is not needed here; the call itself
        # creates the collection under ``store_dir``.
        Chroma.from_documents(
            documents=texts,
            embedding=embedding,
            persist_directory=store_dir,
            # Ask the HNSW index to use cosine distance.
            collection_metadata={"hnsw:space": "cosine"},
        )

    def ingest_english(self) -> None:
        """Embed the English documents with OpenAI and store them.

        Files under ``self.data_english`` are read with ``PyPDFLoader``.
        Prints a confirmation message when the store has been created.
        """
        embedding = OpenAIEmbeddings(
            openai_api_key=self.openai_api_key,
            model=self.english_embedding_model,
        )
        loader = DirectoryLoader(
            self.data_english,
            show_progress=True,
            loader_cls=PyPDFLoader,
        )
        self._build_store(loader, embedding, self.english_store)

        print("\n English vector Store Created.......\n\n")

    def ingest_czech(self) -> None:
        """Embed the Czech documents with a HuggingFace model and store them.

        Files under ``self.data_czech`` are read with ``DirectoryLoader``'s
        default loader class (none is specified). The embedding model runs on
        the CPU with embedding normalization explicitly disabled.
        Prints a confirmation message when the store has been created.
        """
        embedding = HuggingFaceEmbeddings(
            model_name=self.czech_embedding_model,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": False},
        )
        loader = DirectoryLoader(
            self.data_czech,
            show_progress=True,
        )
        self._build_store(loader, embedding, self.czech_store)

        print("\n Czech vector Store Created.......\n\n")


"""       
    
    
    
openai_api_key = "<YOUR_OPENAI_API_KEY>"  # secret redacted: never commit real keys; revoke the key that was previously hard-coded here
persist_directory = "stores/store_512"
data = "data/"
chunk = 512
overlap = 256

embedding = OpenAIEmbeddings(
    openai_api_key=openai_api_key,
    model="text-embedding-3-large",
    #    model_kwargs={"device": "cpu"},
)

loader = DirectoryLoader(
    data, glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader
)
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk,
    chunk_overlap=overlap,
)
texts = text_splitter.split_documents(documents)

vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
    collection_metadata={"hnsw:space": "cosine"},
)

print("\n Vector Store Created.......\n\n")

"""