# NOTE(review): the lines below are web-scrape artifacts from the hosting page
# (HF Spaces status, file size, commit hashes, line-number gutter) — not Python.
# Commented out so the module parses; safe to delete entirely.
# Spaces:
# Sleeping
# Sleeping
# File size: 3,753 Bytes
# 99afe26 1f4bbb8 99afe26 1f4bbb8 99afe26 1f4bbb8 99afe26 1f4bbb8 99afe26 1f4bbb8 99afe26 |
# 1 2 3 4 5 6 7 8 9 10 ... 133 | (line-number gutter from the page)
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import (
OpenAIEmbeddings,
HuggingFaceBgeEmbeddings,
HuggingFaceEmbeddings,
HuggingFaceInstructEmbeddings,
)
class Ingest:
    """Build persistent Chroma vector stores from document folders.

    Two pipelines are provided:
      * ``ingest_english`` — loads PDFs and embeds them with an OpenAI model.
      * ``ingest_czech``   — loads documents and embeds them with a local
        HuggingFace model (CPU).

    Both share the same load -> split -> embed -> persist flow, implemented
    once in ``_build_store``.
    """

    def __init__(
        self,
        openai_api_key=None,
        chunk=512,
        overlap=256,
        czech_store="stores/czech_512",
        english_store="stores/english_512",
        data_czech="data/czech",
        data_english="data/english",
        english_embedding_model="text-embedding-3-large",
        czech_embedding_model="Seznam/simcse-dist-mpnet-paracrawl-cs-en",
    ):
        """Store ingestion configuration.

        Args:
            openai_api_key: API key for OpenAI embeddings; only required by
                the English pipeline.
            chunk: Character chunk size for the text splitter.
            overlap: Character overlap between consecutive chunks.
            czech_store: Persist directory for the Czech Chroma store.
            english_store: Persist directory for the English Chroma store.
            data_czech: Directory containing the Czech source documents.
            data_english: Directory containing the English source PDFs.
            english_embedding_model: OpenAI embedding model name.
            czech_embedding_model: HuggingFace embedding model name.
        """
        self.openai_api_key = openai_api_key
        self.chunk = chunk
        self.overlap = overlap
        self.czech_store = czech_store
        self.english_store = english_store
        self.data_czech = data_czech
        self.data_english = data_english
        self.english_embedding_model = english_embedding_model
        self.czech_embedding_model = czech_embedding_model

    def _build_store(self, data_dir, store_dir, embedding, loader_cls=None):
        """Shared pipeline: load documents, split, embed, and persist a Chroma store.

        Args:
            data_dir: Directory to load documents from.
            store_dir: Directory where the Chroma store is persisted.
            embedding: A LangChain embeddings instance.
            loader_cls: Optional per-file loader class for DirectoryLoader
                (e.g. PyPDFLoader); DirectoryLoader's default is used when None.
        """
        loader_kwargs = {"show_progress": True}
        if loader_cls is not None:
            loader_kwargs["loader_cls"] = loader_cls
        loader = DirectoryLoader(data_dir, **loader_kwargs)
        documents = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk,
            chunk_overlap=self.overlap,
        )
        texts = text_splitter.split_documents(documents)
        # Cosine distance for similarity search; the store persists to disk on
        # creation, so the returned handle is not needed here.
        Chroma.from_documents(
            documents=texts,
            embedding=embedding,
            persist_directory=store_dir,
            collection_metadata={"hnsw:space": "cosine"},
        )

    def ingest_english(self):
        """Ingest English PDFs into the English Chroma store using OpenAI embeddings."""
        embedding = OpenAIEmbeddings(
            openai_api_key=self.openai_api_key,
            model=self.english_embedding_model,
        )
        # English data is PDF-only, hence the explicit PyPDFLoader.
        self._build_store(
            self.data_english,
            self.english_store,
            embedding,
            loader_cls=PyPDFLoader,
        )
        print("\n English vector Store Created.......\n\n")

    def ingest_czech(self):
        """Ingest Czech documents into the Czech Chroma store using a local HF model."""
        embedding = HuggingFaceEmbeddings(
            model_name=self.czech_embedding_model,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": False},
        )
        # NOTE(review): unlike the English pipeline, no loader_cls is given,
        # so DirectoryLoader uses its default loader — presumably intentional
        # (Czech data may not be PDF); confirm against the data directory.
        self._build_store(self.data_czech, self.czech_store, embedding)
        print("\n Czech vector Store Created.......\n\n")
"""
openai_api_key = "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"
persist_directory = "stores/store_512"
data = "data/"
chunk = 512
overlap = 256
embedding = OpenAIEmbeddings(
openai_api_key=openai_api_key,
model="text-embedding-3-large",
# model_kwargs={"device": "cpu"},
)
loader = DirectoryLoader(
data, glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader
)
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk,
chunk_overlap=overlap,
)
texts = text_splitter.split_documents(documents)
vectordb = Chroma.from_documents(
documents=texts,
embedding=embedding,
persist_directory=persist_directory,
collection_metadata={"hnsw:space": "cosine"},
)
print("\n Vector Store Created.......\n\n")
"""
# | (trailing web-scrape artifact; safe to delete)