Teapack1's picture
FAISS db store
c56a0e8
raw
history blame
2.82 kB
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import (
OpenAIEmbeddings,
HuggingFaceBgeEmbeddings,
HuggingFaceEmbeddings,
HuggingFaceInstructEmbeddings,
)
class Ingest:
    """Build and persist FAISS vector stores for the English and Czech corpora.

    Each ``ingest_*`` method loads the documents found in its data directory,
    splits them into overlapping chunks, embeds the chunks and saves the
    resulting FAISS index to its store directory.
    """

    def __init__(
        self,
        openai_api_key=None,
        chunk=512,
        overlap=256,
        czech_store="stores/czech_512",
        english_store="stores/english_512",
        data_czech="data/czech",
        data_english="data/english",
        english_embedding_model="text-embedding-3-large",
        czech_embedding_model="Seznam/simcse-dist-mpnet-paracrawl-cs-en",
    ):
        """Record the ingestion settings; nothing is loaded or written here.

        Args:
            openai_api_key: Key passed to ``OpenAIEmbeddings`` by
                ``ingest_english``.
            chunk: ``chunk_size`` given to the text splitter.
            overlap: ``chunk_overlap`` given to the text splitter.
            czech_store: Directory the Czech FAISS index is saved to.
            english_store: Directory the English FAISS index is saved to.
            data_czech: Directory the Czech documents are loaded from.
            data_english: Directory the English PDF documents are loaded from.
            english_embedding_model: Model name passed to ``OpenAIEmbeddings``.
            czech_embedding_model: Model name passed to
                ``HuggingFaceEmbeddings``.
        """
        self.openai_api_key = openai_api_key
        self.chunk = chunk
        self.overlap = overlap
        self.czech_store = czech_store
        self.english_store = english_store
        self.data_czech = data_czech
        self.data_english = data_english
        self.english_embedding_model = english_embedding_model
        self.czech_embedding_model = czech_embedding_model

    def _build_store(self, data_dir, store_dir, embedding, **loader_kwargs):
        """Load, split, embed and save the documents found under ``data_dir``.

        Args:
            data_dir: Directory handed to ``DirectoryLoader``.
            store_dir: Directory the FAISS index is saved to.
            embedding: Embedding object used to vectorise the chunks.
            **loader_kwargs: Extra keyword arguments forwarded to
                ``DirectoryLoader`` (for example ``loader_cls``).

        Raises:
            ValueError: If loading and splitting produced no chunks at all.
        """
        loader = DirectoryLoader(data_dir, show_progress=True, **loader_kwargs)
        documents = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk,
            chunk_overlap=self.overlap,
        )
        texts = text_splitter.split_documents(documents)

        # Fail early with an actionable message instead of handing the
        # vector store an empty chunk list.
        if not texts:
            raise ValueError(
                f"No text chunks were produced from {data_dir!r}; "
                "check that the directory exists and contains readable documents."
            )

        vectordb = FAISS.from_documents(
            documents=texts,
            embedding=embedding,
        )
        vectordb.save_local(store_dir)

    def ingest_english(self):
        """Create the English vector store from the PDFs in ``data_english``.

        Chunks are embedded with ``OpenAIEmbeddings`` using
        ``english_embedding_model`` and the index is saved to
        ``english_store``.

        Raises:
            ValueError: If no text chunks could be produced from the data
                directory.
        """
        embedding = OpenAIEmbeddings(
            openai_api_key=self.openai_api_key,
            model=self.english_embedding_model,
        )
        self._build_store(
            self.data_english,
            self.english_store,
            embedding,
            loader_cls=PyPDFLoader,
        )
        print("\n English vector Store Created.......\n\n")

    def ingest_czech(self):
        """Create the Czech vector store from the documents in ``data_czech``.

        Chunks are embedded with ``HuggingFaceEmbeddings`` using
        ``czech_embedding_model`` (device ``"cpu"``, embeddings not
        normalised) and the index is saved to ``czech_store``.

        Raises:
            ValueError: If no text chunks could be produced from the data
                directory.
        """
        model_kwargs = {"device": "cpu"}
        encode_kwargs = {"normalize_embeddings": False}
        embedding = HuggingFaceEmbeddings(
            model_name=self.czech_embedding_model,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
        )
        # NOTE(review): unlike ingest_english, no loader_cls is passed here, so
        # DirectoryLoader falls back to its default loader. Kept as-is; confirm
        # the asymmetry is intentional.
        self._build_store(self.data_czech, self.czech_store, embedding)
        print("\n Czech vector Store Created.......\n\n")