# RAG-Retrieve-Ingest-cz-eng / ingest(obsolete).py
# Page header captured with this file: Teapack1, commit "initial" (99afe26), 1.45 kB.
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import (
PyPDFLoader,
DirectoryLoader,
UnstructuredFileLoader,
)
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import (
OpenAIEmbeddings,
HuggingFaceBgeEmbeddings,
HuggingFaceEmbeddings,
HuggingFaceInstructEmbeddings,
)
# --- Ingestion settings ---------------------------------------------------
# Directory handed to Chroma as `persist_directory` when the store is built.
persist_directory = "stores/test_512"

# Directory of source documents handed to DirectoryLoader.
# Written with a forward slash: the earlier "data\czech" depended on "\c"
# being an unrecognised escape sequence (which newer Python versions warn
# about) and only resolved as a nested directory on Windows.
data = "data/czech"

# Passed to the text splitter as chunk_size / chunk_overlap.
chunk = 512
overlap = 128

# --- Embedding model ------------------------------------------------------
# Alternative checkpoint, kept for reference:
# embedding_model = "Seznam/simcse-dist-mpnet-czeng-cs-en"
embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"

model_name = embedding_model
model_kwargs = {"device": "cpu"}  # run the embedding model on CPU
encode_kwargs = {"normalize_embeddings": False}  # vectors are stored un-normalised
embedding = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# Disabled alternative loader: reads one semicolon-delimited CSV file instead
# of a directory. It is a bare string literal, so none of it is executed.
"""
loader = CSVLoader(
file_path="data/emails.csv",
encoding="utf-8",
csv_args={
"delimiter": ";",
},
)
"""
# Stage 1: load the documents found under the `data` directory.
loader = DirectoryLoader(data, show_progress=True)
documents = loader.load()

# Stage 2: split the loaded documents into overlapping chunks.
splitter_settings = {"chunk_size": chunk, "chunk_overlap": overlap}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
texts = text_splitter.split_documents(documents)

# Stage 3: embed the chunks and build the Chroma store in `persist_directory`.
# The collection metadata sets "hnsw:space" to "cosine".
store_settings = {
    "documents": texts,
    "embedding": embedding,
    "persist_directory": persist_directory,
    "collection_metadata": {"hnsw:space": "cosine"},
}
vectordb = Chroma.from_documents(**store_settings)

print("\n Vector Store Created.......\n\n")