Spaces:
Sleeping
Sleeping
from pathlib import Path

from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import (
    PyPDFLoader,
    DirectoryLoader,
    UnstructuredFileLoader,
)
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import (
    OpenAIEmbeddings,
    HuggingFaceBgeEmbeddings,
    HuggingFaceEmbeddings,
    HuggingFaceInstructEmbeddings,
)

# --- Ingestion settings -----------------------------------------------------

# Directory handed to Chroma as ``persist_directory`` when the store is built.
persist_directory = "stores/test_512"

# Root folder of the source documents read by the directory loader.
# The path is assembled from its parts instead of being written as the literal
# "data\czech": that literal only worked because "\c" is an unrecognised escape
# (newer Python versions warn about it), and the backslash is a directory
# separator on Windows only. The value on Windows is unchanged ("data\czech");
# on other platforms it now uses the native separator.
data = str(Path("data") / "czech")

# Passed to the text splitter as ``chunk_size`` and ``chunk_overlap``.
chunk = 512
overlap = 128

# --- Embedding model --------------------------------------------------------

# embedding_model = "Seznam/simcse-dist-mpnet-czeng-cs-en"
embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"

model_name = embedding_model
model_kwargs = {"device": "cpu"}  # load the model on the CPU
encode_kwargs = {"normalize_embeddings": False}  # leave vectors un-normalised

embedding = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# --- Disabled alternative source --------------------------------------------
# Kept for reference only (previously parked inside a bare string literal):
# loads a semicolon-delimited, UTF-8 encoded CSV instead of a directory.
#
# loader = CSVLoader(
#     file_path="data/emails.csv",
#     encoding="utf-8",
#     csv_args={
#         "delimiter": ";",
#     },
# )
# Step 1: read the documents found under ``data`` (progress display enabled).
loader = DirectoryLoader(data, show_progress=True)
documents = loader.load()

# Step 2: split the loaded documents into overlapping chunks, using the
# module-level ``chunk`` / ``overlap`` settings.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=overlap, chunk_size=chunk)
texts = text_splitter.split_documents(documents)

# Step 3: embed the chunks and build the Chroma store in ``persist_directory``.
# The collection metadata sets "hnsw:space" to "cosine".
vectordb = Chroma.from_documents(
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory=persist_directory,
    embedding=embedding,
    documents=texts,
)

# Step 4: report completion on stdout.
print("\n Vector Store Created.......\n\n")