tekinfo-bot-hf / app /document_processor.py
alvinfadli's picture
initial commit
a3f9c29
raw
history blame contribute delete
654 Bytes
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
def process_documents(
    docs,
    *,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    model_name: str = "LazarusNLP/congen-indo-e5-small",
    device: str = "cpu",
) -> "FAISS":
    """Split documents into chunks, embed them, and index them in FAISS.

    Args:
        docs: Documents to index; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum size of each chunk handed to the splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter.
        model_name: Hugging Face model identifier used for the embeddings.
        device: Device string forwarded to the embedding model through
            ``model_kwargs`` (for example ``"cpu"``).

    Returns:
        The FAISS vector store built from the embedded chunks.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = splitter.split_documents(docs)

    # The embedding model is instantiated on every call; callers that index
    # repeatedly may want to reuse the returned store instead of rebuilding it.
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device},
    )
    return FAISS.from_documents(text_chunks, embedding=embeddings)