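"""RAG ingestion pipeline: load .txt/.csv/.pdf files from a directory,
split them into chunks, embed them with a Korean sentence-transformer
(KoSimCSE), and store the vectors in a Pinecone serverless index."""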
import glob
import os
import time

from dotenv import load_dotenv
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader, CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

load_dotenv()
# Wrap each split chunk in a fresh Document, keeping only the source path as metadata
def come_data(splits):
    docs = []
    for split in splits:
        docs.append(Document(page_content=split.page_content,
                             metadata={'source': split.metadata['source']}))
    return docs

# Flatten an arbitrarily nested list into a flat list of leaf items
def flatten_list(lst):
    return [item for sublist in lst for item in flatten_list(sublist)] if isinstance(lst, list) else [lst]
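# e.g. flatten_list([[doc1, doc2], [doc3]]) -> [doc1, doc2, doc3]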

# Load the files, embed them, and upload everything to the vector DB
def all_files(path):
    print(f'Place all data for RAG under {path}.\n\n\n')
    f = glob.glob(path + '/**', recursive=True)
    f_docs = []
    for file in f:
        loaded = False
        if file.endswith('.txt'):
            loader = TextLoader(file)
            document = loader.load()
            loaded = True
        elif file.endswith('.csv'):
            loader = CSVLoader(file)
            document = loader.load()
            loaded = True
        elif file.endswith('.pdf'):
            loader = PyMuPDFLoader(file)
            document = loader.load()
            loaded = True
        # ------------------- Add support for new file types above -------------------
        if loaded:
            print(file.split('/')[-1] + ' split in progress')
            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
                separator=".",
                chunk_size=500,
                chunk_overlap=0,
            )
            splits = text_splitter.split_documents(document)
            docs = come_data(splits)
            f_docs.append(docs)
            print(file.split('/')[-1] + ' split complete.\n' + file.split('/')[-1] + ' split count: ' + str(len(docs)))
    flattened_list = flatten_list(f_docs)
    # The flattened docs are what gets written to the vector DB
    # Declare the embedding model
    embedding_model = SentenceTransformerEmbeddings(model_name='BM-K/KoSimCSE-roberta-multitask', model_kwargs={"trust_remote_code": True})
    # Declare the vector store
    api_key = os.environ['PINECONE_API_KEY']
    pc = Pinecone(api_key=api_key)
    index_name = os.getenv('INDEX_NAME')
    print('Initializing vector DB. index_name = ' + str(index_name))
    spec = ServerlessSpec(cloud='aws', region='us-east-1')
    # If the index already exists, delete it so it can be rebuilt from scratch
    existing_names = [idx.name for idx in pc.list_indexes().indexes]
    if index_name in existing_names:
        pc.delete_index(index_name)
        print('Existing index deleted')
        time.sleep(3)
    # Create the Pinecone index (768 dims matches the KoSimCSE-roberta embeddings)
    pc.create_index(
        index_name,
        dimension=768,
        metric='cosine',
        spec=spec
    )
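    # Addition (not in the original): serverless index creation is
    # asynchronous, so as a precaution poll until the index reports ready
    # before writing to it; status['ready'] is the client's readiness flag.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)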
    # Populate the fresh index with the documents
    # index = pc.Index(index_name)  # only needed for the manual upsert path below
    print('Loading into vector DB. index_name = ' + str(index_name))
    # Alternative path, kept for reference: embed the texts and upsert manually.
    # texts = [doc.page_content for doc in flattened_list]
    # embedded_texts = []
    # for txt in texts:
    #     embedded_texts.append(embedding_model.embed_query(txt))
    # # Add the embeddings to the vector DB. The free tier rejects oversized
    # # requests, so upload in small batches.
    # ids = [str(i) for i in range(len(embedded_texts))]
    # metadata = [doc.metadata for doc in flattened_list]
    # batch_size = 28
    # for i in range(0, len(embedded_texts), batch_size):
    #     batch_vectors = [{"id": id, "values": vector, "metadata": meta} for id, vector, meta in zip(ids[i:i + batch_size], embedded_texts[i:i + batch_size], metadata[i:i + batch_size])]
    #     index.upsert(vectors=batch_vectors)
    vectorstore = PineconeVectorStore.from_documents(
        documents=flattened_list,
        index_name=index_name,
        embedding=embedding_model
    )
    print('Upload complete')
    return vectorstore, flattened_list
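
# Minimal usage sketch (an addition, not from the original): assumes a local
# './data' folder and that PINECONE_API_KEY and INDEX_NAME are set in .env.
if __name__ == '__main__':
    vectorstore, docs = all_files('./data')
    print(str(len(docs)) + ' chunks indexed')
    # Quick sanity check: fetch the 3 chunks most similar to a sample query
    results = vectorstore.similarity_search('sample query', k=3)
    for r in results:
        print(r.metadata['source'], r.page_content[:80])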