viboognesh-doaz committed on
Commit
4b993b3
·
1 Parent(s): 5e32aa7

create new vectorstore each time

Browse files
Files changed (1) hide show
  1. pdf_processing.py +32 -32
pdf_processing.py CHANGED
@@ -11,7 +11,7 @@ import os
11
  from llama_index.core.indices import MultiModalVectorStoreIndex
12
  from llama_index.vector_stores.qdrant import QdrantVectorStore
13
  from llama_index.core import SimpleDirectoryReader, StorageContext
14
- from awsfunctions import upload_folder_to_s3, check_file_exists_in_s3, download_folder_from_s3
15
  import qdrant_client
16
  import streamlit as st
17
 
@@ -111,38 +111,38 @@ def process_pdf(pdf_file):
111
  username = "ptchecker"
112
  aws_prefix_path = os.path.join(os.getenv("FOLDER_PREFIX"), username, "FILES", os.path.splitext(pdf_file.name)[0])
113
  if check_file_exists_in_s3(os.path.join(aws_prefix_path, pdf_file.name)):
114
- temp_dir = tempfile.mkdtemp()
115
- download_folder_from_s3(local_folder=temp_dir, aws_folder_prefix=os.path.join(aws_prefix_path, "qdrant"))
116
- client = qdrant_client.QdrantClient(path=os.path.join(temp_dir, "qdrant"))
117
- image_store = QdrantVectorStore(client = client , collection_name=f"image_collection")
118
- text_store = QdrantVectorStore(client = client , collection_name=f"text_collection")
119
- index = MultiModalVectorStoreIndex.from_vector_store(vector_store=text_store, image_store=image_store)
120
- retriever_engine = index.as_retriever(similarity_top_k=1, image_similarity_top_k=1)
121
- shutil.rmtree(temp_dir)
122
- return retriever_engine
123
- else:
124
- temp_dir = tempfile.mkdtemp()
125
- temp_pdf_path = os.path.join(temp_dir, pdf_file.name)
126
- with open(temp_pdf_path, "wb") as f:
127
- f.write(pdf_file.getvalue())
128
 
129
- data_path = os.path.join(temp_dir, "data")
130
- os.makedirs(data_path , exist_ok=True)
131
- img_save_path = os.path.join(temp_dir, "images")
132
- os.makedirs(img_save_path , exist_ok=True)
133
 
134
- extracted_text = extract_text_from_pdf(temp_pdf_path)
135
- with open(os.path.join(data_path, "content.txt"), "w") as file:
136
- file.write(extracted_text)
137
 
138
- extract_images_from_pdf(temp_pdf_path, img_save_path)
139
- moved_count = move_images(img_save_path, data_path)
140
- print("Images moved count : ", moved_count)
141
- remove_low_size_images(data_path)
142
- remove_duplicate_images(data_path)
143
- shutil.rmtree(img_save_path)
144
- retriever_engine = initialize_qdrant(temp_dir=temp_dir, aws_prefix=aws_prefix_path) # os.path.join("folder" , os.path.splitext(pdf_file.name)[0] , unique_folder_name)
145
- upload_folder_to_s3(temp_dir, aws_prefix_path)
146
- shutil.rmtree(temp_dir)
147
 
148
- return retriever_engine
 
11
  from llama_index.core.indices import MultiModalVectorStoreIndex
12
  from llama_index.vector_stores.qdrant import QdrantVectorStore
13
  from llama_index.core import SimpleDirectoryReader, StorageContext
14
+ from awsfunctions import upload_folder_to_s3, check_file_exists_in_s3, download_folder_from_s3, delete_s3_folder
15
  import qdrant_client
16
  import streamlit as st
17
 
 
111
  username = "ptchecker"
112
  aws_prefix_path = os.path.join(os.getenv("FOLDER_PREFIX"), username, "FILES", os.path.splitext(pdf_file.name)[0])
113
  if check_file_exists_in_s3(os.path.join(aws_prefix_path, pdf_file.name)):
114
+ delete_s3_folder(aws_prefix_path)
115
+ # temp_dir = tempfile.mkdtemp()
116
+ # download_folder_from_s3(local_folder=temp_dir, aws_folder_prefix=os.path.join(aws_prefix_path, "qdrant"))
117
+ # client = qdrant_client.QdrantClient(path=os.path.join(temp_dir, "qdrant"))
118
+ # image_store = QdrantVectorStore(client = client , collection_name=f"image_collection")
119
+ # text_store = QdrantVectorStore(client = client , collection_name=f"text_collection")
120
+ # index = MultiModalVectorStoreIndex.from_vector_store(vector_store=text_store, image_store=image_store)
121
+ # retriever_engine = index.as_retriever(similarity_top_k=1, image_similarity_top_k=1)
122
+ # shutil.rmtree(temp_dir)
123
+ # return retriever_engine
124
+ temp_dir = tempfile.mkdtemp()
125
+ temp_pdf_path = os.path.join(temp_dir, pdf_file.name)
126
+ with open(temp_pdf_path, "wb") as f:
127
+ f.write(pdf_file.getvalue())
128
 
129
+ data_path = os.path.join(temp_dir, "data")
130
+ os.makedirs(data_path , exist_ok=True)
131
+ img_save_path = os.path.join(temp_dir, "images")
132
+ os.makedirs(img_save_path , exist_ok=True)
133
 
134
+ extracted_text = extract_text_from_pdf(temp_pdf_path)
135
+ with open(os.path.join(data_path, "content.txt"), "w") as file:
136
+ file.write(extracted_text)
137
 
138
+ extract_images_from_pdf(temp_pdf_path, img_save_path)
139
+ moved_count = move_images(img_save_path, data_path)
140
+ print("Images moved count : ", moved_count)
141
+ remove_low_size_images(data_path)
142
+ remove_duplicate_images(data_path)
143
+ shutil.rmtree(img_save_path)
144
+ retriever_engine = initialize_qdrant(temp_dir=temp_dir, aws_prefix=aws_prefix_path) # os.path.join("folder" , os.path.splitext(pdf_file.name)[0] , unique_folder_name)
145
+ upload_folder_to_s3(temp_dir, aws_prefix_path)
146
+ shutil.rmtree(temp_dir)
147
 
148
+ return retriever_engine