viboognesh committed on
Commit
e239fba
·
verified ·
1 Parent(s): ff6d755

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +39 -18
app.py CHANGED
@@ -18,12 +18,22 @@ from llama_index.core.query_engine import SimpleMultiModalQueryEngine
18
  from llama_index.llms.openai import OpenAI
19
  from llama_index.core import load_index_from_storage, get_response_synthesizer
20
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- # from dotenv import load_dotenv
23
- # load_dotenv()
24
 
25
- OPENAI_API_KEY = "sk-proj-beorroDjV4FeoL6OAzbET3BlbkFJT4WcMiP0x30GxzmbpIEC"
26
- os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
27
 
28
 
29
  def extract_text_from_pdf(pdf_path):
@@ -104,20 +114,27 @@ def remove_duplicate_images(data_path) :
104
  except Exception as e:
105
  print(e)
106
  pass
107
-
108
- def initialize_qdrant(temp_dir):
 
109
 
110
- # client = qdrant_client.QdrantClient(path="qdrant_mm_db_pipeline")
111
- # client = qdrant_client.QdrantClient(host = "192.168.0.1" , port = 2401 , https = True)
112
  # client = qdrant_client.QdrantClient(url = "http://localhost:2452")
113
- client = qdrant_client.QdrantClient(url="4b0af7be-d5b3-47ac-b215-128ebd6aa495.europe-west3-0.gcp.cloud.qdrant.io:6333", api_key="CO1sNGLmC6R_Q45qSIUxBSX8sxwHud4MCm4as_GTI-vzQqdUs-bXqw",)
 
114
 
115
  if "vectordatabase" not in st.session_state or not st.session_state.vectordatabase:
116
- text_store = QdrantVectorStore(client=client, collection_name="text_collection_pipeline")
117
- image_store = QdrantVectorStore(client=client, collection_name="image_collection_pipeline")
 
 
 
 
 
118
  storage_context = StorageContext.from_defaults(vector_store=text_store, image_store=image_store)
119
- documents = SimpleDirectoryReader(os.path.join(temp_dir, "my_own_data")).load_data()
120
  index = MultiModalVectorStoreIndex.from_documents(documents, storage_context=storage_context)
 
121
  st.session_state.vectordatabase = index
122
  else :
123
  index = st.session_state.vectordatabase
@@ -153,7 +170,7 @@ def retrieve_and_query(query, retriever_engine):
153
  )
154
  qa_tmpl = PromptTemplate(qa_tmpl_str)
155
 
156
- llm = OpenAI(model="gpt-4o-mini", temperature=0)
157
  response_synthesizer = get_response_synthesizer(response_mode="refine", text_qa_template=qa_tmpl, llm=llm)
158
 
159
  response = response_synthesizer.synthesize(query, nodes=retrieval_results)
@@ -166,15 +183,16 @@ def retrieve_and_query(query, retriever_engine):
166
 
167
  return response, retrieved_image_path_list
168
 
169
- def process_pdf(pdf_file):
 
170
  temp_dir = tempfile.TemporaryDirectory()
171
  temp_pdf_path = os.path.join(temp_dir.name, pdf_file.name)
172
  with open(temp_pdf_path, "wb") as f:
173
  f.write(pdf_file.getvalue())
174
 
175
- data_path = os.path.join(temp_dir.name, "my_own_data")
176
  os.makedirs(data_path , exist_ok=True)
177
- img_save_path = os.path.join(temp_dir.name, "extracted_images")
178
  os.makedirs(img_save_path , exist_ok=True)
179
 
180
  extracted_text = extract_text_from_pdf(temp_pdf_path)
@@ -185,7 +203,8 @@ def process_pdf(pdf_file):
185
  moved_count = move_images(img_save_path, data_path)
186
  remove_low_size_images(data_path)
187
  remove_duplicate_images(data_path)
188
- retriever_engine = initialize_qdrant(temp_dir.name)
 
189
 
190
  return temp_dir, retriever_engine
191
 
@@ -199,13 +218,15 @@ def main():
199
  st.session_state.vectordatabase = None
200
 
201
  uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
 
202
  if uploaded_file is None:
203
  st.info("Please upload a PDF file.")
204
  else:
 
205
  st.info(f"Uploaded PDF: {uploaded_file.name}")
206
  if st.button("Process PDF"):
207
  with st.spinner("Processing PDF..."):
208
- temp_dir, st.session_state.retriever_engine = process_pdf(uploaded_file)
209
 
210
  st.success("PDF processed successfully!")
211
 
 
18
  from llama_index.llms.openai import OpenAI
19
  from llama_index.core import load_index_from_storage, get_response_synthesizer
20
  import tempfile
21
+ from qdrant_client import QdrantClient, models
22
+ import getpass
23
+
24
+ curr_user = getpass.getuser()
25
+ # from langchain.vectorstores import Chroma
26
+ # To connect to the same event-loop,
27
+ # allows async events to run on notebook
28
+
29
+ # import nest_asyncio
30
+
31
+ # nest_asyncio.apply()
32
+
33
+ from dotenv import load_dotenv
34
+ load_dotenv()
35
 
 
 
36
 
 
 
37
 
38
 
39
  def extract_text_from_pdf(pdf_path):
 
114
  except Exception as e:
115
  print(e)
116
  pass
117
+ # from langchain_chroma import Chroma
118
+ # import chromadb
119
+ def initialize_qdrant(temp_dir , file_name , user):
120
 
121
+ client = qdrant_client.QdrantClient(path=f"qdrant_mm_db_pipeline_{user}_{file_name}")
 
122
  # client = qdrant_client.QdrantClient(url = "http://localhost:2452")
123
+ # client = qdrant_client.QdrantClient(url="4b0af7be-d5b3-47ac-b215-128ebd6aa495.europe-west3-0.gcp.cloud.qdrant.io:6333", api_key="CO1sNGLmC6R_Q45qSIUxBSX8sxwHud4MCm4as_GTI-vzQqdUs-bXqw",)
124
+ # client = qdrant_client.AsyncQdrantClient(location = ":memory:")
125
 
126
  if "vectordatabase" not in st.session_state or not st.session_state.vectordatabase:
127
+
128
+ # text_store = client.create_collection(f"text_collection_pipeline_{user}_{file_name}" )
129
+ # image_store = client.create_collection(f"image_collection_pipeline_{user}_{file_name}" )
130
+
131
+
132
+ text_store = QdrantVectorStore( client = client , collection_name=f"text_collection_pipeline_{user}_{file_name}" )
133
+ image_store = QdrantVectorStore(client = client , collection_name=f"image_collection_pipeline_{user}_{file_name}")
134
  storage_context = StorageContext.from_defaults(vector_store=text_store, image_store=image_store)
135
+ documents = SimpleDirectoryReader(os.path.join(temp_dir, f"my_own_data_{user}_{file_name}")).load_data()
136
  index = MultiModalVectorStoreIndex.from_documents(documents, storage_context=storage_context)
137
+
138
  st.session_state.vectordatabase = index
139
  else :
140
  index = st.session_state.vectordatabase
 
170
  )
171
  qa_tmpl = PromptTemplate(qa_tmpl_str)
172
 
173
+ llm = OpenAI(model="gpt-4o", temperature=0)
174
  response_synthesizer = get_response_synthesizer(response_mode="refine", text_qa_template=qa_tmpl, llm=llm)
175
 
176
  response = response_synthesizer.synthesize(query, nodes=retrieval_results)
 
183
 
184
  return response, retrieved_image_path_list
185
 
186
+ def process_pdf(pdf_file , user):
187
+ import pdb; pdb.set_trace()
188
  temp_dir = tempfile.TemporaryDirectory()
189
  temp_pdf_path = os.path.join(temp_dir.name, pdf_file.name)
190
  with open(temp_pdf_path, "wb") as f:
191
  f.write(pdf_file.getvalue())
192
 
193
+ data_path = os.path.join(temp_dir.name, f"my_own_data_{user}_{os.path.splitext(pdf_file.name)[0]}")
194
  os.makedirs(data_path , exist_ok=True)
195
+ img_save_path = os.path.join(temp_dir.name, f"extracted_images_{user}_{os.path.splitext(pdf_file.name)[0]}")
196
  os.makedirs(img_save_path , exist_ok=True)
197
 
198
  extracted_text = extract_text_from_pdf(temp_pdf_path)
 
203
  moved_count = move_images(img_save_path, data_path)
204
  remove_low_size_images(data_path)
205
  remove_duplicate_images(data_path)
206
+ import pdb; pdb.set_trace()
207
+ retriever_engine = initialize_qdrant(temp_dir.name , os.path.splitext(pdf_file.name)[0] , curr_user)
208
 
209
  return temp_dir, retriever_engine
210
 
 
218
  st.session_state.vectordatabase = None
219
 
220
  uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
221
+ # import pdb; pdb.set_trace()
222
  if uploaded_file is None:
223
  st.info("Please upload a PDF file.")
224
  else:
225
+ # import pdb; pdb.set_trace()
226
  st.info(f"Uploaded PDF: {uploaded_file.name}")
227
  if st.button("Process PDF"):
228
  with st.spinner("Processing PDF..."):
229
+ temp_dir, st.session_state.retriever_engine = process_pdf(uploaded_file , curr_user)
230
 
231
  st.success("PDF processed successfully!")
232