Spaces:
Sleeping
Sleeping
viboognesh
committed on
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
@@ -18,12 +18,22 @@ from llama_index.core.query_engine import SimpleMultiModalQueryEngine
|
|
18 |
from llama_index.llms.openai import OpenAI
|
19 |
from llama_index.core import load_index_from_storage, get_response_synthesizer
|
20 |
import tempfile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
-
# from dotenv import load_dotenv
|
23 |
-
# load_dotenv()
|
24 |
|
25 |
-
OPENAI_API_KEY = "sk-proj-beorroDjV4FeoL6OAzbET3BlbkFJT4WcMiP0x30GxzmbpIEC"
|
26 |
-
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
|
27 |
|
28 |
|
29 |
def extract_text_from_pdf(pdf_path):
|
@@ -104,20 +114,27 @@ def remove_duplicate_images(data_path) :
|
|
104 |
except Exception as e:
|
105 |
print(e)
|
106 |
pass
|
107 |
-
|
108 |
-
|
|
|
109 |
|
110 |
-
|
111 |
-
# client = qdrant_client.QdrantClient(host = "192.168.0.1" , port = 2401 , https = True)
|
112 |
# client = qdrant_client.QdrantClient(url = "http://localhost:2452")
|
113 |
-
client = qdrant_client.QdrantClient(url="4b0af7be-d5b3-47ac-b215-128ebd6aa495.europe-west3-0.gcp.cloud.qdrant.io:6333", api_key="CO1sNGLmC6R_Q45qSIUxBSX8sxwHud4MCm4as_GTI-vzQqdUs-bXqw",)
|
|
|
114 |
|
115 |
if "vectordatabase" not in st.session_state or not st.session_state.vectordatabase:
|
116 |
-
|
117 |
-
|
|
|
|
|
|
|
|
|
|
|
118 |
storage_context = StorageContext.from_defaults(vector_store=text_store, image_store=image_store)
|
119 |
-
documents = SimpleDirectoryReader(os.path.join(temp_dir, "
|
120 |
index = MultiModalVectorStoreIndex.from_documents(documents, storage_context=storage_context)
|
|
|
121 |
st.session_state.vectordatabase = index
|
122 |
else :
|
123 |
index = st.session_state.vectordatabase
|
@@ -153,7 +170,7 @@ def retrieve_and_query(query, retriever_engine):
|
|
153 |
)
|
154 |
qa_tmpl = PromptTemplate(qa_tmpl_str)
|
155 |
|
156 |
-
llm = OpenAI(model="gpt-4o
|
157 |
response_synthesizer = get_response_synthesizer(response_mode="refine", text_qa_template=qa_tmpl, llm=llm)
|
158 |
|
159 |
response = response_synthesizer.synthesize(query, nodes=retrieval_results)
|
@@ -166,15 +183,16 @@ def retrieve_and_query(query, retriever_engine):
|
|
166 |
|
167 |
return response, retrieved_image_path_list
|
168 |
|
169 |
-
def process_pdf(pdf_file):
|
|
|
170 |
temp_dir = tempfile.TemporaryDirectory()
|
171 |
temp_pdf_path = os.path.join(temp_dir.name, pdf_file.name)
|
172 |
with open(temp_pdf_path, "wb") as f:
|
173 |
f.write(pdf_file.getvalue())
|
174 |
|
175 |
-
data_path = os.path.join(temp_dir.name, "
|
176 |
os.makedirs(data_path , exist_ok=True)
|
177 |
-
img_save_path = os.path.join(temp_dir.name, "
|
178 |
os.makedirs(img_save_path , exist_ok=True)
|
179 |
|
180 |
extracted_text = extract_text_from_pdf(temp_pdf_path)
|
@@ -185,7 +203,8 @@ def process_pdf(pdf_file):
|
|
185 |
moved_count = move_images(img_save_path, data_path)
|
186 |
remove_low_size_images(data_path)
|
187 |
remove_duplicate_images(data_path)
|
188 |
-
|
|
|
189 |
|
190 |
return temp_dir, retriever_engine
|
191 |
|
@@ -199,13 +218,15 @@ def main():
|
|
199 |
st.session_state.vectordatabase = None
|
200 |
|
201 |
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
|
|
|
202 |
if uploaded_file is None:
|
203 |
st.info("Please upload a PDF file.")
|
204 |
else:
|
|
|
205 |
st.info(f"Uploaded PDF: {uploaded_file.name}")
|
206 |
if st.button("Process PDF"):
|
207 |
with st.spinner("Processing PDF..."):
|
208 |
-
temp_dir, st.session_state.retriever_engine = process_pdf(uploaded_file)
|
209 |
|
210 |
st.success("PDF processed successfully!")
|
211 |
|
|
|
18 |
from llama_index.llms.openai import OpenAI
|
19 |
from llama_index.core import load_index_from_storage, get_response_synthesizer
|
20 |
import tempfile
|
21 |
+
from qdrant_client import QdrantClient, models
|
22 |
+
import getpass
|
23 |
+
|
24 |
+
curr_user = getpass.getuser()
|
25 |
+
# from langchain.vectorstores import Chroma
|
26 |
+
# To connect to the same event-loop,
|
27 |
+
# allows async events to run on notebook
|
28 |
+
|
29 |
+
# import nest_asyncio
|
30 |
+
|
31 |
+
# nest_asyncio.apply()
|
32 |
+
|
33 |
+
from dotenv import load_dotenv
|
34 |
+
load_dotenv()
|
35 |
|
|
|
|
|
36 |
|
|
|
|
|
37 |
|
38 |
|
39 |
def extract_text_from_pdf(pdf_path):
|
|
|
114 |
except Exception as e:
|
115 |
print(e)
|
116 |
pass
|
117 |
+
# from langchain_chroma import Chroma
|
118 |
+
# import chromadb
|
119 |
+
def initialize_qdrant(temp_dir , file_name , user):
|
120 |
|
121 |
+
client = qdrant_client.QdrantClient(path=f"qdrant_mm_db_pipeline_{user}_{file_name}")
|
|
|
122 |
# client = qdrant_client.QdrantClient(url = "http://localhost:2452")
|
123 |
+
# client = qdrant_client.QdrantClient(url="4b0af7be-d5b3-47ac-b215-128ebd6aa495.europe-west3-0.gcp.cloud.qdrant.io:6333", api_key="CO1sNGLmC6R_Q45qSIUxBSX8sxwHud4MCm4as_GTI-vzQqdUs-bXqw",)
|
124 |
+
# client = qdrant_client.AsyncQdrantClient(location = ":memory:")
|
125 |
|
126 |
if "vectordatabase" not in st.session_state or not st.session_state.vectordatabase:
|
127 |
+
|
128 |
+
# text_store = client.create_collection(f"text_collection_pipeline_{user}_{file_name}" )
|
129 |
+
# image_store = client.create_collection(f"image_collection_pipeline_{user}_{file_name}" )
|
130 |
+
|
131 |
+
|
132 |
+
text_store = QdrantVectorStore( client = client , collection_name=f"text_collection_pipeline_{user}_{file_name}" )
|
133 |
+
image_store = QdrantVectorStore(client = client , collection_name=f"image_collection_pipeline_{user}_{file_name}")
|
134 |
storage_context = StorageContext.from_defaults(vector_store=text_store, image_store=image_store)
|
135 |
+
documents = SimpleDirectoryReader(os.path.join(temp_dir, f"my_own_data_{user}_{file_name}")).load_data()
|
136 |
index = MultiModalVectorStoreIndex.from_documents(documents, storage_context=storage_context)
|
137 |
+
|
138 |
st.session_state.vectordatabase = index
|
139 |
else :
|
140 |
index = st.session_state.vectordatabase
|
|
|
170 |
)
|
171 |
qa_tmpl = PromptTemplate(qa_tmpl_str)
|
172 |
|
173 |
+
llm = OpenAI(model="gpt-4o", temperature=0)
|
174 |
response_synthesizer = get_response_synthesizer(response_mode="refine", text_qa_template=qa_tmpl, llm=llm)
|
175 |
|
176 |
response = response_synthesizer.synthesize(query, nodes=retrieval_results)
|
|
|
183 |
|
184 |
return response, retrieved_image_path_list
|
185 |
|
186 |
+
def process_pdf(pdf_file , user):
|
187 |
+
import pdb; pdb.set_trace()
|
188 |
temp_dir = tempfile.TemporaryDirectory()
|
189 |
temp_pdf_path = os.path.join(temp_dir.name, pdf_file.name)
|
190 |
with open(temp_pdf_path, "wb") as f:
|
191 |
f.write(pdf_file.getvalue())
|
192 |
|
193 |
+
data_path = os.path.join(temp_dir.name, f"my_own_data_{user}_{os.path.splitext(pdf_file.name)[0]}")
|
194 |
os.makedirs(data_path , exist_ok=True)
|
195 |
+
img_save_path = os.path.join(temp_dir.name, f"extracted_images_{user}_{os.path.splitext(pdf_file.name)[0]}")
|
196 |
os.makedirs(img_save_path , exist_ok=True)
|
197 |
|
198 |
extracted_text = extract_text_from_pdf(temp_pdf_path)
|
|
|
203 |
moved_count = move_images(img_save_path, data_path)
|
204 |
remove_low_size_images(data_path)
|
205 |
remove_duplicate_images(data_path)
|
206 |
+
import pdb; pdb.set_trace()
|
207 |
+
retriever_engine = initialize_qdrant(temp_dir.name , os.path.splitext(pdf_file.name)[0] , curr_user)
|
208 |
|
209 |
return temp_dir, retriever_engine
|
210 |
|
|
|
218 |
st.session_state.vectordatabase = None
|
219 |
|
220 |
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
|
221 |
+
# import pdb; pdb.set_trace()
|
222 |
if uploaded_file is None:
|
223 |
st.info("Please upload a PDF file.")
|
224 |
else:
|
225 |
+
# import pdb; pdb.set_trace()
|
226 |
st.info(f"Uploaded PDF: {uploaded_file.name}")
|
227 |
if st.button("Process PDF"):
|
228 |
with st.spinner("Processing PDF..."):
|
229 |
+
temp_dir, st.session_state.retriever_engine = process_pdf(uploaded_file , curr_user)
|
230 |
|
231 |
st.success("PDF processed successfully!")
|
232 |
|