Nirav-Khanpara committed
Commit df697c8 · 1 Parent(s): b63ae73

Upload 2 files
Files changed (2)
  1. app.py +12 -7
  2. scanned_pdf_parser.py +10 -0
app.py CHANGED
@@ -4,7 +4,7 @@ load_dotenv()
 import os
 import pickle
 import streamlit as st
-from scan_pdf_parser import get_text_from_scanned_pdf
+from scanned_pdf_parser import get_text_from_scanned_pdf
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.llms import GooglePalm
 from langchain.prompts import PromptTemplate
@@ -16,7 +16,8 @@ from langchain.docstore.document import Document
 
 llm = GooglePalm(temperature=0.9)
 
-st.title("Query PDF Tool")
+st.title("PDF Query Tool")
+st.write("Upload your PDF and ask question from it")
 
 uploaded_file = st.file_uploader("Choose a PDF file")
 main_placeholder = st.empty()
@@ -24,8 +25,12 @@ second_placeholder = st.empty()
 
 
 if uploaded_file:
-    if not os.path.exists(uploaded_file.name):
-        main_placeholder.text("Data Loading...Started...⌛⌛⌛")
+    filename = uploaded_file.name
+    if not filename.endswith(('.pdf', '.PDF')):
+        main_placeholder.warning("Choose PDF Document !!!")
+        exit()
+    elif not os.path.exists(uploaded_file.name):
+        main_placeholder.text("Data Loading Started...⌛⌛⌛")
         with open(f'{uploaded_file.name}', 'wb') as f:
             f.write(uploaded_file.getbuffer())
 
@@ -40,7 +45,7 @@ if uploaded_file:
         main_placeholder.text("It looks like Scanned PDF, No worries converting it...⌛⌛⌛")
         raw_text = get_text_from_scanned_pdf(uploaded_file.name)
 
-    main_placeholder.text("Text Splitter...Started...✅✅✅")
+    main_placeholder.text("Splitting text into smaller chunks...⌛⌛⌛")
     text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=2000
@@ -50,14 +55,14 @@ if uploaded_file:
     docs = [Document(page_content=t) for t in texts]
 
     embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
-    main_placeholder.text("Embedding Vector Started Building...✅✅✅")
+    main_placeholder.text("Storing data into Vector Database...⌛⌛⌛")
     vectorstore = FAISS.from_documents(docs, embeddings)
 
     # Save the FAISS index to a pickle file
     with open(f'vector_store_{uploaded_file.name}.pkl', "wb") as f:
         pickle.dump(vectorstore, f)
 
-    main_placeholder.text("Data Loading...Completed...✅✅✅")
+    main_placeholder.text("Data Loading Completed...✅✅✅")
 
 
 query = second_placeholder.text_input("Question:")
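The hunks above end with the FAISS store being pickled to `vector_store_<filename>.pkl` and a question box being shown; the answering code itself is not part of this diff. A minimal sketch of the read path, assuming only what the diff shows (the pickle naming scheme and a plain similarity search; `answer_from_saved_store` and the example file name are hypothetical):

```python
import pickle

def answer_from_saved_store(pkl_path: str, query: str, k: int = 4):
    # Hypothetical helper: reload the FAISS vector store that app.py pickled
    with open(pkl_path, "rb") as f:
        vectorstore = pickle.load(f)
    # similarity_search returns the k chunks closest to the query embedding
    return vectorstore.similarity_search(query, k=k)

# Example (made-up file name):
# docs = answer_from_saved_store("vector_store_report.pdf.pkl", "What is the total revenue?")
```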
scanned_pdf_parser.py ADDED
@@ -0,0 +1,10 @@
+import pytesseract
+from pdf2image import convert_from_path
+
+
+def get_text_from_scanned_pdf(pdf_path):
+    text = ''
+    images = convert_from_path(pdf_path)
+    for img in images:
+        text += pytesseract.image_to_string(img)
+    return text
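Note that the new parser relies on system binaries, not just pip packages: pytesseract needs the Tesseract executable and pdf2image needs Poppler installed on the host. A minimal usage sketch (the file name is made up for illustration):

```python
# Hypothetical usage of the new helper; 'invoice_scan.pdf' is a placeholder name.
from scanned_pdf_parser import get_text_from_scanned_pdf

text = get_text_from_scanned_pdf("invoice_scan.pdf")
print(text[:500])  # preview the first 500 OCR'd characters
```

OCR quality depends on the rendering resolution; pdf2image's `convert_from_path` accepts a `dpi` argument if higher-resolution page images are needed.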