micbon commited on
Commit
f2cfbb6
·
1 Parent(s): 2e86579
Files changed (1) hide show
  1. app.py +0 -96
app.py DELETED
@@ -1,96 +0,0 @@
# Streamlit app: upload a PDF, index its text in a FAISS vector store, and
# answer questions about it through a RetrievalQA chain backed by GooglePalm.
#
# Streamlit re-executes this script on every interaction, so the expensive
# indexing step is skipped whenever the pickled index for the uploaded file
# already exists on disk.

from dotenv import load_dotenv

# Runs before the LLM client is constructed so values from a .env file are in
# the environment first (presumably the PaLM API key -- confirm).
load_dotenv()

import os
import pickle
import streamlit as st
from scanned_pdf_parser import get_text_from_scanned_pdf
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import GooglePalm
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

# Below this many extracted characters the PDF is treated as a scanned
# document and handed to the scanned-PDF parser instead.
MIN_EXTRACTED_CHARS = 10

PROMPT_TEMPLATE = """
<context>
{context}
</context>
Question: {question}
Assistant:"""

llm = GooglePalm(temperature=0.9)


def _extract_text(pdf_path, status):
    """Return the text of the PDF at *pdf_path*.

    Falls back to ``get_text_from_scanned_pdf`` when the regular loader
    yields fewer than ``MIN_EXTRACTED_CHARS`` characters. *status* is the
    Streamlit placeholder used for progress messages.
    """
    pages = PyPDFLoader(pdf_path).load()
    raw_text = "".join(page.page_content for page in pages)

    if len(raw_text) < MIN_EXTRACTED_CHARS:
        status.text("It looks like Scanned PDF, No worries converting it...⌛⌛⌛")
        raw_text = get_text_from_scanned_pdf(pdf_path)

    return raw_text


def _build_vector_store(raw_text, index_path, status):
    """Chunk *raw_text*, embed it into FAISS, and pickle it to *index_path*.

    Returns True when an index was written, False when *raw_text* produced
    no chunks (nothing is written in that case).
    """
    status.text("Splitting text into smaller chunks...⌛⌛⌛")
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=2000
    )
    docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(raw_text)]
    if not docs:
        # Nothing to embed; building a vector store from zero documents
        # would fail, so let the caller report it instead.
        return False

    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
    status.text("Storing data into Vector Database...⌛⌛⌛")
    vectorstore = FAISS.from_documents(docs, embeddings)

    # Save the FAISS index to a pickle file
    with open(index_path, "wb") as f:
        pickle.dump(vectorstore, f)
    return True


def _load_vector_store(index_path):
    """Load the pickled vector store previously written to *index_path*."""
    # SECURITY: pickle.load executes arbitrary code from the file. This is
    # only acceptable because the file is produced by this app itself; never
    # point it at an index obtained from an untrusted source. Consider
    # FAISS.save_local / FAISS.load_local as a replacement format.
    with open(index_path, "rb") as f:
        return pickle.load(f)


def _answer(vector_store, query):
    """Run *query* through a RetrievalQA chain over *vector_store*.

    Returns the chain's result mapping; the answer text is under "result".
    """
    prompt = PromptTemplate(
        template=PROMPT_TEMPLATE, input_variables=["context", "question"]
    )
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 1}),
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt}
    )
    return chain({"query": query})


st.title("PDF Query Tool")
st.write("Upload your PDF and ask question from it")

uploaded_file = st.file_uploader("Choose a PDF file")
main_placeholder = st.empty()
second_placeholder = st.empty()


if uploaded_file:
    # Use only the base name so a crafted upload name cannot address a path
    # outside the working directory.
    filename = os.path.basename(uploaded_file.name)
    if not filename.lower().endswith('.pdf'):
        main_placeholder.warning("Choose PDF Document !!!")
        # st.stop() is Streamlit's supported way to end the current run;
        # exit() raises SystemExit inside the script runner.
        st.stop()

    index_path = f'vector_store_{filename}.pkl'

    # Key the cache on the index, not on the saved PDF: if an earlier run
    # saved the PDF but failed before writing the index, it is rebuilt here.
    if not os.path.exists(index_path):
        main_placeholder.text("Data Loading Started...⌛⌛⌛")
        with open(filename, 'wb') as f:
            f.write(uploaded_file.getbuffer())

        raw_text = _extract_text(filename, main_placeholder)
        if not _build_vector_store(raw_text, index_path, main_placeholder):
            main_placeholder.error("No text could be extracted from this PDF.")
            st.stop()

        main_placeholder.text("Data Loading Completed...✅✅✅")

    query = second_placeholder.text_input("Question:")
    if query:
        if os.path.exists(index_path):
            vector_store = _load_vector_store(index_path)

            with st.spinner("Searching for the answer..."):
                result = _answer(vector_store, query)
            st.header("Answer")
            st.write(result["result"])
        else:
            main_placeholder.error("No index found for this PDF. Please upload it again.")