Jagannath95 committed on
Commit
494a30d
·
verified ·
1 Parent(s): 3d847bf

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +111 -0
  2. requirements.txt +14 -0
app.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ import sys
4
+ from PyPDF2 import PdfReader
5
+ from langchain_community.llms import OpenAI
6
+ from langchain_community.chat_models import ChatOpenAI
7
+ from langchain_text_splitters import CharacterTextSplitter
8
+ from langchain_openai.embeddings import OpenAIEmbeddings
9
+ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
10
+ from langchain_community.vectorstores import FAISS
11
+ from langchain.memory import ConversationBufferMemory
12
+ from langchain.chains import ConversationalRetrievalChain
13
+ from langchain.retrievers import ContextualCompressionRetriever
14
+ from langchain.retrievers.document_compressors import LLMChainExtractor
15
+ from langchain.retrievers import MultiQueryRetriever
16
+ from langchain.chains import RetrievalQA
17
+ from langchain.llms import OpenAI , Cohere
18
+
19
+
20
def get_pdf_text(pdf_docs):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_docs: Anything ``PdfReader`` accepts. In this app it is the
            uploaded-file object returned by ``st.file_uploader``.

    Returns:
        The extracted page texts concatenated in page order. Pages that
        yield no text contribute an empty string.
    """
    pdf_reader = PdfReader(pdf_docs)

    # NOTE(review): some PyPDF2 versions are reported to return None (rather
    # than "") for pages without a text layer, e.g. scanned images; ``or ""``
    # keeps such a page from raising a TypeError. ``join`` also avoids the
    # repeated string concatenation of the previous ``text +=`` loop.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
28
+
29
def get_text_chunks(text):
    """Split *text* into overlapping chunks with ``CharacterTextSplitter``.

    The splitter is configured to break on newline characters (treated as a
    literal separator, not a regex), with a chunk size of 1000 and an overlap
    of 200, both measured with ``len``.

    Args:
        text: The full document text to split.

    Returns:
        The result of ``CharacterTextSplitter.split_text`` for *text*.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
        "is_separator_regex": False,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
40
+
41
def get_vectorstore(text_chunks):
    """Build a FAISS vector store from text chunks using OpenAI embeddings.

    Args:
        text_chunks: The texts to embed and index.

    Returns:
        The store produced by ``FAISS.from_texts``.
    """
    # Alternative embedding backend that was left disabled here:
    #   HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
    embedding_model = OpenAIEmbeddings()
    return FAISS.from_texts(texts=text_chunks, embedding=embedding_model)
46
+
47
def ll_retriver(vectorstore):
    """Wrap the vector store's retriever in an LLM-driven ``MultiQueryRetriever``.

    Args:
        vectorstore: A store exposing ``as_retriever()``.

    Returns:
        The retriever built by ``MultiQueryRetriever.from_llm`` with an
        ``OpenAI`` LLM at temperature 0.
    """
    query_llm = OpenAI(temperature=0)
    base_retriever = vectorstore.as_retriever()
    return MultiQueryRetriever.from_llm(retriever=base_retriever, llm=query_llm)
54
+
55
def chain(llm_based_retriever):
    """Create a ``RetrievalQA`` chain on top of the given retriever.

    Args:
        llm_based_retriever: The retriever the chain pulls documents from.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type`` with chain type
        ``"stuff"`` and an ``OpenAI`` LLM at temperature 0.
    """
    answer_llm = OpenAI(temperature=0)
    chain_options = {
        "llm": answer_llm,
        "chain_type": "stuff",
        "retriever": llm_based_retriever,
    }
    return RetrievalQA.from_chain_type(**chain_options)
63
+
64
+
65
def _process_pdf(pdf_docs):
    """Build the QA chain for an uploaded PDF and store it in the session state."""
    # Clicking "Process" before uploading leaves pdf_docs as None, which
    # would otherwise crash inside PdfReader.
    if pdf_docs is None:
        st.warning("Please upload a PDF before processing.")
        return

    with st.spinner("Processing"):
        # Get the raw PDF text.
        raw_text = get_pdf_text(pdf_docs)

        # Split it into chunks.
        text_chunks = get_text_chunks(raw_text)

        # A PDF without extractable text (e.g. scanned images) produces no
        # chunks; stop here instead of failing inside the vector store.
        if not text_chunks:
            st.error("No text could be extracted from this PDF.")
            return

        # Create the vector store.
        vectorstore = get_vectorstore(text_chunks)

        # Build the retrieval QA chain and keep it across Streamlit reruns.
        llm_based_retriver = ll_retriver(vectorstore)
        st.session_state.Q_A_Chain = chain(llm_based_retriver)
        st.success("PDF processed successfully, you can now ask Questions.")


def main():
    """Run the Streamlit "Chat with PDF" app.

    The sidebar lets the user upload a PDF and process it into a retrieval
    QA chain, which is stored in ``st.session_state``. Once a chain exists,
    the main area shows a question box and writes the chain's ``result``
    for each submitted question.
    """
    # Load environment variables from a .env file (presumably the OpenAI API
    # key used by the LLM and embedding classes -- confirm for deployment).
    load_dotenv()

    st.set_page_config(page_title="Chat with a PDFs", page_icon=":books:")

    # Session-state slots are created once so later reads never fail.
    for key in ("conversation", "Q_A_Chain"):
        if key not in st.session_state:
            st.session_state[key] = None

    st.header("Chat with PDF :books:")

    with st.sidebar:
        st.subheader("Upload your PDF")
        # Restrict the uploader to PDFs so non-PDF files are rejected up front.
        pdf_docs = st.file_uploader("Upload your PDF here then Process", type="pdf")

        if st.button("Process"):
            _process_pdf(pdf_docs)

    if st.session_state.Q_A_Chain:
        question = st.text_input("Ask a Question about your document:")
        if st.button("Submit Question"):
            if question:
                with st.spinner("Getting answer..."):
                    # ``invoke`` replaces the deprecated ``Chain.__call__``.
                    docs = st.session_state.Q_A_Chain.invoke({"query": question})
                    st.write(docs['result'])
            else:
                st.warning("Please enter a question first.")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ openai
2
+ langchain
3
+ cohere
4
+ tiktoken
5
+ langchain-community
6
+ pypdf
7
+ langchain-openai
8
+ chromadb
9
+ streamlit
10
+ PyPDF2
11
+ langchain-text-splitters
12
+ faiss-cpu
13
+ sentence-transformers
14
+ InstructorEmbedding