# Hugging Face Spaces page header (status: Running)
import streamlit as st | |
from dotenv import load_dotenv | |
import sys | |
from PyPDF2 import PdfReader | |
from langchain_community.llms import OpenAI | |
from langchain_community.chat_models import ChatOpenAI | |
from langchain_text_splitters import CharacterTextSplitter | |
from langchain_openai.embeddings import OpenAIEmbeddings | |
from langchain_community.embeddings import HuggingFaceInstructEmbeddings | |
from langchain_community.vectorstores import FAISS | |
from langchain.memory import ConversationBufferMemory | |
from langchain.chains import ConversationalRetrievalChain | |
from langchain.retrievers import ContextualCompressionRetriever | |
from langchain.retrievers.document_compressors import LLMChainExtractor | |
from langchain.retrievers import MultiQueryRetriever | |
from langchain.chains import RetrievalQA | |
from langchain.llms import OpenAI , Cohere | |
def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf_docs: The PDF source handed straight to ``PdfReader``; in this
            app it is the object returned by ``st.file_uploader``.

    Returns:
        The text of all pages joined in page order, with no separator
        inserted between pages.
    """
    pdf_reader = PdfReader(pdf_docs)
    # Build the result with a single join instead of growing a string with
    # ``+=`` per page. ``or ""`` is a defensive guard so that a page whose
    # extraction yields nothing cannot break the concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
def get_text_chunks(text):
    """Split raw text into overlapping chunks.

    The text is split on newlines by a ``CharacterTextSplitter`` configured
    for chunks of up to 1000 characters (measured with ``len``) that overlap
    by 200 characters.

    Args:
        text: The full document text.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
        "is_separator_regex": False,  # treat the separator as a literal string
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
def get_vectorstore(text_chunks):
    """Embed the text chunks with OpenAI embeddings and index them in FAISS.

    Args:
        text_chunks: The chunk strings to embed.

    Returns:
        The FAISS vector store built from ``text_chunks``.
    """
    # Alternative embedding backend noted by the original author:
    #   HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
    embedder = OpenAIEmbeddings()
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)
def ll_retriver(vectorstore):
    """Wrap the vector store's retriever in a ``MultiQueryRetriever``.

    Args:
        vectorstore: A vector store exposing ``as_retriever()``.

    Returns:
        The ``MultiQueryRetriever`` built from the store's retriever and an
        ``OpenAI`` LLM created with ``temperature=0``.
    """
    query_llm = OpenAI(temperature=0)
    base_retriever = vectorstore.as_retriever()
    return MultiQueryRetriever.from_llm(retriever=base_retriever, llm=query_llm)
def chain(llm_based_retriever):
    """Build a "stuff"-type ``RetrievalQA`` chain over the given retriever.

    Args:
        llm_based_retriever: The retriever the chain pulls documents from.

    Returns:
        The ``RetrievalQA`` chain, answering with a ``Cohere`` LLM created
        with ``temperature=0``.
    """
    answer_llm = Cohere(temperature=0)
    qa_settings = {
        "llm": answer_llm,
        "chain_type": "stuff",
        "retriever": llm_based_retriever,
    }
    return RetrievalQA.from_chain_type(**qa_settings)
def _build_qa_chain(pdf_file):
    """Run the PDF -> text -> chunks -> vector store -> QA chain pipeline.

    Returns the QA chain, or ``None`` when the PDF produced no text chunks
    (in which case no vector store is built).
    """
    raw_text = get_pdf_text(pdf_file)
    text_chunks = get_text_chunks(raw_text)
    if not text_chunks:
        # Nothing to embed; building an index from zero texts would fail.
        return None
    vectorstore = get_vectorstore(text_chunks)
    return chain(ll_retriver(vectorstore))


def main():
    """Streamlit entry point: upload a PDF, process it, then ask questions.

    The sidebar holds the uploader and the "Process" button. Processing
    stores a QA chain in ``st.session_state.Q_A_Chain``; once a chain is
    present, the page shows a question box and writes the chain's
    ``'result'`` for each submitted question.
    """
    load_dotenv()
    st.set_page_config(page_title = "Chat with a PDFs",page_icon=":books:")

    # Initialise session keys once so they survive Streamlit reruns.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "Q_A_Chain" not in st.session_state:
        st.session_state.Q_A_Chain = None

    st.header("Chat with PDF :books:")

    with st.sidebar:
        st.subheader("Upload your PDF")
        pdf_docs = st.file_uploader("Upload your PDF here then Process")
        if st.button("Process"):
            if pdf_docs is None:
                # The uploader yields None until a file is chosen; passing
                # that on to the PDF reader would crash the app.
                st.warning("Please upload a PDF before clicking Process.")
            else:
                with st.spinner("Processing"):
                    qa_chain = _build_qa_chain(pdf_docs)
                if qa_chain is None:
                    # Any previously built chain is left in place.
                    st.error("No extractable text was found in this PDF.")
                else:
                    st.session_state.Q_A_Chain = qa_chain
                    st.success("PDF processed successfully, you can now ask Questions.")

    if st.session_state.Q_A_Chain:
        question = st.text_input("Ask a Question about your document:")
        if st.button("Submit Question"):
            if question:
                with st.spinner("Getting answer..."):
                    docs = st.session_state.Q_A_Chain({"query":question})
                    st.write(docs['result'])
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()