"""Streamlit app: chat with uploaded PDFs via a Chroma vector store and Groq LLMs."""

import os
import shutil

import chromadb
import streamlit as st
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_groq import ChatGroq
from sentence_transformers import SentenceTransformer

import config as cf
from source.utils.data_processing import ProcessDocs
from source.utils.store_data import get_vector_store, check_pdfs_chromadb, save_uploaded_files
from source.utils.process_data import get_pdf_text, get_text_chunks

# Groq model names the UI is allowed to request.
SUPPORTED_MODELS = (
    "llama3-8b-8192",
    "llama3-70b-8192",
    "mixtral-8x7b-32768",
    "gemma-7b-it",
)

# Last LLM instance built by get_conversational_chain (kept module-global for
# backward compatibility with any external code that inspects it).
llm = None


def get_conversational_chain(model):
    """Build a "stuff" question-answering chain backed by a Groq chat model.

    Args:
        model: One of ``SUPPORTED_MODELS``.

    Returns:
        A LangChain QA chain that answers a question over supplied documents.

    Raises:
        ValueError: If ``model`` is not a supported model name.  (The original
            code silently fell through with ``llm`` unset or stale.)
    """
    global llm
    if model not in SUPPORTED_MODELS:
        raise ValueError(f"Unsupported model: {model!r}")
    # All supported models use identical construction, so one call suffices.
    llm = ChatGroq(temperature=0, model_name=model)
    return load_qa_chain(llm, chain_type="stuff")


def user_input(user_question, model):
    """Answer ``user_question`` from the persisted Chroma collection and render the reply.

    Embeds the question, retrieves the 5 nearest chunks from the
    ``Chromadb_pdf`` collection, runs them through the QA chain, and writes
    the answer plus source metadata to the Streamlit page.

    Args:
        user_question: Free-text question typed by the user.
        model: Groq model name forwarded to :func:`get_conversational_chain`.
    """
    embedding_model = SentenceTransformer("all-mpnet-base-v2")
    chain = get_conversational_chain(model)
    query_embedding = embedding_model.encode(user_question).tolist()

    client = chromadb.PersistentClient("chromadb")
    collection = client.get_collection("Chromadb_pdf")
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=5,
        include=["distances", "metadatas", "documents"],
    )

    # Guard both the outer list and the per-query inner list being empty.
    if not results["documents"] or not results["documents"][0]:
        st.write("No results found.")
        return

    docs = []
    sources = []  # (pdf_name, page_number_str) for each retrieved chunk
    for text, metadata in zip(results["documents"][0], results["metadatas"][0]):
        docs.append(
            Document(
                page_content=text,
                metadata={
                    "source": metadata["pdf_name"],
                    "page": metadata["page_number"],
                },
            )
        )
        sources.append((metadata["pdf_name"], str(metadata["page_number"])))

    response = chain(
        {"input_documents": docs, "question": user_question},
        return_only_outputs=False,
    )
    st.write("Reply:", response["output_text"])

    # Report every contributing PDF (deduplicated, order preserved) — the
    # original showed only the last chunk's pdf_name.
    pdf_names = ",".join(dict.fromkeys(name for name, _ in sources))
    pages = ",".join(page for _, page in sources)
    st.write("Metadata: ", f"PDF Name: {pdf_names}, Page Numbers: {pages}")


def main():
    """Streamlit entry point: model picker, question box, and PDF-upload sidebar."""
    st.set_page_config("Chat PDF")
    model = st.selectbox("Select Model", list(SUPPORTED_MODELS))
    st.header("Chat with PDF after Uploading")

    user_question = st.text_input("Ask a Question from the PDF Files")
    if user_question:
        db_obj = ProcessDocs(cf.db_collection_name)
        response = db_obj.retrieval_qa(user_question, model)
        st.write("Response:", response)

    with st.sidebar:
        st.title("Menu:")
        pdf_docs = st.file_uploader(
            "Upload your PDF Files and Click on the Submit & Process Button",
            accept_multiple_files=True,
        )
        db_obj = ProcessDocs(cf.db_collection_name)
        if st.button("Submit & Process"):
            # file_uploader yields None until the user uploads something;
            # iterating it directly raised TypeError in the original.
            new_files = [doc.name for doc in pdf_docs] if pdf_docs else []
            if new_files:
                # Start from a clean download directory so stale PDFs from a
                # previous run are not reprocessed.
                if os.path.exists(cf.pdf_download_path):
                    shutil.rmtree(cf.pdf_download_path)
                os.makedirs(cf.pdf_download_path)
                save_uploaded_files(pdf_docs, cf.pdf_download_path)
                with st.spinner("Processing..."):
                    new_unique_files = db_obj.identify_new_uploaded_files()
                    loaded_docs = db_obj.create_pdf_docx_loader(new_unique_files, model)
                    splits = db_obj.split_documents(loaded_docs)
                    db_obj.vector_store(splits)
                st.success("Done")
            else:
                st.success("No new files to process")


if __name__ == "__main__":
    main()