import streamlit as st import os import requests import re from langchain_community.document_loaders import PyPDFLoader from langchain.docstore.document import Document from langchain.text_splitter import CharacterTextSplitter from langchain_community.embeddings import HuggingFaceEmbeddings from langchain.vectorstores.faiss import FAISS from langchain.prompts.prompt import PromptTemplate from langchain_community.llms import LlamaCpp from langchain.chains import RetrievalQA # Upload pdf file into 'pdf-data' folder if it does not exist def fn_upload_pdf(mv_pdf_input_file, mv_processing_message): """Upload pdf file into 'pdf-data' folder if it does not exist""" lv_file_name = mv_pdf_input_file.name if not os.path.exists("pdf-data"): os.makedirs("pdf-data") lv_temp_file_path = os.path.join("pdf-data",lv_file_name) if os.path.exists(lv_temp_file_path): print("File already available") fn_display_user_messages("File already available","Warning", mv_processing_message) else: with open(lv_temp_file_path,"wb") as lv_file: lv_file.write(mv_pdf_input_file.getbuffer()) print("Step1: PDF uploaded successfully at -> " + lv_temp_file_path) fn_display_user_messages("Step1: PDF uploaded successfully at -> " + lv_temp_file_path, "Info", mv_processing_message) # Create Vector DB of uploaded PDF def fn_create_vector_db(mv_pdf_input_file, mv_processing_message): """Create Vector DB of uploaded PDF""" lv_file_name = mv_pdf_input_file.name[:-4] + ".vectorstore" if not os.path.exists(os.path.join("vectordb","fiaas")): os.makedirs(os.path.join("vectordb","fiaas")) lv_temp_file_path = os.path.join(os.path.join("vectordb","fiaas"),lv_file_name) lv_embeddings = HuggingFaceEmbeddings( model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={'device': 'cpu'} ) if os.path.exists(lv_temp_file_path): print("VectorDB already available for uploaded file") fn_display_user_messages("VectorDB already available for uploaded file","Warning", mv_processing_message) lv_vector_store = FAISS.load_local(lv_temp_file_path, lv_embeddings,allow_dangerous_deserialization=True) return lv_vector_store else: lv_temp_pdf_file_path = os.path.join("pdf-data",mv_pdf_input_file.name) # -- Loading PDF Data lv_pdf_loader = PyPDFLoader(lv_temp_pdf_file_path) lv_pdf_content = lv_pdf_loader.load() # -- Define patterns with flexibility pattern1 = r"(\w+)-\n(\w+)" # Match hyphenated words separated by a line break pattern2 = r"(?