import os

import utils.constants as constants_utils
import utils.data_loader as data_loader_utils
import utils.langchain_utils as langchain_utils
import utils.weather as weather_utils
import utils.mandi_price as mandi_utils
import utils.translator as translator_utils

from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader, GPTListIndex
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

import warnings
warnings.filterwarnings('ignore')


class KKMS_KSSW:
    def __init__(self):
        self.index = None
        self.documents = []
        self.response = None

        # Instantiate langchain_utils class object
        self.langchain_utils_obj = langchain_utils.LANGCHAIN_UTILS()
        # Instantiate Mandi Price utils class object
        self.mandi_utils_obj = mandi_utils.MANDI_PRICE()
        # Instantiate Weather class object
        self.weather_utils_obj = weather_utils.WEATHER()
        # Instantiate translator_utils class object
        self.translator_utils_obj = translator_utils.TRANSLATOR()

        if not os.path.exists(constants_utils.DATA_PATH):
            os.makedirs(constants_utils.DATA_PATH)
        if not os.path.exists(constants_utils.OUTPUT_PATH):
            os.makedirs(constants_utils.OUTPUT_PATH)

    # Initialize index (vector store)
    def initialize_index(self, save_index_to_disk=True, index_type='GPTSimpleVectorIndex'):
        # Load the index from the saved index.json file
        if os.path.exists(constants_utils.INDEX_FILENAME):
            print(f'Loading pre-generated index from: {constants_utils.INDEX_FILENAME}')
            self.index = self.langchain_utils_obj.load_index(
                index_type='GPTSimpleVectorIndex',
                filepath=constants_utils.INDEX_FILENAME
            )
        else:
            # Load data from Docs
            if os.path.exists(constants_utils.DATA_PATH):
                doc_documents = SimpleDirectoryReader(constants_utils.DATA_PATH).load_data()
                self.documents = doc_documents[:]

            # Load data from PDFs only
            # pdf_documents = data_loader_utils.load_document(doc_type='pdf', doc_filepath=doc_filepath)

            # Load data from URLs & append it to the documents that we read from PDFs
            # url_documents = data_loader_utils.load_document(doc_type='url', urls=urls)
            # self.documents.extend(url_documents)

            # Build the Vector store for docs
            if index_type == 'GPTSimpleVectorIndex':
                self.index = GPTSimpleVectorIndex.from_documents(self.documents)
            elif index_type == 'FAISS':
                self.index = FAISS.from_documents(
                    self.documents,
                    OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
                )

            def merge_documents_from_different_sources(doc_documents, url_documents):
                # Build the Vector store for docs
                doc_index = GPTSimpleVectorIndex.from_documents(doc_documents)
                # Build the Vector store for URLs
                url_index = GPTSimpleVectorIndex.from_documents(url_documents)

                # Set summary of each index
                doc_index.set_text("index_from_docs")
                url_index.set_text("index_from_urls")

                # Merge index of different data sources
                self.index = GPTListIndex([doc_index])
                self.index.insert(url_index)  # can also be passed directly as GPTListIndex([doc_index, url_index])

                return self.index

            if save_index_to_disk:
                # Save index to a index.json file
                print(f'Saving newly generated index: {constants_utils.INDEX_FILENAME}')
                if index_type == 'GPTSimpleVectorIndex':
                    self.index.save_to_disk(constants_utils.INDEX_FILENAME)
                elif index_type == 'FAISS':
                    self.index.save_local(constants_utils.INDEX_FILENAME)

    # Define query on index to retrieve the most relevant top K documents from the vector store
    def query(self,
              question,
              mode='default',
              response_mode="default",
              similarity_top_k=1,
              required_keywords=[],
              exclude_keywords=[],
              verbose=False
              ):
        '''
        Args:
            mode: can be any of [default, embedding]
            response_mode: can be any of [default, compact, tree_summarize]
        '''
        # Querying the index
        self.response = self.index.query(question,
                                         mode=mode,
                                         response_mode=response_mode,
                                         similarity_top_k=similarity_top_k,
                                         required_keywords=required_keywords,
                                         exclude_keywords=exclude_keywords,
                                         verbose=verbose)

        return self.response
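

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It assumes the
# OPENAI_API_KEY environment variable is set and that utils.constants defines
# DATA_PATH, OUTPUT_PATH, and INDEX_FILENAME as referenced above; the sample
# question below is hypothetical.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    kkms_kssw_obj = KKMS_KSSW()

    # Build the index from documents under DATA_PATH, or load it from
    # INDEX_FILENAME if a previously saved index already exists
    kkms_kssw_obj.initialize_index(save_index_to_disk=True, index_type='GPTSimpleVectorIndex')

    # Retrieve the top-3 most relevant chunks and synthesize a compact answer
    response = kkms_kssw_obj.query(
        'Which crops are suitable for the kharif season?',
        mode='embedding',
        response_mode='compact',
        similarity_top_k=3
    )
    print(response)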