import os

import utils.constants as constants_utils
import utils.data_loader as data_loader_utils
import utils.langchain_utils as langchain_utils
import utils.weather as weather_utils
import utils.mandi_price as mandi_utils
import utils.translator as translator_utils

from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader, GPTListIndex
from langchain.indexes import VectorstoreIndexCreator
from langchain.docstore.document import Document as LangchainDocument
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

import warnings
warnings.filterwarnings('ignore')



class KKMS_KSSW:
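	'''
		Wrapper class that builds a vector store index over the documents under
		constants_utils.DATA_PATH and exposes a query() helper on top of it,
		alongside Mandi price, weather, and translator utility objects.
	'''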
	def __init__(self):
		self.index = None
		self.documents = []
		self.response = None

		# Instantiate langchain_utils class object
		self.langchain_utils_obj = langchain_utils.LANGCHAIN_UTILS()
		# Instantiate Mandi Price utils class object
		self.mandi_utils_obj = mandi_utils.MANDI_PRICE()
		# Instantiate Weather class object
		self.weather_utils_obj = weather_utils.WEATHER()
		# Instantiate translator_utils class object
		self.translator_utils_obj = translator_utils.TRANSLATOR()

		if not os.path.exists(constants_utils.DATA_PATH):
			os.makedirs(constants_utils.DATA_PATH)
		if not os.path.exists(constants_utils.OUTPUT_PATH):
			os.makedirs(constants_utils.OUTPUT_PATH)


	# Initialize index (vector store)
	def initialize_index(self, save_index_to_disk=True, index_type='GPTSimpleVectorIndex'):
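		'''
			Build the vector store index from the documents under constants_utils.DATA_PATH,
			or load a previously saved index from constants_utils.INDEX_FILENAME if one exists.

			Args:
				save_index_to_disk: whether to persist a newly built index to disk
				index_type: can be any of [GPTSimpleVectorIndex, FAISS]
		'''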
		# Load the index from the saved index.json file
		if os.path.exists(constants_utils.INDEX_FILENAME):
			print(f'Loading pre-generated index from: {constants_utils.INDEX_FILENAME}')
			self.index = self.langchain_utils_obj.load_index(index_type=index_type, filepath=constants_utils.INDEX_FILENAME)
		else:
			# Load data from Docs
			if os.path.exists(constants_utils.DATA_PATH):
				doc_documents = SimpleDirectoryReader(constants_utils.DATA_PATH).load_data()
				self.documents = doc_documents[:]
				
				# Load data from PDFs only
				# pdf_documents = data_loader_utils.load_document(doc_type='pdf', doc_filepath=doc_filepath)
			
			# Load data from URLs & append it to the documents that we read from PDFs
			# url_documents = data_loader_utils.load_document(doc_type='url', urls=urls)
			# self.documents.extend(url_documents)
			
			# Build the Vector store for docs
			if index_type == 'GPTSimpleVectorIndex':
				self.index = GPTSimpleVectorIndex.from_documents(self.documents)
			elif index_type == 'FAISS':
				# FAISS.from_documents (LangChain) expects LangChain Document objects,
				# so convert the llama_index documents loaded above before embedding them
				langchain_documents = [LangchainDocument(page_content=doc.text) for doc in self.documents]
				self.index = FAISS.from_documents(
					langchain_documents,
					OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
				)

			
			# NOTE: helper defined but never called in this method; it builds separate
			# indices for doc and URL documents and merges them into a single GPTListIndex
			def merge_documents_from_different_sources(doc_documents, url_documents):
				# Build the Vector store for docs
				doc_index = GPTSimpleVectorIndex.from_documents(doc_documents)
				# Build the Vector store for URLs
				url_index = GPTSimpleVectorIndex.from_documents(url_documents)
			
				# Set summary of each index
				doc_index.set_text("index_from_docs")
				url_index.set_text("index_from_urls")

				# Merge index of different data sources
				self.index = GPTListIndex([doc_index])
				self.index.insert(url_index)   # can also be passed directly as GPTListIndex([doc_index, url_index])
				
				return self.index
			
			
			if save_index_to_disk:
				# Save index to a index.json file
				print(f'Saving newly generated index: {constants_utils.INDEX_FILENAME}')
				
				if index_type == 'GPTSimpleVectorIndex':
					self.index.save_to_disk(constants_utils.INDEX_FILENAME)
				elif index_type == 'FAISS':
					self.index.save_local(constants_utils.INDEX_FILENAME)



	# Define query on index to retrieve the most relevant top K documents from the vector store
	def query(self,
		question,
		mode='default',
		response_mode="default",
		similarity_top_k=1,
		required_keywords=[],
		exclude_keywords=[],
		verbose=False
	):
		'''
			Query the index and return the response built from the most relevant top-K documents.

			Args:
				question: natural-language question to run against the index
				mode: can be any of [default, embedding]
				response_mode: can be any of [default, compact, tree_summarize]
				similarity_top_k: number of most similar documents to retrieve from the vector store
		'''
		
		# Querying the index
		self.response = self.index.query(
			question,
			mode=mode,
			response_mode=response_mode,
			similarity_top_k=similarity_top_k,
			required_keywords=required_keywords,
			exclude_keywords=exclude_keywords,
			verbose=verbose
		)

		return self.response
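

# Minimal usage sketch (illustrative addition, not part of the original module): assumes the
# utils.* helper modules are importable, constants_utils defines DATA_PATH/OUTPUT_PATH/INDEX_FILENAME,
# and OPENAI_API_KEY is set in the environment. The question string is only a placeholder.
if __name__ == '__main__':
	kkms_kssw_obj = KKMS_KSSW()

	# Build a fresh index from the local documents (or load the one saved at constants_utils.INDEX_FILENAME)
	kkms_kssw_obj.initialize_index(save_index_to_disk=True, index_type='GPTSimpleVectorIndex')

	# Retrieve an answer grounded in the most relevant document from the vector store
	response = kkms_kssw_obj.query('How can I improve soil health for my crop?', similarity_top_k=1)
	print(response)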