import os

import utils.constants as constants_utils
import utils.data_loader as data_loader_utils
import utils.langchain_utils as langchain_utils
import utils.weather as weather_utils
import utils.mandi_price as mandi_utils
import utils.translator as translator_utils

from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader, GPTListIndex
from langchain.indexes import VectorstoreIndexCreator
from langchain.docstore.document import Document as LangchainDocument
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

import warnings
warnings.filterwarnings('ignore')



class KKMS_KSSW:
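	'''
		Wrapper class that builds a vector store index over the documents under
		constants_utils.DATA_PATH and exposes a query() helper on top of it,
		alongside Mandi price, weather, and translator utility objects.
	'''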
	def __init__(self):
		self.index = None
		self.documents = []
		self.response = None

		# Instantiate langchain_utils class object
		self.langchain_utils_obj = langchain_utils.LANGCHAIN_UTILS()
		# Instantiate Mandi Price utils class object
		self.mandi_utils_obj = mandi_utils.MANDI_PRICE()
		# Instantiate Weather class object
		self.weather_utils_obj = weather_utils.WEATHER()
		# Instantiate translator_utils class object
		self.translator_utils_obj = translator_utils.TRANSLATOR()

		if not os.path.exists(constants_utils.DATA_PATH):
			os.makedirs(constants_utils.DATA_PATH)
		if not os.path.exists(constants_utils.OUTPUT_PATH):
			os.makedirs(constants_utils.OUTPUT_PATH)


	# Initialize index (vector store)
	def initialize_index(self, save_index_to_disk=True, index_type='GPTSimpleVectorIndex'):
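		'''
			Build the vector store index from the documents under constants_utils.DATA_PATH,
			or load a previously saved index from constants_utils.INDEX_FILENAME if one exists.

			Args:
				save_index_to_disk: whether to persist a newly built index to disk
				index_type: can be any of [GPTSimpleVectorIndex, FAISS]
		'''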
		# Load the index from the saved index.json file
		if os.path.exists(constants_utils.INDEX_FILENAME):
			print(f'Loading pre-generated index from: {constants_utils.INDEX_FILENAME}')
			self.index = self.langchain_utils_obj.load_index(index_type=index_type, filepath=constants_utils.INDEX_FILENAME)
		else:
			# Load data from Docs
			if os.path.exists(constants_utils.DATA_PATH):
				doc_documents = SimpleDirectoryReader(constants_utils.DATA_PATH).load_data()
				self.documents = doc_documents[:]
				
				# Load data from PDFs only
				# pdf_documents = data_loader_utils.load_document(doc_type='pdf', doc_filepath=doc_filepath)
			
			# Load data from URLs & append it to the documents that we read from PDFs
			# url_documents = data_loader_utils.load_document(doc_type='url', urls=urls)
			# self.documents.extend(url_documents)
			
			# Build the Vector store for docs
			if index_type == 'GPTSimpleVectorIndex':
				self.index = GPTSimpleVectorIndex.from_documents(self.documents)
			elif index_type == 'FAISS':
				# FAISS.from_documents (LangChain) expects LangChain Document objects,
				# so convert the llama_index documents loaded above before embedding them
				langchain_documents = [LangchainDocument(page_content=doc.text) for doc in self.documents]
				self.index = FAISS.from_documents(
					langchain_documents,
					OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
				)

			
			# NOTE: helper defined but never called in this method; it builds separate
			# indices for doc and URL documents and merges them into a single GPTListIndex
			def merge_documents_from_different_sources(doc_documents, url_documents):
				# Build the Vector store for docs
				doc_index = GPTSimpleVectorIndex.from_documents(doc_documents)
				# Build the Vector store for URLs
				url_index = GPTSimpleVectorIndex.from_documents(url_documents)
			
				# Set summary of each index
				doc_index.set_text("index_from_docs")
				url_index.set_text("index_from_urls")

				# Merge index of different data sources
				self.index = GPTListIndex([doc_index])
				self.index.insert(url_index)   # can also be passed directly as GPTListIndex([doc_index, url_index])
				
				return self.index
			
			
			if save_index_to_disk:
				# Save index to a index.json file
				print(f'Saving newly generated index: {constants_utils.INDEX_FILENAME}')
				
				if index_type == 'GPTSimpleVectorIndex':
					self.index.save_to_disk(constants_utils.INDEX_FILENAME)
				elif index_type == 'FAISS':
					self.index.save_local(constants_utils.INDEX_FILENAME)



	# Define query on index to retrieve the most relevant top K documents from the vector store
	def query(self,
		question,
		mode='default',
		response_mode="default",
		similarity_top_k=1,
		required_keywords=[],
		exclude_keywords=[],
		verbose=False
	):
		'''
			Query the index and return the response built from the most relevant top-K documents.

			Args:
				question: natural-language question to run against the index
				mode: can be any of [default, embedding]
				response_mode: can be any of [default, compact, tree_summarize]
				similarity_top_k: number of most similar documents to retrieve from the vector store
		'''
		
		# Querying the index
		self.response = self.index.query(
			question,
			mode=mode,
			response_mode=response_mode,
			similarity_top_k=similarity_top_k,
			required_keywords=required_keywords,
			exclude_keywords=exclude_keywords,
			verbose=verbose
		)

		return self.response
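

# Minimal usage sketch (illustrative addition, not part of the original module): assumes the
# utils.* helper modules are importable, constants_utils defines DATA_PATH/OUTPUT_PATH/INDEX_FILENAME,
# and OPENAI_API_KEY is set in the environment. The question string is only a placeholder.
if __name__ == '__main__':
	kkms_kssw_obj = KKMS_KSSW()

	# Build a fresh index from the local documents (or load the one saved at constants_utils.INDEX_FILENAME)
	kkms_kssw_obj.initialize_index(save_index_to_disk=True, index_type='GPTSimpleVectorIndex')

	# Retrieve an answer grounded in the most relevant document from the vector store
	response = kkms_kssw_obj.query('How can I improve soil health for my crop?', similarity_top_k=1)
	print(response)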