Chintan Donda committed on
Commit
b16454e
·
1 Parent(s): 3cd2948

- Adding new widgets for Uploading Custom data
- Updating web crawler, LangChain utils, and other scripts

app.py CHANGED
@@ -1,224 +1,440 @@
1
  import gradio as gr
2
  import os
3
  import datetime
4
- import kkms_kssw as kkms_kssw
5
  import utils.constants as constants_utils
 
 
 
 
6
 
7
 
8
- # Create and launch Gradio Web UI
9
  class DomState:
10
- def __init__(self):
11
- self.relevant_paragraphs = ''
12
- self.answer = ''
13
- self.summary = ''
14
- self.mandi_price = ''
15
- self.mandi_from_date = (datetime.datetime.now() - datetime.timedelta(days=2)).strftime('%Y-%m-%d')
16
- self.mandi_to_date = datetime.datetime.now().strftime('%Y-%m-%d')
17
- self.weather_info = ''
18
- self.weather_forecast = ''
19
- self.weather_forecast_summary = ''
20
- self.indic_lang_answer = ''
21
-
22
-
23
- # Initialize index (vector store)
24
- self.kkms_kssw_obj = kkms_kssw.KKMS_KSSW()
25
- self.kkms_kssw_obj.initialize_index(constants_utils.INDEX_FILENAME, index_type='GPTSimpleVectorIndex')
26
-
27
-
28
- def click_handler_for_get_relevant_paragraphs(self,
29
- question,
30
- mode='default',
31
- response_mode='default',
32
- similarity_top_k=2,
33
- ):
34
- self.relevant_paragraphs = self.kkms_kssw_obj.query(question,
35
- mode=mode,
36
- response_mode=response_mode,
37
- similarity_top_k=similarity_top_k,
38
- # required_keywords=required_keywords_list,
39
- # exclude_keywords=exclude_keywords_list,
40
- )
41
- return self.relevant_paragraphs
42
-
43
-
44
- def click_handler_for_summary(self, answer):
45
- self.sumamry = self.kkms_kssw_obj.langchain_utils_obj.get_textual_summary(answer)
46
- return self.sumamry
47
-
48
-
49
- def click_handler_for_get_answer(self,
50
- relevant_paragraphs,
51
- question
52
- ):
53
- self.answer = self.kkms_kssw_obj.langchain_utils_obj.get_answer_from_para(relevant_paragraphs, question)
54
- return self.answer
55
-
56
-
57
- def click_handler_for_mandi_price(self,
58
- state_name,
59
- apmc_name,
60
- commodity_name,
61
- from_date,
62
- to_date
63
- ):
64
- if state_name and apmc_name and commodity_name and from_date and to_date:
65
- self.mandi_price = self.kkms_kssw_obj.mandi_utils_obj.get_mandi_price(state_name, apmc_name, commodity_name, from_date, to_date)
66
- return self.mandi_price
67
-
68
-
69
- def click_handler_for_get_weather(self, city):
70
- time, info, temperature = self.kkms_kssw_obj.weather_utils_obj.get_weather(city)
71
- self.weather_info = f'Weather in {city.capitalize()} on {time} is {temperature} with {info}.'
72
- return self.weather_info
73
-
74
-
75
- def click_handler_for_get_weather_forecast(self, state, district):
76
- self.weather_forecast = self.kkms_kssw_obj.weather_utils_obj.get_weather_forecast(state, district)
77
- return self.weather_forecast
78
-
79
-
80
- def click_handler_for_weather_forecast_summary(self, weather_forecast):
81
- self.weather_forecast_summary = self.kkms_kssw_obj.langchain_utils_obj.get_weather_forecast_summary(weather_forecast)
82
- return self.weather_forecast_summary
83
-
84
-
85
- def click_handler_for_get_indic_answer(self, eng_ans, language='Hindi'):
86
- self.indic_lang_answer = self.kkms_kssw_obj.translator_utils_obj.get_indic_google_translate(eng_ans, language)
87
- return self.indic_lang_answer
88
-
89
-
90
- def select_widget(self, choice):
91
- if choice == "General":
92
- return [
93
- gr.update(visible=True),
94
- gr.update(visible=False),
95
- gr.update(visible=False)
96
- ]
97
-
98
- elif choice == "Mandi Price":
99
- return [
100
- gr.update(visible=False),
101
- gr.update(visible=True),
102
- gr.update(visible=False)
103
- ]
104
-
105
- elif choice == "Weather":
106
- return [
107
- gr.update(visible=False),
108
- gr.update(visible=False),
109
- gr.update(visible=True)
110
- ]
111
-
112
- else:
113
- return gr.update(visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
 
116
  with gr.Blocks(title='KKMS-KSSW Demo') as demo:
117
- dom = DomState()
118
-
119
- radio = gr.Radio(
120
- ["General", "Mandi Price", "Weather"], label="Query related to"
121
- )
122
-
123
- ########################### Widget for Govt. Policy #################################################
124
- with gr.Row(visible=True) as rowGeneral:
125
- with gr.Column(scale=1, min_width=600):
126
- with gr.Tab(label='Relevant paragraphs'):
127
- question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
128
- # Get the Relevant paragraphs for the question asked
129
- relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are:", value=dom.relevant_paragraphs, interactive=False)
130
- b_relevant_paragraphs = gr.Button("Get Relevant paragraphs").style(size='sm')
131
- b_relevant_paragraphs.click(fn=dom.click_handler_for_get_relevant_paragraphs, inputs=question, outputs=[relevant_paragraphs])
132
-
133
- with gr.Column(scale=1, min_width=600):
134
- with gr.Tab(label='Extractive Summary'):
135
- # Get the extractive text summary from the retrieved Relevant paragraphs
136
- summary = gr.Textbox(label="Extractive Summary is:", value=dom.summary, interactive=False)
137
- b_summary = gr.Button("Extract Summary").style(size='sm')
138
- b_summary.click(fn=dom.click_handler_for_summary, inputs=relevant_paragraphs, outputs=[summary])
139
-
140
- # Get the exact answer for the question asked from the retrieved Relevant paragraphs
141
- with gr.Row():
142
- with gr.Column(scale=1, min_width=600):
143
- with gr.Tab(label='Answer'):
144
- answer = gr.Textbox(label="Answer is:", value=dom.answer, interactive=False)
145
- b_answer = gr.Button("Get Answer").style(size='sm')
146
- b_answer.click(fn=dom.click_handler_for_get_answer, inputs=[relevant_paragraphs, question], outputs=[answer])
147
-
148
- # Covert the answer to Indian language
149
- with gr.Row():
150
- with gr.Column(scale=1, min_width=600):
151
- with gr.Tab(label='Answer in selected language'):
152
- # Select the language
153
- language = gr.Dropdown(
154
- ['English', 'Hindi', 'Gujarati', 'Marathi', 'Kannada', 'Bengali', 'Panjabi', 'Telugu', 'Tamil', 'Malayalam'],
155
- label="Select language")
156
- indic_lang_answer = gr.Textbox(label="Answer in the selected language is:", value=dom.indic_lang_answer, interactive=False)
157
- b_indic_lang_answer = gr.Button("Get answer in selected language").style(size='sm')
158
- b_indic_lang_answer.click(fn=dom.click_handler_for_get_indic_answer, inputs=[answer, language], outputs=[indic_lang_answer])
159
-
160
-
161
- ########################## Widget for Mandi Price ###################################################
162
- with gr.Row(visible=False) as rowMandiPrice:
163
- with gr.Column(scale=1, min_width=600):
164
- # Select State
165
- state_name = gr.Dropdown(['ANDAMAN AND NICOBAR ISLANDS', 'ANDHRA PRADESH', 'ASSAM', 'BIHAR', 'CHANDIGARH', 'CHHATTISGARH', 'GOA', 'GUJARAT', 'HARYANA', 'HIMACHAL PRADESH', 'JAMMU AND KASHMIR', 'JHARKHAND', 'KARNATAKA', 'KERALA', 'MADHYA PRADESH', 'MAHARASHTRA', 'NAGALAND', 'ODISHA', 'PUDUCHERRY', 'PUNJAB', 'RAJASTHAN', 'TAMIL NADU', 'TELANGANA', 'TRIPURA', 'UTTAR PRADESH', 'UTTARAKHAND', 'WEST BENGAL'], label="Select state")
166
-
167
- # APMC name
168
- apmc_name = gr.Textbox(label="Enter APMC name", placeholder='Type the APMC name here')
169
-
170
- # APMC name
171
- commodity_name = gr.Textbox(label="Enter Commodity name", placeholder='Type the Commodity name here')
172
-
173
- # From/To date in yyyy-mm-dd format
174
- from_date = gr.Textbox(label="From date?", value=dom.mandi_from_date, placeholder='Please enter the From date here in yyyy-mm-dd format')
175
- to_date = gr.Textbox(label="To date?", value=dom.mandi_to_date, placeholder='Please enter the To date here in yyyy-mm-dd format')
176
-
177
- with gr.Column(scale=1, min_width=600):
178
- mandi_price = gr.Textbox(label=f"Mandi Price is:", value=dom.mandi_price, interactive=False)
179
- b_summary = gr.Button("Get Mandi Price").style(size='sm')
180
- b_summary.click(fn=dom.click_handler_for_mandi_price, inputs=[state_name, apmc_name, commodity_name, from_date, to_date], outputs=[mandi_price])
181
-
182
-
183
- ########################## Widget for Weather Info ###################################################
184
- with gr.Row(visible=False) as rowWeather:
185
- with gr.Column(scale=1, min_width=600):
186
- with gr.Tab(label='Weather Info'):
187
- city = gr.Textbox(label="Enter city name", placeholder='Type the city name here')
188
- weather = gr.Textbox(label=f"Current weather is:", value=dom.weather_info, interactive=False)
189
- b_weather = gr.Button("Get weather info").style(size='sm')
190
- b_weather.click(fn=dom.click_handler_for_get_weather, inputs=city, outputs=[weather])
191
-
192
- ########### Weather Forecast ###########
193
- with gr.Column(scale=1, min_width=600):
194
- with gr.Tab(label='Weather Forecast for next 5 days'):
195
- # Select the State
196
- state = gr.Dropdown(
197
- ['Andaman-Nicobar', 'Andhra-Pradesh', 'Arunachal-Pradesh', 'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh', 'Dadra-and-Nagar-Haveli', 'Daman-and-Diu', 'Delhi', 'Goa', 'Gujarat', 'Haryana', 'Himachal-Pradesh', 'Jammu-Kashmir', 'Jharkhand', 'Karnataka', 'Kerala', 'Lakshadweep', 'Madhya-Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Pondicherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamilnadu', 'Telangana', 'Tripura', 'Uttar-Pradesh', 'Uttarakhand', 'West-Bengal'],
198
- label="Select state"
199
- )
200
-
201
- # # Select district
202
- # district = gr.Dropdown(
203
- # weather_utils.STATES.get(state, {}),
204
- # label="Select district"
205
- # )
206
-
207
- district = gr.Textbox(label="Enter district name", placeholder='Type the district name here')
208
- district_weather = gr.Textbox(label=f"Weather forecast is:", value=dom.weather_forecast, interactive=False)
209
- bd_weather = gr.Button("Get weather forecast").style(size='sm')
210
- bd_weather.click(fn=dom.click_handler_for_get_weather_forecast, inputs=[state, district], outputs=[district_weather])
211
-
212
-
213
- with gr.Column(scale=1, min_width=600):
214
- with gr.Tab(label='Weather Forecast Summary'):
215
- # Get the summary of the weather forecast
216
- weather_forecast_summary = gr.Textbox(label="Weather Forecast Summary is:", value=dom.weather_forecast_summary, interactive=False)
217
- b_weather_forecast_summary = gr.Button("Get Weather Forecast Summary").style(size='sm')
218
- b_weather_forecast_summary.click(fn=dom.click_handler_for_weather_forecast_summary, inputs=district_weather, outputs=[weather_forecast_summary])
219
-
220
-
221
- radio.change(fn=dom.select_widget, inputs=radio, outputs=[rowGeneral, rowMandiPrice, rowWeather])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
 
224
  demo.launch(share=False)
 
1
  import gradio as gr
2
  import os
3
  import datetime
4
+
5
  import utils.constants as constants_utils
6
+ import kkms_kssw as kkms_kssw
7
+
8
+ import warnings
9
+ warnings.filterwarnings('ignore')
10
 
11
 
 
12
  class DomState:
13
+ def __init__(
14
+ self,
15
+ index_type,
16
+ load_from_existing_index_file
17
+ ):
18
+ self.index_type = index_type
19
+ self.load_from_existing_index_file = load_from_existing_index_file
20
+
21
+ self.relevant_paragraphs = ''
22
+ self.sources_relevant_paragraphs = ''
23
+ self.answer = ''
24
+ self.summary = ''
25
+ self.mandi_price = ''
26
+ self.mandi_from_date = (datetime.datetime.now() - datetime.timedelta(days=5)).strftime('%Y-%m-%d')
27
+ self.mandi_to_date = datetime.datetime.now().strftime('%Y-%m-%d')
28
+ self.weather_info = ''
29
+ self.weather_forecast = ''
30
+ self.weather_forecast_summary = ''
31
+ self.indic_translation = ''
32
+
33
+ # Initialize index (vector store) - This will create a new index from scratch if load_from_existing_index_file == False
34
+ self.kkms_kssw_obj = kkms_kssw.KKMS_KSSW()
35
+ self.kkms_kssw_obj.load_create_index()
36
+
37
+
38
+ def click_handler_for_get_relevant_paragraphs(
39
+ self,
40
+ question,
41
+ question_category='general'
42
+ ):
43
+ self.relevant_paragraphs = self.kkms_kssw_obj.query(
44
+ question=question,
45
+ question_category=question_category
46
+ )
47
+ if self.index_type in ['FAISS', 'Chroma']:
48
+ self.sources_relevant_paragraphs = [doc.metadata for doc in self.relevant_paragraphs]
49
+ self.relevant_paragraphs = [doc.page_content.replace('\n', '').replace('\t', ' ') for doc in self.relevant_paragraphs]
50
+ return self.relevant_paragraphs
51
+
52
+
53
+ def click_handler_for_relevant_paragraphs_source(
54
+ self,
55
+ relevant_paragraphs
56
+ ):
57
+ return self.sources_relevant_paragraphs
58
+
59
+
60
+ def click_handler_for_summary(
61
+ self,
62
+ answer
63
+ ):
64
+ self.sumamry = self.kkms_kssw_obj.langchain_utils_obj.get_textual_summary(answer)
65
+ return self.sumamry
66
+
67
+
68
+ def click_handler_for_get_answer(
69
+ self,
70
+ relevant_paragraphs, question
71
+ ):
72
+ self.answer = self.kkms_kssw_obj.langchain_utils_obj.get_answer_from_para(relevant_paragraphs, question)
73
+ return self.answer
74
+
75
+
76
+ def click_handler_for_mandi_price(self,
77
+ state_name,
78
+ apmc_name,
79
+ commodity_name,
80
+ from_date,
81
+ to_date
82
+ ):
83
+ if state_name and apmc_name and commodity_name and from_date and to_date:
84
+ self.mandi_price = self.kkms_kssw_obj.mandi_utils_obj.get_mandi_price(state_name, apmc_name, commodity_name, from_date, to_date)
85
+ return self.mandi_price
86
+
87
+
88
+ def click_handler_for_get_weather(
89
+ self,
90
+ city
91
+ ):
92
+ time, info, temperature = self.kkms_kssw_obj.weather_utils_obj.get_weather(city)
93
+ self.weather_info = f'Weather in {city.capitalize()} on {time} is {temperature} with {info}.'
94
+ return self.weather_info
95
+
96
+
97
+ def click_handler_for_get_weather_forecast(
98
+ self,
99
+ state,
100
+ district
101
+ ):
102
+ self.weather_forecast = self.kkms_kssw_obj.weather_utils_obj.get_weather_forecast(state, district)
103
+ return self.weather_forecast
104
+
105
+
106
+ def click_handler_for_weather_forecast_summary(
107
+ self,
108
+ weather_forecast
109
+ ):
110
+ self.weather_forecast_summary = self.kkms_kssw_obj.langchain_utils_obj.get_weather_forecast_summary(weather_forecast)
111
+ return self.weather_forecast_summary
112
+
113
+
114
+ def click_handler_for_load_files_urls(
115
+ self,
116
+ doc_type,
117
+ files_or_urls,
118
+ index_category='general'
119
+ ):
120
+ self.kkms_kssw_obj.upload_data(
121
+ doc_type=constants_utils.DATA_SOURCES[doc_type],
122
+ files_or_urls=files_or_urls,
123
+ index_category=index_category
124
+ )
125
+
126
+
127
+ def click_handler_for_get_indic_translation(
128
+ self,
129
+ eng_ans,
130
+ language='Hindi'
131
+ ):
132
+ self.indic_translation = self.kkms_kssw_obj.translator_utils_obj.get_indic_google_translate(eng_ans, language)
133
+ return self.indic_translation
134
+
135
+
136
+ def _upload_file(self, files):
137
+ file_paths = [file.name for file in files]
138
+ return file_paths
139
+
140
+
141
+ def select_widget(
142
+ self,
143
+ choice
144
+ ):
145
+ if choice == "General":
146
+ return [
147
+ gr.update(visible=True),
148
+ gr.update(visible=False),
149
+ gr.update(visible=False),
150
+ gr.update(visible=False),
151
+ ]
152
+
153
+ elif choice == "Mandi Price":
154
+ return [
155
+ gr.update(visible=False),
156
+ gr.update(visible=True),
157
+ gr.update(visible=False),
158
+ gr.update(visible=False),
159
+ ]
160
+
161
+ elif choice == "Weather":
162
+ return [
163
+ gr.update(visible=False),
164
+ gr.update(visible=False),
165
+ gr.update(visible=True),
166
+ gr.update(visible=False),
167
+ ]
168
+
169
+ elif choice == "Load Custom Data":
170
+ return [
171
+ gr.update(visible=False),
172
+ gr.update(visible=False),
173
+ gr.update(visible=False),
174
+ gr.update(visible=True)
175
+ ]
176
+
177
+ else:
178
+ return gr.update(visible=False)
179
+
180
+
181
+ def select_files_urls(
182
+ self,
183
+ choice
184
+ ):
185
+ if choice == "PDF":
186
+ return [
187
+ gr.update(visible=True),
188
+ gr.update(visible=False),
189
+ gr.update(visible=False),
190
+ gr.update(visible=False),
191
+ ]
192
+
193
+ elif choice == "Online PDF":
194
+ return [
195
+ gr.update(visible=False),
196
+ gr.update(visible=True),
197
+ gr.update(visible=False),
198
+ gr.update(visible=False),
199
+ ]
200
+
201
+ elif choice == "Text File":
202
+ return [
203
+ gr.update(visible=False),
204
+ gr.update(visible=False),
205
+ gr.update(visible=True),
206
+ gr.update(visible=False),
207
+ ]
208
+
209
+ elif choice == "URLs":
210
+ return [
211
+ gr.update(visible=False),
212
+ gr.update(visible=False),
213
+ gr.update(visible=False),
214
+ gr.update(visible=True),
215
+ ]
216
+
217
+ else:
218
+ return [
219
+ gr.update(visible=True),
220
+ gr.update(visible=False),
221
+ gr.update(visible=False),
222
+ gr.update(visible=False),
223
+ ]
224
+
225
 
226
 
227
  with gr.Blocks(title='KKMS-KSSW Demo') as demo:
228
+ dom = DomState(
229
+ index_type=constants_utils.INDEX_TYPE,
230
+ load_from_existing_index_file=constants_utils.LOAD_FROM_EXISTING_INDEX_STORE
231
+ )
232
+
233
+ widgets = gr.Radio(
234
+ [
235
+ "General",
236
+ "Mandi Price",
237
+ "Weather",
238
+ "Load Custom Data"
239
+ ],
240
+ label="Query related to",
241
+ value="General"
242
+ )
243
+
244
+ #############################################################################
245
+ # Widget for Govt. Policy
246
+ with gr.Row(visible=True) as rowGeneral:
247
+ with gr.Column(scale=1, min_width=600):
248
+ with gr.Tab(label='Relevant paragraphs'):
249
+ question = gr.Textbox(label="Enter your question", placeholder='Type the question here')
250
+ # Get the Relevant paragraphs for the question asked
251
+ relevant_paragraphs = gr.Textbox(label="Relevant paragraphs are:", value=dom.relevant_paragraphs, interactive=False)
252
+ b_relevant_paragraphs = gr.Button("Get Relevant paragraphs").style(size='sm')
253
+ b_relevant_paragraphs.click(
254
+ fn=dom.click_handler_for_get_relevant_paragraphs,
255
+ inputs=question,
256
+ outputs=[relevant_paragraphs]
257
+ )
258
+
259
+ with gr.Column(scale=1):
260
+ with gr.Tab(label='Sources of relevant paragraphs'):
261
+ # Get the Sources of relevant paragraphs
262
+ sources_relevant_paragraphs = gr.Textbox(label="Sources of relevant paragraphs are:", interactive=False)
263
+ b_sources_relevant_paragraphs = gr.Button("Get Sources of relevant paragraphs").style(size='sm')
264
+ b_sources_relevant_paragraphs.click(fn=dom.click_handler_for_relevant_paragraphs_source, inputs=relevant_paragraphs, outputs=[sources_relevant_paragraphs])
265
+
266
+ # NOTE: Don't show extractive summary unless requested by FTA.
267
+ # with gr.Column(scale=1, min_width=600):
268
+ # with gr.Tab(label='Extractive Summary'):
269
+ # # Get the extractive text summary from the retrieved Relevant paragraphs
270
+ # summary = gr.Textbox(label="Extractive Summary is:", value=dom.summary, interactive=False)
271
+ # b_summary = gr.Button("Extract Summary").style(size='sm')
272
+ # b_summary.click(fn=dom.click_handler_for_summary, inputs=relevant_paragraphs, outputs=[summary])
273
+
274
+ # Get the exact answer for the question asked from the retrieved Relevant paragraphs
275
+ with gr.Column(scale=1, min_width=600):
276
+ with gr.Tab(label='Answer'):
277
+ answer = gr.Textbox(label="Answer is:", value=dom.answer, interactive=False)
278
+ b_answer = gr.Button("Get Answer").style(size='sm')
279
+ b_answer.click(fn=dom.click_handler_for_get_answer, inputs=[relevant_paragraphs, question], outputs=[answer])
280
+
281
+ # Convert the answer to the selected Indian language
282
+ with gr.Column(scale=1, min_width=600):
283
+ with gr.Tab(label='Answer in selected language'):
284
+ # Select the language
285
+ language = gr.Dropdown(
286
+ ['English', 'Hindi', 'Gujarati', 'Marathi', 'Kannada', 'Bengali', 'Panjabi', 'Telugu', 'Tamil', 'Malayalam'],
287
+ label="Select language")
288
+ indic_lang_answer = gr.Textbox(label="Answer in the selected language is:", value=dom.indic_translation, interactive=False)
289
+ b_indic_lang_answer = gr.Button("Get answer in selected language").style(size='sm')
290
+ b_indic_lang_answer.click(fn=dom.click_handler_for_get_indic_translation, inputs=[answer, language], outputs=[indic_lang_answer])
291
+
292
+
293
+ #############################################################################
294
+ # Widget for Mandi Price
295
+ with gr.Row(visible=False) as rowMandiPrice:
296
+ with gr.Column(scale=1, min_width=600):
297
+ # Select State
298
+ state_name = gr.Dropdown(constants_utils.MANDI_PRICE_STATES, label="Select state")
299
+ # APMC name
300
+ apmc_name = gr.Textbox(label="Enter APMC name", placeholder='Type the APMC name here')
301
+ # Commodity name
302
+ commodity_name = gr.Textbox(label="Enter Commodity name", placeholder='Type the Commodity name here')
303
+
304
+ # From/To date in yyyy-mm-dd format
305
+ from_date = gr.Textbox(label="From date?", value=dom.mandi_from_date, placeholder='Please enter the From date here in yyyy-mm-dd format')
306
+ to_date = gr.Textbox(label="To date?", value=dom.mandi_to_date, placeholder='Please enter the To date here in yyyy-mm-dd format')
307
+
308
+ with gr.Column(scale=1, min_width=600):
309
+ mandi_price = gr.Textbox(label=f"Mandi Price is:", value=dom.mandi_price, interactive=False)
310
+ b_summary = gr.Button("Get Mandi Price").style(size='sm')
311
+ b_summary.click(fn=dom.click_handler_for_mandi_price, inputs=[state_name, apmc_name, commodity_name, from_date, to_date], outputs=[mandi_price])
312
+
313
+
314
+ #############################################################################
315
+ # Widget for Weather Info
316
+ with gr.Row(visible=False) as rowWeather:
317
+ ########### Weather Forecast ###########
318
+ with gr.Column(scale=1, min_width=600):
319
+ with gr.Tab(label='Weather Forecast for next 5 days'):
320
+ # Select the State
321
+ state = gr.Dropdown(
322
+ constants_utils.WEATHER_FORECAST_STATES,
323
+ label="Select state"
324
+ )
325
+
326
+ # # Select district
327
+ # district = gr.Dropdown(
328
+ # weather_utils.STATES.get(state, {}),
329
+ # label="Select district"
330
+ # )
331
+
332
+ district = gr.Textbox(label="Enter district name", placeholder='Type the district name here')
333
+ district_weather = gr.Textbox(label=f"Weather forecast is:", value=dom.weather_forecast, interactive=False)
334
+ bd_weather = gr.Button("Get weather forecast").style(size='sm')
335
+ bd_weather.click(fn=dom.click_handler_for_get_weather_forecast, inputs=[state, district], outputs=[district_weather])
336
+
337
+ with gr.Column(scale=1, min_width=600):
338
+ with gr.Tab(label='Weather Forecast Summary'):
339
+ # Get the summary of the weather forecast
340
+ weather_forecast_summary = gr.Textbox(label="Weather Forecast Summary is:", value=dom.weather_forecast_summary, interactive=False)
341
+ b_weather_forecast_summary = gr.Button("Get Weather Forecast Summary").style(size='sm')
342
+ b_weather_forecast_summary.click(fn=dom.click_handler_for_weather_forecast_summary, inputs=district_weather, outputs=[weather_forecast_summary])
343
+
344
+ # Convert the weather forecast summary to the selected Indian language
345
+ with gr.Column(scale=1, min_width=600):
346
+ with gr.Tab(label='Weather Forecast Summary in selected language'):
347
+ # Select the language
348
+ language = gr.Dropdown(
349
+ ['English', 'Hindi', 'Gujarati', 'Marathi', 'Kannada', 'Bengali', 'Panjabi', 'Telugu', 'Tamil', 'Malayalam'],
350
+ label="Select language")
351
+ indic_weather_forecast_summary = gr.Textbox(label="Weather Forecast Summary in the selected language is:", value=dom.indic_translation, interactive=False)
352
+ b_indic_weather_forecast_summary = gr.Button("Get answer in selected language").style(size='sm')
353
+ b_indic_weather_forecast_summary.click(fn=dom.click_handler_for_get_indic_translation, inputs=[weather_forecast_summary, language], outputs=[indic_weather_forecast_summary])
354
+
355
+ with gr.Column(scale=1, min_width=600):
356
+ # with gr.Tab(label='Weather Info'):
357
+ city = gr.Textbox(label="Enter city name", placeholder='Type the city name here')
358
+ weather = gr.Textbox(label=f"Current weather is:", value=dom.weather_info, interactive=False)
359
+ b_weather = gr.Button("Get weather info").style(size='sm')
360
+ b_weather.click(fn=dom.click_handler_for_get_weather, inputs=city, outputs=[weather])
361
+
362
+
363
+ #############################################################################
364
+ # Widget to load and process from the custom data source
365
+ with gr.Row(visible=False) as rowLoadCustomData:
366
+ with gr.Column(scale=1, min_width=600):
367
+ with gr.Tab(label='Load Custom Data'):
368
+ doc_type = gr.Radio(
369
+ list(constants_utils.DATA_SOURCES.keys()),
370
+ label="Select data source (Supports uploading multiple Files/URLs)",
371
+ value="PDF"
372
+ )
373
+
374
+ with gr.Row(visible=True) as rowUploadPdf:
375
+ with gr.Column(scale=1, min_width=600):
376
+ file_output = gr.File()
377
+ upload_button = gr.UploadButton(
378
+ "Click to Upload PDF Files",
379
+ file_types=['.pdf'],
380
+ file_count="multiple"
381
+ )
382
+ upload_button.upload(dom._upload_file, upload_button, file_output)
383
+ b_files = gr.Button("Load PDF Files").style(size='sm')
384
+ b_files.click(
385
+ fn=dom.click_handler_for_load_files_urls,
386
+ inputs=[doc_type, upload_button]
387
+ )
388
+
389
+ with gr.Row(visible=False) as rowUploadOnlinePdf:
390
+ with gr.Column(scale=1, min_width=600):
391
+ urls = gr.Textbox(label="Enter URLs for Online PDF (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
392
+ b_urls = gr.Button("Load Online PDFs").style(size='sm')
393
+ b_urls.click(fn=dom.click_handler_for_load_files_urls, inputs=[doc_type, urls])
394
+
395
+ with gr.Row(visible=False) as rowUploadTextFile:
396
+ with gr.Column(scale=1, min_width=600):
397
+ file_output = gr.File()
398
+ upload_button = gr.UploadButton(
399
+ "Click to Upload Text Files",
400
+ file_types=['.txt'],
401
+ file_count="multiple"
402
+ )
403
+ upload_button.upload(dom._upload_file, upload_button, file_output)
404
+ b_files = gr.Button("Load Text Files").style(size='sm')
405
+ b_files.click(
406
+ fn=dom.click_handler_for_load_files_urls,
407
+ inputs=[doc_type, file_output]
408
+ )
409
+
410
+ with gr.Row(visible=False) as rowUploadUrls:
411
+ with gr.Column(scale=1, min_width=600):
412
+ urls = gr.Textbox(label="Enter URLs (Supports uploading from multiple URLs. Enter the URLs in comma (,) separated format)", placeholder='Type the URLs here')
413
+ b_urls = gr.Button("Load URLs").style(size='sm')
414
+ b_urls.click(fn=dom.click_handler_for_load_files_urls, inputs=[doc_type, urls])
415
+
416
+ doc_type.change(
417
+ fn=dom.select_files_urls,
418
+ inputs=doc_type,
419
+ outputs=[
420
+ rowUploadPdf,
421
+ rowUploadOnlinePdf,
422
+ rowUploadTextFile,
423
+ rowUploadUrls,
424
+ ],
425
+ )
426
+
427
+
428
+ widgets.change(
429
+ fn=dom.select_widget,
430
+ inputs=widgets,
431
+ outputs=[
432
+ rowGeneral,
433
+ rowMandiPrice,
434
+ rowWeather,
435
+ rowLoadCustomData,
436
+ ],
437
+ )
438
 
439
 
440
  demo.launch(share=False)
kkms_kssw.py CHANGED
@@ -1,16 +1,17 @@
1
  import os
2
 
3
  import utils.constants as constants_utils
4
- import utils.data_loader as data_loader_utils
5
  import utils.langchain_utils as langchain_utils
6
  import utils.weather as weather_utils
7
  import utils.mandi_price as mandi_utils
8
  import utils.translator as translator_utils
 
9
 
10
- from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader, GPTListIndex
11
- from langchain.indexes import VectorstoreIndexCreator
12
- from langchain.embeddings.openai import OpenAIEmbeddings
13
- from langchain.vectorstores import FAISS
 
14
 
15
  import warnings
16
  warnings.filterwarnings('ignore')
@@ -18,106 +19,59 @@ warnings.filterwarnings('ignore')
18
 
19
 
20
  class KKMS_KSSW:
21
- def __init__(self):
22
- self.index = None
23
- self.documents = []
24
- self.response = None
25
-
26
- # Instantiate langchain_utils class object
27
- self.langchain_utils_obj = langchain_utils.LANGCHAIN_UTILS()
28
- # Instantiate Mandi Price utils class object
29
- self.mandi_utils_obj = mandi_utils.MANDI_PRICE()
30
- # Instantiate Weather class object
31
- self.weather_utils_obj = weather_utils.WEATHER()
32
- # Instantiate translator_utils class object
33
- self.translator_utils_obj = translator_utils.TRANSLATOR()
34
-
35
- if not os.path.exists(constants_utils.DATA_PATH):
36
- os.makedirs(constants_utils.DATA_PATH)
37
- if not os.path.exists(constants_utils.OUTPUT_PATH):
38
- os.makedirs(constants_utils.OUTPUT_PATH)
39
-
40
-
41
- # Initialize index (vector store)
42
- def initialize_index(self, save_index_to_disk=True, index_type='GPTSimpleVectorIndex'):
43
- # Load the index from the saved index.json file
44
- if os.path.exists(constants_utils.INDEX_FILENAME):
45
- print(f'Loading pre-generated index from: {constants_utils.INDEX_FILENAME}')
46
- self.index = self.langchain_utils_obj.load_index(index_type='GPTSimpleVectorIndex', filepath=constants_utils.INDEX_FILENAME)
47
- else:
48
- # Load data from Docs
49
- if os.path.exists(constants_utils.DATA_PATH):
50
- doc_documents = SimpleDirectoryReader(constants_utils.DATA_PATH).load_data()
51
- self.documents = doc_documents[:]
52
-
53
- # Load data from PDFs only
54
- # pdf_documents = data_loader_utils.load_document(doc_type='pdf', doc_filepath=doc_filepath)
55
-
56
- # Load data from URLs & append it to the documents that we read from PDFs
57
- # url_documents = data_loader_utils.load_document(doc_type='url', urls=urls)
58
- # self.documents.extend(url_documents)
59
-
60
- # Build the Vector store for docs
61
- if index_type == 'GPTSimpleVectorIndex':
62
- self.index = GPTSimpleVectorIndex.from_documents(self.documents)
63
- elif index_type == 'FAISS':
64
- self.index = FAISS.from_documents(
65
- self.documents,
66
- OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
67
- )
68
-
69
-
70
- def merge_documents_from_different_sources(doc_documents, url_documents):
71
- # Build the Vector store for docs
72
- doc_index = GPTSimpleVectorIndex.from_documents(doc_documents)
73
- # Build the Vector store for URLs
74
- url_index = GPTSimpleVectorIndex.from_documents(url_documents)
75
-
76
- # Set summary of each index
77
- doc_index.set_text("index_from_docs")
78
- url_index.set_text("index_from_urls")
79
-
80
- # Merge index of different data sources
81
- self.index = GPTListIndex([doc_index])
82
- self.index.insert(url_index) # can also be passed directly as GPTListIndex([doc_index, url_index])
83
-
84
- return self.index
85
-
86
-
87
- if save_index_to_disk:
88
- # Save index to a index.json file
89
- print(f'Saving newly generated index: {constants_utils.INDEX_FILENAME}')
90
-
91
- if index_type == 'GPTSimpleVectorIndex':
92
- self.index.save_to_disk(constants_utils.INDEX_FILENAME)
93
- elif index_type == 'FAISS':
94
- self.index.save_local(constants_utils.INDEX_FILENAME)
95
-
96
-
97
-
98
- # Define query on index to retrieve the most relevant top K documents from the vector store
99
- def query(self,
100
- question,
101
- mode='default',
102
- response_mode="default",
103
- similarity_top_k=1,
104
- required_keywords=[],
105
- exclude_keywords=[],
106
- verbose=False
107
- ):
108
- '''
109
- Args:
110
- mode: can be any of [default, embedding]
111
- response_mode: can be any of [default, compact, tree_summarize]
112
- '''
113
-
114
- # Querying the index
115
- self.response = self.index.query(question,
116
- mode=mode,
117
- response_mode=response_mode,
118
- similarity_top_k=similarity_top_k,
119
- required_keywords=required_keywords,
120
- exclude_keywords=exclude_keywords,
121
- verbose=verbose)
122
-
123
- return self.response
 
1
  import os
2
 
3
  import utils.constants as constants_utils
 
4
  import utils.langchain_utils as langchain_utils
5
  import utils.weather as weather_utils
6
  import utils.mandi_price as mandi_utils
7
  import utils.translator as translator_utils
8
+ import utils.web_crawler as web_crawler_utils
9
 
10
+ import logging
11
+ logger = logging.getLogger(__name__)
12
+ logging.basicConfig(
13
+ format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
14
+ )
15
 
16
  import warnings
17
  warnings.filterwarnings('ignore')
 
19
 
20
 
21
  class KKMS_KSSW:
22
+ def __init__(self):
23
+ self.index_type = constants_utils.INDEX_TYPE
24
+ self.load_from_existing_index_store = constants_utils.LOAD_FROM_EXISTING_INDEX_STORE
25
+
26
+ # Instantiate langchain_utils class object
27
+ self.langchain_utils_obj = langchain_utils.LANGCHAIN_UTILS(
28
+ index_type=self.index_type,
29
+ load_from_existing_index_store=self.load_from_existing_index_store
30
+ )
31
+ # Instantiate Mandi Price utils class object
32
+ self.mandi_utils_obj = mandi_utils.MANDI_PRICE()
33
+ # Instantiate Weather class object
34
+ self.weather_utils_obj = weather_utils.WEATHER()
35
+ # Instantiate translator_utils class object
36
+ self.translator_utils_obj = translator_utils.TRANSLATOR()
37
+
38
+
39
+
40
+ # Initialize index (vector store)
41
+ def load_create_index(self):
42
+ logger.info(f"Load/Create index")
43
+ self.langchain_utils_obj.load_create_index()
44
+
45
+
46
+ # Upload data and update the index
47
+ def upload_data(
48
+ self,
49
+ doc_type,
50
+ files_or_urls,
51
+ index_category
52
+ ):
53
+ logger.info(f"Uploading data")
54
+ self.langchain_utils_obj.upload_data(
55
+ doc_type=doc_type,
56
+ files_or_urls=files_or_urls,
57
+ index_category=index_category
58
+ )
59
+
60
+
61
+ # Define query on index to retrieve the most relevant top K documents from the vector store
62
+ def query(
63
+ self,
64
+ question,
65
+ question_category
66
+ ):
67
+ '''
68
+ Args:
69
+ question: the query text to look up in the index/vector store
70
+ question_category: passed through unchanged to langchain_utils_obj.query()
71
+ '''
72
+ logger.info(f"Querying from index/vector store")
73
+
74
+ return self.langchain_utils_obj.query(
75
+ question=question,
76
+ question_category=question_category
77
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -17,4 +17,5 @@ faiss-cpu
17
  tiktoken
18
  googletrans==3.1.0a0
19
  BeautifulSoup4
20
- PyPDF2
 
 
17
  tiktoken
18
  googletrans==3.1.0a0
19
  BeautifulSoup4
20
+ pymupdf
21
+ PyPDF2
utils/constants.py CHANGED
@@ -1,42 +1,174 @@
 
 
 
 
 
 
 
1
  DATA_PATH = './data/'
2
- OUTPUT_PATH = './output'
3
- INDEX_FILENAME = f'{OUTPUT_PATH}/index.json'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  URLS = [
6
- 'https://dmi.gov.in/Documents/GrantCAGrapes.pdf',
7
- 'https://dmi.gov.in/Documents/organicfaq.pdf',
8
- 'https://dmi.gov.in/Documents/CAGMOrganic-III.pdf',
9
- 'https://dmi.gov.in/GradesStandard.aspx',
10
- 'https://www.india.gov.in/topics/agriculture',
11
- 'https://www.india.gov.in/farmers-portal',
12
-
13
- # Pest Management related
14
- 'https://niphm.gov.in/IPMPackages/Maize.pdf',
15
-
16
- # Mandi Price related
17
- 'https://agmarknet.gov.in/',
18
- 'https://enam.gov.in/web/dashboard/trade-data',
19
-
20
- # General information related: Information of interests are present on the 2nd level url
21
- 'https://agricoop.nic.in/#gsc.tab=0',
22
- 'https://www.manage.gov.in/nf/nf.asp',
23
-
24
- # Weather forecast related
25
- 'https://nwp.imd.gov.in/blf/blf_temp/', # need to select state -> district (on the new page) -> displays detailed table -> can get info at the block level as well from the same page on selection
26
- 'https://nwp.imd.gov.in/blf/blf_temp/dis.php?value=12gujarat', # to get weather forecast for the given state
27
- 'https://nwp.imd.gov.in/blf/blf_temp/block.php?dis=12BHAVNAGAR', # to get the weather forecast for the given district
 
 
 
28
  ]
29
 
30
 
31
  # Supported Indian languages for translating the English text to an Indian language
32
  INDIC_LANGUAGE = {
33
- 'Hindi': 'hi',
34
- 'Gujarati': 'gu',
35
- 'Kannada': 'kn',
36
- 'Marathi': 'mr',
37
- 'Panjabi': 'pa',
38
- 'Bengali': "bn",
39
- 'Telugu': 'te',
40
- 'Tamil': 'ta',
41
- 'Malayalam': 'ml',
42
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import utils.web_crawler as web_crawler_utils
3
+
4
+ LOAD_FROM_EXISTING_INDEX_STORE = True
5
+ INDEX_TYPE = 'FAISS'
6
+
7
+ # Path from where to load the data (from the local directory)
8
  DATA_PATH = './data/'
9
+
10
+ # Path to store the index/vector db
11
+ OUTPUT_PATH = os.path.join('./output/', INDEX_TYPE)
12
+ # Create OUTPUT_PATH directory if not present
13
+ if not os.path.exists(OUTPUT_PATH):
14
+ os.makedirs(OUTPUT_PATH)
15
+
16
+ # Index categories (There would be an index for each category. On asking the query, App will search for the relevant docs/information only from the respective index category.)
17
+ INDEX_CATEGORY = [
18
+ # 'crops',
19
+ # 'fruits',
20
+ # 'pest_management',
21
+ # 'govt_policy',
22
+ # 'insurance',
23
+ # 'soil',
24
+ 'general',
25
+ ]
26
+
27
+ # Doctype of the master index of each index category. Master index for each index category would be stored under this key.
28
+ INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE = 'master'
29
+
30
+ # Output index name if creating the index/vector store using GPTSimpleVectorIndex
31
+ INDEX_FILENAME = os.path.join(OUTPUT_PATH, 'index.json')
32
+
33
+ # List of data sources/types & from where to load the data and create the index/vector store
34
+ # 2nd item is the type of source from where the data would be loaded. Currently it could come from either a file or URL.
35
+ DATA_SOURCES = {
36
+ 'PDF': 'pdf',
37
+ 'Text File': 'textfile',
38
+ 'Online PDF': 'online_pdf', # web_crawler_utils.get_ipm_packages_pdfs_urls()[:1]
39
+ 'URLs': 'urls',
40
+ }
41
+
42
+ # LangChain related constants
43
+ TEXT_SPLITTER_CHUNK_SIZE = 1000
44
+ TEXT_SPLITTER_CHUNK_OVERLAP = 0
45
+
46
 
47
  URLS = [
48
+ 'https://agricoop.nic.in/#gsc.tab=0',
49
+
50
+ 'https://dmi.gov.in/Documents/GrantCAGrapes.pdf',
51
+ 'https://dmi.gov.in/Documents/organicfaq.pdf',
52
+ 'https://dmi.gov.in/Documents/CAGMOrganic-III.pdf',
53
+ 'https://dmi.gov.in/GradesStandard.aspx',
54
+ 'https://www.india.gov.in/topics/agriculture',
55
+ 'https://www.india.gov.in/farmers-portal',
56
+
57
+ # Pest Management related
58
+ 'https://niphm.gov.in/IPMPackages/Maize.pdf',
59
+
60
+ # Banned Pesticides
61
+ 'https://ppqs.gov.in/divisions/cib-rc/registered-products', # Online PDF links on the page
62
+
63
+ # Mandi Price related
64
+ 'https://agmarknet.gov.in/',
65
+
66
+ # General information related: the information of interest is present on the second-level URL
67
+ 'https://www.manage.gov.in/nf/nf.asp',
68
+
69
+ # Weather forecast related
70
+ 'https://nwp.imd.gov.in/blf/blf_temp/', # need to select state -> district (on the new page) -> displays detailed table -> can get info at the block level as well from the same page on selection
71
+ 'https://nwp.imd.gov.in/blf/blf_temp/dis.php?value=12gujarat', # to get weather forecast for the given state
72
+ 'https://nwp.imd.gov.in/blf/blf_temp/block.php?dis=12BHAVNAGAR', # to get the weather forecast for the given district
73
  ]
74
 
75
 
76
  # Supported Indian languages for translating the English text to an Indian language
77
  INDIC_LANGUAGE = {
78
+ 'Hindi': 'hi',
79
+ 'Gujarati': 'gu',
80
+ 'Kannada': 'kn',
81
+ 'Marathi': 'mr',
82
+ 'Panjabi': 'pa',
83
+ 'Bengali': "bn",
84
+ 'Telugu': 'te',
85
+ 'Tamil': 'ta',
86
+ 'Malayalam': 'ml',
87
  }
88
+
89
+ # State list used in the Mandi Price widget dropdown list
90
+ MANDI_PRICE_STATES = [
91
+ 'ANDAMAN AND NICOBAR ISLANDS',
92
+ 'ANDHRA PRADESH',
93
+ 'ASSAM',
94
+ 'BIHAR',
95
+ 'CHANDIGARH',
96
+ 'CHHATTISGARH',
97
+ 'GOA',
98
+ 'GUJARAT',
99
+ 'HARYANA',
100
+ 'HIMACHAL PRADESH',
101
+ 'JAMMU AND KASHMIR',
102
+ 'JHARKHAND',
103
+ 'KARNATAKA',
104
+ 'KERALA',
105
+ 'MADHYA PRADESH',
106
+ 'MAHARASHTRA',
107
+ 'NAGALAND',
108
+ 'ODISHA',
109
+ 'PUDUCHERRY',
110
+ 'PUNJAB',
111
+ 'RAJASTHAN',
112
+ 'TAMIL NADU',
113
+ 'TELANGANA',
114
+ 'TRIPURA',
115
+ 'UTTAR PRADESH',
116
+ 'UTTARAKHAND',
117
+ 'WEST BENGAL'
118
+ ]
119
+
120
+ # State list used in the Weather forecast widget dropdown list
121
+ WEATHER_FORECAST_STATES = [
122
+ 'Andaman-Nicobar',
123
+ 'Andhra-Pradesh',
124
+ 'Arunachal-Pradesh',
125
+ 'Assam',
126
+ 'Bihar',
127
+ 'Chandigarh',
128
+ 'Chhattisgarh',
129
+ 'Dadra-and-Nagar-Haveli',
130
+ 'Daman-and-Diu',
131
+ 'Delhi',
132
+ 'Goa',
133
+ 'Gujarat',
134
+ 'Haryana',
135
+ 'Himachal-Pradesh',
136
+ 'Jammu-Kashmir',
137
+ 'Jharkhand',
138
+ 'Karnataka',
139
+ 'Kerala',
140
+ 'Lakshadweep',
141
+ 'Madhya-Pradesh',
142
+ 'Maharashtra',
143
+ 'Manipur',
144
+ 'Meghalaya',
145
+ 'Mizoram',
146
+ 'Nagaland',
147
+ 'Odisha',
148
+ 'Pondicherry',
149
+ 'Punjab',
150
+ 'Rajasthan',
151
+ 'Sikkim',
152
+ 'Tamilnadu',
153
+ 'Telangana',
154
+ 'Tripura',
155
+ 'Uttar-Pradesh',
156
+ 'Uttarakhand',
157
+ 'West-Bengal'
158
+ ]
159
+
160
+ # LIST OF PESTICIDES WHICH ARE BANNED AND RESTRICTED USE (List created from: https://pib.gov.in/PressReleaseIframePage.aspx?PRID=1896140)
161
+ BANNED_PESTICIDES_FORMULATIONS = [
162
+ 'Alachlor',
163
+ 'Aldicarb',
164
+ 'Aldrin',
165
+ 'Benzene Hexachloride',
166
+ 'Benomyl',
167
+ 'Calcium Cyanide',
168
+ 'Carbaryl',
169
+ 'Chlorbenzilate',
170
+ 'Chlordane',
171
+ 'Chlorofenvinphos',
172
+ 'Copper Acetoarsenite',
173
+ ]
174
+
utils/data_loader.py CHANGED
@@ -1,104 +1,214 @@
1
  import os
 
2
  import pandas as pd
3
  from pathlib import Path
4
- from llama_index import GPTSimpleVectorIndex, download_loader
 
 
 
5
  from langchain.agents import initialize_agent, Tool
6
  from langchain.llms import OpenAI
7
  from langchain.chains.conversation.memory import ConversationBufferMemory
8
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
11
  class DATA_LOADER:
12
- def __init__(self):
13
- print()
14
-
15
-
16
- def clean_df(self, df, dropna=True, fillna=False):
17
- if fillna:
18
- df.fillna('', inplace=True)
19
- if dropna:
20
- df.dropna(inplace=True)
21
- # df = df[~df.isna()]
22
- df = df.drop_duplicates().reset_index(drop=True)
23
- return df
24
-
25
-
26
- def load_external_links_used_by_FTAs(self,
27
- sheet_filepath='./data/urls_used_by_ftas/external_links_used_by_FTAs.xlsx'
28
- ):
29
- xls = pd.ExcelFile(sheet_filepath)
30
- df = pd.DataFrame(columns=['S.No.', 'Link used for', 'Link type', 'Link'])
31
- for sheet_name in xls.sheet_names:
32
- sheet = pd.read_excel(xls, sheet_name)
33
- if sheet.shape[0] > 0:
34
- df = pd.concat([df, sheet])
35
- else:
36
- print(f'{sheet_name} has no content.')
37
-
38
- df = df[['Link used for', 'Link type', 'Link']]
39
- # Clean df
40
- df = clean_df(df)
41
- print(f'Total links available across all cities: {df.shape[0]}')
42
- return df
43
-
44
-
45
- def load_document(self,
46
- doc_type='pdf',
47
- doc_filepath='',
48
- urls=[]
49
- ):
50
- documents = []
51
-
52
- if doc_type == 'pdf':
53
- PDFReader = download_loader("PDFReader")
54
- loader = PDFReader()
55
- if os.path.exists(doc_filepath):
56
- documents = loader.load_data(file=Path(doc_filepath))
57
-
58
- elif doc_type == 'url':
59
- BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
60
- loader = BeautifulSoupWebReader()
61
- if len(urls) > 0:
62
- # Load data from URLs
63
- documents = loader.load_data(urls=urls)
64
-
65
- elif doc_type == 'url-kb':
66
- KnowledgeBaseWebReader = download_loader("KnowledgeBaseWebReader")
67
- loader = KnowledgeBaseWebReader()
68
- for url in urls:
69
- doc = loader.load_data(
70
- root_url=url,
71
- link_selectors=['.article-list a', '.article-list a'],
72
- article_path='/articles',
73
- body_selector='.article-body',
74
- title_selector='.article-title',
75
- subtitle_selector='.article-subtitle',
76
- )
77
- documents.extend(doc)
78
-
79
- elif doc_type == 'url-chatgpt':
80
- BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
81
- loader = BeautifulSoupWebReader()
82
- if len(urls) > 0:
83
- # Load data from URLs
84
- documents = loader.load_data(urls=urls)
85
- # Build the Vector database
86
- index = GPTSimpleVectorIndex(documents)
87
- tools = [
88
- Tool(
89
- name="Website Index",
90
- func=lambda q: index.query(q),
91
- description=f"Useful when you want answer questions about the text retrieved from websites.",
92
- ),
93
- ]
94
-
95
- # Call ChatGPT API
96
- llm = OpenAI(temperature=0) # Keep temperature=0 to search from the given urls only
97
- memory = ConversationBufferMemory(memory_key="chat_history")
98
- agent_chain = initialize_agent(
99
- tools, llm, agent="zero-shot-react-description", memory=memory
100
- )
101
-
102
- output = agent_chain.run(input="What language is on this website?")
103
-
104
- return documents
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import re
3
  import pandas as pd
4
  from pathlib import Path
5
+ import glob
6
+
7
+ from llama_index import GPTSimpleVectorIndex, download_loader, SimpleDirectoryReader
8
+ from langchain.document_loaders import PyPDFLoader, TextLoader
9
  from langchain.agents import initialize_agent, Tool
10
  from langchain.llms import OpenAI
11
  from langchain.chains.conversation.memory import ConversationBufferMemory
12
 
13
+ import utils.utils as utils
14
+
15
+ import logging
16
+ logger = logging.getLogger(__name__)
17
+ logging.basicConfig(
18
+ format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
19
+ )
20
+
21
+ import warnings
22
+ warnings.filterwarnings('ignore')
23
+
24
 
25
 
26
  class DATA_LOADER:
27
+ def __init__(self):
28
+ # Instantiate UTILS class object
29
+ self.utils_obj = utils.UTILS()
30
+
31
+
32
+ def load_documents_from_urls(self, urls=[], doc_type='urls'):
33
+ url_documents = self.load_document(doc_type=doc_type, urls=urls)
34
+ return url_documents
35
+
36
+
37
+ def load_documents_from_pdf(self, doc_filepath='', urls=[], doc_type='pdf'):
38
+ if doc_type == 'pdf':
39
+ pdf_documents = self.load_document(doc_type=doc_type, doc_filepath=doc_filepath)
40
+ elif doc_type == 'online_pdf':
41
+ pdf_documents = self.load_document(doc_type=doc_type, urls=urls)
42
+ return pdf_documents
43
+
44
+
45
+ def load_documents_from_directory(self, doc_filepath='', doc_type='directory'):
46
+ doc_documents = self.load_document(doc_type=doc_type, doc_filepath=doc_filepath)
47
+ return doc_documents
48
+
49
+
50
+ def load_documents_from_text(self, doc_filepath='', doc_type='textfile'):
51
+ text_documents = self.load_document(doc_type=doc_type, doc_filepath=doc_filepath)
52
+ return text_documents
53
+
54
+
55
+ def pdf_loader(self, filepath):
56
+ loader = PyPDFLoader(filepath)
57
+ return loader.load_and_split()
58
+
59
+
60
+ def text_loader(self, filepath):
61
+ loader = TextLoader(filepath)
62
+ return loader.load()
63
+
64
+
65
+ def load_document(self,
66
+ doc_type='pdf',
67
+ doc_filepath='',
68
+ urls=[]
69
+ ):
70
+ logger.info(f'Loading {doc_type} in raw format from: {doc_filepath}')
71
+
72
+ documents = []
73
+
74
+ # Validation checks
75
+ if doc_type in ['directory', 'pdf', 'textfile']:
76
+ if not os.path.exists(doc_filepath):
77
+ logger.warning(f"{doc_filepath} does not exist, nothing can be loaded!")
78
+ return documents
79
+
80
+ elif doc_type in ['online_pdf', 'urls']:
81
+ if len(urls) == 0:
82
+ logger.warning(f"URLs list empty, nothing can be loaded!")
83
+ return documents
84
+
85
+
86
+ ######### Load documents #########
87
+ # Load PDF
88
+ if doc_type == 'pdf':
89
+ # Load multiple PDFs from directory
90
+ if os.path.isdir(doc_filepath):
91
+ pdfs = glob.glob(f"{doc_filepath}/*.pdf")
92
+ logger.info(f'Total PDF files to load: {len(pdfs)}')
93
+ for pdf in pdfs:
94
+ documents.extend(self.pdf_loader(pdf))
95
+
96
+ # Loading from a single PDF file
97
+ elif os.path.isfile(doc_filepath) and doc_filepath.endswith('.pdf'):
98
+ documents.extend(self.pdf_loader(doc_filepath))
99
+
100
+ # Load PDFs from online (urls). Can read multiple PDFs from multiple URLs in one-shot
101
+ elif doc_type == 'online_pdf':
102
+ logger.info(f'URLs to load Online PDFs are from: {urls}')
103
+ valid_urls = self.utils_obj.validate_url_format(
104
+ urls=urls,
105
+ url_type=doc_type
106
+ )
107
+ for url in valid_urls:
108
+ # Load and split PDF pages per document
109
+ documents.extend(self.pdf_loader(url))
110
+
111
+ # Load data from URLs (can load data from multiple URLs)
112
+ elif doc_type == 'urls':
113
+ logger.info(f'URLs to load data from are: {urls}')
114
+ valid_urls = self.utils_obj.validate_url_format(
115
+ urls=urls,
116
+ url_type=doc_type
117
+ )
118
+ BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
119
+ loader = BeautifulSoupWebReader()
120
+ # Load data from URLs
121
+ documents = loader.load_data(urls=valid_urls)
122
+
123
+ # Load data from text file(s)
124
+ elif doc_type == 'textfile':
125
+ # Load multiple text files from directory
126
+ if os.path.isdir(doc_filepath):
127
+ text_files = glob.glob(f"{doc_filepath}/*.txt")
128
+ logger.info(f'Total text files to load: {len(text_files)}')
129
+ for tf in text_files:
130
+ documents.extend(self.text_loader(tf))
131
+
132
+ # Loading from a single text file
133
+ elif os.path.isfile(doc_filepath) and doc_filepath.endswith('.txt'):
134
+ documents.extend(self.text_loader(doc_filepath))
135
+
136
+ # Load data from files on the local directory (files may be of type .pdf, .txt, .doc, etc.)
137
+ elif doc_type == 'directory':
138
+ documents = SimpleDirectoryReader(doc_filepath).load_data()
139
+
140
+ # Load data from URLs in Knowledge Base format
141
+ elif doc_type == 'url-kb':
142
+ KnowledgeBaseWebReader = download_loader("KnowledgeBaseWebReader")
143
+ loader = KnowledgeBaseWebReader()
144
+ for url in urls:
145
+ doc = loader.load_data(
146
+ root_url=url,
147
+ link_selectors=['.article-list a', '.article-list a'],
148
+ article_path='/articles',
149
+ body_selector='.article-body',
150
+ title_selector='.article-title',
151
+ subtitle_selector='.article-subtitle',
152
+ )
153
+ documents.extend(doc)
154
+
155
+ # Load data from URLs and create an agent chain using ChatGPT
156
+ elif doc_type == 'url-chatgpt':
157
+ BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
158
+ loader = BeautifulSoupWebReader()
159
+ # Load data from URLs
160
+ documents = loader.load_data(urls=urls)
161
+ # Build the Vector database
162
+ index = GPTSimpleVectorIndex(documents)
163
+ tools = [
164
+ Tool(
165
+ name="Website Index",
166
+ func=lambda q: index.query(q),
167
+ description=f"Useful when you want answer questions about the text retrieved from websites.",
168
+ ),
169
+ ]
170
+
171
+ # Call ChatGPT API
172
+ llm = OpenAI(temperature=0) # Keep temperature=0 to search from the given urls only
173
+ memory = ConversationBufferMemory(memory_key="chat_history")
174
+ agent_chain = initialize_agent(
175
+ tools, llm, agent="zero-shot-react-description", memory=memory
176
+ )
177
+
178
+ output = agent_chain.run(input="What language is on this website?")
179
+
180
+
181
+ # Clean documents
182
+ documents = self.clean_documents(documents)
183
+ logger.info(f'{doc_type} in raw format from: {doc_filepath} loaded successfully!')
184
+ return documents
185
+
186
+
187
+ def clean_documents(
188
+ self,
189
+ documents
190
+ ):
191
+ cleaned_documents = []
192
+ for document in documents:
193
+ document.page_content = self.utils_obj.replace_newlines_and_spaces(document.page_content)
194
+ cleaned_documents.append(document)
195
+ return cleaned_documents
196
+
197
+
198
+ def load_external_links_used_by_FTAs(self,
199
+ sheet_filepath='./data/urls_used_by_ftas/external_links_used_by_FTAs.xlsx'
200
+ ):
201
+ xls = pd.ExcelFile(sheet_filepath)
202
+ df = pd.DataFrame(columns=['S.No.', 'Link used for', 'Link type', 'Link'])
203
+ for sheet_name in xls.sheet_names:
204
+ sheet = pd.read_excel(xls, sheet_name)
205
+ if sheet.shape[0] > 0:
206
+ df = pd.concat([df, sheet])
207
+ else:
208
+ logger.info(f'{sheet_name} has no content.')
209
+
210
+ df = df[['Link used for', 'Link type', 'Link']]
211
+ # Clean df
212
+ df = self.utils_obj.clean_df(df)
213
+ logger.info(f'Total links available across all cities: {df.shape[0]}')
214
+ return df
utils/langchain_utils.py CHANGED
@@ -1,169 +1,839 @@
 
 
 
 
1
  from langchain.llms import OpenAI
2
- from langchain.text_splitter import CharacterTextSplitter
3
  from langchain.chains.summarize import load_summarize_chain
4
  from langchain.docstore.document import Document
5
  from langchain.embeddings.openai import OpenAIEmbeddings
6
  from langchain.vectorstores import Chroma
 
7
  from langchain.chains.question_answering import load_qa_chain
8
  from langchain.chains.qa_with_sources import load_qa_with_sources_chain
9
  from langchain.prompts import PromptTemplate
10
- from llama_index import GPTSimpleVectorIndex
11
  from langchain.vectorstores import FAISS
12
 
13
  import pickle
 
 
 
14
  import os
15
  os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
16
  os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN')
17
 
 
 
 
 
 
 
 
 
 
18
 
19
 
20
  class LANGCHAIN_UTILS:
21
- def __init__(self):
22
- print()
23
-
24
-
25
- def generate_prompt_template(self, prompt_type='general'):
26
- prompt_template = ''
27
-
28
- if prompt_type == 'general':
29
- prompt_template = """Write a concise summary of the following:
30
-
31
- {text}
32
-
33
- CONCISE SUMMARY IN ENGLISH:"""
34
-
35
- elif prompt_type == 'weather':
36
- prompt_template = """
37
- What would be the weather based on the below data:
38
- {text}
39
- """
40
-
41
- return prompt_template
42
-
43
-
44
-
45
- def get_textual_summary(self,
46
- text,
47
- chain_type="stuff",
48
- custom_prompt=True,
49
- prompt_type='general'
50
- ):
51
- texts = [text]
52
- docs = [Document(page_content=t) for t in texts[:3]]
53
-
54
- llm = OpenAI(temperature=0)
55
- if custom_prompt:
56
- prompt_template = self.generate_prompt_template(prompt_type)
57
- PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
58
- chain = load_summarize_chain(llm, chain_type=chain_type, prompt=PROMPT)
59
- else:
60
- chain = load_summarize_chain(llm, chain_type=chain_type)
61
-
62
- text_summary = chain.run(docs)
63
- return text_summary
64
-
65
-
66
- def get_weather_forecast_summary(self,
67
- text,
68
- chain_type="stuff"
69
- ):
70
- text = f"""
71
- What would be the weather based on the below data:
72
- {text}
73
-
74
- Give simple response without technical numbers which can be explained to human.
75
- """
76
- texts = [text]
77
- docs = [Document(page_content=t) for t in texts[:3]]
78
-
79
- llm = OpenAI(temperature=0)
80
- chain = load_summarize_chain(llm, chain_type=chain_type)
81
- text_summary = chain.run(docs)
82
-
83
- return text_summary
84
-
85
-
86
- def get_answer_from_para(self,
87
- para,
88
- question,
89
- chain_type="stuff",
90
- custom_prompt=True
91
- ):
92
- # Prepare data (Split paragraph into chunks of small documents)
93
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
94
- texts = text_splitter.split_text(para)
95
-
96
- # Find similar docs that are relevant to the question
97
- embeddings = OpenAIEmbeddings()
98
- docsearch = Chroma.from_texts(
99
- texts, embeddings,
100
- metadatas=[{"source": str(i)} for i in range(len(texts))]
101
- )
102
-
103
- # Search for the similar docs
104
- docs = docsearch.similarity_search(question, k=1)
105
-
106
- llm = OpenAI(temperature=0)
107
- # Create a Chain for question answering
108
- if custom_prompt:
109
- prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
110
-
111
- {context}
112
-
113
- Question: {question}
114
- Answer in English:"""
115
-
116
- PROMPT = PromptTemplate(
117
- template=prompt_template, input_variables=["context", "question"]
118
- )
119
- chain = load_qa_chain(llm, chain_type=chain_type, prompt=PROMPT)
120
- else:
121
- # chain = load_qa_with_sources_chain(llm, chain_type=chain_type)
122
- chain = load_qa_chain(llm, chain_type=chain_type)
123
- # chain.run(input_documents=docs, question=question)
124
-
125
- out_dict = chain({"input_documents": docs, "question": question}, return_only_outputs=True)
126
- return out_dict['output_text']
127
-
128
-
129
- def store_index(self,
130
- index,
131
- index_type='GPTSimpleVectorIndex',
132
- filepath='./output/index.json'
133
- ):
134
- if index_type == 'GPTSimpleVectorIndex':
135
- index.save_to_disk(filepath)
136
-
137
- elif index_type == 'pickle':
138
- with open(filepath, "wb") as f:
139
- pickle.dump(index, f)
140
-
141
- elif index_type == 'FAISS':
142
- index.save_local(filepath)
143
-
144
-
145
- def load_index(self,
146
- index_type='GPTSimpleVectorIndex',
147
- filepath='./output/index.json'
148
- ):
149
- if index_type == 'GPTSimpleVectorIndex':
150
- index = GPTSimpleVectorIndex.load_from_disk(filepath)
151
-
152
- elif index_type == 'pickle':
153
- with open(filepath, "rb") as f:
154
- index = pickle.load(f)
155
-
156
- elif index_type == 'FAISS':
157
- index = FAISS.load_local(filepath, OpenAIEmbeddings()) # can we use open-source embeddings?
158
-
159
- return index
160
-
161
-
162
- def convert_text_to_documents(self, text_list=[]):
163
- """
164
- Converts the list of text data to Documents format that can be feed to GPT API to build the Vector store
165
- """
166
-
167
- from llama_index import Document
168
- documents = [Document(t) for t in text_list]
169
- return documents
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import utils.constants as constants_utils
2
+ import utils.data_loader as data_loader_utils
3
+ import utils.utils as utils
4
+
5
  from langchain.llms import OpenAI
6
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
7
  from langchain.chains.summarize import load_summarize_chain
8
  from langchain.docstore.document import Document
9
  from langchain.embeddings.openai import OpenAIEmbeddings
10
  from langchain.vectorstores import Chroma
11
+ import chromadb
12
  from langchain.chains.question_answering import load_qa_chain
13
  from langchain.chains.qa_with_sources import load_qa_with_sources_chain
14
  from langchain.prompts import PromptTemplate
15
+ from llama_index import GPTSimpleVectorIndex, GPTListIndex
16
  from langchain.vectorstores import FAISS
17
 
18
  import pickle
19
+ import shutil
20
+ from typing import Dict, List, Optional
21
+
22
  import os
23
# Re-export the API credentials only when they are actually present.
# os.environ accepts str values only, so assigning the result of os.getenv()
# unconditionally raises TypeError at import time whenever a variable is unset.
for _credential_name in ('OPENAI_API_KEY', 'HUGGINGFACEHUB_API_TOKEN'):
    _credential_value = os.getenv(_credential_name)
    if _credential_value is not None:
        os.environ[_credential_name] = _credential_value
25
 
26
+ import logging
27
+ logger = logging.getLogger(__name__)
28
+ logging.basicConfig(
29
+ format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
30
+ )
31
+
32
+ import warnings
33
+ warnings.filterwarnings('ignore')
34
+
35
 
36
 
37
  class LANGCHAIN_UTILS:
38
def __init__(self,
    index_type=constants_utils.INDEX_TYPE,
    load_from_existing_index_store=constants_utils.LOAD_FROM_EXISTING_INDEX_STORE
):
    """
    Set up the index bookkeeping, the data loaders and the embedding model.

    Args:
        index_type: name of the vector/index store backend to use
        load_from_existing_index_store: whether stored indices should be loaded before new ones are created
    """
    self.index_type = index_type
    self.load_from_existing_index_store = load_from_existing_index_store

    # Working index for the doc_type currently being processed
    self.index = None
    # Master index holding data from multiple sources (PDF, online PDF, text files, URLs, etc.)
    self.master_index = None

    # index_category -> {data source -> index}; every slot starts out empty
    self.index_category_doc_type_wise_index = {
        category: dict.fromkeys(constants_utils.DATA_SOURCES.values())
        for category in constants_utils.INDEX_CATEGORY
    }

    # Documents loaded for the doc_type currently being processed
    self.documents = []

    # Helper objects for loading data and for generic utilities
    self.data_loader_utils_obj = data_loader_utils.DATA_LOADER()
    self.utils_obj = utils.UTILS()

    # Embedding model (other embeddings could be plugged in here)
    self.embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
66
+
67
+
68
def generate_prompt_template(
    self,
    prompt_type='general'
):
    """
    Return the prompt template text for the given prompt_type.

    Args:
        prompt_type: 'general' or 'weather'; any other value yields an empty string
    """
    if prompt_type == 'general':
        return (
            "Write a concise summary of the following:\n"
            "\n"
            "{text}\n"
            "\n"
            "SUMMARIZE IN ENGLISH:"
        )

    if prompt_type == 'weather':
        return (
            "\n"
            "What would be the weather based on the below data:\n"
            "{text}\n"
        )

    return ''
88
+
89
+
90
+
91
def get_textual_summary(
    self,
    text,
    chain_type="stuff",
    custom_prompt=True,
    prompt_type='general'
):
    """
    Summarise the given text with a LangChain summarize chain backed by OpenAI.

    Args:
        text: text to summarise
        chain_type: chain type forwarded to load_summarize_chain
        custom_prompt: when True, use the template from generate_prompt_template
        prompt_type: template selector used when custom_prompt is True

    Returns:
        The output of running the chain on the text.
    """
    # A single input text always produces exactly one Document
    docs = [Document(page_content=text)]

    llm = OpenAI(temperature=0)
    chain_kwargs = {'chain_type': chain_type}
    if custom_prompt:
        chain_kwargs['prompt'] = PromptTemplate(
            template=self.generate_prompt_template(prompt_type),
            input_variables=["text"]
        )
    chain = load_summarize_chain(llm, **chain_kwargs)

    return chain.run(docs)
111
+
112
+
113
def get_weather_forecast_summary(
    self,
    text,
    chain_type="stuff"
):
    """
    Turn raw weather data into a plain-language forecast summary.

    Args:
        text: weather data to describe
        chain_type: chain type forwarded to load_summarize_chain

    Returns:
        The output of running the summarize chain on the wrapped request.
    """
    # Wrap the raw data in the instruction that is sent to the model
    request = (
        "\n"
        "What would be the weather based on the below data:\n"
        f"{text}\n"
        "\n"
        "Give simple response without technical numbers which can be explained to human.\n"
    )
    docs = [Document(page_content=request)]

    summarizer = load_summarize_chain(OpenAI(temperature=0), chain_type=chain_type)
    return summarizer.run(docs)
132
+
133
+
134
def get_answer_from_para(
    self,
    para,
    question,
    chain_type="stuff",
    custom_prompt=True
):
    """
    Answer a question using only the given paragraph as context.

    The paragraph is split into chunks, the chunks are embedded into a temporary
    vector store, the chunk most similar to the question is retrieved and
    passed to a question-answering chain.

    Args:
        para: context text to answer from
        question: question to answer
        chain_type: chain type forwarded to load_qa_chain
        custom_prompt: when True, use the built-in "answer in English" prompt

    Returns:
        The chain's 'output_text'.

    Raises:
        ValueError: if self.index_type is not one of 'FAISS' or 'Chroma'.
    """
    # Only these two backends can build the temporary store. Fail with a clear
    # message instead of an UnboundLocalError further down.
    if self.index_type not in ('FAISS', 'Chroma'):
        raise ValueError(
            f"get_answer_from_para supports index_type 'FAISS' or 'Chroma', got: {self.index_type!r}"
        )

    # Prepare data (split paragraph into chunks of small documents)
    text_splitter = CharacterTextSplitter(
        chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
        chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP
    )
    texts = text_splitter.split_text(para)

    # Build a temporary store over the chunks; each chunk is tagged with its position
    vector_store_cls = FAISS if self.index_type == 'FAISS' else Chroma
    docsearch = vector_store_cls.from_texts(
        texts, self.embeddings,
        metadatas=[{"source": str(i)} for i in range(len(texts))]
    )

    # Search for the similar docs
    docs = docsearch.similarity_search(question, k=1)

    llm = OpenAI(temperature=0)
    # Create a Chain for question answering
    if custom_prompt:
        prompt_template = (
            "Use the following pieces of context to answer the question at the end. "
            "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
            "\n"
            "{context}\n"
            "\n"
            "Question: {question}\n"
            "Answer in English:"
        )
        PROMPT = PromptTemplate(
            template=prompt_template, input_variables=["context", "question"]
        )
        chain = load_qa_chain(llm, chain_type=chain_type, prompt=PROMPT)
    else:
        chain = load_qa_chain(llm, chain_type=chain_type)

    out_dict = chain({"input_documents": docs, "question": question}, return_only_outputs=True)
    return out_dict['output_text']
183
+
184
+
185
def load_documents(
    self,
    doc_type,
    doc_filepath='',
    urls=None
):
    """
    Load data in Document format of the given doc_type from either doc_filepath or list of urls.
    It can load multiple files/urls in one shot. Loaded documents are appended to self.documents.

    Args:
        doc_type: can be any of [pdf, online_pdf, urls, textfile, directory]
        doc_filepath: can be a directory or a filepath
        urls: list of urls (defaults to an empty list)
    """
    # Create the list per call instead of sharing one mutable default object
    urls = [] if urls is None else urls

    logger.info(f'Loading {doc_type} data into Documents format')

    if doc_type == 'pdf':
        # Load data from PDFs stored in local directory
        self.documents.extend(
            self.data_loader_utils_obj.load_documents_from_pdf(
                doc_filepath=doc_filepath,
                doc_type=doc_type
            ))

    elif doc_type == 'online_pdf':
        # Load data from PDFs referenced by URL
        self.documents.extend(
            self.data_loader_utils_obj.load_documents_from_pdf(
                urls=urls,
                doc_type=doc_type
            ))

    elif doc_type == 'urls':
        # Load data from URLs
        self.documents.extend(
            self.data_loader_utils_obj.load_documents_from_urls(
                urls=urls,
                doc_type=doc_type
            ))

    elif doc_type == 'textfile':
        # Load data from text files & convert texts into Document format
        self.documents.extend(
            self.convert_text_to_documents(
                self.data_loader_utils_obj.load_documents_from_text(
                    doc_filepath=doc_filepath,
                    doc_type=doc_type
                )
            ))

    elif doc_type == 'directory':
        # Load data from local directory
        self.documents.extend(
            self.data_loader_utils_obj.load_documents_from_directory(
                doc_filepath=doc_filepath,
                doc_type=doc_type
            ))

    else:
        # Nothing was loaded, so do not report success
        logger.warning(f'Unsupported doc_type: {doc_type}. No documents loaded!')
        return

    logger.info(f'{doc_type} data into Documents format loaded successfully!')
246
+
247
+
248
def create_index(
    self
):
    """
    Build self.index from self.documents using the backend named by self.index_type.

    For the FAISS and Chroma backends self.documents is replaced by its split chunks.

    Returns:
        The created index, or None when there are no documents.
    """
    logger.info(f'Creating index')

    if not self.documents:
        logger.warning(f'Empty documents. Index cannot be created!')
        return None

    def _chunk_documents():
        # Split the loaded documents into chunks sized by the configured constants
        splitter = CharacterTextSplitter(
            chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
            chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
        )
        return splitter.split_documents(self.documents)

    backend = self.index_type

    if backend == 'FAISS':
        # Vector store using Facebook AI Similarity Search
        self.documents = _chunk_documents()
        self.index = FAISS.from_documents(self.documents, self.embeddings)

    elif backend == 'Chroma':
        # Vector store using Chroma DB, persisted under self.index_filepath
        if not os.path.exists(self.index_filepath):
            os.makedirs(self.index_filepath)

        self.documents = _chunk_documents()
        self.index = Chroma.from_documents(
            self.documents,
            self.embeddings,
            persist_directory=self.index_filepath
        )

    elif backend == 'GPTSimpleVectorIndex':
        # Vector store using GPT vector index
        self.index = GPTSimpleVectorIndex.from_documents(self.documents)

    logger.info(f'Index created successfully!')
    return self.index
293
+
294
+
295
def get_index_filepath(
    self,
    index_category,
    doc_type
):
    """
    Compute, remember (in self.index_filepath) and return the storage path of an index.

    The 'master' doc_type maps to OUTPUT_PATH/index_<category>; any other doc_type maps to
    OUTPUT_PATH/index_<category>/index_<doc_type>. Backends other than FAISS and Chroma
    get a '.json' suffix on the last path component.
    """
    suffix = '' if self.index_type in ['FAISS', 'Chroma'] else '.json'

    path_parts = [f'index_{index_category}']
    if doc_type != 'master':
        path_parts.append(f'index_{doc_type}')
    path_parts[-1] += suffix

    self.index_filepath = os.path.join(constants_utils.OUTPUT_PATH, *path_parts)
    return self.index_filepath
308
+
309
+
310
def load_master_doctype_indices_for_index_category(
    self,
    index_category
):
    """
    Load the stored master index and every doc_type index of one index_category.

    Each slot of the category is overwritten with whatever load_index() produced,
    which stays None when nothing could be loaded for that slot.
    """
    logger.info(f'Loading master and doc_type indices for: {index_category}')

    category_indices = self.index_category_doc_type_wise_index[index_category]

    # Reset the master slot so that it is (re)loaded together with the doc_type slots
    category_indices[constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = None

    for doc_type in category_indices:
        self.index = None
        self.index_filepath = self.get_index_filepath(
            index_category=index_category,
            doc_type=doc_type
        )
        self.load_index()
        # Record the master/doc_type index that was just loaded
        category_indices[doc_type] = self.index

    logger.info(f'Master and doc_type indices for: {index_category} loaded successfully!')
330
+
331
+
332
def load_create_index(
    self
):
    """
    Make sure every index_category has a master index.

    Stored indices are loaded first when self.load_from_existing_index_store is set.
    A category whose master index is still missing afterwards gets a new index built
    per data source, and those indices are then merged into a master index.
    """
    logger.info(f'Loading/Creating index for each index_category')

    for index_category in constants_utils.INDEX_CATEGORY:
        # Load master index_category index if self.load_from_existing_index_store == True
        if self.load_from_existing_index_store:
            self.load_master_doctype_indices_for_index_category(index_category)

        # Nothing more to do when the master index is available
        if self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]:
            continue

        # For any reason, if master index is not loaded then create the new index/vector store
        logger.info(f'Creating a new Vector/Index store for: {index_category}')

        doc_filepath = os.path.join(constants_utils.DATA_PATH, index_category)
        urls = []

        # Build the Vector/Index store, one data source at a time
        for doc_type in list(constants_utils.DATA_SOURCES.values()):
            logger.info(f'Creating a new Vector/Index store for: {index_category} from data source: {doc_type}')

            if doc_type in ['pdf', 'textfile']:
                # File based sources are read from the category's data directory
                source_kwargs = {'doc_filepath': doc_filepath}
            else:
                # Everything else is built from web urls
                source_kwargs = {'urls': urls}

            index = self.create_store_index(
                doc_type=doc_type,
                index_category=index_category,
                **source_kwargs
            )

            if index:
                self.index_category_doc_type_wise_index[index_category][doc_type] = index

            logger.info(f'New Vector/Index store for: {index_category} from data source: {doc_type} created successfully!')

        logger.info(f'New Vector/Index store for: {index_category} created successfully!')

        # Merge index of each doc_type into a single index_category
        self.merge_store_master_index(
            index_category=index_category
        )

    logger.info(f'Index for each index_category loaded successfully!')
381
+
382
+
383
def create_store_index(
    self,
    doc_type='pdf',
    doc_filepath=constants_utils.DATA_PATH,
    urls=[],
    index_category=constants_utils.INDEX_CATEGORY[0]
):
    """
    Build the index of one doc_type from scratch and store it.

    Args:
        doc_type: data source type to index
        doc_filepath: directory or file to read documents from
        urls: list of urls to read documents from
        index_category: category the index belongs to

    Returns:
        The index of the given doc_type only; indices of several doc_types are
        merged into the master index elsewhere so that one index can be queried.
    """
    logger.info(f'Creating and storing {doc_type} index')

    # Start from a clean working state
    self.documents, self.index = [], None

    self.index_filepath = self.get_index_filepath(
        index_category=index_category,
        doc_type=doc_type
    )

    # Remove any previously stored index at that location
    shutil.rmtree(self.index_filepath, ignore_errors=True)
    logger.info(f'{self.index_filepath} deleted.')

    # Load data in Documents format that can be consumed for index creation
    self.load_documents(doc_type, doc_filepath, urls)

    # Create the index from documents for search/retrieval, then store it
    self.index = self.create_index()
    self.store_index(index=self.index, index_filepath=self.index_filepath)

    logger.info(f'{doc_type} index created and stored successfully!')
    return self.index
423
+
424
+
425
def store_index(
    self,
    index,
    index_filepath
):
    """
    Store the given index at index_filepath using the backend named by self.index_type.

    FAISS and Chroma stores use index_filepath as a directory; the other backends
    write a single file, so only the parent directory is created for them.

    Args:
        index: index object to store; a falsy value is skipped with a warning
        index_filepath: target directory (FAISS/Chroma) or file path (other backends)
    """
    logger.info(f'Saving index to: {index_filepath}')

    if not index:
        logger.warning(f'Cannot write an empty index to: {index_filepath}!')
        return

    # Creating a directory AT a file path would make the write below fail,
    # so file-based backends only get their parent directory created.
    if self.index_type in ('FAISS', 'Chroma'):
        required_dir = index_filepath
    else:
        required_dir = os.path.dirname(index_filepath)
    if required_dir:
        os.makedirs(required_dir, exist_ok=True)

    if self.index_type == 'FAISS':
        index.save_local(index_filepath)

    elif self.index_type == 'Chroma':
        index.persist()

    elif self.index_type == 'GPTSimpleVectorIndex':
        index.save_to_disk(index_filepath)

    elif self.index_type == 'pickle':
        with open(index_filepath, "wb") as f:
            pickle.dump(index, f)

    logger.info(f'Index saved to: {index_filepath} successfully!')
453
+
454
+
455
def load_index(
    self
):
    """
    Load the index stored at self.index_filepath into self.index.

    self.index is left untouched when the path does not exist or when
    self.index_type is not one of the handled backends.
    """
    index_path = self.index_filepath
    logger.info(f'Loading index from: {index_path}')

    if not os.path.exists(index_path):
        logger.warning(f"Cannot load index from {index_path} as the path doest not exist!")
        return

    backend = self.index_type

    if backend == 'FAISS':
        self.index = FAISS.load_local(index_path, self.embeddings)

    elif backend == 'Chroma':
        self.index = Chroma(
            persist_directory=index_path,
            embedding_function=self.embeddings
        )

    elif backend == 'GPTSimpleVectorIndex':
        self.index = GPTSimpleVectorIndex.load_from_disk(index_path)

    elif backend == 'pickle':
        # NOTE: unpickling executes code from the file; only load index files this application wrote itself
        with open(index_path, "rb") as index_file:
            self.index = pickle.load(index_file)

    logger.info(f'Index loaded from: {index_path} successfully!')
481
+
482
+
483
def convert_text_to_documents(
    self,
    text_list=[]
):
    """
    Wrap every text of text_list in a llama_index Document so that it can be fed to the
    GPT API to build the Vector store.

    NOTE(review): these are llama_index Documents, while the FAISS/Chroma paths use
    LangChain splitters - confirm the two Document types are compatible there.
    """
    from llama_index import Document

    return list(map(Document, text_list))
494
+
495
+
496
def merge_documents_from_different_sources(
    self,
    doc_documents,
    url_documents
):
    """
    Build one GPTSimpleVectorIndex per data source and combine both in a GPTListIndex.

    Args:
        doc_documents: documents loaded from files
        url_documents: documents loaded from URLs

    Returns:
        A GPTListIndex over the docs index and the urls index, in that order.
    """
    labelled_sources = (
        ("index_from_docs", doc_documents),
        ("index_from_urls", url_documents),
    )

    sub_indices = []
    for summary_text, source_documents in labelled_sources:
        # Build the Vector store for this source and set its summary
        sub_index = GPTSimpleVectorIndex.from_documents(source_documents)
        sub_index.set_text(summary_text)
        sub_indices.append(sub_index)

    # Merge index of different data sources
    return GPTListIndex(sub_indices)
514
+
515
+
516
def merge_store_master_index(
    self,
    index_category
):
    """
    Merge multiple doc_type indices into a single master index and store it.
    Query/search would be performed on this merged index.

    Only the FAISS backend is merged; Chroma and GPTSimpleVectorIndex raise
    NotImplementedError as soon as a mergeable index is met.

    Args:
        index_category: index_category (can be any of: [crops, fruits, pest_management, govt_policy, soil, etc.])
    """
    logger.info('Merging doc_type indices of different index categories into a master index')

    master_key = constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE
    category_indices = self.index_category_doc_type_wise_index[index_category]

    # The master index is always rebuilt from the doc_type indices
    category_indices[master_key] = None

    if self.index_type == 'FAISS':
        for doc_type, index in category_indices.items():
            if doc_type == master_key:
                # Only merge the non-master doc_type_indices
                continue
            if not index or not isinstance(index, FAISS):
                logger.warning(f'{doc_type} index to be merged is not an instance of type langchain.vectorstores.faiss.FAISS')
                continue
            if category_indices[master_key]:
                category_indices[master_key].merge_from(index)
            else:
                # NOTE(review): the master slot now refers to the same object as this doc_type
                # index, so later merge_from calls presumably grow that doc_type index too - confirm
                category_indices[master_key] = index

    elif self.index_type in ('Chroma', 'GPTSimpleVectorIndex'):
        if self.index_type == 'Chroma':
            expected_cls, expected_name = Chroma, 'langchain.vectorstores.Chroma'
        else:
            expected_cls, expected_name = GPTSimpleVectorIndex, 'llama_index.GPTSimpleVectorIndex'

        for doc_type, index in category_indices.items():
            if not index or not isinstance(index, expected_cls):
                logger.warning(f'{doc_type} index to be merged is not an instance of type {expected_name}')
                continue
            raise NotImplementedError

    # Store index_category master index
    self.store_index(
        index=category_indices[master_key],
        index_filepath=self.get_index_filepath(
            index_category=index_category,
            doc_type=master_key
        )
    )

    logger.info('doc_type indices of different index categories into a master index merged successfully!')
568
+
569
+
570
def init_chromadb(self):
    """
    Create a Chroma store persisted under self.index_filepath and keep it in self.index.

    The persist directory is created when it does not exist yet.
    """
    logger.info('Initializing Chroma DB')

    persist_dir = self.index_filepath
    if not os.path.exists(persist_dir):
        os.makedirs(persist_dir)

    settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=persist_dir,
        anonymized_telemetry=False
    )
    self.index = Chroma(
        collection_name="langchain_store",
        embedding_function=self.embeddings,
        client_settings=settings,
        persist_directory=persist_dir,
    )

    logger.info('Chroma DB initialized successfully!')
590
+
591
+
592
def query_chromadb(self, question, k=1):
    """Return the k entries of self.index most similar to the question."""
    matches = self.index.similarity_search(query=question, k=k)
    return matches
594
+
595
+
596
def query(self,
    question,
    question_category,
    mode='embedding',
    response_mode="default",
    similarity_top_k=2,
    required_keywords=[],
    exclude_keywords=[],
    verbose=False
):
    '''
    Query the master index of the given question_category.

    Args:
        question: text to search for
        question_category: index category whose master index is queried
        mode: can be any of [default, embedding]
        response_mode: can be any of [default, compact, tree_summarize]
        similarity_top_k: number of results to retrieve

    mode, response_mode, required_keywords, exclude_keywords and verbose are only
    forwarded for the GPTSimpleVectorIndex backend.

    Returns:
        The backend's search/query result, or None for an unhandled index_type.
    '''
    logger.info(f'question category: {question_category}; question: {question}')

    # Get the index of the given question_category
    index = self.index_category_doc_type_wise_index[question_category]['master']

    # Both LangChain vector stores expose the same similarity search call
    if self.index_type in ('FAISS', 'Chroma'):
        return index.similarity_search(
            question,
            k=similarity_top_k
        )

    if self.index_type == 'GPTSimpleVectorIndex':
        # Querying the index
        return index.query(
            question,
            mode=mode,
            response_mode=response_mode,
            similarity_top_k=similarity_top_k,
            required_keywords=required_keywords,
            exclude_keywords=exclude_keywords,
            verbose=verbose
        )

    return None
643
+
644
+
645
def load_uploaded_documents(
    self,
    doc_type,
    files_or_urls
):
    """
    Load uploaded files or URLs as documents and append them to self.documents.

    Args:
        doc_type: one of [pdf, textfile, online_pdf, urls]; other values load nothing
        files_or_urls: uploaded file object(s) exposing a .name path for pdf/textfile,
            or the text that self.utils_obj.split_text turns into urls for online_pdf/urls

    NOTE(review): unlike load_documents, the textfile branch does not pass the loader
    output through convert_text_to_documents - confirm which of the two is intended.
    """
    logger.info(f'Loading uploaded documents from: {doc_type}')

    # doc_type -> (required file extension, label used in the log, loader method name)
    file_rules = {
        'pdf': ('.pdf', 'PDF', 'load_documents_from_pdf'),
        'textfile': ('.txt', 'textfile', 'load_documents_from_text'),
    }
    # doc_type -> loader method name for url based uploads
    url_loaders = {
        'online_pdf': 'load_documents_from_pdf',
        'urls': 'load_documents_from_urls',
    }

    if doc_type in file_rules:
        extension, label, loader_name = file_rules[doc_type]
        uploads = files_or_urls if isinstance(files_or_urls, list) else [files_or_urls]

        for upload in uploads:
            if not upload.name.endswith(extension):
                logger.warning(f'Found a file other than {extension} format. Cannot load {upload.name} file!')
                continue
            logger.info(f'Loading {label} from: {upload.name}')
            # Load the file as documents
            load = getattr(self.data_loader_utils_obj, loader_name)
            self.documents.extend(
                load(
                    doc_filepath=upload.name,
                    doc_type=doc_type
                )
            )

    elif doc_type in url_loaders:
        url_list = self.utils_obj.split_text(files_or_urls)
        # Load the urls as documents
        load = getattr(self.data_loader_utils_obj, url_loaders[doc_type])
        self.documents.extend(
            load(
                doc_type=doc_type,
                urls=url_list
            )
        )

    logger.info(f'Uploaded documents from: {doc_type} loaded successfully!')
705
+
706
+
707
def upload_data(
    self,
    doc_type,
    files_or_urls,
    index_category
):
    """
    Index newly uploaded files/urls and fold them into the indices of index_category.

    Args:
        doc_type: type of the uploaded data
        files_or_urls: uploaded files or urls
        index_category: category whose indices are updated
    """
    logger.info(f'Uploading data for: {index_category}-{doc_type}')

    # Start from a clean working state
    self.documents, self.index = [], None

    # Create documents of the uploaded files
    self.load_uploaded_documents(doc_type, files_or_urls)

    # Create the index from documents for search/retrieval
    self.index = self.create_index()

    # Update the existing index with the new data
    self.upsert_index(doc_type=doc_type, index_category=index_category)

    logger.info(f'{index_category}-{doc_type} data uploaded successfully!')
734
+
735
+
736
def upsert_index(
    self,
    doc_type,
    index_category
):
    """
    Updates the index of the given index_category-doc_type, if present.
    Creates a new index if index_category-doc_type index is not present.
    Also updates the master index for the given index_category.

    The new data is taken from self.index. When self.index is None (nothing was
    indexed) the existing indices are left untouched.

    Args:
        doc_type: data source type of the new index
        index_category: category the new index belongs to
    """
    logger.info(f'Upserting index for: {index_category}-{doc_type}')

    # Nothing to upsert: merging None into an existing index would crash
    if self.index is None:
        logger.warning(f'No new index to upsert for: {index_category}-{doc_type}. Existing indices are left untouched.')
        return

    master_key = constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE

    if not self.index_category_doc_type_wise_index.get(index_category, None):
        # index_category has no indices yet: the new index becomes both the
        # master index and the doc_type index of the category.
        logger.info(f'Master index does not exist for: {index_category}. A new {index_category} master index & {doc_type} index would be created.')
        self.index_category_doc_type_wise_index.setdefault(index_category, {})
        self.index_category_doc_type_wise_index[index_category][master_key] = self.index
        self.index_category_doc_type_wise_index[index_category][doc_type] = self.index

    elif not self.index_category_doc_type_wise_index[index_category].get(doc_type, None):
        # The category exists but has no index for this doc_type yet
        logger.info(f'{doc_type} index does not exist for: {index_category}-{doc_type}. A new {doc_type} index would be created.')
        self.index_category_doc_type_wise_index[index_category][doc_type] = self.index
        # if master index does not exist for the index_category - create a master index
        if not self.index_category_doc_type_wise_index[index_category].get(master_key, None):
            logger.info(f'Master index does not exist for: {index_category}-{doc_type}. A new master index would be created.')
            self.index_category_doc_type_wise_index[index_category][master_key] = self.index

    else:
        # The new documents belong to an existing index_category & doc_type
        # if master index does not exist for the index_category - create a master index
        if not self.index_category_doc_type_wise_index[index_category].get(master_key, None):
            logger.info(f'Master index does not exist for: {index_category}-{doc_type}. A new master index would be created.')
            self.index_category_doc_type_wise_index[index_category][master_key] = self.index
        # Merge new self.index with existing doc_type index
        self.index_category_doc_type_wise_index[index_category][doc_type].merge_from(self.index)
        # Update self.index to store/overwrite the existing index with the updated index
        self.index = self.index_category_doc_type_wise_index[index_category][doc_type]

    # Store newly created/merged index
    self.store_index(
        index=self.index,
        index_filepath=self.get_index_filepath(
            index_category=index_category,
            doc_type=doc_type
        )
    )

    # Merge and store master index for index_category
    self.merge_store_master_index(
        index_category=index_category
    )

    logger.info(f'Index for: {index_category}-{doc_type} upserted successful!')
817
+
818
+
819
def delete_index(
    self,
    ids: Optional[List[str]] = None,
    # filter: Optional[DocumentMetadataFilter] = None,
    delete_all: Optional[bool] = None,
):
    """
    Placeholder for removing vectors by ids, filter, or everything in the datastore.

    Not implemented yet: the call only logs and raises.

    Raises:
        NotImplementedError: always.
    """
    logger.info(f'Deleting index')

    # NOTE: a possible implementation is to delete a specific collection and persist
    # (self.index.delete_collection(); self.index.persist()), or to remove the
    # persist directory (self.index_filepath) altogether.
    raise NotImplementedError
utils/mandi_price.py CHANGED
@@ -2,32 +2,32 @@ import requests
2
 
3
 
4
  class MANDI_PRICE:
5
- def __init__(self):
6
- self.base_url = "https://enam.gov.in/web/Ajax_ctrl/trade_data_list"
7
- # "https://enam.gov.in/web/dashboard/trade-data",
8
- # "https://enam.gov.in/web/dashboard/trade_data_list",
9
 
10
 
11
- def get_mandi_price(self,
12
- state_name,
13
- apmc_name,
14
- commodity_name,
15
- from_date,
16
- to_date
17
- ):
18
- # Prepare the payload for POST request
19
- payload = f"language=en&stateName={state_name}&apmcName={apmc_name}&commodityName={commodity_name}&fromDate={from_date}&toDate={to_date}"
20
 
21
- headers = {
22
- "Content-type": "application/x-www-form-urlencoded; charset=UTF-8",
23
- "Referer": "https://enam.gov.in/web/dashboard/trade-data",
24
- "Accept": "application/json, text/javascript, */*; q=0.01",
25
- }
26
 
27
- response = requests.post(
28
- self.base_url,
29
- json=payload,
30
- headers=headers,
31
- )
32
 
33
- return response.json()
 
2
 
3
 
4
class MANDI_PRICE:
    """Small client for the eNAM trade-data endpoint."""

    def __init__(self):
        self.base_url = "https://enam.gov.in/web/Ajax_ctrl/trade_data_list"
        # "https://enam.gov.in/web/dashboard/trade-data",
        # "https://enam.gov.in/web/dashboard/trade_data_list",


    def get_mandi_price(self,
        state_name,
        apmc_name,
        commodity_name,
        from_date,
        to_date,
        timeout=30
    ):
        """
        POST a trade-data query to self.base_url and return the decoded JSON response.

        Args:
            state_name: value sent as stateName
            apmc_name: value sent as apmcName
            commodity_name: value sent as commodityName
            from_date: value sent as fromDate
            to_date: value sent as toDate
            timeout: seconds to wait for the server before giving up
        """
        # Prepare the payload for POST request. Passing a dict through `data=` lets
        # requests URL-encode the values and send a real form body, which is what
        # the Content-type header below announces.
        payload = {
            "language": "en",
            "stateName": state_name,
            "apmcName": apmc_name,
            "commodityName": commodity_name,
            "fromDate": from_date,
            "toDate": to_date,
        }

        headers = {
            "Content-type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Referer": "https://enam.gov.in/web/dashboard/trade-data",
            "Accept": "application/json, text/javascript, */*; q=0.01",
        }

        response = requests.post(
            self.base_url,
            data=payload,
            headers=headers,
            timeout=timeout,
        )

        return response.json()
utils/ner_detection.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import openai
3
+ import os
4
+ import re
5
+ import ast
6
+
7
# Credentials must never be committed to source control: read the key from the
# environment. The key that was previously hard-coded here should be revoked.
openai.api_key = os.getenv("OPENAI_API_KEY")
8
+ SYSTEM_PROMPT = "You are a smart and intelligent Named Entity Recognition (NER) system. I will provide you the definition of the entities you need to extract, the sentence from where your extract the entities and the output format with examples."
9
+ USER_PROMPT_1 = "Are you clear about your role?"
10
+ ASSISTANT_PROMPT_1 = "Sure, I'm ready to help you with your NER task. Please provide me with the necessary information to get started."
11
+ GUIDELINES_PROMPT = (
12
+ """Entity Definition:\n"
13
+ "1. PEST NAME: Name of the pest which has attacked a particular crop which may lead to crop damage.\n"
14
+ "2. CROP DISEASE: Any kind of crop disease which occurs in agriculture land in india and nearby resgions.\n"
15
+ "3. WEATHER CONDITION: Severe climate conditions like heavy rainfall, hailstorm which has destroyed crops.\n"
16
+ "\n"
17
+ "Output Format:\n"
18
+ "{{'PEST NAME': [list of entities present], 'CROP DISEASE': [list of entities present], 'WEATHER CONDITION': [list of entities present]}}\n"
19
+ "If no entities are presented in any categories keep it None\n"
20
+ "\n"
21
+ "Examples:\n"
22
+ "\n"
23
+ "1. Sentence: Pest attack on maize crop in lower Kangra : The Tribune India. Farmers in lower Kangra are a harried lot as the fall armyworm pest has attacked their maize crop. 'Kolshi' continues to affect Vidarbha's Orange crop cultivation (Citrus Black Fly) | Krishak Jagat. A total of 1,50,000 hectares of land in the Vidarbha region is planted with oranges, and of them, 25% are seriously damaged by Kolshi, a citrus black fly disease. India's June tea output drops 17% as floods hit plucking | Mint. India's June tea production fell 17.4% from a year earlier to 141.31 million kilograms, the state-run Tea Board said, as floods and pest attack dented output in the main producing region\n"
24
+ "Output: {{'PEST NAME': ['fall armyworm'], 'CROP DISEASE': ['citrus black fly disease'], 'WEATHER CONDITION': ['floods']}}\n"
25
+ "\n"
26
+ "2. Sentence: ICAR issues pest alert in Leparada, W/Siang | The Arunachal Times. 70 percent prevalence of fall army worm in maize fields in Pagi, Gori and Bam villages in Leparada district and Darka, Kombo and Jirdin villages in West Siang district was observed. After maize, Kangra vegetable crops under white fly attack : The Tribune India. Vegetable crops are under attack by white fly in the lower hills of Kangra district. The pest attack comes after the recent damage caused by fall armyworm to the maize crop in the area. Pest attacks on paddy crop worry farmers in the integrated Karimnagar district | Hindudayashankar. Crops withering due to stem borer, leaf folder and rice blast; farmers have to incur huge expenditures to control menace. Cyclone Amphan damages crop, vegetable prices shoot up | Cities News,The Indian Express. Cyclone Amphan has damaged vegetables across South Bengal. Farmers lost 80 to 90 per cent of crop as fields were flooded.\n"
27
+ "Output: {{'PEST NAME': ['fall army worm', 'white fly attack', 'stem borer', 'leaf folder'], 'CROP DISEASE': ['rice blast'], 'WEATHER CONDITION': ['Cyclone Amphan']}}\n"
28
+ "\n"
29
+ "3. Sentence: {}\n"
30
+ "Output: """
31
+ )
32
+
33
def openai_chat_completion_response(news_article_text):
    """
    Ask gpt-3.5-turbo to extract the entities described in GUIDELINES_PROMPT from the given text.

    Args:
        news_article_text: text inserted into GUIDELINES_PROMPT as the sentence to analyse

    Returns:
        The content of the first returned choice, stripped of surrounding spaces and newlines.
    """
    ner_request = GUIDELINES_PROMPT.format(news_article_text)

    # Primed conversation: role definition, a confirmation round-trip, then the task
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT_1},
        {"role": "assistant", "content": ASSISTANT_PROMPT_1},
        {"role": "user", "content": ner_request},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
    )

    reply = completion['choices'][0]['message']['content']
    return reply.strip(" \n")
45
+
46
+ # def preprocess(prompt):
47
+ # return GUIDELINES_PROMPT.format(prompt)
48
+ # def main():
49
+ # my_sentence = "Hundreds of hectares of land under the cotton crop, once referred to as white gold, has come under attack of a wide range of insects like whitefly, pink bollworm and mealybug. This is likely to hit the cotton production this year."
50
+ # GUIDELINES_PROMPT = GUIDELINES_PROMPT.format(my_sentence)
51
+ # # print(GUIDELINES_PROMPT)
52
+ # ners = openai_chat_completion_response(GUIDELINES_PROMPT)
53
+ # print(ners)
54
+
55
import gradio as gra

# Define gradio interface and other parameters
app = gra.Interface(fn=openai_chat_completion_response, inputs="text", outputs="text")

# Start the publicly shared server only when this file is run as a script,
# so that importing the module does not launch a server as a side effect.
if __name__ == "__main__":
    app.launch(share=True)
utils/translator.py CHANGED
@@ -7,55 +7,55 @@ from googletrans import Translator, constants
7
 
8
 
9
  class TRANSLATOR:
10
- def __init__(self):
11
- print()
12
-
13
-
14
- def split_sentences(self, paragraph, language):
15
- if language == "en":
16
- with MosesSentenceSplitter(language) as splitter:
17
- return splitter([paragraph])
18
- elif language in constants_utils.INDIC_LANGUAGE:
19
- return sentence_tokenize.sentence_split(paragraph, lang=language)
20
-
21
-
22
- def get_in_hindi(self, payload):
23
- tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
24
- model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
25
- article = self.split_sentences(payload['inputs'], 'en')
26
- # inputs = tokenizer(payload['input'], return_tensors="pt")
27
- out_text = ""
28
- for a in article:
29
- inputs = tokenizer(a, return_tensors="pt")
30
- translated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id["hin_Deva"], max_length=100)
31
- translated_sent = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
32
- out_text = out_text.join(translated_sent)
33
- return out_text
34
-
35
-
36
- def get_in_indic(self, text, language='Hindi'):
37
- tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
38
- model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
39
- inputs = tokenizer(text, return_tensors="pt")
40
-
41
- code = "eng_Latn"
42
- if language == 'Hindi':
43
- code= "hin_Deva"
44
- elif language == 'Marathi':
45
- code = "mar_Deva"
46
-
47
- translated_tokens = model.generate(
48
- **inputs,
49
- forced_bos_token_id=tokenizer.lang_code_to_id[code],
50
- max_length=1000
51
- )
52
-
53
- out_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
54
- return out_text
55
-
56
-
57
- def get_indic_google_translate(self, text, language='Hindi'):
58
- # Init the Google API translator
59
- translator = Translator()
60
- translations = translator.translate(text, dest=constants_utils.INDIC_LANGUAGE.get(language, 'en'))
61
- return str(translations.text)
 
7
 
8
 
9
class TRANSLATOR:
    """
    Translate text into Indic languages.

    Two back ends are used: the NLLB-200 distilled sequence-to-sequence
    checkpoint loaded through ``transformers`` and the ``googletrans``
    client.
    """

    # Checkpoint shared by every NLLB-based method.
    NLLB_MODEL_NAME = "facebook/nllb-200-distilled-600M"
    # NLLB language tag used when the requested language name is not mapped.
    NLLB_DEFAULT_CODE = "eng_Latn"
    # Language name -> NLLB language tag.
    NLLB_LANGUAGE_CODES = {
        'Hindi': "hin_Deva",
        'Marathi': "mar_Deva",
    }

    def __init__(self):
        # Populated on first use by _load_nllb() so that constructing a
        # TRANSLATOR stays cheap.
        self._nllb_tokenizer = None
        self._nllb_model = None

    def _load_nllb(self):
        """Return the (tokenizer, model) pair, loading it only once per instance."""
        # getattr keeps this working for instances created without __init__.
        if getattr(self, '_nllb_model', None) is None:
            self._nllb_tokenizer = AutoTokenizer.from_pretrained(self.NLLB_MODEL_NAME)
            self._nllb_model = AutoModelForSeq2SeqLM.from_pretrained(self.NLLB_MODEL_NAME)
        return self._nllb_tokenizer, self._nllb_model

    @classmethod
    def _nllb_code(cls, language):
        """Map a language name to its NLLB tag, defaulting to English."""
        return cls.NLLB_LANGUAGE_CODES.get(language, cls.NLLB_DEFAULT_CODE)

    @staticmethod
    def _join_sentences(sentences):
        """Join translated sentences with single spaces, skipping blank ones."""
        return " ".join(s.strip() for s in sentences if s and s.strip())

    def _nllb_translate(self, text, code, max_length):
        """Translate ``text`` into the language identified by the NLLB tag ``code``."""
        tokenizer, model = self._load_nllb()
        inputs = tokenizer(text, return_tensors="pt")
        translated_tokens = model.generate(
            **inputs,
            # convert_tokens_to_ids resolves the language tag without relying
            # on the deprecated tokenizer.lang_code_to_id mapping.
            forced_bos_token_id=tokenizer.convert_tokens_to_ids(code),
            max_length=max_length,
        )
        return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    def split_sentences(self, paragraph, language):
        """
        Split ``paragraph`` into sentences.

        Args:
            paragraph: Text to split.
            language: "en" selects the Moses sentence splitter; a value found
                in constants_utils.INDIC_LANGUAGE selects the Indic tokenizer.

        Returns:
            The splitter output, or None when ``language`` matches neither
            branch.
        """
        if language == "en":
            with MosesSentenceSplitter(language) as splitter:
                return splitter([paragraph])
        elif language in constants_utils.INDIC_LANGUAGE:
            # NOTE(review): INDIC_LANGUAGE is used elsewhere in this class as a
            # name -> code mapping; confirm that sentence_split accepts the
            # same value that is tested for membership here.
            return sentence_tokenize.sentence_split(paragraph, lang=language)

    def get_in_hindi(self, payload):
        """
        Translate English text to Hindi one sentence at a time.

        Args:
            payload: Mapping whose 'inputs' entry holds the English text.

        Returns:
            The translated sentences joined with single spaces.
        """
        sentences = self.split_sentences(payload['inputs'], 'en')
        translated = [
            self._nllb_translate(sentence, "hin_Deva", max_length=100)
            for sentence in sentences
        ]
        # The sentences are collected and joined once; calling
        # ``out_text.join(sentence)`` in the loop would interleave the earlier
        # output between the characters of each new sentence.
        return self._join_sentences(translated)

    def get_in_indic(self, text, language='Hindi'):
        """
        Translate ``text`` with NLLB into the requested language.

        Args:
            text: Text to translate.
            language: 'Hindi' or 'Marathi'; any other value uses the English
                language tag.

        Returns:
            The first decoded translation.
        """
        return self._nllb_translate(text, self._nllb_code(language), max_length=1000)

    def get_indic_google_translate(self, text, language='Hindi'):
        """
        Translate ``text`` with the googletrans client.

        Args:
            text: Text to translate.
            language: Key looked up in constants_utils.INDIC_LANGUAGE to get
                the destination code; unknown keys fall back to 'en'.

        Returns:
            The translated text as a string.
        """
        # Init the Google API translator
        translator = Translator()
        destination = constants_utils.INDIC_LANGUAGE.get(language, 'en')
        translations = translator.translate(text, dest=destination)
        return str(translations.text)
utils/utils.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import pandas as pd
4
+ from urllib.parse import urlparse
5
+
6
+ import logging
7
+ logger = logging.getLogger(__name__)
8
+ logging.basicConfig(
9
+ format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
10
+ )
11
+
12
+
13
class UTILS:
    """Small helpers for text clean-up, DataFrame clean-up and URL validation."""

    def __init__(self):
        pass

    def split_text(
        self,
        text
    ):
        """
        Split a comma-separated string into trimmed parts.

        Args:
            text: Comma-separated string.

        Returns:
            List with one whitespace-stripped entry per comma-separated field
            (empty fields are kept as empty strings).
        """
        return [part.strip() for part in text.split(',')]

    def replace_newlines_and_spaces(
        self,
        text
    ):
        """
        Collapse every run of whitespace (newlines included) into one space.

        Args:
            text: Input string.

        Returns:
            The string with each whitespace run replaced by a single space.
        """
        # r'\s+' already matches newlines, so no separate newline pass is needed.
        return re.sub(r'\s+', ' ', text)

    def clean_df(
        self,
        df,
        dropna=True,
        fillna=False
    ):
        """
        Return a cleaned copy of ``df`` without modifying the input.

        Args:
            df: pandas DataFrame to clean.
            dropna: When true, drop rows that contain missing values.
            fillna: When true, replace missing values with '' (applied before
                ``dropna``).

        Returns:
            A new DataFrame with duplicates removed and a fresh 0..n-1 index.
        """
        # Non-inplace calls keep the caller's frame intact and avoid
        # chained-assignment warnings when ``df`` is a slice of another frame.
        if fillna:
            df = df.fillna('')
        if dropna:
            df = df.dropna()
        return df.drop_duplicates().reset_index(drop=True)

    @staticmethod
    def _is_pdf_url(url, path):
        """Tell whether the URL, or its path component, ends with '.pdf' (any case)."""
        return url.lower().endswith('.pdf') or path.lower().endswith('.pdf')

    def validate_url_format(
        self,
        urls,
        url_type='urls'
    ):
        """
        Keep only the well-formed URLs from ``urls``.

        Args:
            urls: Iterable of URL strings; None is treated as empty.
            url_type: With 'online_pdf', a URL is kept only if it points at a
                '.pdf' resource.

        Returns:
            List of the URLs that have both a scheme and a network location
            (and pass the PDF check when requested), in input order.
        """
        valid_urls = []
        for url in urls or []:
            if not isinstance(url, str):
                continue
            try:
                result = urlparse(url)
            except ValueError:
                # urlparse rejects some malformed inputs (e.g. a broken IPv6
                # host); such a URL is simply not valid.
                continue
            # Check if the url is valid
            if not (result.scheme and result.netloc):
                continue
            # Online PDF urls should end with .pdf extension
            if url_type == 'online_pdf' and not self._is_pdf_url(url, result.path):
                continue
            valid_urls.append(url)
        # Use the module's named logger rather than the root logger.
        logging.getLogger(__name__).info('Valid URLs are: %s', valid_urls)
        return valid_urls
utils/weather.py CHANGED
@@ -3,200 +3,200 @@ from bs4 import BeautifulSoup as bs
3
 
4
 
5
  STATE_CODES = {
6
- 'Andaman-Nicobar': '01',
7
- 'Andhra-Pradesh': '02',
8
- 'Arunachal-Pradesh': '03',
9
- 'Assam': '04',
10
- 'Bihar': '05',
11
- 'Chandigarh': '06',
12
- 'Chhattisgarh': '07',
13
- 'Dadra-and-Nagar-Haveli': '08',
14
- 'Daman-and-Diu': '09',
15
- 'Delhi': '10',
16
- 'Goa': '11',
17
- 'Gujarat': '12',
18
- 'Haryana': '13',
19
- # 14
20
- 'Himachal-Pradesh': '15',
21
- 'Jammu-Kashmir': '16',
22
- 'Jharkhand': '17',
23
- 'Karnataka': '18',
24
- 'Kerala': '19',
25
- 'Lakshadweep': '20',
26
- 'Madhya-Pradesh': '21',
27
- 'Maharashtra': '22',
28
- 'Manipur': '23',
29
- 'Meghalaya': '24',
30
- 'Mizoram': '25',
31
- 'Nagaland': '26',
32
- 'Odisha': '27',
33
- 'Pondicherry': '28',
34
- 'Punjab': '29',
35
- 'Rajasthan': '30',
36
- 'Sikkim': '31',
37
- 'Tamilnadu': '32',
38
- 'Telangana': '33',
39
- 'Tripura': '34',
40
- 'Uttar-Pradesh': '35',
41
- 'Uttarakhand': '36',
42
- 'West-Bengal': '37',
43
  }
44
 
45
  # List of states that are given as the input selection to https://nwp.imd.gov.in/blf/blf_temp/ to get the weather forecast
46
  STATES = {
47
- 'Andaman-Nicobar': {},
48
-
49
- 'Andhra-Pradesh': {},
50
-
51
- 'Arunachal-Pradesh': {},
52
-
53
- 'Assam': {},
54
-
55
- 'Bihar': {},
56
-
57
- 'Chandigarh': {},
58
-
59
- 'Chhattisgarh': {},
60
-
61
- 'Dadra-and-Nagar-Haveli': {},
62
-
63
- 'Daman-and-Diu': {},
64
-
65
- 'Delhi': {
66
- 'CENTRAL-DELHI': ['CENTRAL-DELHI'],
67
- 'EAST-DELHI': ['EAST-DELHI'],
68
- 'NEW-DELHI': ['NEW-DELHI'],
69
- 'NORTH-DELHI': ['NORTH-DELHI'],
70
- 'NORTH-EAST-DELHI': ['NORTH-EAST-DELHI'],
71
- 'NORTH-WEST-DELHI': ['NORTH-WEST-DELHI'],
72
- 'SHAHDARA': ['SHAHDARA'],
73
- 'SOUTH-DELHI': ['SOUTH-DELHI'],
74
- 'SOUTH-EAST-DELHI': ['SOUTH-EAST-DELHI'],
75
- 'SOUTH-WEST-DELHI': ['SOUTH-WEST-DELHI'],
76
- 'WEST-DELHI': ['WEST-DELHI'],
77
- },
78
-
79
- 'Goa': {},
80
-
81
- 'Gujarat': {
82
- 'AHMADABAD': ['AHMEDABAD-CITY', 'BAVLA', 'DASKROI', 'DETROJ-RAMPURA', 'DHANDHUKA', 'DHOLERA', 'DHOLKA', 'MANDAL', 'SANAND', 'VIRAMGAM'],
83
- 'AMRELI': ['AMRELI', 'BABRA', 'BAGASARA', 'DHARI', 'JAFRABAD', 'KHAMBHA', 'KUNKAVAV-VADIA', 'LATHI', 'LILIA', 'RAJULA', 'SAVERKUNDLA'],
84
- 'ANAND': [],
85
- 'ARVALLI': [],
86
- 'BANASKANTHA': [],
87
- 'BHARUCH': [],
88
- 'BHAVNAGAR': [],
89
- 'BOTAD': [],
90
- 'CHHOTAUDEPUR': [],
91
- 'DANG': [],
92
- 'DEVBHUMI-DWARKA': [],
93
- 'DOHAD': [],
94
- 'GANDHINAGAR': [],
95
- 'GIR-SOMNATH': [],
96
- 'JAMNAGAR': [],
97
- 'JUNAGADH': [],
98
- 'KACHCHH': [],
99
- 'KHEDA': [],
100
- 'MAHESANA': [],
101
- 'MAHISAGAR': [],
102
- 'MORBI': [],
103
- 'NARMADA': [],
104
- 'NAVSARI': [],
105
- 'PANCH-MAHALS': [],
106
- 'PATAN': [],
107
- 'PORBANDAR': [],
108
- 'RAJKOT': [],
109
- 'SABAR-KANTHA': [],
110
- 'SURAT': ['BARDOLI', 'CHORASI', 'KAMREJ', 'MAHUVA', 'MANDVI', 'MANGROL', 'OLPAD', 'PALSANA', 'SURAT-CITY', 'UMARPADA'],
111
- 'SURENDRANAGAR': [],
112
- 'TAPI': [],
113
- 'VADODARA': [],
114
- 'VALSAD': [],
115
- },
116
-
117
- 'Haryana': {},
118
-
119
- 'Himachal-Pradesh': {},
120
-
121
- 'Jammu-Kashmir': {},
122
-
123
- 'Jharkhand': {},
124
-
125
- 'Karnataka': {},
126
-
127
- 'Kerala': {},
128
-
129
- 'Lakshadweep': {},
130
-
131
- 'Madhya-Pradesh': {},
132
-
133
- 'Maharashtra': {},
134
-
135
- 'Manipur': {},
136
-
137
- 'Meghalaya': {},
138
-
139
- 'Mizoram': {},
140
-
141
- 'Nagaland': {},
142
-
143
- 'Odisha': {},
144
-
145
- 'Pondicherry': {},
146
-
147
- 'Punjab': {},
148
-
149
- 'Rajasthan': {},
150
-
151
- 'Sikkim': {},
152
-
153
- 'Tamilnadu': {},
154
-
155
- 'Telangana': {},
156
-
157
- 'Tripura': {},
158
-
159
- 'Uttar-Pradesh': {},
160
-
161
- 'Uttarakhand': {},
162
-
163
- 'West-Bengal': {},
164
  }
165
 
166
 
167
 
168
  class WEATHER:
169
- def __init__(self):
170
- self.base_url = 'https://nwp.imd.gov.in/blf/blf_temp'
171
-
172
-
173
- # Weather forecast from Govt. website
174
- def get_weather_forecast(self, state, district, is_block_level=False):
175
- self.district_url = f"{self.base_url}/block.php?dis={STATE_CODES.get(state, '') + district}"
176
- self.block_url = f'{self.base_url}/table2.php'
177
-
178
- response = requests.get(self.district_url if not is_block_level else self.block_url)
179
- soup = bs(response.text, 'html.parser')
180
- scripts = soup.findAll('font')[0]
181
- return scripts.text
182
-
183
-
184
- # Weather using Google weather API
185
- def get_weather(self, city):
186
- city = city + " weather"
187
- city = city.replace(" ", "+")
188
-
189
- headers = {
190
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
191
- }
192
- response = requests.get(
193
- f'https://www.google.com/search?q={city}&oq={city}&aqs=chrome.0.35i39l2j0l4j46j69i60.6128j1j7&sourceid=chrome&ie=UTF-8', headers=headers)
194
-
195
- soup = bs(response.text, 'html.parser')
196
- location = soup.select('#wob_loc')[0].getText().strip()
197
- time = soup.select('#wob_dts')[0].getText().strip()
198
- info = soup.select('#wob_dc')[0].getText().strip()
199
- temperature = soup.select('#wob_tm')[0].getText().strip()
200
- temperature = temperature + "°C"
201
-
202
- return time, info, temperature
 
3
 
4
 
5
  STATE_CODES = {
6
+ 'Andaman-Nicobar': '01',
7
+ 'Andhra-Pradesh': '02',
8
+ 'Arunachal-Pradesh': '03',
9
+ 'Assam': '04',
10
+ 'Bihar': '05',
11
+ 'Chandigarh': '06',
12
+ 'Chhattisgarh': '07',
13
+ 'Dadra-and-Nagar-Haveli': '08',
14
+ 'Daman-and-Diu': '09',
15
+ 'Delhi': '10',
16
+ 'Goa': '11',
17
+ 'Gujarat': '12',
18
+ 'Haryana': '13',
19
+ # 14
20
+ 'Himachal-Pradesh': '15',
21
+ 'Jammu-Kashmir': '16',
22
+ 'Jharkhand': '17',
23
+ 'Karnataka': '18',
24
+ 'Kerala': '19',
25
+ 'Lakshadweep': '20',
26
+ 'Madhya-Pradesh': '21',
27
+ 'Maharashtra': '22',
28
+ 'Manipur': '23',
29
+ 'Meghalaya': '24',
30
+ 'Mizoram': '25',
31
+ 'Nagaland': '26',
32
+ 'Odisha': '27',
33
+ 'Pondicherry': '28',
34
+ 'Punjab': '29',
35
+ 'Rajasthan': '30',
36
+ 'Sikkim': '31',
37
+ 'Tamilnadu': '32',
38
+ 'Telangana': '33',
39
+ 'Tripura': '34',
40
+ 'Uttar-Pradesh': '35',
41
+ 'Uttarakhand': '36',
42
+ 'West-Bengal': '37',
43
  }
44
 
45
  # List of states that are given as the input selection to https://nwp.imd.gov.in/blf/blf_temp/ to get the weather forecast
46
  STATES = {
47
+ 'Andaman-Nicobar': {},
48
+
49
+ 'Andhra-Pradesh': {},
50
+
51
+ 'Arunachal-Pradesh': {},
52
+
53
+ 'Assam': {},
54
+
55
+ 'Bihar': {},
56
+
57
+ 'Chandigarh': {},
58
+
59
+ 'Chhattisgarh': {},
60
+
61
+ 'Dadra-and-Nagar-Haveli': {},
62
+
63
+ 'Daman-and-Diu': {},
64
+
65
+ 'Delhi': {
66
+ 'CENTRAL-DELHI': ['CENTRAL-DELHI'],
67
+ 'EAST-DELHI': ['EAST-DELHI'],
68
+ 'NEW-DELHI': ['NEW-DELHI'],
69
+ 'NORTH-DELHI': ['NORTH-DELHI'],
70
+ 'NORTH-EAST-DELHI': ['NORTH-EAST-DELHI'],
71
+ 'NORTH-WEST-DELHI': ['NORTH-WEST-DELHI'],
72
+ 'SHAHDARA': ['SHAHDARA'],
73
+ 'SOUTH-DELHI': ['SOUTH-DELHI'],
74
+ 'SOUTH-EAST-DELHI': ['SOUTH-EAST-DELHI'],
75
+ 'SOUTH-WEST-DELHI': ['SOUTH-WEST-DELHI'],
76
+ 'WEST-DELHI': ['WEST-DELHI'],
77
+ },
78
+
79
+ 'Goa': {},
80
+
81
+ 'Gujarat': {
82
+ 'AHMADABAD': ['AHMEDABAD-CITY', 'BAVLA', 'DASKROI', 'DETROJ-RAMPURA', 'DHANDHUKA', 'DHOLERA', 'DHOLKA', 'MANDAL', 'SANAND', 'VIRAMGAM'],
83
+ 'AMRELI': ['AMRELI', 'BABRA', 'BAGASARA', 'DHARI', 'JAFRABAD', 'KHAMBHA', 'KUNKAVAV-VADIA', 'LATHI', 'LILIA', 'RAJULA', 'SAVERKUNDLA'],
84
+ 'ANAND': [],
85
+ 'ARVALLI': [],
86
+ 'BANASKANTHA': [],
87
+ 'BHARUCH': [],
88
+ 'BHAVNAGAR': [],
89
+ 'BOTAD': [],
90
+ 'CHHOTAUDEPUR': [],
91
+ 'DANG': [],
92
+ 'DEVBHUMI-DWARKA': [],
93
+ 'DOHAD': [],
94
+ 'GANDHINAGAR': [],
95
+ 'GIR-SOMNATH': [],
96
+ 'JAMNAGAR': [],
97
+ 'JUNAGADH': [],
98
+ 'KACHCHH': [],
99
+ 'KHEDA': [],
100
+ 'MAHESANA': [],
101
+ 'MAHISAGAR': [],
102
+ 'MORBI': [],
103
+ 'NARMADA': [],
104
+ 'NAVSARI': [],
105
+ 'PANCH-MAHALS': [],
106
+ 'PATAN': [],
107
+ 'PORBANDAR': [],
108
+ 'RAJKOT': [],
109
+ 'SABAR-KANTHA': [],
110
+ 'SURAT': ['BARDOLI', 'CHORASI', 'KAMREJ', 'MAHUVA', 'MANDVI', 'MANGROL', 'OLPAD', 'PALSANA', 'SURAT-CITY', 'UMARPADA'],
111
+ 'SURENDRANAGAR': [],
112
+ 'TAPI': [],
113
+ 'VADODARA': [],
114
+ 'VALSAD': [],
115
+ },
116
+
117
+ 'Haryana': {},
118
+
119
+ 'Himachal-Pradesh': {},
120
+
121
+ 'Jammu-Kashmir': {},
122
+
123
+ 'Jharkhand': {},
124
+
125
+ 'Karnataka': {},
126
+
127
+ 'Kerala': {},
128
+
129
+ 'Lakshadweep': {},
130
+
131
+ 'Madhya-Pradesh': {},
132
+
133
+ 'Maharashtra': {},
134
+
135
+ 'Manipur': {},
136
+
137
+ 'Meghalaya': {},
138
+
139
+ 'Mizoram': {},
140
+
141
+ 'Nagaland': {},
142
+
143
+ 'Odisha': {},
144
+
145
+ 'Pondicherry': {},
146
+
147
+ 'Punjab': {},
148
+
149
+ 'Rajasthan': {},
150
+
151
+ 'Sikkim': {},
152
+
153
+ 'Tamilnadu': {},
154
+
155
+ 'Telangana': {},
156
+
157
+ 'Tripura': {},
158
+
159
+ 'Uttar-Pradesh': {},
160
+
161
+ 'Uttarakhand': {},
162
+
163
+ 'West-Bengal': {},
164
  }
165
 
166
 
167
 
168
class WEATHER:
    """Fetch weather text by scraping the IMD forecast pages and Google search results."""

    # Seconds to wait for a remote server before giving up; without a timeout
    # a stalled connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.base_url = 'https://nwp.imd.gov.in/blf/blf_temp'

    # Weather forecast from Govt. website
    def get_weather_forecast(self, state, district, is_block_level=False):
        """
        Return the text of the first <font> element of the forecast page.

        Args:
            state: Key into STATE_CODES; unknown states contribute an empty code.
            district: District name appended to the state code in the query.
            is_block_level: When true, request the block-level page
                (table2.php) instead of the district page.

        Returns:
            Text content of the first <font> element in the response.

        Raises:
            IndexError: If the page contains no <font> element.
        """
        self.district_url = f"{self.base_url}/block.php?dis={STATE_CODES.get(state, '') + district}"
        self.block_url = f'{self.base_url}/table2.php'

        target_url = self.block_url if is_block_level else self.district_url
        response = requests.get(target_url, timeout=self.REQUEST_TIMEOUT)
        soup = bs(response.text, 'html.parser')

        fonts = soup.find_all('font', limit=1)
        if not fonts:
            # Same exception type as indexing an empty result list, but with a
            # message that says what was missing and where.
            raise IndexError(f'No <font> element found at {target_url}')
        return fonts[0].text

    @staticmethod
    def _first_text(soup, selector):
        """Return the stripped text of the first element matching ``selector``."""
        return soup.select(selector)[0].getText().strip()

    # Weather scraped from the Google search results page
    def get_weather(self, city):
        """
        Look up the current weather for ``city`` via a Google search.

        Args:
            city: Place name; " weather" is appended to form the search query.

        Returns:
            Tuple ``(time, info, temperature)`` read from the elements with
            ids wob_dts, wob_dc and wob_tm; the temperature has "°C" appended.

        Raises:
            IndexError: If any of those elements is missing from the page.
        """
        query = f'{city} weather'

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        # Passing the query through ``params`` lets requests URL-encode it, so
        # names containing characters such as '&' cannot corrupt the query string.
        response = requests.get(
            'https://www.google.com/search',
            params={
                'q': query,
                'oq': query,
                'aqs': 'chrome.0.35i39l2j0l4j46j69i60.6128j1j7',
                'sourceid': 'chrome',
                'ie': 'UTF-8',
            },
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )

        soup = bs(response.text, 'html.parser')
        observed_at = self._first_text(soup, '#wob_dts')
        info = self._first_text(soup, '#wob_dc')
        temperature = self._first_text(soup, '#wob_tm') + "°C"

        return observed_at, info, temperature
utils/web_crawler.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup as bs
3
+
4
+
5
class LOAD_ONLINE_PDF_IPM_PACKAGES:
    """Collect the PDF links published under the IPM packages pages of ppqs.gov.in."""

    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the crawl indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.base_url = 'https://ppqs.gov.in/ipm-packages'

        self.ipm_packages = []
        self.pdfs_urls = []

        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        }

    def _fetch_soup(self, url):
        """Download ``url`` with the crawler headers and return the parsed HTML."""
        response = requests.get(
            url,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        return bs(response.text, 'html.parser')

    @staticmethod
    def _anchor_href(element):
        """Return the href of the first <a> inside ``element``, or None if absent."""
        anchor = element.a
        if anchor is None:
            return None
        return anchor.get('href')

    def _get_ipm_packages_name_list(self):
        """
        Parse HTML page to get the names of each IPM Package
        """
        # Start from a fresh list so repeated calls do not accumulate duplicates.
        self.ipm_packages = []

        soup = self._fetch_soup(self.base_url)
        packages = soup.find_all('span', {'class': 'field-content region-name'}, limit=None)
        for package in packages:
            href = self._anchor_href(package)
            if not href:
                # Entry without a link: nothing to crawl.
                continue
            # The package name is the last path segment of the link.
            name = href.rstrip('/').split('/')[-1]
            if name:
                self.ipm_packages.append(name)

    def get_ipm_packages_pdfs_list(self):
        """
        Parse HTML page to get the PDF URLs of each IPM Package
        """
        self._get_ipm_packages_name_list()
        # Start from a fresh list so repeated calls do not accumulate duplicates.
        self.pdfs_urls = []

        for ip in self.ipm_packages:
            source_url = f'{self.base_url}/{ip}'
            print(f'Loading PDFs from: {source_url}')

            soup = self._fetch_soup(source_url)
            cells = soup.find_all('td', {'class': 'views-field views-field-php'}, limit=None)
            for cell in cells:
                href = self._anchor_href(cell)
                if href:
                    # NOTE(review): hrefs are stored as found; confirm they are
                    # absolute URLs before downloading.
                    self.pdfs_urls.append(href)
52
+
53
+
54
def get_ipm_packages_pdfs_urls():
    """
    Crawl the IPM packages pages and return the collected PDF URLs.

    Returns:
        The ``pdfs_urls`` list of a freshly created
        LOAD_ONLINE_PDF_IPM_PACKAGES crawler after it has run.
    """
    crawler = LOAD_ONLINE_PDF_IPM_PACKAGES()
    crawler.get_ipm_packages_pdfs_list()

    collected = crawler.pdfs_urls
    print('Total pdfs:', len(collected))
    return collected