import os
from pathlib import Path

import pandas as pd
from langchain.agents import initialize_agent, Tool
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.llms import OpenAI
from llama_index import GPTSimpleVectorIndex, download_loader


class DATA_LOADER:
    def __init__(self):
        pass

    def clean_df(self, df, dropna=True, fillna=False):
        """Clean a DataFrame: optionally fill or drop NaN rows, then drop duplicates."""
        if fillna:
            df.fillna('', inplace=True)
        if dropna:
            df.dropna(inplace=True)
        df = df.drop_duplicates().reset_index(drop=True)
        return df

    def load_external_links_used_by_FTAs(
        self,
        sheet_filepath='./data/urls_used_by_ftas/external_links_used_by_FTAs.xlsx'
    ):
        """Concatenate the per-city sheets of the Excel workbook into one cleaned DataFrame."""
        xls = pd.ExcelFile(sheet_filepath)
        df = pd.DataFrame(columns=['S.No.', 'Link used for', 'Link type', 'Link'])
        for sheet_name in xls.sheet_names:
            sheet = pd.read_excel(xls, sheet_name)
            if sheet.shape[0] > 0:
                df = pd.concat([df, sheet])
            else:
                print(f'{sheet_name} has no content.')
        df = df[['Link used for', 'Link type', 'Link']]
        # Clean df (call the method on self; a bare clean_df(df) raises NameError)
        df = self.clean_df(df)
        print(f'Total links available across all cities: {df.shape[0]}')
        return df

    def load_document(self, doc_type='pdf', doc_filepath='', urls=None):
        """Load documents from a PDF file or a list of URLs, depending on doc_type."""
        urls = urls or []  # avoid a mutable default argument
        documents = []

        if doc_type == 'pdf':
            PDFReader = download_loader("PDFReader")
            loader = PDFReader()
            if os.path.exists(doc_filepath):
                documents = loader.load_data(file=Path(doc_filepath))

        elif doc_type == 'url':
            BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
            loader = BeautifulSoupWebReader()
            if len(urls) > 0:
                # Load data from the URLs
                documents = loader.load_data(urls=urls)

        elif doc_type == 'url-kb':
            KnowledgeBaseWebReader = download_loader("KnowledgeBaseWebReader")
            loader = KnowledgeBaseWebReader()
            for url in urls:
                doc = loader.load_data(
                    root_url=url,
                    link_selectors=['.article-list a'],  # deduplicated; the selector was listed twice
                    article_path='/articles',
                    body_selector='.article-body',
                    title_selector='.article-title',
                    subtitle_selector='.article-subtitle',
                )
                documents.extend(doc)

        elif doc_type == 'url-chatgpt':
            BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
            loader = BeautifulSoupWebReader()
            if len(urls) > 0:
                # Load data from the URLs
                documents = loader.load_data(urls=urls)
            # Build the vector index over the scraped documents
            index = GPTSimpleVectorIndex(documents)
            tools = [
                Tool(
                    name="Website Index",
                    func=lambda q: index.query(q),
                    description="Useful when you want to answer questions about the text retrieved from websites.",
                ),
            ]
            # Call the OpenAI API; keep temperature=0 so answers stay grounded in the given URLs
            llm = OpenAI(temperature=0)
            memory = ConversationBufferMemory(memory_key="chat_history")
            agent_chain = initialize_agent(
                tools, llm, agent="zero-shot-react-description", memory=memory
            )
            output = agent_chain.run(input="What language is on this website?")
            print(output)  # surface the agent's answer instead of discarding it

        return documents
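

# Usage sketch (illustrative only): the PDF path and URL below are hypothetical
# placeholders, and an OPENAI_API_KEY environment variable is assumed to be set
# before the 'url-chatgpt' mode is used.
if __name__ == '__main__':
    data_loader = DATA_LOADER()

    # Load and clean the spreadsheet of external links used by FTAs
    links_df = data_loader.load_external_links_used_by_FTAs()

    # Load a local PDF into llama_index documents (hypothetical path)
    pdf_docs = data_loader.load_document(doc_type='pdf', doc_filepath='./data/sample.pdf')

    # Scrape a web page and query it through a LangChain agent (hypothetical URL)
    url_docs = data_loader.load_document(doc_type='url-chatgpt', urls=['https://example.com'])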