import os
from pathlib import Path

import pandas as pd
from langchain.agents import initialize_agent, Tool
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.llms import OpenAI
from llama_index import GPTSimpleVectorIndex, download_loader


class DATA_LOADER:
    def __init__(self):
        pass

    def clean_df(self, df, dropna=True, fillna=False):
        """Clean a DataFrame: optionally fill or drop NaN rows, then drop duplicates."""
        if fillna:
            df.fillna('', inplace=True)
        if dropna:
            df.dropna(inplace=True)
        df = df.drop_duplicates().reset_index(drop=True)
        return df

    def load_external_links_used_by_FTAs(
        self,
        sheet_filepath='./data/urls_used_by_ftas/external_links_used_by_FTAs.xlsx'
    ):
        """Concatenate the per-city sheets of the Excel workbook into one cleaned DataFrame."""
        xls = pd.ExcelFile(sheet_filepath)
        df = pd.DataFrame(columns=['S.No.', 'Link used for', 'Link type', 'Link'])
        for sheet_name in xls.sheet_names:
            sheet = pd.read_excel(xls, sheet_name)
            if sheet.shape[0] > 0:
                df = pd.concat([df, sheet])
            else:
                print(f'{sheet_name} has no content.')
        df = df[['Link used for', 'Link type', 'Link']]
        # Clean df (call the method on self; a bare clean_df(df) raises NameError)
        df = self.clean_df(df)
        print(f'Total links available across all cities: {df.shape[0]}')
        return df

    def load_document(self, doc_type='pdf', doc_filepath='', urls=None):
        """Load documents from a PDF file or a list of URLs, depending on doc_type."""
        urls = urls or []  # avoid a mutable default argument
        documents = []

        if doc_type == 'pdf':
            PDFReader = download_loader("PDFReader")
            loader = PDFReader()
            if os.path.exists(doc_filepath):
                documents = loader.load_data(file=Path(doc_filepath))

        elif doc_type == 'url':
            BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
            loader = BeautifulSoupWebReader()
            if len(urls) > 0:
                # Load data from the URLs
                documents = loader.load_data(urls=urls)

        elif doc_type == 'url-kb':
            KnowledgeBaseWebReader = download_loader("KnowledgeBaseWebReader")
            loader = KnowledgeBaseWebReader()
            for url in urls:
                doc = loader.load_data(
                    root_url=url,
                    link_selectors=['.article-list a'],  # deduplicated; the selector was listed twice
                    article_path='/articles',
                    body_selector='.article-body',
                    title_selector='.article-title',
                    subtitle_selector='.article-subtitle',
                )
                documents.extend(doc)

        elif doc_type == 'url-chatgpt':
            BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
            loader = BeautifulSoupWebReader()
            if len(urls) > 0:
                # Load data from the URLs
                documents = loader.load_data(urls=urls)
            # Build the vector index over the scraped documents
            index = GPTSimpleVectorIndex(documents)
            tools = [
                Tool(
                    name="Website Index",
                    func=lambda q: index.query(q),
                    description="Useful when you want to answer questions about the text retrieved from websites.",
                ),
            ]
            # Call the OpenAI API; keep temperature=0 so answers stay grounded in the given URLs
            llm = OpenAI(temperature=0)
            memory = ConversationBufferMemory(memory_key="chat_history")
            agent_chain = initialize_agent(
                tools, llm, agent="zero-shot-react-description", memory=memory
            )
            output = agent_chain.run(input="What language is on this website?")
            print(output)  # surface the agent's answer instead of discarding it

        return documents
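

# Usage sketch (illustrative only): the PDF path and URL below are hypothetical
# placeholders, and an OPENAI_API_KEY environment variable is assumed to be set
# before the 'url-chatgpt' mode is used.
if __name__ == '__main__':
    data_loader = DATA_LOADER()

    # Load and clean the spreadsheet of external links used by FTAs
    links_df = data_loader.load_external_links_used_by_FTAs()

    # Load a local PDF into llama_index documents (hypothetical path)
    pdf_docs = data_loader.load_document(doc_type='pdf', doc_filepath='./data/sample.pdf')

    # Scrape a web page and query it through a LangChain agent (hypothetical URL)
    url_docs = data_loader.load_document(doc_type='url-chatgpt', urls=['https://example.com'])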