# KKMS-KSSW-HF/utils/data_loader.py
import os
import pandas as pd
from pathlib import Path
from llama_index import GPTSimpleVectorIndex, download_loader
from langchain.agents import initialize_agent, Tool
from langchain.llms import OpenAI
from langchain.chains.conversation.memory import ConversationBufferMemory
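
# Note: these imports target an older llama_index / langchain API surface
# (e.g. GPTSimpleVectorIndex and download_loader were removed in later
# llama_index releases), so matching library versions are assumed here.
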
class DATA_LOADER:
    def __init__(self):
        print()
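
    # Basic DataFrame cleanup: optionally fill NaNs with empty strings,
    # optionally drop rows containing NaNs, then drop duplicates and reset the index.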
    def clean_df(self, df, dropna=True, fillna=False):
        if fillna:
            df.fillna('', inplace=True)
        if dropna:
            df.dropna(inplace=True)
        # df = df[~df.isna()]
        df = df.drop_duplicates().reset_index(drop=True)
        return df
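
    # Collect the external links used by FTAs from every sheet of the Excel
    # workbook into a single DataFrame with 'Link used for', 'Link type' and 'Link' columns.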
    def load_external_links_used_by_FTAs(self,
                                         sheet_filepath='./data/urls_used_by_ftas/external_links_used_by_FTAs.xlsx'
                                         ):
        xls = pd.ExcelFile(sheet_filepath)
        df = pd.DataFrame(columns=['S.No.', 'Link used for', 'Link type', 'Link'])
        for sheet_name in xls.sheet_names:
            sheet = pd.read_excel(xls, sheet_name)
            if sheet.shape[0] > 0:
                df = pd.concat([df, sheet])
            else:
                print(f'{sheet_name} has no content.')
        df = df[['Link used for', 'Link type', 'Link']]
        # Clean df
        df = self.clean_df(df)
        print(f'Total links available across all cities: {df.shape[0]}')
        return df
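
    # Load documents with the loader matching doc_type: 'pdf' reads a local file,
    # 'url' and 'url-chatgpt' scrape the given URLs with BeautifulSoup, and
    # 'url-kb' crawls a knowledge-base site. The 'url-chatgpt' branch additionally
    # builds a GPTSimpleVectorIndex and a zero-shot langchain agent over it,
    # although only the raw documents are returned.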
    def load_document(self,
                      doc_type='pdf',
                      doc_filepath='',
                      urls=[]
                      ):
        documents = []

        if doc_type == 'pdf':
            PDFReader = download_loader("PDFReader")
            loader = PDFReader()
            if os.path.exists(doc_filepath):
                documents = loader.load_data(file=Path(doc_filepath))

        elif doc_type == 'url':
            BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
            loader = BeautifulSoupWebReader()
            if len(urls) > 0:
                # Load data from URLs
                documents = loader.load_data(urls=urls)

        elif doc_type == 'url-kb':
            KnowledgeBaseWebReader = download_loader("KnowledgeBaseWebReader")
            loader = KnowledgeBaseWebReader()
            for url in urls:
                doc = loader.load_data(
                    root_url=url,
                    link_selectors=['.article-list a', '.article-list a'],
                    article_path='/articles',
                    body_selector='.article-body',
                    title_selector='.article-title',
                    subtitle_selector='.article-subtitle',
                )
                documents.extend(doc)

        elif doc_type == 'url-chatgpt':
            BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
            loader = BeautifulSoupWebReader()
            if len(urls) > 0:
                # Load data from URLs
                documents = loader.load_data(urls=urls)

            # Build the Vector database
            index = GPTSimpleVectorIndex(documents)

            tools = [
                Tool(
                    name="Website Index",
                    func=lambda q: index.query(q),
                    description="Useful when you want to answer questions about the text retrieved from websites.",
                ),
            ]

            # Call ChatGPT API
            llm = OpenAI(temperature=0)  # Keep temperature=0 to search from the given URLs only
            memory = ConversationBufferMemory(memory_key="chat_history")
            agent_chain = initialize_agent(
                tools, llm, agent="zero-shot-react-description", memory=memory
            )
            output = agent_chain.run(input="What language is on this website?")

        return documents
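

# Usage sketch (not part of the original module): how DATA_LOADER might be driven
# from a script. The PDF path below is a placeholder, not an asset shipped with
# the repo; the Excel workbook path falls back to the method's default.
if __name__ == '__main__':
    data_loader = DATA_LOADER()

    # Aggregate the FTA link sheets into a single DataFrame.
    links_df = data_loader.load_external_links_used_by_FTAs()
    print(links_df.head())

    # Load a local PDF into llama_index documents (hypothetical path).
    pdf_documents = data_loader.load_document(
        doc_type='pdf',
        doc_filepath='./data/sample.pdf',
    )
    print(f'Loaded {len(pdf_documents)} PDF document chunk(s).')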