import os
import pandas as pd
from pathlib import Path
from llama_index import GPTSimpleVectorIndex, download_loader
from langchain.agents import initialize_agent, Tool
from langchain.llms import OpenAI
from langchain.chains.conversation.memory import ConversationBufferMemory
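
# NOTE: these imports assume an older llama_index release (pre-0.5, where
# GPTSimpleVectorIndex accepts documents directly) and an older langchain
# release that still exposes langchain.chains.conversation.memory.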
class DATA_LOADER:
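    """Loads tabular link data and documents (PDFs, web pages) for indexing.

    Wraps pandas for spreadsheet cleanup and llama_index loaders for
    document ingestion.
    """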
    def __init__(self):
        pass
    def clean_df(self, df, dropna=True, fillna=False):
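        """Drop NaN rows (or fill NaNs with '' when fillna=True), remove
        duplicate rows, and reset the index. Note that when fillna=True the
        subsequent dropna is a no-op, since no NaNs remain.
        """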
        if fillna:
            df.fillna('', inplace=True)
        if dropna:
            df.dropna(inplace=True)
        df = df.drop_duplicates().reset_index(drop=True)
        return df
    def load_external_links_used_by_FTAs(self,
                                         sheet_filepath='./data/urls_used_by_ftas/external_links_used_by_FTAs.xlsx'
                                         ):
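        """Read every sheet of the given Excel workbook, concatenate the
        non-empty ones, keep only the link columns, and return a cleaned
        DataFrame.
        """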
        xls = pd.ExcelFile(sheet_filepath)
        df = pd.DataFrame(columns=['S.No.', 'Link used for', 'Link type', 'Link'])
        for sheet_name in xls.sheet_names:
            sheet = pd.read_excel(xls, sheet_name)
            if sheet.shape[0] > 0:
                df = pd.concat([df, sheet])
            else:
                print(f'{sheet_name} has no content.')
        df = df[['Link used for', 'Link type', 'Link']]
        # Clean df
        df = self.clean_df(df)
        print(f'Total links available across all cities: {df.shape[0]}')
        return df
    def load_document(self,
                      doc_type='pdf',
                      doc_filepath='',
                      urls=None
                      ):
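        """Load documents via a llama_index loader selected by doc_type
        ('pdf', 'url', 'url-kb', or 'url-chatgpt'); the last option also
        builds a vector index and runs a demo agent query. Returns the
        loaded documents.
        """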
        urls = urls if urls is not None else []
        documents = []
        if doc_type == 'pdf':
            PDFReader = download_loader("PDFReader")
            loader = PDFReader()
            if os.path.exists(doc_filepath):
                documents = loader.load_data(file=Path(doc_filepath))
        elif doc_type == 'url':
            BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
            loader = BeautifulSoupWebReader()
            if len(urls) > 0:
                # Load data from URLs
                documents = loader.load_data(urls=urls)
        elif doc_type == 'url-kb':
            KnowledgeBaseWebReader = download_loader("KnowledgeBaseWebReader")
            loader = KnowledgeBaseWebReader()
            for url in urls:
                doc = loader.load_data(
                    root_url=url,
                    link_selectors=['.article-list a', '.article-list a'],
                    article_path='/articles',
                    body_selector='.article-body',
                    title_selector='.article-title',
                    subtitle_selector='.article-subtitle',
                )
                documents.extend(doc)
        elif doc_type == 'url-chatgpt':
            BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
            loader = BeautifulSoupWebReader()
            if len(urls) > 0:
                # Load data from URLs
                documents = loader.load_data(urls=urls)
            # Build the vector index over the loaded documents
            index = GPTSimpleVectorIndex(documents)
            tools = [
                Tool(
                    name="Website Index",
                    func=lambda q: index.query(q),
                    description="Useful for answering questions about the text retrieved from websites.",
                ),
            ]
            # Call the OpenAI API; temperature=0 so answers stick closely to the indexed content
            llm = OpenAI(temperature=0)
            memory = ConversationBufferMemory(memory_key="chat_history")
            agent_chain = initialize_agent(
                tools, llm, agent="zero-shot-react-description", memory=memory
            )
            # Demo query to exercise the agent; its output is not returned
            agent_chain.run(input="What language is on this website?")
        return documents
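

# Usage sketch, assuming a local PDF at ./data/sample.pdf and a reachable
# URL; both are illustrative placeholders, not files shipped with this
# module. The 'url-chatgpt' path additionally needs OPENAI_API_KEY set.
if __name__ == '__main__':
    data_loader = DATA_LOADER()

    # Spreadsheet of external links (uses the method's default path)
    links_df = data_loader.load_external_links_used_by_FTAs()

    # PDF ingestion via llama_index's PDFReader
    pdf_docs = data_loader.load_document(doc_type='pdf',
                                         doc_filepath='./data/sample.pdf')

    # Web-page ingestion via BeautifulSoupWebReader
    url_docs = data_loader.load_document(doc_type='url',
                                         urls=['https://example.com'])
    print(f'Loaded {len(pdf_docs)} PDF and {len(url_docs)} URL documents.')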