PDF-Summarizer / app.py
Tuana's picture
Attempting to add URL crawler
2a34bac
raw
history blame
4.28 kB
import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, Crawler
from haystack.schema import Document
import logging
import base64
from PIL import Image
import validators
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
def start_haystack():
    """Create the Haystack components the app works with.

    The function is wrapped in ``st.cache`` so Streamlit can reuse the
    returned objects instead of rebuilding them on every script rerun.

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple: an
        ``InMemoryDocumentStore``, a ``TransformersSummarizer`` loaded with
        ``facebook/bart-large-cnn``, and a ``PreProcessor`` that splits text
        into 200-word chunks while respecting sentence boundaries.
    """
    # Cleaning and splitting options handed to the PreProcessor.
    preprocessing_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }

    store = InMemoryDocumentStore()
    splitter = PreProcessor(**preprocessing_options)
    bart_summarizer = TransformersSummarizer(model_name_or_path="facebook/bart-large-cnn")
    return store, bart_summarizer, splitter
def pdf_to_document_store(pdf_file):
    """Replace the document store's contents with the text of an uploaded PDF.

    Args:
        pdf_file: A binary file-like object whose ``read()`` returns the raw
            PDF bytes (here, the object returned by ``st.file_uploader``).

    The module-level ``document_store`` is emptied first, so it only ever
    holds the most recently processed input.
    """
    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

    # The converter works from a path, so spill the upload to disk. The bytes
    # are written as-is (a base64 encode/decode round-trip changes nothing),
    # and the with-block is exited before converting so the data is flushed
    # and the handle closed by the time the converter opens the file.
    with open("temp-path.pdf", 'wb') as temp_file:
        temp_file.write(pdf_file.read())

    doc = converter.convert(file_path="temp-path.pdf", meta=None)
    preprocessed_docs = preprocessor.process(doc)
    document_store.write_documents(preprocessed_docs)
def crawl_url(url):
    """Replace the document store's contents with the crawled text of *url*.

    Args:
        url: Address of the page to crawl (crawler depth is 1).

    Any failure while crawling, pre-processing or storing is logged with its
    traceback and reported to the user in the Streamlit page; it is not
    re-raised.
    """
    # Start from an empty store, as pdf_to_document_store does; otherwise
    # documents from an earlier run would be folded into this summary.
    document_store.delete_documents()
    crawler = Crawler(output_dir="crawled_files", overwrite_existing_files=True, crawler_depth=1)
    try:
        # NOTE(review): depending on the installed Haystack version,
        # Crawler.crawl may return file paths rather than Document objects;
        # confirm that its result is valid input for PreProcessor.process.
        docs = crawler.crawl(urls=[url])
        preprocessed_docs = preprocessor.process(docs)
        document_store.write_documents(preprocessed_docs)
    except Exception:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate, and keep the traceback in the log.
        logging.exception("Unable to crawl %s", url)
        st.write('We were unable to crawl the contents of that URL, please try something else')
def summarize(content):
    """Load *content* into the document store and summarize everything in it.

    Args:
        content: The uploaded PDF file object when ``st.session_state.pdf``
            is set, or the URL string when ``st.session_state.url`` is set.

    Returns:
        The result of ``summarizer.predict`` run over all stored documents
        with ``generate_single_summary=True``.
    """
    state = st.session_state
    # The session flags select the ingestion path; the PDF flag is checked first.
    if state.pdf:
        pdf_to_document_store(content)
    elif state.url:
        crawl_url(content)

    stored_documents = document_store.get_all_documents()
    return summarizer.predict(documents=stored_documents, generate_single_summary=True)
def set_state_if_absent(key, value):
    """Store *value* under *key* in ``st.session_state`` unless the key exists.

    An existing entry is left untouched.
    """
    if key in st.session_state:
        return
    st.session_state[key] = value
# ---------------------------------------------------------------------------
# Page script: Streamlit executes everything below on each rerun.
# ---------------------------------------------------------------------------

# Seed the session keys the rest of the script reads.
set_state_if_absent("summaries", None)
set_state_if_absent("url", False)
set_state_if_absent("pdf", False)

# Shared Haystack components, read as module globals by the helper functions.
document_store, summarizer, preprocessor = start_haystack()

st.title('TL;DR with Haystack')
image = Image.open('header-image.png')
st.image(image)

st.markdown( """
This Summarization demo uses a [Haystack TransformerSummarizer node](https://haystack.deepset.ai/pipeline_nodes/summarizer). You can upload a PDF file, which will be converted to text with the [Haystack PDFtoTextConverter](https://haystack.deepset.ai/reference/file-converters#pdftotextconverter). In this demo, we produce 1 summary for the whole file you upload. So, the TransformerSummarizer treats the whole thing as one string, which means along with the model limitations, PDFs that have a lot of unneeded text at the beginning produce poor results. For best results, upload a document that has minimal intro and tables at the top.
""", unsafe_allow_html=True)

uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)
# st.text_input only supports the "default" and "password" input types, so
# the field is a plain text box and the URL is validated separately below.
url = st.text_input(label="enter a URL")
url_is_valid = bool(validators.url(url))

# The two inputs are mutually exclusive: a button is only offered when exactly
# one of "valid URL" / "uploaded file" is present.
if url_is_valid and uploaded_file is None:
    if st.button('Summarize contents of URL'):
        with st.spinner("📚    Please wait while we produce a summary..."):
            try:
                st.session_state.pdf = False
                st.session_state.url = True
                st.session_state.summaries = summarize(url)
            except Exception as e:
                logging.exception(e)

if uploaded_file is not None and not url_is_valid:
    if st.button('Summarize Document'):
        with st.spinner("📚    Please wait while we produce a summary..."):
            try:
                st.session_state.pdf = True
                st.session_state.url = False
                st.session_state.summaries = summarize(uploaded_file)
            except Exception as e:
                logging.exception(e)

# Summaries live in session state, so they stay on screen across reruns.
if st.session_state.summaries:
    st.write('## Summary')
    for summary in st.session_state.summaries:
        st.write(summary.content)