Spaces:

Tuana
/

PDF-Summarizer

Build error

File size: 4,270 Bytes

28ec4f0
a3fdd99
2a34bac
836e16d
a3fdd99
f6cc0cb
28c1177
2a34bac
a3fdd99
9a54394
 
 
 
 
 
 
5fdc2d5
cc0fbf1
9a54394
 
58c1223
9a54394
a3fdd99
 
3a4a956
 
a3fdd99
3a4a956
 
 
 
 
 
f6cc0cb
a7fa548
2a34bac
 
 
 
 
 
 
 
 
 
 
 
 
 
2d4dc51
67f4a7d
 
 
 
 
d42a71a
2a34bac
 
 
 
9a54394
9097656
762970d
f176fb0
 
762970d
fe7b517
c1986cc
fe7b517
 
3a4a956
7d43669
a3fdd99
12c1880
2a34bac
 
 
 
 
 
 
 
 
f4acbae
3a4a956
67f4a7d
 
2a34bac
 
 
67f4a7d

import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, Crawler
from haystack.schema import Document
import logging
import base64
from PIL import Image
import validators

@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack():
    """Create the Haystack components used by the app.

    The function is wrapped in ``st.cache``; the ``hash_funcs`` entry maps
    ``builtins.SwigPyObject`` values to ``None`` for hashing purposes, and
    ``allow_output_mutation=True`` is set because the returned document
    store is written to later by other functions in this file.

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple: an
        ``InMemoryDocumentStore``, a ``TransformersSummarizer`` loaded with
        ``facebook/bart-large-cnn``, and a ``PreProcessor`` that splits
        text into 200-word chunks while respecting sentence boundaries.
    """
    store = InMemoryDocumentStore()

    # Cleaning and splitting options, gathered in one place.
    preprocessing_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }
    text_preprocessor = PreProcessor(**preprocessing_options)

    bart_summarizer = TransformersSummarizer(
        model_name_or_path="facebook/bart-large-cnn"
    )
    return store, bart_summarizer, text_preprocessor


def pdf_to_document_store(pdf_file):
    """Replace the document store's contents with the text of a PDF.

    The existing documents are deleted first, then the PDF is written to
    ``temp-path.pdf``, converted to text, preprocessed, and written to the
    module-level ``document_store``.

    Args:
        pdf_file: A binary file-like object whose ``read()`` returns the
            PDF bytes (presumably the Streamlit upload — see the caller).
    """
    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

    # Write the raw bytes and leave the ``with`` block BEFORE converting:
    # closing the file flushes the write buffer, so the converter (which
    # reads the file by path) sees the complete PDF rather than a
    # possibly truncated one.
    with open("temp-path.pdf", "wb") as temp_file:
        temp_file.write(pdf_file.read())

    doc = converter.convert(file_path="temp-path.pdf", meta=None)
    preprocessed_docs = preprocessor.process(doc)
    document_store.write_documents(preprocessed_docs)

def crawl_url(url):
    """Replace the document store's contents with the crawled page at *url*.

    The page is crawled into the ``crawled_files`` directory, preprocessed,
    and written to the module-level ``document_store``. If any of those
    steps raises, the error is logged and a message is shown in the app
    instead of propagating the exception.

    Args:
        url: The URL to crawl.
    """
    # Start from an empty store, as the PDF path does, so documents left
    # over from an earlier run cannot leak into this summary.
    document_store.delete_documents()
    crawler = Crawler(output_dir="crawled_files", overwrite_existing_files=True, crawler_depth=1)
    try:
        docs = crawler.crawl(urls=[url])
        preprocessed_docs = preprocessor.process(docs)
        document_store.write_documents(preprocessed_docs)
    except Exception:
        # ``except Exception`` (not a bare ``except``) lets KeyboardInterrupt
        # and SystemExit through; the traceback is kept in the server log.
        logging.exception("Failed to crawl %s", url)
        st.write('We were unable to crawl the contents of that URL, please try something else')
    
def summarize(content):
    """Load *content* into the document store and summarize the store.

    Which loader runs depends on the session-state flags: ``pdf`` selects
    ``pdf_to_document_store``, otherwise ``url`` selects ``crawl_url``.
    If neither flag is set, nothing is loaded and whatever is already in
    the store is summarized.

    Args:
        content: The uploaded file object or the URL string, matching
            whichever flag is set.

    Returns:
        The result of ``summarizer.predict`` over every document in the
        store, requested as a single combined summary.
    """
    state = st.session_state
    if state.pdf:
        pdf_to_document_store(content)
    elif state.url:
        crawl_url(content)

    stored_documents = document_store.get_all_documents()
    return summarizer.predict(
        documents=stored_documents,
        generate_single_summary=True,
    )

def set_state_if_absent(key, value):
    """Give session-state entry *key* the default *value* if it has none.

    An existing entry is left untouched.
    """
    if key in st.session_state:
        return
    st.session_state[key] = value
        
# Seed the session-state entries the rest of the script reads: the last
# summary result and the two flags recording which input kind was used.
for state_key, default_value in (
    ("summaries", None),
    ("url", False),
    ("pdf", False),
):
    set_state_if_absent(state_key, default_value)

# Shared Haystack components used by the helper functions above.
document_store, summarizer, preprocessor = start_haystack()

# --- Page header -----------------------------------------------------------
st.title("TL;DR with Haystack")
image = Image.open("header-image.png")
st.image(image)

# Introductory blurb shown under the header image. It is built from explicit
# pieces so the leading/trailing newlines and the trailing space are visible.
intro_markdown = (
    "\n"
    "This Summarization demo uses a [Haystack TransformerSummarizer node](https://haystack.deepset.ai/pipeline_nodes/summarizer). You can upload a PDF file, which will be converted to text with the [Haystack PDFtoTextConverter](https://haystack.deepset.ai/reference/file-converters#pdftotextconverter). In this demo, we produce 1 summary for the whole file you upload. So, the TransformerSummarizer treats the whole thing as one string, which means along with the model limitations, PDFs that have a lot of unneeded text at the beginning produce poor results. For best results, upload a document that has minimal intro and tables at the top. "
    "\n"
)
st.markdown(intro_markdown, unsafe_allow_html=True)

# --- Inputs: either a single uploaded file or a URL --------------------------
uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)
url = st.text_input(label="enter a URL")

def _run_summary(content, *, is_pdf):
    """Summarize *content* and store the result in session state.

    Sets the ``pdf``/``url`` session flags to match the input kind, then
    stores the output of ``summarize`` under ``summaries``. On failure the
    exception is logged, any previous summary is cleared so a stale result
    is not shown, and an error message is displayed in the app.

    Args:
        content: The uploaded file object or the URL string.
        is_pdf: ``True`` for an uploaded file, ``False`` for a URL.
    """
    with st.spinner("📚 &nbsp;&nbsp; Please wait while we produce a summary..."):
        try:
            st.session_state.pdf = is_pdf
            st.session_state.url = not is_pdf
            st.session_state.summaries = summarize(content)
        except Exception as e:
            logging.exception(e)
            # Without this the user would see nothing (or the previous
            # run's summary) when summarization fails.
            st.session_state.summaries = None
            st.error("Something went wrong while producing the summary. Please try again with a different input.")


# Validate the URL once and reuse the result for both branches.
url_is_valid = bool(validators.url(url))

# A button is offered only when exactly one kind of input is provided.
if url_is_valid and uploaded_file is None:
    if st.button('Summarize contents of URL'):
        _run_summary(url, is_pdf=False)

if uploaded_file is not None and not url_is_valid:
    if st.button('Summarize Document'):
        _run_summary(uploaded_file, is_pdf=True)
 
# Show the most recent summary result, if one is stored in session state.
summaries = st.session_state.summaries
if summaries:
    st.write('## Summary')
    for summary in summaries:
        st.write(summary.content)