# TL;DR summarization demo (Streamlit + Haystack)
import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, Crawler
from haystack.schema import Document
import logging
import base64
from PIL import Image
import validators
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
def start_haystack():
    """Build and cache the Haystack components used by the whole app.

    Cached with ``st.cache`` so the BART model is loaded only once per
    session; SwigPyObject handles cannot be hashed, hence the hash_funcs
    override.

    Returns:
        tuple: ``(document_store, summarizer, preprocessor)``.
    """
    store = InMemoryDocumentStore()
    splitter = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=200,
        split_respect_sentence_boundary=True,
    )
    bart_summarizer = TransformersSummarizer(model_name_or_path="facebook/bart-large-cnn")
    return store, bart_summarizer, splitter
def pdf_to_document_store(pdf_file):
    """Replace the document store's contents with text extracted from *pdf_file*.

    Relies on the module-level ``document_store`` and ``preprocessor``
    created by ``start_haystack()``.

    Args:
        pdf_file: A file-like object (Streamlit upload) positioned at the
            start of a PDF byte stream.
    """
    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    # The converter needs a path on disk, so persist the upload first.
    # (The original base64 encode-then-decode round-trip was a no-op and
    # has been removed; the explicit close() after the `with` block was
    # redundant as well.)
    with open("temp-path.pdf", 'wb') as temp_file:
        temp_file.write(pdf_file.read())
    doc = converter.convert(file_path="temp-path.pdf", meta=None)
    preprocessed_docs = preprocessor.process(doc)
    document_store.write_documents(preprocessed_docs)
def crawl_url(url):
    """Crawl *url* (depth 1), preprocess the pages, and index them.

    Writes into the module-level ``document_store`` via the module-level
    ``preprocessor``. On failure, logs the traceback and shows a friendly
    message in the UI instead of crashing the app.

    Args:
        url: The URL to crawl (already validated by the caller).
    """
    crawler = Crawler(output_dir="crawled_files", overwrite_existing_files=True, crawler_depth=1)
    try:
        docs = crawler.crawl(urls=[url])
        preprocessed_docs = preprocessor.process(docs)
        document_store.write_documents(preprocessed_docs)
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt
        # and hid the root cause; narrow it and keep the traceback in the logs.
        logging.exception("Failed to crawl URL: %s", url)
        st.write('We were unable to crawl the contents of that URL, please try something else')
def summarize(content):
    """Index *content* and produce one summary of everything indexed.

    ``content`` is either an uploaded PDF file or a URL string; the
    ``st.session_state.pdf`` / ``st.session_state.url`` flags (set by the
    caller) select which ingestion path runs.

    Returns:
        The summarizer's prediction for all stored documents, collapsed
        into a single summary.
    """
    if st.session_state.pdf:
        pdf_to_document_store(content)
    elif st.session_state.url:
        crawl_url(content)
    return summarizer.predict(
        documents=document_store.get_all_documents(),
        generate_single_summary=True,
    )
def set_state_if_absent(key, value):
    """Seed ``st.session_state[key]`` with *value* unless it already exists."""
    if key in st.session_state:
        return
    st.session_state[key] = value
# Seed the session-state keys the rest of the script reads.
for _key, _default in (("summaries", None), ("url", False), ("pdf", False)):
    set_state_if_absent(_key, _default)
# Cached singletons shared by every ingestion/summarization path below.
document_store, summarizer, preprocessor = start_haystack()
# Page header: title, banner image, and an intro blurb explaining the demo.
st.title('TL;DR with Haystack')
# NOTE(review): assumes header-image.png sits next to this script — confirm deploy layout.
image = Image.open('header-image.png')
st.image(image)
st.markdown( """
This Summarization demo uses a [Haystack TransformerSummarizer node](https://haystack.deepset.ai/pipeline_nodes/summarizer). You can upload a PDF file, which will be converted to text with the [Haystack PDFtoTextConverter](https://haystack.deepset.ai/reference/file-converters#pdftotextconverter). In this demo, we produce 1 summary for the whole file you upload. So, the TransformerSummarizer treats the whole thing as one string, which means along with the model limitations, PDFs that have a lot of unneeded text at the beginning produce poor results. For best results, upload a document that has minimal intro and tables at the top.
""", unsafe_allow_html=True)
# Input widgets: exactly one of the two (file upload / URL) drives a run.
uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)
url = st.text_input(label="enter a URL")
# URL path: only offered when the URL is valid AND no file was uploaded.
if (validators.url(url)) and (uploaded_file is None):
    if st.button('Summarize contents of URL'):
        with st.spinner("π Please wait while we produce a summary..."):
            try:
                st.session_state.pdf = False
                st.session_state.url = True
                # Fixed: original read `st. session_state.summaries` (stray space).
                st.session_state.summaries = summarize(url)
            except Exception as e:
                logging.exception(e)
# PDF path: only offered when a file was uploaded AND the URL box is not a valid URL.
if (uploaded_file is not None) and not validators.url(url):
    if st.button('Summarize Document'):
        with st.spinner("π Please wait while we produce a summary..."):
            try:
                st.session_state.pdf = True
                st.session_state.url = False
                st.session_state.summaries = summarize(uploaded_file)
            except Exception as e:
                logging.exception(e)
# Render the most recent summaries, if any run has completed.
if st.session_state.summaries:
    st.write('## Summary')
    for count, summary in enumerate(st.session_state.summaries):
        st.write(summary.content)