Spaces:
Build error
Build error
Attempting to add URL crawler
Browse files
app.py
CHANGED
@@ -1,10 +1,11 @@
|
|
1 |
import streamlit as st
|
2 |
from haystack.document_stores import InMemoryDocumentStore
|
3 |
-
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter
|
4 |
from haystack.schema import Document
|
5 |
import logging
|
6 |
import base64
|
7 |
from PIL import Image
|
|
|
8 |
|
9 |
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
|
10 |
def start_haystack():
|
@@ -32,8 +33,20 @@ def pdf_to_document_store(pdf_file):
|
|
32 |
document_store.write_documents(preprocessed_docs)
|
33 |
temp_file.close()
|
34 |
|
35 |
-
def
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
summaries = summarizer.predict(documents=document_store.get_all_documents(), generate_single_summary=True)
|
38 |
return summaries
|
39 |
|
@@ -41,7 +54,10 @@ def set_state_if_absent(key, value):
|
|
41 |
if key not in st.session_state:
|
42 |
st.session_state[key] = value
|
43 |
|
44 |
-
set_state_if_absent("summaries", None)
|
|
|
|
|
|
|
45 |
document_store, summarizer, preprocessor = start_haystack()
|
46 |
|
47 |
st.title('TL;DR with Haystack')
|
@@ -53,12 +69,25 @@ This Summarization demo uses a [Haystack TransformerSummarizer node](https://hay
|
|
53 |
""", unsafe_allow_html=True)
|
54 |
|
55 |
uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)
|
|
|
56 |
|
57 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
if st.button('Summarize Document'):
|
59 |
with st.spinner("π Please wait while we produce a summary..."):
|
60 |
try:
|
61 |
-
st.
|
|
|
|
|
62 |
except Exception as e:
|
63 |
logging.exception(e)
|
64 |
|
|
|
1 |
import streamlit as st
|
2 |
from haystack.document_stores import InMemoryDocumentStore
|
3 |
+
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, Crawler
|
4 |
from haystack.schema import Document
|
5 |
import logging
|
6 |
import base64
|
7 |
from PIL import Image
|
8 |
+
import validators
|
9 |
|
10 |
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
|
11 |
def start_haystack():
|
|
|
33 |
document_store.write_documents(preprocessed_docs)
|
34 |
temp_file.close()
|
35 |
|
36 |
+
def crawl_url(url):
    """Crawl ``url`` and write its preprocessed contents to the document store.

    A Haystack ``Crawler`` (depth 1, output written to ``crawled_files`` with
    existing files overwritten) fetches the page, the module-level
    ``preprocessor`` processes the result, and the output is written to the
    module-level ``document_store``.

    Args:
        url: Address of the page to crawl.

    Returns:
        None. If crawling, preprocessing or writing fails, the error is logged
        and a message is shown in the Streamlit app instead of raising.
    """
    crawler = Crawler(output_dir="crawled_files", overwrite_existing_files=True, crawler_depth=1)
    try:
        # NOTE(review): this assumes crawl() returns objects that
        # preprocessor.process() accepts -- confirm against the installed
        # Haystack version.
        docs = crawler.crawl(urls=[url])
        preprocessed_docs = preprocessor.process(docs)
        document_store.write_documents(preprocessed_docs)
    except Exception:
        # Catch Exception rather than using a bare ``except:`` so that
        # KeyboardInterrupt/SystemExit still propagate, and keep the traceback
        # in the logs so the failure can be diagnosed.
        logging.exception("Unable to crawl URL: %s", url)
        st.write('We were unable to crawl the contents of that URL, please try something else')
|
44 |
+
|
45 |
+
def summarize(content):
    """Load ``content`` into the document store and summarize everything in it.

    The source type is read from Streamlit session state: when
    ``st.session_state.pdf`` is truthy, ``content`` is handed to
    ``pdf_to_document_store``; otherwise, when ``st.session_state.url`` is
    truthy, it is handed to ``crawl_url``. If neither flag is set, nothing is
    loaded.

    Args:
        content: The uploaded PDF file or the URL string, depending on which
            session flag is set.

    Returns:
        The result of ``summarizer.predict`` over all documents currently in
        the document store, requested as a single summary.
    """
    # Pick the loader that matches the active source; the PDF flag wins.
    if st.session_state.pdf:
        load_content = pdf_to_document_store
    elif st.session_state.url:
        load_content = crawl_url
    else:
        load_content = None

    if load_content is not None:
        load_content(content)

    return summarizer.predict(
        documents=document_store.get_all_documents(),
        generate_single_summary=True,
    )
|
52 |
|
|
|
54 |
if key not in st.session_state:
|
55 |
st.session_state[key] = value
|
56 |
|
57 |
+
set_state_if_absent("summaries", None)
|
58 |
+
set_state_if_absent("url", False)
|
59 |
+
set_state_if_absent("pdf", False)
|
60 |
+
|
61 |
document_store, summarizer, preprocessor = start_haystack()
|
62 |
|
63 |
st.title('TL;DR with Haystack')
|
|
|
69 |
""", unsafe_allow_html=True)
|
70 |
|
71 |
uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)
|
72 |
+
# Free-text URL field. st.text_input only accepts type="default" or
# type="password", so no ``type`` argument is passed; the value is checked
# with ``validators.url`` below.
url = st.text_input(label="enter a URL")

# Validate once and reuse the result in both branches.
url_is_valid = bool(validators.url(url))

# URL flow: offered only when a valid URL is entered and no PDF is uploaded.
if url_is_valid and uploaded_file is None:
    if st.button('Summarize contents of URL'):
        with st.spinner("π Please wait while we produce a summary..."):
            try:
                # Tell summarize() to treat its argument as a URL.
                st.session_state.pdf = False
                st.session_state.url = True
                st.session_state.summaries = summarize(url)
            except Exception as e:
                logging.exception(e)

# PDF flow: offered only when a file is uploaded and no valid URL is entered.
# When both inputs are present, neither button is shown.
if uploaded_file is not None and not url_is_valid:
    if st.button('Summarize Document'):
        with st.spinner("π Please wait while we produce a summary..."):
            try:
                # Tell summarize() to treat its argument as an uploaded PDF.
                st.session_state.pdf = True
                st.session_state.url = False
                st.session_state.summaries = summarize(uploaded_file)
            except Exception as e:
                logging.exception(e)
|
93 |
|