Tuana committed on
Commit
2a34bac
·
1 Parent(s): 69046f9

Attempting to add URL crawler

Browse files
Files changed (1) hide show
  1. app.py +35 -6
app.py CHANGED
@@ -1,10 +1,11 @@
1
  import streamlit as st
2
  from haystack.document_stores import InMemoryDocumentStore
3
- from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter
4
  from haystack.schema import Document
5
  import logging
6
  import base64
7
  from PIL import Image
 
8
 
9
  @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
10
  def start_haystack():
@@ -32,8 +33,20 @@ def pdf_to_document_store(pdf_file):
32
  document_store.write_documents(preprocessed_docs)
33
  temp_file.close()
34
 
35
- def summarize(file):
36
- pdf_to_document_store(file)
 
 
 
 
 
 
 
 
 
 
 
 
37
  summaries = summarizer.predict(documents=document_store.get_all_documents(), generate_single_summary=True)
38
  return summaries
39
 
@@ -41,7 +54,10 @@ def set_state_if_absent(key, value):
41
  if key not in st.session_state:
42
  st.session_state[key] = value
43
 
44
- set_state_if_absent("summaries", None)
 
 
 
45
  document_store, summarizer, preprocessor = start_haystack()
46
 
47
  st.title('TL;DR with Haystack')
@@ -53,12 +69,25 @@ This Summarization demo uses a [Haystack TransformerSummarizer node](https://hay
53
  """, unsafe_allow_html=True)
54
 
55
  uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)
 
56
 
57
- if uploaded_file is not None:
 
 
 
 
 
 
 
 
 
 
58
  if st.button('Summarize Document'):
59
  with st.spinner("πŸ“š    Please wait while we produce a summary..."):
60
  try:
61
- st. session_state.summaries = summarize(uploaded_file)
 
 
62
  except Exception as e:
63
  logging.exception(e)
64
 
 
1
  import streamlit as st
2
  from haystack.document_stores import InMemoryDocumentStore
3
+ from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, Crawler
4
  from haystack.schema import Document
5
  import logging
6
  import base64
7
  from PIL import Image
8
+ import validators
9
 
10
  @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
11
  def start_haystack():
 
33
  document_store.write_documents(preprocessed_docs)
34
  temp_file.close()
35
 
36
def crawl_url(url):
    """Crawl ``url`` and write its preprocessed contents to the document store.

    The crawled documents are passed through the module-level ``preprocessor``
    and then written to the module-level ``document_store``.

    Args:
        url: The address handed to the Haystack ``Crawler``.

    Returns:
        None. On failure the error is logged and a message is shown in the
        Streamlit page; the exception is not re-raised.
    """
    # NOTE(review): crawler_depth=1 is kept as committed; confirm against the
    # installed Haystack version whether this also follows links on the page.
    crawler = Crawler(output_dir="crawled_files", overwrite_existing_files=True, crawler_depth=1)
    try:
        docs = crawler.crawl(urls=[url])
        preprocessed_docs = preprocessor.process(docs)
        document_store.write_documents(preprocessed_docs)
    except Exception as e:
        # A bare ``except:`` would also trap KeyboardInterrupt/SystemExit and
        # hide the cause; catch Exception only and keep the traceback in the log.
        logging.exception(e)
        st.write('We were unable to crawl the contents of that URL, please try something else')
44
+
45
def summarize(content):
    """Load ``content`` into the document store and summarise the store.

    The ``pdf`` and ``url`` flags in ``st.session_state`` select the loader:
    ``pdf_to_document_store`` when ``pdf`` is truthy, otherwise ``crawl_url``
    when ``url`` is truthy. If neither flag is set, nothing is loaded.

    Note that the summary is produced from *all* documents currently held in
    ``document_store``, not only the ones added by this call.

    Args:
        content: The uploaded PDF file or the URL string, depending on the flags.

    Returns:
        The result of ``summarizer.predict`` with ``generate_single_summary=True``.
    """
    state = st.session_state
    if state.pdf:
        pdf_to_document_store(content)
    elif state.url:
        crawl_url(content)

    all_documents = document_store.get_all_documents()
    return summarizer.predict(documents=all_documents, generate_single_summary=True)
52
 
 
54
  if key not in st.session_state:
55
  st.session_state[key] = value
56
 
57
+ set_state_if_absent("summaries", None)
58
+ set_state_if_absent("url", False)
59
+ set_state_if_absent("pdf", False)
60
+
61
  document_store, summarizer, preprocessor = start_haystack()
62
 
63
  st.title('TL;DR with Haystack')
 
69
  """, unsafe_allow_html=True)
70
 
71
  uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)
72
# Free-text URL field. st.text_input only accepts type "default" or "password",
# so no ``type`` argument is passed here.
url = st.text_input(label="enter a URL")

# validators.url returns a falsy failure object (not False) for invalid input;
# evaluate it once and coerce to a plain bool for both branches below.
has_valid_url = bool(validators.url(url))

# URL mode: a valid URL was entered and no PDF has been uploaded.
if has_valid_url and uploaded_file is None:
    if st.button('Summarize contents of URL'):
        with st.spinner("📚    Please wait while we produce a summary..."):
            try:
                # Flags read by summarize() to pick the loader.
                st.session_state.pdf = False
                st.session_state.url = True
                st.session_state.summaries = summarize(url)
            except Exception as e:
                logging.exception(e)

# PDF mode: a file was uploaded and the URL field does not hold a valid URL.
if uploaded_file is not None and not has_valid_url:
    if st.button('Summarize Document'):
        with st.spinner("📚    Please wait while we produce a summary..."):
            try:
                # Flags read by summarize() to pick the loader.
                st.session_state.pdf = True
                st.session_state.url = False
                st.session_state.summaries = summarize(uploaded_file)
            except Exception as e:
                logging.exception(e)
93