ugaray96 committed on
Commit
c9524e4
·
unverified ·
2 Parent(s): 17fa846 6a6afbf

Merge pull request #9 from ugm2/fix/indexing

Browse files
.streamlit/config.toml CHANGED
@@ -1,5 +1,5 @@
1
  [theme]
2
- primaryColor="#ffbf00"
3
  backgroundColor="#0e1117"
4
  secondaryBackgroundColor="#282929"
5
  textColor = "#ffffff"
 
1
  [theme]
2
+ primaryColor="#e5ab00"
3
  backgroundColor="#0e1117"
4
  secondaryBackgroundColor="#282929"
5
  textColor = "#ffffff"
app.py CHANGED
@@ -29,7 +29,7 @@ def run_demo():
29
  with navigation:
30
 
31
  selected_page = option_menu(
32
- menu_title="Navigation",
33
  options=list(pages.keys()),
34
  icons=[f[1] for f in pages.values()],
35
  menu_icon="cast",
 
29
  with navigation:
30
 
31
  selected_page = option_menu(
32
+ menu_title=None,
33
  options=list(pages.keys()),
34
  icons=[f[1] for f in pages.values()],
35
  menu_icon="cast",
core/pipelines.py CHANGED
@@ -19,6 +19,8 @@ def keyword_search(index="documents", split_word_length=100):
19
 
20
  - Documents that have more lexical overlap with the query are more likely to be relevant
21
  - Words that occur in fewer documents are more significant than words that occur in many documents
 
 
22
  """
23
  document_store = InMemoryDocumentStore(index=index)
24
  keyword_retriever = TfidfRetriever(document_store=(document_store))
@@ -39,10 +41,7 @@ def keyword_search(index="documents", split_word_length=100):
39
  index_pipeline = Pipeline()
40
  index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
41
  index_pipeline.add_node(
42
- keyword_retriever, name="TfidfRetriever", inputs=["Preprocessor"]
43
- )
44
- index_pipeline.add_node(
45
- document_store, name="DocumentStore", inputs=["TfidfRetriever"]
46
  )
47
 
48
  return search_pipeline, index_pipeline
 
19
 
20
  - Documents that have more lexical overlap with the query are more likely to be relevant
21
  - Words that occur in fewer documents are more significant than words that occur in many documents
22
+
23
+ :warning: **(HAYSTACK BUG) Keyword Search doesn't work if you reindex:** Please refresh page in order to reindex
24
  """
25
  document_store = InMemoryDocumentStore(index=index)
26
  keyword_retriever = TfidfRetriever(document_store=(document_store))
 
41
  index_pipeline = Pipeline()
42
  index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
43
  index_pipeline.add_node(
44
+ document_store, name="DocumentStore", inputs=["Preprocessor"]
 
 
 
45
  )
46
 
47
  return search_pipeline, index_pipeline
core/search_index.py CHANGED
@@ -1,4 +1,5 @@
1
  from haystack.schema import Document
 
2
  import uuid
3
 
4
 
@@ -17,8 +18,12 @@ def format_docs(documents):
17
  return db_docs, [doc.meta["id"] for doc in db_docs]
18
 
19
 
20
- def index(documents, pipeline):
21
  documents, doc_ids = format_docs(documents)
 
 
 
 
22
  pipeline.run(documents=documents)
23
  return doc_ids
24
 
@@ -38,6 +43,7 @@ def search(queries, pipeline):
38
  "score": res.score,
39
  "id": res.meta["id"],
40
  "fragment_id": res.id,
 
41
  }
42
  )
43
  if not score_is_empty:
 
1
  from haystack.schema import Document
2
+ from haystack.document_stores import BaseDocumentStore
3
  import uuid
4
 
5
 
 
18
  return db_docs, [doc.meta["id"] for doc in db_docs]
19
 
20
 
21
+ def index(documents, pipeline, clear_index=True):
22
  documents, doc_ids = format_docs(documents)
23
+ if clear_index:
24
+ document_stores = pipeline.get_nodes_by_class(class_type=BaseDocumentStore)
25
+ for docstore in document_stores:
26
+ docstore.delete_index(docstore.index)
27
  pipeline.run(documents=documents)
28
  return doc_ids
29
 
 
43
  "score": res.score,
44
  "id": res.meta["id"],
45
  "fragment_id": res.id,
46
+ "meta": res.meta,
47
  }
48
  )
49
  if not score_is_empty:
interface/components.py CHANGED
@@ -42,11 +42,15 @@ def component_select_pipeline(container):
42
  "index_pipeline": index_pipeline,
43
  "doc": pipeline_funcs[index_pipe].__doc__,
44
  }
 
45
 
46
 
47
  def component_show_pipeline(pipeline, pipeline_name):
48
  """Draw the pipeline"""
49
- with st.expander("Show pipeline"):
 
 
 
50
  if pipeline["doc"] is not None:
51
  st.markdown(pipeline["doc"])
52
  fig = get_pipeline_graph(pipeline[pipeline_name])
@@ -59,41 +63,39 @@ def component_show_search_result(container, results):
59
  st.markdown(f"### Match {idx+1}")
60
  st.markdown(f"**Text**: {document['text']}")
61
  st.markdown(f"**Document**: {document['id']}")
 
 
62
  if document["score"] is not None:
63
  st.markdown(f"**Score**: {document['score']:.3f}")
64
  st.markdown("---")
65
 
66
 
67
- def component_text_input(container):
68
  """Draw the Text Input widget"""
69
  with container:
70
  texts = []
71
- doc_id = 1
72
  with st.expander("Enter documents"):
73
  while True:
74
  text = st.text_input(f"Document {doc_id}", key=doc_id)
75
  if text != "":
76
- texts.append({"text": text})
77
  doc_id += 1
78
  st.markdown("---")
79
  else:
80
  break
81
- corpus = [
82
- {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(texts)
83
- ]
84
- return corpus
85
 
86
 
87
- def component_article_url(container):
88
  """Draw the Article URL widget"""
89
  with container:
90
  urls = []
91
- doc_id = 1
92
  with st.expander("Enter URLs"):
93
  while True:
94
  url = st.text_input(f"URL {doc_id}", key=doc_id)
95
  if url != "":
96
- urls.append({"text": extract_text_from_url(url)})
97
  doc_id += 1
98
  st.markdown("---")
99
  else:
@@ -101,19 +103,16 @@ def component_article_url(container):
101
 
102
  for idx, doc in enumerate(urls):
103
  with st.expander(f"Preview URL {idx}"):
104
- st.write(doc)
105
 
106
- corpus = [
107
- {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
108
- ]
109
- return corpus
110
 
111
 
112
- def component_file_input(container):
113
  """Draw the extract text from file widget"""
114
  with container:
115
  files = []
116
- doc_id = 1
117
  with st.expander("Enter Files"):
118
  while True:
119
  file = st.file_uploader(
@@ -122,7 +121,7 @@ def component_file_input(container):
122
  if file != None:
123
  extracted_text = extract_text_from_file(file)
124
  if extracted_text != None:
125
- files.append({"text": extracted_text})
126
  doc_id += 1
127
  st.markdown("---")
128
  else:
@@ -132,9 +131,7 @@ def component_file_input(container):
132
 
133
  for idx, doc in enumerate(files):
134
  with st.expander(f"Preview File {idx}"):
135
- st.write(doc)
136
 
137
- corpus = [
138
- {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
139
- ]
140
- return corpus
 
42
  "index_pipeline": index_pipeline,
43
  "doc": pipeline_funcs[index_pipe].__doc__,
44
  }
45
+ st.session_state["doc_id"] = 0
46
 
47
 
48
  def component_show_pipeline(pipeline, pipeline_name):
49
  """Draw the pipeline"""
50
+ expander_text = "Show pipeline"
51
+ if pipeline["doc"] is not None and "BUG" in pipeline["doc"]:
52
+ expander_text += " ⚠️"
53
+ with st.expander(expander_text):
54
  if pipeline["doc"] is not None:
55
  st.markdown(pipeline["doc"])
56
  fig = get_pipeline_graph(pipeline[pipeline_name])
 
63
  st.markdown(f"### Match {idx+1}")
64
  st.markdown(f"**Text**: {document['text']}")
65
  st.markdown(f"**Document**: {document['id']}")
66
+ if "_split_id" in document["meta"]:
67
+ st.markdown(f"**Document Chunk**: {document['meta']['_split_id']}")
68
  if document["score"] is not None:
69
  st.markdown(f"**Score**: {document['score']:.3f}")
70
  st.markdown("---")
71
 
72
 
73
+ def component_text_input(container, doc_id):
74
  """Draw the Text Input widget"""
75
  with container:
76
  texts = []
 
77
  with st.expander("Enter documents"):
78
  while True:
79
  text = st.text_input(f"Document {doc_id}", key=doc_id)
80
  if text != "":
81
+ texts.append({"text": text, "doc_id": doc_id})
82
  doc_id += 1
83
  st.markdown("---")
84
  else:
85
  break
86
+ corpus = [{"text": doc["text"], "id": doc["doc_id"]} for doc in texts]
87
+ return corpus, doc_id
 
 
88
 
89
 
90
+ def component_article_url(container, doc_id):
91
  """Draw the Article URL widget"""
92
  with container:
93
  urls = []
 
94
  with st.expander("Enter URLs"):
95
  while True:
96
  url = st.text_input(f"URL {doc_id}", key=doc_id)
97
  if url != "":
98
+ urls.append({"text": extract_text_from_url(url), "doc_id": doc_id})
99
  doc_id += 1
100
  st.markdown("---")
101
  else:
 
103
 
104
  for idx, doc in enumerate(urls):
105
  with st.expander(f"Preview URL {idx}"):
106
+ st.write(doc["text"])
107
 
108
+ corpus = [{"text": doc["text"], "id": doc["doc_id"]} for doc in urls]
109
+ return corpus, doc_id
 
 
110
 
111
 
112
+ def component_file_input(container, doc_id):
113
  """Draw the extract text from file widget"""
114
  with container:
115
  files = []
 
116
  with st.expander("Enter Files"):
117
  while True:
118
  file = st.file_uploader(
 
121
  if file != None:
122
  extracted_text = extract_text_from_file(file)
123
  if extracted_text != None:
124
+ files.append({"text": extracted_text, "doc_id": doc_id})
125
  doc_id += 1
126
  st.markdown("---")
127
  else:
 
131
 
132
  for idx, doc in enumerate(files):
133
  with st.expander(f"Preview File {idx}"):
134
+ st.write(doc["text"])
135
 
136
+ corpus = [{"text": doc["text"], "id": doc["doc_id"]} for doc in files]
137
+ return corpus, doc_id
 
 
interface/config.py CHANGED
@@ -1,7 +1,11 @@
1
  from interface.pages import page_landing_page, page_search, page_index
2
 
3
  # Define default Session Variables over the whole session.
4
- session_state_variables = {"pipeline": None, "pipeline_func_parameters": []}
 
 
 
 
5
 
6
  # Define Pages for the demo
7
  pages = {
 
1
  from interface.pages import page_landing_page, page_search, page_index
2
 
3
  # Define default Session Variables over the whole session.
4
+ session_state_variables = {
5
+ "pipeline": None,
6
+ "pipeline_func_parameters": [],
7
+ "doc_id": 0,
8
+ }
9
 
10
  # Define Pages for the demo
11
  pages = {
interface/pages.py CHANGED
@@ -79,14 +79,17 @@ def page_index(container):
79
  orientation="horizontal",
80
  )
81
 
82
- corpus = input_funcs[selected_input][0](container)
 
 
 
83
 
84
  if len(corpus) > 0:
85
  index_results = None
86
  if st.button("Index"):
87
  index_results = index(
88
- corpus,
89
- st.session_state["pipeline"]["index_pipeline"],
90
  )
 
91
  if index_results:
92
  st.write(index_results)
 
79
  orientation="horizontal",
80
  )
81
 
82
+ clear_index = st.sidebar.checkbox("Clear Index", True)
83
+
84
+ doc_id = st.session_state["doc_id"]
85
+ corpus, doc_id = input_funcs[selected_input][0](container, doc_id)
86
 
87
  if len(corpus) > 0:
88
  index_results = None
89
  if st.button("Index"):
90
  index_results = index(
91
+ corpus, st.session_state["pipeline"]["index_pipeline"], clear_index
 
92
  )
93
+ st.session_state["doc_id"] = doc_id
94
  if index_results:
95
  st.write(index_results)