ugmSorcero committed on
Commit
843bc9e
·
1 Parent(s): 27e0350

Fixes indexing problem and adds split id

Browse files
core/pipelines.py CHANGED
@@ -19,6 +19,8 @@ def keyword_search(index="documents", split_word_length=100):
19
 
20
  - Documents that have more lexical overlap with the query are more likely to be relevant
21
  - Words that occur in fewer documents are more significant than words that occur in many documents
 
 
22
  """
23
  document_store = InMemoryDocumentStore(index=index)
24
  keyword_retriever = TfidfRetriever(document_store=(document_store))
 
19
 
20
  - Documents that have more lexical overlap with the query are more likely to be relevant
21
  - Words that occur in fewer documents are more significant than words that occur in many documents
22
+
23
+ :warning: **(HAYSTACK BUG) Keyword Search doesn't work if you reindex:** Please refresh page in order to reindex
24
  """
25
  document_store = InMemoryDocumentStore(index=index)
26
  keyword_retriever = TfidfRetriever(document_store=(document_store))
core/search_index.py CHANGED
@@ -45,6 +45,7 @@ def search(queries, pipeline):
45
  "score": res.score,
46
  "id": res.meta["id"],
47
  "fragment_id": res.id,
 
48
  }
49
  )
50
  if not score_is_empty:
 
45
  "score": res.score,
46
  "id": res.meta["id"],
47
  "fragment_id": res.id,
48
+ "meta": res.meta
49
  }
50
  )
51
  if not score_is_empty:
interface/components.py CHANGED
@@ -42,11 +42,15 @@ def component_select_pipeline(container):
42
  "index_pipeline": index_pipeline,
43
  "doc": pipeline_funcs[index_pipe].__doc__,
44
  }
 
45
 
46
 
47
  def component_show_pipeline(pipeline, pipeline_name):
48
  """Draw the pipeline"""
49
- with st.expander("Show pipeline"):
 
 
 
50
  if pipeline["doc"] is not None:
51
  st.markdown(pipeline["doc"])
52
  fig = get_pipeline_graph(pipeline[pipeline_name])
@@ -59,41 +63,41 @@ def component_show_search_result(container, results):
59
  st.markdown(f"### Match {idx+1}")
60
  st.markdown(f"**Text**: {document['text']}")
61
  st.markdown(f"**Document**: {document['id']}")
 
 
62
  if document["score"] is not None:
63
  st.markdown(f"**Score**: {document['score']:.3f}")
64
  st.markdown("---")
65
 
66
 
67
- def component_text_input(container):
68
  """Draw the Text Input widget"""
69
  with container:
70
  texts = []
71
- doc_id = 1
72
  with st.expander("Enter documents"):
73
  while True:
74
  text = st.text_input(f"Document {doc_id}", key=doc_id)
75
  if text != "":
76
- texts.append({"text": text})
77
  doc_id += 1
78
  st.markdown("---")
79
  else:
80
  break
81
  corpus = [
82
- {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(texts)
83
  ]
84
- return corpus
85
 
86
 
87
- def component_article_url(container):
88
  """Draw the Article URL widget"""
89
  with container:
90
  urls = []
91
- doc_id = 1
92
  with st.expander("Enter URLs"):
93
  while True:
94
  url = st.text_input(f"URL {doc_id}", key=doc_id)
95
  if url != "":
96
- urls.append({"text": extract_text_from_url(url)})
97
  doc_id += 1
98
  st.markdown("---")
99
  else:
@@ -101,19 +105,18 @@ def component_article_url(container):
101
 
102
  for idx, doc in enumerate(urls):
103
  with st.expander(f"Preview URL {idx}"):
104
- st.write(doc)
105
 
106
  corpus = [
107
- {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
108
  ]
109
- return corpus
110
 
111
 
112
- def component_file_input(container):
113
  """Draw the extract text from file widget"""
114
  with container:
115
  files = []
116
- doc_id = 1
117
  with st.expander("Enter Files"):
118
  while True:
119
  file = st.file_uploader(
@@ -122,7 +125,7 @@ def component_file_input(container):
122
  if file != None:
123
  extracted_text = extract_text_from_file(file)
124
  if extracted_text != None:
125
- files.append({"text": extracted_text})
126
  doc_id += 1
127
  st.markdown("---")
128
  else:
@@ -132,9 +135,9 @@ def component_file_input(container):
132
 
133
  for idx, doc in enumerate(files):
134
  with st.expander(f"Preview File {idx}"):
135
- st.write(doc)
136
 
137
  corpus = [
138
- {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
139
  ]
140
- return corpus
 
42
  "index_pipeline": index_pipeline,
43
  "doc": pipeline_funcs[index_pipe].__doc__,
44
  }
45
+ st.session_state['doc_id'] = 0
46
 
47
 
48
  def component_show_pipeline(pipeline, pipeline_name):
49
  """Draw the pipeline"""
50
+ expander_text = "Show pipeline"
51
+ if pipeline["doc"] is not None and "BUG" in pipeline["doc"]:
52
+ expander_text += " ⚠️"
53
+ with st.expander(expander_text):
54
  if pipeline["doc"] is not None:
55
  st.markdown(pipeline["doc"])
56
  fig = get_pipeline_graph(pipeline[pipeline_name])
 
63
  st.markdown(f"### Match {idx+1}")
64
  st.markdown(f"**Text**: {document['text']}")
65
  st.markdown(f"**Document**: {document['id']}")
66
+ if '_split_id' in document['meta']:
67
+ st.markdown(f"**Document Chunk**: {document['meta']['_split_id']}")
68
  if document["score"] is not None:
69
  st.markdown(f"**Score**: {document['score']:.3f}")
70
  st.markdown("---")
71
 
72
 
73
+ def component_text_input(container, doc_id):
74
  """Draw the Text Input widget"""
75
  with container:
76
  texts = []
 
77
  with st.expander("Enter documents"):
78
  while True:
79
  text = st.text_input(f"Document {doc_id}", key=doc_id)
80
  if text != "":
81
+ texts.append({"text": text, 'doc_id': doc_id})
82
  doc_id += 1
83
  st.markdown("---")
84
  else:
85
  break
86
  corpus = [
87
+ {"text": doc["text"], "id": doc["doc_id"]} for doc in texts
88
  ]
89
+ return corpus, doc_id
90
 
91
 
92
+ def component_article_url(container, doc_id):
93
  """Draw the Article URL widget"""
94
  with container:
95
  urls = []
 
96
  with st.expander("Enter URLs"):
97
  while True:
98
  url = st.text_input(f"URL {doc_id}", key=doc_id)
99
  if url != "":
100
+ urls.append({"text": extract_text_from_url(url), 'doc_id': doc_id})
101
  doc_id += 1
102
  st.markdown("---")
103
  else:
 
105
 
106
  for idx, doc in enumerate(urls):
107
  with st.expander(f"Preview URL {idx}"):
108
+ st.write(doc['text'])
109
 
110
  corpus = [
111
+ {"text": doc["text"], "id": doc["doc_id"]} for doc in urls
112
  ]
113
+ return corpus, doc_id
114
 
115
 
116
+ def component_file_input(container, doc_id):
117
  """Draw the extract text from file widget"""
118
  with container:
119
  files = []
 
120
  with st.expander("Enter Files"):
121
  while True:
122
  file = st.file_uploader(
 
125
  if file != None:
126
  extracted_text = extract_text_from_file(file)
127
  if extracted_text != None:
128
+ files.append({"text": extracted_text, 'doc_id': doc_id})
129
  doc_id += 1
130
  st.markdown("---")
131
  else:
 
135
 
136
  for idx, doc in enumerate(files):
137
  with st.expander(f"Preview File {idx}"):
138
+ st.write(doc['text'])
139
 
140
  corpus = [
141
+ {"text": doc["text"], "id": doc["doc_id"]} for doc in files
142
  ]
143
+ return corpus, doc_id
interface/config.py CHANGED
@@ -1,7 +1,11 @@
1
  from interface.pages import page_landing_page, page_search, page_index
2
 
3
  # Define default Session Variables over the whole session.
4
- session_state_variables = {"pipeline": None, "pipeline_func_parameters": []}
 
 
 
 
5
 
6
  # Define Pages for the demo
7
  pages = {
 
1
  from interface.pages import page_landing_page, page_search, page_index
2
 
3
  # Define default Session Variables over the whole session.
4
+ session_state_variables = {
5
+ "pipeline": None,
6
+ "pipeline_func_parameters": [],
7
+ "doc_id": 0
8
+ }
9
 
10
  # Define Pages for the demo
11
  pages = {
interface/pages.py CHANGED
@@ -81,7 +81,8 @@ def page_index(container):
81
 
82
  clear_index = st.sidebar.checkbox('Clear Index', True)
83
 
84
- corpus = input_funcs[selected_input][0](container)
 
85
 
86
  if len(corpus) > 0:
87
  index_results = None
@@ -91,5 +92,6 @@ def page_index(container):
91
  st.session_state["pipeline"]["index_pipeline"],
92
  clear_index
93
  )
 
94
  if index_results:
95
  st.write(index_results)
 
81
 
82
  clear_index = st.sidebar.checkbox('Clear Index', True)
83
 
84
+ doc_id = st.session_state['doc_id']
85
+ corpus, doc_id = input_funcs[selected_input][0](container, doc_id)
86
 
87
  if len(corpus) > 0:
88
  index_results = None
 
92
  st.session_state["pipeline"]["index_pipeline"],
93
  clear_index
94
  )
95
+ st.session_state['doc_id'] = doc_id
96
  if index_results:
97
  st.write(index_results)