lfoppiano committed on
Commit
2397955
·
1 Parent(s): ddff75f

enable extraction of coordinates from pdf, using sentences

Browse files
document_qa/document_qa_engine.py CHANGED
@@ -56,7 +56,7 @@ class DocumentQAEngine:
56
  grobid_client = GrobidClient(
57
  grobid_server=self.grobid_url,
58
  batch_size=1000,
59
- coordinates=["p"],
60
  sleep_time=5,
61
  timeout=60,
62
  check_server=True
@@ -104,7 +104,7 @@ class DocumentQAEngine:
104
  if verbose:
105
  print(query)
106
 
107
- response = self._run_query(doc_id, query, context_size=context_size)
108
  response = response['output_text'] if 'output_text' in response else response
109
 
110
  if verbose:
@@ -115,17 +115,17 @@ class DocumentQAEngine:
115
  return self._parse_json(response, output_parser), response
116
  except Exception as oe:
117
  print("Failing to parse the response", oe)
118
- return None, response
119
  elif extraction_schema:
120
  try:
121
  chain = create_extraction_chain(extraction_schema, self.llm)
122
  parsed = chain.run(response)
123
- return parsed, response
124
  except Exception as oe:
125
  print("Failing to parse the response", oe)
126
- return None, response
127
  else:
128
- return None, response
129
 
130
  def query_storage(self, query: str, doc_id, context_size=4):
131
  documents = self._get_context(doc_id, query, context_size)
@@ -156,12 +156,13 @@ class DocumentQAEngine:
156
 
157
  def _run_query(self, doc_id, query, context_size=4):
158
  relevant_documents = self._get_context(doc_id, query, context_size)
 
159
  response = self.chain.run(input_documents=relevant_documents,
160
  question=query)
161
 
162
  if self.memory:
163
  self.memory.save_context({"input": query}, {"output": response})
164
- return response
165
 
166
  def _get_context(self, doc_id, query, context_size=4):
167
  db = self.embeddings_dict[doc_id]
@@ -194,7 +195,8 @@ class DocumentQAEngine:
194
  if verbose:
195
  print("File", pdf_file_path)
196
  filename = Path(pdf_file_path).stem
197
- structure = self.grobid_processor.process_structure(pdf_file_path)
 
198
 
199
  biblio = structure['biblio']
200
  biblio['filename'] = filename.replace(" ", "_")
@@ -215,6 +217,7 @@ class DocumentQAEngine:
215
  biblio_copy['type'] = passage['type']
216
  biblio_copy['section'] = passage['section']
217
  biblio_copy['subSection'] = passage['subSection']
 
218
  metadatas.append(biblio_copy)
219
 
220
  ids.append(passage['passage_id'])
 
56
  grobid_client = GrobidClient(
57
  grobid_server=self.grobid_url,
58
  batch_size=1000,
59
+ coordinates=["s"],
60
  sleep_time=5,
61
  timeout=60,
62
  check_server=True
 
104
  if verbose:
105
  print(query)
106
 
107
+ response, coordinates = self._run_query(doc_id, query, context_size=context_size)
108
  response = response['output_text'] if 'output_text' in response else response
109
 
110
  if verbose:
 
115
  return self._parse_json(response, output_parser), response
116
  except Exception as oe:
117
  print("Failing to parse the response", oe)
118
+ return None, response, coordinates
119
  elif extraction_schema:
120
  try:
121
  chain = create_extraction_chain(extraction_schema, self.llm)
122
  parsed = chain.run(response)
123
+ return parsed, response, coordinates
124
  except Exception as oe:
125
  print("Failing to parse the response", oe)
126
+ return None, response, coordinates
127
  else:
128
+ return None, response, coordinates
129
 
130
  def query_storage(self, query: str, doc_id, context_size=4):
131
  documents = self._get_context(doc_id, query, context_size)
 
156
 
157
  def _run_query(self, doc_id, query, context_size=4):
158
  relevant_documents = self._get_context(doc_id, query, context_size)
159
+ relevant_document_coordinates = [doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else [] for doc in relevant_documents] #filter(lambda d: d['type'] == "sentence", relevant_documents)]
160
  response = self.chain.run(input_documents=relevant_documents,
161
  question=query)
162
 
163
  if self.memory:
164
  self.memory.save_context({"input": query}, {"output": response})
165
+ return response, relevant_document_coordinates
166
 
167
  def _get_context(self, doc_id, query, context_size=4):
168
  db = self.embeddings_dict[doc_id]
 
195
  if verbose:
196
  print("File", pdf_file_path)
197
  filename = Path(pdf_file_path).stem
198
+ coordinates = True if chunk_size == -1 else False
199
+ structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
200
 
201
  biblio = structure['biblio']
202
  biblio['filename'] = filename.replace(" ", "_")
 
217
  biblio_copy['type'] = passage['type']
218
  biblio_copy['section'] = passage['section']
219
  biblio_copy['subSection'] = passage['subSection']
220
+ biblio_copy['coordinates'] = passage['coordinates']
221
  metadatas.append(biblio_copy)
222
 
223
  ids.append(passage['passage_id'])
document_qa/grobid_processors.py CHANGED
@@ -131,13 +131,13 @@ class GrobidProcessor(BaseProcessor):
131
  # super().__init__()
132
  self.grobid_client = grobid_client
133
 
134
- def process_structure(self, input_path):
135
  pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
136
  input_path,
137
  consolidate_header=True,
138
  consolidate_citations=False,
139
- segment_sentences=False,
140
- tei_coordinates=False,
141
  include_raw_citations=False,
142
  include_raw_affiliations=False,
143
  generateIDs=True)
@@ -145,7 +145,7 @@ class GrobidProcessor(BaseProcessor):
145
  if status != 200:
146
  return
147
 
148
- output_data = self.parse_grobid_xml(text)
149
  output_data['filename'] = Path(pdf_file).stem.replace(".tei", "")
150
 
151
  return output_data
@@ -159,7 +159,7 @@ class GrobidProcessor(BaseProcessor):
159
 
160
  return doc
161
 
162
- def parse_grobid_xml(self, text):
163
  output_data = OrderedDict()
164
 
165
  doc_biblio = grobid_tei_xml.parse_document_xml(text)
@@ -188,17 +188,20 @@ class GrobidProcessor(BaseProcessor):
188
  # "passage_id": "title0"
189
  # })
190
 
 
 
191
  if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
192
  passages.append({
193
  "text": self.post_process(doc_biblio.abstract),
194
- "type": "paragraph",
195
  "section": "<header>",
196
  "subSection": "<abstract>",
197
- "passage_id": "abstract0"
 
198
  })
199
 
200
  soup = BeautifulSoup(text, 'xml')
201
- text_blocks_body = get_children_body(soup, verbose=False)
202
 
203
  passages.extend([
204
  {
@@ -206,10 +209,12 @@ class GrobidProcessor(BaseProcessor):
206
  text.parent.name != "ref" or (
207
  text.parent.name == "ref" and text.parent.attrs[
208
  'type'] != 'bibr'))),
209
- "type": "paragraph",
210
  "section": "<body>",
211
- "subSection": "<paragraph>",
212
- "passage_id": str(paragraph_id) + str(sentence_id)
 
 
213
  }
214
  for paragraph_id, paragraph in enumerate(text_blocks_body) for
215
  sentence_id, sentence in enumerate(paragraph)
@@ -223,10 +228,11 @@ class GrobidProcessor(BaseProcessor):
223
  text.parent.name != "ref" or (
224
  text.parent.name == "ref" and text.parent.attrs[
225
  'type'] != 'bibr'))),
226
- "type": "paragraph",
227
  "section": "<body>",
228
  "subSection": "<figure>",
229
- "passage_id": str(paragraph_id) + str(sentence_id)
 
230
  }
231
  for paragraph_id, paragraph in enumerate(text_blocks_figures) for
232
  sentence_id, sentence in enumerate(paragraph)
 
131
  # super().__init__()
132
  self.grobid_client = grobid_client
133
 
134
+ def process_structure(self, input_path, coordinates=False):
135
  pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
136
  input_path,
137
  consolidate_header=True,
138
  consolidate_citations=False,
139
+ segment_sentences=True,
140
+ tei_coordinates=coordinates,
141
  include_raw_citations=False,
142
  include_raw_affiliations=False,
143
  generateIDs=True)
 
145
  if status != 200:
146
  return
147
 
148
+ output_data = self.parse_grobid_xml(text, coordinates=coordinates)
149
  output_data['filename'] = Path(pdf_file).stem.replace(".tei", "")
150
 
151
  return output_data
 
159
 
160
  return doc
161
 
162
+ def parse_grobid_xml(self, text, coordinates=False):
163
  output_data = OrderedDict()
164
 
165
  doc_biblio = grobid_tei_xml.parse_document_xml(text)
 
188
  # "passage_id": "title0"
189
  # })
190
 
191
+ passage_type = "sentence" if coordinates else "paragraph"
192
+
193
  if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
194
  passages.append({
195
  "text": self.post_process(doc_biblio.abstract),
196
+ "type": passage_type,
197
  "section": "<header>",
198
  "subSection": "<abstract>",
199
+ "passage_id": "abstract0",
200
+ "coordinates": ""
201
  })
202
 
203
  soup = BeautifulSoup(text, 'xml')
204
+ text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=False)
205
 
206
  passages.extend([
207
  {
 
209
  text.parent.name != "ref" or (
210
  text.parent.name == "ref" and text.parent.attrs[
211
  'type'] != 'bibr'))),
212
+ "type": passage_type,
213
  "section": "<body>",
214
+ "subSection": "<sentence>",
215
+ "passage_id": str(paragraph_id) + str(sentence_id),
216
+ # "coordinates": sentence['coords'].split(";") if coordinates else []
217
+ "coordinates": sentence['coords'] if coordinates else ""
218
  }
219
  for paragraph_id, paragraph in enumerate(text_blocks_body) for
220
  sentence_id, sentence in enumerate(paragraph)
 
228
  text.parent.name != "ref" or (
229
  text.parent.name == "ref" and text.parent.attrs[
230
  'type'] != 'bibr'))),
231
+ "type": passage_type,
232
  "section": "<body>",
233
  "subSection": "<figure>",
234
+ "passage_id": str(paragraph_id) + str(sentence_id),
235
+ "coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
236
  }
237
  for paragraph_id, paragraph in enumerate(text_blocks_figures) for
238
  sentence_id, sentence in enumerate(paragraph)
streamlit_app.py CHANGED
@@ -59,6 +59,12 @@ if 'memory' not in st.session_state:
59
  if 'binary' not in st.session_state:
60
  st.session_state['binary'] = None
61
 
 
 
 
 
 
 
62
  st.set_page_config(
63
  page_title="Scientific Document Insights Q/A",
64
  page_icon="📝",
@@ -290,7 +296,7 @@ with st.sidebar:
290
  mode = st.radio("Query mode", ("LLM", "Embeddings"), disabled=not uploaded_file, index=0, horizontal=True,
291
  help="LLM will respond the question, Embedding will show the "
292
  "paragraphs relevant to the question in the paper.")
293
- chunk_size = st.slider("Chunks size", 100, 2000, value=250,
294
  help="Size of chunks in which the document is partitioned",
295
  disabled=uploaded_file is not None)
296
  context_size = st.slider("Context size", 3, 10, value=4,
@@ -320,8 +326,6 @@ with st.sidebar:
320
  st.markdown(
321
  """If you switch the mode to "Embedding," the system will return specific chunks from the document that are semantically related to your query. This mode helps to test why sometimes the answers are not satisfying or incomplete. """)
322
 
323
-
324
-
325
  if uploaded_file and not st.session_state.loaded_embeddings:
326
  if model not in st.session_state['api_keys']:
327
  st.error("Before uploading a document, you must enter the API key. ")
@@ -344,8 +348,8 @@ if uploaded_file and not st.session_state.loaded_embeddings:
344
  # timestamp = datetime.utcnow()
345
 
346
  with left_column:
347
- if st.session_state['binary']:
348
- pdf_viewer(st.session_state['binary'])
349
 
350
  with right_column:
351
  # css = '''
@@ -389,8 +393,14 @@ with right_column:
389
  context_size=context_size)
390
  elif mode == "LLM":
391
  with st.spinner("Generating response..."):
392
- _, text_response = st.session_state['rqa'][model].query_document(question, st.session_state.doc_id,
393
- context_size=context_size)
 
 
 
 
 
 
394
 
395
  if not text_response:
396
  st.error("Something went wrong. Contact Luca Foppiano ([email protected]) to report the issue.")
 
59
  if 'binary' not in st.session_state:
60
  st.session_state['binary'] = None
61
 
62
+ if 'annotations' not in st.session_state:
63
+ st.session_state['annotations'] = None
64
+
65
+ if 'pdf' not in st.session_state:
66
+ st.session_state['pdf'] = None
67
+
68
  st.set_page_config(
69
  page_title="Scientific Document Insights Q/A",
70
  page_icon="📝",
 
296
  mode = st.radio("Query mode", ("LLM", "Embeddings"), disabled=not uploaded_file, index=0, horizontal=True,
297
  help="LLM will respond the question, Embedding will show the "
298
  "paragraphs relevant to the question in the paper.")
299
+ chunk_size = st.slider("Chunks size", -1, 2000, value=250,
300
  help="Size of chunks in which the document is partitioned",
301
  disabled=uploaded_file is not None)
302
  context_size = st.slider("Context size", 3, 10, value=4,
 
326
  st.markdown(
327
  """If you switch the mode to "Embedding," the system will return specific chunks from the document that are semantically related to your query. This mode helps to test why sometimes the answers are not satisfying or incomplete. """)
328
 
 
 
329
  if uploaded_file and not st.session_state.loaded_embeddings:
330
  if model not in st.session_state['api_keys']:
331
  st.error("Before uploading a document, you must enter the API key. ")
 
348
  # timestamp = datetime.utcnow()
349
 
350
  with left_column:
351
+ if st.session_state['annotations']:
352
+ pdf_viewer(input=st.session_state['binary'], annotations=st.session_state['annotations'])
353
 
354
  with right_column:
355
  # css = '''
 
393
  context_size=context_size)
394
  elif mode == "LLM":
395
  with st.spinner("Generating response..."):
396
+ _, text_response, coordinates = st.session_state['rqa'][model].query_document(question,
397
+ st.session_state.doc_id,
398
+ context_size=context_size)
399
+ st.session_state['annotations'] = [
400
+ {"page": coo[0], "x": coo[1], "y": coo[2], "width": coo[3], "height": coo[4], "color": "blue"} for coo in [c.split(",") for coord in
401
+ coordinates for c in coord]]
402
+ # with left_column:
403
+ # pdf_viewer(input=st.session_state['binary'], annotations=st.session_state['annotations'], key=1)
404
 
405
  if not text_response:
406
  st.error("Something went wrong. Contact Luca Foppiano ([email protected]) to report the issue.")