lfoppiano committed on
Commit
5fd26bb
·
1 Parent(s): 9d4be7c

use paragraphs instead of sentences

Browse files
document_qa/document_qa_engine.py CHANGED
@@ -56,7 +56,7 @@ class DocumentQAEngine:
56
  grobid_client = GrobidClient(
57
  grobid_server=self.grobid_url,
58
  batch_size=1000,
59
- coordinates=["s"],
60
  sleep_time=5,
61
  timeout=60,
62
  check_server=True
 
56
  grobid_client = GrobidClient(
57
  grobid_server=self.grobid_url,
58
  batch_size=1000,
59
+ coordinates=["p"],
60
  sleep_time=5,
61
  timeout=60,
62
  check_server=True
document_qa/grobid_processors.py CHANGED
@@ -136,7 +136,7 @@ class GrobidProcessor(BaseProcessor):
136
  input_path,
137
  consolidate_header=True,
138
  consolidate_citations=False,
139
- segment_sentences=True,
140
  tei_coordinates=coordinates,
141
  include_raw_citations=False,
142
  include_raw_affiliations=False,
@@ -188,7 +188,7 @@ class GrobidProcessor(BaseProcessor):
188
  # "passage_id": "title0"
189
  # })
190
 
191
- passage_type = "sentence" if coordinates else "paragraph"
192
 
193
  if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
194
  passages.append({
@@ -201,42 +201,74 @@ class GrobidProcessor(BaseProcessor):
201
  })
202
 
203
  soup = BeautifulSoup(text, 'xml')
204
- text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=False)
205
-
206
- passages.extend([
207
- {
208
- "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
209
- text.parent.name != "ref" or (
210
- text.parent.name == "ref" and text.parent.attrs[
211
- 'type'] != 'bibr'))),
212
- "type": passage_type,
213
- "section": "<body>",
214
- "subSection": "<sentence>",
215
- "passage_id": str(paragraph_id) + str(sentence_id),
216
- # "coordinates": sentence['coords'].split(";") if coordinates else []
217
- "coordinates": sentence['coords'] if coordinates else ""
218
- }
219
- for paragraph_id, paragraph in enumerate(text_blocks_body) for
220
- sentence_id, sentence in enumerate(paragraph)
221
- ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
  text_blocks_figures = get_children_figures(soup, verbose=False)
224
 
225
- passages.extend([
226
- {
227
- "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
228
- text.parent.name != "ref" or (
229
- text.parent.name == "ref" and text.parent.attrs[
230
- 'type'] != 'bibr'))),
231
- "type": passage_type,
232
- "section": "<body>",
233
- "subSection": "<figure>",
234
- "passage_id": str(paragraph_id) + str(sentence_id),
235
- "coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
236
- }
237
- for paragraph_id, paragraph in enumerate(text_blocks_figures) for
238
- sentence_id, sentence in enumerate(paragraph)
239
- ])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
  return output_data
242
 
@@ -532,6 +564,21 @@ class GrobidAggregationProcessor(GrobidProcessor, GrobidQuantitiesProcessor, Gro
532
  def extract_materials(self, text):
533
  return self.gmp.extract_materials(text)
534
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  @staticmethod
536
  def prune_overlapping_annotations(entities: list) -> list:
537
  # Sorting by offsets
@@ -742,7 +789,8 @@ def get_children_body(soup: object, use_paragraphs: object = True, verbose: obje
742
  child_name = "p" if use_paragraphs else "s"
743
  for child in soup.TEI.children:
744
  if child.name == 'text':
745
- children.extend([subchild.find_all(child_name) for subchild in child.find_all("body")])
 
746
 
747
  if verbose:
748
  print(str(children))
@@ -755,7 +803,8 @@ def get_children_figures(soup: object, use_paragraphs: object = True, verbose: o
755
  child_name = "p" if use_paragraphs else "s"
756
  for child in soup.TEI.children:
757
  if child.name == 'text':
758
- children.extend([subchild.find_all("figDesc") for subchild in child.find_all("body")])
 
759
 
760
  if verbose:
761
  print(str(children))
 
136
  input_path,
137
  consolidate_header=True,
138
  consolidate_citations=False,
139
+ segment_sentences=False,
140
  tei_coordinates=coordinates,
141
  include_raw_citations=False,
142
  include_raw_affiliations=False,
 
188
  # "passage_id": "title0"
189
  # })
190
 
191
+ passage_type = "paragraph"
192
 
193
  if doc_biblio.abstract is not None and len(doc_biblio.abstract) > 0:
194
  passages.append({
 
201
  })
202
 
203
  soup = BeautifulSoup(text, 'xml')
204
+ text_blocks_body = get_children_body(soup, verbose=False, use_paragraphs=True)
205
+
206
+ use_paragraphs = True
207
+ if not use_paragraphs:
208
+ passages.extend([
209
+ {
210
+ "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
211
+ text.parent.name != "ref" or (
212
+ text.parent.name == "ref" and text.parent.attrs[
213
+ 'type'] != 'bibr'))),
214
+ "type": passage_type,
215
+ "section": "<body>",
216
+ "subSection": "<paragraph>",
217
+ "passage_id": str(paragraph_id),
218
+ "coordinates": paragraph['coords'] if coordinates and sentence.has_attr('coords') else ""
219
+ }
220
+ for paragraph_id, paragraph in enumerate(text_blocks_body) for
221
+ sentence_id, sentence in enumerate(paragraph)
222
+ ])
223
+ else:
224
+ passages.extend([
225
+ {
226
+ "text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
227
+ text.parent.name != "ref" or (
228
+ text.parent.name == "ref" and text.parent.attrs[
229
+ 'type'] != 'bibr'))),
230
+ "type": passage_type,
231
+ "section": "<body>",
232
+ "subSection": "<paragraph>",
233
+ "passage_id": str(paragraph_id),
234
+ "coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
235
+ }
236
+ for paragraph_id, paragraph in enumerate(text_blocks_body)
237
+ ])
238
 
239
  text_blocks_figures = get_children_figures(soup, verbose=False)
240
 
241
+ if not use_paragraphs:
242
+ passages.extend([
243
+ {
244
+ "text": self.post_process(''.join(text for text in sentence.find_all(text=True) if
245
+ text.parent.name != "ref" or (
246
+ text.parent.name == "ref" and text.parent.attrs[
247
+ 'type'] != 'bibr'))),
248
+ "type": passage_type,
249
+ "section": "<body>",
250
+ "subSection": "<figure>",
251
+ "passage_id": str(paragraph_id) + str(sentence_id),
252
+ "coordinates": sentence['coords'] if coordinates and 'coords' in sentence else ""
253
+ }
254
+ for paragraph_id, paragraph in enumerate(text_blocks_figures) for
255
+ sentence_id, sentence in enumerate(paragraph)
256
+ ])
257
+ else:
258
+ passages.extend([
259
+ {
260
+ "text": self.post_process(''.join(text for text in paragraph.find_all(text=True) if
261
+ text.parent.name != "ref" or (
262
+ text.parent.name == "ref" and text.parent.attrs[
263
+ 'type'] != 'bibr'))),
264
+ "type": passage_type,
265
+ "section": "<body>",
266
+ "subSection": "<figure>",
267
+ "passage_id": str(paragraph_id),
268
+ "coordinates": paragraph['coords'] if coordinates and paragraph.has_attr('coords') else ""
269
+ }
270
+ for paragraph_id, paragraph in enumerate(text_blocks_figures)
271
+ ])
272
 
273
  return output_data
274
 
 
564
  def extract_materials(self, text):
565
  return self.gmp.extract_materials(text)
566
 
567
+ @staticmethod
568
+ def box_to_dict(box, color=None, type=None):
569
+
570
+ if box is None or box == "" or len(box) < 5:
571
+ return {}
572
+
573
+ item = {"page": box[0], "x": box[1], "y": box[2], "width": box[3], "height": box[4]}
574
+ if color is not None:
575
+ item['color'] = color
576
+
577
+ if type:
578
+ item['type'] = type
579
+
580
+ return item
581
+
582
  @staticmethod
583
  def prune_overlapping_annotations(entities: list) -> list:
584
  # Sorting by offsets
 
789
  child_name = "p" if use_paragraphs else "s"
790
  for child in soup.TEI.children:
791
  if child.name == 'text':
792
+ children.extend(
793
+ [subchild for subchild in child.find_all("body") for subchild in subchild.find_all(child_name)])
794
 
795
  if verbose:
796
  print(str(children))
 
803
  child_name = "p" if use_paragraphs else "s"
804
  for child in soup.TEI.children:
805
  if child.name == 'text':
806
+ children.extend(
807
+ [subchild for subchilds in child.find_all("body") for subchild in subchilds.find_all("figDesc")])
808
 
809
  if verbose:
810
  print(str(children))
requirements.txt CHANGED
@@ -7,7 +7,7 @@ grobid_tei_xml==0.1.3
7
  tqdm
8
  pyyaml==6.0
9
  pytest
10
- streamlit==1.27.2
11
  lxml
12
  Beautifulsoup4
13
  python-dotenv
 
7
  tqdm
8
  pyyaml==6.0
9
  pytest
10
+ streamlit==1.29.0
11
  lxml
12
  Beautifulsoup4
13
  python-dotenv
streamlit_app.py CHANGED
@@ -296,7 +296,7 @@ with st.sidebar:
296
  mode = st.radio("Query mode", ("LLM", "Embeddings"), disabled=not uploaded_file, index=0, horizontal=True,
297
  help="LLM will respond the question, Embedding will show the "
298
  "paragraphs relevant to the question in the paper.")
299
- chunk_size = st.slider("Chunks size", -1, 2000, value=250,
300
  help="Size of chunks in which the document is partitioned",
301
  disabled=uploaded_file is not None)
302
  context_size = st.slider("Context size", 3, 10, value=4,
@@ -410,8 +410,9 @@ with right_column:
410
  st.session_state.doc_id,
411
  context_size=context_size)
412
  annotations = [
413
- {"page": coo[0], "x": coo[1], "y": coo[2], "width": coo[3], "height": coo[4], "color": "grey"} for coo in [c.split(",") for coord in
414
- coordinates for c in coord]]
 
415
  gradients = generate_color_gradient(len(annotations))
416
  for i, color in enumerate(gradients):
417
  annotations[i]['color'] = color
 
296
  mode = st.radio("Query mode", ("LLM", "Embeddings"), disabled=not uploaded_file, index=0, horizontal=True,
297
  help="LLM will respond the question, Embedding will show the "
298
  "paragraphs relevant to the question in the paper.")
299
+ chunk_size = st.slider("Chunks size", -1, 2000, value=-1,
300
  help="Size of chunks in which the document is partitioned",
301
  disabled=uploaded_file is not None)
302
  context_size = st.slider("Context size", 3, 10, value=4,
 
410
  st.session_state.doc_id,
411
  context_size=context_size)
412
  annotations = [
413
+ GrobidAggregationProcessor.box_to_dict(coo) for coo in [c.split(",") for coord in
414
+ coordinates for c in coord]
415
+ ]
416
  gradients = generate_color_gradient(len(annotations))
417
  for i, color in enumerate(gradients):
418
  annotations[i]['color'] = color
tests/__init__.py ADDED
File without changes
tests/conftest.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from pathlib import Path
3
+ from unittest.mock import MagicMock
4
+
5
+ import pytest
6
+ from _pytest._py.path import LocalPath
7
+
8
+ # derived from https://github.com/elifesciences/sciencebeam-trainer-delft/tree/develop/tests
9
+
10
+ LOGGER = logging.getLogger(__name__)
11
+
12
+
13
+ @pytest.fixture(scope='session', autouse=True)
14
+ def setup_logging():
15
+ logging.root.handlers = []
16
+ logging.basicConfig(level='INFO')
17
+ logging.getLogger('tests').setLevel('DEBUG')
18
+ # logging.getLogger('sciencebeam_trainer_delft').setLevel('DEBUG')
19
+
20
+
21
+ def _backport_assert_called(mock: MagicMock):
22
+ assert mock.called
23
+
24
+
25
+ @pytest.fixture(scope='session', autouse=True)
26
+ def patch_magicmock():
27
+ try:
28
+ MagicMock.assert_called
29
+ except AttributeError:
30
+ MagicMock.assert_called = _backport_assert_called
31
+
32
+
33
+ @pytest.fixture
34
+ def temp_dir(tmpdir: LocalPath):
35
+ # convert to standard Path
36
+ return Path(str(tmpdir))
37
+
tests/resources/2312.07559.paragraphs.tei.xml ADDED
The diff for this file is too large to render. See raw diff
 
tests/resources/2312.07559.sentences.tei.xml ADDED
The diff for this file is too large to render. See raw diff
 
tests/test_grobid_processors.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ from document_qa.grobid_processors import get_children_body
3
+
4
+
5
+ def test_get_children_paragraphs():
6
+ with open("resources/2312.07559.paragraphs.tei.xml", 'r') as fo:
7
+ soup = BeautifulSoup(fo, 'xml')
8
+
9
+ children = get_children_body(soup, use_paragraphs=True)
10
+
11
+ assert len(children) == 70
12
+
13
+
14
+ def test_get_children_sentences():
15
+ with open("resources/2312.07559.sentences.tei.xml", 'r') as fo:
16
+ soup = BeautifulSoup(fo, 'xml')
17
+
18
+ children = get_children_body(soup, use_paragraphs=False)
19
+
20
+ assert len(children) == 327