Spaces:
Build error
Build error
File size: 13,305 Bytes
c14d9ad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 |
# Copyright (c) 2022, Lawrence Livermore National Security, LLC.
# All rights reserved.
# See the top-level LICENSE and NOTICE files for details.
# LLNL-CODE-838964
# SPDX-License-Identifier: Apache-2.0-with-LLVM-exception
from pdfminer.pdfpage import PDFParser
from pdfminer.pdfpage import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal
from pdfminer.layout import LTTextLineHorizontal
from pdfminer.layout import LTChar
from pdfminer.layout import LAParams
from pdfminer.layout import LTRect
from pdfminer.layout import LTFigure
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer import pdfinterp
from collections.abc import Iterable
from collections import Counter
from collections import OrderedDict
import os
# This is used for highlighting in PDFs
from PyPDF2.generic import (
DictionaryObject,
NumberObject,
FloatObject,
NameObject,
TextStringObject,
ArrayObject
)
# Used to extract pages
from PyPDF2 import PdfFileReader, PdfFileWriter
def get_page_sizes(document):
    """
    Collect the media box of every page in a PDF file.

    document -- path of the PDF file to read
    returns a list with one entry per page; each entry is the page's
            mediabox (the page size as x0 y0 x1 y1)
    """
    # Use a context manager so the file is closed even if parsing fails.
    # The pages are consumed inside the block so the file is still open
    # while PDFMiner reads them.
    with open(document, 'rb') as pdf_file:
        doc = PDFDocument(PDFParser(pdf_file))
        return [page.mediabox for page in PDFPage.create_pages(doc)]
def get_page_count(document):
    """
    Read the number of pages recorded in a PDF's catalog.

    document -- PDF file object opened in binary mode
    returns the 'Count' entry of the resolved 'Pages' catalog entry
    """
    # Is there a better way of getting the page count than doing this?
    catalog = PDFDocument(PDFParser(document)).catalog
    pages_entry = pdfinterp.resolve1(catalog['Pages'])
    return pages_entry['Count']
def get_pdf_page_count(filename):
    """
    Open the PDF at the given path and return its page count.

    filename -- path of the PDF file
    returns whatever get_page_count() reports for the opened file
    """
    with open(filename, 'rb') as pdf_file:
        page_total = get_page_count(pdf_file)
    return page_total
def get_pages(document, page_numbers = None):
    """
    Lay out the selected pages of a PDF and yield them one at a time.

    document -- PDF file object opened in binary mode
    page_numbers -- iterable of page indexes to process (counted from 0, as
                    in range(page_count)); None selects every page
    yields (layout, page_number) tuples, where layout is the result the
           PDFPageAggregator produced for that page
    """
    # Create resource manager
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    page_count = get_page_count(document)
    if page_numbers is None:
        selected = range(page_count)
    else:
        # PDFPage.get_pages() yields the requested pages in document order,
        # so the numbers zipped against it must be sorted, unique and
        # within range; otherwise layouts get paired with the wrong number.
        selected = sorted({n for n in page_numbers if 0 <= n < page_count})

    # An empty selection would make PDFPage.get_pages() walk every page
    # (it treats an empty filter as "no filter"), so stop early instead.
    if not selected:
        return

    for page, page_number in zip(PDFPage.get_pages(document, selected), selected):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        yield layout, page_number
def partial_overlaps(box, other):
    """
    Determine if other crosses an edge of box while the two boxes overlap.

    On each axis, other "crosses" an edge when it starts strictly before and
    ends strictly after box's low edge or box's high edge. The result is
    true when at least one axis is crossed and overlaps(box, other) holds.

    TODO: Really should just use a standard Python library for this.
    box -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    """
    def crosses_edge(lo, hi):
        # other strictly spans box's low edge or box's high edge on this axis
        spans_low = other[lo] < box[lo] and other[hi] > box[lo]
        return spans_low or (other[lo] < box[hi] and other[hi] > box[hi])

    x_crossed = crosses_edge(0, 2)
    y_crossed = crosses_edge(1, 3)
    # TODO: Simplify?
    return (x_crossed or y_crossed) and overlaps(box, other)
def overlaps(box, other):
    """
    Determine if the two bounding boxes overlap each other.

    Boxes that merely touch along an edge or at a corner count as
    overlapping, because only strict separation rules an overlap out.

    TODO: Really should just use a standard Python library for this.
    box -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    """
    # Separated horizontally: box lies entirely right or left of other.
    if box[0] > other[2] or box[2] < other[0]:
        return False
    # Separated vertically: box lies entirely above or below other.
    if box[1] > other[3] or box[3] < other[1]:
        return False
    return True
def union(src, other):
    """
    Compute the smallest box that contains both src and other.

    src -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    returns a new list [xmin, ymin, xmax, ymax]; neither input is modified
    """
    # The lower-left corner takes the minima, the upper-right the maxima.
    lower_left = [min(src[i], other[i]) for i in (0, 1)]
    upper_right = [max(src[i], other[i]) for i in (2, 3)]
    return lower_left + upper_right
# See: https://gist.github.com/agentcooper/4c55133f5d95866acdee5017cd318558#file-pypdf2highlight-py
# x1, y1 starts in bottom left corner
def createHighlight(x1, y1, x2, y2, meta, color = [1, 0, 0]):
    """
    Build a PDF highlight annotation dictionary for a rectangle.

    x1, y1, x2, y2 -- corners of the rectangle to highlight
    meta -- mapping with an "author" entry (stored under /T) and a
            "contents" entry (stored under /Contents)
    color -- components written to the /C entry
    returns a DictionaryObject describing the annotation; it is not yet
            attached to any page
    """
    def float_array(values):
        # Wrap plain numbers as a PDF array of FloatObject entries.
        return ArrayObject([FloatObject(value) for value in values])

    annotation = DictionaryObject()
    annotation.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),
        NameObject("/C"): float_array(color),
        NameObject("/Rect"): float_array((x1, y1, x2, y2)),
        # Corner order: (x1,y2), (x2,y2), (x1,y1), (x2,y1)
        NameObject("/QuadPoints"): float_array(
            (x1, y2, x2, y2, x1, y1, x2, y1)),
    })
    return annotation
def addHighlightToPage(highlight, page, output):
    """
    Register a highlight with the writer and attach it to a page.

    highlight -- annotation object, e.g. the result of createHighlight()
    page -- page object whose /Annots array receives the reference
    output -- writer object; its _addObject() supplies the reference
    """
    reference = output._addObject(highlight)
    annots_key = NameObject("/Annots")
    if "/Annots" not in page:
        # First annotation on this page: start a new array.
        page[annots_key] = ArrayObject([reference])
        return
    page[annots_key].append(reference)
def get_pdf_words(document, page_numbers=None):
    """
    Get all words from LTChar or LTTextLineHorizontal objects from the document.

    :param document: string path of the PDF file to process
    :param page_numbers: page numbers forwarded to get_pages(); None
        processes every page
    :returns: A map of page #'s containing lists of coordinates and PDFMiner
        objects. Ex.: {page_number: [[x1, y1, x2, y2, <LTTextLineHorizontal>],]}
    """
    bboxes = {}
    # Keep the file open only while the lazy page generator is consumed,
    # and make sure it is closed afterwards.
    with open(document, 'rb') as pdf_doc:
        for layout, page in get_pages(pdf_doc, page_numbers):
            page_items = bboxes[page] = []
            for element in layout:
                if not isinstance(element, Iterable):
                    continue  # leaf layout objects have no children
                for subElement in element:
                    # TODO: Handle word delimiter (an LTChar whose text is ' ')
                    if isinstance(subElement, (LTChar, LTTextLineHorizontal)):
                        page_items.append([*subElement.bbox, subElement])
    return bboxes
def get_paragraphs(words):
    """
    Group the text lines of each page into paragraphs.

    A line closes the current paragraph when its height differs from the
    page's most common (nominal) height by more than max_height_diff, when
    its y offset is more than 1.5 line heights smaller than the previous
    line's, or when it is indented relative to the previous line.

    words -- map of page number to a list of [x1, y1, x2, y2, <PDFMiner
             object>] items, as produced by get_pdf_words()
    returns (paragraphs, indexes): the paragraph strings, and for each
            paragraph the cumulative character count up to and including it
    """
    # NOTE(review): the nesting below was reconstructed from a copy of this
    # file whose indentation had been lost -- confirm against the original.
    paragraph_tolerance = 0.1
    max_height_diff = 1
    paragraphs = []

    for page, elements in words.items():
        # A page without elements has no nominal font and no text; skipping
        # it avoids an IndexError from most_common(1)[0] below.
        if not elements:
            continue

        # Find nominal font size (heights truncated to int)
        freq = Counter(int(element[3] - element[1]) for element in elements)
        nominal_font = freq.most_common(1)[0][0]
        print("Nominal font is:", nominal_font)
        print("Page:", page)

        x_offset_prev_line = None
        prev_y_offset = None
        paragraph_content = ""
        large_x_offset = False
        first_line = False
        processed_first_line = False

        for element in elements:
            x_offset = element[0]
            y_offset = element[1]
            height = int(element[3] - element[1])
            text = element[4].get_text()

            if x_offset_prev_line is not None:
                large_x_offset = (
                    abs(x_offset_prev_line - x_offset) > paragraph_tolerance)

            # Font size mismatch? Flush the paragraph and drop this element.
            if abs(height - nominal_font) > max_height_diff:
                if paragraph_content:
                    print("Content append:", len(paragraph_content))
                    paragraphs.append(paragraph_content)
                    paragraph_content = ""
                print("Continue due to height != nominal_font")
                continue

            print("ELEMENT:", element[0:4], text[0:15])

            # A large vertical gap starts a new paragraph with this text.
            if prev_y_offset is not None and paragraph_content:
                if y_offset < prev_y_offset - height * 1.5:
                    print("Content append:", len(paragraph_content))
                    paragraphs.append(paragraph_content)
                    paragraph_content = text
                    prev_y_offset = None
                    continue
            prev_y_offset = y_offset

            if not isinstance(element[4], LTTextLineHorizontal):
                continue

            # First line seen since the last reset: remember its x offset.
            if x_offset_prev_line is None:
                x_offset_prev_line = x_offset
                if not processed_first_line:
                    first_line = True
                    processed_first_line = True
                if height == nominal_font:
                    paragraph_content += text
                continue

            # Check case if first line was indented
            if x_offset_prev_line > x_offset and first_line:
                first_line = False
                paragraph_content += text
                x_offset_prev_line = x_offset
                continue

            # Is this indented? (ignore small changes)
            if x_offset_prev_line < x_offset and large_x_offset:
                if height == nominal_font and paragraph_content:
                    paragraphs.append(paragraph_content)
                paragraph_content = text
                # Reset at next line read
                # What if next paragraph is also indented???
                x_offset_prev_line = None
                continue

            if height == nominal_font:
                paragraph_content += text

        # TODO: Remove redundant space
        if paragraph_content != "":
            paragraphs.append(paragraph_content)

    # Find paragraph indexes (running total of paragraph lengths)
    c = 0
    indexes = []
    for p in paragraphs:
        c += len(p)
        indexes.append(c)
    return paragraphs, indexes
def get_pdf_elements(document, element_type, page_numbers=None):
    """
    Collect the bounding boxes of all top-level layout elements of a type.

    document -- string path of the PDF file to process
    element_type -- class (or tuple of classes) to match with isinstance()
    page_numbers -- page numbers forwarded to get_pages(); None processes
                    every page
    returns a map of page number to a list of [x1, y1, x2, y2] items; when
            the element has a non_stroking_color attribute, its value is
            appended as a fifth entry
    """
    items = {}
    # Keep the file open only while the lazy page generator is consumed,
    # and make sure it is closed afterwards.
    with open(document, 'rb') as pdf_doc:
        for layout, page in get_pages(pdf_doc, page_numbers):
            page_items = items[page] = []
            for element in layout:
                if not isinstance(element, element_type):
                    continue
                item = list(element.bbox)
                if hasattr(element, 'non_stroking_color'):
                    item.append(element.non_stroking_color)
                page_items.append(item)
    print(items)
    return items
def get_large_colored_background_rectangles(document, page_numbers=None):
    """
    Find the large LTRect elements on each page of a PDF.

    Only size is checked here; no filtering on color is applied.

    document -- string path of the PDF file to process
    page_numbers -- passed through to get_pdf_elements()
    returns a map of page number to the rectangles that are both wider and
            taller than the minimum size; pages without such a rectangle
            have no entry
    """
    # Only include rectangles that are at least 4" x 1" in size
    min_width, min_height = 288.0, 72.0
    rects_out = {}
    found = get_pdf_elements(document, LTRect, page_numbers)
    for page, rects in found.items():
        print("Rects:", rects)
        for rect in rects:
            width = rect[2] - rect[0]
            height = rect[3] - rect[1]
            print("Dimensions:", width, height)
            if not (width > min_width and height > min_height):
                continue
            rects_out.setdefault(page, []).append(rect)
    return rects_out
def extract_pages(document, output, page_numbers=None):
    """
    Copy selected pages of a PDF into a new PDF file.

    document -- source PDF, passed to PdfFileReader
    output -- path of the PDF file to write
    page_numbers -- iterable of page indexes to copy, in the order given;
                    None copies every page (previously None raised a
                    TypeError)
    """
    pdf = PdfFileReader(document)
    if page_numbers is None:
        page_numbers = range(pdf.getNumPages())

    pdf_writer = PdfFileWriter()
    for page in page_numbers:
        pdf_writer.addPage(pdf.getPage(page))

    with open(output, "wb") as out:
        pdf_writer.write(out)
|