Spaces:
Build error
Build error
# Copyright (c) 2022, Lawrence Livermore National Security, LLC. | |
# All rights reserved. | |
# See the top-level LICENSE and NOTICE files for details. | |
# LLNL-CODE-838964 | |
# SPDX-License-Identifier: Apache-2.0-with-LLVM-exception | |
from pdfminer.pdfpage import PDFParser | |
from pdfminer.pdfpage import PDFDocument | |
from pdfminer.pdfpage import PDFPage | |
from pdfminer.layout import LTTextBoxHorizontal | |
from pdfminer.layout import LTTextLineHorizontal | |
from pdfminer.layout import LTChar | |
from pdfminer.layout import LAParams | |
from pdfminer.layout import LTRect | |
from pdfminer.layout import LTFigure | |
from pdfminer.converter import PDFPageAggregator | |
from pdfminer.pdfinterp import PDFResourceManager | |
from pdfminer.pdfinterp import PDFPageInterpreter | |
from pdfminer import pdfinterp | |
from collections.abc import Iterable | |
from collections import Counter | |
from collections import OrderedDict | |
import os | |
# This is used for highlighting in PDFs
from PyPDF2.generic import ( | |
DictionaryObject, | |
NumberObject, | |
FloatObject, | |
NameObject, | |
TextStringObject, | |
ArrayObject | |
) | |
# Used to extract pages | |
from PyPDF2 import PdfFileReader, PdfFileWriter | |
def get_page_sizes(document):
    """
    Collect the media box of every page in a PDF file.

    :param document: string path of the PDF file to read
    :returns: list with one entry per page, in document order; each entry is
        the page's ``mediabox`` (the page size as x0 y0 x1 y1)
    """
    # Use a context manager so the file handle is released even when parsing
    # raises; previously the handle was opened and never closed.
    with open(document, 'rb') as pdf_file:
        doc = PDFDocument(PDFParser(pdf_file))
        # Build the list while the file is still open: create_pages() reads
        # the pages from the underlying file as it is iterated.
        return [page.mediabox for page in PDFPage.create_pages(doc)]
def get_page_count(document):
    """
    Read the page count stored in the PDF catalog.

    :param document: open binary file object of the PDF
    :returns: the ``Count`` entry of the resolved ``Pages`` catalog entry
    """
    # Is there a better way of getting the page count than doing this?
    catalog = PDFDocument(PDFParser(document)).catalog
    pages_root = pdfinterp.resolve1(catalog['Pages'])
    return pages_root['Count']
def get_pdf_page_count(filename):
    """
    Return the page count of the PDF stored at ``filename``.

    :param filename: string path of the PDF file
    :returns: whatever ``get_page_count`` reports for the opened file
    """
    with open(filename, 'rb') as pdf_file:
        page_total = get_page_count(pdf_file)
    return page_total
def get_pages(document, page_numbers = None):
    """
    Lazily run PDFMiner layout analysis over pages of an open PDF.

    :param document: open binary file object of the PDF; it must stay open
        for as long as the generator is being consumed
    :param page_numbers: optional iterable of zero-based page indexes to
        process; every page is processed when None
    :returns: generator of ``(layout, page_number)`` tuples in document order,
        where ``layout`` is the object returned by the aggregator's
        ``get_result()`` for that page
    """
    # Create resource manager
    rsrcmgr = PDFResourceManager()
    # Create a PDF page aggregator object with default analysis parameters.
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    page_count = get_page_count(document)
    if page_numbers is None:
        page_numbers = range(page_count)
    else:
        # PDFPage.get_pages() yields the selected pages in document order, so
        # the labels zipped against it must be sorted, unique and in range;
        # otherwise a layout gets paired with the wrong page number.
        page_numbers = [number for number in sorted(set(page_numbers))
                        if 0 <= number < page_count]
    if not page_numbers:
        # An empty selection would be read by pdfminer as "all pages".
        return
    selected_pages = PDFPage.get_pages(document, page_numbers)
    for page, page_number in zip(selected_pages, page_numbers):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        yield layout, page_number
def partial_overlaps(box, other):
    """
    Determine if the two bounding boxes overlap and at least one edge of
    ``box`` lies strictly inside the extent of ``other`` on its axis.

    TODO: Really should just use a standard Python library for this.

    box -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    """
    def edge_inside(edge, low, high):
        # True when low < edge < high (strict on both sides).
        return low < edge and high > edge

    x_hit = (edge_inside(box[0], other[0], other[2]) or
             edge_inside(box[2], other[0], other[2]))
    y_hit = (edge_inside(box[1], other[1], other[3]) or
             edge_inside(box[3], other[1], other[3]))
    if not (x_hit or y_hit):
        return False
    # TODO: Simplify?
    return overlaps(box, other)
def overlaps(box, other):
    """
    Report whether two bounding boxes share any area or touch.

    Boxes that merely share an edge or a corner count as overlapping.

    TODO: Really should just use a standard Python library for this.

    box -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)
    """
    # Separated horizontally: box is entirely right or left of other.
    if box[0] > other[2] or box[2] < other[0]:
        return False
    # Separated vertically: box is entirely above or below other.
    if box[1] > other[3] or box[3] < other[1]:
        return False
    return True
def union(src, other):
    """
    Compute the smallest bounding box enclosing both inputs.

    src -- 2 coordinate bounding box (x1,y1,x2,y2)
    other -- 2 coordinate bounding box (x1,y1,x2,y2)

    returns a new list [xmin, ymin, xmax, ymax]; neither input is modified
    """
    lower = [min(src[i], other[i]) for i in (0, 1)]
    upper = [max(src[i], other[i]) for i in (2, 3)]
    return lower + upper
# See: https://gist.github.com/agentcooper/4c55133f5d95866acdee5017cd318558#file-pypdf2highlight-py | |
# x1, y1 starts in bottom left corner | |
def createHighlight(x1, y1, x2, y2, meta, color = [1, 0, 0]):
    """
    Build a PDF highlight annotation dictionary for a rectangle.

    x1, y1 -- first corner (the bottom left corner per the note above this
              function's source link)
    x2, y2 -- opposite corner
    meta   -- mapping providing the "author" and "contents" strings
    color  -- sequence of color components, each stored as a FloatObject;
              the default list is only read, never modified

    returns the populated DictionaryObject
    """
    annotation = DictionaryObject()
    annotation.update({
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/T"): TextStringObject(meta["author"]),
        NameObject("/Contents"): TextStringObject(meta["contents"]),
        NameObject("/C"): ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"): ArrayObject(
            [FloatObject(value) for value in (x1, y1, x2, y2)]
        ),
        # Corner order: (x1,y2), (x2,y2), (x1,y1), (x2,y1).
        NameObject("/QuadPoints"): ArrayObject(
            [FloatObject(value)
             for value in (x1, y2, x2, y2, x1, y1, x2, y1)]
        ),
    })
    return annotation
def addHighlightToPage(highlight, page, output):
    """
    Register ``highlight`` with the writer and attach it to ``page``.

    highlight -- annotation object to add
    page      -- page mapping whose "/Annots" entry receives the reference
    output    -- writer object providing ``_addObject``
    """
    reference = output._addObject(highlight)
    annots_key = NameObject("/Annots")
    if "/Annots" not in page:
        # First annotation on this page: create the array.
        page[annots_key] = ArrayObject([reference])
        return
    page[annots_key].append(reference)
def get_pdf_words(document, page_numbers=None):
    """
    Get all words from LTChar or LTTextLineHorizontal objects from the document.

    :param document: string path of the PDF file to process
    :param page_numbers: optional page selection forwarded to ``get_pages``;
        all pages when None
    :returns: A map of page #'s containing lists of coordinates and PDFMiner
        objects. Ex.: {page_number: [[x1, y1, x2, y2, <LTTextLineHorizontal>],]}
        A processed page without matching objects maps to an empty list.
    """
    bboxes = {}
    # get_pages() reads lazily from the file, so the whole loop must run while
    # the handle is open; the context manager closes it afterwards (the
    # handle used to be leaked).
    with open(document, 'rb') as pdf_doc:
        for layout, page in get_pages(pdf_doc, page_numbers):
            page_items = bboxes[page] = []
            for element in layout:
                if not isinstance(element, Iterable):
                    continue  # not iterable
                for subElement in element:
                    # TODO: Handle word delimiter (a ' ' LTChar).
                    if isinstance(subElement, (LTChar, LTTextLineHorizontal)):
                        # Store the bbox coordinates followed by the object.
                        page_items.append(list(subElement.bbox) + [subElement])
    return bboxes
def get_paragraphs(words):
    """
    Group text lines into paragraphs using font height, vertical gaps and
    indentation.

    :param words: map of page number to a list of elements indexed as
        [x0, y0, x1, y1, obj], where obj provides ``get_text()`` (the shape
        produced by ``get_pdf_words``)
    :returns: tuple ``(paragraphs, indexes)``: ``paragraphs`` is a list of
        paragraph strings across all pages, and ``indexes[i]`` is the
        cumulative character length of ``paragraphs[0..i]``

    Pages without any elements are skipped. Progress is reported with
    ``print`` calls.
    """
    # Minimum x shift between lines that counts as an indentation change.
    paragraph_tolerance = 0.1
    # Elements whose height differs from the nominal one by more than this
    # are treated as non-body text and break the current paragraph.
    max_height_diff = 1
    paragraphs = []
    for page, elements in words.items():
        if not elements:
            # A page without elements (e.g. a blank page) has no nominal
            # font; most_common(1)[0] would raise IndexError, so skip it.
            continue
        # Find nominal font size: the most frequent element height,
        # rounded to int.
        freq = Counter()
        for element in elements:
            height = int(element[3] - element[1])
            freq[height] += 1
        nominal_font = freq.most_common(1)[0][0]
        print("Nominal font is:", nominal_font)
        print("Page:", page)
        x_offset_prev_line = None
        prev_y_offset = None
        paragraph_content = ""
        first_line = False
        processed_first_line = False
        for element in elements:
            x_offset = element[0]
            y_offset = element[1]
            height = int(element[3] - element[1])
            text = element[4].get_text()
            if x_offset_prev_line is not None:
                large_x_offset = (abs(x_offset_prev_line - x_offset) > paragraph_tolerance)
            # Font size mismatch?
            if abs(height - nominal_font) > max_height_diff:
                if len(paragraph_content) > 0:
                    print("Content append:", len(paragraph_content))
                    paragraphs.append(paragraph_content)
                    paragraph_content = ""
                print("Continue due to height != nominal_font")
                continue
            print("ELEMENT:", element[0:4], text[0:15])
            if prev_y_offset is not None and len(paragraph_content) > 0:
                # A drop of more than 1.5 element heights below the previous
                # element starts a new paragraph (presumably y grows upward,
                # as in PDF coordinates; confirm against the input).
                if y_offset < prev_y_offset - height * 1.5:
                    print("Content append:", len(paragraph_content))
                    if len(paragraph_content) > 0:
                        paragraphs.append(paragraph_content)
                    paragraph_content = text
                    prev_y_offset = None
                    continue
                prev_y_offset = y_offset
            prev_y_offset = y_offset
            # Only whole text lines take part in the indentation logic.
            if not isinstance(element[4], LTTextLineHorizontal):
                continue
            # Find first paragraph
            if x_offset_prev_line is None:
                x_offset_prev_line = x_offset
                if not processed_first_line:
                    first_line = True
                    processed_first_line = True
                if height == nominal_font:
                    paragraph_content += text
                continue
            # Check case if first line was indented
            if x_offset_prev_line > x_offset and first_line:
                first_line = False
                paragraph_content += text
                x_offset_prev_line = x_offset
                continue
            # is this indented?
            # and ignore small changes
            # (large_x_offset is always bound here: x_offset_prev_line was
            # not None at the top of this iteration.)
            if x_offset_prev_line < x_offset and large_x_offset:
                if height == nominal_font and len(paragraph_content) > 0:
                    paragraphs.append(paragraph_content)
                paragraph_content = text
                # Reset at next line read
                # What if next paragraph is also indented???
                x_offset_prev_line = None
                continue
            if height == nominal_font:
                paragraph_content += text
        # Flush whatever is left at the end of the page.
        # TODO: Remove redundant space
        if paragraph_content != "":
            paragraphs.append(paragraph_content)
    # Find paragraph indexes
    c = 0
    indexes = []
    for p in paragraphs:
        c += len(p)
        indexes.append(c)
    return paragraphs, indexes
def get_pdf_elements(document, element_type, page_numbers=None):
    """
    Collect the bounding boxes of top-level layout elements of a given type.

    :param document: string path of the PDF file to process
    :param element_type: class (or tuple of classes) passed to ``isinstance``
        to select layout elements
    :param page_numbers: optional page selection forwarded to ``get_pages``;
        all pages when None
    :returns: map of page number to a list of items; each item is the
        element's bbox as a list, with the element's ``non_stroking_color``
        appended when it has that attribute

    The collected map is also printed before being returned.
    """
    items = {}
    # get_pages() reads lazily from the file, so iterate inside the ``with``
    # block; the handle is closed afterwards (it used to be leaked).
    with open(document, 'rb') as pdf_doc:
        for layout, page in get_pages(pdf_doc, page_numbers):
            page_items = items[page] = []
            for element in layout:
                if not isinstance(element, element_type):
                    continue
                item = list(element.bbox)
                if hasattr(element, 'non_stroking_color'):
                    item.append(element.non_stroking_color)
                page_items.append(item)
    print(items)
    return items
def get_large_colored_background_rectangles(document, page_numbers=None):
    """
    Select the LTRect elements that exceed a minimum width and height.

    :param document: string path of the PDF file, forwarded to
        ``get_pdf_elements``
    :param page_numbers: optional page selection, forwarded as well
    :returns: map of page number to the list of qualifying rectangle items;
        pages without a qualifying rectangle are absent from the map
    """
    # Only include rectangles that are larger than 4" x 1" in size
    min_width, min_height = 288.0, 72.0
    large_rects = {}
    rects_by_page = get_pdf_elements(document, LTRect, page_numbers)
    for page, rects in rects_by_page.items():
        print("Rects:", rects)
        for rect in rects:
            width = rect[2] - rect[0]
            height = rect[3] - rect[1]
            print("Dimensions:", width, height)
            if not (width > min_width and height > min_height):
                continue
            # Create the page's list lazily, on its first qualifying rect.
            large_rects.setdefault(page, []).append(rect)
    return large_rects
def extract_pages(document, output, page_numbers=None):
    """
    Copy selected pages of a PDF into a new PDF file.

    :param document: source PDF, passed unchanged to ``PdfFileReader``
    :param output: path of the PDF file to write (opened in "wb" mode)
    :param page_numbers: iterable of page indexes passed to
        ``PdfFileReader.getPage``, copied in the given order; every page of
        the source is copied when None
    """
    pdf = PdfFileReader(document)
    if page_numbers is None:
        # The declared default used to fail with a TypeError when iterated;
        # treat it as "all pages" instead.
        page_numbers = range(pdf.getNumPages())
    pdf_writer = PdfFileWriter()
    for page in page_numbers:
        pdf_writer.addPage(pdf.getPage(page))
    with open(output, "wb") as out:
        pdf_writer.write(out)