# DocumentQA / DiT_Extractor / dit_runner.py
# Epoching's picture
# init
# c14d9ad
# Copyright (c) 2022, Lawrence Livermore National Security, LLC.
# All rights reserved.
# See the top-level LICENSE and NOTICE files for details.
# LLNL-CODE-838964
# SPDX-License-Identifier: Apache-2.0-with-LLVM-exception
import cv2
from pathlib import Path
import torch
import json
from detectron2.config import CfgNode as CN
from detectron2.config import get_cfg
from detectron2.utils.visualizer import ColorMode, Visualizer
from detectron2.data import MetadataCatalog
from detectron2.engine import DefaultPredictor
from pdf2image import convert_from_path
from PIL import Image
import numpy as np
from dit_object_detection.ditod import add_vit_config
import base_utils
from pdfminer.layout import LTTextLineHorizontal, LTTextBoxHorizontal, LTAnno, LTChar
from tokenizers.pre_tokenizers import Whitespace
import warnings
warnings.filterwarnings("ignore")
# ---------------------------------------------------------------------------
# Module-level model setup: everything below runs once, at import time.
# ---------------------------------------------------------------------------
# Relative path, so it is resolved against the current working directory.
dit_path = Path('DiT_Extractor/dit_object_detection')
# Start from detectron2's default config, let DiT register its extra keys,
# then overlay the PubLayNet cascade config.
cfg = get_cfg()
add_vit_config(cfg)
cfg.merge_from_file(dit_path / "publaynet_configs/cascade/cascade_dit_base.yaml")
# Remote checkpoint URL.
# NOTE(review): presumably fetched/cached by detectron2 on first use - confirm.
cfg.MODEL.WEIGHTS = "https://layoutlm.blob.core.windows.net/dit/dit-fts/publaynet_dit-b_cascade.pth"
# Use the GPU when torch reports one, otherwise fall back to CPU.
cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Shared predictor used by get_dit_preds for every page.
predictor = DefaultPredictor(cfg)
# Class names; list position is the class index used in thing_map below.
thing_classes = ["text","title","list","table","figure"]
# Inverse lookup: class name -> index (enumerate yields (index, name) pairs,
# reversed flips each pair to (name, index)).
thing_map = dict(map(reversed, enumerate(thing_classes)))
# Attach the class names to the test dataset's metadata; the Visualizer in
# get_dit_preds receives this metadata object.
md = MetadataCatalog.get(cfg.DATASETS.TEST[0])
md.set(thing_classes=thing_classes)
def get_pdf_image(pdf_file, page):
    """Render one page of *pdf_file* at 200 DPI.

    Args:
        pdf_file: Path of the PDF to render.
        page: Page number, used as both the first and the last page of the
            conversion range so only that page is rendered.

    Returns:
        The result of ``convert_from_path`` for that one-page range; the
        caller in this file takes element ``[0]`` as the page image.
    """
    return convert_from_path(
        pdf_file,
        dpi=200,
        first_page=page,
        last_page=page,
    )
def get_characters(subelement):
    """Collect ``(bbox, text)`` pairs for the characters of a text line.

    Args:
        subelement: A layout object. Only ``LTTextLineHorizontal`` instances
            are expanded; any other type yields an empty list.

    Returns:
        A list of ``(bbox, text)`` tuples in iteration order. ``LTChar``
        items contribute their own bounding box. ``LTAnno`` items carry no
        bbox, so they get a zero-width box placed at the right edge of the
        previously collected character.
    """
    all_chars = []
    if not isinstance(subelement, LTTextLineHorizontal):
        return all_chars

    for char in subelement:
        if isinstance(char, LTChar):
            all_chars.append((char.bbox, char.get_text()))
        elif isinstance(char, LTAnno):
            if not all_chars:
                # No preceding character to anchor the synthetic box to;
                # indexing all_chars[-1] here used to raise IndexError.
                continue
            # No bbox, just a space, so make a thin slice after previous text
            prev_box = all_chars[-1][0]
            thin_box = (prev_box[2], prev_box[1], prev_box[2], prev_box[3])
            all_chars.append((thin_box, char.get_text()))
    return all_chars
def _collect_block(words, region, block_id):
    """Gather every word overlapping *region* into one content-block dict.

    Words that match are removed from *words* in place, so a later region on
    the same page cannot claim them again. Returns an empty dict when no word
    overlaps the region.
    """
    block = {}
    # Iterate over a copy because matching words are removed from the list.
    for word in words.copy():
        bounds = word[0:4]
        if not base_utils.partial_overlaps(bounds, region):
            continue
        if not block:
            # First hit: create the block, seeded with this word's box.
            block = {
                'coord': bounds,
                'subelements': [],
                'type': 'content_block',
                'id': block_id,
                'text': '',
            }
        block['coord'] = base_utils.union(block['coord'], bounds)
        block['text'] += word[4].get_text()
        block['subelements'].append(get_characters(word[4]))
        words.remove(word)
    return block


def get_dit_preds(pdf, score_threshold=0.5):
    """Run layout detection on every page of *pdf* and export its text blocks.

    Each page is rendered to an image and passed through the module-level
    ``predictor``. Every detection of class ``text`` whose score reaches
    *score_threshold* is mapped from image pixels to page coordinates, and
    the page's words overlapping that region are merged into one content
    block.

    Args:
        pdf: Path of the PDF file to analyse.
        score_threshold: Detections scoring below this value are ignored.

    Returns:
        A list with one PIL image per page showing the drawn predictions.

    Side effects:
        Writes ``<pdf file name with a .json suffix>`` to the current working
        directory. It maps the zero-based page index to that page's list of
        content blocks.
    """
    page_count = base_utils.get_pdf_page_count(pdf)
    page_sizes = base_utils.get_page_sizes(pdf)
    page_words = base_utils.get_pdf_words(pdf)
    text_class = thing_map['text']

    sections = {}
    viz_images = []
    for page in range(1, page_count + 1):
        page_index = page - 1

        # Input is numpy array of PIL image
        image = np.array(get_pdf_image(pdf, page)[0])

        # Get prediction
        output = predictor(image)["instances"].to('cpu')

        # Visualize predictions (channel order is reversed going in and
        # reversed again coming out).
        v = Visualizer(image[:, :, ::-1],
                       md,
                       scale=1.0,
                       instance_mode=ColorMode.SEGMENTATION)
        result = v.draw_instance_predictions(output)
        viz_images.append(Image.fromarray(result.get_image()[:, :, ::-1]))

        # Words are in page coordinates
        words = page_words[page_index]

        # Convert from image size to page size.
        # NOTE(review): assumes the last two entries of a page-size record
        # are (width, height) - confirm against base_utils.get_page_sizes.
        pdf_dimensions = page_sizes[page_index][2:]
        # image_size is (height, width); swap to (width, height)
        pdf_image_size = (output.image_size[1], output.image_size[0])
        image_height = pdf_image_size[1]
        scale = np.array(pdf_dimensions) / np.array(pdf_image_size)
        scale_box = np.hstack((scale, scale))

        block_id = 0
        sections[page_index] = []
        detections = zip(output.get('pred_boxes'),
                         output.get('pred_classes'),
                         output.get('scores'))
        for box_t, clazz, score in detections:
            if score < score_threshold:
                continue
            # Only text regions are exported; skip the rest before doing any
            # coordinate work.
            if clazz != text_class:
                continue

            # Copy so the flip below does not write through to the
            # predictor's output tensor.
            box = box_t.numpy().copy()
            # Flip along Y axis
            box[1] = image_height - box[1]
            box[3] = image_height - box[3]
            # Scale
            scaled = box * scale_box
            # After the flip index 3 holds the lower Y value, so swap the Y
            # entries to restore (x0, y0, x1, y1) ordering.
            region = [scaled[0], scaled[3], scaled[2], scaled[1]]

            # Ids are consumed only by non-empty blocks, so they stay
            # consecutive within a page.
            block = _collect_block(words, region, block_id + 1)
            if block:
                block_id += 1
                sections[page_index].append(block)

    # Write final annotation. with_suffix replaces whatever extension the
    # input has instead of assuming it is exactly four characters long.
    out_name = Path(pdf).with_suffix(".json").name
    with open(out_name, 'w', encoding='utf8') as json_out:
        json.dump(sections, json_out, ensure_ascii=False, indent=4)
    return viz_images