# Copyright (c) 2022, Lawrence Livermore National Security, LLC.
# All rights reserved.
# See the top-level LICENSE and NOTICE files for details.
# LLNL-CODE-838964
# SPDX-License-Identifier: Apache-2.0-with-LLVM-exception
import json

import spacy
from tokenizers.pre_tokenizers import Whitespace

import base_utils


def guess_sentences(tokens, text):
    sentence_delems = ('.', '?', ').', '!')
    sentences = []
    sentence = []
    maybe_delem = None
    for token in tokens:
        # check the current token to see if there was a space after the previous delimiter
        if maybe_delem is not None:
            if maybe_delem[1][1] < token[1][0]:
                sentences.append(sentence)
                sentence = []
            maybe_delem = None
        sentence.append(token)
        if token[0] in sentence_delems:
            maybe_delem = token
    if sentence != []:
        sentences.append(sentence)
    return sentences
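
# Illustrative example for guess_sentences (comment only, not executed; the output
# shown is hand-traced from the logic above, not taken from a test suite):
#
#   text   = "First part. Next part."
#   tokens = Whitespace().pre_tokenize_str(text)
#   guess_sentences(tokens, text)
#   -> [[('First', (0, 5)), ('part', (6, 10)), ('.', (10, 11))],
#       [('Next', (12, 16)), ('part', (17, 21)), ('.', (21, 22))]]
#
# The gap between the '.' ending at index 11 and 'Next' starting at index 12 is
# what triggers the sentence break.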


def spacey_sentences(text):
    nlp = spacy.blank('en')
    nlp.add_pipe('sentencizer')
    sentences = [s.text for s in nlp(text).sents]
    return sentences
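
# Illustrative example (comment only): the blank English pipeline with the
# rule-based sentencizer splits on terminal punctuation, e.g.
#
#   spacey_sentences("The model runs fast. It also scales well.")
#   -> ['The model runs fast.', 'It also scales well.']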


def add_coords(sentences, all_coords):
    sentences_out = []
    for sentence in sentences:
        new_sentence = []
        for token in sentence:
            indexes = token[1]
            bbox = all_coords[indexes[0]]
            for i in range(indexes[0] + 1, indexes[1]):
                bbox = base_utils.union(bbox, all_coords[i])
            new_sentence.append((token[0], token[1], bbox))
        sentences_out.append(new_sentence)
    return sentences_out
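
# Illustrative sketch (assumes base_utils.union merges two (x0, y0, x1, y1) boxes
# into their enclosing rectangle; that behavior is inferred here, not confirmed):
#
#   sentences  = [[('Hi', (0, 2))]]                  # one sentence, one token
#   all_coords = [(0, 0, 5, 10), (5, 0, 9, 10)]      # one bbox per character
#   add_coords(sentences, all_coords)
#   -> [[('Hi', (0, 2), (0, 0, 9, 10))]]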


def sentence_extract(document):
    """
    Convert an extracted PDF result (JSON) into token sections with a maximum length of
    384 tokens, separated on sentence delimiter boundaries such as . ! ?
    """
    max_tokens = 384
    with open(document, 'r') as doc_file:
        document_tree = json.load(doc_file)
    sections_per_page = {}
    for page_num, page in document_tree.items():
        # Tokenize per section (rectangular block that was detected by DIT)
        word_sections = []
        text_sections = []
        for section in page:
            text_sections.append(section['text'])
            all_text = ''
            all_coord = []
            if 'subelements' not in section:
                continue
            for subelement in section['subelements']:
                for char in subelement:
                    all_text += char[1]
                    all_coord.append(char[0])
                    # Check for irregular characters, e.g. "(cid:206)", "ff", "fi", etc.
                    # If the string isn't just 1 character, it's an irregular LTChar from pdfminer.
                    # Instead of skipping them, create duplicate coordinates for the additional
                    # characters so character indices and coordinates stay aligned.
                    if len(char[1]) > 1:
                        bad_char_len = len(char[1])
                        dupe_coord_amt = bad_char_len - 1
                        for dupe_i in range(dupe_coord_amt):
                            all_coord.append(char[0])
            pre_tokenizer = Whitespace()
            sentences_pre_tok = spacey_sentences(all_text)
            sentences = []
            for sentence in sentences_pre_tok:
                tokenized = pre_tokenizer.pre_tokenize_str(sentence)
                sentences.append(tokenized)
            sentences = add_coords(sentences, all_coord)
            word_section = []
            t = 0
            for sentence in sentences:
                t += len(sentence)
                if t <= max_tokens:
                    # update character indices after concatenating sentences
                    if len(word_section) > 0:
                        last_word_obj = word_section[-1]
                        _, (_, char_idx_offset), _ = last_word_obj
                        sentence = [(w, (sc + char_idx_offset + 1, ec + char_idx_offset + 1), bbox)
                                    for w, (sc, ec), bbox in sentence]
                    word_section += sentence
                else:
                    word_sections.append(word_section)
                    word_section = sentence
                    t = len(sentence)
            word_sections.append(word_section)
        sections = {'text_sections': text_sections, 'word_sections': word_sections}
        sections_per_page[page_num] = sections
    return sections_per_page
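
# Input layout sketch for sentence_extract, inferred from the loops above (the
# extraction step's real schema is not documented in this file):
#
#   {
#     "0": [                                    # page number -> list of sections
#       {
#         "text": "raw text of the block",
#         "subelements": [                      # one list per text line
#           [ [ [x0, y0, x1, y1], "c" ], ... ]  # (char bbox, char text) pairs
#         ]
#       }
#     ]
#   }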


def format_output_contexts(sections_per_page):
    all_contexts = {}
    for page_idx in sections_per_page.keys():
        text_sections = sections_per_page[page_idx]['text_sections']
        word_sections = sections_per_page[page_idx]['word_sections']
        for text_section, word_section in zip(text_sections, word_sections):
            whitespaced_text = ' '.join([word[0] for word in word_section])
            words_info = []
            for word in word_section:
                words_info.append({'word_text': word[0], 'char_indices': word[1], 'word_bbox': word[2]})
            context_row = {'text': text_section, 'whitespaced_text': whitespaced_text,
                           'page_idx': int(page_idx), 'words_info': words_info}
            context_id = 'context_{0}'.format(len(all_contexts))
            all_contexts[context_id] = context_row
    return all_contexts
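
# Output shape produced by format_output_contexts (illustrative):
#
#   {
#     "context_0": {
#       "text": "...",                        # original section text
#       "whitespaced_text": "w1 w2 ...",      # tokens re-joined with single spaces
#       "page_idx": 0,
#       "words_info": [
#         {"word_text": "w1", "char_indices": (0, 2), "word_bbox": (x0, y0, x1, y1)},
#         ...
#       ]
#     },
#     "context_1": { ... }
#   }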


def get_contexts(json_input):
    json_output = 'contexts_{0}'.format(json_input)
    sections_per_page = sentence_extract(json_input)
    all_contexts = format_output_contexts(sections_per_page)
    with open(json_output, 'w', encoding='utf8') as json_out:
        json.dump(all_contexts, json_out, ensure_ascii=False, indent=4)
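

if __name__ == '__main__':
    # Minimal usage sketch. 'example_extract.json' is a placeholder filename for an
    # extraction result, not a file shipped with this module; the contexts are
    # written to 'contexts_example_extract.json' in the working directory.
    get_contexts('example_extract.json')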