# Copyright (c) 2022, Lawrence Livermore National Security, LLC. 
# All rights reserved.
# See the top-level LICENSE and NOTICE files for details.
# LLNL-CODE-838964

# SPDX-License-Identifier: Apache-2.0-with-LLVM-exception

import json
from tokenizers.pre_tokenizers import Whitespace
import base_utils
import spacy

def guess_sentences(tokens, text):
    """Group pre-tokenized (word, (start, end)) tuples into sentences.

    A sentence break is assumed when a delimiter token is followed by
    whitespace in the original text, i.e. the next token does not start
    immediately after the delimiter ends.
    """
    sentence_delems = ('.', '?', ').', '!')
    sentences = []
    sentence = []
    maybe_delem = None
    for token in tokens:
        # If the previous token was a delimiter, check whether there is a gap
        # (whitespace) before this token; if so, close out the sentence.
        if maybe_delem is not None:
            if maybe_delem[1][1] < token[1][0]:
                sentences.append(sentence)
                sentence = []
        maybe_delem = None

        sentence.append(token)
        if token[0] in sentence_delems:
            maybe_delem = token
    if sentence != []:
        sentences.append(sentence)
    return sentences

def spacey_sentences(text):
    """Split text into sentences using spaCy's rule-based sentencizer."""
    nlp = spacy.blank('en')
    nlp.add_pipe('sentencizer')
    sentences = [s.text for s in nlp(text).sents]
    return sentences

def add_coords(sentences, all_coords):
    """Attach a bounding box to each token by taking the union of the
    per-character boxes covered by the token's character index range."""
    sentences_out = []
    for sentence in sentences:
        new_sentence = []
        for token in sentence:
            indexes = token[1]
            bbox = all_coords[indexes[0]]
            for i in range(indexes[0] + 1, indexes[1]):
                bbox = base_utils.union(bbox, all_coords[i])
            new_sentence.append((token[0], token[1], bbox))
        sentences_out.append(new_sentence)
    return sentences_out
    
def sentence_extract(document):
    """
    Convert a PDF-extraction result JSON file into token sections with a
    maximum length of 384 tokens, separated on sentence delimiter
    boundaries such as . ! ?
    """
    max_tokens = 384
    with open(document, 'r') as doc_file:
        document_tree = json.load(doc_file)
    sections_per_page = {}
    for page_num, page in document_tree.items():
        # Tokenize per section (rectangular block that was detected by DIT)
        word_sections = []
        text_sections = []
        for section in page:
            text_sections.append(section['text'])
            all_text = ''
            all_coord = []
            if 'subelements' not in section:
                continue
            for subelement in section['subelements']:
                for char in subelement:
                    all_text += char[1]
                    all_coord.append(char[0])
                    # check for weird characters, e.g. "(cid:206)", "ff", "fi", etc
                    # if string isn't just 1 character, it's an irregular LTChar (character) from pdfminer.
                    # instead of skipping them, we can just create extra duplicate coordinates for the additional characters.
                    if len(char[1]) > 1:
                        bad_char_len = len(char[1])
                        dupe_coord_amt = (bad_char_len - 1)
                        for dupe_i in range(dupe_coord_amt):
                            all_coord.append(char[0])
    
            pre_tokenizer = Whitespace() 
            
            sentences_pre_tok = spacey_sentences(all_text)
            sentences = []
            for sentence in sentences_pre_tok:
                tokenized = pre_tokenizer.pre_tokenize_str(sentence)
                sentences.append(tokenized)        
            
            sentences = add_coords(sentences, all_coord)        
            
            word_section = []
            t = 0
            for sentence in sentences:
                t += len(sentence)
                if t <= max_tokens:
                    # update character indices to account for concatenating sentences
                    if len(word_section) > 0:
                        last_word_obj = word_section[-1]
                        _, (_, char_idx_offset), _ = last_word_obj
                        sentence = [(w, (sc+char_idx_offset+1, ec+char_idx_offset+1), bbox) for w, (sc, ec), bbox in sentence]
                    word_section += sentence
                else:
                    word_sections.append(word_section)
                    word_section = sentence
                    t = len(sentence)
            word_sections.append(word_section)
        sections = {'text_sections':text_sections, 'word_sections':word_sections}
        sections_per_page[page_num] = sections
    return sections_per_page
        
def format_output_contexts(sections_per_page):
    """Flatten per-page sections into a dict of context records keyed by
    'context_<n>', each holding the section text, the whitespace-joined
    token text, the page index, and per-word metadata."""
    all_contexts = {}

    for page_idx in sections_per_page.keys():
        
        text_sections = sections_per_page[page_idx]['text_sections']
        word_sections = sections_per_page[page_idx]['word_sections']

        for text_section, word_section in zip(text_sections, word_sections):
            whitespaced_text = ' '.join([word[0] for word in word_section])
            words_info = []
            for word in word_section:
                words_info.append({'word_text':word[0], 'char_indices':word[1], 'word_bbox':word[2]})

            context_row = {'text':text_section, 'whitespaced_text':whitespaced_text, 'page_idx':int(page_idx), 'words_info':words_info}
            context_id = 'context_{0}'.format(len(all_contexts))
            all_contexts[context_id] = context_row

    return all_contexts
    
def get_contexts(json_input):
    """Extract contexts from json_input and write them to 'contexts_<json_input>'."""
    json_output = 'contexts_{0}'.format(json_input)
    sections_per_page = sentence_extract(json_input)
    
    all_contexts = format_output_contexts(sections_per_page)
    
    with open(json_output, 'w', encoding='utf8') as json_out:
        json.dump(all_contexts, json_out, ensure_ascii=False, indent=4)
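
# Example usage (a minimal sketch): the file name below is hypothetical and
# assumed to be the JSON produced by the upstream PDF-extraction step; the
# output is written alongside it as 'contexts_example_document.json'.
if __name__ == '__main__':
    get_contexts('example_document.json')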