File size: 1,155 Bytes
97208ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import re

def locate(ocr_text):
    """Scan OCR output line by line for page numbers, dates and book labels.

    Each line of ``ocr_text`` is stripped of surrounding whitespace (so
    CRLF line endings, indentation or a trailing form feed do not hide a
    match) and then classified. A single line may land in more than one
    list, because the three checks are independent.

    Args:
        ocr_text: String produced by the OCR, lines separated by ``"\\n"``.

    Returns:
        A 3-tuple of lists of strings:
        (1) possible page numbers -- lines consisting only of digits
            (may include false positives),
        (2) possible dates -- whole lines that start with a "Rec'd"-style
            marker,
        (3) possible book numbers -- whole lines that start with "BOOK"
            (case-insensitive, zeros accepted in place of the letter O).
        A list with no hits contains the single placeholder ``"Null"``.
    """
    possible_pages = []
    possible_dates = []
    possible_book = []

    # "Re" followed by c/e/o/a (or "ceived"); presumably the extra letters
    # tolerate OCR misreadings of "Rec'd" -- confirm against real scans.
    pattern = re.compile(r'Re(?:c|ceived|e|o|a)')
    # "BOOK" in any case, also when the OCR produced zeros instead of O's.
    book_pattern = re.compile(r'B(?:OOK|00K)',  re.IGNORECASE)

    for raw_line in ocr_text.split("\n"):
        # Both str.isdigit() and the start-anchored regex match fail on
        # stray whitespace such as "\r" or leading spaces, so normalise first.
        word = raw_line.strip()
        if not word:
            continue
        # checks for possible page numbers
        if word.isdigit():
            possible_pages.append(word)
        # checks for rec'd dates
        if pattern.match(word):
            # appending entire string for human judgement as OCR fails to
            # correctly translate years in few cases
            possible_dates.append(word)
        # checks for book labels
        if book_pattern.match(word):
            possible_book.append(word)

    # Callers receive a "Null" placeholder rather than an empty list.
    if not possible_pages:
        possible_pages.append("Null")
    if not possible_dates:
        possible_dates.append("Null")
    if not possible_book:
        possible_book.append("Null")
    return possible_pages, possible_dates, possible_book