Spaces:
Sleeping
Sleeping
File size: 1,155 Bytes
97208ad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
import re
# Lines starting with "Rec", "Ree", "Reo" or "Rea": "Received"/"Rec'd" and,
# presumably, OCR misreadings of the third letter.  The "ceived" alternative
# is subsumed by "c" because only a prefix of the line has to match.
_RECEIVED_PATTERN = re.compile(r'Re(?:c|ceived|e|o|a)')
# "BOOK" in any letter case, also when both O's were read as zeros ("B00K").
_BOOK_PATTERN = re.compile(r'B(?:OOK|00K)', re.IGNORECASE)


def locate(ocr_text: str) -> tuple:
    """Pick candidate page numbers, received dates and book numbers from OCR text.

    The text is examined line by line.  Surrounding whitespace (including the
    ``\\r`` of CRLF line endings and form feeds) is ignored, so an otherwise
    valid line is not rejected because of stray padding.

    Args:
        ocr_text: String produced by the OCR step; ``None`` is treated as
            empty text.

    Returns:
        A 3-tuple of lists of strings:

        1. possible page numbers -- lines made up only of digits (may include
           false positives);
        2. possible dates -- whole lines that start like "Received"/"Rec'd";
        3. possible book numbers -- whole lines that start with "BOOK"/"B00K"
           (case-insensitive).

        A list with no candidates contains the single string ``"Null"``.
    """
    possible_pages = []
    possible_dates = []
    possible_book = []

    # splitlines() copes with "\n", "\r\n" and "\r" alike.
    for raw_line in (ocr_text or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # A line consisting solely of digits may be a page number.
        if line.isdigit():
            possible_pages.append(line)

        # Keep the entire line for human judgement, as OCR fails to
        # correctly translate years in a few cases.
        if _RECEIVED_PATTERN.match(line):
            possible_dates.append(line)

        if _BOOK_PATTERN.match(line):
            possible_book.append(line)

    # Callers get the "Null" placeholder instead of an empty list.
    if not possible_pages:
        possible_pages.append("Null")
    if not possible_dates:
        possible_dates.append("Null")
    if not possible_book:
        possible_book.append("Null")

    return possible_pages, possible_dates, possible_book