# Spaces: Running Running  (page-header residue captured during extraction; not part of the module)
# -*- coding: utf-8 -*-
import re
import string
from functools import partial

from pysbd.utils import Rule, Text
class ListItemReplacer(object):
    r"""Mark list items inside a text.

    Three list styles are looked for: alphabetical (``a.``, ``a)``, ``(a)``),
    lowercase roman numerals (``i.``, ``ii)``) and numbered (``1.``, ``1)``).
    A marker is only rewritten when a neighbouring marker holds the adjacent
    value of the same sequence.  Rewritten alphabetical/roman markers get a
    ``\r`` in front of them, and the period following a rewritten marker is
    replaced by ``∯``.

    ``Rule`` and ``Text`` come from ``pysbd.utils``; ``Text(...).apply(...)``
    presumably performs the regex substitutions described by the given rules
    -- confirm against that module.
    """

    # NOTE: the original list repeated "x xi xii xiii" after "xiv", which
    # pushed "xv".."xx" four positions back and made "xiv" -> "xv" look
    # non-adjacent to the ``list.index`` based neighbour checks below.
    ROMAN_NUMERALS = ("i ii iii iv v vi vii viii ix x xi xii xiii xiv xv "
                      "xvi xvii xviii xix xx").split(' ')
    LATIN_NUMERALS = list(string.ascii_lowercase)

    # Rubular: http://rubular.com/r/XcpaJKH0sz
    ALPHABETICAL_LIST_WITH_PERIODS = r'(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)'

    # Rubular: http://rubular.com/r/Gu5rQapywf
    # TODO: Make sure below regex call is case-insensitive
    ALPHABETICAL_LIST_WITH_PARENS = r'(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

    # (pattern, replacement)
    # '♨' is the temporary stand-in for the period of a numbered list item.
    SubstituteListPeriodRule = Rule('♨', '∯')
    # '☝' is the temporary marker placed after the number of an "N)" item.
    ListMarkerRule = Rule('☝', '')

    # Rubular: http://rubular.com/r/Wv4qLdoPx7
    # https://regex101.com/r/62YBlv/1
    SpaceBetweenListItemsFirstRule = Rule(r'(?<=\S\S)\s(?=\S\s*\d+♨)', "\r")

    # Rubular: http://rubular.com/r/AizHXC6HxK
    # https://regex101.com/r/62YBlv/2
    SpaceBetweenListItemsSecondRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}♨)', "\r")

    # Rubular: http://rubular.com/r/GE5q6yID2j
    # https://regex101.com/r/62YBlv/3
    SpaceBetweenListItemsThirdRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}☝)', "\r")

    # Captures only the number of "N. " / "N.)" items (optionally preceded by
    # "-" or "⁃").  The first four alternatives include the leading whitespace
    # in the match; ``int()`` in ``scan_lists`` tolerates it.
    # NOTE: the ninth alternative used to read ``(?<=s\-)`` (a literal "s"),
    # unlike every parallel alternative and unlike NUMBERED_LIST_REGEX_2.
    NUMBERED_LIST_REGEX_1 = (
        r'\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)'
        r'|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))'
        r'|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)'
        r'|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)'
        r'|(?<=\s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))'
        r'|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))'
    )

    # 1. abcd
    # 2. xyz
    # Same shapes as NUMBERED_LIST_REGEX_1 but the match includes the period.
    NUMBERED_LIST_REGEX_2 = (
        r'(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)'
        r'|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))'
        r'|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)'
        r'|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)'
        r'|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))'
        r'|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))'
    )

    # 1) abcd
    # 2) xyz
    NUMBERED_LIST_PARENS_REGEX = r'\d{1,2}(?=\)\s)'

    # Rubular: http://rubular.com/r/NsNFSqrNvJ
    # TODO: Make sure below regex call is case-insensitive
    EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX = r'\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

    # Rubular: http://rubular.com/r/wMpnVedEIb
    # TODO: Make sure below regex call is case-insensitive
    ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX = r'(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\.'

    # Rubular: http://rubular.com/r/GcnmQt4a3I
    ROMAN_NUMERALS_IN_PARENTHESES = r'\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])'

    def __init__(self, text):
        """Keep ``text`` on ``self.text``; the formatters rewrite it in place."""
        self.text = text

    def add_line_break(self):
        """Run all four list formatters in order and return ``self.text``."""
        self.format_alphabetical_lists()
        self.format_roman_numeral_lists()
        self.format_numbered_list_with_periods()
        self.format_numbered_list_with_parens()
        return self.text

    def replace_parens(self):
        """Return the text with parenthesised roman numerals re-delimited.

        ``(iv) Foo`` becomes ``&✂&iv&⌬& Foo``.  Only numerals followed by
        whitespace and an uppercase ASCII letter are touched.  ``self.text``
        is not modified.
        """
        return re.sub(self.ROMAN_NUMERALS_IN_PARENTHESES,
                      r'&✂&\1&⌬&', self.text)

    def format_numbered_list_with_parens(self):
        """Process ``1) 2) ...`` lists, then drop the temporary '☝' marker."""
        self.replace_parens_in_numbered_list()
        self.add_line_breaks_for_numbered_list_with_parens()
        self.text = Text(self.text).apply(self.ListMarkerRule)

    def replace_periods_in_numbered_list(self):
        """Swap the period of consecutive ``N.`` markers for '♨'."""
        self.scan_lists(self.NUMBERED_LIST_REGEX_1, self.NUMBERED_LIST_REGEX_2,
                        '♨', strip=True)

    def format_numbered_list_with_periods(self):
        """Process ``1. 2. ...`` lists, then turn every '♨' into '∯'."""
        self.replace_periods_in_numbered_list()
        self.add_line_breaks_for_numbered_list_with_periods()
        self.text = Text(self.text).apply(self.SubstituteListPeriodRule)

    def format_alphabetical_lists(self):
        """Mark ``a.`` and ``a)`` / ``(a)`` list items; return ``self.text``."""
        # Both helpers already store their result on self.text; the previous
        # code additionally kept a copy on a stray ``self.txt`` attribute.
        self.text = self.add_line_breaks_for_alphabetical_list_with_periods(
            roman_numeral=False)
        self.text = self.add_line_breaks_for_alphabetical_list_with_parens(
            roman_numeral=False)
        return self.text

    def format_roman_numeral_lists(self):
        """Mark ``i.`` and ``ii)`` / ``(ii)`` list items; return ``self.text``.

        NOTE(review): the period pattern only matches a single letter, so in
        the period style only one-letter numerals (``i``, ``v``, ``x``) can
        be found.
        """
        self.text = self.add_line_breaks_for_alphabetical_list_with_periods(
            roman_numeral=True)
        self.text = self.add_line_breaks_for_alphabetical_list_with_parens(
            roman_numeral=True)
        return self.text

    def add_line_breaks_for_alphabetical_list_with_periods(
            self, roman_numeral=False):
        """Process ``a. b. c.`` style markers and return the updated text."""
        return self.iterate_alphabet_array(
            self.ALPHABETICAL_LIST_WITH_PERIODS,
            roman_numeral=roman_numeral)

    def add_line_breaks_for_alphabetical_list_with_parens(self, roman_numeral=False):
        """Process ``a) (b) c)`` style markers and return the updated text."""
        return self.iterate_alphabet_array(
            self.ALPHABETICAL_LIST_WITH_PARENS,
            parens=True,
            roman_numeral=roman_numeral)

    def scan_lists(self, regex1, regex2, replacement, strip=False):
        """Rewrite numeric markers that sit next to a consecutive number.

        Args:
            regex1: pattern (no capture groups) whose matches are integers,
                optionally surrounded by whitespace; used to collect the
                sequence of markers.
            regex2: pattern used to locate the markers to rewrite.
            replacement: string appended to the number of a rewritten marker.
            strip: forwarded to ``substitute_found_list_items``.
        """
        numbers = [int(found) for found in re.findall(regex1, self.text)]
        last_index = len(numbers) - 1
        for ind, item in enumerate(numbers):
            # Explicit bounds checks: the Ruby original relied on nil for
            # out-of-range indexes.
            precedes_next = ind < last_index and item + 1 == numbers[ind + 1]
            follows_previous = False
            if ind > 0:
                previous = numbers[ind - 1]
                # A 0 next to a 9 is also accepted (rule inherited from the
                # original implementation).
                follows_previous = (item - 1 == previous
                                    or (item == 0 and previous == 9)
                                    or (item == 9 and previous == 0))
            if precedes_next or follows_previous:
                self.substitute_found_list_items(regex2, item, strip, replacement)

    def substitute_found_list_items(self, regex, each, strip, replacement):
        """Replace every ``regex`` match equal to ``each`` by ``each + replacement``.

        Args:
            regex: pattern locating candidate markers in ``self.text``.
            each: the number to look for.
            strip: when true, surrounding whitespace is removed from a match
                before it is compared (and from the text that is put back).
            replacement: string appended to the number of a matching marker.
        """
        def replace_item(match, val=None, strip=False, repl='♨'):
            found = match.group()
            if strip:
                found = found.strip()
            # Drop a trailing '.', ']' or ')' so only the number is compared.
            chomped = found if len(found) == 1 else found.strip('.])')
            if chomped == str(val):
                return "{}{}".format(val, repl)
            return found

        self.text = re.sub(
            regex,
            partial(replace_item, val=each, strip=strip, repl=replacement),
            self.text)

    def add_line_breaks_for_numbered_list_with_periods(self):
        """Apply the first two space rules unless the list is already broken up.

        Nothing happens when there is no '♨' marker, when two markers are
        already separated by a line break, or when a marker appears in a
        ``for N. x`` phrase.
        """
        if '♨' not in self.text:
            return
        if re.search('♨.+(\n|\r).+♨', self.text):
            return
        if re.search(r'for\s\d{1,2}♨\s[a-z]', self.text):
            return
        self.text = Text(self.text).apply(self.SpaceBetweenListItemsFirstRule,
                                          self.SpaceBetweenListItemsSecondRule)

    def replace_parens_in_numbered_list(self):
        """Insert '☝' after the number of consecutive ``N)`` markers."""
        # Two passes, as in the original: a rewritten marker ("N☝)") no longer
        # matches the pattern, so the second pass only sees the markers the
        # first one skipped, which may now be neighbours of each other.
        for _ in range(2):
            self.scan_lists(self.NUMBERED_LIST_PARENS_REGEX,
                            self.NUMBERED_LIST_PARENS_REGEX, '☝')

    def add_line_breaks_for_numbered_list_with_parens(self):
        """Apply the third space rule unless markers already span line breaks."""
        if '☝' in self.text and not re.search("☝.+\n.+☝|☝.+\r.+☝", self.text):
            self.text = Text(self.text).apply(
                self.SpaceBetweenListItemsThirdRule)

    def replace_alphabet_list(self, a):
        r"""Return the text with every ``<a>.`` marker turned into ``\r<a>∯``.

        Only the letter ``a`` is handled per call; other markers are left
        untouched.  ``self.text`` is not modified.

        Input (a='b'): 'a. ffegnog b. fgegkl'
        Output:        'a. ffegnog \rb∯ fgegkl'
        """
        def replace_letter_period(match, val=None):
            found = match.group()
            letter = found.strip('.')
            if letter == val:
                return '\r{}∯'.format(letter)
            return found

        return re.sub(self.ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX,
                      partial(replace_letter_period, val=a),
                      self.text, flags=re.IGNORECASE)

    def replace_alphabet_list_parens(self, a):
        r"""Return the text with ``<a>)`` / ``(<a>)`` markers prefixed by ``\r``.

        An opening parenthesis is replaced by ``&✂&``.  Only the marker ``a``
        is handled per call.  ``self.text`` is not modified.

        Input (a='b'): 'a) ffegnog (b) fgegkl'
        Output:        'a) ffegnog \r&✂&b) fgegkl'
        """
        def replace_alphabet_paren(match, val=None):
            found = match.group()
            if '(' in found:
                letters = found.strip('(')
                if letters == val:
                    return '\r&✂&{}'.format(letters)
                return found
            if found == val:
                return '\r{}'.format(found)
            return found

        # The pattern is applied case-insensitively, but the comparison with
        # ``val`` above is exact.
        return re.sub(self.EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX,
                      partial(replace_alphabet_paren, val=a),
                      self.text, flags=re.IGNORECASE)

    def replace_correct_alphabet_list(self, a, parens):
        """Dispatch to the parenthesis or period variant for marker ``a``."""
        if parens:
            return self.replace_alphabet_list_parens(a)
        return self.replace_alphabet_list(a)

    def last_array_item_replacement(self, a, i, alphabet, list_array, parens):
        """Return the text with the last marker rewritten if it continues a list.

        Args:
            a: the marker at ``list_array[i]``.
            i: index of ``a`` in ``list_array``.
            alphabet: ordered sequence the markers belong to.
            list_array: all markers found in the text, in order.
            parens: whether the markers use the parenthesis style.

        Returns:
            The rewritten text when ``a`` and the preceding marker are
            neighbours in ``alphabet``; otherwise ``self.text`` unchanged.
        """
        # Nothing to compare against (previously an empty list_array alone
        # fell through to an IndexError).
        if not alphabet or not list_array:
            return self.text
        previous = list_array[i - 1]
        if previous not in alphabet or a not in alphabet:
            return self.text
        if abs(alphabet.index(previous) - alphabet.index(a)) != 1:
            return self.text
        return self.replace_correct_alphabet_list(a, parens)

    def other_items_replacement(self, a, i, alphabet, list_array, parens):
        """Return the text with a non-last marker rewritten if it is in a list.

        The marker is rewritten when the following marker is its direct
        successor in ``alphabet`` or the preceding marker is a direct
        neighbour.  For the first marker ``i - 1`` is ``-1``, so the
        "preceding" marker is the last element of ``list_array``.

        Args:
            a: the marker at ``list_array[i]``.
            i: index of ``a`` in ``list_array``; ``i + 1`` must be valid.
            alphabet: ordered sequence the markers belong to.
            list_array: all markers found in the text, in order.
            parens: whether the markers use the parenthesis style.

        Returns:
            The rewritten text, or ``self.text`` unchanged.
        """
        if not alphabet or not list_array:
            return self.text
        previous = list_array[i - 1]
        if previous not in alphabet or a not in alphabet:
            return self.text
        following = list_array[i + 1]
        if following not in alphabet:
            return self.text
        position = alphabet.index(a)
        next_is_successor = alphabet.index(following) - position == 1
        previous_is_neighbour = abs(alphabet.index(previous) - position) == 1
        if not (next_is_successor or previous_is_neighbour):
            return self.text
        return self.replace_correct_alphabet_list(a, parens)

    def iterate_alphabet_array(self, regex, parens=False, roman_numeral=False):
        """Collect markers with ``regex`` and rewrite those forming a list.

        Args:
            regex: pattern extracting candidate markers from ``self.text``.
            parens: whether the markers use the parenthesis style.
            roman_numeral: use ``ROMAN_NUMERALS`` instead of
                ``LATIN_NUMERALS`` as the reference sequence.

        Returns:
            ``self.text`` after all replacements.
        """
        alphabet = self.ROMAN_NUMERALS if roman_numeral else self.LATIN_NUMERALS
        # The search is case-sensitive, so only lowercase markers are found;
        # the list is taken once, before any replacement is made.
        list_array = [found for found in re.findall(regex, self.text)
                      if found in alphabet]
        last_index = len(list_array) - 1
        for ind, each in enumerate(list_array):
            if ind == last_index:
                self.text = self.last_array_item_replacement(
                    each, ind, alphabet, list_array, parens)
            else:
                self.text = self.other_items_replacement(
                    each, ind, alphabet, list_array, parens)
        return self.text