Spaces:

AriNubar
/

xcl-en-demo

Running

xcl-en-demo / pysbd /lists_item_replacer.py

Ari Nubar Boyacıoğlu

add files

42bcb30 13 days ago

10.4 kB

	# -*- coding: utf-8 -*-
	import string
	import re
	from pysbd.utils import Rule, Text
	from functools import partial


	class ListItemReplacer(object):
	    r"""Mark list items inside a text so that each item starts on its own line.

	    The instance holds the working string in ``self.text``.  The public entry
	    point is :meth:`add_line_break`, which runs four passes over that string
	    (alphabetical lists, roman-numeral lists, numbered lists written ``1.``
	    and numbered lists written ``1)``) and returns the rewritten text.

	    While working, the passes use placeholder symbols:

	    * ``♨`` temporarily replaces the period of a numbered item (``1.``) and
	      is finally rewritten to ``∯`` by ``SubstituteListPeriodRule``.
	    * ``☝`` temporarily follows the number of a ``1)`` item and is finally
	      removed by ``ListMarkerRule``.
	    * ``\r`` is inserted where a new list item begins.

	    ``Rule`` and ``Text`` come from ``pysbd.utils``; this class only relies on
	    ``Rule(pattern, replacement)`` and ``Text(s).apply(*rules)`` returning the
	    rewritten string.
	    """

	    # Ordered so that consecutive entries are consecutive numerals: the
	    # adjacency checks below compare ``list.index`` positions, so a duplicated
	    # or misplaced entry would break the "next item" detection.
	    ROMAN_NUMERALS = "i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx".split(' ')
	    LATIN_NUMERALS = list(string.ascii_lowercase)

	    # Rubular: http://rubular.com/r/XcpaJKH0sz
	    ALPHABETICAL_LIST_WITH_PERIODS = r'(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)'

	    # Rubular: http://rubular.com/r/Gu5rQapywf
	    # TODO: Make sure below regex call is case-insensitive
	    ALPHABETICAL_LIST_WITH_PARENS = r'(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

	    # (pattern, replacement)
	    SubstituteListPeriodRule = Rule('♨', '∯')
	    ListMarkerRule = Rule('☝', '')

	    # Rubular: http://rubular.com/r/Wv4qLdoPx7
	    # https://regex101.com/r/62YBlv/1
	    SpaceBetweenListItemsFirstRule = Rule(r'(?<=\S\S)\s(?=\S\s*\d+♨)', "\r")

	    # Rubular: http://rubular.com/r/AizHXC6HxK
	    # https://regex101.com/r/62YBlv/2
	    SpaceBetweenListItemsSecondRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}♨)', "\r")

	    # Rubular: http://rubular.com/r/GE5q6yID2j
	    # https://regex101.com/r/62YBlv/3
	    SpaceBetweenListItemsThirdRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}☝)', "\r")

	    # Finds the *numbers* of "N." / "N.)" items (optionally prefixed by "-" or
	    # "⁃").  The first four alternatives consume the leading whitespace, which
	    # is why scan_lists converts the hits with int() (int tolerates it).
	    NUMBERED_LIST_REGEX_1 = r'\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)|(?<=\s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))'
	    # 1. abcd
	    # 2. xyz
	    # Same positions as above, but the match is "N." including the period so
	    # that the period can be swapped for the placeholder.
	    NUMBERED_LIST_REGEX_2 = r'(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))'
	    # 1) abcd
	    # 2) xyz
	    NUMBERED_LIST_PARENS_REGEX = r'\d{1,2}(?=\)\s)'

	    # Rubular: http://rubular.com/r/NsNFSqrNvJ
	    # TODO: Make sure below regex call is case-insensitive
	    EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX = r'\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

	    # Rubular: http://rubular.com/r/wMpnVedEIb
	    # TODO: Make sure below regex call is case-insensitive
	    ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX = r'(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\.'

	    # Rubular: http://rubular.com/r/GcnmQt4a3I
	    # Lower-case roman numeral in parentheses, followed by whitespace and an
	    # upper-case letter.
	    ROMAN_NUMERALS_IN_PARENTHESES = r'\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])'

	    def __init__(self, text):
	        """Store ``text`` as the working string that the passes rewrite."""
	        self.text = text

	    def add_line_break(self):
	        """Run every list-detection pass and return the rewritten text."""
	        self.format_alphabetical_lists()
	        self.format_roman_numeral_lists()
	        self.format_numbered_list_with_periods()
	        self.format_numbered_list_with_parens()
	        return self.text

	    def replace_parens(self):
	        """Return ``self.text`` with parenthesised roman numerals re-wrapped.

	        ``(ii)`` followed by whitespace and a capital letter becomes
	        ``&✂&ii&⌬&``.  ``self.text`` itself is not modified.
	        """
	        text = re.sub(self.ROMAN_NUMERALS_IN_PARENTHESES,
	                      r'&✂&\1&⌬&', self.text)
	        return text

	    def format_numbered_list_with_parens(self):
	        """Handle ``1) ... 2) ...`` lists: mark, add line breaks, drop marks."""
	        self.replace_parens_in_numbered_list()
	        self.add_line_breaks_for_numbered_list_with_parens()
	        self.text = Text(self.text).apply(self.ListMarkerRule)

	    def replace_periods_in_numbered_list(self):
	        """Replace the period of consecutive ``N.`` items with ``♨``."""
	        self.scan_lists(self.NUMBERED_LIST_REGEX_1, self.NUMBERED_LIST_REGEX_2,
	                        '♨', strip=True)

	    def format_numbered_list_with_periods(self):
	        """Handle ``1. ... 2. ...`` lists: mark, add line breaks, ♨ -> ∯."""
	        self.replace_periods_in_numbered_list()
	        self.add_line_breaks_for_numbered_list_with_periods()
	        self.text = Text(self.text).apply(self.SubstituteListPeriodRule)

	    def format_alphabetical_lists(self):
	        """Mark ``a.`` / ``a)`` style lists and return the rewritten text.

	        The helpers already update ``self.text``; the result is additionally
	        kept in ``self.txt`` as the original code did.
	        """
	        self.txt = self.add_line_breaks_for_alphabetical_list_with_periods(
	            roman_numeral=False)
	        self.txt = self.add_line_breaks_for_alphabetical_list_with_parens(
	            roman_numeral=False)
	        return self.txt

	    def format_roman_numeral_lists(self):
	        """Mark ``i.`` / ``ii)`` style lists and return the rewritten text.

	        The helpers already update ``self.text``; the result is additionally
	        kept in ``self.txt`` as the original code did.
	        """
	        self.txt = self.add_line_breaks_for_alphabetical_list_with_periods(
	            roman_numeral=True)
	        self.txt = self.add_line_breaks_for_alphabetical_list_with_parens(
	            roman_numeral=True)
	        return self.txt

	    def add_line_breaks_for_alphabetical_list_with_periods(
	            self, roman_numeral=False):
	        """Process ``x.`` markers; ``roman_numeral`` selects the alphabet."""
	        txt = self.iterate_alphabet_array(
	            self.ALPHABETICAL_LIST_WITH_PERIODS,
	            roman_numeral=roman_numeral)
	        return txt

	    def add_line_breaks_for_alphabetical_list_with_parens(self, roman_numeral=False):
	        """Process ``x)`` / ``(x)`` markers; ``roman_numeral`` selects the alphabet."""
	        txt = self.iterate_alphabet_array(
	            self.ALPHABETICAL_LIST_WITH_PARENS,
	            parens=True,
	            roman_numeral=roman_numeral)
	        return txt

	    def scan_lists(self, regex1, regex2, replacement, strip=False):
	        """Mark numbers that belong to a run of consecutive list numbers.

	        Args:
	            regex1: pattern whose matches are the candidate numbers (each
	                match must be convertible with ``int``).
	            regex2: pattern used to rewrite the markers in ``self.text``.
	            replacement: placeholder appended to a confirmed list number.
	            strip: forwarded to :meth:`substitute_found_list_items`.
	        """
	        list_array = re.findall(regex1, self.text)
	        list_array = list(map(int, list_array))
	        last_index = len(list_array) - 1
	        for ind, item in enumerate(list_array):
	            # Guard the look-ahead explicitly to avoid IndexError (the Ruby
	            # original simply got nil for an out-of-range index).
	            if ind < last_index and item + 1 == list_array[ind + 1]:
	                self.substitute_found_list_items(regex2, item, strip, replacement)
	            elif ind > 0:
	                previous = list_array[ind - 1]
	                # Either the direct successor of the previous number, or a
	                # 9 <-> 0 neighbour pair.
	                if ((item - 1 == previous) or
	                        (item == 0 and previous == 9) or
	                        (item == 9 and previous == 0)):
	                    self.substitute_found_list_items(regex2, item, strip, replacement)

	    def substitute_found_list_items(self, regex, each, strip, replacement):
	        """Append ``replacement`` to every ``regex`` match whose number is ``each``.

	        Args:
	            regex: pattern locating candidate markers in ``self.text``.
	            each: the (integer) list number to mark.
	            strip: when true, surrounding whitespace of a match is dropped
	                before comparing and before writing it back.
	            replacement: placeholder written after the number.
	        """

	        def replace_item(match):
	            found = match.group()
	            if strip:
	                found = str(found).strip()
	            # A single character cannot carry trailing punctuation; otherwise
	            # drop '.', ']' and ')' from both ends before comparing.
	            chomped = found if len(found) == 1 else found.strip('.])')
	            if str(each) == chomped:
	                return "{}{}".format(each, replacement)
	            return str(found)

	        self.text = re.sub(regex, replace_item, self.text)

	    def add_line_breaks_for_numbered_list_with_periods(self):
	        r"""Insert ``\r`` between ``N♨`` items unless breaks already exist.

	        Nothing is done when the text has no ``♨``, when two ``♨`` markers are
	        already separated by a ``\n``/``\r``, or when the text contains
	        something like ``for 1♨ a`` (a number used inside a sentence).
	        """
	        if '♨' not in self.text:
	            return
	        if re.search(r'♨.+(\n|\r).+♨', self.text):
	            return
	        if re.search(r'for\s\d{1,2}♨\s[a-z]', self.text):
	            return
	        self.text = Text(self.text).apply(self.SpaceBetweenListItemsFirstRule,
	                                          self.SpaceBetweenListItemsSecondRule)

	    def replace_parens_in_numbered_list(self):
	        """Insert ``☝`` after the number of consecutive ``N)`` items.

	        The scan is intentionally run twice: markers converted by the first
	        pass no longer match the pattern, so numbers that were separated by
	        them can become neighbours and be picked up by the second pass.
	        """
	        self.scan_lists(
	            self.NUMBERED_LIST_PARENS_REGEX, self.NUMBERED_LIST_PARENS_REGEX, '☝')
	        self.scan_lists(
	            self.NUMBERED_LIST_PARENS_REGEX, self.NUMBERED_LIST_PARENS_REGEX, '☝')

	    def add_line_breaks_for_numbered_list_with_parens(self):
	        r"""Insert ``\r`` between ``N☝`` items unless breaks already exist."""
	        if '☝' in self.text and not re.search(r"☝.+\n.+☝|☝.+\r.+☝", self.text):
	            self.text = Text(self.text).apply(
	                self.SpaceBetweenListItemsThirdRule)

	    def replace_alphabet_list(self, a):
	        r"""Return ``self.text`` with every ``a.`` marker rewritten.

	        Only markers whose letter(s) equal ``a`` are rewritten by one call.
	        Applied for 'a', then 'b', then 'c':

	        Input: 'a. ffegnog b. fgegkl c.'
	        Output: \ra∯ ffegnog \rb∯ fgegkl \rc∯
	        """

	        def replace_letter_period(match, val=None):
	            match = match.group()
	            match_wo_period = match.strip('.')
	            if match_wo_period == val:
	                return '\r{}∯'.format(match_wo_period)
	            else:
	                return match

	        txt = re.sub(self.ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX,
	                     partial(replace_letter_period, val=a),
	                     self.text, flags=re.IGNORECASE)
	        return txt

	    def replace_alphabet_list_parens(self, a):
	        r"""Return ``self.text`` with every ``a)`` / ``(a)`` marker rewritten.

	        Only markers whose letter(s) equal ``a`` are rewritten by one call.
	        Applied for 'a', then 'b', then 'c':

	        Input: "a) ffegnog (b) fgegkl c)"
	        Output: "\ra) ffegnog \r&✂&b) fgegkl \rc)"
	        """

	        def replace_alphabet_paren(match, val=None):
	            match = match.group()
	            if '(' in match:
	                # "(b" -> the opening paren is swapped for the &✂& marker.
	                match_wo_paren = match.strip('(')
	                if match_wo_paren == val:
	                    return '\r&✂&{}'.format(match_wo_paren)
	                else:
	                    return match
	            else:
	                if match == val:
	                    return '\r{}'.format(match)
	                else:
	                    return match

	        # The substitution itself is case-insensitive; the comparison with
	        # ``a`` inside the callback is not.
	        txt = re.sub(self.EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX,
	                     partial(replace_alphabet_paren, val=a),
	                     self.text, flags=re.IGNORECASE)
	        return txt

	    def replace_correct_alphabet_list(self, a, parens):
	        """Dispatch to the paren or period rewriter for marker ``a``."""
	        if parens:
	            a = self.replace_alphabet_list_parens(a)
	        else:
	            a = self.replace_alphabet_list(a)
	        return a

	    def last_array_item_replacement(self, a, i, alphabet, list_array, parens):
	        """Rewrite marker ``a`` (the last found marker) if it continues a list.

	        The marker is rewritten only when it and the previous found marker
	        are neighbours in ``alphabet``; otherwise ``self.text`` is returned
	        unchanged.  For ``i == 0`` the "previous" marker is ``list_array[-1]``
	        (negative index wrap-around), i.e. the marker itself.
	        """
	        if (not alphabet and not list_array) or (
	                list_array[i - 1] not in alphabet) or (a not in alphabet):
	            return self.text
	        if abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1:
	            return self.text
	        result = self.replace_correct_alphabet_list(a, parens)
	        return result

	    def other_items_replacement(self, a, i, alphabet, list_array, parens):
	        """Rewrite marker ``a`` (not the last found marker) if it is in a list.

	        The marker is rewritten when the next found marker directly follows
	        it in ``alphabet`` or the previous found marker is its neighbour;
	        otherwise ``self.text`` is returned unchanged.  For ``i == 0`` the
	        "previous" marker is ``list_array[-1]`` (negative index wrap-around).
	        """
	        if (not alphabet and not list_array) or (
	                list_array[i - 1] not in alphabet) or (a not in alphabet) or (
	                list_array[i + 1] not in alphabet):
	            return self.text
	        if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 and \
	                abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1:
	            return self.text
	        result = self.replace_correct_alphabet_list(a, parens)
	        return result

	    def iterate_alphabet_array(self, regex, parens=False, roman_numeral=False):
	        """Find markers with ``regex`` and rewrite those that form a list.

	        Args:
	            regex: pattern whose matches are the candidate markers.
	            parens: True for ``x)`` / ``(x)`` markers, False for ``x.``.
	            roman_numeral: use ``ROMAN_NUMERALS`` instead of
	                ``LATIN_NUMERALS`` as the ordered alphabet.

	        Returns:
	            The updated ``self.text``.
	        """
	        list_array = re.findall(regex, self.text)
	        alphabet = self.ROMAN_NUMERALS if roman_numeral else self.LATIN_NUMERALS
	        # Keep only hits that are actual members of the chosen alphabet.
	        list_array = [i for i in list_array if i in alphabet]
	        for ind, each in enumerate(list_array):
	            if ind == len(list_array) - 1:
	                self.text = self.last_array_item_replacement(each, ind, alphabet, list_array, parens)
	            else:
	                self.text = self.other_items_replacement(
	                    each, ind, alphabet, list_array, parens)
	        return self.text