Spaces:
Running
Running
# -*- coding: utf-8 -*- | |
import re | |
from pysbd.utils import Rule, Text | |
class EscapeRegexReservedCharacters(object): | |
LeftParen = Rule(r'\(', '\\(') | |
RightParen = Rule(r'\)', '\\)') | |
# LeftParen = Rule(re.escape(r'('), '(') | |
# RightParen = Rule(re.escape(r')'), ')') | |
LeftBracket = Rule(r'\[', '\\[') | |
RightBracket = Rule(r'\]', '\\]') | |
Dash = Rule(r'\-', '\\-') | |
All = [LeftParen, RightParen, LeftBracket, RightBracket, Dash] | |
class SubEscapedRegexReservedCharacters(object): | |
SubLeftParen = Rule(r'\\\(', '(') | |
SubRightParen = Rule(r'\\\)', ')') | |
# SubLeftParen = Rule(re.escape(r"\\("), "(") | |
# SubRightParen = Rule(re.escape(r'\\)'), ')') | |
SubLeftBracket = Rule(r'\\\[', '[') | |
SubRightBracket = Rule(r'\\\]', ']') | |
SubDash = Rule(r'\\\-', '-') | |
All = [ | |
SubLeftParen, SubRightParen, SubLeftBracket, SubRightBracket, SubDash | |
] | |
def replace_punctuation(match, match_type=None): | |
text = Text(match.group()).apply(*EscapeRegexReservedCharacters.All) | |
sub = re.sub(r'\.', '∯', text) | |
sub = re.sub(r'։', '⍟', sub) # ADDED FOR ARMENIAN | |
sub_1 = re.sub(r'\。', '&ᓰ&', sub) | |
sub_2 = re.sub(r'\.', '&ᓱ&', sub_1) | |
sub_3 = re.sub(r'\!', '&ᓳ&', sub_2) | |
sub_4 = re.sub(r'\!', '&ᓴ&', sub_3) | |
sub_5 = re.sub(r'\?', '&ᓷ&', sub_4) | |
last_sub = re.sub(r'\?', '&ᓸ&', sub_5) | |
if match_type != 'single': | |
last_sub = re.sub(r"'", '&⎋&', last_sub) | |
text = Text(last_sub).apply(*SubEscapedRegexReservedCharacters.All) | |
return text | |