xcl-en-demo / pysbd /punctuation_replacer.py
Ari Nubar Boyacıoğlu
add files
42bcb30
raw
history blame
1.54 kB
# -*- coding: utf-8 -*-
import re
from pysbd.utils import Rule, Text
class EscapeRegexReservedCharacters(object):
    """Rules that put a backslash in front of regex-reserved characters.

    Every rule is built from two strings: a pattern matching one literal
    character -- ``(``, ``)``, ``[``, ``]`` or ``-`` -- and that same
    character preceded by a single backslash.

    NOTE(review): the argument roles (pattern, replacement) are read off
    the literals here; confirm against ``pysbd.utils.Rule``.
    """

    # Round brackets.
    LeftParen = Rule(r"\(", "\\(")
    RightParen = Rule(r"\)", "\\)")

    # Square brackets.
    LeftBracket = Rule(r"\[", "\\[")
    RightBracket = Rule(r"\]", "\\]")

    # Hyphen / dash.
    Dash = Rule(r"\-", "\\-")

    # Every rule above, in declaration order.
    All = [
        LeftParen,
        RightParen,
        LeftBracket,
        RightBracket,
        Dash,
    ]
class SubEscapedRegexReservedCharacters(object):
    """Rules that strip the backslash from escaped reserved characters.

    Every rule is built from two strings: a pattern matching a backslash
    followed by one of ``(``, ``)``, ``[``, ``]`` or ``-``, and the bare
    character on its own.

    NOTE(review): the argument roles (pattern, replacement) are read off
    the literals here; confirm against ``pysbd.utils.Rule``.
    """

    # Round brackets.
    SubLeftParen = Rule(r"\\\(", "(")
    SubRightParen = Rule(r"\\\)", ")")

    # Square brackets.
    SubLeftBracket = Rule(r"\\\[", "[")
    SubRightBracket = Rule(r"\\\]", "]")

    # Hyphen / dash.
    SubDash = Rule(r"\\\-", "-")

    # Every rule above, in declaration order.
    All = [
        SubLeftParen,
        SubRightParen,
        SubLeftBracket,
        SubRightBracket,
        SubDash,
    ]
def replace_punctuation(match, match_type=None):
    """Replace punctuation inside a regex match with placeholder tokens.

    The matched text first has ``(``, ``)``, ``[``, ``]`` and ``-``
    backslash-escaped via ``EscapeRegexReservedCharacters``; each
    punctuation mark is then swapped for its own placeholder; finally the
    backslash escapes are removed again via
    ``SubEscapedRegexReservedCharacters``.

    Args:
        match: A regex match object; only ``match.group()`` is read.
        match_type: When equal to ``'single'``, apostrophes are left
            untouched; for any other value (including the default
            ``None``) they are replaced with ``&⎋&``.

    Returns:
        The transformed text, as returned by ``Text.apply``.
    """
    text = Text(match.group()).apply(*EscapeRegexReservedCharacters.All)

    # (pattern, placeholder) pairs, applied in order. The full-width marks
    # are written as explicit code points: they are visually almost
    # identical to their ASCII twins, and when they are collapsed to ASCII
    # the half-width mark receives the wrong placeholder while the second
    # substitution of each pair silently becomes dead code.
    # NOTE(review): the placeholders are presumably mapped back to the
    # original marks elsewhere in the package -- confirm the mapping there
    # agrees with this table.
    substitutions = (
        (r'\.', '∯'),           # ASCII full stop
        (r'։', '⍟'),            # Armenian full stop
        ('\u3002', '&ᓰ&'),      # ideographic full stop
        ('\uFF0E', '&ᓱ&'),      # full-width full stop
        ('\uFF01', '&ᓳ&'),      # full-width exclamation mark
        (r'\!', '&ᓴ&'),         # ASCII exclamation mark
        (r'\?', '&ᓷ&'),         # ASCII question mark
        ('\uFF1F', '&ᓸ&'),      # full-width question mark
    )
    for pattern, placeholder in substitutions:
        text = re.sub(pattern, placeholder, text)

    if match_type != 'single':
        text = re.sub(r"'", '&⎋&', text)

    return Text(text).apply(*SubEscapedRegexReservedCharacters.All)