# -*- coding: utf-8 -*-
import re

from pysbd.utils import Text
from pysbd.lists_item_replacer import ListItemReplacer
from pysbd.exclamation_words import ExclamationWords
from pysbd.between_punctuation import BetweenPunctuation
from pysbd.abbreviation_replacer import AbbreviationReplacer


class Processor(object):

    def __init__(self, text, lang, char_span=False):
        """Process a text - do pre and post processing - to get proper sentences

        Parameters
        ----------
        text : str
            Original text
        lang : object
            Language module
        char_span : bool, optional
            Get start & end character offsets of each sentence
            within the original text, by default False
        """
        self.text = text
        self.lang = lang
        self.char_span = char_span

    def process(self):
        if not self.text:
            return self.text
        # pysbd uses '\r' internally to mark candidate segment breaks
        self.text = self.text.replace('\n', '\r')
        li = ListItemReplacer(self.text)
        self.text = li.add_line_break()
        self.replace_abbreviations()
        self.replace_numbers()
        self.replace_continuous_punctuation()
        self.replace_periods_before_numeric_references()
        self.text = Text(self.text).apply(
            self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
            self.lang.GeoLocationRule, self.lang.FileFormatRule)
        postprocessed_sents = self.split_into_segments()
        return postprocessed_sents

    def rm_none_flatten(self, sents):
        """Remove None values and unpack a list of lists of sents

        Parameters
        ----------
        sents : list
            list of sentences

        Returns
        -------
        list
            flattened list of sents with None values removed
        """
        sents = list(filter(None, sents))
        if not any(isinstance(s, list) for s in sents):
            return sents
        new_sents = []
        for sent in sents:
            if isinstance(sent, list):
                for s in sent:
                    new_sents.append(s)
            else:
                new_sents.append(sent)
        return new_sents

    def split_into_segments(self):
        self.check_for_parens_between_quotes()
        sents = self.text.split('\r')
        # remove empty and None values
        sents = self.rm_none_flatten(sents)
        sents = [
            Text(s).apply(self.lang.SingleNewLineRule,
                          *self.lang.EllipsisRules.All) for s in sents
        ]
        # # THESE LINES ARE NOT PRESENT IN THE ORIGINAL CODE --> ONLY USE FOR HYW
        # sents = [self.post_process_segments(s) for s in sents]
        # sents = self.rm_none_flatten(sents)
        sents = [self.check_for_punctuation(s) for s in sents]
        # flatten list of lists of sentences
        sents = self.rm_none_flatten(sents)
        postprocessed_sents = []
        for sent in sents:
            sent = Text(sent).apply(*self.lang.SubSymbolsRules.All)
            post_process_sent = self.post_process_segments(sent)
            if post_process_sent and isinstance(post_process_sent, str):
                postprocessed_sents.append(post_process_sent)
            elif isinstance(post_process_sent, list):
                for pps in post_process_sent:
                    postprocessed_sents.append(pps)
        postprocessed_sents = [
            Text(ns).apply(self.lang.SubSingleQuoteRule)
            for ns in postprocessed_sents
        ]
        return postprocessed_sents

    def post_process_segments(self, txt):
        # purely alphabetic segments longer than two characters need no
        # further processing
        if len(txt) > 2 and re.search(r'\A[a-zA-Z]*\Z', txt):
            return txt

        # below condition present in pragmatic segmenter;
        # its significance is not known yet
        # if self.consecutive_underscore(txt) or len(txt) < 2:
        #     return txt

        if re.match(r'\t', txt):
            pass

        # TODO:
        # Decide on keeping or removing Standard.ExtraWhiteSpaceRule
        # removed to retain original text spans
        # txt = Text(txt).apply(*ReinsertEllipsisRules.All,
        #                       Standard.ExtraWhiteSpaceRule)
        txt = Text(txt).apply(*self.lang.ReinsertEllipsisRules.All)
        if re.search(self.lang.QUOTATION_AT_END_OF_SENTENCE_REGEX, txt):
            txt = re.split(
                self.lang.SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX, txt)
            return txt
        else:
            txt = txt.replace('\n', '')
            return txt.strip()

    def check_for_parens_between_quotes(self):
        def paren_replace(match):
            match = match.group()
            sub1 = re.sub(r'\s(?=\()', '\r', match)
            sub2 = re.sub(r'(?<=\))\s', '\r', sub1)
            return sub2
        self.text = re.sub(self.lang.PARENS_BETWEEN_DOUBLE_QUOTES_REGEX,
                           paren_replace, self.text)

    def replace_continuous_punctuation(self):
        # mask runs of '!' and '?' with placeholder symbols so they are not
        # treated as sentence boundaries; they are restored downstream
        def continuous_puncs_replace(match):
            match = match.group()
            sub1 = re.sub(re.escape('!'), '&ᓴ&', match)
            sub2 = re.sub(re.escape('?'), '&ᓷ&', sub1)
            return sub2
        self.text = re.sub(self.lang.CONTINUOUS_PUNCTUATION_REGEX,
                           continuous_puncs_replace, self.text)

    def replace_periods_before_numeric_references(self):
        # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703
        self.text = re.sub(self.lang.NUMBERED_REFERENCE_REGEX,
                           r"∯\2\r\7", self.text)

    def consecutive_underscore(self, txt):
        # Rubular: http://rubular.com/r/fTF2Ff3WBL
        txt = re.sub(r'_{3,}', '', txt)
        return len(txt) == 0

    def check_for_punctuation(self, txt):
        if any(p in txt for p in self.lang.Punctuations):
            sents = self.process_text(txt)
            return sents
        else:
            # NOTE: next steps of check_for_punctuation will unpack this list
            return [txt]

    def process_text(self, txt):
        # append a placeholder terminal character so the boundary regex
        # can still match text that lacks ending punctuation
        if txt[-1] not in self.lang.Punctuations:
            txt += 'ȸ'
        txt = ExclamationWords.apply_rules(txt)
        txt = self.between_punctuation(txt)
        # handle text consisting only of double punctuation marks
        if not re.match(self.lang.DoublePunctuationRules.DoublePunctuation,
                        txt):
            txt = Text(txt).apply(*self.lang.DoublePunctuationRules.All)
        txt = Text(txt).apply(self.lang.QuestionMarkInQuotationRule,
                              *self.lang.ExclamationPointRules.All)
        txt = ListItemReplacer(txt).replace_parens()
        txt = self.sentence_boundary_punctuation(txt)
        return txt

    def replace_numbers(self):
        self.text = Text(self.text).apply(*self.lang.Numbers.All)

    def abbreviations_replacer(self):
        # prefer a language-specific replacer when the language module
        # provides one
        if hasattr(self.lang, "AbbreviationReplacer"):
            return self.lang.AbbreviationReplacer(self.text, self.lang)
        else:
            return AbbreviationReplacer(self.text, self.lang)

    def replace_abbreviations(self):
        self.text = self.abbreviations_replacer().replace()

    def between_punctuation_processor(self, txt):
        if hasattr(self.lang, "BetweenPunctuation"):
            return self.lang.BetweenPunctuation(txt)
        else:
            return BetweenPunctuation(txt)

    def between_punctuation(self, txt):
        txt = self.between_punctuation_processor(txt).replace()
        return txt

    def sentence_boundary_punctuation(self, txt):
        if hasattr(self.lang, 'ReplaceColonBetweenNumbersRule'):
            txt = Text(txt).apply(
                self.lang.ReplaceColonBetweenNumbersRule)
        if hasattr(self.lang, 'ReplaceNonSentenceBoundaryCommaRule'):
            txt = Text(txt).apply(
                self.lang.ReplaceNonSentenceBoundaryCommaRule)
        # retain exclamation mark if it is the ending character of the text
        txt = re.sub(r'&ᓴ&$', '!', txt)
        txt = [
            m.group()
            for m in re.finditer(self.lang.SENTENCE_BOUNDARY_REGEX, txt)
        ]
        return txt
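

if __name__ == '__main__':
    # Illustrative usage only -- not part of the original module. A minimal
    # sketch assuming pysbd's public API: Segmenter resolves a language
    # module and drives this Processor internally, while Processor can also
    # be called directly with a resolved language module.
    import pysbd
    from pysbd.languages import Language

    text = "My name is Jonas E. Smith. Please turn to p. 55."

    # The usual entry point: Segmenter wraps Processor.
    seg = pysbd.Segmenter(language="en", clean=False)
    print(seg.segment(text))
    # ['My name is Jonas E. Smith.', 'Please turn to p. 55.']

    # Driving Processor directly with a resolved language module.
    en = Language.get_language_code("en")
    print(Processor(text, lang=en).process())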