File size: 4,436 Bytes
42bcb30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# -*- coding: utf-8 -*-
import re
from pysbd.utils import Text
from pysbd.clean.rules import PDF, HTML, CleanRules as cr


class Cleaner(object):
    """Apply a fixed sequence of clean-up rules to a piece of text.

    The individual steps are driven by the rule objects and regexes exposed
    by ``pysbd.clean.rules`` (``CleanRules`` imported as ``cr``, ``HTML``
    and ``PDF``) and are applied through ``pysbd.utils.Text``.

    Attributes (set directly from the constructor arguments):
        text: the string being cleaned; every step rewrites it in place.
        lang: stored as given; not read by any method defined here.
        doc_type: optional document type; only the value ``'pdf'`` is
            treated specially (see ``replace_newlines``).
    """

    def __init__(self, text, lang, doc_type=None):
        self.text = text
        self.lang = lang
        self.doc_type = doc_type

    def clean(self):
        """Run every cleaning step in order and return the resulting text.

        Falsy input (``None`` or ``''``) is returned unchanged without
        running any step. Otherwise ``self.text`` is updated step by step
        and its final value is returned.
        """
        if not self.text:
            return self.text
        # Each step consumes the output of the previous one, so the order
        # of the calls below must be kept.
        self.remove_all_newlines()
        self.replace_double_newlines()
        self.replace_newlines()
        self.replace_escaped_newlines()
        self.text = Text(self.text).apply(*HTML.All)
        self.replace_punctuation_in_brackets()
        self.text = Text(self.text).apply(cr.InlineFormattingRule)
        self.clean_quotations()
        self.clean_table_of_contents()
        self.check_for_no_space_in_between_sentences()
        self.clean_consecutive_characters()
        return self.text

    def remove_all_newlines(self):
        """Run the mid-sentence step, then the mid-word newline step."""
        self.remove_newline_in_middle_of_sentence()
        self.remove_newline_in_middle_of_word()

    def remove_newline_in_middle_of_sentence(self):
        """Delete ``cr.NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX`` matches.

        The text is scanned in maximal runs of characters other than ``.``;
        inside each run every match of the regex is replaced by nothing.
        """
        def strip_matches(match):
            return re.sub(cr.NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '',
                          match.group())
        self.text = re.sub(r'(?:[^\.])*', strip_matches, self.text)

    def remove_newline_in_middle_of_word(self):
        """Apply ``cr.NewLineInMiddleOfWordRule`` to the text."""
        self.text = Text(self.text).apply(cr.NewLineInMiddleOfWordRule)

    def replace_double_newlines(self):
        """Apply the two double-newline rules, with-space variant first."""
        self.text = Text(self.text).apply(cr.DoubleNewLineWithSpaceRule,
                                          cr.DoubleNewLineRule)

    def remove_pdf_line_breaks(self):
        """Apply the bullet rule followed by the two PDF newline rules."""
        self.text = Text(self.text).apply(
            cr.NewLineFollowedByBulletRule,
            PDF.NewLineInMiddleOfSentenceRule,
            PDF.NewLineInMiddleOfSentenceNoSpacesRule)

    def replace_newlines(self):
        """Handle newlines according to ``doc_type``.

        For ``doc_type == 'pdf'`` the PDF line-break rules are used; any
        other value (including ``None``) gets the generic newline rules.
        """
        if self.doc_type == 'pdf':
            self.remove_pdf_line_breaks()
            return
        self.text = Text(self.text).apply(
            cr.NewLineFollowedByPeriodRule,
            cr.ReplaceNewlineWithCarriageReturnRule)

    def replace_escaped_newlines(self):
        """Apply the escaped newline / carriage-return rules in order."""
        self.text = Text(self.text).apply(
            cr.EscapedNewLineRule,
            cr.EscapedCarriageReturnRule,
            cr.TypoEscapedNewLineRule,
            cr.TypoEscapedCarriageReturnRule)

    def replace_punctuation_in_brackets(self):
        """Replace every ``?`` inside ``[...]`` with the marker ``&ᓷ&``.

        Only text between a ``[`` and the next ``]`` is touched; question
        marks outside square brackets are left alone.
        """
        def mask_question_marks(match):
            # A plain literal replacement; no regex needed for one char.
            return match.group().replace('?', '&ᓷ&')
        self.text = re.sub(r'\[(?:[^\]])*\]', mask_question_marks, self.text)

    def clean_quotations(self):
        """Turn backticks into apostrophes, then apply the quotation rules."""
        # This method was added explicitly: pragmatic-segmenter applies
        # this step at a different location.
        self.text = self.text.replace('`', "'")
        self.text = Text(self.text).apply(cr.QuotationsFirstRule,
                                          cr.QuotationsSecondRule)

    def clean_table_of_contents(self):
        """Apply the table-of-contents rule and the consecutive-char rules."""
        self.text = Text(self.text).apply(cr.TableOfContentsRule,
                                          cr.ConsecutivePeriodsRule,
                                          cr.ConsecutiveForwardSlashRule)

    def search_for_connected_sentences(self, word, txt, regex, rule):
        """Rewrite ``word`` with ``rule`` and substitute it into ``txt``.

        Args:
            word: a single token taken from ``txt``.
            txt: the text in which occurrences of ``word`` are replaced.
            regex: pattern ``word`` must match for anything to happen.
            rule: rule applied to ``word`` (through ``Text.apply``) to
                build its replacement.

        Returns:
            ``txt`` unchanged when ``word`` does not match ``regex`` or
            contains any of ``cr.URL_EMAIL_KEYWORDS``; otherwise ``txt``
            with every occurrence of ``word`` replaced by the rewritten
            word.
        """
        if not re.search(regex, word):
            return txt
        if any(keyword in word for keyword in cr.URL_EMAIL_KEYWORDS):
            return txt
        new_word = Text(word).apply(rule)
        # Literal replacement on purpose: handing ``new_word`` to re.sub as
        # the replacement template would interpret any backslash in it as
        # an escape or group reference, corrupting the text or raising
        # re.error.
        return txt.replace(word, new_word)

    def check_for_no_space_in_between_sentences(self):
        """Run both "no space between sentences" checks on every word.

        The word list is taken by splitting the text on single spaces
        before any rewriting happens; each word is then checked against
        the plain regex/rule pair followed by the digit regex/rule pair.
        """
        checks = (
            (cr.NO_SPACE_BETWEEN_SENTENCES_REGEX,
             cr.NoSpaceBetweenSentencesRule),
            (cr.NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX,
             cr.NoSpaceBetweenSentencesDigitRule),
        )
        for word in self.text.split(' '):
            for regex, rule in checks:
                self.text = self.search_for_connected_sentences(
                    word, self.text, regex, rule)

    def clean_consecutive_characters(self):
        """Apply the consecutive-periods and consecutive-slash rules.

        ``clean_table_of_contents`` also applies these two rules; ``clean``
        runs them again here after the connected-sentence check.
        """
        self.text = Text(self.text).apply(cr.ConsecutivePeriodsRule,
                                          cr.ConsecutiveForwardSlashRule)