Ari Nubar Boyacıoğlu committed on
Commit
42bcb30
·
1 Parent(s): 2883dd2
This view is limited to 50 files because it contains too many changes.   See raw diff
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ __pycache__
2
+ flagged_data
3
+ .gitattributes
4
+ .gradio
README copy.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Hyw En Demo v2
3
+ emoji: ⚡
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 4.43.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: cc-by-4.0
11
+ ---
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
  title: Xcl En Demo
3
- emoji: 🐨
4
- colorFrom: pink
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 5.9.1
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
-
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Xcl En Demo
3
+ emoji: 📖
4
+ colorFrom: red
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.43.0
8
  app_file: app.py
9
  pinned: false
10
+ license: cc-by-4.0
11
  ---
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding: utf-8
2
+
3
+ import gradio as gr
4
+ import os
5
+ import json
6
+ from pathlib import Path
7
+ from uuid import uuid4
8
+ from datetime import datetime
9
+ from huggingface_hub import CommitScheduler
10
+ from translation import Translator, LANGUAGES
11
+ import re
12
+
13
+
14
+ LANGUAGES_LIST = list(LANGUAGES.keys())
15
+ HF_TOKEN = os.environ.get('HF_TOKEN')
16
+
17
+ JSON_DATASET_DIR = Path("flagged_data")
18
+ JSON_DATASET_DIR.mkdir(exist_ok=True, parents=True)
19
+ JSON_DATASET_PATH = JSON_DATASET_DIR / f"dataset-session-{uuid4()}.json"
20
+
21
+
22
+
23
+
24
def translate_wrapper(text, src_lang, tgt_lang, by_sentence=True, clean=True, num_beams=4):
    """Validate the UI inputs and run the translation.

    Args:
        text: Text typed by the user.
        src_lang: Source-language label chosen in the dropdown (a key of
            ``LANGUAGES``).
        tgt_lang: Target-language label chosen in the dropdown (a key of
            ``LANGUAGES``).
        by_sentence: Forwarded to ``translator.translate``.
        clean: Forwarded to ``translator.translate``.
        num_beams: Forwarded to ``translator.translate``.

    Returns:
        The translation, or a bilingual message describing why the request
        was rejected.
    """
    empty_values = ["", None, []]

    # Whitespace-only input carries nothing to translate; reject it up front
    # instead of spending a full model call on it.
    if text in empty_values or (isinstance(text, str) and not text.strip()):
        return "Մուտքագրումը պարապ է։ | Input is empty."

    if src_lang in empty_values or tgt_lang in empty_values:
        return "Ընտրեցէք թարգմանութեան կողմերը | Please select source and target languages"

    if src_lang == tgt_lang:
        return "Ընտրուած լեզուները նոյնն են։ | Source and target languages are identical."

    src_code = LANGUAGES.get(src_lang)
    tgt_code = LANGUAGES.get(tgt_lang)
    # A label that is not a key of LANGUAGES would otherwise reach the
    # translator as None.
    if src_code is None or tgt_code is None:
        return "Ընտրեցէք թարգմանութեան կողմերը | Please select source and target languages"

    return translator.translate(
        text,
        src_code,
        tgt_code,
        by_sentence=by_sentence,
        clean=clean,
        num_beams=num_beams,
    )
39
+
40
+
41
+ # hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, "AriNubar/hyw-en-crowd-source")
42
+
43
+ scheduler = CommitScheduler(
44
+ repo_id="AriNubar/xcl-en-crowdsource",
45
+ repo_type="dataset",
46
+ folder_path=JSON_DATASET_DIR,
47
+ path_in_repo="collected_data",
48
+ token=HF_TOKEN,
49
+ every=30 # every 30 minute
50
+ )
51
+
52
+ TQS = {
53
+ "😊 Լաւ | Good": "good",
54
+ "😐 Միջակ | Average": "average",
55
+ "☹️ Վատ | Bad": "bad"
56
+ }
57
+
58
def save_json(src_lang, tgt_lang, input_text, output_text, by_sentence, clean, num_beams, translation_quality):
    """Append one user-feedback record to the session's dataset file.

    The record is written as a single JSON line to ``JSON_DATASET_PATH``.
    Nothing is written (and a warning toast is shown) when either language,
    the input text or the output text is empty.

    Args:
        src_lang: Source-language label from the dropdown (key of ``LANGUAGES``).
        tgt_lang: Target-language label from the dropdown (key of ``LANGUAGES``).
        input_text: Text the user submitted for translation.
        output_text: Translation shown to the user.
        by_sentence: Value of the "split into sentences" checkbox.
        clean: Value of the "preprocess" checkbox.
        num_beams: Selected beam count.
        translation_quality: Label of the feedback button that was clicked
            (key of ``TQS``).
    """
    empty_values = ["", None, []]
    required = (src_lang, tgt_lang, input_text, output_text)
    if any(value in empty_values for value in required):
        gr.Warning("Տուեալին մէկ մասը պարապ է։ Ձեր գնահատութիւնը չպահուեցաւ։ | Some part of the data is missing. Your feedback has not been saved.")
        return

    data = {
        "src_lang": LANGUAGES.get(src_lang),
        "tgt_lang": LANGUAGES.get(tgt_lang),
        "original": input_text,
        "translation": output_text,
        "by_sentence": by_sentence,
        "clean": clean,
        "num_beams": num_beams,
        "translation_quality": TQS.get(translation_quality),
        "timestamp": datetime.now().isoformat(),
    }

    # Write while holding the scheduler's lock, as the original code does.
    # NOTE(review): presumably this keeps the append from interleaving with a
    # scheduled upload of the folder — confirm against the CommitScheduler docs.
    with scheduler.lock, open(JSON_DATASET_PATH, "a", encoding="utf8") as f:
        f.write(json.dumps(data, ensure_ascii=False) + "\n")

    gr.Info("Ձեր գնահատութիւնը պահուեցաւ։ Շատ շնորհակալութի՛ւն։ | Your feedback has been saved. Thank you.")
85
+
86
+
87
+ def switch_languages(src, tgt, input_text, output_text):
88
+ new_src = tgt
89
+ new_tgt = src
90
+ new_input = output_text if output_text else input_text
91
+ return [new_src, new_tgt, new_input, None]
92
+
93
+
94
def detect_language(text):
    """Guess the translation direction from the script of *text*.

    Counts Armenian-script characters against ASCII Latin letters.

    Returns:
        A ``(source_label, target_label)`` tuple of dropdown labels when one
        script dominates, otherwise a list of two no-op ``gr.update()``
        objects meaning "leave the dropdowns alone".
    """
    armenian_pattern = r'[\u0531-\u0587\u0589\u058A\u058F]'
    non_armenian_pattern = r'[a-zA-Z]'

    armenian_chars = len(re.findall(armenian_pattern, text))
    non_armenian_chars = len(re.findall(non_armenian_pattern, text))

    if armenian_chars > non_armenian_chars:
        return "Գրաբար Հայոց | Classical Armenian", "Անգլերէն | English"
    elif non_armenian_chars > 0:
        return "Անգլերէն | English", "Գրաբար Հայոց | Classical Armenian"
    return [gr.update(), gr.update()]  # No clear dominance, leave dropdowns as they are


def update_languages(text):
    """Return dropdown updates for the source/target language selectors.

    Args:
        text: Current content of the input textbox.

    Returns:
        A two-element list of ``gr.update`` objects for the source and target
        dropdowns. Both are no-ops when *text* is empty or its language could
        not be determined.
    """
    if not text:
        return [gr.update(), gr.update()]

    src, tgt = detect_language(text)
    # detect_language hands back update objects (not labels) when it cannot
    # decide; wrapping those in gr.update(value=...) would set the dropdown
    # value to an update payload instead of leaving it untouched.
    if not isinstance(src, str) or not isinstance(tgt, str):
        return [gr.update(), gr.update()]
    return [gr.update(value=src), gr.update(value=tgt)]
113
+
114
+
115
+ theme = gr.themes.Default().set(
116
+ block_info_text_size="*text_xxs" # for info text
117
+ )
118
+
119
+ with gr.Blocks(title="Գրաբար-Անգլերէն Մեքենական Թարգմանիչ | Classical Armenian-English Machine Translation",
120
+ theme=theme,
121
+ ) as demo:
122
+
123
+ gr.HTML("""
124
+ <h2 style='margin-bottom: 5px'>Գրաբար-Անգլերէն Մեքենական Թարգմանիչ | Classical Armenian-English Machine Translation</h2>
125
+ <h3>Տարբերակ | Version: 1.0</h3>
126
+ <h3 style='margin-bottom: 5px'>Ստեղծող՝ | Created By: <a href='https://www.arinubar.com' target='_blank'>Ari Nubar Boyacıoğlu</a></h3>
127
+ <p style="font-size: 0.7rem">Եթէ այս գործիքը կարողացաւ ձեզ օգտակար հանդիսանալ, բարելաւելու համար հաճեցէք սուրճի մը փոխարժէքը նուիրել․ | If this tool has proven useful to you, please consider making a donation. <a href='https://www.paypal.com/donate/?hosted_button_id=RRBCV3GQJ7D8N' target='_blank'>PayPal</a> | <a href='https://buymeacoffee.com/arinubar' target='_blank'>Buy Me a Coffee</a></p>
128
+ """)
129
+
130
+ with gr.Accordion("Թարգմանիչի Մասին | Information about the Translator", open=False):
131
+ gr.HTML("""
132
+ <p>Հոս կը ցուցադրուի առաջին գրաբար-անգլերէն մեքենական թարգմանիչը, որ կարուցուած է Մեթայի (Ֆեյսպուքի) 'No Language Left Behind' տիպարի հիման վրայ։ Թարգմանութեան տիպարը կ'աշխատի CPU-ի մը մէջ, ուրեմն նախադասութեան մը թարգմանութիւնը կրնայ տեւել մօտաւորապէս <strong>40-60 երկվայրկեան</strong>։ Ձեր գնահատութիւնները եւ քննադատութիւնները շատ կարեւոր են տիպարի թարգմանութեան որակը բարելաւելու համար։</p>
133
+ <p>Դուք թարգմանութեան որակին մասին ձեր գնահատութիւնը կրնաք տալ երեք գնահատութեան կոճակներէ մէկուն սեղմելով։ Լեզուի, մուտքի եւ ելքի գրութիւններու, յարաչափերու եւ ձեր գնահատութեան մասին տուեալները պիտի պահուին։ Գնահատութիւնը պարտաւոր չէ։</p>
134
+ <hr style='margin-top: 5px; margin-bottom: 5px'>
135
+ <p>This is the demo of the first Classical Armenian-English neural machine translation system which is based on Meta's 'No Language Left Behind' model. The model runs on a CPU, so it might take approximately <strong>40-60 seconds</strong> to translate a single sentence. Your feedback and comments are very important for us to improve the quality of the translation.</p>
136
+ <p>You can give your feedback about the quality of the translation by clicking one of the three feedback buttons. Information about source, target languages, input and output texts, parameters and your feedback about quality will be saved. It is not mandatory to give feedback.</p>
137
+ """)
138
+
139
+ with gr.Row():
140
+ with gr.Column():
141
+ text = gr.Textbox(
142
+ lines=5,
143
+ label="Մուտքագրում | Input Text",
144
+ every=1.5 # Trigger event 1.5 seconds after last keystroke
145
+ )
146
+
147
+ with gr.Row():
148
+ src_lang = gr.Dropdown(LANGUAGES_LIST, type="value", label="Թարգմանէ Այս Լեզուէ | Source Language")
149
+ tgt_lang = gr.Dropdown(LANGUAGES_LIST, type="value", label="Թարգմանէ Այս Լեզուի | Target Language")
150
+
151
+ with gr.Row():
152
+ switch_btn = gr.Button("🔄 Լեզուները Փոխէ | Switch Languages")
153
+
154
+
155
+ def switch_languages(src, tgt, input_text, output_text):
156
+ # Swap languages
157
+ new_src = tgt
158
+ new_tgt = src
159
+ # Move output to input if exists and clear output
160
+ new_input = output_text if output_text else input_text
161
+ return [new_src, new_tgt, new_input, None]
162
+
163
+ text.change(fn=update_languages,
164
+ inputs=[text],
165
+ outputs=[src_lang, tgt_lang])
166
+
167
+ with gr.Column():
168
+ translated = gr.Textbox(lines=5, label="Ելքագրում | Output Text", interactive=False)
169
+ translate_btn = gr.Button(value="Թարգմանէ | Translate", variant="primary")
170
+ with gr.Row():
171
+ with gr.Column():
172
+ gr.Markdown("""
173
+ ### Թարգմանութեան Որակ | Translation Quality
174
+ """)
175
+ flag_good_btn = gr.Button(value="😊 Լաւ | Good", size="sm")
176
+ flag_average_btn = gr.Button(value="😐 Միջակ | Average", size="sm")
177
+ flag_bad_btn = gr.Button(value="☹️ Վատ | Bad", size="sm")
178
+
179
+ with gr.Row():
180
+ gr.Markdown("""
181
+ ## Յարաչափեր | Parameters
182
+ """
183
+ )
184
+ by_sentence = gr.Checkbox(label="Նախադասութիւններու Բաժնէ | Split into Sentences", value=True, info="Տուփը նշանագրեցէք եթէ կ'ուզէք ձեր մուտքագրումը թարգմանուի նախադասութիւն առ նախադասութիւն։ Այս կերպով թարգմանուած նախադասութիւններուն որակը ընդհանրապէս աւելի լաւ կ'ըլլան։ | Check this box if you want to split your input text into sentences. This way the quality of the translation will be better.")
185
+ clean = gr.Checkbox(label="Մշակէ | Preprocess", value=True, info="Տուփը նշանագրեցէք եթէ կ'ուզէք ձեր մուտքագրումը կանոնաւորուի ծրագրի կողմէ թարգմանութենէ առաջ։ Կանոնաւորումը թարգմանութեան որակի բարելաւման համար օգտակար է։ | Check this box if you want to preprocess your input text before translation. This way the quality of the translation will be better.")
186
+ num_beams = gr.Dropdown([1, 2, 3, 4, 5], type="value", label="Որոնման Շողեր | Number of Beams", value=4, info="Աւելի բարձր թիւը ընդհանրապէս կը պատճառէ աւելի բարձր որակի, բայց նոյնիսկ երկարատեւ թարգմանութեան։ | Higher beam size will result in better quality translation, but also longer translation time.")
187
+
188
+ switch_btn.click(switch_languages, inputs=[src_lang, tgt_lang, text, translated], outputs=[src_lang, tgt_lang, text, translated])
189
+ translate_btn.click(translate_wrapper, inputs=[text, src_lang, tgt_lang, by_sentence, clean, num_beams], outputs=translated)
190
+
191
+ # hf_writer.setup([src_lang, tgt_lang, text, translated, by_sentence, clean, num_beams, flag_bad_btn], "flagged_data_points")
192
+
193
+ flag_good_btn.click(save_json, inputs=[src_lang, tgt_lang, text, translated, by_sentence, clean, num_beams, flag_good_btn], outputs=None)
194
+ flag_average_btn.click(save_json, inputs=[src_lang, tgt_lang, text, translated, by_sentence, clean, num_beams, flag_average_btn], outputs=None)
195
+ flag_bad_btn.click(save_json, inputs=[src_lang, tgt_lang, text, translated, by_sentence, clean, num_beams, flag_bad_btn], outputs=None)
196
+
197
+ visitor_badge_html = """
198
+ <a href="https://visitorbadge.io/status?path=https%3A%2F%2Farinubar-hyw-en-demo.hf.space%2F">
199
+ <img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Farinubar-hyw-en-demo.hf.space%2F&label=%D4%B1%D5%B5%D6%81%D5%A5%D5%AC%D5%B8%D6%82%D5%B6%D5%A5%D6%80%20%7C%20Visitors&countColor=%23f97316&style=flat" />
200
+ </a>
201
+ """
202
+
203
+ gr.HTML(visitor_badge_html)
204
+
205
+
206
+
207
+ sponsors_html = """
208
+ <div style="display: flex; justify-content: center; align-items: center; margin-bottom: 5px;">
209
+ <h3>Աջակցութեամբ՝ | Supported By: </h3>
210
+ </div>
211
+ <div style="display: flex; justify-content: center; align-items: center; background-color: #0000007a; border-radius: 20px;">
212
+ <a href="#">
213
+ <img src="/file=./img/mkhitaryan-varjaran.png" alt="Pangalti Mkhitaryan School" style="padding: 10px; margin: 20px; width: 150px;" />
214
+ </a>
215
+
216
+ <a href="https://gulbenkian.pt/armenian-communities/">
217
+ <img src="/file=./img/gulbenkian.png" alt="Calouste Gulbenkian Foundation - Armenian Communities" style="padding: 10px; margin: 20px;" />
218
+ </a>
219
+
220
+ <a href="http://www.teaov.org/">
221
+ <img src="/file=./img/teaov.png" alt="Turkish-Armenian Minority Schools Teachers Foundation" style="padding: 10px; margin: 20px; width: 200px; padding-right:35px;" />
222
+ </a>
223
+ </div>
224
+
225
+ """
226
+ gr.HTML(sponsors_html)
227
+
228
+
229
+ if __name__ == "__main__":
230
+ translator = Translator()
231
+ demo.launch(favicon_path="img/translate.png", share=True, allowed_paths=["./img"])
img/gulbenkian.png ADDED
img/gulbenkian.svg ADDED
img/mkhitaryan-varjaran.png ADDED
img/teaov.png ADDED
img/translate.png ADDED
pysbd/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .segmenter import Segmenter
2
+ from .about import __version__
pysbd/abbreviation_replacer.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+ from pysbd.utils import Text
4
+
5
+
6
def replace_pre_number_abbr(txt, abbr):
    """Protect the period of a number abbreviation from sentence splitting.

    Replaces the ``.`` that follows *abbr* with the placeholder ``∯`` when the
    abbreviation is followed by whitespace and a digit, or by whitespace and
    an opening parenthesis.

    Args:
        txt: Text to process.
        abbr: Abbreviation without its trailing period; surrounding
            whitespace is ignored.

    Returns:
        *txt* with the matching periods replaced.
    """
    # prepend a space to avoid needing another regex for start of string
    padded = " " + txt
    # The abbreviation is literal text, so escape it: an unescaped "." would
    # act as a wildcard and characters such as "+" or "(" would break the
    # pattern.
    pattern = r"(?<=\s{abbr})\.(?=(\s\d|\s+\())".format(abbr=re.escape(abbr.strip()))
    # drop the prepended space again
    return re.sub(pattern, "∯", padded)[1:]
13
+
14
+
15
def replace_prepositive_abbr(txt, abbr):
    """Protect the period of a prepositive abbreviation from sentence splitting.

    Replaces the ``.`` that follows *abbr* with the placeholder ``∯`` when the
    abbreviation is followed by whitespace, or by a colon and digits.

    Args:
        txt: Text to process.
        abbr: Abbreviation without its trailing period; surrounding
            whitespace is ignored.

    Returns:
        *txt* with the matching periods replaced.
    """
    # prepend a space to avoid needing another regex for start of string
    padded = " " + txt
    # The abbreviation is literal text, so escape it: an unescaped "." would
    # act as a wildcard and characters such as "+" or "(" would break the
    # pattern.
    pattern = r"(?<=\s{abbr})\.(?=(\s|:\d+))".format(abbr=re.escape(abbr.strip()))
    # drop the prepended space again
    return re.sub(pattern, "∯", padded)[1:]
22
+
23
+
24
class AbbreviationReplacer(object):
    """Replace abbreviation periods with the placeholder ``∯``.

    The placeholder keeps later sentence-boundary rules from splitting on a
    period that belongs to an abbreviation. Language classes subclass this
    and provide ``SENTENCE_STARTERS``, which this base class reads but does
    not define.
    """

    def __init__(self, text, lang):
        self.text = text
        self.lang = lang

    def replace(self):
        """Run every abbreviation rule over ``self.text`` and return the result."""
        self.text = Text(self.text).apply(
            self.lang.PossessiveAbbreviationRule,
            self.lang.KommanditgesellschaftRule,
            *self.lang.SingleLetterAbbreviationRules.All
        )
        # Process line by line; keepends=True so the joined text keeps its
        # original line terminators.
        self.text = "".join(
            self.search_for_abbreviations_in_string(line)
            for line in self.text.splitlines(True)
        )
        self.replace_multi_period_abbreviations()
        self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
        self.text = self.replace_abbreviation_as_sentence_boundary()
        return self.text

    def replace_abbreviation_as_sentence_boundary(self):
        """Turn ``∯`` back into ``.`` after selected abbreviations.

        The period is restored only when the abbreviation is followed by one
        of ``self.SENTENCE_STARTERS``.
        """
        sent_starters = "|".join((r"(?=\s{}\s)".format(word) for word in self.SENTENCE_STARTERS))
        regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯({})".format(sent_starters)
        self.text = re.sub(regex, '\\1.', self.text)
        return self.text

    def replace_multi_period_abbreviations(self):
        """Replace every period inside multi-period abbreviations with ``∯``."""
        def mpa_replace(match):
            return match.group().replace(".", "∯")

        self.text = re.sub(
            self.lang.MULTI_PERIOD_ABBREVIATION_REGEX,
            mpa_replace,
            self.text,
            flags=re.IGNORECASE
        )

    def replace_period_of_abbr(self, txt, abbr):
        """Replace the period after *abbr* when the sentence clearly continues.

        "Continues" means the period is followed by punctuation
        (``. : - ? ,``), or by whitespace and a lowercase letter, ``I``,
        ``I'm``, ``I'll``, a digit or an opening parenthesis.
        """
        # prepend a space to avoid needing another regex for start of string
        txt = " " + txt
        txt = re.sub(
            r"(?<=\s{abbr})\.(?=((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\())))".format(
                abbr=re.escape(abbr.strip())
            ),
            "∯",
            txt,
        )
        # remove the prepended space
        txt = txt[1:]
        return txt

    def search_for_abbreviations_in_string(self, text):
        """Apply the abbreviation replacements to a single line of text.

        Args:
            text: One line of input (may include its line terminator).

        Returns:
            The line with abbreviation periods replaced by ``∯``.
        """
        lowered = text.lower()
        for abbr in self.lang.Abbreviation.ABBREVIATIONS:
            stripped = abbr.strip()
            if stripped not in lowered:
                continue
            # Escape the abbreviation: it is literal text, and many entries
            # contain "." which would otherwise match any character and
            # produce false hits.
            abbrev_match = re.findall(
                r"(?:^|\s|\r|\n){}".format(re.escape(stripped)), text, flags=re.IGNORECASE
            )
            if not abbrev_match:
                continue
            # NOTE(review): the literal "{" and "}" in this look-behind mean it
            # only matches text that itself contains "{abbr} ". It looks like a
            # porting slip, but it is kept unchanged because removing the
            # braces would change which periods are protected.
            next_word_start = r"(?<={" + str(re.escape(stripped)) + "} ).{1}"
            char_array = re.findall(next_word_start, text)
            for ind, match in enumerate(abbrev_match):
                text = self.scan_for_replacements(
                    text, match, ind, char_array
                )
        return text

    def scan_for_replacements(self, txt, am, ind, char_array):
        """Pick and apply the replacement rule for one abbreviation match.

        Args:
            txt: Text being processed.
            am: The matched abbreviation as found in the text (may carry
                leading whitespace).
            ind: Position of this match among the matches for the abbreviation.
            char_array: Characters found after each occurrence; the entry at
                *ind* decides whether the next word is capitalised.

        Returns:
            *txt* with the period of this abbreviation replaced where the
            rules allow it.
        """
        try:
            char = char_array[ind]
        except IndexError:
            char = ""
        prepositive = self.lang.Abbreviation.PREPOSITIVE_ABBREVIATIONS
        number_abbr = self.lang.Abbreviation.NUMBER_ABBREVIATIONS
        upper = str(char).isupper()
        normalized = am.strip().lower()
        # A capitalised next word is taken as a new sentence, except after
        # prepositive abbreviations, which are always protected.
        if not upper or normalized in prepositive:
            if normalized in prepositive:
                txt = replace_prepositive_abbr(txt, am)
            elif normalized in number_abbr:
                txt = replace_pre_number_abbr(txt, am)
            else:
                txt = self.replace_period_of_abbr(txt, am)
        return txt
pysbd/about.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # inspired from:
2
+ # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
3
+
4
+ __title__ = "pysbd"
5
+ __version__ = "0.3.4"
6
+ __summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
7
+ __uri__ = "http://nipunsadvilkar.github.io/"
8
+ __author__ = "Nipun Sadvilkar"
9
+ __email__ = "[email protected]"
10
+ __license__ = "MIT"
pysbd/between_punctuation.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+ from functools import partial
4
+ from pysbd.punctuation_replacer import replace_punctuation
5
+
6
+
7
+ class BetweenPunctuation(object):
8
+ # Rubular: http://rubular.com/r/2YFrKWQUYi
9
+ BETWEEN_SINGLE_QUOTES_REGEX = r"(?<=\s)'(?:[^']|'[a-zA-Z])*'"
10
+
11
+ BETWEEN_SINGLE_QUOTE_SLANTED_REGEX = r"(?<=\s)‘(?:[^’]|’[a-zA-Z])*’"
12
+
13
+ # Rubular: http://rubular.com/r/3Pw1QlXOjd
14
+ BETWEEN_DOUBLE_QUOTES_REGEX = r'"(?>[^"\\]+|\\{2}|\\.)*"'
15
+
16
+ # https://regex101.com/r/r6I1bW/1
17
+ # https://stackoverflow.com/questions/13577372/do-python-regular-expressions-have-an-equivalent-to-rubys-atomic-grouping?noredirect=1&lq=1
18
+ BETWEEN_DOUBLE_QUOTES_REGEX_2 = r'"(?=(?P<tmp>[^\"\\]+|\\{2}|\\.)*)(?P=tmp)"'
19
+
20
+ # Rubular: http://rubular.com/r/x6s4PZK8jc
21
+ BETWEEN_QUOTE_ARROW_REGEX = r'«(?>[^»\\]+|\\{2}|\\.)*»'
22
+
23
+ BETWEEN_QUOTE_ARROW_REGEX_2 = r"\«(?=(?P<tmp>[^»\\]+|\\{2}|\\.)*)(?P=tmp)\»"
24
+
25
+ # Rubular: http://rubular.com/r/JbAIpKdlSq
26
+ BETWEEN_QUOTE_SLANTED_REGEX = r"“(?>[^”\\]+|\\{2}|\\.)*”"
27
+ BETWEEN_QUOTE_SLANTED_REGEX_2 = r"\“(?=(?P<tmp>[^”\\]+|\\{2}|\\.)*)(?P=tmp)\”"
28
+
29
+ # Rubular: http://rubular.com/r/WX4AvnZvlX
30
+ BETWEEN_SQUARE_BRACKETS_REGEX = r"\[(?>[^\]\\]+|\\{2}|\\.)*\]"
31
+
32
+ BETWEEN_SQUARE_BRACKETS_REGEX_2 = r'\[(?=(?P<tmp>[^\]\\]+|\\{2}|\\.)*)(?P=tmp)\]'
33
+
34
+ # Rubular: http://rubular.com/r/6tTityPflI
35
+ BETWEEN_PARENS_REGEX = r"\((?>[^\(\)\\]+|\\{2}|\\.)*\)"
36
+
37
+ BETWEEN_PARENS_REGEX_2 = r"\((?=(?P<tmp>[^\(\)\\]+|\\{2}|\\.)*)(?P=tmp)\)"
38
+
39
+ # Rubular: http://rubular.com/r/mXf8cW025o
40
+ WORD_WITH_LEADING_APOSTROPHE = r"(?<=\s)'(?:[^']|'[a-zA-Z])*'\S"
41
+
42
+ # Rubular: http://rubular.com/r/jTtDKfjxzr
43
+ BETWEEN_EM_DASHES_REGEX = r"\-\-(?>[^\-\-])*\-\-"
44
+
45
+ BETWEEN_EM_DASHES_REGEX_2 = r"--(?=(?P<tmp>[^--]*))(?P=tmp)--"
46
+
47
+ def __init__(self, text):
48
+ self.text = text
49
+
50
+ def replace(self):
51
+ return self.sub_punctuation_between_quotes_and_parens(self.text)
52
+
53
+ def sub_punctuation_between_quotes_and_parens(self, txt):
54
+ txt = self.sub_punctuation_between_single_quotes(txt)
55
+ txt = self.sub_punctuation_between_single_quote_slanted(txt)
56
+ txt = self.sub_punctuation_between_double_quotes(txt)
57
+ txt = self.sub_punctuation_between_square_brackets(txt)
58
+ txt = self.sub_punctuation_between_parens(txt)
59
+ txt = self.sub_punctuation_between_quotes_arrow(txt)
60
+ txt = self.sub_punctuation_between_em_dashes(txt)
61
+ txt = self.sub_punctuation_between_quotes_slanted(txt)
62
+ return txt
63
+
64
+ def sub_punctuation_between_parens(self, txt):
65
+ return re.sub(self.BETWEEN_PARENS_REGEX_2, replace_punctuation, txt)
66
+
67
+ def sub_punctuation_between_square_brackets(self, txt):
68
+ return re.sub(self.BETWEEN_SQUARE_BRACKETS_REGEX_2, replace_punctuation,
69
+ txt)
70
+
71
+ def sub_punctuation_between_single_quotes(self, txt):
72
+ if re.search(self.WORD_WITH_LEADING_APOSTROPHE, txt) and \
73
+ (not re.search(r"'\s", txt)):
74
+ return txt
75
+ return re.sub(self.BETWEEN_SINGLE_QUOTES_REGEX,
76
+ partial(replace_punctuation, match_type='single'), txt)
77
+
78
+ def sub_punctuation_between_single_quote_slanted(self, txt):
79
+ return re.sub(self.BETWEEN_SINGLE_QUOTE_SLANTED_REGEX,
80
+ replace_punctuation, txt)
81
+
82
+ def sub_punctuation_between_double_quotes(self, txt):
83
+ return re.sub(self.BETWEEN_DOUBLE_QUOTES_REGEX_2, replace_punctuation,
84
+ txt)
85
+
86
+ def sub_punctuation_between_quotes_arrow(self, txt):
87
+ return re.sub(self.BETWEEN_QUOTE_ARROW_REGEX_2, replace_punctuation, txt)
88
+
89
+ def sub_punctuation_between_em_dashes(self, txt):
90
+ return re.sub(self.BETWEEN_EM_DASHES_REGEX_2, replace_punctuation, txt)
91
+
92
+ def sub_punctuation_between_quotes_slanted(self, txt):
93
+ return re.sub(self.BETWEEN_QUOTE_SLANTED_REGEX_2, replace_punctuation,
94
+ txt)
pysbd/clean/__init__.py ADDED
File without changes
pysbd/clean/rules.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.utils import Rule
3
+
4
+
5
+ class CleanRules(object):
6
+
7
+ # NOTE: Caution: Might require \\ for special characters
8
+ # if regex is defined with r'' then dont
9
+ # add extra \\ for special characters
10
+ # Rubular: http://rubular.com/r/V57WnM9Zut
11
+ NewLineInMiddleOfWordRule = Rule(r'\n(?=[a-zA-Z]{1,2}\n)', '')
12
+
13
+ # Rubular: http://rubular.com/r/dMxp5MixFS
14
+ DoubleNewLineWithSpaceRule = Rule(r'\n \n', "\r")
15
+
16
+ # Rubular: http://rubular.com/r/H6HOJeA8bq
17
+ DoubleNewLineRule = Rule(r'\n\n', "\r")
18
+
19
+ # Rubular: http://rubular.com/r/FseyMiiYFT
20
+ NewLineFollowedByPeriodRule = Rule(r'\n(?=\.(\s|\n))', '')
21
+
22
+ ReplaceNewlineWithCarriageReturnRule = Rule(r'\n', "\r")
23
+
24
+ EscapedNewLineRule = Rule(r'\\n', "\n")
25
+
26
+ EscapedCarriageReturnRule = Rule(r'\\r', "\r")
27
+
28
+ TypoEscapedNewLineRule = Rule(r'\\\ n', "\n")
29
+
30
+ TypoEscapedCarriageReturnRule = Rule(r'\\\ r', "\r")
31
+
32
+ # Rubular: http://rubular.com/r/bAJrhyLNeZ
33
+ InlineFormattingRule = Rule(r'{b\^&gt;\d*&lt;b\^}|{b\^>\d*<b\^}', '')
34
+
35
+ # Rubular: http://rubular.com/r/8mc1ArOIGy
36
+ TableOfContentsRule = Rule(r'\.{4,}\s*\d+-*\d*', "\r")
37
+
38
+ # Rubular: http://rubular.com/r/DwNSuZrNtk
39
+ ConsecutivePeriodsRule = Rule(r'\.{5,}', ' ')
40
+
41
+ # Rubular: http://rubular.com/r/IQ4TPfsbd8
42
+ ConsecutiveForwardSlashRule = Rule(r'\/{3}', '')
43
+
44
+ # Rubular: http://rubular.com/r/6dt98uI76u
45
+ NO_SPACE_BETWEEN_SENTENCES_REGEX = r'(?<=[a-z])\.(?=[A-Z])'
46
+ # NO_SPACE_BETWEEN_SENTENCES_REGEX = r'[a-z]\.[A-Z]'
47
+ NoSpaceBetweenSentencesRule = Rule(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
48
+
49
+ # Rubular: http://rubular.com/r/l6KN6rH5XE
50
+ NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = r'(?<=\d)\.(?=[A-Z])'
51
+ NoSpaceBetweenSentencesDigitRule = Rule(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')
52
+
53
+ URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']
54
+
55
+ # Rubular: http://rubular.com/r/3GiRiP2IbD
56
+ NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = r'(?<=\s)\n(?=([a-z]|\())'
57
+
58
+ # Rubular: http://rubular.com/r/Gn18aAnLdZ
59
+ NewLineFollowedByBulletRule = Rule(r"\n(?=•')", "\r")
60
+
61
+ QuotationsFirstRule = Rule(r"''", '"')
62
+ QuotationsSecondRule = Rule(r'``', '"')
63
+
64
+
65
+ class HTML(object):
66
+ # Rubular: http://rubular.com/r/9d0OVOEJWj
67
+ HTMLTagRule = Rule(r"<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[\^'\">\s]+))?)+\s*|\s*)\/?>", '')
68
+
69
+ # Rubular: http://rubular.com/r/XZVqMPJhea
70
+ EscapedHTMLTagRule = Rule(r'&lt;\/?[^gt;]*gt;', '')
71
+
72
+ All = [HTMLTagRule, EscapedHTMLTagRule]
73
+
74
+
75
+ class PDF(object):
76
+ # Rubular: http://rubular.com/r/UZAVcwqck8
77
+ NewLineInMiddleOfSentenceRule = Rule(r'(?<=[^\n]\s)\n(?=\S)', '')
78
+
79
+ # Rubular: http://rubular.com/r/eaNwGavmdo
80
+ NewLineInMiddleOfSentenceNoSpacesRule = Rule(r"\n(?=[a-z])", ' ')
pysbd/cleaner.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+ from pysbd.utils import Text
4
+ from pysbd.clean.rules import PDF, HTML, CleanRules as cr
5
+
6
+
7
class Cleaner(object):
    """Normalise raw text before sentence segmentation.

    Args:
        text: Text to clean.
        lang: Language object; stored on the instance.
        doc_type: Optional document type. ``'pdf'`` switches newline handling
            to the PDF rules.
    """

    def __init__(self, text, lang, doc_type=None):
        self.text = text
        self.lang = lang
        self.doc_type = doc_type

    def clean(self):
        """Run the full cleaning pipeline and return the cleaned text.

        Empty or ``None`` text is returned unchanged.
        """
        if not self.text:
            return self.text
        self.remove_all_newlines()
        self.replace_double_newlines()
        self.replace_newlines()
        self.replace_escaped_newlines()
        self.text = Text(self.text).apply(*HTML.All)
        self.replace_punctuation_in_brackets()
        self.text = Text(self.text).apply(cr.InlineFormattingRule)
        self.clean_quotations()
        self.clean_table_of_contents()
        self.check_for_no_space_in_between_sentences()
        self.clean_consecutive_characters()
        return self.text

    def remove_all_newlines(self):
        """Remove newlines that fall in the middle of a sentence or a word."""
        self.remove_newline_in_middle_of_sentence()
        self.remove_newline_in_middle_of_word()

    def remove_newline_in_middle_of_sentence(self):
        """Drop mid-sentence newlines inside each period-free run of text."""
        def replace_w_blank(match):
            return re.sub(cr.NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '', match.group())
        self.text = re.sub(r'(?:[^\.])*', replace_w_blank, self.text)

    def remove_newline_in_middle_of_word(self):
        self.text = Text(self.text).apply(cr.NewLineInMiddleOfWordRule)

    def replace_double_newlines(self):
        self.text = Text(self.text).apply(cr.DoubleNewLineWithSpaceRule,
                                          cr.DoubleNewLineRule)

    def remove_pdf_line_breaks(self):
        self.text = Text(
            self.text).apply(cr.NewLineFollowedByBulletRule,
                             PDF.NewLineInMiddleOfSentenceRule,
                             PDF.NewLineInMiddleOfSentenceNoSpacesRule)

    def replace_newlines(self):
        """Apply the PDF newline rules for PDFs, the generic rules otherwise."""
        if self.doc_type == 'pdf':
            self.remove_pdf_line_breaks()
        else:
            self.text = Text(
                self.text).apply(cr.NewLineFollowedByPeriodRule,
                                 cr.ReplaceNewlineWithCarriageReturnRule)

    def replace_escaped_newlines(self):
        self.text = Text(
            self.text).apply(cr.EscapedNewLineRule,
                             cr.EscapedCarriageReturnRule,
                             cr.TypoEscapedNewLineRule,
                             cr.TypoEscapedCarriageReturnRule)

    def replace_punctuation_in_brackets(self):
        """Replace ``?`` inside square brackets with the placeholder ``&ᓷ&``."""
        def replace_punct(match):
            # str.replace is a no-op when the bracket holds no question mark.
            return match.group().replace('?', '&ᓷ&')
        self.text = re.sub(r'\[(?:[^\]])*\]', replace_punct, self.text)

    def clean_quotations(self):
        """Normalise backticks and doubled quote characters."""
        # method added explicitly
        # pragmatic-segmenter applies this method
        # at a different location
        self.text = re.sub('`', "'", self.text)
        self.text = Text(self.text).apply(
            cr.QuotationsFirstRule,
            cr.QuotationsSecondRule)

    def clean_table_of_contents(self):
        self.text = Text(self.text).apply(
            cr.TableOfContentsRule,
            cr.ConsecutivePeriodsRule,
            cr.ConsecutiveForwardSlashRule)

    def search_for_connected_sentences(self, word, txt, regex, rule):
        """Apply *rule* to *word* and substitute the result throughout *txt*.

        Args:
            word: A single space-delimited token taken from the text.
            txt: The full text in which to substitute.
            regex: Pattern *word* must match for the rule to apply.
            rule: Rule applied to *word* to produce its replacement.

        Returns:
            *txt*, unchanged when *word* does not match *regex* or contains
            one of ``cr.URL_EMAIL_KEYWORDS``.
        """
        if not re.search(regex, word):
            return txt
        if any(k in word for k in cr.URL_EMAIL_KEYWORDS):
            return txt
        new_word = Text(word).apply(rule)
        # Literal substitution: new_word comes from user text and must not be
        # read as a regex replacement template, where a backslash would raise
        # re.error or be expanded as a group reference.
        return txt.replace(word, new_word)

    def check_for_no_space_in_between_sentences(self):
        """Apply both no-space-between-sentences rules to every token."""
        words = self.text.split(' ')
        for word in words:
            self.text = self.search_for_connected_sentences(word, self.text, cr.NO_SPACE_BETWEEN_SENTENCES_REGEX, cr.NoSpaceBetweenSentencesRule)
            self.text = self.search_for_connected_sentences(word, self.text, cr.NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, cr.NoSpaceBetweenSentencesDigitRule)

    def clean_consecutive_characters(self):
        self.text = Text(self.text).apply(
            cr.ConsecutivePeriodsRule,
            cr.ConsecutiveForwardSlashRule)
pysbd/exclamation_words.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+ from pysbd.punctuation_replacer import replace_punctuation
4
+
5
+
6
+ class ExclamationWords(object):
7
+ """
8
+ Searches for exclamation points that are part of words
9
+ and not ending punctuation and replaces them.
10
+ """
11
+ EXCLAMATION_WORDS = "!Xũ !Kung ǃʼOǃKung !Xuun !Kung-Ekoka ǃHu ǃKhung ǃKu ǃung ǃXo ǃXû ǃXung ǃXũ !Xun Yahoo! Y!J Yum!".split()
12
+ EXCLAMATION_REGEX = r"|".join(re.escape(w) for w in EXCLAMATION_WORDS)
13
+
14
+ @classmethod
15
+ def apply_rules(cls, text):
16
+ return re.sub(ExclamationWords.EXCLAMATION_REGEX, replace_punctuation,
17
+ text)
pysbd/lang/__init__.py ADDED
File without changes
pysbd/lang/amharic.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
3
+ from pysbd.lang.common import Common, Standard
4
+
5
+ class Amharic(Common, Standard):
6
+
7
+ iso_code = 'am'
8
+
9
+ SENTENCE_BOUNDARY_REGEX = r'.*?[፧።!\?]|.*?$'
10
+ Punctuations = ['።', '፧', '?', '!']
11
+
12
+ class AbbreviationReplacer(AbbreviationReplacer):
13
+ SENTENCE_STARTERS = []
pysbd/lang/arabic.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+
4
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
5
+ from pysbd.lang.common import Common, Standard
6
+ from pysbd.utils import Rule
7
+
8
class Arabic(Common, Standard):
    """Arabic sentence-segmentation settings."""

    iso_code = 'ar'

    Punctuations = ['?', '!', ':', '.', '؟', '،']
    SENTENCE_BOUNDARY_REGEX = r'.*?[:\.!\?؟،]|.*?\Z|.*?$'

    # Rubular: http://rubular.com/r/RX5HpdDIyv
    ReplaceColonBetweenNumbersRule = Rule(r'(?<=\d):(?=\d)', '♭')

    # Rubular: http://rubular.com/r/kPRgApNHUg
    ReplaceNonSentenceBoundaryCommaRule = Rule(r'،(?=\s\S+،)', '♬')

    class AbbreviationReplacer(AbbreviationReplacer):

        SENTENCE_STARTERS = []

        def __init__(self, text, lang):
            super().__init__(text, lang)

        def scan_for_replacements(self, txt, am, index, character_array):
            """Replace the period after the matched abbreviation with ``∯``.

            Unlike the base implementation, the replacement does not depend
            on the following word; *index* and *character_array* are unused.
            """
            # Raw string so "\." is a regex escape rather than an invalid
            # string escape, and re.escape so the matched text (which often
            # contains ".") is treated literally.
            pattern = r'(?<={0})\.'.format(re.escape(am))
            return re.sub(pattern, '∯', txt)

    class Abbreviation(Standard.Abbreviation):
        ABBREVIATIONS = ['ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه',]
        PREPOSITIVE_ABBREVIATIONS = []
        NUMBER_ABBREVIATIONS = []
pysbd/lang/armenian.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
3
+ from pysbd.lang.common import Common, Standard
4
+ from pysbd.between_punctuation import BetweenPunctuation
5
+ import re
6
+ from functools import partial
7
+ from pysbd.punctuation_replacer import replace_punctuation
8
+
9
class Armenian(Common, Standard):
    """Sentence-boundary configuration for Armenian (ISO 639-1 code ``hy``)."""

    iso_code = 'hy'

    # Same alternatives as the shared boundary regex, with Armenian capitals
    # added to the look-aheads and a final alternative for text ending in the
    # Armenian full stop that is not preceded by a digit.
    # Full-width forms are written as \uXXXX escapes so they cannot be
    # confused with (or folded into) their ASCII look-alikes:
    #   \uFF08 \uFF09  full-width parentheses
    #   \uFF0E         full-width full stop
    #   \uFF01         full-width exclamation mark
    #   \uFF1F         full-width question mark
    # With ASCII parentheses the first alternative degenerates into a plain
    # group that matches any run of text up to the last capital letter.
    SENTENCE_BOUNDARY_REGEX = r"\uFF08(?:[^\uFF09])*\uFF09(?=\s?[Ա-ՖA-Z])|「(?:[^」])*」(?=\s[Ա-ՖA-Z])|\((?:[^\)]){2,}\)(?=\s[Ա-ՖA-Z])|\'(?:[^\'])*[^,]\'(?=\s[Ա-ՖA-Z])|\"(?:[^\"])*[^,]\"(?=\s[Ա-ՖA-Z])|\“(?:[^\”])*[^,]\”(?=\s[Ա-ՖA-Z])|[。\uFF0E.\uFF01!?\uFF1F ]{2,}|\S.*?[。\uFF0E.\uFF01!?\uFF1Fȸȹ☉☈☇☄]|[。\uFF0E.\uFF01!?\uFF1F]|.*?(?<!\d)[։]"


    # SENTENCE_BOUNDARY_REGEX = r'((?:[^)])*)(?=\s?[Ա-ՖA-Z0-9])|.*?(?<!\d)[։]|.*?$'
    Punctuations = ['։']


    QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[.․։][\"\'“”»«]\s{1}[A-ZԱ-Ֆ]'

    SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = r'(?<=[.․։][\"\'“”»«])\s{1}(?=[A-ZԱ-Ֆ])'

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer with no sentence-starter words configured."""

        SENTENCE_STARTERS = []

    class BetweenPunctuation(BetweenPunctuation):
        """Substitutes punctuation found inside quotes, brackets and dashes.

        The patterns using atomic groups ``(?>...)`` are kept only as
        reference constants; the methods below use the ``_2`` variants,
        which express the same idea with a look-ahead and a back-reference.
        """

        BETWEEN_SINGLE_QUOTES_ARMENIAN_REGEX = r"(?<=\s)'(?:[^']|'[ա-ֆԱ-Ֆ])*'"

        BETWEEN_SINGLE_QUOTE_SLANTED_ARMENIAN_REGEX = r"(?<=\s)‘(?:[^’]|’[ա-ֆԱ-Ֆ])*’"

        BETWEEN_DOUBLE_QUOTES_REGEX = r'"(?>[^"\\]+|\\{2}|\\.)*"'

        BETWEEN_DOUBLE_QUOTES_ARMENIAN_REGEX_2 = r'"(?=(?P<tmp>[^\"\\]+|\\{2}|\\.)*)(?P=tmp)"'

        # Rubular: http://rubular.com/r/x6s4PZK8jc
        BETWEEN_QUOTE_ARROW_ARMENIAN_REGEX = r'«(?>[^»\\]+|\\{2}|\\.)*»'

        BETWEEN_QUOTE_ARROW_ARMENIAN_REGEX_2 = r"\«(?=(?P<tmp>[^»\\]+|\\{2}|\\.)*)(?P=tmp)\»"

        # Rubular: http://rubular.com/r/JbAIpKdlSq
        BETWEEN_QUOTE_SLANTED_ARMENIAN_REGEX = r"“(?>[^”\\]+|\\{2}|\\.)*”"
        BETWEEN_QUOTE_SLANTED_ARMENIAN_REGEX_2 = r"\“(?=(?P<tmp>[^”\\]+|\\{2}|\\.)*)(?P=tmp)\”"

        # Rubular: http://rubular.com/r/WX4AvnZvlX
        BETWEEN_SQUARE_BRACKETS_ARMENIAN_REGEX = r"\[(?>[^\]\\]+|\\{2}|\\.)*\]"

        BETWEEN_SQUARE_BRACKETS_ARMENIAN_REGEX_2 = r'\[(?=(?P<tmp>[^\]\\]+|\\{2}|\\.)*)(?P=tmp)\]'

        # Rubular: http://rubular.com/r/6tTityPflI
        BETWEEN_PARENS_ARMENIAN_REGEX = r"\((?>[^\(\)\\]+|\\{2}|\\.)*\)"

        BETWEEN_PARENS_ARMENIAN_REGEX_2 = r"\((?=(?P<tmp>[^\(\)\\]+|\\{2}|\\.)*)(?P=tmp)\)"

        # Rubular: http://rubular.com/r/mXf8cW025o
        WORD_WITH_LEADING_APOSTROPHE_ARMENIAN = r"(?<=\s)'(?:[^']|'[ա-ֆԱ-Ֆ])*'\S"

        # Rubular: http://rubular.com/r/jTtDKfjxzr
        BETWEEN_EM_DASHES_REGEX_ARMENIAN = r"\-\-(?>[^\-\-])*\-\-"

        BETWEEN_EM_DASHES_REGEX_2_ARMENIAN = r"--(?=(?P<tmp>[^--]*))(?P=tmp)--"

        def __init__(self, text):
            super().__init__(text)

        def replace(self):
            """Run the inherited substitutions, then the Armenian ones."""
            text = self.sub_punctuation_between_quotes_and_parens(self.text)
            return self.sub_punctuation_between_quotes_and_parens_armenian(text)

        def sub_punctuation_between_quotes_and_parens_armenian(self, txt):
            """Apply every Armenian substitution to ``txt`` in a fixed order."""
            txt = self.sub_punctuation_between_single_quotes_armenian(txt)
            txt = self.sub_punctuation_between_single_quote_slanted_armenian(txt)
            txt = self.sub_punctuation_between_double_quotes_armenian(txt)
            txt = self.sub_punctuation_between_square_brackets_armenian(txt)
            txt = self.sub_punctuation_between_parens_armenian(txt)
            txt = self.sub_punctuation_between_quotes_arrow_armenian(txt)
            txt = self.sub_punctuation_between_em_dashes_armenian(txt)
            txt = self.sub_punctuation_between_quotes_slanted_armenian(txt)
            return txt

        def sub_punctuation_between_single_quotes_armenian(self, txt):
            """Substitute punctuation inside straight single quotes.

            ``txt`` is returned untouched when it contains a quoted span that
            is directly followed by a non-space character and no apostrophe
            in ``txt`` is followed by whitespace.
            """
            if re.search(self.WORD_WITH_LEADING_APOSTROPHE_ARMENIAN, txt) and \
                    (not re.search(r"'\s", txt)):
                return txt
            return re.sub(self.BETWEEN_SINGLE_QUOTES_ARMENIAN_REGEX,
                          partial(replace_punctuation, match_type='single'), txt)

        def sub_punctuation_between_single_quote_slanted_armenian(self, txt):
            """Substitute punctuation inside slanted single quotes."""
            return re.sub(self.BETWEEN_SINGLE_QUOTE_SLANTED_ARMENIAN_REGEX,
                          replace_punctuation, txt)


        def sub_punctuation_between_parens_armenian(self, txt):
            """Substitute punctuation inside ASCII parentheses."""
            return re.sub(self.BETWEEN_PARENS_ARMENIAN_REGEX_2, replace_punctuation, txt)

        def sub_punctuation_between_square_brackets_armenian(self, txt):
            """Substitute punctuation inside square brackets."""
            return re.sub(self.BETWEEN_SQUARE_BRACKETS_ARMENIAN_REGEX_2, replace_punctuation,
                          txt)

        def sub_punctuation_between_double_quotes_armenian(self, txt):
            """Substitute punctuation inside straight double quotes."""
            return re.sub(self.BETWEEN_DOUBLE_QUOTES_ARMENIAN_REGEX_2, replace_punctuation,
                          txt)

        def sub_punctuation_between_quotes_arrow_armenian(self, txt):
            """Substitute punctuation inside « » quotation marks."""
            return re.sub(self.BETWEEN_QUOTE_ARROW_ARMENIAN_REGEX_2, replace_punctuation, txt)

        def sub_punctuation_between_em_dashes_armenian(self, txt):
            """Substitute punctuation between a pair of double hyphens."""
            return re.sub(self.BETWEEN_EM_DASHES_REGEX_2_ARMENIAN, replace_punctuation, txt)

        def sub_punctuation_between_quotes_slanted_armenian(self, txt):
            """Substitute punctuation inside slanted double quotes."""
            return re.sub(self.BETWEEN_QUOTE_SLANTED_ARMENIAN_REGEX_2, replace_punctuation,
                          txt)
pysbd/lang/bulgarian.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+
4
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
5
+ from pysbd.lang.common import Common, Standard
6
+
7
class Bulgarian(Common, Standard):
    """Sentence-boundary configuration for Bulgarian (ISO 639-1 code ``bg``)."""

    iso_code = 'bg'

    class Abbreviation(Standard.Abbreviation):
        """Bulgarian abbreviation lists."""

        ABBREVIATIONS = ["p.s", "акад", "ал", "б.р", "б.ред", "бел.а", "бел.пр", "бр", "бул", "в", "вж", "вкл", "вм", "вр", "г", "ген", "гр", "дж", "дм", "доц", "др", "ем", "заб", "зам", "инж", "к.с", "кв", "кв.м", "кг", "км", "кор", "куб", "куб.м", "л", "лв", "м", "м.г", "мин", "млн", "млрд", "мм", "н.с", "напр", "пл", "полк", "проф", "р", "рис", "с", "св", "сек", "см", "сп", "срв", "ст", "стр", "т", "т.г", "т.е", "т.н", "т.нар", "табл", "тел", "у", "ул", "фиг", "ха", "хил", "ч", "чл", "щ.д"]
        NUMBER_ABBREVIATIONS = []
        PREPOSITIVE_ABBREVIATIONS = []

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer with a Bulgarian-specific period substitution."""

        SENTENCE_STARTERS = []

        def __init__(self, text, lang):
            super().__init__(text, lang)

        def replace_period_of_abbr(self, txt, abbr):
            """Replace the period that directly follows ``abbr`` with '∯'.

            The abbreviation must be preceded by whitespace or sit at the
            very start of ``txt``.

            Args:
                txt: Text to rewrite.
                abbr: Abbreviation text; surrounding whitespace is stripped
                    and the remainder is taken literally.

            Returns:
                The rewritten text.
            """
            # Escape so the dots in entries such as 'б.р' match only a dot;
            # unescaped they would match any character.
            literal = re.escape(abbr.strip())
            pattern = r'(?<=\s{abbr})\.|(?<=^{abbr})\.'.format(abbr=literal)
            return re.sub(pattern, '∯', txt)
pysbd/lang/burmese.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
3
+ from pysbd.lang.common import Common, Standard
4
+
5
class Burmese(Common, Standard):
    """Sentence-boundary configuration for Burmese (ISO 639-1 code ``my``)."""

    iso_code = 'my'

    # Sentence-ending marks: two Burmese marks plus ASCII '?' and '!'.
    Punctuations = ['။', '၏', '?', '!']

    # A sentence is the shortest run of text that ends in one of the marks
    # above; failing that, whatever is left up to the end of the text.
    SENTENCE_BOUNDARY_REGEX = r'.*?[။၏!\?]|.*?$'

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer with no sentence-starter words configured."""

        SENTENCE_STARTERS = []
pysbd/lang/chinese.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
4
+ from pysbd.between_punctuation import BetweenPunctuation
5
+ from pysbd.lang.common import Common, Standard
6
+ from pysbd.punctuation_replacer import replace_punctuation
7
+
8
class Chinese(Common, Standard):
    """Sentence-boundary configuration for Chinese (ISO 639-1 code ``zh``)."""

    iso_code = 'zh'

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer with no sentence-starter words configured."""

        SENTENCE_STARTERS = []

    class BetweenPunctuation(BetweenPunctuation):
        """Substitutes punctuation found inside 《 》 and 「 」 pairs.

        Every method here rewrites ``self.text`` in place.
        """

        def __init__(self, text):
            super().__init__(text)

        def replace(self):
            """Run both substitutions and return the rewritten text."""
            self.sub_punctuation_between_quotes_and_parens()
            return self.text

        def sub_punctuation_between_double_angled_quotation_marks(self):
            """Rewrite spans enclosed in 《 》 via ``replace_punctuation``."""
            pattern = r"《(?=(?P<tmp>[^》\\]+|\\{2}|\\.)*)(?P=tmp)》"
            rewritten = re.sub(pattern, replace_punctuation, self.text)
            self.text = rewritten

        def sub_punctuation_between_l_bracket(self):
            """Rewrite spans enclosed in 「 」 via ``replace_punctuation``."""
            pattern = r"「(?=(?P<tmp>[^」\\]+|\\{2}|\\.)*)(?P=tmp)」"
            rewritten = re.sub(pattern, replace_punctuation, self.text)
            self.text = rewritten

        def sub_punctuation_between_quotes_and_parens(self):
            """Apply the 《 》 substitution first, then the 「 」 one."""
            for step in (self.sub_punctuation_between_double_angled_quotation_marks,
                         self.sub_punctuation_between_l_bracket):
                step()
pysbd/lang/common/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .common import Common # noqa: F401
2
+ from .standard import Standard # noqa: F401
pysbd/lang/common/common.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+ from pysbd.utils import Rule
4
+
5
class Common(object):
    """Regexes and substitution rules shared by the language classes."""

    # Alternatives, in order: text inside full-width parentheses, corner
    # brackets, ASCII parentheses, or single/double/curly quotes (each one
    # only when a capital letter follows); a run of two or more terminators
    # or spaces (handles intermittent dots, exclamation marks, etc.); text up
    # to the next terminator; a single terminator on its own.
    # Full-width forms are written as \uXXXX escapes so they cannot be
    # confused with (or folded into) their ASCII look-alikes:
    #   \uFF08 \uFF09  full-width parentheses
    #   \uFF0E         full-width full stop
    #   \uFF01         full-width exclamation mark
    #   \uFF1F         full-width question mark
    # With ASCII parentheses the first alternative degenerates into a plain
    # group that matches any run of text up to the last capital letter.
    SENTENCE_BOUNDARY_REGEX = r"\uFF08(?:[^\uFF09])*\uFF09(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。\uFF0E.\uFF01!?\uFF1F ]{2,}|\S.*?[。\uFF0E.\uFF01!?\uFF1Fȸȹ☉☈☇☄]|[。\uFF0E.\uFF01!?\uFF1F]"

    # # Rubular: http://rubular.com/r/NqCqv372Ix
    QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[!?\.-][\"\'“”]\s{1}[A-Z]'

    # # Rubular: http://rubular.com/r/6flGnUMEVl
    PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = r'["\”]\s\(.*\)\s["\“]'

    # # Rubular: http://rubular.com/r/TYzr4qOW1Q
    # BETWEEN_DOUBLE_QUOTES_REGEX = / "(?:[^"])*[^, ]"|“(?: [ ^”])*[^, ]”/

    # # Rubular: http://rubular.com/r/JMjlZHAT4g
    SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = r'(?<=[!?\.-][\"\'“”])\s{1}(?=[A-Z])'

    # # Rubular: http://rubular.com/r/mQ8Es9bxtk
    CONTINUOUS_PUNCTUATION_REGEX = r'(?<=\S)(!|\?){3,}(?=(\s|\Z|$))'

    # https://rubular.com/r/UkumQaILKbkeyc
    # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703
    NUMBERED_REFERENCE_REGEX = r'(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)?\d{1,3}))(\s)(?=[A-Z])'

    # # Rubular: http://rubular.com/r/yqa4Rit8EY
    PossessiveAbbreviationRule = Rule(r"\.(?='s\s)|\.(?='s$)|\.(?='s\Z)", '∯')

    # # Rubular: http://rubular.com/r/NEv265G2X2
    KommanditgesellschaftRule = Rule(r'(?<=Co)\.(?=\sKG)', '∯')

    # # Rubular: http://rubular.com/r/xDkpFZ0EgH
    MULTI_PERIOD_ABBREVIATION_REGEX = r"\b[a-z](?:\.[a-z])+[.]"

    class SingleLetterAbbreviationRules(object):
        """Searches for periods within an abbreviation and
        replaces the periods.
        """
        # Rubular: http://rubular.com/r/e3H6kwnr6H
        SingleUpperCaseLetterAtStartOfLineRule = Rule(r"(?<=^[A-Z])\.(?=\s)", '∯')

        # Rubular: http://rubular.com/r/gitvf0YWH4
        SingleUpperCaseLetterRule = Rule(r"(?<=\s[A-Z])\.(?=,?\s)", '∯')

        All = [
            SingleUpperCaseLetterAtStartOfLineRule, SingleUpperCaseLetterRule
        ]

    class AmPmRules(object):
        """Turn the placeholder after A.M./P.M. back into a period when
        whitespace and a capital letter follow."""

        # Rubular: http://rubular.com/r/Vnx3m4Spc8
        UpperCasePmRule = Rule(r'(?<= P∯M)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/AJMCotJVbW
        UpperCaseAmRule = Rule(r'(?<=A∯M)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/13q7SnOhgA
        LowerCasePmRule = Rule(r'(?<=p∯m)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/DgUDq4mLz5
        LowerCaseAmRule = Rule(r'(?<=a∯m)∯(?=\s[A-Z])', '.')

        All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]

    class Numbers(object):
        """Replace periods that are adjacent to digits with the placeholder."""

        # Rubular: http://rubular.com/r/oNyxBOqbyy
        PeriodBeforeNumberRule = Rule(r'\.(?=\d)', '∯')

        # Rubular: http://rubular.com/r/EMk5MpiUzt
        NumberAfterPeriodBeforeLetterRule = Rule(r'(?<=\d)\.(?=\S)', '∯')

        # Rubular: http://rubular.com/r/rf4l1HjtjG
        NewLineNumberPeriodSpaceLetterRule = Rule(r'(?<=\r\d)\.(?=(\s\S)|\))', '∯')

        # Rubular: http://rubular.com/r/HPa4sdc6b9
        StartLineNumberPeriodRule = Rule(r'(?<=^\d)\.(?=(\s\S)|\))', '∯')

        # Rubular: http://rubular.com/r/NuvWnKleFl
        StartLineTwoDigitNumberPeriodRule = Rule(r'(?<=^\d\d)\.(?=(\s\S)|\))', '∯')

        All = [
            PeriodBeforeNumberRule,
            NumberAfterPeriodBeforeLetterRule,
            NewLineNumberPeriodSpaceLetterRule,
            StartLineNumberPeriodRule,
            StartLineTwoDigitNumberPeriodRule
        ]
pysbd/lang/common/standard.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.utils import Rule
3
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
4
+
5
class Standard:
    """Default punctuation list, abbreviations and substitution rules."""

    # This class holds the punctuation marks.
    # The full-width forms are written as escapes so they cannot be confused
    # with (or folded into) the ASCII marks next to them:
    # \uff0e full stop, \uff01 exclamation mark, \uff1f question mark.
    Punctuations = ['。', '\uff0e', '.', '\uff01', '!', '?', '\uff1f']

    # Rubular: http://rubular.com/r/G2opjedIm9
    # The letter class is [a-zA-Z]; the range "A-z" would also admit the
    # six ASCII symbols that sit between 'Z' and 'a'.
    GeoLocationRule = Rule(r'(?<=[a-zA-Z]°)\.(?=\s*\d+)', '∯')

    FileFormatRule = Rule(r'(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)', '∯')

    SingleNewLineRule = Rule(r'\n', 'ȹ')

    # Rubular: http://rubular.com/r/aXPUGm6fQh
    QuestionMarkInQuotationRule = Rule(r'\?(?=(\'|\"))', '&ᓷ&')

    ExtraWhiteSpaceRule = Rule(r'\s{3,}', ' ')

    SubSingleQuoteRule = Rule(r'&⎋&', "'")

    class Abbreviation(object):
        """Defines the abbreviations for each language (if available)"""
        ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk', 'fig']
        PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs', 'fig']
        NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']

    # Rubular: http://rubular.com/r/EUbZCNfgei
    # WithMultiplePeriodsAndEmailRule = Rule(r'(\w)(\.)(\w)', '\\1∮\\3')
    # \w in python matches unicode abbreviations also so limit to english alphanumerics
    WithMultiplePeriodsAndEmailRule = Rule(r'([a-zA-Z0-9_])(\.)([a-zA-Z0-9_])', '\\1∮\\3')

    class DoublePunctuationRules(object):
        """Swap each two-character ?/! combination for a one-symbol placeholder."""

        FirstRule = Rule(r'\?!', '☉')
        SecondRule = Rule(r'!\?', '☈')
        ThirdRule = Rule(r'\?\?', '☇')
        ForthRule = Rule(r'!!', '☄')
        DoublePunctuation = r'\?!|!\?|\?\?|!!'
        All = [FirstRule, SecondRule, ThirdRule, ForthRule]

    class ExclamationPointRules(object):
        """Swap exclamation marks in the listed contexts for a placeholder."""

        # Rubular: http://rubular.com/r/XS1XXFRfM2
        InQuotationRule = Rule(r'\!(?=(\'|\"))', '&ᓴ&')

        # Rubular: http://rubular.com/r/sl57YI8LkA
        BeforeCommaMidSentenceRule = Rule(r'\!(?=\,\s[a-z])', '&ᓴ&')

        # Rubular: http://rubular.com/r/f9zTjmkIPb
        MidSentenceRule = Rule(r'\!(?=\s[a-z])', '&ᓴ&')

        All = [InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule]

    class SubSymbolsRules(object):
        """Map each placeholder symbol back to the character it stands for."""

        Period = Rule(r'∯', '.')
        ArmenianFullStop = Rule(r'⍟', '։')
        ArabicComma = Rule(r'♬', '،')
        SemiColon = Rule(r'♭', ':')
        FullWidthPeriod = Rule(r'&ᓰ&', '。')
        # The next rule and the two "FullWidth" rules after it restore the
        # full-width characters, not their ASCII counterparts (which have
        # their own rules above and below).
        # NOTE(review): confirm that the code emitting '&ᓱ&' does so for
        # U+FF0E, the full-width full stop.
        SpecialPeriod = Rule(r'&ᓱ&', '\uff0e')
        FullWidthExclamation = Rule(r'&ᓳ&', '\uff01')
        ExclamationPoint = Rule(r'&ᓴ&', '!')
        QuestionMark = Rule(r'&ᓷ&', '?')
        FullWidthQuestionMark = Rule(r'&ᓸ&', '\uff1f')
        MixedDoubleQE = Rule(r'☉', '?!')
        MixedDoubleQQ = Rule(r'☇', '??')
        MixedDoubleEQ = Rule(r'☈', '!?')
        MixedDoubleEE = Rule(r'☄', '!!')
        LeftParens = Rule(r'&✂&', '(')
        RightParens = Rule(r'&⌬&', ')')
        TemporaryEndingPunctutation = Rule(r'ȸ', '')
        Newline = Rule(r'ȹ', "\n")
        All = [Period, ArmenianFullStop, ArabicComma, SemiColon, FullWidthPeriod, SpecialPeriod,
               FullWidthExclamation, ExclamationPoint, QuestionMark,
               FullWidthQuestionMark, MixedDoubleQE, MixedDoubleQQ, MixedDoubleEQ,
               MixedDoubleEE, LeftParens, RightParens, TemporaryEndingPunctutation,
               Newline]

    class EllipsisRules(object):
        """Swap the various ellipsis spellings for placeholder symbols."""

        # below rules aren't similar to original rules of pragmatic segmenter
        # modification: spaces replaced with same number of symbols
        # Rubular: http://rubular.com/r/i60hCK81fz
        ThreeConsecutiveRule = Rule(r'\.\.\.(?=\s+[A-Z])', '☏☏.')

        # Rubular: http://rubular.com/r/Hdqpd90owl
        FourConsecutiveRule = Rule(r'(?<=\S)\.{3}(?=\.\s[A-Z])', 'ƪƪƪ')

        # Rubular: http://rubular.com/r/YBG1dIHTRu
        ThreeSpaceRule = Rule(r'(\s\.){3}\s', '♟♟♟♟♟♟♟')

        # Rubular: http://rubular.com/r/2VvZ8wRbd8
        FourSpaceRule = Rule(r'(?<=[a-z])(\.\s){3}\.($|\\n)', '♝♝♝♝♝♝♝')

        OtherThreePeriodRule = Rule(r'\.\.\.', 'ƪƪƪ')

        All = [ThreeSpaceRule, FourSpaceRule, FourConsecutiveRule,
               ThreeConsecutiveRule, OtherThreePeriodRule]

    class ReinsertEllipsisRules(object):
        """Map the ellipsis placeholders back to periods."""

        # below rules aren't similar to original rules of pragmatic segmenter
        # modification: symbols replaced with same number of ellipses
        SubThreeConsecutivePeriod = Rule(r'ƪƪƪ', '...')
        SubThreeSpacePeriod = Rule(r'♟♟♟♟♟♟♟', ' . . . ')
        SubFourSpacePeriod = Rule(r'♝♝♝♝♝♝♝', '. . . .')
        SubTwoConsecutivePeriod = Rule(r'☏☏', '..')
        SubOnePeriod = Rule(r'∮', '.')
        All = [SubThreeConsecutivePeriod, SubThreeSpacePeriod, SubFourSpacePeriod,
               SubTwoConsecutivePeriod, SubOnePeriod]

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer configured with English sentence starters."""

        SENTENCE_STARTERS = "A Being Did For He How However I In It Millions "\
            "More She That The There They We What When Where Who Why".split(" ")
pysbd/lang/danish.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+ from re import escape
4
+
5
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
6
+ from pysbd.lang.common import Common, Standard
7
+ from pysbd.utils import Rule
8
+
9
class Danish(Common, Standard):
    """Sentence-boundary configuration for Danish (ISO 639-1 code ``da``)."""

    iso_code = 'da'

    MONTHS = [
        'Januar', 'Februar', 'Marts', 'April', 'Maj', 'Juni',
        'Juli', 'August', 'September', 'Oktober', 'November', 'December',
    ]

    class Numbers(Common.Numbers):
        """Shared number rules plus two for short numbers before a period."""

        # Period after a whitespace-preceded one- or two-digit number and
        # before whitespace.
        NumberPeriodSpaceRule = Rule(r'(?<=\s[1-9][0-9])\.(?=\s)|(?<=\s[0-9])\.(?=\s)', '∯')

        # Same as above for numbers carrying a leading minus sign.
        NegativeNumberPeriodSpaceRule = Rule(r'(?<=\s-[1-9][0-9])\.(?=\s)|(?<=\s-[0-9])\.(?=\s)', '∯')

        All = Common.Numbers.All + [NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule]

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer configured with Danish sentence starters."""

        SENTENCE_STARTERS = [
            'At', 'De', 'Dem', 'Den', 'Der', 'Det', 'Du', 'En', 'Et', 'For',
            'Få', 'Gjorde', 'Han', 'Hun', 'Hvad', 'Hvem', 'Hvilke', 'Hvor',
            'Hvordan', 'Hvorfor', 'Hvorledes', 'Hvornår', 'I', 'Jeg', 'Mange',
            'Vi', 'Være',
        ]

        def __init__(self, text, lang):
            super().__init__(text, lang)

        def replace_abbreviation_as_sentence_boundary(self):
            """Turn '∯' back into '.' after selected abbreviations.

            The swap happens only when one of ``SENTENCE_STARTERS``,
            surrounded by whitespace, follows. Updates and returns
            ``self.text``.
            """
            lookaheads = [r"(?=\s{}\s)".format(starter)
                          for starter in self.SENTENCE_STARTERS]
            pattern = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|s.u|s.U)∯({})".format(
                "|".join(lookaheads))
            self.text = re.sub(pattern, '\\1.', self.text)
            return self.text

    class Abbreviation(Standard.Abbreviation):
        """Danish abbreviation lists."""

        ABBREVIATIONS = [
            'adm', 'adr', 'afd', 'afs', 'al', 'alm', 'alm', 'ang', 'ank', 'anm', 'ann', 'ansvh', 'apr', 'arr', 'ass', 'att', 'aud', 'aug', 'aut',
            'bd', 'bdt', 'bet', 'bhk', 'bio', 'biol', 'bk', 'bl.a', 'bot', 'br', 'bto',
            'ca', 'cal', 'cirk', 'cit', 'co', 'cpr-nr', 'cvr-nr',
            'd.d', 'd.e', 'd.m', 'd.s', 'd.s.s', 'd.y', 'd.å', 'd.æ', 'da', 'dav', 'dec', 'def', 'del', 'dep', 'diam', 'din', 'dir', 'disp', 'distr', 'do', 'dobb', 'dr', 'ds', 'dvs',
            'e.b', 'e.kr', 'e.l', 'e.o', 'e.v.t', 'eftf', 'eftm', 'egl', 'eks', 'eksam', 'ekskl', 'eksp', 'ekspl', 'el', 'emer', 'endv', 'eng', 'enk', 'etc', 'eur', 'evt', 'exam',
            'f', 'f', 'f.eks', 'f.kr', 'f.m', 'f.n', 'f.o', 'f.o.m', 'f.s.v', 'f.t', 'f.v.t', 'f.å', 'fa', 'fakt', 'feb', 'fec', 'ff', 'fg', 'fg', 'fhv', 'fig', 'fl', 'flg', 'fm', 'fm', 'fmd', 'forb', 'foreg', 'foren', 'forf', 'forh', 'fork', 'form', 'forr', 'fors', 'forsk', 'forts', 'fp', 'fr', 'frk', 'fuldm', 'fuldm', 'fung', 'fung', 'fys', 'fær',
            'g', 'g.d', 'g.m', 'gd', 'gdr', 'gg', 'gh', 'gl', 'gn', 'gns', 'gr', 'grdl', 'gross',
            'h.a', 'h.c', 'hdl', 'henh', 'henv', 'hf', 'hft', 'hhv', 'hort', 'hosp', 'hpl', 'hr', 'hrs', 'hum',
            'i', 'i.e', 'ib', 'ibid', 'if', 'ifm', 'ill', 'indb', 'indreg', 'ing', 'inkl', 'insp', 'instr', 'isl', 'istf',
            'jan', 'jf', 'jfr', 'jnr', 'jr', 'jul', 'jun', 'jur', 'jvf',
            'kal', 'kap', 'kat', 'kbh', 'kem', 'kgl', 'kin', 'kl', 'kld', 'km/t', 'knsp', 'komm', 'kons', 'korr', 'kp', 'kr', 'kr', 'kst', 'kt', 'ktr', 'kv', 'kvt',
            'l', 'l.c', 'lab', 'lat', 'lb', 'lb.', 'lb.nr', 'lejl', 'lgd', 'lic', 'lign', 'lin', 'ling.merc', 'litt', 'lok', 'lrs', 'ltr', 'lø',
            'm', 'm.a.o', 'm.fl.st', 'm.m', 'm/', 'ma', 'mag', 'maks', 'mar', 'mat', 'matr.nr', 'md', 'mdl', 'mdr', 'mdtl', 'med', 'medd', 'medflg', 'medl', 'merc', 'mezz', 'mf', 'mfl', 'mgl', 'mhp', 'mht', 'mi', 'mia', 'mio', 'ml', 'mods', 'modsv', 'modt', 'mr', 'mrk', 'mrs', 'ms', 'mul', 'mv', 'mvh',
            'n', 'n.br', 'n.f', 'nat', 'ned', 'nedenn', 'nedenst', 'nederl', 'nkr', 'nl', 'no', 'nord', 'nov', 'nr', 'nr', 'nto', 'nuv',
            'o', 'o.a', 'o.fl.st', 'o.g', 'o.h', 'o.m.a', 'obj', 'obl', 'obs', 'odont', 'oecon', 'off', 'ofl', 'okt', 'omg', 'omr', 'omtr', 'on', 'op.cit', 'opg', 'opl', 'opr', 'org', 'orig', 'osfr', 'osv', 'ovenn', 'ovenst', 'overs', 'ovf', 'oz',
            'p', 'p.a', 'p.b.v', 'p.c', 'p.m.v', 'p.p', 'p.s', 'p.t', 'p.v.a', 'p.v.c', 'par', 'partc', 'pass', 'pct', 'pd', 'pens', 'perf', 'pers', 'pg', 'pga', 'pgl', 'ph', 'ph.d', 'pharm', 'phil', 'pinx', 'pk', 'pkt', 'pl', 'pluskv', 'polit', 'polyt', 'port', 'pos', 'pp', 'pr', 'prc', 'priv', 'prod', 'prof', 'pron', 'præd', 'præf', 'præp', 'præs', 'præt', 'psych', 'pt', 'pæd',
            'q.e.d',
            'rad', 'red', 'ref', 'reg', 'regn', 'rel', 'rep', 'repr', 'rest', 'rk', 'russ',
            's', 's.br', 's.d', 's.e', 's.f', 's.m.b.a', 's.u', 's.å', 's/', 'sa', 'sb', 'sc', 'scient', 'sek', 'sek', 'sekr', 'sem', 'sen', 'sep', 'sept', 'sg', 'sign', 'sj', 'skr', 'skt', 'slutn', 'sml', 'smp', 'sms', 'smst', 'soc', 'soc', 'sort', 'sp', 'spec', 'spm', 'spr', 'spsk', 'st', 'stk', 'str', 'stud', 'subj', 'subst', 'suff', 'sup', 'suppl', 'sv', 'såk', 'sædv', 'sø',
            't', 't.h', 't.o.m', 't.v', 'tab', 'td', 'tdl', 'tdr', 'techn', 'tekn', 'temp', 'th', 'ti', 'tidl', 'tilf', 'tilh', 'till', 'tilsv', 'tjg', 'tlf', 'tlgr', 'to', 'tr', 'trp', 'tv', 'ty',
            'u', 'u.p', 'u.st', 'u.å', 'uafh', 'ubf', 'ubøj', 'udb', 'udbet', 'udd', 'udg', 'uds', 'ugtl', 'ulin', 'ult', 'undt', 'univ',
            'v.f', 'var', 'vb', 'vbsb', 'vedk', 'vedl', 'vedr', 'vejl', 'vh', 'vol', 'vs', 'vsa', 'vær',
            'zool',
            'årg', 'årh', 'årl',
            'ø.f', 'øv', 'øvr',
        ]
        NUMBER_ABBREVIATIONS = ['nr', 's']
        PREPOSITIVE_ABBREVIATIONS = ['adm', 'skt', 'dr', 'hr', 'fru', 'st']
pysbd/lang/deutsch.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
4
+ from pysbd.between_punctuation import BetweenPunctuation
5
+ from pysbd.lang.common import Common, Standard
6
+ from pysbd.punctuation_replacer import replace_punctuation
7
+ from pysbd.processor import Processor
8
+ from pysbd.utils import Text, Rule
9
+
10
+
11
class Deutsch(Common, Standard):
    """Sentence-boundary configuration for German (ISO 639-1 code ``de``)."""

    iso_code = 'de'

    class Numbers(Common.Numbers):
        """Shared number rules plus two for short numbers before a period."""

        # Rubular: http://rubular.com/r/hZxoyQwKT1
        NumberPeriodSpaceRule = Rule(r'(?<=\s\d)\.(?=\s)|(?<=\s\d\d)\.(?=\s)', '∯')

        # Rubular: http://rubular.com/r/ityNMwdghj
        NegativeNumberPeriodSpaceRule = Rule(r'(?<=-\d)\.(?=\s)|(?<=-\d\d)\.(?=\s)', '∯')

        All = Common.Numbers.All + [NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule]

    class Processor(Processor):
        """Processor that also protects the period in German dates."""

        def __init__(self, text, lang, char_span=False):
            super().__init__(text, lang, char_span)

        def replace_numbers(self):
            """Apply the number rules, then the date rule; return the text."""
            self.text = Text(self.text).apply(*self.lang.Numbers.All)
            self.replace_period_in_deutsch_dates()
            return self.text

        def replace_period_in_deutsch_dates(self):
            """Replace the period between a digit and a month name with '∯'.

            Optional whitespace may sit between the period and the month.
            Rewrites ``self.text`` in place.
            """
            MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August',
                      'September', 'Oktober', 'November', 'Dezember']
            for month in MONTHS:
                # Rubular: http://rubular.com/r/zlqgj7G5dA
                self.text = re.sub(r'(?<=\d)\.(?=\s*{month})'.format(month=month), '∯', self.text)

    class Abbreviation(Standard.Abbreviation):
        """German abbreviation lists."""

        ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt', 'univ.-prof', 'o.univ.-prof', 'ao.univ.prof', 'ass.prof', 'hon.prof', 'univ.-doz', 'univ.ass', 'stud.ass', 'projektass', 'ass', 'di', 'dipl.-ing', 'mag']
        PREPOSITIVE_ABBREVIATIONS = []
        NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp']

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer configured with German sentence starters."""

        SENTENCE_STARTERS = ("Am Auch Auf Bei Da Das Der Die Ein Eine Es Für Heute Ich Im In "
                             "Ist Jetzt Mein Mit Nach So Und Warum Was Wenn Wer Wie Wir").split(' ')

        def __init__(self, text, lang):
            super().__init__(text, lang)

        def replace(self):
            """Run the abbreviation pipeline on ``self.text`` and return it.

            Order: possessive and single-letter rules (upper and lower
            case), abbreviation search, multi-period abbreviations, AM/PM
            rules, and finally the abbreviation-as-sentence-boundary pass.
            """
            # Rubular: http://rubular.com/r/B4X33QKIL8
            SingleLowerCaseLetterRule = Rule(r'(?<=\s[a-z])\.(?=\s)', '∯')

            # Rubular: http://rubular.com/r/iUNSkCuso0
            SingleLowerCaseLetterAtStartOfLineRule = Rule(r'(?<=^[a-z])\.(?=\s)', '∯')
            self.text = Text(self.text).apply(
                self.lang.PossessiveAbbreviationRule,
                *self.lang.SingleLetterAbbreviationRules.All,
                SingleLowerCaseLetterRule,
                SingleLowerCaseLetterAtStartOfLineRule)

            self.text = self.search_for_abbreviations_in_string(self.text)
            self.replace_multi_period_abbreviations()
            self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
            self.text = self.replace_abbreviation_as_sentence_boundary()
            return self.text

        def scan_for_replacements(self, txt, am, index, character_array):
            """Replace the period between ``am`` and whitespace with '∯'.

            Args:
                txt: Text to rewrite.
                am: Matched abbreviation text; it is taken literally.
                index: Unused here; kept for signature compatibility.
                character_array: Unused here; kept for signature
                    compatibility.

            Returns:
                The rewritten text.
            """
            # re.escape keeps the dots inside abbreviations such as 'z.b'
            # literal; unescaped they would match any character. Escaping
            # does not change the width of the look-behind.
            pattern = r'(?<={am})\.(?=\s)'.format(am=re.escape(am))
            return re.sub(pattern, '∯', txt)

    class BetweenPunctuation(BetweenPunctuation):
        """Handles German-style double quotation marks."""

        def __init__(self, text):
            super().__init__(text)

        def sub_punctuation_between_double_quotes(self, txt):
            """Rewrite spans opened by „ or ,, and closed by “.

            The „ form takes precedence when both openers occur in ``txt``;
            ``txt`` is returned unchanged when neither occurs.
            """
            # Rubular: http://rubular.com/r/OdcXBsub0w
            BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = r',,(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)“'

            # Rubular: http://rubular.com/r/2UskIupGgP
            # SPLIT_DOUBLE_QUOTES_DE_REGEX = r'\A„(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)“'

            # Rubular: http://rubular.com/r/TkZomF9tTM
            BETWEEN_DOUBLE_QUOTES_DE_REGEX = r'„(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)“'

            if '„' in txt:
                return re.sub(BETWEEN_DOUBLE_QUOTES_DE_REGEX, replace_punctuation, txt)
            elif ',,' in txt:
                return re.sub(BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX,
                              replace_punctuation, txt)
            else:
                return txt
pysbd/lang/dutch.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
3
+ from pysbd.lang.common import Common, Standard
4
+
5
+ class Dutch(Common, Standard):
6
+
7
+ iso_code = 'nl'
8
+
9
+ class Abbreviation(Standard.Abbreviation):
10
+ ABBREVIATIONS = ['a.2d', 'a.a', 'a.a.j.b', 'a.f.t', 'a.g.j.b', 'a.h.v', 'a.h.w', 'a.hosp', 'a.i', 'a.j.b', 'a.j.t', 'a.m', 'a.m.r', 'a.p.m', 'a.p.r', 'a.p.t', 'a.s', 'a.t.d.f', 'a.u.b', 'a.v.a', 'a.w', 'aanbev', 'aanbev.comm', 'aant', 'aanv.st', 'aanw', 'vnw', 'aanw.vnw', 'abd', 'abm', 'abs', 'acc.& fisc', 'acc.act', 'acc.bedr.m', 'acc.bedr.t', "acc.thema's m.", 'acc.thema’s m', 'achterv', 'act.dr', 'act.dr.fam', 'act.fisc', 'act.soc', 'adm.akk', 'adm.besl', 'adm.lex', 'adm.onderr', 'adm.ov', 'adv', 'adv', 'gen', 'adv.bl', 'afd', 'afl', 'aggl.verord', 'agr', 'al', 'alg', 'alg.richts', 'amén', 'ann.dr', 'ann.dr.lg', 'ann.dr.sc.pol', 'ann.ét.eur', 'ann.fac.dr.lg', 'ann.jur.créd', 'ann.jur.créd.règl.coll', 'ann.not', 'ann.parl', 'ann.prat.comm', 'app', 'arb', 'aud', 'arbbl', 'arbh', 'arbit.besl', 'arbrb', 'arr', 'arr.cass', 'arr.r.v.st', 'arr.verbr', 'arrondrb', 'art', 'artw', 'aud', 'b', 'b', 'en w', 'b.&w', 'b.a', 'b.a.s', 'b.b.o', 'b.best.dep', 'b.br.ex', 'b.coll.fr.gem.comm', 'b.coll.vl.gem.comm', 'b.d.cult.r', 'b.d.gem.ex', 'b.d.gem.reg', 'b.dep', 'b.e.b', 'b.f.r', 'b.fr.gem.ex', 'b.fr.gem.reg', 'b.i.h', 'b.inl.j.d', 'b.inl.s.reg', 'b.j', 'b.l', 'b.lid br.ex', 'b.lid d.gem.ex', 'b.lid fr.gem.ex', 'b.lid vl.ex', 'b.lid w.gew.ex', 'b.o.z', 'b.prov.r', 'b.r.h', 'b.s', 'b.sr', 'b.stb', 'b.t.i.r', 'b.t.s.z', 'b.t.w.rev', 'b.v', 'b.ver.coll.gem.gem.comm', 'b.verg.r.b', 'b.versl', 'b.vl.ex', 'b.voorl.reg', 'b.w', 'b.w.gew.ex', 'b.z.d.g', 'b.z.v', 'bab', 'bank fin', 'bank fin.r', 'bedr.org', 'begins', 'beheersov', 'bekendm.comm', 'bel', 'bel.besch', 'bel.w.p', 'beleidsov', 'belg', 'grondw', 'benelux jur', 'ber', 'ber.w', 'besch', 'besl', 'beslagr', 'besluitwet nr', 'bestuurswet', 'bet', 'betr', 'betr', 'vnw', 'bevest', 'bew', 'bijbl', 'ind', 'eig', 'bijbl.n.bijdr', 'bijl', 'bijv', 'bijw', 'bijz.decr', 'bin.b', 'bkh', 'bl', 'blz', 'bm', 'bn', 'bnlx merkw', 'bnlx tek', 'bnlx uitl', 'rh', 'bnw', 'bouwr', 'br drs', 'br.parl', 'bs', 'bt drs', 'btw rev', 'bull', 
'bull.adm.pénit', 'bull.ass', 'bull.b.m.m', 'bull.bel', 'bull.best.strafinr', 'bull.bmm', 'bull.c.b.n', 'bull.c.n.c', 'bull.cbn', 'bull.centr.arb', 'bull.cnc', 'bull.contr', 'bull.doc.min.fin', 'bull.f.e.b', 'bull.feb', 'bull.fisc.fin.r', 'bull.i.u.m', 'bull.inf.ass.secr.soc', 'bull.inf.i.e.c', 'bull.inf.i.n.a.m.i', 'bull.inf.i.r.e', 'bull.inf.iec', 'bull.inf.inami', 'bull.inf.ire', 'bull.inst.arb', 'bull.ium', 'bull.jur.imm', 'bull.lég.b', 'bull.off', 'bull.trim.b.dr.comp', 'bull.us', 'bull.v.b.o', 'bull.vbo', 'bv i.o', 'bv', 'bw int.reg', 'bw', 'bxh', 'byz', 'c', 'c.& f', 'c.& f.p', 'c.a', 'c.a.-a', 'c.a.b.g', 'c.c', 'c.c.i', 'c.c.s', 'c.conc.jur', 'c.d.e', 'c.d.p.k', 'c.e', 'c.ex', 'c.f', 'c.h.a', 'c.i.f', 'c.i.f.i.c', 'c.j', 'c.l', 'c.n', 'c.o.d', 'c.p', 'c.pr.civ', 'c.q', 'c.r', 'c.r.a', 'c.s', 'c.s.a', 'c.s.q.n', 'c.v', 'c.v.a', 'c.v.o', 'ca', 'cadeaust', 'cah.const', 'cah.dr.europ', 'cah.dr.immo', 'cah.dr.jud', 'cal', '2d', 'cal', '3e', 'cal', 'rprt', 'cap', 'carg', 'cass', 'cass', 'verw', 'cert', 'cf', 'ch', 'chron', 'chron.d.s', 'chron.dr.not', 'cie', 'cie', 'verz.schr', 'cir', 'circ', 'circ.z', 'cit', 'cit.loc', 'civ', 'cl.et.b', 'cmt', 'co', 'cognoss.v', 'coll', 'v', 'b', 'colp.w', 'com', 'com', 'cas', 'com.v.min', 'comm', 'comm', 'v', 'comm.bijz.ov', 'comm.erf', 'comm.fin', 'comm.ger', 'comm.handel', 'comm.pers', 'comm.pub', 'comm.straf', 'comm.v', 'comm.v.en v', 'comm.venn', 'comm.verz', 'comm.voor', 'comp', 'compt.w', 'computerr', 'con.m', 'concl', 'concr', 'conf', 'confl.w', 'confl.w.huwbetr', 'cons', 'conv', 'coöp', 'ver', 'corr', 'corr.bl', 'cour de cass', 'cour.fisc', 'cour.immo', 'cridon', 'crim', 'cur', 'cur', 'crt', 'curs', 'd', 'd.-g', 'd.a', 'd.a.v', 'd.b.f', 'd.c', 'd.c.c.r', 'd.d', 'd.d.p', 'd.e.t', 'd.gem.r', 'd.h', 'd.h.z', 'd.i', 'd.i.t', 'd.j', 'd.l.r', 'd.m', 'd.m.v', 'd.o.v', 'd.parl', 'd.w.z', 'dact', 'dat', 'dbesch', 'dbesl', 'de advoc', 'de belg.acc', 'de burg.st', 'de gem', 'de gerechtsd', 'de venn', 'de verz', 'decr', 'decr.d', 
'decr.fr', 'decr.vl', 'decr.w', 'def', 'dep.opv', 'dep.rtl', 'derg', 'desp', 'det.mag', 'deurw.regl', 'dez', 'dgl', 'dhr', 'disp', 'diss', 'div', 'div.act', 'div.bel', 'dl', 'dln', 'dnotz', 'doc', 'hist', 'doc.jur.b', 'doc.min.fin', 'doc.parl', 'doctr', 'dpl', 'dpl.besl', 'dr', 'dr.banc.fin', 'dr.circ', 'dr.inform', 'dr.mr', 'dr.pén.entr', 'dr.q.m', 'drs', 'dtp', 'dwz', 'dyn', 'e cont', 'e', 'e.a', 'e.b', 'tek.mod', 'e.c', 'e.c.a', 'e.d', 'e.e', 'e.e.a', 'e.e.g', 'e.g', 'e.g.a', 'e.h.a', 'e.i', 'e.j', 'e.m.a', 'e.n.a.c', 'e.o', 'e.p.c', 'e.r.c', 'e.r.f', 'e.r.h', 'e.r.o', 'e.r.p', 'e.r.v', 'e.s.r.a', 'e.s.t', 'e.v', 'e.v.a', 'e.w', 'e&o.e', 'ec.pol.r', 'echos log', 'econ', 'ed', 'ed(s)', 'eeg verd.v', 'eex san s', 'eff', 'eg rtl', 'eig', 'eig.mag', 'eil', 'elektr', 'enmb', 'entr.et dr', 'enz', 'err', 'et al', 'et seq', 'etc', 'etq', 'eur', 'parl', 'eur.t.s', 'eur.verd.overdracht strafv', 'ev rechtsh', 'ev uitl', 'ev', 'evt', 'ex', 'ex.crim', 'exec', 'f', 'f.a.o', 'f.a.q', 'f.a.s', 'f.i.b', 'f.j.f', 'f.o.b', 'f.o.r', 'f.o.s', 'f.o.t', 'f.r', 'f.supp', 'f.suppl', 'fa', 'facs', 'fare act', 'fasc', 'fg', 'fid.ber', 'fig', 'fin.verh.w', 'fisc', 'fisc', 'tijdschr', 'fisc.act', 'fisc.koer', 'fl', 'form', 'foro', 'it', 'fr', 'fr.cult.r', 'fr.gem.r', 'fr.parl', 'fra', 'ft', 'g', 'g.a', 'g.a.v', 'g.a.w.v', 'g.g.d', 'g.m.t', 'g.o', 'g.omt.e', 'g.p', 'g.s', 'g.v', 'g.w.w', 'geb', 'gebr', 'gebrs', 'gec', 'gec.decr', 'ged', 'ged.st', 'gedipl', 'gedr.st', 'geh', 'gem', 'gem', 'en gew', 'gem', 'en prov', 'gem.gem.comm', 'gem.st', 'gem.stem', 'gem.w', 'gem.wet, gem.wet', 'gemeensch.optr', 'gemeensch.standp', 'gemeensch.strat', 'gemeent', 'gemeent.b', 'gemeent.regl', 'gemeent.verord', 'geol', 'geopp', 'gepubl', 'ger.deurw', 'ger.w', 'gerekw', 'gereq', 'gesch', 'get', 'getr', 'gev.m', 'gev.maatr', 'gew', 'ghert', 'gir.eff.verk', 'gk', 'gr', 'gramm', 'grat.w', 'gron,opm.en leermed', 'grootb.w', 'grs', 'grur ausl', 'grur int', 'grvm', 'grw', 'gst', 'gw', 'h.a', 'h.a.v.o', 'h.b.o', 
'h.e.a.o', 'h.e.g.a', 'h.e.geb', 'h.e.gestr', 'h.l', 'h.m', 'h.o', 'h.r', 'h.t.l', 'h.t.m', 'h.w.geb', 'hand', 'handelsn.w', 'handelspr', 'handelsr.w', 'handelsreg.w', 'handv', 'harv.l.rev', 'hc', 'herald', 'hert', 'herz', 'hfdst', 'hfst', 'hgrw', 'hhr', 'hist', 'hooggel', 'hoogl', 'hosp', 'hpw', 'hr', 'hr', 'ms', 'hr.ms', 'hregw', 'hrg', 'hst', 'huis.just', 'huisv.w', 'huurbl', 'hv.vn', 'hw', 'hyp.w', 'i.b.s', 'i.c', 'i.c.m.h', 'i.e', 'i.f', 'i.f.p', 'i.g.v', 'i.h', 'i.h.a', 'i.h.b', 'i.l.pr', 'i.o', 'i.p.o', 'i.p.r', 'i.p.v', 'i.pl.v', 'i.r.d.i', 'i.s.m', 'i.t.t', 'i.v', 'i.v.m', 'i.v.s', 'i.w.tr', 'i.z', 'ib', 'ibid', 'icip-ing.cons', 'iem', 'ind prop', 'indic.soc', 'indiv', 'inf', 'inf.i.d.a.c', 'inf.idac', 'inf.r.i.z.i.v', 'inf.riziv', 'inf.soc.secr', 'ing', 'ing', 'cons', 'ing.cons', 'inst', 'int', 'int', 'rechtsh', 'strafz', "int'l & comp.l.q.", 'interm', 'intern.fisc.act', 'intern.vervoerr', 'inv', 'inv', 'f', 'inv.w', 'inv.wet', 'invord.w', 'inz', 'ir', 'irspr', 'iwtr', 'j', 'j.-cl', 'j.c.b', 'j.c.e', 'j.c.fl', 'j.c.j', 'j.c.p', 'j.d.e', 'j.d.f', 'j.d.s.c', 'j.dr.jeun', 'j.j.d', 'j.j.p', 'j.j.pol', 'j.l', 'j.l.m.b', 'j.l.o', 'j.ordre pharm', 'j.p.a', 'j.r.s', 'j.t', 'j.t.d.e', 'j.t.dr.eur', 'j.t.o', 'j.t.t', 'jaarl', 'jb.hand', 'jb.kred', 'jb.kred.c.s', 'jb.l.r.b', 'jb.lrb', 'jb.markt', 'jb.mens', 'jb.t.r.d', 'jb.trd', 'jeugdrb', 'jeugdwerkg.w', 'jg', 'jis', 'jl', 'journ.jur', 'journ.prat.dr.fisc.fin', 'journ.proc', 'jrg', 'jur', 'jur.comm.fl', 'jur.dr.soc.b.l.n', 'jur.f.p.e', 'jur.fpe', 'jur.niv', 'jur.trav.brux', 'jura falc', 'jurambt', 'jv.cass', 'jv.h.r.j', 'jv.hrj', 'jw', 'k', 'k', 'en m', 'k.b', 'k.g', 'k.k', 'k.m.b.o', 'k.o.o', 'k.v.k', 'k.v.v.v', 'kadasterw', 'kaderb', 'kador', 'kbo-nr', 'kg', 'kh', 'kiesw', 'kind.bes.v', 'kkr', 'koopv', 'kr', 'krankz.w', 'ksbel', 'kt', 'ktg', 'ktr', 'kvdm', 'kw.r', 'kymr', 'kzr', 'kzw', 'l', 'l.b', 'l.b.o', 'l.bas', 'l.c', 'l.gew', 'l.j', 'l.k', 'l.l', 'l.o', 'l.r.b', 'l.u.v.i', 'l.v.r', 'l.v.w', 'l.w', 
"l'exp.-compt.b.", 'l’exp.-compt.b', 'landinr.w', 'landscrt', 'larcier cass', 'lat', 'law.ed', 'lett', 'levensverz', 'lgrs', 'lidw', 'limb.rechtsl', 'lit', 'litt', 'liw', 'liwet', 'lk', 'll', 'll.(l.)l.r', 'loonw', 'losbl', 'ltd', 'luchtv', 'luchtv.w', 'm', 'm', 'not', 'm.a.v.o', 'm.a.w', 'm.b', 'm.b.o', 'm.b.r', 'm.b.t', 'm.d.g.o', 'm.e.a.o', 'm.e.r', 'm.h', 'm.h.d', 'm.i.v', 'm.j.t', 'm.k', 'm.m', 'm.m.a', 'm.m.h.h', 'm.m.v', 'm.n', 'm.not.fisc', 'm.nt', 'm.o', 'm.r', 'm.s.a', 'm.u.p', 'm.v.a', 'm.v.h.n', 'm.v.t', 'm.z', 'maatr.teboekgest.luchtv', 'maced', 'mand', 'max', 'mbl.not', 'me', 'med', 'med', 'v.b.o', 'med.b.u.f.r', 'med.bufr', 'med.vbo', 'meerv', 'meetbr.w', 'mém.adm', 'mgr', 'mgrs', 'mhd', 'mi.verantw', 'mil', 'mil.bed', 'mil.ger', 'min', 'min', 'aanbev', 'min', 'circ', 'min', 'fin', 'min.j.omz', 'min.just.circ', 'mitt', 'mnd', 'mod', 'mon', 'monde ass', 'mouv.comm', 'mr', 'ms', 'muz', 'mv', 'mva ii inv', 'mva inv', 'n cont', 'n', 'chr', 'n.a', 'n.a.g', 'n.a.v', 'n.b', 'n.c', 'n.chr', 'n.d', 'n.d.r', 'n.e.a', 'n.g', 'n.h.b.c', 'n.j', 'n.j.b', 'n.j.w', 'n.l', 'n.m', 'n.m.m', 'n.n', 'n.n.b', 'n.n.g', 'n.n.k', 'n.o.m', 'n.o.t.k', 'n.rapp', 'n.tijd.pol', 'n.v', 'n.v.d.r', 'n.v.d.v', 'n.v.o.b', 'n.v.t', 'nat.besch.w', 'nat.omb', 'nat.pers', 'ned.cult.r', 'neg.verkl', 'nhd', 'nieuw arch', 'wisk', 'njcm-bull', 'nl', 'nnd', 'no', 'not.fisc.m', 'not.w', 'not.wet', 'nr', 'nrs', 'nste', 'nt', 'numism', 'o', 'o.a', 'o.b', 'o.c', 'o.g', 'o.g.v', 'o.i', 'o.i.d', 'o.m', 'o.o', 'o.o.d', 'o.o.v', 'o.p', 'o.r', 'o.regl', 'o.s', 'o.t.s', 'o.t.t', 'o.t.t.t', 'o.t.t.z', 'o.tk.t', 'o.v.t', 'o.v.t.t', 'o.v.tk.t', 'o.v.v', 'ob', 'obsv', 'octr', 'octr.gem.regl', 'octr.regl', 'oe', 'oecd mod', 'off.pol', 'ofra', 'ohd', 'omb', 'omnia frat', 'omnil', 'omz', 'on.ww', 'onderr', 'onfrank', 'onteig.w', 'ontw', 'b.w', 'onuitg', 'onz', 'oorl.w', 'op.cit', 'opin.pa', 'opm', 'or', 'ord.br', 'ord.gem', 'ors', 'orth', 'os', 'osm', 'ov', 'ov.w.i', 'ov.w.ii', 'ov.ww', 'overg.w', 'overw', 
'ovkst', 'ow kadasterw', 'oz', 'p', 'p.& b', 'p.a', 'p.a.o', 'p.b.o', 'p.e', 'p.g', 'p.j', 'p.m', 'p.m.a', 'p.o', 'p.o.j.t', 'p.p', 'p.v', 'p.v.s', 'pachtw', 'pag', 'pan', 'pand.b', 'pand.pér', 'parl.gesch', 'parl.gesch', 'inv', 'parl.st', 'part.arb', 'pas', 'pasin', 'pat', 'pb.c', 'pb.l', 'pens', 'pensioenverz', 'per.ber.i.b.r', 'per.ber.ibr', 'pers', 'st', 'pft', 'pg wijz.rv', 'pk', 'pktg', 'pli jur', 'plv', 'po', 'pol', 'pol.off', 'pol.r', 'pol.w', 'politie j', 'postbankw', 'postw', 'pp', 'pr', 'preadv', 'pres', 'prf', 'prft', 'prg', 'prijz.w', 'pro jus', 'proc', 'procesregl', 'prof', 'prot', 'prov', 'prov.b', 'prov.instr.h.m.g', 'prov.regl', 'prov.verord', 'prov.w', 'publ', 'publ.cour eur.d.h', 'publ.eur.court h.r', 'pun', 'pw', 'q.b.d', 'q.e.d', 'q.q', 'q.r', 'r', 'r.a.b.g', 'r.a.c.e', 'r.a.j.b', 'r.b.d.c', 'r.b.d.i', 'r.b.s.s', 'r.c', 'r.c.b', 'r.c.d.c', 'r.c.j.b', 'r.c.s.j', 'r.cass', 'r.d.c', 'r.d.i', 'r.d.i.d.c', 'r.d.j.b', 'r.d.j.p', 'r.d.p.c', 'r.d.s', 'r.d.t.i', 'r.e', 'r.f.s.v.p', 'r.g.a.r', 'r.g.c.f', 'r.g.d.c', 'r.g.f', 'r.g.z', 'r.h.a', 'r.i.c', 'r.i.d.a', 'r.i.e.j', 'r.i.n', 'r.i.s.a', 'r.j.d.a', 'r.j.i', 'r.k', 'r.l', 'r.l.g.b', 'r.med', 'r.med.rechtspr', 'r.n.b', 'r.o', 'r.orde apoth', 'r.ov', 'r.p', 'r.p.d.b', 'r.p.o.t', 'r.p.r.j', 'r.p.s', 'r.r.d', 'r.r.s', 'r.s', 'r.s.v.p', 'r.stvb', 'r.t.d.f', 'r.t.d.h', 'r.t.l', 'r.trim.dr.eur', 'r.v.a', 'r.verkb', 'r.w', 'r.w.d', 'rap.ann.c.a', 'rap.ann.c.c', 'rap.ann.c.e', 'rap.ann.c.s.j', 'rap.ann.ca', 'rap.ann.cass', 'rap.ann.cc', 'rap.ann.ce', 'rap.ann.csj', 'rapp', 'rb', 'rb.kh', 'rb.van kh', 'rdn', 'rdnr', 're.pers', 'rec', 'rec.c.i.j', 'rec.c.j.c.e', 'rec.cij', 'rec.cjce', 'rec.cour eur.d.h', 'rec.gén.enr.not', 'rec.lois decr.arr', 'rechtsk.t', 'rechtspl.zeem', 'rechtspr.arb.br', 'rechtspr.b.f.e', 'rechtspr.bfe', 'rechtspr.soc.r.b.l.n', 'recl.reg', 'rect', 'red', 'reg', 'reg.huiz.bew', 'reg.w', 'registr.w', 'regl', 'regl', 'r.v.k', 'regl.besl', 'regl.onderr', 'regl.r.t', 'rep', 'rep.eur.court h.r', 
'rép.fisc', 'rép.not', 'rep.r.j', 'rep.rj', 'req', 'res', 'resp', 'rev', 'rev', 'de dr', 'comp', 'rev', 'trim', 'de dr', 'civ', 'rev', 'trim', 'de dr', 'comm', 'rev.acc.trav', 'rev.adm', 'rev.b.compt', 'rev.b.dr.const', 'rev.b.dr.intern', 'rev.b.séc.soc', 'rev.banc.fin', 'rev.comm', 'rev.cons.prud', 'rev.dr.b', 'rev.dr.commun', 'rev.dr.étr', 'rev.dr.fam', 'rev.dr.intern.comp', 'rev.dr.mil', 'rev.dr.min', 'rev.dr.pén', 'rev.dr.pén.mil', 'rev.dr.rur', 'rev.dr.u.l.b', 'rev.dr.ulb', 'rev.exp', 'rev.faill', 'rev.fisc', 'rev.gd', 'rev.hist.dr', 'rev.i.p.c', 'rev.ipc', 'rev.not.b', 'rev.prat.dr.comm', 'rev.prat.not.b', 'rev.prat.soc', 'rev.rec', 'rev.rw', 'rev.trav', 'rev.trim.d.h', 'rev.trim.dr.fam', 'rev.urb', 'richtl', 'riv.dir.int', 'riv.dir.int."le priv', 'riv.dir.int.priv.proc', 'rk', 'rln', 'roln', 'rom', 'rondz', 'rov', 'rtl', 'rubr', 'ruilv.wet', 'rv.verdr', 'rvkb', 's', 's', 'en s', 's.a', 's.b.n', 's.ct', 's.d', 's.e.c', 's.e.et.o', 's.e.w', 's.exec.rept', 's.hrg', 's.j.b', 's.l', 's.l.e.a', 's.l.n.d', 's.p.a', 's.s', 's.t', 's.t.b', 's.v', 's.v.p', 'samenw', 'sc', 'sch', 'scheidsr.uitspr', 'schepel.besl', 'secr.comm', 'secr.gen', 'sect.soc', 'sess', 'cas', 'sir', 'soc', 'best', 'soc', 'handv', 'soc', 'verz', 'soc.act', 'soc.best', 'soc.kron', 'soc.r', 'soc.sw', 'soc.weg', 'sofi-nr', 'somm', 'somm.ann', 'sp.c.c', 'sr', 'ss', 'st.doc.b.c.n.a.r', 'st.doc.bcnar', 'st.vw', 'stagever', 'stas', 'stat', 'stb', 'stbl', 'stcrt', 'stichting i.v', 'stud.dipl', 'su', 'subs', 'subst', 'succ.w', 'suppl', 'sv', 'sw', 't', 't.a', 't.a.a', 't.a.n', 't.a.p', 't.a.s.n', 't.a.v', 't.a.v.w', 't.aann', 't.acc', 't.agr.r', 't.app', 't.b.b.r', 't.b.h', 't.b.m', 't.b.o', 't.b.p', 't.b.r', 't.b.s', 't.b.v', 't.bankw', 't.belg.not', 't.desk', 't.e.m', 't.e.p', 't.f.r', 't.fam', 't.fin.r', 't.g.r', 't.g.t', 't.g.v', 't.gem', 't.gez', 't.huur', 't.i.n', 't.in b.z', 't.j.k', 't.l.l', 't.l.v', 't.m', 't.m.r', 't.m.w', 't.mil.r', 't.mil.strafr', 't.not', 't.o', 't.o.r.b', 't.o.v', 't.ontv', 
't.orde geneesh', 't.p.r', 't.pol', 't.r', 't.r.d.& i', 't.r.g', 't.r.o.s', 't.r.v', 't.s.r', 't.strafr', 't.t', 't.u', 't.v.c', 't.v.g', 't.v.m.r', 't.v.o', 't.v.v', 't.v.v.d.b', 't.v.w', 't.verz', 't.vred', 't.vreemd', 't.w', 't.w.k', 't.w.v', 't.w.v.r', 't.wrr', 't.z', 't.z.t', 't.z.v', 'taalk', 'tar.burg.z', 'td', 'techn', 'telecomm', 'toel', 'toel.st.v.w', 'toep', 'toep.regl', 'tom', 'top', 'trans.b', 'transp.r', 'trav.com.ét.et lég.not', 'trb', 'trib', 'trib.civ', 'trib.gr.inst', 'ts', 'ts', 'best', 'ts', 'verv', 'turnh.rechtsl', 'tvpol', 'tvpr', 'tvrechtsgesch', 'tw', 'u', 'u.a', 'u.a.r', 'u.a.v', 'u.c', 'u.c.c', 'u.g', 'u.p', 'u.s', 'u.s.d.c', 'uitdr', 'uitl.w', 'uitv.besch.div.b', 'uitv.besl', 'uitv.besl', 'succ.w', 'uitv.besl.bel.rv', 'uitv.besl.l.b', 'uitv.reg', 'inv.w', 'uitv.reg.bel.d', 'uitv.reg.afd.verm', 'uitv.reg.lb', 'uitv.reg.succ.w', 'univ', 'univ.verkl', 'v', 'v', 'chr', 'v.& f', 'v.a', 'v.a.v', 'v.bp prot', 'v.c', 'v.chr', 'v.h', 'v.huw.verm', 'v.i', 'v.i.o', 'v.k.a', 'v.m', 'v.o.f', 'v.o.n', 'v.onderh.verpl', 'v.p', 'v.r', 'v.s.o', 'v.t.t', 'v.t.t.t', 'v.tk.t', 'v.toep.r.vert', 'v.v.b', 'v.v.g', 'v.v.t', 'v.v.t.t', 'v.v.tk.t', 'v.w.b', 'v.z.m', 'vb', 'vb.bo', 'vbb', 'vc', 'vd', 'veldw', 'ver.k', 'ver.verg.gem', 'gem.comm', 'verbr', 'verd', 'verdr', 'verdr.v', 'verdrag benel.i.z', 'tek.mod', 'verenw', 'verg', 'verg.fr.gem', 'comm', 'verkl', 'verkl.herz.gw', 'verl', 'deelw', 'vern', 'verord', 'vers.r', 'versch', 'versl.c.s.w', 'versl.csw', 'vert', 'verw', 'verz', 'verz.w', 'verz.wett.besl', 'verz.wett.decr.besl', 'vgl', 'vid', 'vigiles jb', 'viss.w', 'vl.parl', 'vl.r', 'vl.t.gez', 'vl.w.reg', 'vl.w.succ', 'vlg', 'vn', 'vnl', 'vnw', 'vo', 'vo.bl', 'voegw', 'vol', 'volg', 'volt', 'deelw', 'voorl', 'voorz', 'vord.w', 'vorst.d', 'vr', 'en antw', 'vred', 'vrg', 'vnw', 'vrijgrs', 'vs', 'vt', 'vvsr jb', 'vw', 'vz', 'vzngr', 'vzr', 'w', 'w.a', 'w.b.r', 'w.c.h', 'w.conf.huw', 'w.conf.huwelijksb', 'w.consum.kr', 'w.f.r', 'w.g', 'w.gelijke beh', 
'w.gew.r', 'w.ident.pl', 'w.just.doc', 'w.kh', 'w.l.r', 'w.l.v', 'w.mil.straf.spr', 'w.n', 'w.not.ambt', 'w.o', 'w.o.d.huurcomm', 'w.o.d.k', 'w.openb.manif', 'w.parl', 'w.r', 'w.reg', 'w.succ', 'w.u.b', 'w.uitv.pl.verord', 'w.v', 'w.v.k', 'w.v.m.s', 'w.v.r', 'w.v.w', 'w.venn', 'wac', 'wd', 'wet a.b', 'wet bel.rv', 'wet c.a.o', 'wet c.o', 'wet div.bel', 'wet ksbel', 'wet l.v', 'wetb', 'n.v.h', 'wgb', 'winkelt.w', 'wisk', 'wka-verkl', 'wnd', 'won.w', 'woningw', 'woonr.w', 'wrr', 'wrr.ber', 'wrsch', 'ws', 'wsch', 'wsr', 'wtvb', 'ww', 'x.d', 'z cont', 'z.a', 'z.g', 'z.i', 'z.j', 'z.o.z', 'z.p', 'z.s.m', 'zesde richtl', 'zg', 'zgn', 'zn', 'znw', 'zr', 'zr', 'ms', 'zr.ms']
11
+ PREPOSITIVE_ABBREVIATIONS = []
12
+ NUMBER_ABBREVIATIONS = []
pysbd/lang/english.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
3
+ from pysbd.lang.common import Common, Standard
4
+
5
class English(Common, Standard):
    """Language profile for English (ISO 639-1 code ``en``).

    Anything not overridden here is inherited from ``Common`` and
    ``Standard``.
    """

    iso_code = 'en'

    class AbbreviationReplacer(AbbreviationReplacer):
        """English-specific settings for the shared ``AbbreviationReplacer``."""

        # NOTE(review): presumably capitalised words that mark the start of a
        # new sentence when they follow an abbreviation -- confirm against
        # the base AbbreviationReplacer, which is not visible from here.
        SENTENCE_STARTERS = [
            'A', 'Being', 'Did', 'For', 'He', 'How', 'However', 'I', 'In',
            'It', 'Millions', 'More', 'She', 'That', 'The', 'There', 'They',
            'We', 'What', 'When', 'Where', 'Who', 'Why',
        ]
pysbd/lang/french.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
3
+ from pysbd.lang.common import Common, Standard
4
+
5
class French(Common, Standard):
    """Language profile for French (ISO 639-1 code ``fr``).

    Anything not overridden here is inherited from ``Common`` and
    ``Standard``.
    """

    iso_code = 'fr'

    class AbbreviationReplacer(AbbreviationReplacer):
        """French-specific settings for the shared ``AbbreviationReplacer``."""

        # No sentence-starter words are configured for French.
        SENTENCE_STARTERS = []

    class Abbreviation(Standard.Abbreviation):
        """French abbreviation tables, replacing those of ``Standard``."""

        # Entries are lower-case and written without their trailing period.
        # The repeated 'co' and 'ltd' entries are kept exactly as they were.
        ABBREVIATIONS = [
            'a.c.n', 'a.m', 'al', 'ann', 'apr', 'art', 'auj', 'av', 'b.p',
            'boul', 'c.-à-d', 'c.n', 'c.n.s', 'c.p.i', 'c.q.f.d', 'c.s',
            'ca', 'cf', 'ch.-l', 'chap', 'co', 'co', 'contr', 'dir', 'e.g',
            'e.v', 'env', 'etc', 'ex', 'fasc', 'fig', 'fr', 'fém', 'hab',
            'i.e', 'ibid', 'id', 'inf', 'l.d', 'lib', 'll.aa', 'll.aa.ii',
            'll.aa.rr', 'll.aa.ss', 'll.ee', 'll.mm', 'll.mm.ii.rr',
            'loc.cit', 'ltd', 'ltd', 'masc', 'mm', 'ms', 'n.b', 'n.d',
            'n.d.a', 'n.d.l.r', 'n.d.t', 'n.p.a.i', 'n.s', 'n/réf', 'nn.ss',
            'p.c.c', 'p.ex', 'p.j', 'p.s', 'pl', 'pp', 'r.-v', 'r.a.s',
            'r.i.p', 'r.p', 's.a', 's.a.i', 's.a.r', 's.a.s', 's.e', 's.m',
            's.m.i.r', 's.s', 'sec', 'sect', 'sing', 'sq', 'sqq', 'ss',
            'suiv', 'sup', 'suppl', 't.s.v.p', 'tél', 'vb', 'vol', 'vs',
            'x.o', 'z.i', 'éd',
        ]
        PREPOSITIVE_ABBREVIATIONS = []
        NUMBER_ABBREVIATIONS = []
pysbd/lang/greek.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
3
+ from pysbd.lang.common import Common, Standard
4
+
5
class Greek(Common, Standard):
    """Language profile for Greek (ISO 639-1 code ``el``).

    Anything not overridden here is inherited from ``Common`` and
    ``Standard``.
    """

    iso_code = 'el'

    # Characters listed as sentence punctuation; note that ';' is included.
    Punctuations = ['.', '!', ';', '?']

    # Lazily match up to and including the first '.', ';', '!' or '?';
    # the second alternative picks up trailing text with no terminator.
    SENTENCE_BOUNDARY_REGEX = r'.*?[\.;!\?]|.*?$'

    class AbbreviationReplacer(AbbreviationReplacer):
        """Greek-specific settings for the shared ``AbbreviationReplacer``."""

        # No sentence-starter words are configured for Greek.
        SENTENCE_STARTERS = []
pysbd/lang/hindi.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
3
+ from pysbd.lang.common import Common, Standard
4
+
5
class Hindi(Common, Standard):
    """Language profile for Hindi (ISO 639-1 code ``hi``).

    Anything not overridden here is inherited from ``Common`` and
    ``Standard``.
    """

    iso_code = 'hi'

    # Characters listed as sentence punctuation: the danda, its ASCII
    # stand-in '|', and '.', '!' and '?'.
    Punctuations = ['।', '|', '.', '!', '?']

    # Lazily match up to and including the first '।', '|', '!' or '?';
    # the second alternative picks up trailing text with no terminator.
    # NOTE(review): '.' appears in Punctuations but not in this character
    # class -- confirm whether that asymmetry is intended.
    SENTENCE_BOUNDARY_REGEX = r'.*?[।\|!\?]|.*?$'

    class AbbreviationReplacer(AbbreviationReplacer):
        """Hindi-specific settings for the shared ``AbbreviationReplacer``."""

        # No sentence-starter words are configured for Hindi.
        SENTENCE_STARTERS = []
pysbd/lang/italian.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
3
+ from pysbd.lang.common import Common, Standard
4
+
5
+ class Italian(Common, Standard):
6
+
7
+ iso_code = 'it'
8
+
9
+ class AbbreviationReplacer(AbbreviationReplacer):
10
+ SENTENCE_STARTERS = []
11
+
12
+ class Abbreviation(Standard.Abbreviation):
13
+ ABBREVIATIONS = ['1°', 'a.c', 'a.c/a', 'a.cam', 'a.civ', 'a.cor', 'a.d.r', 'a.gov', 'a.mil', 'a.mon', 'a.smv', 'a.v', 'a/a', 'a/c', 'a/i', 'aa', 'aaaa', 'aaal', 'aacst', 'aamct', 'aams', 'aar', 'aato', 'ab', 'abbigl', 'abbrev', 'abc', 'abi', 'abl', 'abm', 'abr', 'abs', 'absp', 'ac', 'acam', 'acb', 'acbi', 'acc', 'accorc', 'accr', 'acd', 'ace', 'acec', 'acep', 'aci', 'acli', 'acp', 'acro', 'acsit', 'actl', 'ad', 'ad.mil', 'ada', 'adap', 'adatt', 'adc', 'add', 'adei', 'adeion', 'adhd', 'adi', 'adisco', 'adj', 'adm', 'adp', 'adr', 'ads', 'adsi', 'adsl', 'adv', 'ae.b', 'aefi', 'aer', 'aerodin', 'aeron', 'afa', 'afc', 'afci', 'affl', 'afi', 'afic', 'afm', 'afp', 'ag', 'agcm', 'agcom', 'age', 'agecs', 'agesci', 'agg', 'agip', 'agis', 'agm', 'ago', 'agr', 'agric', 'agt', 'ai', 'aia', 'aiab', 'aiac', 'aiace', 'aiap', 'aias', 'aiat', 'aib', 'aic', 'aica', 'aicel', 'aici', 'aics', 'aid', 'aida', 'aidaa', 'aidac', 'aidama', 'aidda', 'aidim', 'aido', 'aids', 'aies', 'aif', 'aih', 'aiip', 'aimi', 'aip', 'aipsc', 'airi', 'ais', 'aisa', 'aism', 'aiss', 'aissca', 'aitc', 'aiti', 'aitr', 'aits', 'aka', 'al', 'alai', 'alch', 'alg', 'ali', 'alim', 'all', 'allev', 'allus', 'alp', 'alq', 'alt', 'am', 'ama', 'amaci', 'amag', 'amami', 'amc', 'ammec', 'amn', 'ampas', 'amps', 'an', 'ana', 'anaai', 'anac', 'anaci', 'anad', 'anai', 'anaoo', 'anart', 'anat', 'anat. 
comp', 'ancci', 'anci', 'ancip', 'ancsa', 'andit', 'anec', 'anee', 'anem', 'anes', 'anffas', 'ani', 'ania', 'anica', 'anie', 'animi', 'anis', 'anisc', 'anm', 'anmfit', 'anmig', 'anmil', 'anmli', 'anms', 'anpa', 'anpas', 'anpci', 'anpe', 'anpi', 'ansi', 'ansv', 'ant', 'anta', 'antifr', 'antlo', 'anton', 'antrop', 'anusca', 'anvi', 'anx', 'ao', 'ap', 'apa', 'apd', 'apea', 'apec', 'apet', 'api', 'apos', 'app', 'app.sc', 'apr', 'aps', 'apt', 'aq', 'ar', 'ar.ind', 'ar.rep', 'arald', 'arame', 'arc', 'arch', 'archeol', 'arci', 'ardsu', 'are', 'arg', 'aritm', 'arpa', 'arpat', 'arred', 'arrt', 'arsia', 'art', 'arti min', 'artig', 'artigl', 'artt', 'as', 'asa', 'asae', 'asc', 'asci', 'ascii', 'ascom', 'ascop', 'asd', 'ase', 'asf', 'asfer', 'asg', 'asic', 'asifa', 'asl', 'asmdc', 'asmi', 'asp', 'aspic', 'aspp', 'assi', 'assic', 'assol', 'asst', 'aster', 'astr', 'astrol', 'astron', 'at', 'ata', 'atb', 'atic', 'atm', 'ats', 'att', 'attrav', 'atv', 'au', 'auc', 'aus', 'auser', 'aut', 'autom', 'av', 'avi', 'avis', 'avo', 'avv', 'avvers', 'awb', 'awdp', 'az', 'azh', 'b.a', 'b2b', 'b2c', 'ba', 'bafta', 'bal', 'ball', 'ban', 'banc', 'bar', 'bart', 'bas', 'bat', 'batt', 'bban', 'bbc', 'bbl', 'bbs', 'bbtc', 'bcc', 'bce', 'bcf', 'bdf', 'bei', 'bep', 'bers', 'bg', 'bi', 'bibl', 'bic', 'bioch', 'biol', 'bl', 'bld', 'bldg', 'blpc', 'bm', 'bmps', 'bmw', 'bn', 'bna', 'bncf', 'bncrm', 'bni', 'bnl', 'bo', 'bot', 'bpl', 'bpm', 'bpn', 'bpr', 'br', 'brd', 'bre', 'bric', 'brig', 'brig.ca', 'brig.gen', 'bros', 'bs', 'bsc', 'bsp', 'bsu', 'bt', 'btc', 'btg', 'btg.l', 'btr', 'bts', 'bu', 'bur', 'bz', 'c.a', 'c.a.p', 'c.c.p', 'c.cost', 'c.d a', 'c.d', 'c.le', 'c.m', 'c.opv', 'c.p', 'c.s', 'c.v', 'c.v.d', 'c/a', 'c/c', 'c/pag', 'ca', 'ca.rep', 'ca.sm', 'ca.sz', 'ca.uf', 'caaf', 'cab', 'cad', 'cae', 'cai', 'cal', 'cam', 'cap', 'capol', 'capt', 'car', 'car.sc', 'carat', 'card', 'cas', 'casaca', 'casd', 'cass.civ', 'cat', 'caus', 'cav', 'cavg', 'cb', 'cbd', 'cbr', 'cbs', 'cc', 'cca', 'ccap', 'ccda', 
'ccdp', 'ccee', 'cciaa', 'ccie', 'ccip', 'cciss', 'ccna', 'ccnl', 'ccnp', 'ccpb', 'ccs', 'ccsp', 'cctld', 'cctv', 'ccv', 'cd', 'cda', 'cdma', 'cdo', 'cdpd', 'cdr', 'cds', 'cdw', 'ce', 'ced', 'cee', 'cei', 'cemat', 'cenelec', 'centr', 'cepis', 'ceps', 'cept', 'cerit', 'cese', 'cesis', 'cesvot', 'cet', 'cf', 'cfa', 'cfr', 'cg', 'cgi', 'cgil', 'cgs', 'ch', 'chf', 'chim', 'chim. ind', 'chir', 'ci', 'ci-europa', 'ciber', 'cicae', 'cid', 'cie', 'cif', 'cifej', 'cig', 'cigs', 'cii', 'cilea', 'cilo', 'cim', 'cime', 'cin', 'cinit', 'cio', 'cipe', 'cirm', 'cisal', 'ciscs', 'cisd', 'cisl', 'cism', 'citol', 'cl', 'class', 'cli', 'cm', 'cmdr', 'cme', 'cmo', 'cmr', 'cms', 'cmyk', 'cm²', 'cm³', 'cn', 'cna', 'cnb', 'cnc', 'cnel', 'cngei', 'cni', 'cnipa', 'cnit', 'cnn', 'cnr', 'cns', 'cnt', 'cnvvf', 'co', 'co.ing', 'co.sa', 'cobas', 'coc', 'cod', 'cod. civ', 'cod. deont. not', 'cod. pen', 'cod. proc. civ', 'cod. proc. pen', 'codec', 'coi', 'col', 'colf', 'coll', 'com', 'comdr', 'comm', 'comp', 'compar', 'compl', 'con', 'conai', 'conc', 'concl', 'condiz', 'confetra', 'confitarma', 'confr', 'cong', 'congeav', 'congiunt', 'coni', 'coniug', 'consec', 'consob', 'contab', 'contr', 'coreco', 'corp', 'corr', 'correl', 'corrisp', 'cosap', 'cospe', 'cost', 'costr', 'cpc', 'cpdel', 'cpe', 'cpi', 'cpl', 'cpt', 'cpu', 'cr', 'cral', 'credem', 'crf', 'cri', 'cric', 'cristall', 'crm', 'cro', 'cron', 'crsm', 'crt', 'cs', 'csa', 'csai', 'csc', 'csm', 'csn', 'css', 'ct', 'ctc', 'cti', 'ctr', 'ctsis', 'cuc', 'cud', 'cun', 'cup', 'cusi', 'cvb', 'cvbs', 'cwt', 'cz', 'd', 'd.c', 'd.i.a', 'dab', 'dac', 'dam', 'dams', 'dat', 'dau', 'db', 'dbms', 'dc', 'dca', 'dccc', 'dda', 'ddp', 'ddr', 'ddt', 'dea', 'decoraz', 'dect', 'dek', 'denom', 'deriv', 'derm', 'determ', 'df', 'dfp', 'dg', 'dga', 'dhcp', 'di', 'dia', 'dial', 'dic', 'dicomac', 'dif', 'difett', 'dig. iv', 'digos', 'dimin', 'dimostr', 'din', 'dipart', 'diplom', 'dir', 'dir. amm', 'dir. can', 'dir. civ', 'dir. d. lav', 'dir. giur', 'dir. 
internaz', 'dir. it', 'dir. pen', 'dir. priv', 'dir. proces', 'dir. pub', 'dir. rom', 'disus', 'diy', 'dl', 'dlf', 'dm', 'dme', 'dmf', 'dmo', 'dmoz', 'dm²', 'dm³', 'dnr', 'dns', 'doa', 'doc', 'docg', 'dom', 'dop', 'dos', 'dott', 'dpa', 'dpi', 'dpl', 'dpof', 'dps', 'dpt', 'dr', 'dra', 'drm', 'drs', 'dry pt', 'ds', 'dslam', 'dspn', 'dss', 'dtc', 'dtmf', 'dtp', 'dts', 'dv', 'dvb', 'dvb-t', 'dvd', 'dvi', 'dwdm', 'e.g', 'e.p.c', 'ead', 'eafrd', 'ean', 'eap', 'easw', 'eb', 'eban', 'ebr', 'ebri', 'ebtn', 'ecc', 'eccl', 'ecdl', 'ecfa', 'ecff', 'ecg', 'ecm', 'econ', 'econ. az', 'econ. dom', 'econ. pol', 'ecpnm', 'ed', 'ed agg', 'edge', 'edi', 'edil', 'edit', 'ef', 'efa', 'efcb', 'efp', 'efsa', 'efta', 'eg', 'egiz', 'egl', 'egr', 'ei', 'eisa', 'elab', 'elettr', 'elettron', 'ellitt', 'emap', 'emas', 'embr', 'emdr', 'emi', 'emr', 'en', 'enaip', 'enal', 'enaoli', 'enapi', 'encat', 'enclic', 'enea', 'enel', 'eni', 'enigm', 'enit', 'enol', 'enpa', 'enpaf', 'enpals', 'enpi', 'enpmf', 'ens', 'entom', 'epd', 'epigr', 'epirbs', 'epl', 'epo', 'ept', 'erc', 'ercom', 'ermes', 'erp', 'es', 'esa', 'escl', 'esist', 'eso', 'esp', 'estens', 'estr. 
min', 'etacs', 'etf', 'eti', 'etim', 'etn', 'etol', 'eu', 'eufem', 'eufic', 'eula', 'eva®', 'f.a', 'f.b', 'f.m', 'f.p', 'fa', 'fabi', 'fac', 'facl', 'facs', 'fad', 'fai', 'faile', 'failp', 'failpa', 'faisa', 'falcri', 'fam', 'famar', 'fans', 'fao', 'fapav', 'faq', 'farm', 'fasi', 'fasib', 'fatt', 'fbe', 'fbi', 'fc', 'fco', 'fcp', 'fcr', 'fcu', 'fdi', 'fe', 'feaog', 'feaosc', 'feb', 'fedic', 'fema', 'feoga', 'ferr', 'fesco', 'fesr', 'fess', 'fg', 'fi', 'fiaf', 'fiaip', 'fiais', 'fialtel', 'fiap', 'fiapf', 'fiat', 'fiavet', 'fic', 'ficc', 'fice', 'fidal', 'fidam', 'fidapa', 'fieg', 'fifa', 'fifo', 'fig', 'figc', 'figs', 'filat', 'filcams', 'file', 'filol', 'filos', 'fim', 'fima', 'fimmg', 'fin', 'finco', 'fio', 'fioto', 'fipe', 'fipresci', 'fis', 'fisar', 'fisc', 'fisg', 'fisiol', 'fisiopatol', 'fistel', 'fit', 'fita', 'fitav', 'fits', 'fiv', 'fivet', 'fivl', 'flo', 'flpd', 'fluid pt', 'fm', 'fmcg', 'fmi', 'fmth', 'fnas', 'fnomceo', 'fnsi', 'fob', 'fod', 'folcl', 'fon', 'fop', 'fotogr', 'fp', 'fpc', 'fpld', 'fr', 'fra', 'fs', 'fsc', 'fse', 'fsf', 'fsfi', 'fsh', 'ft', 'ftase', 'ftbcc', 'fte', 'ftp', 'fts', 'ft²', 'ft³', 'fuaav', 'fut', 'fv', 'fvg', 'g.fv', 'g.u', 'g.u.el', 'gal', 'gats', 'gatt', 'gb', 'gc', 'gccc', 'gco', 'gcost', 'gd', 'gdd', 'gdf', 'gdi', 'gdo', 'gdp', 'ge', 'gea', 'gel', 'gen', 'geneal', 'geod', 'geofis', 'geogr', 'geogr. antr', 'geogr. 
fis', 'geol', 'geom', 'gep', 'germ', 'gescal', 'gg', 'ggv', 'gi', 'gia', 'gides', 'gift', 'gio', 'giorn', 'gis', 'gisma', 'gismo', 'giu', 'gm', 'gmdss', 'gme', 'gmo', 'go', 'gov', 'gp', 'gpl', 'gprs', 'gps', 'gr', 'gr.sel.spec', 'gr.sel.tr', 'gr.sqd', 'gra', 'gram', 'grano', 'grd', 'grtn', 'grv', 'gsa', 'gsm', 'gsm-r', 'gsr', 'gtld', 'gu', 'guce', 'gui', 'gus', 'ha', 'haart', 'haccp', 'hba', 'hcg', 'hcrp', 'hd-dvd', 'hdcp', 'hdi', 'hdml', 'hdtv', 'hepa', 'hfpa', 'hg', 'hifi', 'hiperlan', 'hiv', 'hm', 'hmld', 'hon', 'hosp', 'hpv', 'hr', 'hrh', 'hrm', 'hrt', 'html', 'http', 'hvac', 'hz', 'i.e', 'i.g.m', 'iana', 'iasb', 'iasc', 'iass', 'iat', 'iata', 'iatse', 'iau', 'iban', 'ibid', 'ibm', 'icann', 'icao', 'icbi', 'iccu', 'ice', 'icf', 'ici', 'icm', 'icom', 'icon', 'ics', 'icsi', 'icstis', 'ict', 'icta', 'id', 'iden', 'idl', 'idraul', 'iec', 'iedm', 'ieee', 'ietf', 'ifat', 'ifel', 'ifla', 'ifrs', 'ifto', 'ifts', 'ig', 'igm', 'igmp', 'igp', 'iims', 'iipp', 'ilm', 'ilo', 'ilor', 'ils', 'im', 'imaie', 'imap', 'imc', 'imdb', 'imei', 'imi', 'imms', 'imo', 'imp', 'imper', 'imperf', 'impers', 'imq', 'ims', 'imsi', 'in', 'inail', 'inca', 'incb', 'inci', 'ind', 'ind. agr', 'ind. alim', 'ind. cart', 'ind. chim', 'ind. cuoio', 'ind. estratt', 'ind. graf', 'ind. mecc', 'ind. 
tess', 'indecl', 'indef', 'indeterm', 'indire', 'inea', 'inf', 'infea', 'infm', 'inform', 'ing', 'ingl', 'inmarsat', 'inpdai', 'inpdap', 'inpgi', 'inps', 'inr', 'inran', 'ins', 'insp', 'int', 'inter', 'intr', 'invar', 'invim', 'in²', 'in³', 'ioma', 'iosco', 'ip', 'ipab', 'ipasvi', 'ipi', 'ippc', 'ips', 'iptv', 'iq', 'ira', 'irap', 'ircc', 'ircs', 'irda', 'iref', 'ires', 'iron', 'irpef', 'irpeg', 'irpet', 'irreg', 'is', 'isae', 'isbd', 'isbn', 'isc', 'isdn', 'isee', 'isef', 'isfol', 'isg', 'isi', 'isia', 'ism', 'ismea', 'isnart', 'iso', 'isp', 'ispearmi', 'ispel', 'ispescuole', 'ispesl', 'ispo', 'ispro', 'iss', 'issn', 'istat', 'istol', 'isvap', 'it', 'iti', 'itt', 'ittiol', 'itu', 'iud', 'iugr', 'iulm', 'iva', 'iveco', 'ivg', 'ivr', 'ivs', 'iyhp', 'j', 'jal', 'jit', 'jr', 'jv', 'k', 'kb', 'kee', 'kg', 'kkk', 'klm', 'km', 'km/h', 'kmph', 'kmq', 'km²', 'kr', 'kw', 'kwh', 'l', 'l\'ing', 'l.n', 'l\'avv', 'la', 'lag', 'lan', 'lanc', 'larn', 'laser', 'lat', 'lav', 'lav. femm', 'lav. pubbl', 'laz', 'lb', 'lc', 'lcca', 'lcd', 'le', 'led', 'lett', 'lh', 'li', 'liaf', 'lib', 'lic', 'lic.ord', 'lic.strd', 'licd', 'lice', 'lida', 'lidci', 'liff', 'lifo', 'lig', 'liit', 'lila', 'lilt', 'linfa', 'ling', 'lipu', 'lis', 'lisaac', 'lism', 'lit', 'litab', 'lnp', 'lo', 'loc', 'loc. 
div', 'lolo', 'lom', 'long', 'lp', 'lrm', 'lrms', 'lsi', 'lsu', 'lt', 'ltd', 'lu', 'lug', 'luiss', 'lun', 'lwt', 'lww', 'm.a', 'm.b', 'm.o', 'm/s', 'ma', 'mac', 'macch', 'mag', 'magg.(maj)', 'magg.gen.(maj.gen.)', 'mai', 'maj', 'mar', 'mar.a', 'mar.ca', 'mar.ord', 'marc', 'mat', 'mater', 'max', 'mb', 'mbac', 'mc', 'mcl', 'mcpc', 'mcs', 'md', 'mdf', 'mdp', 'me', 'mec', 'mecc', 'med', 'mediev', 'mef', 'mer', 'merc', 'merid', 'mesa', 'messrs', 'metall', 'meteor', 'metr', 'metrol', 'mg', 'mgc', 'mgm', 'mi', 'mibac', 'mica', 'microb', 'mifed', 'miglio nautico', 'miglio nautico per ora', 'miglio nautico²', 'miglio²', 'mil', 'mile', 'miles/h', 'milesph', 'min', 'miner', 'mips', 'miptv', 'mit', 'mitol', 'miur', 'ml', 'mlle', 'mls', 'mm', 'mme', 'mms', 'mm²', 'mn', 'mnp', 'mo', 'mod', 'mol', 'mons', 'morf', 'mos', 'mpaa', 'mpd', 'mpeg', 'mpi', 'mps', 'mq', 'mr', 'mrs', 'ms', 'msgr', 'mss', 'mt', 'mto', 'murst', 'mus', 'mvds', 'mws', 'm²', 'm³', 'n.a', 'n.b', 'na', 'naa', 'nafta', 'napt', 'nars', 'nasa', 'nat', 'natas', 'nato', 'nb', 'nba', 'nbc', 'ncts', 'nd', 'nda', 'nde', 'ndr', 'ndt', 'ne', 'ned', 'neg', 'neol', 'netpac', 'neur', 'news!', 'ngcc', 'nhmf', 'nlcc', 'nmr', 'no', 'nodo', 'nom', 'nos', 'nov', 'novissdi', 'npi', 'nr', 'nt', 'nta', 'nts', 'ntsc', 'nu', 'nuct', 'numism', 'nwt', 'nyc', 'nz', 'o.m.i', 'oai-pmh', 'oav', 'oc', 'occ', 'occult', 'oci', 'ocr', 'ocse', 'oculist', 'od', 'odg', 'odp', 'oecd', 'oem', 'ofdm', 'oft', 'og', 'ogg', 'ogi', 'ogm', 'ohim', 'oic', 'oics', 'olaf', 'oland', 'ole', 'oled', 'omi', 'oms', 'on', 'ong', 'onig', 'onlus', 'onomat', 'onpi', 'onu', 'op', 'opac', 'opec', 'opord', 'opsosa', 'or', 'ord', 'ord. 
scol', 'ore', 'oref', 'orient', 'ornit', 'orogr', 'orp', 'ort', 'os', 'osa', 'osas', 'osd', 'ot', 'ote', 'ott', 'oz', 'p', 'p.a', 'p.c', 'p.c.c', 'p.es', 'p.f', 'p.m', 'p.r', 'p.s', 'p.t', 'p.v', 'pa', 'pac', 'pag./p', 'pagg./pp', 'pai', 'pal', 'paleobot', 'paleogr', 'paleont', 'paleozool', 'paletn', 'pamr', 'pan', 'papir', 'par', 'parapsicol', 'part', 'partic', 'pass', 'pat', 'patol', 'pb', 'pc', 'pci', 'pcm', 'pcmcia', 'pcs', 'pcss', 'pct', 'pd', 'pda', 'pdf', 'pdl', 'pds', 'pe', 'pec', 'ped', 'pedag', 'peg', 'pegg', 'per.ind', 'pers', 'pert', 'pesq', 'pet', 'petr', 'petrogr', 'pfc', 'pg', 'pga', 'pgp', 'pgut', 'ph', 'php', 'pi', 'pics', 'pie', 'pif', 'pii', 'pil', 'pime', 'pin', 'pine', 'pip', 'pir', 'pit', 'pitt', 'piuss', 'pkcs', 'pki', 'pko', 'pl', 'pli', 'plr', 'pm', 'pma', 'pmi', 'pmr', 'pn', 'pnf', 'pnl', 'po', 'poet', 'pof', 'pol', 'pop', 'popitt', 'popol', 'port', 'pos', 'poss', 'post', 'pots', 'pp', 'ppa', 'ppc', 'ppga', 'ppp', 'pps', 'pptt', 'ppv', 'pr', 'pra', 'praa', 'pref', 'preist', 'prep', 'pres', 'pret', 'prg', 'pri', 'priv', 'pro.civ', 'prof', 'pron', 'pronom', 'propr', 'prov', 'prs', 'prtl', 'prusst', 'ps', 'pse', 'psi', 'psicoan', 'psicol', 'pso', 'psp', 'pstn', 'pt', 'ptc', 'pti', 'ptsd', 'ptt', 'pu', 'pug', 'puk', 'put', 'pv', 'pvb', 'pvc', 'pvt', 'pz', 'qb', 'qcs', 'qfd', 'qg', 'qi', 'qlco', 'qlcu', 'qos', 'qualif', 'r-lan', 'r.s', 'ra', 'racc', 'radar', 'radc', 'radiotecn', 'raee', 'raf', 'rag', 'raid', 'ram', 'rar', 'ras', 'rass. avv. 
stato', 'rc', 'rca', 'rcdp', 'rcs', 'rdc', 'rdco', 'rdf', 'rdi', 'rdp', 'rds', 'rdt', 're', 'rea', 'recipr', 'recl', 'reg', 'region', 'rel', 'rem', 'rep', 'reps', 'res', 'retor', 'rev', 'rfi', 'rfid', 'rg', 'rgb', 'rgc', 'rge', 'rgi', 'rgi bdp', 'rgpt', 'rgt', 'ri', 'riaa', 'riaj', 'riba', 'ric', 'rid', 'rif', 'rifl', 'rina', 'rip', 'ris', 'rit', 'ritts', 'rm', 'rmn', 'rn', 'ro', 'roa', 'roc', 'roi', 'rom', 'roro', 'rov', 'rp', 'rpm', 'rr', 'rrf', 'rs', 'rsc', 'rspp', 'rss', 'rsu', 'rsvp', 'rt', 'rtdpc', 'rtg', 'rtn', 'rtp', 'rttt', 'rvm', 's-dab', 's.a', 's.b.f', 's.n.c', 's.p.a', 's.p.m', 's.r.l', 's.ten', 's.v', 's/m', 'sa', 'sab', 'saca', 'sace', 'sact', 'sad', 'sag', 'sahm', 'sai', 'saisa', 'sam', 'san', 'sanas', 'sape', 'sar', 'sars', 'sart', 'sas', 'sbaf', 'sbas', 'sbn', 'sc', 'sca.sm', 'scherz', 'scien', 'scn', 'scsi', 'scuba', 'scult', 'scut', 'sdds', 'sdiaf', 'sds', 'sdsl', 'se', 'seat', 'sebc', 'sec', 'seca', 'secam', 'secc', 'see', 'seg', 'segg', 'segredifesa', 'sem', 'sempo', 'sen', 'sens', 'seo', 'serg', 'serg.magg.(sgm)', 'serg.magg.ca', 'set', 'sfc', 'sfis', 'sfx', 'sg', 'sga', 'sgc', 'sgg', 'sgml', 'sgt', 'si', 'si@lt', 'sia', 'siae', 'siaic', 'siap', 'sias', 'sic', 'sicav', 'sid', 'sido', 'sie', 'sif', 'sig', 'sig.na', 'sig.ra', 'sige', 'sigg', 'sigill', 'sigo', 'siia', 'simb', 'simbdea', 'simg', 'simo', 'sin', 'sinalv', 'sing', 'sins', 'sinu', 'siocmf', 'siog', 'sioi', 'siommms', 'siot', 'sip', 'sipem', 'sips', 'sirf', 'sirm', 'sis', 'sisde', 'sismi', 'sissa', 'sit', 'siulp', 'siusa', 'sla', 'sldn', 'slm', 'slr', 'sm', 'sma', 'smau', 'smd', 'sme', 'smes', 'smm', 'smpt', 'sms', 'sn', 'snad', 'snai', 'snc', 'sncci', 'sncf', 'sngci', 'snit', 'so', 'soc', 'sociol', 'sogg', 'soho', 'soi', 'sol', 'somipar', 'somm', 'sonar', 'sp', 'spa', 'spe', 'spett', 'spi', 'spm', 'spot', 'spp', 'spreg', 'sq', 'sqd', 'sr', 'srd', 'srl', 'srr', 'ss', 'ssi', 'ssn', 'ssr', 'sss', 'st', 'st. d. arte', 'st. d. dir', 'st. d. filos', 'st. d. 
rel', 'stat', 'stg', 'stp', 'stw', 'su', 'suap', 'suem', 'suff', 'sup', 'superl', 'supt', 'surg', 'surl', 'susm', 'sut', 'suv', 'sv', 'svga', 'swics', 'swift', 'swot', 'sxga', 'sz', 't-dab', 't.sg', 'ta', 'taa', 'tac', 'tacan', 'tacs', 'taeg', 'tai', 'tan', 'tar', 'targa', 'tav', 'tb', 'tbt', 'tci', 'tcp', 'tcp/ip', 'tcsm', 'tdm', 'tdma', 'te', 'tecn', 'tecnol', 'ted', 'tel', 'telecom', 'temp', 'ten.(lt)', 'ten.col.(ltc)', 'ten.gen', 'teol', 'term', 'tesa', 'tese', 'tesol', 'tess', 'tet', 'tetra', 'tfr', 'tft', 'tfts', 'tgv', 'thx', 'tim', 'tipogr', 'tir', 'tit', 'tld', 'tm', 'tmc', 'tn', 'to', 'toefl', 'ton', 'top', 'topog', 'tos', 'tosap', 'tosc', 'tp', 'tpl', 'tr', 'trad', 'tramat', 'trasp', 'ts', 'tso', 'tuir', 'tuld', 'tv', 'twa', 'twain', 'u.ad', 'u.s', 'ucai', 'ucca', 'ucei', 'ucina', 'uclaf', 'ucoi', 'ucoii', 'ucsi', 'ud', 'udc', 'udi', 'udp', 'ue', 'uefa', 'uemri', 'ufo', 'ugc', 'uhci', 'uhf', 'uht', 'uibm', 'uic', 'uicc', 'uiga', 'uil', 'uilps', 'uisp', 'uits', 'uk', 'ul', 'ull', 'uma', 'umb', 'ummc', 'umss', 'umts', 'unac', 'unar', 'unasp', 'uncem', 'unctad', 'undp', 'unefa', 'unep', 'unesco', 'ungh', 'unhcr', 'uni', 'unicef', 'unitec', 'unpredep', 'unsa', 'upa', 'upc', 'urar', 'urban', 'url', 'urp', 'urss', 'usa', 'usb', 'usfi', 'usga', 'usl', 'usp', 'uspi', 'ussr', 'utap', 'v', 'v.brig', 'v.cte', 'v.m', 'v.p', 'v.r', 'v.s', 'va', 'vab', 'vaio', 'val', 'vas', 'vb', 'vbr', 'vc', 'vcc', 'vcr', 'vda', 've', 'ven', 'ves', 'vesa', 'veter', 'vezz', 'vfb', 'vfp', 'vfx', 'vga', 'vhf', 'vhs', 'vi', 'via', 'vip', 'vis', 'vn', 'vo', 'voc', 'voip', 'vol', 'volg', 'voll', 'vor', 'vpdn', 'vpn', 'vr', 'vs', 'vsp', 'vt', 'vtc', 'vts', 'vtt', 'vv', 'vvf', 'wai', 'wais', 'wan', 'wap', 'wasp', 'wc', 'wcdma', 'wcm', 'wga', 'wi-fi', 'wipo', 'wisp', 'wll', 'wml', 'wms', 'worm', 'wp', 'wpan', 'wssn', 'wto', 'wwan', 'wwf', 'www', 'wygiwys', 'xl', 'xml', 'xs', 'xxl', 'xxs', 'yaf', 'yb', 'yci', 'yd', 'yd²', 'yd³', 'ymca', 'zat', 'zb', 'zcs', 'zdf', 'zdg', 'zift', 'zool', 'zoot', 
'ztc', 'ztl', '°c', '°f', '°n', '°ra', '°ré', 'µg']
14
+ PREPOSITIVE_ABBREVIATIONS = ['a.c', 'acc', 'adj', 'adm', 'adv', 'all', 'amn', 'arch', 'asst', 'avv', 'banc', 'bart', 'bcc', 'bldg', 'brig', 'bros', 'c.a', 'c.a.p', 'c.c.p', 'c.m', 'c.p', 'c.p', 'c.s', 'c.v', 'capt', 'cc', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'corr', 'cpl', 'dir', 'dott', 'dott', 'dr', 'dr', 'drs', 'e.p.c', 'ecc', 'egr', 'ens', 'es', 'fatt', 'gen', 'geom', 'gg', 'gov', 'hon', 'hosp', 'hr', 'id', 'ing', 'insp', 'int', "l'avv", "l'ing", 'lett', 'lt', 'maj', 'messrs', 'mlle', 'mm', 'mme', 'mo', 'mons', 'mr', 'mr', 'mrs', 'mrs', 'ms', 'ms', 'msgr', 'n.b', 'ogg', 'on', 'op', 'ord', 'p.c', 'p.c.c', 'p.es', 'p.f', 'p.r', 'p.s', 'p.t', 'p.v', 'pfc', 'ph', 'post', 'pp', 'prof', 'psicol', 'pvt', 'racc', 'rag', 'rep', 'reps', 'res', 'rev', 'ric', 'rif', 'rp', 'rsvp', 'rt', 's.a', 's.b.f', 's.n.c', 's.p.a', 's.p.m', 's.r.l', 'seg', 'sen', 'sens', 'sfc', 'sgg', 'sgt', 'sig', 'sigg', 'soc', 'spett', 'sr', 'ss', 'st', 'supt', 'surg', 'tel', 'u.s', 'v.p', 'v.r', 'v.s']
15
+ NUMBER_ABBREVIATIONS = ['art', 'no', 'nos', 'nr', 'pp']
pysbd/lang/japanese.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
4
+ from pysbd.between_punctuation import BetweenPunctuation
5
+ from pysbd.lang.common import Common, Standard
6
+ from pysbd.punctuation_replacer import replace_punctuation
7
+ from pysbd.cleaner import Cleaner
8
+ from pysbd.utils import Text, Rule
9
+
10
class Japanese(Common, Standard):
    """Japanese ('ja') language rules for sentence boundary detection."""

    iso_code = 'ja'

    class Cleaner(Cleaner):
        """Cleaner that only repairs line breaks following the particle 'の'."""

        def __init__(self, text, lang, doc_type=None):
            # NOTE(review): doc_type is accepted but not forwarded to the base
            # constructor (unchanged from the original) -- confirm whether the
            # base class should receive it.
            super().__init__(text, lang)

        def clean(self):
            """Run the Japanese clean-up rule and return the resulting text."""
            self.remove_newline_in_middle_of_word()
            return self.text

        def remove_newline_in_middle_of_word(self):
            """Drop a newline that directly follows 'の' and precedes a non-space character."""
            NewLineInMiddleOfWordRule = Rule(r'(?<=の)\n(?=\S)', '')
            self.text = Text(self.text).apply(NewLineInMiddleOfWordRule)

    class AbbreviationReplacer(AbbreviationReplacer):
        # No sentence-starter words are configured for Japanese.
        SENTENCE_STARTERS = []

    class BetweenPunctuation(BetweenPunctuation):
        """Protects punctuation enclosed in Japanese brackets and quotes."""

        def __init__(self, text):
            super().__init__(text)

        def replace(self):
            """Apply all bracket/quote substitutions and return the text."""
            self.sub_punctuation_between_quotes_and_parens()
            return self.text

        def sub_punctuation_between_parens_ja(self):
            """Replace punctuation found between full-width parentheses."""
            # The delimiters must be the full-width parentheses U+FF08 / U+FF09.
            # With ASCII '(' and ')' the outer pair is parsed as a capture
            # group, so the pattern would match every parenthesis-free run of
            # text instead of the text between Japanese parentheses.  Explicit
            # \uXXXX escapes keep the pattern robust against copy/paste or
            # normalisation of the source file.
            BETWEEN_PARENS_JA_REGEX = (
                r'\uFF08(?=(?P<tmp>[^\uFF08\uFF09]+|\\{2}|\\.)*)(?P=tmp)\uFF09')
            self.text = re.sub(BETWEEN_PARENS_JA_REGEX, replace_punctuation,
                               self.text)

        def sub_punctuation_between_quotes_ja(self):
            """Replace punctuation found between the corner quotes 「 and 」."""
            BETWEEN_QUOTE_JA_REGEX = r'「(?=(?P<tmp>[^「」]+|\\{2}|\\.)*)(?P=tmp)」'
            self.text = re.sub(BETWEEN_QUOTE_JA_REGEX, replace_punctuation,
                               self.text)

        def sub_punctuation_between_quotes_and_parens(self):
            """Run the parenthesis pass, then the quote pass."""
            self.sub_punctuation_between_parens_ja()
            self.sub_punctuation_between_quotes_ja()
pysbd/lang/kazakh.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
3
+ from pysbd.lang.common import Common, Standard
4
+ from pysbd.processor import Processor
5
+ from pysbd.utils import Text, Rule
6
+
7
+
8
+ class Kazakh(Common, Standard):
9
+
10
+ iso_code = 'kk'
11
+
12
# Handling Cyrillic characters in re module
# https://stackoverflow.com/a/10982308/5462100
# Two alternatives: a Cyrillic multi-period abbreviation (periods may be
# followed by one optional whitespace), or a Latin one made of single letters.
# The Latin branch must start at a word boundary (\b); it previously began
# with a literal 'b', so it only matched abbreviations prefixed by that letter.
MULTI_PERIOD_ABBREVIATION_REGEX = r'\b[\u0400-\u0500]+(?:\.\s?[\u0400-\u0500])+[.]|\b[a-z](?:\.[a-z])+[.]'
15
+
16
class Processor(Processor):
    """Kazakh processor: adds dash-aware handling of '?' and '!' after the
    regular between-punctuation pass."""

    def __init__(self, text, lang, char_span=False):
        super().__init__(text, lang, char_span)

    def between_punctuation(self, txt):
        """Return *txt* with '?' / '!' that are followed by a dash replaced by
        their placeholder tokens."""
        # NOTE(review): between_punctuation_processor is not defined in this
        # class; presumably provided by the base Processor -- confirm.
        txt = self.between_punctuation_processor(txt).replace()
        # The leading (?<=) is an empty lookbehind and always succeeds; the
        # effective condition is the lookahead for optional whitespace, a
        # hyphen or em dash, and optional whitespace.
        # Rubular: http://rubular.com/r/WRWy56Z5zp
        QuestionMarkFollowedByDashLowercaseRule = Rule(r'(?<=)\?(?=\s*[-—]\s*)', '&ᓷ&')
        # Rubular: http://rubular.com/r/lixxP7puSa
        ExclamationMarkFollowedByDashLowercaseRule = Rule(r'(?<=)!(?=\s*[-—]\s*)', '&ᓴ&')

        txt = Text(txt).apply(QuestionMarkFollowedByDashLowercaseRule,
                              ExclamationMarkFollowedByDashLowercaseRule)
        return txt
31
+
32
+ class Abbreviation(Standard.Abbreviation):
33
+ ABBREVIATIONS = ['afp', 'anp', 'atp', 'bae', 'bg', 'bp', 'cam', 'cctv', 'cd', 'cez', 'cgi', 'cnpc', 'farc', 'fbi', 'eiti', 'epo', 'er', 'gp', 'gps', 'has', 'hiv', 'hrh', 'http', 'icu', 'idf', 'imd', 'ime', 'icu', 'idf', 'ip', 'iso', 'kaz', 'kpo', 'kpa', 'kz', 'kz', 'mri', 'nasa', 'nba', 'nbc', 'nds', 'ohl', 'omlt', 'ppm', 'pda', 'pkk', 'psm', 'psp', 'raf', 'rss', 'rtl', 'sas', 'sme', 'sms', 'tnt', 'udf', 'uefa', 'usb', 'utc', 'x', 'zdf', 'әқбк', 'әқбк', 'аақ', 'авг.', 'aбб', 'аек', 'ак', 'ақ', 'акцион.', 'акср', 'ақш', 'англ', 'аөсшк', 'апр', 'м.', 'а.', 'р.', 'ғ.', 'апр.', 'аум.', 'ацат', 'әч', 'т. б.', 'б. з. б.', 'б. з. б.', 'б. з. д.', 'б. з. д.', 'биікт.', 'б. т.', 'биол.', 'биохим', 'бө', 'б. э. д.', 'бта', 'бұұ', 'вич', 'всоонл', 'геогр.', 'геол.', 'гленкор', 'гэс', 'қк', 'км', 'г', 'млн', 'млрд', 'т', 'ғ. с.', 'ғ.', 'қ.', 'ғ.', 'дек.', 'днқ', 'дсұ', 'еақк', 'еқыұ', 'ембімұнайгаз', 'ео', 'еуразэқ', 'еуроодақ', 'еұу', 'ж.', 'ж.', 'жж.', 'жоо', 'жіө', 'жсдп', 'жшс', 'іім', 'инта', 'исаф', 'камаз', 'кгб', 'кеу', 'кг', 'км²', 'км²', 'км³', 'км³', 'кимеп', 'кср', 'ксро', 'кокп', 'кхдр', 'қазатомпром', 'қазкср', 'қазұу', 'қазмұнайгаз', 'қазпошта', 'қазтаг', 'қазұу', 'қкп', 'қмдб', 'қр', 'қхр', 'лат.', 'м²', 'м²', 'м³', 'м³', 'магатэ', 'май.', 'максам', 'мб', 'мвт', 'мемл', 'м', 'мсоп', 'мтк', 'мыс.', 'наса', 'нато', 'нквд', 'нояб.', 'обл.', 'огпу', 'окт.', 'оңт.', 'опек', 'оеб', 'өзенмұнайгаз', 'өф', 'пәк', 'пед.', 'ркфср', 'рнқ', 'рсфср', 'рф', 'свс', 'сву', 'сду', 'сес', 'сент.', 'см', 'снпс', 'солт.', 'солт.', 'сооно', 'ссро', 'сср', 'ссср', 'ссс', 'сэс', 'дк', 'т. б.', 'т', 'тв', 'тереңд.', 'тех.', 'тжқ', 'тмд', 'төм.', 'трлн', 'тр', 'т.', 'и.', 'м.', 'с.', 'ш.', 'т.', 'т. с. 
с.', 'тэц', 'уаз', 'уефа', 'еқыұ', 'ұқк', 'ұқшұ', 'февр.', 'фққ', 'фсб', 'хим.', 'хқко', 'шұар', 'шыұ', 'экон.', 'экспо', 'цтп', 'цас', 'янв.', 'dvd', 'жкт', 'ққс', 'км', 'ацат', 'юнеско', 'ббс', 'mgm', 'жск', 'зоо', 'бсн', 'өұқ', 'оар', 'боак', 'эөкк', 'хтқо', 'әөк', 'жэк', 'хдо', 'спбму', 'аф', 'сбд', 'амт', 'гсдп', 'гсбп', 'эыдұ', 'нұсжп', 'шыұ', 'жтсх', 'хдп', 'эқк', 'фкққ', 'пиқ', 'өгк', 'мбф', 'маж', 'кота', 'тж', 'ук', 'обб', 'сбл', 'жхл', 'кмс', 'бмтрк', 'жққ', 'бхооо', 'мқо', 'ржмб', 'гулаг', 'жко', 'еэы', 'еаэы', 'кхдр', 'рфкп', 'рлдп', 'хвқ', 'мр', 'мт', 'кту', 'ртж', 'тим', 'мемдум', 'ксро', 'т.с.с', 'с.ш.', 'ш.б.', 'б.б.', 'руб', 'мин', 'акад.', 'ғ.', 'мм', 'мм.']
34
+ PREPOSITIVE_ABBREVIATIONS = []
35
+ NUMBER_ABBREVIATIONS = []
36
+
37
class AbbreviationReplacer(AbbreviationReplacer):
    """Kazakh abbreviation handling: single-letter initials and
    multi-period abbreviations only."""

    # No sentence-starter words are configured for Kazakh.
    SENTENCE_STARTERS = []

    def __init__(self, text, lang):
        super().__init__(text, lang)

    def replace(self):
        """Replace periods after single upper-case Cyrillic initials with '∯',
        then handle multi-period abbreviations, and return the text."""
        # NOTE(review): the class [А-ЯЁ] covers the basic Russian upper-case
        # range only; Kazakh-specific capitals such as Ә, Ғ, Қ are outside it.
        # Confirm whether that is intended.
        SingleUpperCaseCyrillicLetterAtStartOfLineRule = Rule(r'(?<=^[А-ЯЁ])\.(?=\s)', '∯')
        SingleUpperCaseCyrillicLetterRule = Rule(r'(?<=\s[А-ЯЁ])\.(?=\s)', '∯')
        self.text = Text(self.text).apply(SingleUpperCaseCyrillicLetterAtStartOfLineRule,
                                          SingleUpperCaseCyrillicLetterRule)
        # replace_multi_period_abbreviations is not defined here; presumably
        # inherited from the base AbbreviationReplacer.
        self.replace_multi_period_abbreviations()
        return self.text
pysbd/lang/marathi.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Grammar rules from https://gopract.com/Pages/Marathi-Grammar-Viramchinah.aspx
3
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
4
+ from pysbd.lang.common import Common, Standard
5
+
6
class Marathi(Common, Standard):
    """Marathi ('mr') language rules for sentence boundary detection."""

    iso_code = 'mr'

    # A sentence is the shortest run of characters ending in '.', '!' or '?';
    # the `.*?$` alternative picks up trailing text without a terminator.
    SENTENCE_BOUNDARY_REGEX = r'.*?[.!?]|.*?$'
    Punctuations = ['.', '!', '?']

    class AbbreviationReplacer(AbbreviationReplacer):
        # No sentence-starter words are configured for Marathi.
        SENTENCE_STARTERS = []
pysbd/lang/persian.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+
4
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
5
+ from pysbd.lang.common import Common, Standard
6
+ from pysbd.utils import Rule
7
+
8
+ class Persian(Common, Standard):
9
+
10
+ iso_code = 'fa'
11
+
12
+ Punctuations = ['?', '!', ':', '.', '؟']
13
+ SENTENCE_BOUNDARY_REGEX = r'.*?[:\.!\?؟]|.*?\Z|.*?$'
14
+
15
+ # Rubular: http://rubular.com/r/RX5HpdDIyv
16
+ ReplaceColonBetweenNumbersRule = Rule(r'(?<=\d):(?=\d)', '♭')
17
+
18
+ # Rubular: http://rubular.com/r/kPRgApNHUg
19
+ ReplaceNonSentenceBoundaryCommaRule = Rule(r'،(?=\s\S+،)', '♬')
20
+
21
+ class AbbreviationReplacer(AbbreviationReplacer):
22
+
23
+ SENTENCE_STARTERS = []
24
+
25
+ def __init__(self, text, lang):
26
+ super().__init__(text, lang)
27
+
28
def scan_for_replacements(self, txt, am, index, character_array):
    """Replace every period that directly follows the abbreviation *am* with '∯'.

    Parameters
    ----------
    txt : str
        Text to process.
    am : str
        Abbreviation match; treated as literal text.
    index, character_array
        Unused here; kept for signature compatibility with the base class.

    Returns
    -------
    str
        *txt* with the matching periods replaced.
    """
    # re.escape keeps regex metacharacters inside the abbreviation (e.g. '.')
    # from being interpreted as pattern syntax; the raw string avoids the
    # invalid '\.' escape of a plain string literal.
    pattern = r'(?<={0})\.'.format(re.escape(am))
    return re.sub(pattern, '∯', txt)
pysbd/lang/polish.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
3
+ from pysbd.lang.common import Common, Standard
4
+
5
+ class Polish(Common, Standard):
6
+
7
+ iso_code = 'pl'
8
+
9
+ class AbbreviationReplacer(AbbreviationReplacer):
10
+ SENTENCE_STARTERS = []
11
+
12
+ class Abbreviation(Standard.Abbreviation):
13
+ ABBREVIATIONS = ['ags', 'alb', 'ang', 'aor', 'awest', 'bałt', 'bojkow', 'bret', 'brus', 'bsł', 'bułg', 'c.b.d.o', 'c.b.d.u', 'celt', 'chorw', 'cs', 'czakaw', 'czerw', 'czes', 'dłuż', 'dniem', 'dor', 'dubrow', 'duń', 'ekaw', 'fiń', 'franc', 'gal', 'germ', 'głuż', 'gniem', 'goc', 'gr', 'grudz', 'hebr', 'het', 'hol', 'I cont', 'ie', 'ikaw', 'irań', 'irl', 'islandz', 'itd', 'itd.', 'itp', 'jekaw', 'kajkaw', 'kasz', 'kirg', 'kwiec', 'łac', 'lip', 'listop', 'lit', 'łot', 'lp', 'maced', 'mar', 'młpol', 'moraw', 'n.e', 'nb.', 'ngr', 'niem', 'nord', 'norw', 'np', 'np.', 'ok.', 'orm', 'oset', 'osk', 'p.n', 'p.n.e', 'p.o', 'pazdz', 'pers', 'pie', 'pod red.', 'podhal', 'pol', 'połab', 'port', 'prekm', 'pskow', 'psł', 'R cont', 'rez', 'rom', 'rozdz.', 'rum', 'rus', 'rys.', 'sas', 'sch', 'scs', 'serb', 'sierp', 'śl', 'sła', 'słe', 'słi', 'słow', 'sp. z o.o', 'śrdniem', 'śrgniem', 'śrirl', 'stbułg', 'stind', 'stpol', 'stpr', 'str.', 'strus', 'stwniem', 'stycz', 'sztokaw', 'szwedz', 't.', 'tj.', 'tłum.', 'toch', 'tur', 'tzn', 'ukr', 'ul', 'umbr', 'wed', 'węg', 'wlkpol', 'włos', 'wrzes', 'wyd.', 'zakarp']
14
+ PREPOSITIVE_ABBREVIATIONS = []
15
+ NUMBER_ABBREVIATIONS = []
pysbd/lang/russian.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+
4
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
5
+ from pysbd.lang.common import Common, Standard
6
+
7
+ class Russian(Common, Standard):
8
+
9
+ iso_code = 'ru'
10
+
11
+ class Abbreviation(Standard.Abbreviation):
12
+ ABBREVIATIONS = ["y", "y.e", "а", "авт", "адм.-терр", "акад", "в", "вв", "вкз", "вост.-европ", "г", "гг", "гос", "гр", "д", "деп", "дисс", "дол", "долл", "ежедн", "ж", "жен", "з", "зап", "зап.-европ", "заруб", "и", "ин", "иностр", "инст", "к", "канд", "кв", "кг", "куб", "л", "л.h", "л.н", "м", "мин", "моск", "муж", "н", "нед", "о", "п", "пгт", "пер", "пп", "пр", "просп", "проф", "р", "руб", "с", "сек", "см", "спб", "стр", "т", "тел", "тов", "тт", "тыс", "у", "у.е", "ул", "ф", "ч"]
13
+ PREPOSITIVE_ABBREVIATIONS = []
14
+ NUMBER_ABBREVIATIONS = []
15
+
16
+ class AbbreviationReplacer(AbbreviationReplacer):
17
+
18
+ SENTENCE_STARTERS = []
19
+
20
+ def __init__(self, text, lang):
21
+ super().__init__(text, lang)
22
+
23
def replace_period_of_abbr(self, txt, abbr):
    """Replace the period after *abbr* with '∯'.

    The abbreviation (stripped of surrounding whitespace) must be preceded by
    a whitespace character or sit at the very start of *txt*.

    Parameters
    ----------
    txt : str
        Text to process.
    abbr : str
        Abbreviation, matched literally and case-sensitively.

    Returns
    -------
    str
        *txt* with the matching periods replaced.
    """
    # Escape the abbreviation: entries such as 'у.е' or 'адм.-терр' contain
    # '.', which would otherwise match any character.
    literal = re.escape(abbr.strip())
    # Without re.MULTILINE '^' and '\A' are the same anchor, so the original
    # three passes reduce to these two lookbehinds.  They are kept separate
    # because a single lookbehind cannot mix alternatives of different width.
    pattern = r'(?:(?<=\s{abbr})|(?<=\A{abbr}))\.'.format(abbr=literal)
    return re.sub(pattern, '∯', txt)
pysbd/lang/slovak.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
4
+ from pysbd.between_punctuation import BetweenPunctuation
5
+ from pysbd.lang.common import Common, Standard
6
+ from pysbd.processor import Processor
7
+ from pysbd.utils import Text
8
+ from pysbd.punctuation_replacer import replace_punctuation
9
+ from pysbd.lists_item_replacer import ListItemReplacer
10
+
11
+
12
+ class Slovak(Common, Standard):
13
+
14
+ iso_code = 'sk'
15
+
16
class ListItemReplacer(ListItemReplacer):
    """Slovak list-item replacer that skips alphabetical-list detection."""

    def add_line_break(self):
        """Format roman-numeral and numbered lists and return the text."""
        # We've found alphabetical lists are causing a lot of problems with abbreviations
        # with multiple periods and spaces, such as 'Company name s. r. o.'. Disabling
        # alphabetical list parsing seems like a reasonable tradeoff.

        # self.format_alphabetical_lists()
        self.format_roman_numeral_lists()
        self.format_numbered_list_with_periods()
        self.format_numbered_list_with_parens()
        return self.text
28
+
29
+ class AbbreviationReplacer(AbbreviationReplacer):
30
+ SENTENCE_STARTERS = []
31
+
32
def replace_period_of_abbr(self, txt, abbr):
    """Mask every period of the abbreviation *abbr*, including the trailing one.

    Simplified variant of the base implementation: Slovak uses many
    multi-period abbreviations such as 'Company Name s. r. o.', so all
    periods inside the abbreviation are swapped for '∯', not just the last.
    """
    with_final_period = abbr + "."
    masked = with_final_period.replace(".", "∯")
    return txt.replace(with_final_period, masked)
41
+
42
+ class Abbreviation(Standard.Abbreviation):
43
+ ABBREVIATIONS = ['č', 'no', 'nr', 's. r. o', 'ing', 'p', 'a. d', 'o. k', 'pol. pr', 'a. s. a. p', 'p. n. l', 'red', 'o.k', 'a.d', 'm.o', 'pol.pr', 'a.s.a.p', 'p.n.l', 'pp', 'sl', 'corp', 'plgr', 'tz', 'rtg', 'o.c.p', 'o. c. p', 'c.k', 'c. k', 'n.a', 'n. a', 'a.m', 'a. m', 'vz', 'i.b', 'i. b', 'ú.p.v.o', 'ú. p. v. o', 'bros', 'rsdr', 'doc', 'tu', 'ods', 'n.w.a', 'n. w. a', 'nár', 'pedg', 'paeddr', 'rndr', 'naprk', 'a.g.p', 'a. g. p', 'prof', 'pr', 'a.v', 'a. v', 'por', 'mvdr', 'nešp', 'u.s', 'u. s', 'kt', 'vyd', 'e.t', 'e. t', 'al', 'll.m', 'll. m', 'o.f.i', 'o. f. i', 'mr', 'apod', 'súkr', 'stred', 's.e.g', 's. e. g', 'sr', 'tvz', 'ind', 'var', 'etc', 'atd', 'n.o', 'n. o', 's.a', 's. a', 'např', 'a.i.i', 'a. i. i', 'a.k.a', 'a. k. a', 'konkr', 'čsl', 'odd', 'ltd', 't.z', 't. z', 'o.z', 'o. z', 'obv', 'obr', 'pok', 'tel', 'št', 'skr', 'phdr', 'xx', 'š.p', 'š. p', 'ph.d', 'ph. d', 'm.n.m', 'm. n. m', 'zz', 'roz', 'atď.', 'ev', 'v.sp', 'v. sp', 'drsc', 'mudr', 't.č', 't. č', 'el', 'os', 'co', 'r.o', 'r. o', 'str', 'p.a', 'p. a', 'zdravot', 'prek', 'gen', 'viď', 'dr', 'cca', 'p.s', 'p. s', 'zák', 'slov', 'arm', 'inc', 'max', 'd.c', 'k.o', 'a. r. k', 'd. c', 'k. o', 'a. r. k', 'soc', 'bc', 'zs', 'akad', 'sz', 'pozn', 'tr', 'nám', 'kol', 'csc', 'ul', 'sp', 'o.i', 'jr', 'zb', 'sv', 'tj', 'čs', 'tzn', 'príp', 'iv', 'hl', 'st', 'pod', 'vi', 'tis', 'stor', 'rozh', 'mld', 'atď', 'mgr', 'a.s', 'a. s', 'phd', 'z.z', 'z. z', 'judr', 'ing', 'hod', 'vs', 'písm', 's.r.o', 'min', 'ml', 'iii', 't.j', 't. j', 'spol', 'mil', 'ii', 'napr', 'resp', 'tzv']
44
+ PREPOSITIVE_ABBREVIATIONS = ['st', 'p', 'dr', 'mudr', 'judr', 'ing', 'mgr', 'bc', 'drsc', 'doc', 'prof']
45
+ NUMBER_ABBREVIATIONS = ['č', 'no', 'nr']
46
+
47
class BetweenPunctuation(BetweenPunctuation):
    """Adds handling of Slovak double quotes („ … “) to the standard passes."""

    # Rubular: https://rubular.com/r/rImWbaYFtHHtf4
    # NOTE(review): the first pattern uses an atomic group `(?>...)` and is not
    # referenced in this class; only the *_2 variant below is applied.
    BETWEEN_SLOVAK_DOUBLE_QUOTES_REGEX = r'„(?>[^“\\]+|\\{2}|\\.)*“'
    BETWEEN_SLOVAK_DOUBLE_QUOTES_REGEX_2 = r'\„(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)\“'

    def sub_punctuation_between_slovak_double_quotes(self, txt):
        """Return *txt* with punctuation between „ and “ replaced."""
        return re.sub(self.BETWEEN_SLOVAK_DOUBLE_QUOTES_REGEX_2, replace_punctuation, txt)

    def sub_punctuation_between_quotes_and_parens(self, txt):
        """Run every quote/bracket pass in order, ending with the Slovak
        double-quote pass, and return the result."""
        txt = self.sub_punctuation_between_single_quotes(txt)
        txt = self.sub_punctuation_between_single_quote_slanted(txt)
        txt = self.sub_punctuation_between_double_quotes(txt)
        txt = self.sub_punctuation_between_square_brackets(txt)
        txt = self.sub_punctuation_between_parens(txt)
        txt = self.sub_punctuation_between_quotes_arrow(txt)
        txt = self.sub_punctuation_between_em_dashes(txt)
        txt = self.sub_punctuation_between_quotes_slanted(txt)
        txt = self.sub_punctuation_between_slovak_double_quotes(txt)
        return txt
66
+
67
+ class Processor(Processor):
68
+
69
+ def __init__(self, text, lang, char_span=False):
70
+ super().__init__(text, lang, char_span)
71
+
72
def process(self):
    """Run the segmentation pipeline and return the resulting segments.

    Falsy input text is returned unchanged.
    """
    if not self.text:
        return self.text
    # Newlines are normalised to carriage returns before any rule runs.
    self.text = self.text.replace('\n', '\r')

    # Here we use language specific ListItemReplacer:
    li = self.lang.ListItemReplacer(self.text)
    self.text = li.add_line_break()

    self.replace_abbreviations()
    self.replace_numbers()
    self.replace_continuous_punctuation()
    self.replace_periods_before_numeric_references()
    self.text = Text(self.text).apply(
        self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
        self.lang.GeoLocationRule, self.lang.FileFormatRule)
    postprocessed_sents = self.split_into_segments()
    return postprocessed_sents
90
+
91
def replace_numbers(self):
    """Apply the shared number rules, then the Slovak date, ordinal and
    roman-numeral period replacements; return the updated text."""
    self.text = Text(self.text).apply(*self.lang.Numbers.All)
    self.replace_period_in_slovak_dates()
    self.replace_period_in_ordinal_numerals()
    self.replace_period_in_roman_numerals()
    return self.text
97
+
98
def replace_period_in_ordinal_numerals(self):
    """Mask the period after a digit when lower-case ASCII letters follow
    (optionally after whitespace)."""
    # Rubular: https://rubular.com/r/0HkmvzMGTqgWs6
    ordinal_period = re.compile(r'(?<=\d)\.(?=\s*[a-z]+)')
    self.text = ordinal_period.sub('∯', self.text)
101
+
102
def replace_period_in_roman_numerals(self):
    """Mask the period after a roman numeral built from V, X and I.

    The numeral must be preceded by whitespace or start the text, and the
    period must be followed by whitespace.  Matching is case-insensitive.
    """
    # Rubular: https://rubular.com/r/XlzTIi7aBRThSl
    # re.IGNORECASE has to be passed as `flags=`: the fourth positional
    # parameter of re.sub is `count`, so the flag was previously read as
    # count=2 (only two substitutions, and case-sensitive matching).
    self.text = re.sub(r'((\s+[VXI]+)|(^[VXI]+))(\.)(?=\s+)', r'\1∯',
                       self.text, flags=re.IGNORECASE)
105
+
106
def replace_period_in_slovak_dates(self):
    """Mask the period between a day number and a capitalised Slovak month name."""
    MONTHS = ['Január', 'Február', 'Marec', 'Apríl', 'Máj', 'Jún', 'Júl', 'August', 'September', 'Október', 'November', 'December',
              'Januára', 'Februára', 'Marca', 'Apríla', 'Mája', 'Júna', 'Júla', 'Augusta', 'Septembra', 'Októbra', 'Novembra', 'Decembra']
    # One alternation over all month names replaces the per-month loop; the
    # lookahead only needs any one of the names to follow the period.
    any_month = '(?:' + '|'.join(MONTHS) + ')'
    # Rubular: https://rubular.com/r/dGLZqsbjcdJvCd
    date_period = re.compile(r'(?<=\d)\.(?=\s*{month})'.format(month=any_month))
    self.text = date_period.sub('∯', self.text)
pysbd/lang/spanish.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
3
+ from pysbd.lang.common import Common, Standard
4
+
5
+ class Spanish(Common, Standard):
6
+
7
+ iso_code = 'es'
8
+
9
+ class AbbreviationReplacer(AbbreviationReplacer):
10
+ SENTENCE_STARTERS = []
11
+
12
+ class Abbreviation(Standard.Abbreviation):
13
+ ABBREVIATIONS = ['a.c', 'a/c', 'abr', 'adj', 'admón', 'afmo', 'ago', 'almte', 'ap', 'apdo', 'arq', 'art', 'atte', 'av', 'avda', 'bco', 'bibl', 'bs. as', 'c', 'c.f', 'c.g', 'c/c', 'c/u', 'cap', 'cc.aa', 'cdad', 'cm', 'co', 'cra', 'cta', 'cv', 'd.e.p', 'da', 'dcha', 'dcho', 'dep', 'dic', 'dicc', 'dir', 'dn', 'doc', 'dom', 'dpto', 'dr', 'dra', 'dto', 'ee', 'ej', 'en', 'entlo', 'esq', 'etc', 'excmo', 'ext', 'f.c', 'fca', 'fdo', 'febr', 'ff. aa', 'ff.cc', 'fig', 'fil', 'fra', 'g.p', 'g/p', 'gob', 'gr', 'gral', 'grs', 'hnos', 'hs', 'igl', 'iltre', 'imp', 'impr', 'impto', 'incl', 'ing', 'inst', 'izdo', 'izq', 'izqdo', 'j.c', 'jue', 'jul', 'jun', 'kg', 'km', 'lcdo', 'ldo', 'let', 'lic', 'ltd', 'lun', 'mar', 'may', 'mg', 'min', 'mié', 'mm', 'máx', 'mín', 'mt', 'n. del t', 'n.b', 'no', 'nov', 'ntra. sra', 'núm', 'oct', 'p', 'p.a', 'p.d', 'p.ej', 'p.v.p', 'párrf', 'ppal', 'prev', 'prof', 'prov', 'ptas', 'pts', 'pza', 'pág', 'págs', 'párr', 'q.e.g.e', 'q.e.p.d', 'q.e.s.m', 'reg', 'rep', 'rr. hh', 'rte', 's', 's. a', 's.a.r', 's.e', 's.l', 's.r.c', 's.r.l', 's.s.s', 's/n', 'sdad', 'seg', 'sept', 'sig', 'sr', 'sra', 'sres', 'srta', 'sta', 'sto', 'sáb', 't.v.e', 'tamb', 'tel', 'tfno', 'ud', 'uu', 'uds', 'univ', 'v.b', 'v.e', 'vd', 'vds', 'vid', 'vie', 'vol', 'vs', 'vto', 'a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
14
+ PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
15
+ NUMBER_ABBREVIATIONS = ['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel']
pysbd/lang/urdu.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
3
+ from pysbd.lang.common import Common, Standard
4
+
5
class Urdu(Common, Standard):
    """Urdu ('ur') language rules for sentence boundary detection."""

    iso_code = 'ur'

    # A sentence is the shortest run ending in one of the listed terminators
    # ('۔', '؟', '!' or '?'); the `.*?$` alternative picks up trailing text
    # without a terminator.
    SENTENCE_BOUNDARY_REGEX = r'.*?[۔؟!\?]|.*?$'
    Punctuations = ['?', '!', '۔', '؟']

    class AbbreviationReplacer(AbbreviationReplacer):
        # No sentence-starter words are configured for Urdu.
        SENTENCE_STARTERS = []
pysbd/languages.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from pysbd.lang.english import English
3
+ from pysbd.lang.hindi import Hindi
4
+ from pysbd.lang.marathi import Marathi
5
+ from pysbd.lang.chinese import Chinese
6
+ from pysbd.lang.spanish import Spanish
7
+ from pysbd.lang.amharic import Amharic
8
+ from pysbd.lang.arabic import Arabic
9
+ from pysbd.lang.armenian import Armenian
10
+ from pysbd.lang.bulgarian import Bulgarian
11
+ from pysbd.lang.urdu import Urdu
12
+ from pysbd.lang.russian import Russian
13
+ from pysbd.lang.polish import Polish
14
+ from pysbd.lang.persian import Persian
15
+ from pysbd.lang.dutch import Dutch
16
+ from pysbd.lang.danish import Danish
17
+ from pysbd.lang.french import French
18
+ from pysbd.lang.burmese import Burmese
19
+ from pysbd.lang.greek import Greek
20
+ from pysbd.lang.italian import Italian
21
+ from pysbd.lang.japanese import Japanese
22
+ from pysbd.lang.deutsch import Deutsch
23
+ from pysbd.lang.kazakh import Kazakh
24
+ from pysbd.lang.slovak import Slovak
25
+ from pysbd.lang.armenian import Armenian
26
+
27
# Maps an ISO 639-1 language code to its language class.
# The duplicated 'hy' (Armenian) entry was removed; a repeated key in a dict
# literal silently overwrites the earlier one, so the mapping is unchanged.
LANGUAGE_CODES = {
    'en': English,
    'hi': Hindi,
    'mr': Marathi,
    'zh': Chinese,
    'es': Spanish,
    'am': Amharic,
    'ar': Arabic,
    'hy': Armenian,
    'bg': Bulgarian,
    'ur': Urdu,
    'ru': Russian,
    'pl': Polish,
    'fa': Persian,
    'nl': Dutch,
    'da': Danish,
    'fr': French,
    'my': Burmese,
    'el': Greek,
    'it': Italian,
    'ja': Japanese,
    'de': Deutsch,
    'kk': Kazakh,
    'sk': Slovak,
}
53
+
54
+
55
class Language:
    """Holds a language code and resolves codes to language classes."""

    def __init__(self, code):
        self.code = code

    @classmethod
    def get_language_code(cls, code):
        """Return the language class registered for *code*.

        Raises
        ------
        ValueError
            If *code* is not a key of LANGUAGE_CODES.
        """
        try:
            language = LANGUAGE_CODES[code]
        except KeyError:
            available = set(LANGUAGE_CODES.keys())
            message = ("Provide valid language ID i.e. ISO code. "
                       "Available codes are : {}").format(available)
            raise ValueError(message)
        return language
pysbd/lists_item_replacer.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import string
3
+ import re
4
+ from pysbd.utils import Rule, Text
5
+ from functools import partial
6
+
7
+
8
class ListItemReplacer(object):
    """Detects list markers in text and protects/separates the list items.

    Numbered ('1.', '1)'), alphabetical ('a.', 'a)') and roman-numeral lists
    are recognised.  Markers are temporarily tagged with the placeholder
    characters '♨' (period lists) and '☝' (parenthesis lists), and list items
    are separated with carriage returns.
    """

    # Each numeral appears exactly once so that list.index() differences
    # reflect real adjacency.  The previous value repeated 'x xi xii xiii'
    # after 'xiv', which pushed 'xv' four positions away from 'xiv' and
    # stopped 'xiv' -> 'xv' from being recognised as consecutive items.
    ROMAN_NUMERALS = "i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx".split(' ')
    LATIN_NUMERALS = list(string.ascii_lowercase)

    # Rubular: http://rubular.com/r/XcpaJKH0sz
    ALPHABETICAL_LIST_WITH_PERIODS = r'(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)'

    # Rubular: http://rubular.com/r/Gu5rQapywf
    # TODO: Make sure below regex call is case-insensitive
    ALPHABETICAL_LIST_WITH_PARENS = r'(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

    # (pattern, replacement)
    SubstituteListPeriodRule = Rule('♨', '∯')
    ListMarkerRule = Rule('☝', '')

    # Rubular: http://rubular.com/r/Wv4qLdoPx7
    # https://regex101.com/r/62YBlv/1
    SpaceBetweenListItemsFirstRule = Rule(r'(?<=\S\S)\s(?=\S\s*\d+♨)', "\r")

    # Rubular: http://rubular.com/r/AizHXC6HxK
    # https://regex101.com/r/62YBlv/2
    SpaceBetweenListItemsSecondRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}♨)', "\r")

    # Rubular: http://rubular.com/r/GE5q6yID2j
    # https://regex101.com/r/62YBlv/3
    SpaceBetweenListItemsThirdRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}☝)', "\r")

    # Extracts the item numbers.  The ninth alternative now reads (?<=\s\-);
    # it previously required a literal 's' before the dash, unlike the
    # matching alternative in NUMBERED_LIST_REGEX_2.
    NUMBERED_LIST_REGEX_1 = r'\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)|(?<=\s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))'
    # 1. abcd
    # 2. xyz
    NUMBERED_LIST_REGEX_2 = r'(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))'
    # 1) abcd
    # 2) xyz
    NUMBERED_LIST_PARENS_REGEX = r'\d{1,2}(?=\)\s)'

    # Rubular: http://rubular.com/r/NsNFSqrNvJ
    # TODO: Make sure below regex call is case-insensitive
    EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX = r'\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

    # Rubular: http://rubular.com/r/wMpnVedEIb
    # TODO: Make sure below regex call is case-insensitive
    ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX = r'(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\.'

    # Rubular: http://rubular.com/r/GcnmQt4a3I
    ROMAN_NUMERALS_IN_PARENTHESES = r'\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])'

    def __init__(self, text):
        self.text = text

    def add_line_break(self):
        """Run every list formatter in order and return the processed text."""
        self.format_alphabetical_lists()
        self.format_roman_numeral_lists()
        self.format_numbered_list_with_periods()
        self.format_numbered_list_with_parens()
        return self.text

    def replace_parens(self):
        """Return the text with parenthesised roman numerals wrapped in the
        '&✂&' / '&⌬&' placeholders (self.text itself is not modified)."""
        text = re.sub(self.ROMAN_NUMERALS_IN_PARENTHESES,
                      r'&✂&\1&⌬&', self.text)
        return text

    def format_numbered_list_with_parens(self):
        """Handle '1) 2) ...' lists: tag, insert line breaks, remove tags."""
        self.replace_parens_in_numbered_list()
        self.add_line_breaks_for_numbered_list_with_parens()
        self.text = Text(self.text).apply(self.ListMarkerRule)

    def replace_periods_in_numbered_list(self):
        """Tag the periods of consecutive numbered items with '♨'."""
        self.scan_lists(self.NUMBERED_LIST_REGEX_1, self.NUMBERED_LIST_REGEX_2,
                        '♨', strip=True)

    def format_numbered_list_with_periods(self):
        """Handle '1. 2. ...' lists: tag, insert line breaks, turn tags into '∯'."""
        self.replace_periods_in_numbered_list()
        self.add_line_breaks_for_numbered_list_with_periods()
        self.text = Text(self.text).apply(self.SubstituteListPeriodRule)

    def format_alphabetical_lists(self):
        """Handle 'a. b. ...' and 'a) b) ...' lists; return the updated text."""
        # Both helpers update self.text themselves and return it.
        self.add_line_breaks_for_alphabetical_list_with_periods(
            roman_numeral=False)
        self.text = self.add_line_breaks_for_alphabetical_list_with_parens(
            roman_numeral=False)
        return self.text

    def format_roman_numeral_lists(self):
        """Handle 'i. ii. ...' and 'i) ii) ...' lists; return the updated text."""
        self.add_line_breaks_for_alphabetical_list_with_periods(
            roman_numeral=True)
        self.text = self.add_line_breaks_for_alphabetical_list_with_parens(
            roman_numeral=True)
        return self.text

    def add_line_breaks_for_alphabetical_list_with_periods(
            self, roman_numeral=False):
        """Process letter (or roman numeral) markers that end with a period."""
        return self.iterate_alphabet_array(
            self.ALPHABETICAL_LIST_WITH_PERIODS,
            roman_numeral=roman_numeral)

    def add_line_breaks_for_alphabetical_list_with_parens(self, roman_numeral=False):
        """Process letter (or roman numeral) markers that end with ')'."""
        return self.iterate_alphabet_array(
            self.ALPHABETICAL_LIST_WITH_PARENS,
            parens=True,
            roman_numeral=roman_numeral)

    def scan_lists(self, regex1, regex2, replacement, strip=False):
        """Tag the numbers found by *regex1* that form a consecutive run.

        A number qualifies when the next number found is one higher, or when
        the previous number is one lower (also accepting a 9/0 wrap-around in
        either direction).  Qualifying numbers are rewritten through *regex2*
        with *replacement* appended.
        """
        numbers = [int(found) for found in re.findall(regex1, self.text)]
        last_index = len(numbers) - 1
        for ind, item in enumerate(numbers):
            # Explicit bounds checks avoid IndexError (the Ruby original
            # relies on nil for out-of-range indexes).
            if ind < last_index and item + 1 == numbers[ind + 1]:
                self.substitute_found_list_items(regex2, item, strip, replacement)
            elif ind > 0:
                previous = numbers[ind - 1]
                if (item - 1 == previous
                        or (item == 0 and previous == 9)
                        or (item == 9 and previous == 0)):
                    self.substitute_found_list_items(regex2, item, strip, replacement)

    def substitute_found_list_items(self, regex, each, strip, replacement):
        """Rewrite every *regex* match whose number equals *each* as
        '<each><replacement>'; other matches are returned as matched
        (whitespace-stripped when *strip* is true)."""
        wanted = str(each)

        def replace_item(match):
            found = match.group()
            if strip:
                found = found.strip()
            # Single characters are compared as-is; longer matches lose any
            # leading/trailing '.', ']' or ')' first.
            chomped = found if len(found) == 1 else found.strip('.])')
            if wanted == chomped:
                return "{}{}".format(each, replacement)
            return found

        self.text = re.sub(regex, replace_item, self.text)

    def add_line_breaks_for_numbered_list_with_periods(self):
        """Insert line breaks between '♨'-tagged items, unless two tags are
        already separated by a line break or a 'for <n>♨ <word>' phrase is
        present."""
        if ('♨' in self.text) and (not re.search(
                '♨.+(\n|\r).+♨', self.text)) and (not re.search(
                    r'for\s\d{1,2}♨\s[a-z]', self.text)):
            self.text = Text(self.text).apply(self.SpaceBetweenListItemsFirstRule,
                                              self.SpaceBetweenListItemsSecondRule)

    def replace_parens_in_numbered_list(self):
        """Tag the numbers of 'n)' list items with '☝'."""
        # The scan runs twice on purpose: numbers tagged by the first pass no
        # longer match the regex, so numbers that were separated by them can
        # become neighbours and qualify on the second pass.
        self.scan_lists(
            self.NUMBERED_LIST_PARENS_REGEX, self.NUMBERED_LIST_PARENS_REGEX, '☝')
        self.scan_lists(
            self.NUMBERED_LIST_PARENS_REGEX, self.NUMBERED_LIST_PARENS_REGEX, '☝')

    def add_line_breaks_for_numbered_list_with_parens(self):
        """Insert line breaks between '☝'-tagged items unless two tags are
        already separated by a line break."""
        if '☝' in self.text and not re.search("☝.+\n.+☝|☝.+\r.+☝", self.text):
            self.text = Text(self.text).apply(
                self.SpaceBetweenListItemsThirdRule)

    def replace_alphabet_list(self, a):
        r"""Return the text with every '<a>.' marker rewritten as '\r<a>∯'.

        Input: 'a. ffegnog b. fgegkl c.'
        Output: \ra∯ ffegnog \rb∯ fgegkl \rc∯
        """

        def replace_letter_period(match):
            found = match.group()
            letter = found.strip('.')
            if letter == a:
                return '\r{}∯'.format(letter)
            return found

        return re.sub(self.ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX,
                      replace_letter_period,
                      self.text, flags=re.IGNORECASE)

    def replace_alphabet_list_parens(self, a):
        r"""Return the text with every '<a>)' / '(<a>)' marker prefixed by a
        carriage return; an opening '(' becomes the '&✂&' placeholder.

        Input: "a) ffegnog (b) fgegkl c)"
        Output: "\ra) ffegnog \r&✂&b) fgegkl \rc)"
        """

        def replace_alphabet_paren(match):
            found = match.group()
            if '(' in found:
                letter = found.strip('(')
                if letter == a:
                    return '\r&✂&{}'.format(letter)
                return found
            if found == a:
                return '\r{}'.format(found)
            return found

        # The search is case-insensitive, but the comparison with `a` is not.
        return re.sub(self.EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX,
                      replace_alphabet_paren,
                      self.text, flags=re.IGNORECASE)

    def replace_correct_alphabet_list(self, a, parens):
        """Dispatch to the parenthesis or period variant for marker *a*."""
        if parens:
            return self.replace_alphabet_list_parens(a)
        return self.replace_alphabet_list(a)

    def last_array_item_replacement(self, a, i, alphabet, list_array, parens):
        """Process the final marker: rewrite it only when it is adjacent in
        *alphabet* to the marker before it; otherwise return the text as is."""
        if (len(alphabet) == 0 and len(list_array) == 0) or (
                list_array[i - 1] not in alphabet) or (a not in alphabet):
            return self.text
        if abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1:
            return self.text
        return self.replace_correct_alphabet_list(a, parens)

    def other_items_replacement(self, a, i, alphabet, list_array, parens):
        """Process a non-final marker: rewrite it when the next marker follows
        it in *alphabet* or the previous marker is adjacent to it."""
        # For i == 0, list_array[i - 1] wraps around to the last marker.
        if (len(alphabet) == 0 and len(list_array) == 0) or (
                list_array[i - 1] not in alphabet) or (a not in alphabet) or (
                    list_array[i + 1] not in alphabet):
            return self.text
        if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 and \
                abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1:
            return self.text
        return self.replace_correct_alphabet_list(a, parens)

    def iterate_alphabet_array(self, regex, parens=False, roman_numeral=False):
        """Find candidate markers with *regex*, keep those that belong to the
        chosen alphabet, process each one, and return the updated text."""
        alphabet = self.ROMAN_NUMERALS if roman_numeral else self.LATIN_NUMERALS
        list_array = [i for i in re.findall(regex, self.text) if i in alphabet]
        last_index = len(list_array) - 1
        for ind, each in enumerate(list_array):
            if ind == last_index:
                self.text = self.last_array_item_replacement(
                    each, ind, alphabet, list_array, parens)
            else:
                self.text = self.other_items_replacement(
                    each, ind, alphabet, list_array, parens)
        return self.text
pysbd/processor.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+ from pysbd.utils import Text
4
+ from pysbd.lists_item_replacer import ListItemReplacer
5
+ from pysbd.exclamation_words import ExclamationWords
6
+ from pysbd.between_punctuation import BetweenPunctuation
7
+ from pysbd.abbreviation_replacer import AbbreviationReplacer
8
+
9
class Processor(object):
    """Pre-process a text, split it into sentences and post-process them.

    All language specific rules and regexes are read from the ``lang``
    module given at construction time.
    """

    def __init__(self, text, lang, char_span=False):
        """Process a text - do pre and post processing - to get proper sentences

        Parameters
        ----------
        text : str
            Original text
        lang : object
            Language module
        char_span : bool, optional
            Get start & end character offsets of each sentences
            within original text, by default False
        """
        self.text = text
        self.lang = lang
        self.char_span = char_span

    def process(self):
        """Run the whole pipeline on ``self.text``.

        Returns
        -------
        list or str
            List of sentences. When the text is falsy (empty or None) it is
            returned unchanged.
        """
        if not self.text:
            return self.text
        # '\r' is the internal segment separator used by split_into_segments
        self.text = self.text.replace('\n', '\r')
        li = ListItemReplacer(self.text)
        self.text = li.add_line_break()
        self.replace_abbreviations()
        self.replace_numbers()
        self.replace_continuous_punctuation()
        self.replace_periods_before_numeric_references()
        self.text = Text(self.text).apply(
            self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
            self.lang.GeoLocationRule, self.lang.FileFormatRule)
        return self.split_into_segments()

    def rm_none_flatten(self, sents):
        """Remove None values and unpack list of list sents

        Parameters
        ----------
        sents : list
            list of sentences

        Returns
        -------
        list
            unpacked and None removed list of sents
        """
        # filter(None, ...) drops every falsy entry: None, '' and [].
        flattened = []
        for sent in filter(None, sents):
            if isinstance(sent, list):
                # only one level is unpacked; inner items are kept as they are
                flattened.extend(sent)
            else:
                flattened.append(sent)
        return flattened

    def split_into_segments(self):
        """Split ``self.text`` on '\\r' and refine every piece into sentences.

        Returns
        -------
        list
            Post-processed sentences.
        """
        self.check_for_parens_between_quotes()
        sents = self.text.split('\r')
        # remove empty and none values
        sents = self.rm_none_flatten(sents)
        sents = [
            Text(s).apply(self.lang.SingleNewLineRule, *self.lang.EllipsisRules.All)
            for s in sents
        ]

        # # THESE LINES ARE NOT PRESENT IN THE ORIGINAL CODE --> ONLY USE FOR HYW
        # sents = [self.post_process_segments(s) for s in sents]
        # sents = self.rm_none_flatten(sents)

        sents = [self.check_for_punctuation(s) for s in sents]
        # flatten list of list of sentences
        sents = self.rm_none_flatten(sents)
        postprocessed_sents = []
        for sent in sents:
            sent = Text(sent).apply(*self.lang.SubSymbolsRules.All)
            post_process_sent = self.post_process_segments(sent)
            # post_process_segments returns either a str or a list of str
            if post_process_sent and isinstance(post_process_sent, str):
                postprocessed_sents.append(post_process_sent)
            elif isinstance(post_process_sent, list):
                postprocessed_sents.extend(post_process_sent)
        return [Text(ns).apply(self.lang.SubSingleQuoteRule)
                for ns in postprocessed_sents]

    def post_process_segments(self, txt):
        """Finalize a single segment.

        Parameters
        ----------
        txt : str
            Segment text.

        Returns
        -------
        str or list
            The segment itself, a stripped copy of it, or a list of pieces
            when the segment matches the language's
            ``QUOTATION_AT_END_OF_SENTENCE_REGEX``.
        """
        # purely ASCII-alphabetic segments longer than 2 chars pass through
        if len(txt) > 2 and re.search(r'\A[a-zA-Z]*\Z', txt):
            return txt

        # below condition present in pragmatic segmenter
        # dont know significance of it yet.
        # if self.consecutive_underscore(txt) or len(txt) < 2:
        #     return txt

        # TODO:
        # Decide on keeping or removing Standard.ExtraWhiteSpaceRule
        # removed to retain original text spans
        # txt = Text(txt).apply(*ReinsertEllipsisRules.All,
        #                       Standard.ExtraWhiteSpaceRule)
        txt = Text(txt).apply(*self.lang.ReinsertEllipsisRules.All)
        if re.search(self.lang.QUOTATION_AT_END_OF_SENTENCE_REGEX, txt):
            return re.split(
                self.lang.SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX, txt)
        return txt.replace('\n', '').strip()

    def check_for_parens_between_quotes(self):
        """Turn the whitespace around parentheses into '\\r' inside matches of
        ``PARENS_BETWEEN_DOUBLE_QUOTES_REGEX``; updates ``self.text``."""
        def paren_replace(match):
            match = match.group()
            # whitespace right before '(' and right after ')' becomes '\r'
            sub1 = re.sub(r'\s(?=\()', '\r', match)
            sub2 = re.sub(r'(?<=\))\s', '\r', sub1)
            return sub2
        self.text = re.sub(self.lang.PARENS_BETWEEN_DOUBLE_QUOTES_REGEX,
                           paren_replace, self.text)

    def replace_continuous_punctuation(self):
        """Mask '!' and '?' inside matches of ``CONTINUOUS_PUNCTUATION_REGEX``
        with placeholder tokens; updates ``self.text``."""
        def continuous_puncs_replace(match):
            # literal single-character swaps, no regex needed
            return match.group().replace('!', '&ᓴ&').replace('?', '&ᓷ&')
        self.text = re.sub(self.lang.CONTINUOUS_PUNCTUATION_REGEX,
                           continuous_puncs_replace, self.text)

    def replace_periods_before_numeric_references(self):
        """Rewrite matches of ``NUMBERED_REFERENCE_REGEX`` as
        '∯' + group 2 + '\\r' + group 7; updates ``self.text``."""
        # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703
        self.text = re.sub(self.lang.NUMBERED_REFERENCE_REGEX,
                           r"∯\2\r\7", self.text)

    def consecutive_underscore(self, txt):
        """Return True when nothing is left of ``txt`` after removing every
        run of three or more underscores."""
        # Rubular: http://rubular.com/r/fTF2Ff3WBL
        txt = re.sub(r'_{3,}', '', txt)
        return len(txt) == 0

    def check_for_punctuation(self, txt):
        """Split ``txt`` further only when it contains a known punctuation.

        Returns
        -------
        list
            Result of ``process_text`` when any of ``lang.Punctuations``
            occurs in ``txt``, otherwise ``[txt]``.
        """
        if any(p in txt for p in self.lang.Punctuations):
            return self.process_text(txt)
        # NOTE: next steps of check_for_punctuation will unpack this list
        return [txt]

    def process_text(self, txt):
        """Apply the punctuation rules to ``txt`` and split it at sentence
        boundaries.

        Returns
        -------
        list
            Pieces found by ``sentence_boundary_punctuation``.
        """
        # marker character appended when the text has no ending punctuation
        if txt[-1] not in self.lang.Punctuations:
            txt += 'ȸ'
        txt = ExclamationWords.apply_rules(txt)
        txt = self.between_punctuation(txt)
        # handle text having only doublepunctuations
        if not re.match(self.lang.DoublePunctuationRules.DoublePunctuation, txt):
            txt = Text(txt).apply(*self.lang.DoublePunctuationRules.All)
        txt = Text(txt).apply(self.lang.QuestionMarkInQuotationRule,
                              *self.lang.ExclamationPointRules.All)
        txt = ListItemReplacer(txt).replace_parens()
        return self.sentence_boundary_punctuation(txt)

    def replace_numbers(self):
        """Apply every rule of ``lang.Numbers.All``; updates ``self.text``."""
        self.text = Text(self.text).apply(*self.lang.Numbers.All)

    def abbreviations_replacer(self):
        """Build the abbreviation replacer, preferring the language module's
        own ``AbbreviationReplacer`` when it defines one."""
        if hasattr(self.lang, "AbbreviationReplacer"):
            return self.lang.AbbreviationReplacer(self.text, self.lang)
        return AbbreviationReplacer(self.text, self.lang)

    def replace_abbreviations(self):
        """Run the abbreviation replacer; updates ``self.text``."""
        self.text = self.abbreviations_replacer().replace()

    def between_punctuation_processor(self, txt):
        """Build the between-punctuation processor, preferring the language
        module's own ``BetweenPunctuation`` when it defines one."""
        if hasattr(self.lang, "BetweenPunctuation"):
            return self.lang.BetweenPunctuation(txt)
        return BetweenPunctuation(txt)

    def between_punctuation(self, txt):
        """Return ``txt`` after the between-punctuation processor ran on it."""
        return self.between_punctuation_processor(txt).replace()

    def sentence_boundary_punctuation(self, txt):
        """Split ``txt`` into the matches of ``SENTENCE_BOUNDARY_REGEX``.

        The optional language rules ``ReplaceColonBetweenNumbersRule`` and
        ``ReplaceNonSentenceBoundaryCommaRule`` are applied first when the
        language module defines them.

        Returns
        -------
        list
            Matched substrings, in order.
        """
        if hasattr(self.lang, 'ReplaceColonBetweenNumbersRule'):
            txt = Text(txt).apply(
                self.lang.ReplaceColonBetweenNumbersRule)
        if hasattr(self.lang, 'ReplaceNonSentenceBoundaryCommaRule'):
            txt = Text(txt).apply(
                self.lang.ReplaceNonSentenceBoundaryCommaRule)
        # retain exclamation mark if it is an ending character of a given text
        txt = re.sub(r'&ᓴ&$', '!', txt)
        return [
            m.group() for m in re.finditer(self.lang.SENTENCE_BOUNDARY_REGEX, txt)
        ]
pysbd/punctuation_replacer.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+ from pysbd.utils import Rule, Text
4
+
5
+
6
class EscapeRegexReservedCharacters(object):
    """Rules that put a backslash in front of ``(``, ``)``, ``[``, ``]``
    and ``-``."""

    LeftParen = Rule(r'\(', '\\(')
    RightParen = Rule(r'\)', '\\)')
    LeftBracket = Rule(r'\[', '\\[')
    RightBracket = Rule(r'\]', '\\]')
    Dash = Rule(r'\-', '\\-')

    # Every rule of this class, in application order.
    All = [
        LeftParen,
        RightParen,
        LeftBracket,
        RightBracket,
        Dash,
    ]
16
+
17
+
18
class SubEscapedRegexReservedCharacters(object):
    """Rules that strip the backslash from escaped ``(``, ``)``, ``[``,
    ``]`` and ``-``."""

    SubLeftParen = Rule(r'\\\(', '(')
    SubRightParen = Rule(r'\\\)', ')')
    SubLeftBracket = Rule(r'\\\[', '[')
    SubRightBracket = Rule(r'\\\]', ']')
    SubDash = Rule(r'\\\-', '-')

    # Every rule of this class, in application order.
    All = [
        SubLeftParen,
        SubRightParen,
        SubLeftBracket,
        SubRightBracket,
        SubDash,
    ]
30
+
31
+
32
def replace_punctuation(match, match_type=None):
    """Mask the punctuation inside a regex match with placeholder tokens.

    Parameters
    ----------
    match : re.Match
        Match whose whole text (``match.group()``) gets masked.
    match_type : str, optional
        When anything other than ``'single'``, single quotes are masked
        as well, by default None

    Returns
    -------
    str
        The matched text with punctuation replaced by placeholders.
    """
    text = Text(match.group()).apply(*EscapeRegexReservedCharacters.All)
    sub = re.sub(r'\.', '∯', text)
    sub = re.sub(r'։', '⍟', sub)  # ADDED FOR ARMENIAN
    sub = re.sub(r'\。', '&ᓰ&', sub)
    # Full-width forms are spelled as escapes so they cannot be confused with
    # (or normalized into) their ASCII look-alikes, which have their own
    # placeholders below.
    sub = re.sub('\uff0e', '&ᓱ&', sub)  # FULLWIDTH FULL STOP
    sub = re.sub('\uff01', '&ᓳ&', sub)  # FULLWIDTH EXCLAMATION MARK
    sub = re.sub(r'\!', '&ᓴ&', sub)
    sub = re.sub(r'\?', '&ᓷ&', sub)
    sub = re.sub('\uff1f', '&ᓸ&', sub)  # FULLWIDTH QUESTION MARK
    if match_type != 'single':
        sub = re.sub(r"'", '&⎋&', sub)
    return Text(sub).apply(*SubEscapedRegexReservedCharacters.All)
pysbd/segmenter.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import re
3
+
4
+ from pysbd.languages import Language
5
+ from pysbd.processor import Processor
6
+ from pysbd.cleaner import Cleaner
7
+ from pysbd.utils import TextSpan
8
+
9
class Segmenter(object):
    """Split a text into sentences using the rules of one language."""

    def __init__(self, language="en", clean=False, doc_type=None, char_span=False):
        """Segments a text into a list of sentences
        with or without character offsets from original text

        Parameters
        ----------
        language : str, required
            specify a language use its two character ISO 639-1 code,
            by default "en"
        clean : bool, optional
            cleans original text, by default False
        doc_type : str, optional
            Normal text or OCRed text, by default None
            set to `pdf` for OCRed text
        char_span : bool, optional
            Get start & end character offsets of each sentences
            within original text, by default False

        Raises
        ------
        ValueError
            When ``clean`` and ``char_span`` are both True, or when
            ``doc_type`` is 'pdf' and ``clean`` is False.
        """
        self.language = language
        self.language_module = Language.get_language_code(language)
        self.clean = clean
        self.doc_type = doc_type
        self.char_span = char_span
        if self.clean and self.char_span:
            raise ValueError("char_span must be False if clean is True. "
                             "Since `clean=True` will modify original text.")
        # when doctype is pdf then force user to clean the text
        # char_span func wont be provided with pdf doctype also
        elif self.doc_type == 'pdf' and not self.clean:
            raise ValueError("`doc_type='pdf'` should have `clean=True` & "
                             "`char_span` should be False since original "
                             "text will be modified.")

    def cleaner(self, text):
        """Build the cleaner for ``text``, preferring the language module's
        own ``Cleaner`` when it defines one."""
        if hasattr(self.language_module, "Cleaner"):
            return self.language_module.Cleaner(text, self.language_module,
                                                doc_type=self.doc_type)
        return Cleaner(text, self.language_module, doc_type=self.doc_type)

    def processor(self, text):
        """Build the processor for ``text``, preferring the language module's
        own ``Processor`` when it defines one."""
        if hasattr(self.language_module, "Processor"):
            return self.language_module.Processor(text, self.language_module,
                                                  char_span=self.char_span)
        return Processor(text, self.language_module,
                         char_span=self.char_span)

    def sentences_with_char_spans(self, sentences):
        """Locate every sentence in ``self.original_text``.

        Parameters
        ----------
        sentences : list
            Sentences in the order they occur in the original text.

        Returns
        -------
        list
            One ``TextSpan`` per sentence that could be located; the span
            text includes the whitespace following the sentence.
        """
        # since SENTENCE_BOUNDARY_REGEX doesnt account
        # for trailing whitespaces \s* & is used as suffix
        # to keep non-destructive text after segments joins
        sent_spans = []
        prior_end_char_idx = 0
        for sent in sentences:
            pattern = r'{0}\s*'.format(re.escape(sent))
            for match in re.finditer(pattern, self.original_text):
                match_start_idx, match_end_idx = match.span()
                if match_end_idx > prior_end_char_idx:
                    # making sure if current sentence and its span
                    # is either first sentence along with its char spans
                    # or current sent spans adjacent to prior sentence spans
                    sent_spans.append(
                        TextSpan(match.group(), match_start_idx, match_end_idx))
                    prior_end_char_idx = match_end_idx
                    break
        return sent_spans

    def segment(self, text):
        """Segment ``text`` into sentences.

        Parameters
        ----------
        text : str
            Text to segment; it is also stored as ``self.original_text``.

        Returns
        -------
        list
            ``TextSpan`` objects when ``char_span`` is set, cleaned sentences
            when ``clean`` is set, otherwise sentences that keep their
            trailing whitespace. Empty list for falsy ``text``.
        """
        self.original_text = text
        if not text:
            return []

        if self.clean or self.doc_type == 'pdf':
            text = self.cleaner(text).clean()

        postprocessed_sents = self.processor(text).process()
        if self.clean and not self.char_span:
            # clean and destructed sentences; spans would only be thrown
            # away here, so they are not computed at all
            return postprocessed_sents
        sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents)
        if self.char_span:
            return sentence_w_char_spans
        # nondestructive with whitespaces
        return [textspan.sent for textspan in sentence_w_char_spans]
pysbd/utils.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ import re
4
+ import pysbd
5
+
6
class Rule(object):
    """A regex pattern paired with the replacement to substitute for it."""

    def __init__(self, pattern, replacement):
        """Store the regex ``pattern`` and its ``replacement`` string."""
        self.pattern, self.replacement = pattern, replacement

    def __repr__(self):  # pragma: no cover
        template = '<{} pattern="{}" and replacement="{}">'
        return template.format(
            type(self).__name__, self.pattern, self.replacement)
15
+
16
+
17
class Text(str):
    """A ``str`` subclass that can apply regex substitution rules to itself.

    https://stackoverflow.com/questions/4698493/can-i-add-custom-methods-attributes-to-built-in-python-types
    """

    def apply(self, *rules):
        """Apply ``rules`` one after another and return the final text.

        Parameters
        ----------
        *rules : object
            Objects exposing ``pattern`` and ``replacement`` attributes,
            applied in the given order through ``re.sub``.

        Returns
        -------
        str
            The text after all substitutions; the instance itself when no
            rule is given.
        """
        result = self
        for rule in rules:
            result = re.sub(rule.pattern, rule.replacement, result)
        return result
37
+
38
+
39
class TextSpan(object):
    """A sentence together with its character offsets in the original text."""

    def __init__(self, sent, start, end):
        """
        Sentence text and its start & end character offsets within original text

        Parameters
        ----------
        sent : str
            Sentence text
        start : int
            start character offset of a sentence in original text
        end : int
            end character offset of a sentence in original text
        """
        self.sent = sent
        self.start = start
        self.end = end

    def __repr__(self):  # pragma: no cover
        return "{0}(sent={1}, start={2}, end={3})".format(
            self.__class__.__name__, repr(self.sent), self.start, self.end)

    def __eq__(self, other):
        """Compare sentence text and both offsets with another ``TextSpan``.

        Anything that is not a ``TextSpan`` yields ``NotImplemented`` so that
        Python falls back to its default comparison; ``==`` is then False
        instead of None or an AttributeError.
        """
        if not isinstance(other, TextSpan):
            return NotImplemented
        return (self.sent == other.sent and self.start == other.start
                and self.end == other.end)
65
+
66
+
67
class PySBDFactory(object):
    """pysbd as a spacy component through entrypoints"""

    def __init__(self, nlp, language='en'):
        """Keep ``nlp`` and build a char-span segmenter for ``language``."""
        self.nlp = nlp
        self.seg = pysbd.Segmenter(language=language, clean=False,
                                   char_span=True)

    def __call__(self, doc):
        """Flag every token of ``doc`` that starts a sentence.

        ``token.is_sent_start`` is set to True when ``token.idx`` equals the
        start offset of a segmented sentence and to False otherwise.

        Returns
        -------
        object
            The same ``doc`` that was passed in.
        """
        sents_char_spans = self.seg.segment(doc.text_with_ws)
        # a set keeps the per-token membership test constant-time
        sentence_starts = {sent.start for sent in sents_char_spans}
        for token in doc:
            token.is_sent_start = token.idx in sentence_starts
        return doc
81
+ return doc