Ari Nubar Boyacıoğlu
committed · Commit 42bcb30
Parent(s): 2883dd2
add files
This view is limited to 50 files because it contains too many changes.
- .gitignore +4 -0
- README copy.md +12 -0
- README.md +5 -5
- app.py +231 -0
- img/gulbenkian.png +0 -0
- img/gulbenkian.svg +1 -0
- img/mkhitaryan-varjaran.png +0 -0
- img/teaov.png +0 -0
- img/translate.png +0 -0
- pysbd/__init__.py +2 -0
- pysbd/abbreviation_replacer.py +112 -0
- pysbd/about.py +10 -0
- pysbd/between_punctuation.py +94 -0
- pysbd/clean/__init__.py +0 -0
- pysbd/clean/rules.py +80 -0
- pysbd/cleaner.py +111 -0
- pysbd/exclamation_words.py +17 -0
- pysbd/lang/__init__.py +0 -0
- pysbd/lang/amharic.py +13 -0
- pysbd/lang/arabic.py +35 -0
- pysbd/lang/armenian.py +112 -0
- pysbd/lang/bulgarian.py +24 -0
- pysbd/lang/burmese.py +13 -0
- pysbd/lang/chinese.py +36 -0
- pysbd/lang/common/__init__.py +2 -0
- pysbd/lang/common/common.py +91 -0
- pysbd/lang/common/standard.py +114 -0
- pysbd/lang/danish.py +40 -0
- pysbd/lang/deutsch.py +97 -0
- pysbd/lang/dutch.py +12 -0
- pysbd/lang/english.py +11 -0
- pysbd/lang/french.py +15 -0
- pysbd/lang/greek.py +13 -0
- pysbd/lang/hindi.py +13 -0
- pysbd/lang/italian.py +15 -0
- pysbd/lang/japanese.py +51 -0
- pysbd/lang/kazakh.py +50 -0
- pysbd/lang/marathi.py +14 -0
- pysbd/lang/persian.py +30 -0
- pysbd/lang/polish.py +15 -0
- pysbd/lang/russian.py +27 -0
- pysbd/lang/slovak.py +111 -0
- pysbd/lang/spanish.py +15 -0
- pysbd/lang/urdu.py +13 -0
- pysbd/languages.py +66 -0
- pysbd/lists_item_replacer.py +240 -0
- pysbd/processor.py +210 -0
- pysbd/punctuation_replacer.py +45 -0
- pysbd/segmenter.py +96 -0
- pysbd/utils.py +81 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
+__pycache__
+flagged_data
+.gitattributes
+.gradio
README copy.md
ADDED
@@ -0,0 +1,12 @@
+---
+title: Hyw En Demo v2
+emoji: ⚡
+colorFrom: blue
+colorTo: purple
+sdk: gradio
+sdk_version: 4.43.0
+app_file: app.py
+pinned: false
+license: cc-by-4.0
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
README.md
CHANGED
@@ -1,12 +1,12 @@
 ---
 title: Xcl En Demo
-emoji:
-colorFrom:
-colorTo:
+emoji: 📖
+colorFrom: red
+colorTo: purple
 sdk: gradio
-sdk_version:
+sdk_version: 4.43.0
 app_file: app.py
 pinned: false
+license: cc-by-4.0
 ---
-
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,231 @@
+# coding: utf-8
+
+import gradio as gr
+import os
+import json
+from pathlib import Path
+from uuid import uuid4
+from datetime import datetime
+from huggingface_hub import CommitScheduler
+from translation import Translator, LANGUAGES
+import re
+
+
+LANGUAGES_LIST = list(LANGUAGES.keys())
+HF_TOKEN = os.environ.get('HF_TOKEN')
+
+JSON_DATASET_DIR = Path("flagged_data")
+JSON_DATASET_DIR.mkdir(exist_ok=True, parents=True)
+JSON_DATASET_PATH = JSON_DATASET_DIR / f"dataset-session-{uuid4()}.json"
+
+
+
+
+def translate_wrapper(text, src_lang, tgt_lang, by_sentence=True, clean=True, num_beams=4):
+    if text in ["", None, []]:
+        return "Մուտքագրումը պարապ է։ | Input is empty."
+
+    if src_lang in ["", None, []] or tgt_lang in ["", None, []]:
+        return "Ընտրեցէք թարգմանութեան կողմերը | Please select source and target languages"
+
+    if src_lang == tgt_lang:
+        return "Ընտրուած լեզուները նոյնն են։ | Source and target languages are identical."
+
+    src_lang = LANGUAGES.get(src_lang)
+    tgt_lang = LANGUAGES.get(tgt_lang)
+
+    result = translator.translate(text, src_lang, tgt_lang, by_sentence=by_sentence, clean=clean, num_beams=num_beams)
+    return result
+
+
+# hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, "AriNubar/hyw-en-crowd-source")
+
+scheduler = CommitScheduler(
+    repo_id="AriNubar/xcl-en-crowdsource",
+    repo_type="dataset",
+    folder_path=JSON_DATASET_DIR,
+    path_in_repo="collected_data",
+    token=HF_TOKEN,
+    every=30  # every 30 minutes
+)
+
+TQS = {
+    "😊 Լաւ | Good": "good",
+    "😐 Միջակ | Average": "average",
+    "☹️ Վատ | Bad": "bad"
+}
+
+def save_json(src_lang, tgt_lang, input_text, output_text, by_sentence, clean, num_beams, translation_quality):
+    if any([src_lang in ["", None, []], tgt_lang in ["", None, []], input_text in ["", None, []], output_text in ["", None, []]]):
+        gr.Warning("Տուեալին մէկ մասը պարապ է։ Ձեր գնահատութիւնը չպահուեցաւ։ | Some part of the data is missing. Your feedback has not been saved.")
+        return
+
+    src_lang = LANGUAGES.get(src_lang)
+    tgt_lang = LANGUAGES.get(tgt_lang)
+    translation_quality = TQS.get(translation_quality)
+
+    # print(src_lang, tgt_lang, input_text, output_text, by_sentence, clean, num_beams, translation_quality)
+    # print(type(src_lang), type(tgt_lang), type(input_text), type(output_text), type(by_sentence), type(clean), type(num_beams), type(translation_quality))
+    data = {
+        "src_lang": src_lang,
+        "tgt_lang": tgt_lang,
+        "original": input_text,
+        "translation": output_text,
+        "by_sentence": by_sentence,
+        "clean": clean,
+        "num_beams": num_beams,
+        "translation_quality": translation_quality,
+        "timestamp": datetime.now().isoformat()
+    }
+    with scheduler.lock:
+        with open(JSON_DATASET_PATH, "a", encoding="utf8") as f:
+            f.write(json.dumps(data, ensure_ascii=False) + "\n")
+
+    gr.Info("Ձեր գնահանութիւնը պահուեցաւ։ Շատ շնորհակալութի՛ւն։ | Your feedback has been saved. Thank you.")
+
+
+def switch_languages(src, tgt, input_text, output_text):
+    new_src = tgt
+    new_tgt = src
+    new_input = output_text if output_text else input_text
+    return [new_src, new_tgt, new_input, None]
+
+
+def detect_language(text):
+    """Detect language based on script ratio"""
+    armenian_pattern = r'[\u0531-\u0587\u0589\u058A\u058F]'
+    non_armenian_pattern = r'[a-zA-Z]'
+
+    armenian_chars = len(re.findall(armenian_pattern, text))
+    non_armenian_chars = len(re.findall(non_armenian_pattern, text))
+
+    if armenian_chars > non_armenian_chars:
+        return "Գրաբար Հայոց | Classical Armenian", "Անգլերէն | English"
+    elif non_armenian_chars > 0:
+        return "Անգլերէն | English", "Գրաբար Հայոց | Classical Armenian"
+    return [gr.update(), gr.update()]  # No clear dominance, reset dropdowns
+
+def update_languages(text):
+    if not text:
+        return [gr.update(), gr.update()]
+    src, tgt = detect_language(text)
+    return [gr.update(value=src), gr.update(value=tgt)]
+
+
+theme = gr.themes.Default().set(
+    block_info_text_size="*text_xxs"  # for info text
+)
+
+with gr.Blocks(title="Գրաբար-Անգլերէն Մեքենական Թարգմանիչ | Classical Armenian-English Machine Translation",
+               theme=theme,
+               ) as demo:
+
+    gr.HTML("""
+    <h2 style='margin-bottom: 5px'>Գրաբար-Անգլերէն Մեքենական Թարգմանիչ | Classical Armenian-English Machine Translation</h2>
+    <h3>Տարբերակ | Version: 1.0</h3>
+    <h3 style='margin-bottom: 5px'>Ստեղծող՝ | Created By: <a href='https://www.arinubar.com' target='_blank'>Ari Nubar Boyacıoğlu</a></h3>
+    <p style="font-size: 0.7rem">Եթէ այս գործիքը կարողացաւ ձեզ օգտակար հանդիսանալ, բարելաւելու համար հաճեցէք սուրճի մը փոխարժէքը նուիրել․ | If this tool has proven useful to you, please consider making a donation. <a href='https://www.paypal.com/donate/?hosted_button_id=RRBCV3GQJ7D8N' target='_blank'>PayPal</a> | <a href='https://buymeacoffee.com/arinubar' target='_blank'>Buy Me a Coffee</a></p>
+    """)
+
+    with gr.Accordion("Թարգմանիչի Մասին | Information about the Translator", open=False):
+        gr.HTML("""
+        <p>Հոս կը ցուցադրուի առաջին գրաբար-անգլերէն մեքենական թարգմանիչը, որ կարուցուած է Մեթայի (Ֆեյսպուքի) 'No Language Left Behind' տիպարի հիման վրայ։ Թարգմանութեան տիպարը կ'աշխատի CPU-ի մը մէջ, ուրեմն նախադասութեան մը թարգմանութիւնը կրնայ տեւել մօտաւորապէս <strong>40-60 երկվայրկեան</strong>։ Ձեր գնահատութիւնները եւ քննադատութիւնները շատ կարեւոր են տիպարի թարգմանութեան որակը բարելաւելու համար։</p>
+        <p>Դուք թարգմանութեան որակին մասին ձեր գնահատութիւնը կրնաք տալ երեք գնահատութեան կոճակներէ մէկուն սեղմելով։ Լեզուի, մուտքի եւ ելքի գրութիւններու, յարաչափերու եւ ձեր գնահատութեան մասին տուեալները պիտի պահուին։ Գնահատութիւնը պարտաւոր չէ։</p>
+        <hr style='margin-top: 5px; margin-bottom: 5px'>
+        <p>This is the demo of the first Classical Armenian-English neural machine translation system which is based on Meta's 'No Language Left Behind' model. The model runs on a CPU, so it might take approximately <strong>40-60 seconds</strong> to translate a single sentence. Your feedback and comments are very important for us to improve the quality of the translation.</p>
+        <p>You can give your feedback about the quality of the translation by clicking one of the three feedback buttons. Information about source, target languages, input and output texts, parameters and your feedback about quality will be saved. It is not mandatory to give feedback.</p>
+        """)
+
+    with gr.Row():
+        with gr.Column():
+            text = gr.Textbox(
+                lines=5,
+                label="Մուտքագրում | Input Text",
+                every=1.5  # Trigger event 1.5 seconds after last keystroke
+            )
+
+            with gr.Row():
+                src_lang = gr.Dropdown(LANGUAGES_LIST, type="value", label="Թարգմանէ Այս Լեզուէ | Source Language")
+                tgt_lang = gr.Dropdown(LANGUAGES_LIST, type="value", label="Թարգմանէ Այս Լեզուի | Target Language")
+
+            with gr.Row():
+                switch_btn = gr.Button("🔄 Լեզուները Փոխէ | Switch Languages")
+
+
+            def switch_languages(src, tgt, input_text, output_text):
+                # Swap languages
+                new_src = tgt
+                new_tgt = src
+                # Move output to input if exists and clear output
+                new_input = output_text if output_text else input_text
+                return [new_src, new_tgt, new_input, None]
+
+            text.change(fn=update_languages,
+                        inputs=[text],
+                        outputs=[src_lang, tgt_lang])
+
+        with gr.Column():
+            translated = gr.Textbox(lines=5, label="Ելքագրում | Output Text", interactive=False)
+            translate_btn = gr.Button(value="Թարգմանէ | Translate", variant="primary")
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""
+                    ### Թարգմանութեան Որակ | Translation Quality
+                    """)
+                    flag_good_btn = gr.Button(value="😊 Լաւ | Good", size="sm")
+                    flag_average_btn = gr.Button(value="😐 Միջակ | Average", size="sm")
+                    flag_bad_btn = gr.Button(value="☹️ Վատ | Bad", size="sm")
+
+    with gr.Row():
+        gr.Markdown("""
+        ## Յարաչափեր | Parameters
+        """
+        )
+        by_sentence = gr.Checkbox(label="Նախադասութիւններու Բաժնէ | Split into Sentences", value=True, info="Տուփը նշանագրեցէք եթէ կ'ուզէք ձեր մուտքագրումը թարգմանուի նախադասութիւն առ նախադասութիւն։ Այս կերպով թարգմանուած նախադասութիւններուն որակը ընդհանրապէս աւելի լաւ կ'ըլլան։ | Check this box if you want to split your input text into sentences. This way the quality of the translation will be better.")
+        clean = gr.Checkbox(label="Մշակէ | Preprocess", value=True, info="Տուփը նշանագրեցէք եթէ կ'ուզէք ձեր մուտքագրումը կանոնաւորուի ծրագրի կողմէ թարգմանութենէ առաջ։ Կանոնաւորումը թարգմանութեան որակի բարելաւման համար օգտակար է։ | Check this box if you want to preprocess your input text before translation. This way the quality of the translation will be better.")
+        num_beams = gr.Dropdown([1, 2, 3, 4, 5], type="value", label="Որոնման Շողեր | Number of Beams", value=4, info="Աւելի բարձր թիւը ընդհանրապէս կը պատճառէ աւելի բարձր որակի, բայց նոյնիսկ երկարատեւ թարգմանութեան։ | Higher beam size will result in better quality translation, but also longer translation time.")
+
+    switch_btn.click(switch_languages, inputs=[src_lang, tgt_lang, text, translated], outputs=[src_lang, tgt_lang, text, translated])
+    translate_btn.click(translate_wrapper, inputs=[text, src_lang, tgt_lang, by_sentence, clean, num_beams], outputs=translated)
+
+    # hf_writer.setup([src_lang, tgt_lang, text, translated, by_sentence, clean, num_beams, flag_bad_btn], "flagged_data_points")
+
+    flag_good_btn.click(save_json, inputs=[src_lang, tgt_lang, text, translated, by_sentence, clean, num_beams, flag_good_btn], outputs=None)
+    flag_average_btn.click(save_json, inputs=[src_lang, tgt_lang, text, translated, by_sentence, clean, num_beams, flag_average_btn], outputs=None)
+    flag_bad_btn.click(save_json, inputs=[src_lang, tgt_lang, text, translated, by_sentence, clean, num_beams, flag_bad_btn], outputs=None)
+
+    visitor_badge_html = """
+    <a href="https://visitorbadge.io/status?path=https%3A%2F%2Farinubar-hyw-en-demo.hf.space%2F">
+        <img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Farinubar-hyw-en-demo.hf.space%2F&label=%D4%B1%D5%B5%D6%81%D5%A5%D5%AC%D5%B8%D6%82%D5%B6%D5%A5%D6%80%20%7C%20Visitors&countColor=%23f97316&style=flat" />
+    </a>
+    """
+
+    gr.HTML(visitor_badge_html)
+
+
+
+    sponsors_html = """
+    <div style="display: flex; justify-content: center; align-items: center; margin-bottom: 5px;">
+        <h3>Աջակցութեամբ՝ | Supported By: </h3>
+    </div>
+    <div style="display: flex; justify-content: center; align-items: center; background-color: #0000007a; border-radius: 20px;">
+        <a href="#">
+            <img src="/file=./img/mkhitaryan-varjaran.png" alt="Pangalti Mkhitaryan School" style="padding: 10px; margin: 20px; width: 150px;" />
+        </a>
+
+        <a href="https://gulbenkian.pt/armenian-communities/">
+            <img src="/file=./img/gulbenkian.png" alt="Calouste Gulbenkian Foundation - Armenian Communities" style="padding: 10px; margin: 20px;" />
+        </a>
+
+        <a href="http://www.teaov.org/">
+            <img src="/file=./img/teaov.png" alt="Turkish-Armenian Minority Schools Teachers Foundation" style="padding: 10px; margin: 20px; width: 200px; padding-right:35px;" />
+        </a>
+    </div>
+
+    """
+    gr.HTML(sponsors_html)
+
+
+if __name__ == "__main__":
+    translator = Translator()
+    demo.launch(favicon_path="img/translate.png", share=True, allowed_paths=["./img"])
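Note that `Translator` and `LANGUAGES` are imported from a `translation` module that is not part of this commit, so their exact API is only known from the calls above. As a minimal sketch (not part of the commit), the JSON-lines feedback records that `save_json()` appends under `flagged_data/` can be read back like this; the field names mirror the `data` dict in app.py:

# Minimal sketch (not part of the commit): read back the feedback records
# written by save_json() as one JSON object per line.
import json
from pathlib import Path

def load_feedback(folder="flagged_data"):
    records = []
    for path in Path(folder).glob("dataset-session-*.json"):
        with open(path, encoding="utf8") as f:
            for line in f:
                if line.strip():
                    records.append(json.loads(line))
    return records

if __name__ == "__main__":
    for rec in load_feedback():
        print(rec["src_lang"], "->", rec["tgt_lang"], rec["translation_quality"])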
img/gulbenkian.png
ADDED
img/gulbenkian.svg
ADDED
img/mkhitaryan-varjaran.png
ADDED
img/teaov.png
ADDED
img/translate.png
ADDED
pysbd/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .segmenter import Segmenter
+from .about import __version__
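The commit vendors a copy of pysbd, presumably for the sentence-by-sentence translation mode. A minimal usage sketch, assuming the standard pysbd API (the `segmenter.py` added in this commit is outside the 50-file view); `language="hy"` selects the Armenian rules added below:

# Minimal sketch (not part of the commit) of the vendored pysbd package.
import pysbd

seg = pysbd.Segmenter(language="hy", clean=True)
sentences = seg.segment("Առաջին նախադասութիւնն է։ Երկրորդ նախադասութիւնն է։")
print(sentences)  # expected: two sentences, split at the Armenian full stop ։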
pysbd/abbreviation_replacer.py
ADDED
@@ -0,0 +1,112 @@
+# -*- coding: utf-8 -*-
+import re
+from pysbd.utils import Text
+
+
+def replace_pre_number_abbr(txt, abbr):
+    # prepend a space to avoid needing another regex for start of string
+    txt = " " + txt
+    txt = re.sub(r"(?<=\s{abbr})\.(?=(\s\d|\s+\())".format(abbr=abbr.strip()), "∯", txt)
+    # remove the prepended space
+    txt = txt[1:]
+    return txt
+
+
+def replace_prepositive_abbr(txt, abbr):
+    # prepend a space to avoid needing another regex for start of string
+    txt = " " + txt
+    txt = re.sub(r"(?<=\s{abbr})\.(?=(\s|:\d+))".format(abbr=abbr.strip()), "∯", txt)
+    # remove the prepended space
+    txt = txt[1:]
+    return txt
+
+
+class AbbreviationReplacer(object):
+    def __init__(self, text, lang):
+        self.text = text
+        self.lang = lang
+
+    def replace(self):
+        self.text = Text(self.text).apply(
+            self.lang.PossessiveAbbreviationRule,
+            self.lang.KommanditgesellschaftRule,
+            *self.lang.SingleLetterAbbreviationRules.All
+        )
+        abbr_handled_text = ""
+        for line in self.text.splitlines(True):
+            abbr_handled_text += self.search_for_abbreviations_in_string(line)
+        self.text = abbr_handled_text
+        self.replace_multi_period_abbreviations()
+        self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
+        self.text = self.replace_abbreviation_as_sentence_boundary()
+        return self.text
+
+    def replace_abbreviation_as_sentence_boundary(self):
+        sent_starters = "|".join((r"(?=\s{}\s)".format(word) for word in self.SENTENCE_STARTERS))
+        regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯({})".format(sent_starters)
+        self.text = re.sub(regex, '\\1.', self.text)
+        return self.text
+
+    def replace_multi_period_abbreviations(self):
+        def mpa_replace(match):
+            match = match.group()
+            match = re.sub(re.escape(r"."), "∯", match)
+            return match
+
+        self.text = re.sub(
+            self.lang.MULTI_PERIOD_ABBREVIATION_REGEX,
+            mpa_replace,
+            self.text,
+            flags=re.IGNORECASE
+        )
+
+    def replace_period_of_abbr(self, txt, abbr):
+        # prepend a space to avoid needing another regex for start of string
+        txt = " " + txt
+        txt = re.sub(
+            r"(?<=\s{abbr})\.(?=((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\())))".format(
+                abbr=re.escape(abbr.strip())
+            ),
+            "∯",
+            txt,
+        )
+        # remove the prepended space
+        txt = txt[1:]
+        return txt
+
+
+    def search_for_abbreviations_in_string(self, text):
+        lowered = text.lower()
+        for abbr in self.lang.Abbreviation.ABBREVIATIONS:
+            stripped = abbr.strip()
+            if stripped not in lowered:
+                continue
+            abbrev_match = re.findall(
+                r"(?:^|\s|\r|\n){}".format(stripped), text, flags=re.IGNORECASE
+            )
+            if not abbrev_match:
+                continue
+            next_word_start = r"(?<={" + str(re.escape(stripped)) + "} ).{1}"
+            char_array = re.findall(next_word_start, text)
+            for ind, match in enumerate(abbrev_match):
+                text = self.scan_for_replacements(
+                    text, match, ind, char_array
+                )
+        return text
+
+    def scan_for_replacements(self, txt, am, ind, char_array):
+        try:
+            char = char_array[ind]
+        except IndexError:
+            char = ""
+        prepositive = self.lang.Abbreviation.PREPOSITIVE_ABBREVIATIONS
+        number_abbr = self.lang.Abbreviation.NUMBER_ABBREVIATIONS
+        upper = str(char).isupper()
+        if not upper or am.strip().lower() in prepositive:
+            if am.strip().lower() in prepositive:
+                txt = replace_prepositive_abbr(txt, am)
+            elif am.strip().lower() in number_abbr:
+                txt = replace_pre_number_abbr(txt, am)
+            else:
+                txt = self.replace_period_of_abbr(txt, am)
+        return txt
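A short illustration (not part of the commit) of the module-level helper above: the period after a number abbreviation is masked with the placeholder "∯" so it is not treated as a sentence boundary, and it is restored later by Standard.SubSymbolsRules.Period.

# Illustration (not part of the commit) of replace_pre_number_abbr().
from pysbd.abbreviation_replacer import replace_pre_number_abbr

print(replace_pre_number_abbr("See p. 12 and No. 5 for details.", "p"))
# -> "See p∯ 12 and No. 5 for details."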
pysbd/about.py
ADDED
@@ -0,0 +1,10 @@
+# inspired from:
+# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
+
+__title__ = "pysbd"
+__version__ = "0.3.4"
+__summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
+__uri__ = "http://nipunsadvilkar.github.io/"
+__author__ = "Nipun Sadvilkar"
+__email__ = "[email protected]"
+__license__ = "MIT"
pysbd/between_punctuation.py
ADDED
@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+import re
+from functools import partial
+from pysbd.punctuation_replacer import replace_punctuation
+
+
+class BetweenPunctuation(object):
+    # Rubular: http://rubular.com/r/2YFrKWQUYi
+    BETWEEN_SINGLE_QUOTES_REGEX = r"(?<=\s)'(?:[^']|'[a-zA-Z])*'"
+
+    BETWEEN_SINGLE_QUOTE_SLANTED_REGEX = r"(?<=\s)‘(?:[^’]|’[a-zA-Z])*’"
+
+    # Rubular: http://rubular.com/r/3Pw1QlXOjd
+    BETWEEN_DOUBLE_QUOTES_REGEX = r'"(?>[^"\\]+|\\{2}|\\.)*"'
+
+    # https://regex101.com/r/r6I1bW/1
+    # https://stackoverflow.com/questions/13577372/do-python-regular-expressions-have-an-equivalent-to-rubys-atomic-grouping?noredirect=1&lq=1
+    BETWEEN_DOUBLE_QUOTES_REGEX_2 = r'"(?=(?P<tmp>[^\"\\]+|\\{2}|\\.)*)(?P=tmp)"'
+
+    # Rubular: http://rubular.com/r/x6s4PZK8jc
+    BETWEEN_QUOTE_ARROW_REGEX = r'«(?>[^»\\]+|\\{2}|\\.)*»'
+
+    BETWEEN_QUOTE_ARROW_REGEX_2 = r"\«(?=(?P<tmp>[^»\\]+|\\{2}|\\.)*)(?P=tmp)\»"
+
+    # Rubular: http://rubular.com/r/JbAIpKdlSq
+    BETWEEN_QUOTE_SLANTED_REGEX = r"“(?>[^”\\]+|\\{2}|\\.)*”"
+    BETWEEN_QUOTE_SLANTED_REGEX_2 = r"\“(?=(?P<tmp>[^”\\]+|\\{2}|\\.)*)(?P=tmp)\”"
+
+    # Rubular: http://rubular.com/r/WX4AvnZvlX
+    BETWEEN_SQUARE_BRACKETS_REGEX = r"\[(?>[^\]\\]+|\\{2}|\\.)*\]"
+
+    BETWEEN_SQUARE_BRACKETS_REGEX_2 = r'\[(?=(?P<tmp>[^\]\\]+|\\{2}|\\.)*)(?P=tmp)\]'
+
+    # Rubular: http://rubular.com/r/6tTityPflI
+    BETWEEN_PARENS_REGEX = r"\((?>[^\(\)\\]+|\\{2}|\\.)*\)"
+
+    BETWEEN_PARENS_REGEX_2 = r"\((?=(?P<tmp>[^\(\)\\]+|\\{2}|\\.)*)(?P=tmp)\)"
+
+    # Rubular: http://rubular.com/r/mXf8cW025o
+    WORD_WITH_LEADING_APOSTROPHE = r"(?<=\s)'(?:[^']|'[a-zA-Z])*'\S"
+
+    # Rubular: http://rubular.com/r/jTtDKfjxzr
+    BETWEEN_EM_DASHES_REGEX = r"\-\-(?>[^\-\-])*\-\-"
+
+    BETWEEN_EM_DASHES_REGEX_2 = r"--(?=(?P<tmp>[^--]*))(?P=tmp)--"
+
+    def __init__(self, text):
+        self.text = text
+
+    def replace(self):
+        return self.sub_punctuation_between_quotes_and_parens(self.text)
+
+    def sub_punctuation_between_quotes_and_parens(self, txt):
+        txt = self.sub_punctuation_between_single_quotes(txt)
+        txt = self.sub_punctuation_between_single_quote_slanted(txt)
+        txt = self.sub_punctuation_between_double_quotes(txt)
+        txt = self.sub_punctuation_between_square_brackets(txt)
+        txt = self.sub_punctuation_between_parens(txt)
+        txt = self.sub_punctuation_between_quotes_arrow(txt)
+        txt = self.sub_punctuation_between_em_dashes(txt)
+        txt = self.sub_punctuation_between_quotes_slanted(txt)
+        return txt
+
+    def sub_punctuation_between_parens(self, txt):
+        return re.sub(self.BETWEEN_PARENS_REGEX_2, replace_punctuation, txt)
+
+    def sub_punctuation_between_square_brackets(self, txt):
+        return re.sub(self.BETWEEN_SQUARE_BRACKETS_REGEX_2, replace_punctuation,
+                      txt)
+
+    def sub_punctuation_between_single_quotes(self, txt):
+        if re.search(self.WORD_WITH_LEADING_APOSTROPHE, txt) and \
+                (not re.search(r"'\s", txt)):
+            return txt
+        return re.sub(self.BETWEEN_SINGLE_QUOTES_REGEX,
+                      partial(replace_punctuation, match_type='single'), txt)
+
+    def sub_punctuation_between_single_quote_slanted(self, txt):
+        return re.sub(self.BETWEEN_SINGLE_QUOTE_SLANTED_REGEX,
+                      replace_punctuation, txt)
+
+    def sub_punctuation_between_double_quotes(self, txt):
+        return re.sub(self.BETWEEN_DOUBLE_QUOTES_REGEX_2, replace_punctuation,
+                      txt)
+
+    def sub_punctuation_between_quotes_arrow(self, txt):
+        return re.sub(self.BETWEEN_QUOTE_ARROW_REGEX_2, replace_punctuation, txt)
+
+    def sub_punctuation_between_em_dashes(self, txt):
+        return re.sub(self.BETWEEN_EM_DASHES_REGEX_2, replace_punctuation, txt)
+
+    def sub_punctuation_between_quotes_slanted(self, txt):
+        return re.sub(self.BETWEEN_QUOTE_SLANTED_REGEX_2, replace_punctuation,
+                      txt)
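A brief illustration (not part of the commit): punctuation inside a quoted span is masked by replace_punctuation (defined in pysbd/punctuation_replacer.py, which is added in this commit but falls outside the 50-file view), so the segmenter will not split inside the quotation.

# Illustration (not part of the commit) of BetweenPunctuation.replace().
from pysbd.between_punctuation import BetweenPunctuation

masked = BetweenPunctuation('He asked "Is it ready? Not yet." and left.').replace()
print(masked)  # the "?" and "." inside the double quotes appear as placeholder symbols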
pysbd/clean/__init__.py
ADDED
File without changes
pysbd/clean/rules.py
ADDED
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+from pysbd.utils import Rule
+
+
+class CleanRules(object):
+
+    # NOTE: Caution: Might require \\ for special characters
+    # if regex is defined with r'' then dont
+    # add extra \\ for special characters
+    # Rubular: http://rubular.com/r/V57WnM9Zut
+    NewLineInMiddleOfWordRule = Rule(r'\n(?=[a-zA-Z]{1,2}\n)', '')
+
+    # Rubular: http://rubular.com/r/dMxp5MixFS
+    DoubleNewLineWithSpaceRule = Rule(r'\n \n', "\r")
+
+    # Rubular: http://rubular.com/r/H6HOJeA8bq
+    DoubleNewLineRule = Rule(r'\n\n', "\r")
+
+    # Rubular: http://rubular.com/r/FseyMiiYFT
+    NewLineFollowedByPeriodRule = Rule(r'\n(?=\.(\s|\n))', '')
+
+    ReplaceNewlineWithCarriageReturnRule = Rule(r'\n', "\r")
+
+    EscapedNewLineRule = Rule(r'\\n', "\n")
+
+    EscapedCarriageReturnRule = Rule(r'\\r', "\r")
+
+    TypoEscapedNewLineRule = Rule(r'\\\ n', "\n")
+
+    TypoEscapedCarriageReturnRule = Rule(r'\\\ r', "\r")
+
+    # Rubular: http://rubular.com/r/bAJrhyLNeZ
+    InlineFormattingRule = Rule(r'{b\^>\d*<b\^}|{b\^>\d*<b\^}', '')
+
+    # Rubular: http://rubular.com/r/8mc1ArOIGy
+    TableOfContentsRule = Rule(r'\.{4,}\s*\d+-*\d*', "\r")
+
+    # Rubular: http://rubular.com/r/DwNSuZrNtk
+    ConsecutivePeriodsRule = Rule(r'\.{5,}', ' ')
+
+    # Rubular: http://rubular.com/r/IQ4TPfsbd8
+    ConsecutiveForwardSlashRule = Rule(r'\/{3}', '')
+
+    # Rubular: http://rubular.com/r/6dt98uI76u
+    NO_SPACE_BETWEEN_SENTENCES_REGEX = r'(?<=[a-z])\.(?=[A-Z])'
+    # NO_SPACE_BETWEEN_SENTENCES_REGEX = r'[a-z]\.[A-Z]'
+    NoSpaceBetweenSentencesRule = Rule(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
+
+    # Rubular: http://rubular.com/r/l6KN6rH5XE
+    NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = r'(?<=\d)\.(?=[A-Z])'
+    NoSpaceBetweenSentencesDigitRule = Rule(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')
+
+    URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']
+
+    # Rubular: http://rubular.com/r/3GiRiP2IbD
+    NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = r'(?<=\s)\n(?=([a-z]|\())'
+
+    # Rubular: http://rubular.com/r/Gn18aAnLdZ
+    NewLineFollowedByBulletRule = Rule(r"\n(?=•')", "\r")
+
+    QuotationsFirstRule = Rule(r"''", '"')
+    QuotationsSecondRule = Rule(r'``', '"')
+
+
+class HTML(object):
+    # Rubular: http://rubular.com/r/9d0OVOEJWj
+    HTMLTagRule = Rule(r"<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[\^'\">\s]+))?)+\s*|\s*)\/?>", '')
+
+    # Rubular: http://rubular.com/r/XZVqMPJhea
+    EscapedHTMLTagRule = Rule(r'<\/?[^gt;]*gt;', '')
+
+    All = [HTMLTagRule, EscapedHTMLTagRule]
+
+
+class PDF(object):
+    # Rubular: http://rubular.com/r/UZAVcwqck8
+    NewLineInMiddleOfSentenceRule = Rule(r'(?<=[^\n]\s)\n(?=\S)', '')
+
+    # Rubular: http://rubular.com/r/eaNwGavmdo
+    NewLineInMiddleOfSentenceNoSpacesRule = Rule(r"\n(?=[a-z])", ' ')
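Rule and Text come from pysbd/utils.py, which is added in this commit but outside the 50-file view. Assuming the usual pysbd.utils API (Rule(pattern, replacement) and Text(...).apply(*rules), where apply runs re.sub for each rule in turn), a small sketch of how these clean rules are applied:

# Sketch (not part of the commit), assuming the standard pysbd.utils API.
from pysbd.utils import Text
from pysbd.clean.rules import CleanRules as cr

raw = "Heading....... 12\nNext line"
cleaned = Text(raw).apply(cr.TableOfContentsRule, cr.ConsecutivePeriodsRule)
print(cleaned)  # the dotted table-of-contents leader and page number are replaced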
pysbd/cleaner.py
ADDED
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+import re
+from pysbd.utils import Text
+from pysbd.clean.rules import PDF, HTML, CleanRules as cr
+
+
+class Cleaner(object):
+
+    def __init__(self, text, lang, doc_type=None):
+        self.text = text
+        self.lang = lang
+        self.doc_type = doc_type
+
+    def clean(self):
+        if not self.text:
+            return self.text
+        self.remove_all_newlines()
+        self.replace_double_newlines()
+        self.replace_newlines()
+        self.replace_escaped_newlines()
+        self.text = Text(self.text).apply(*HTML.All)
+        self.replace_punctuation_in_brackets()
+        self.text = Text(self.text).apply(cr.InlineFormattingRule)
+        self.clean_quotations()
+        self.clean_table_of_contents()
+        self.check_for_no_space_in_between_sentences()
+        self.clean_consecutive_characters()
+        return self.text
+
+    def remove_all_newlines(self):
+        self.remove_newline_in_middle_of_sentence()
+        self.remove_newline_in_middle_of_word()
+
+    def remove_newline_in_middle_of_sentence(self):
+        def replace_w_blank(match):
+            match = match.group()
+            sub = re.sub(cr.NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '', match)
+            return sub
+        self.text = re.sub(r'(?:[^\.])*', replace_w_blank, self.text)
+
+    def remove_newline_in_middle_of_word(self):
+        self.text = Text(self.text).apply(cr.NewLineInMiddleOfWordRule)
+
+    def replace_double_newlines(self):
+        self.text = Text(self.text).apply(cr.DoubleNewLineWithSpaceRule,
+                                          cr.DoubleNewLineRule)
+
+    def remove_pdf_line_breaks(self):
+        self.text = Text(
+            self.text).apply(cr.NewLineFollowedByBulletRule,
+                             PDF.NewLineInMiddleOfSentenceRule,
+                             PDF.NewLineInMiddleOfSentenceNoSpacesRule)
+
+    def replace_newlines(self):
+        if self.doc_type == 'pdf':
+            self.remove_pdf_line_breaks()
+        else:
+            self.text = Text(
+                self.text).apply(cr.NewLineFollowedByPeriodRule,
+                                 cr.ReplaceNewlineWithCarriageReturnRule)
+
+    def replace_escaped_newlines(self):
+        self.text = Text(
+            self.text).apply(cr.EscapedNewLineRule,
+                             cr.EscapedCarriageReturnRule,
+                             cr.TypoEscapedNewLineRule,
+                             cr.TypoEscapedCarriageReturnRule)
+
+    def replace_punctuation_in_brackets(self):
+        def replace_punct(match):
+            match = match.group()
+            if '?' in match:
+                sub = re.sub(re.escape('?'), '&ᓷ&', match)
+                return sub
+            return match
+        self.text = re.sub(r'\[(?:[^\]])*\]', replace_punct, self.text)
+
+    def clean_quotations(self):
+        # method added explicitly
+        # pragmatic-segmenter applies this method
+        # at different location
+        self.text = re.sub('`', "'", self.text)
+        self.text = Text(self.text).apply(
+            cr.QuotationsFirstRule,
+            cr.QuotationsSecondRule)
+
+    def clean_table_of_contents(self):
+        self.text = Text(self.text).apply(
+            cr.TableOfContentsRule,
+            cr.ConsecutivePeriodsRule,
+            cr.ConsecutiveForwardSlashRule)
+
+    def search_for_connected_sentences(self, word, txt, regex, rule):
+        if not re.search(regex, word):
+            return txt
+        if any(k in word for k in cr.URL_EMAIL_KEYWORDS):
+            return txt
+        new_word = Text(word).apply(rule)
+        txt = re.sub(re.escape(word), new_word, txt)
+        return txt
+
+    def check_for_no_space_in_between_sentences(self):
+        words = self.text.split(' ')
+        for word in words:
+            self.text = self.search_for_connected_sentences(word, self.text, cr.NO_SPACE_BETWEEN_SENTENCES_REGEX, cr.NoSpaceBetweenSentencesRule)
+            self.text = self.search_for_connected_sentences(word, self.text, cr.NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, cr.NoSpaceBetweenSentencesDigitRule)
+
+    def clean_consecutive_characters(self):
+        self.text = Text(self.text).apply(
+            cr.ConsecutivePeriodsRule,
+            cr.ConsecutiveForwardSlashRule)
pysbd/exclamation_words.py
ADDED
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+import re
+from pysbd.punctuation_replacer import replace_punctuation
+
+
+class ExclamationWords(object):
+    """
+    Searches for exclamation points that are part of words
+    and not ending punctuation and replaces them.
+    """
+    EXCLAMATION_WORDS = "!Xũ !Kung ǃʼOǃKung !Xuun !Kung-Ekoka ǃHu ǃKhung ǃKu ǃung ǃXo ǃXû ǃXung ǃXũ !Xun Yahoo! Y!J Yum!".split()
+    EXCLAMATION_REGEX = r"|".join(re.escape(w) for w in EXCLAMATION_WORDS)
+
+    @classmethod
+    def apply_rules(cls, text):
+        return re.sub(ExclamationWords.EXCLAMATION_REGEX, replace_punctuation,
+                      text)
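A short illustration (not part of the commit): an exclamation point that belongs to a word such as "Yahoo!" is masked so it is not read as end-of-sentence punctuation.

# Illustration (not part of the commit) of ExclamationWords.apply_rules().
from pysbd.exclamation_words import ExclamationWords

print(ExclamationWords.apply_rules("Yahoo! was founded in 1994. It grew fast."))
# the exclamation point in "Yahoo!" is replaced with a placeholder symbol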
pysbd/lang/__init__.py
ADDED
File without changes
pysbd/lang/amharic.py
ADDED
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+from pysbd.abbreviation_replacer import AbbreviationReplacer
+from pysbd.lang.common import Common, Standard
+
+class Amharic(Common, Standard):
+
+    iso_code = 'am'
+
+    SENTENCE_BOUNDARY_REGEX = r'.*?[፧።!\?]|.*?$'
+    Punctuations = ['።', '፧', '?', '!']
+
+    class AbbreviationReplacer(AbbreviationReplacer):
+        SENTENCE_STARTERS = []
pysbd/lang/arabic.py
ADDED
@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+import re
+
+from pysbd.abbreviation_replacer import AbbreviationReplacer
+from pysbd.lang.common import Common, Standard
+from pysbd.utils import Rule
+
+class Arabic(Common, Standard):
+
+    iso_code = 'ar'
+
+    Punctuations = ['?', '!', ':', '.', '؟', '،']
+    SENTENCE_BOUNDARY_REGEX = r'.*?[:\.!\?؟،]|.*?\Z|.*?$'
+
+    # Rubular: http://rubular.com/r/RX5HpdDIyv
+    ReplaceColonBetweenNumbersRule = Rule(r'(?<=\d):(?=\d)', '♭')
+
+    # Rubular: http://rubular.com/r/kPRgApNHUg
+    ReplaceNonSentenceBoundaryCommaRule = Rule(r'،(?=\s\S+،)', '♬')
+
+    class AbbreviationReplacer(AbbreviationReplacer):
+
+        SENTENCE_STARTERS = []
+
+        def __init__(self, text, lang):
+            super().__init__(text, lang)
+
+        def scan_for_replacements(self, txt, am, index, character_array):
+            txt = re.sub('(?<={0})\.'.format(am), '∯', txt)
+            return txt
+
+    class Abbreviation(Standard.Abbreviation):
+        ABBREVIATIONS = ['ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه',]
+        PREPOSITIVE_ABBREVIATIONS = []
+        NUMBER_ABBREVIATIONS = []
pysbd/lang/armenian.py
ADDED
@@ -0,0 +1,112 @@
+# -*- coding: utf-8 -*-
+from pysbd.abbreviation_replacer import AbbreviationReplacer
+from pysbd.lang.common import Common, Standard
+from pysbd.between_punctuation import BetweenPunctuation
+import re
+from functools import partial
+from pysbd.punctuation_replacer import replace_punctuation
+
+class Armenian(Common, Standard):
+
+    iso_code = 'hy'
+
+    SENTENCE_BOUNDARY_REGEX = r"((?:[^)])*)(?=\s?[Ա-ՖA-Z])|「(?:[^」])*」(?=\s[Ա-ՖA-Z])|\((?:[^\)]){2,}\)(?=\s[Ա-ՖA-Z])|\'(?:[^\'])*[^,]\'(?=\s[Ա-ՖA-Z])|\"(?:[^\"])*[^,]\"(?=\s[Ա-ՖA-Z])|\“(?:[^\”])*[^,]\”(?=\s[Ա-ՖA-Z])|[。..!!?? ]{2,}|\S.*?[。..!!??ȸȹ☉☈☇☄]|[。..!!??]|.*?(?<!\d)[։]"
+
+
+    # SENTENCE_BOUNDARY_REGEX = r'((?:[^)])*)(?=\s?[Ա-ՖA-Z0-9])|.*?(?<!\d)[։]|.*?$'
+    Punctuations = ['։']
+
+
+    QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[.․։][\"\'“”»«]\s{1}[A-ZԱ-Ֆ]'
+
+    SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = r'(?<=[.․։][\"\'“”»«])\s{1}(?=[A-ZԱ-Ֆ])'
+
+    class AbbreviationReplacer(AbbreviationReplacer):
+        SENTENCE_STARTERS = []
+
+    class BetweenPunctuation(BetweenPunctuation):
+        BETWEEN_SINGLE_QUOTES_ARMENIAN_REGEX = r"(?<=\s)'(?:[^']|'[ա-ֆԱ-Ֆ])*'"
+
+        BETWEEN_SINGLE_QUOTE_SLANTED_ARMENIAN_REGEX = r"(?<=\s)‘(?:[^’]|’[ա-ֆԱ-Ֆ])*’"
+
+        BETWEEN_DOUBLE_QUOTES_REGEX = r'"(?>[^"\\]+|\\{2}|\\.)*"'
+
+        BETWEEN_DOUBLE_QUOTES_ARMENIAN_REGEX_2 = r'"(?=(?P<tmp>[^\"\\]+|\\{2}|\\.)*)(?P=tmp)"'
+
+        # Rubular: http://rubular.com/r/x6s4PZK8jc
+        BETWEEN_QUOTE_ARROW_ARMENIAN_REGEX = r'«(?>[^»\\]+|\\{2}|\\.)*»'
+
+        BETWEEN_QUOTE_ARROW_ARMENIAN_REGEX_2 = r"\«(?=(?P<tmp>[^»\\]+|\\{2}|\\.)*)(?P=tmp)\»"
+
+        # Rubular: http://rubular.com/r/JbAIpKdlSq
+        BETWEEN_QUOTE_SLANTED_ARMENIAN_REGEX = r"“(?>[^”\\]+|\\{2}|\\.)*”"
+        BETWEEN_QUOTE_SLANTED_ARMENIAN_REGEX_2 = r"\“(?=(?P<tmp>[^”\\]+|\\{2}|\\.)*)(?P=tmp)\”"
+
+        # Rubular: http://rubular.com/r/WX4AvnZvlX
+        BETWEEN_SQUARE_BRACKETS_ARMENIAN_REGEX = r"\[(?>[^\]\\]+|\\{2}|\\.)*\]"
+
+        BETWEEN_SQUARE_BRACKETS_ARMENIAN_REGEX_2 = r'\[(?=(?P<tmp>[^\]\\]+|\\{2}|\\.)*)(?P=tmp)\]'
+
+        # Rubular: http://rubular.com/r/6tTityPflI
+        BETWEEN_PARENS_ARMENIAN_REGEX = r"\((?>[^\(\)\\]+|\\{2}|\\.)*\)"
+
+        BETWEEN_PARENS_ARMENIAN_REGEX_2 = r"\((?=(?P<tmp>[^\(\)\\]+|\\{2}|\\.)*)(?P=tmp)\)"
+
+        # Rubular: http://rubular.com/r/mXf8cW025o
+        WORD_WITH_LEADING_APOSTROPHE_ARMENIAN = r"(?<=\s)'(?:[^']|'[ա-ֆԱ-Ֆ])*'\S"
+
+        # Rubular: http://rubular.com/r/jTtDKfjxzr
+        BETWEEN_EM_DASHES_REGEX_ARMENIAN = r"\-\-(?>[^\-\-])*\-\-"
+
+        BETWEEN_EM_DASHES_REGEX_2_ARMENIAN = r"--(?=(?P<tmp>[^--]*))(?P=tmp)--"
+
+        def __init__(self, text):
+            super().__init__(text)
+
+        def replace(self):
+            text = self.sub_punctuation_between_quotes_and_parens(self.text)
+            return self.sub_punctuation_between_quotes_and_parens_armenian(text)
+
+        def sub_punctuation_between_quotes_and_parens_armenian(self, txt):
+            txt = self.sub_punctuation_between_single_quotes_armenian(txt)
+            txt = self.sub_punctuation_between_single_quote_slanted_armenian(txt)
+            txt = self.sub_punctuation_between_double_quotes_armenian(txt)
+            txt = self.sub_punctuation_between_square_brackets_armenian(txt)
+            txt = self.sub_punctuation_between_parens_armenian(txt)
+            txt = self.sub_punctuation_between_quotes_arrow_armenian(txt)
+            txt = self.sub_punctuation_between_em_dashes_armenian(txt)
+            txt = self.sub_punctuation_between_quotes_slanted_armenian(txt)
+            return txt
+
+        def sub_punctuation_between_single_quotes_armenian(self, txt):
+            if re.search(self.WORD_WITH_LEADING_APOSTROPHE_ARMENIAN, txt) and \
+                    (not re.search(r"'\s", txt)):
+                return txt
+            return re.sub(self.BETWEEN_SINGLE_QUOTES_ARMENIAN_REGEX,
+                          partial(replace_punctuation, match_type='single'), txt)
+
+        def sub_punctuation_between_single_quote_slanted_armenian(self, txt):
+            return re.sub(self.BETWEEN_SINGLE_QUOTE_SLANTED_ARMENIAN_REGEX,
+                          replace_punctuation, txt)
+
+
+        def sub_punctuation_between_parens_armenian(self, txt):
+            return re.sub(self.BETWEEN_PARENS_ARMENIAN_REGEX_2, replace_punctuation, txt)
+
+        def sub_punctuation_between_square_brackets_armenian(self, txt):
+            return re.sub(self.BETWEEN_SQUARE_BRACKETS_ARMENIAN_REGEX_2, replace_punctuation,
+                          txt)
+
+        def sub_punctuation_between_double_quotes_armenian(self, txt):
+            return re.sub(self.BETWEEN_DOUBLE_QUOTES_ARMENIAN_REGEX_2, replace_punctuation,
+                          txt)
+
+        def sub_punctuation_between_quotes_arrow_armenian(self, txt):
+            return re.sub(self.BETWEEN_QUOTE_ARROW_ARMENIAN_REGEX_2, replace_punctuation, txt)
+
+        def sub_punctuation_between_em_dashes_armenian(self, txt):
+            return re.sub(self.BETWEEN_EM_DASHES_REGEX_2_ARMENIAN, replace_punctuation, txt)
+
+        def sub_punctuation_between_quotes_slanted_armenian(self, txt):
+            return re.sub(self.BETWEEN_QUOTE_SLANTED_ARMENIAN_REGEX_2, replace_punctuation,
+                          txt)
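A sketch (not part of the commit): the Armenian subclass adds handling for «guillemet» quotations and recognises the Armenian full stop ։ as a boundary, so punctuation inside a quotation is replaced with internal placeholder symbols before segmentation.

# Sketch (not part of the commit) of the Armenian BetweenPunctuation subclass.
from pysbd.lang.armenian import Armenian

bp = Armenian.BetweenPunctuation('Ան ըսաւ «այո. վաղը» եւ գնաց։')
print(bp.replace())  # punctuation inside «...» is replaced with placeholder symbols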
pysbd/lang/bulgarian.py
ADDED
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+import re
+
+from pysbd.abbreviation_replacer import AbbreviationReplacer
+from pysbd.lang.common import Common, Standard
+
+class Bulgarian(Common, Standard):
+
+    iso_code = 'bg'
+
+    class Abbreviation(Standard.Abbreviation):
+        ABBREVIATIONS = ["p.s", "акад", "ал", "б.р", "б.ред", "бел.а", "бел.пр", "бр", "бул", "в", "вж", "вкл", "вм", "вр", "г", "ген", "гр", "дж", "дм", "доц", "др", "ем", "заб", "зам", "инж", "к.с", "кв", "кв.м", "кг", "км", "кор", "куб", "куб.м", "л", "лв", "м", "м.г", "мин", "млн", "млрд", "мм", "н.с", "напр", "пл", "полк", "проф", "р", "рис", "с", "св", "сек", "см", "сп", "срв", "ст", "стр", "т", "т.г", "т.е", "т.н", "т.нар", "табл", "тел", "у", "ул", "фиг", "ха", "хил", "ч", "чл", "щ.д"]
+        NUMBER_ABBREVIATIONS = []
+        PREPOSITIVE_ABBREVIATIONS = []
+
+    class AbbreviationReplacer(AbbreviationReplacer):
+        SENTENCE_STARTERS = []
+
+        def __init__(self, text, lang):
+            super().__init__(text, lang)
+
+        def replace_period_of_abbr(self, txt, abbr):
+            txt = re.sub(r'(?<=\s{abbr})\.|(?<=^{abbr})\.'.format(abbr=abbr.strip()), '∯', txt)
+            return txt
pysbd/lang/burmese.py
ADDED
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+from pysbd.abbreviation_replacer import AbbreviationReplacer
+from pysbd.lang.common import Common, Standard
+
+class Burmese(Common, Standard):
+
+    iso_code = 'my'
+
+    SENTENCE_BOUNDARY_REGEX = r'.*?[။၏!\?]|.*?$'
+    Punctuations = ['။', '၏', '?', '!']
+
+    class AbbreviationReplacer(AbbreviationReplacer):
+        SENTENCE_STARTERS = []
pysbd/lang/chinese.py
ADDED
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+import re
+from pysbd.abbreviation_replacer import AbbreviationReplacer
+from pysbd.between_punctuation import BetweenPunctuation
+from pysbd.lang.common import Common, Standard
+from pysbd.punctuation_replacer import replace_punctuation
+
+class Chinese(Common, Standard):
+
+    iso_code = 'zh'
+
+    class AbbreviationReplacer(AbbreviationReplacer):
+        SENTENCE_STARTERS = []
+
+    class BetweenPunctuation(BetweenPunctuation):
+
+        def __init__(self, text):
+            super().__init__(text)
+
+        def replace(self):
+            self.sub_punctuation_between_quotes_and_parens()
+            return self.text
+
+        def sub_punctuation_between_double_angled_quotation_marks(self):
+            BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX = r"《(?=(?P<tmp>[^》\\]+|\\{2}|\\.)*)(?P=tmp)》"
+            self.text = re.sub(BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX, replace_punctuation,
+                               self.text)
+
+        def sub_punctuation_between_l_bracket(self):
+            BETWEEN_L_BRACKET_REGEX = r"「(?=(?P<tmp>[^」\\]+|\\{2}|\\.)*)(?P=tmp)」"
+            self.text = re.sub(BETWEEN_L_BRACKET_REGEX, replace_punctuation,
+                               self.text)
+
+        def sub_punctuation_between_quotes_and_parens(self):
+            self.sub_punctuation_between_double_angled_quotation_marks()
+            self.sub_punctuation_between_l_bracket()
pysbd/lang/common/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .common import Common  # noqa: F401
+from .standard import Standard  # noqa: F401
pysbd/lang/common/common.py
ADDED
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+import re
+from pysbd.utils import Rule
+
+class Common(object):
+
+    # added special case: r"[。..!!? ]{2,}" to handle intermittent dots, exclamation, etc.
+    # r"[。..!!?] at end to handle single instances of these symbol inputs
+    SENTENCE_BOUNDARY_REGEX = r"((?:[^)])*)(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。..!!?? ]{2,}|\S.*?[。..!!??ȸȹ☉☈☇☄]|[。..!!??]"
+
+    # # Rubular: http://rubular.com/r/NqCqv372Ix
+    QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[!?\.-][\"\'“”]\s{1}[A-Z]'
+
+    # # Rubular: http://rubular.com/r/6flGnUMEVl
+    PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = r'["\”]\s\(.*\)\s["\“]'
+
+    # # Rubular: http://rubular.com/r/TYzr4qOW1Q
+    # BETWEEN_DOUBLE_QUOTES_REGEX = / "(?:[^"])*[^, ]"|“(?: [ ^”])*[^, ]”/
+
+    # # Rubular: http://rubular.com/r/JMjlZHAT4g
+    SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = r'(?<=[!?\.-][\"\'“”])\s{1}(?=[A-Z])'
+
+    # # Rubular: http://rubular.com/r/mQ8Es9bxtk
+    CONTINUOUS_PUNCTUATION_REGEX = r'(?<=\S)(!|\?){3,}(?=(\s|\Z|$))'
+
+    # https://rubular.com/r/UkumQaILKbkeyc
+    # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703
+    NUMBERED_REFERENCE_REGEX = r'(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)?\d{1,3}))(\s)(?=[A-Z])'
+
+    # # Rubular: http://rubular.com/r/yqa4Rit8EY
+    PossessiveAbbreviationRule = Rule(r"\.(?='s\s)|\.(?='s$)|\.(?='s\Z)", '∯')
+
+    # # Rubular: http://rubular.com/r/NEv265G2X2
+    KommanditgesellschaftRule = Rule(r'(?<=Co)\.(?=\sKG)', '∯')
+
+    # # Rubular: http://rubular.com/r/xDkpFZ0EgH
+    MULTI_PERIOD_ABBREVIATION_REGEX = r"\b[a-z](?:\.[a-z])+[.]"
+
+    class SingleLetterAbbreviationRules(object):
+        """Searches for periods within an abbreviation and
+        replaces the periods.
+        """
+        # Rubular: http://rubular.com/r/e3H6kwnr6H
+        SingleUpperCaseLetterAtStartOfLineRule = Rule(r"(?<=^[A-Z])\.(?=\s)", '∯')
+
+        # Rubular: http://rubular.com/r/gitvf0YWH4
+        SingleUpperCaseLetterRule = Rule(r"(?<=\s[A-Z])\.(?=,?\s)", '∯')
+
+        All = [
+            SingleUpperCaseLetterAtStartOfLineRule, SingleUpperCaseLetterRule
+        ]
+
+    class AmPmRules(object):
+
+        # Rubular: http://rubular.com/r/Vnx3m4Spc8
+        UpperCasePmRule = Rule(r'(?<= P∯M)∯(?=\s[A-Z])', '.')
+
+        # Rubular: http://rubular.com/r/AJMCotJVbW
+        UpperCaseAmRule = Rule(r'(?<=A∯M)∯(?=\s[A-Z])', '.')
+
+        # Rubular: http://rubular.com/r/13q7SnOhgA
+        LowerCasePmRule = Rule(r'(?<=p∯m)∯(?=\s[A-Z])', '.')
+
+        # Rubular: http://rubular.com/r/DgUDq4mLz5
+        LowerCaseAmRule = Rule(r'(?<=a∯m)∯(?=\s[A-Z])', '.')
+
+        All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
+
+    class Numbers(object):
+        # Rubular: http://rubular.com/r/oNyxBOqbyy
+        PeriodBeforeNumberRule = Rule(r'\.(?=\d)', '∯')
+
+        # Rubular: http://rubular.com/r/EMk5MpiUzt
+        NumberAfterPeriodBeforeLetterRule = Rule(r'(?<=\d)\.(?=\S)', '∯')
+
+        # Rubular: http://rubular.com/r/rf4l1HjtjG
+        NewLineNumberPeriodSpaceLetterRule = Rule(r'(?<=\r\d)\.(?=(\s\S)|\))', '∯')
+
+        # Rubular: http://rubular.com/r/HPa4sdc6b9
+        StartLineNumberPeriodRule = Rule(r'(?<=^\d)\.(?=(\s\S)|\))', '∯')
+
+        # Rubular: http://rubular.com/r/NuvWnKleFl
+        StartLineTwoDigitNumberPeriodRule = Rule(r'(?<=^\d\d)\.(?=(\s\S)|\))', '∯')
+
+        All = [
+            PeriodBeforeNumberRule,
+            NumberAfterPeriodBeforeLetterRule,
+            NewLineNumberPeriodSpaceLetterRule,
+            StartLineNumberPeriodRule,
+            StartLineTwoDigitNumberPeriodRule
+        ]
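A small illustration (not part of the commit) of the Numbers rules above, assuming the standard pysbd.utils Text.apply: a period that belongs to a number (e.g. a decimal point) is protected with "∯" so it is not read as a sentence boundary.

# Illustration (not part of the commit) of Common.Numbers.PeriodBeforeNumberRule.
from pysbd.utils import Text
from pysbd.lang.common import Common

print(Text("The price rose by 3.5 percent.").apply(Common.Numbers.PeriodBeforeNumberRule))
# -> "The price rose by 3∯5 percent."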
pysbd/lang/common/standard.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
from pysbd.utils import Rule
|
3 |
+
from pysbd.abbreviation_replacer import AbbreviationReplacer
|
4 |
+
|
5 |
+
class Standard:
|
6 |
+
|
7 |
+
# This class holds the punctuation marks.
|
8 |
+
Punctuations = ['。', '.', '.', '!', '!', '?', '?']
|
9 |
+
|
10 |
+
# Rubular: http://rubular.com/r/G2opjedIm9
|
11 |
+
GeoLocationRule = Rule(r'(?<=[a-zA-z]°)\.(?=\s*\d+)', '∯')
|
12 |
+
|
13 |
+
FileFormatRule = Rule(r'(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)', '∯')
|
14 |
+
|
15 |
+
SingleNewLineRule = Rule(r'\n', 'ȹ')
|
16 |
+
|
17 |
+
# Rubular: http://rubular.com/r/aXPUGm6fQh
|
18 |
+
QuestionMarkInQuotationRule = Rule(r'\?(?=(\'|\"))', '&ᓷ&')
|
19 |
+
|
20 |
+
ExtraWhiteSpaceRule = Rule(r'\s{3,}', ' ')
|
21 |
+
|
22 |
+
SubSingleQuoteRule = Rule(r'&⎋&', "'")
|
23 |
+
|
24 |
+
class Abbreviation(object):
|
25 |
+
"""Defines the abbreviations for each language (if available)"""
|
26 |
+
ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk', 'fig']
PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs', 'fig']
NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']

# Rubular: http://rubular.com/r/EUbZCNfgei
# WithMultiplePeriodsAndEmailRule = Rule(r'(\w)(\.)(\w)', '\\1∮\\3')
# \w in python matches unicode abbreviations also so limit to english alphanumerics
WithMultiplePeriodsAndEmailRule = Rule(r'([a-zA-Z0-9_])(\.)([a-zA-Z0-9_])', '\\1∮\\3')

class DoublePunctuationRules(object):
FirstRule = Rule(r'\?!', '☉')
SecondRule = Rule(r'!\?', '☈')
ThirdRule = Rule(r'\?\?', '☇')
ForthRule = Rule(r'!!', '☄')
DoublePunctuation = r'\?!|!\?|\?\?|!!'
All = [FirstRule, SecondRule, ThirdRule, ForthRule]

class ExclamationPointRules(object):
# Rubular: http://rubular.com/r/XS1XXFRfM2
InQuotationRule = Rule(r'\!(?=(\'|\"))', '&ᓴ&')

# Rubular: http://rubular.com/r/sl57YI8LkA
BeforeCommaMidSentenceRule = Rule(r'\!(?=\,\s[a-z])', '&ᓴ&')

# Rubular: http://rubular.com/r/f9zTjmkIPb
MidSentenceRule = Rule(r'\!(?=\s[a-z])', '&ᓴ&')

All = [InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule]

class SubSymbolsRules(object):
Period = Rule(r'∯', '.')
ArmenianFullStop = Rule(r'⍟', '։')
ArabicComma = Rule(r'♬', '،')
SemiColon = Rule(r'♭', ':')
FullWidthPeriod = Rule(r'&ᓰ&', '。')
SpecialPeriod = Rule(r'&ᓱ&', '.')
FullWidthExclamation = Rule(r'&ᓳ&', '!')
ExclamationPoint = Rule(r'&ᓴ&', '!')
QuestionMark = Rule(r'&ᓷ&', '?')
FullWidthQuestionMark = Rule(r'&ᓸ&', '?')
MixedDoubleQE = Rule(r'☉', '?!')
MixedDoubleQQ = Rule(r'☇', '??')
MixedDoubleEQ = Rule(r'☈', '!?')
MixedDoubleEE = Rule(r'☄', '!!')
LeftParens = Rule(r'&✂&', '(')
RightParens = Rule(r'&⌬&', ')')
TemporaryEndingPunctutation = Rule(r'ȸ', '')
Newline = Rule(r'ȹ', "\n")
All = [Period, ArmenianFullStop, ArabicComma, SemiColon, FullWidthPeriod, SpecialPeriod,
FullWidthExclamation, ExclamationPoint, QuestionMark,
FullWidthQuestionMark, MixedDoubleQE, MixedDoubleQQ, MixedDoubleEQ,
MixedDoubleEE, LeftParens, RightParens, TemporaryEndingPunctutation,
Newline]

class EllipsisRules(object):

# below rules aren't similar to original rules of pragmatic segmenter
# modification: spaces replaced with same number of symbols
# Rubular: http://rubular.com/r/i60hCK81fz
ThreeConsecutiveRule = Rule(r'\.\.\.(?=\s+[A-Z])', '☏☏.')

# Rubular: http://rubular.com/r/Hdqpd90owl
FourConsecutiveRule = Rule(r'(?<=\S)\.{3}(?=\.\s[A-Z])', 'ƪƪƪ')

# Rubular: http://rubular.com/r/YBG1dIHTRu
ThreeSpaceRule = Rule(r'(\s\.){3}\s', '♟♟♟♟♟♟♟')

# Rubular: http://rubular.com/r/2VvZ8wRbd8
FourSpaceRule = Rule(r'(?<=[a-z])(\.\s){3}\.($|\\n)', '♝♝♝♝♝♝♝')

OtherThreePeriodRule = Rule(r'\.\.\.', 'ƪƪƪ')

All = [ThreeSpaceRule, FourSpaceRule, FourConsecutiveRule,
ThreeConsecutiveRule, OtherThreePeriodRule]

class ReinsertEllipsisRules(object):
# below rules aren't similar to original rules of pragmatic segmenter
# modification: symbols replaced with same number of ellipses
SubThreeConsecutivePeriod = Rule(r'ƪƪƪ', '...')
SubThreeSpacePeriod = Rule(r'♟♟♟♟♟♟♟', ' . . . ')
SubFourSpacePeriod = Rule(r'♝♝♝♝♝♝♝', '. . . .')
SubTwoConsecutivePeriod = Rule(r'☏☏', '..')
SubOnePeriod = Rule(r'∮', '.')
All = [SubThreeConsecutivePeriod, SubThreeSpacePeriod, SubFourSpacePeriod,
SubTwoConsecutivePeriod, SubOnePeriod]

class AbbreviationReplacer(AbbreviationReplacer):
SENTENCE_STARTERS = "A Being Did For He How However I In It Millions "\
"More She That The There They We What When Where Who Why".split(" ")
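Each Rule above is a pattern/replacement pair, and the language modules in this commit apply them through the Text helper (e.g. Text(self.text).apply(*self.lang.Numbers.All)). A minimal illustrative sketch of the mask-and-restore round-trip, assuming Rule and Text behave as in the pysbd/utils.py added by this commit; the sample string is hypothetical:

from pysbd.utils import Rule, Text

# mask "?!" while the text is being processed, then restore it afterwards
masked = Text('Seriously?! Yes.').apply(Rule(r'\?!', '☉'))    # DoublePunctuationRules.FirstRule
restored = Text(masked).apply(Rule(r'☉', '?!'))               # SubSymbolsRules.MixedDoubleQE
assert restored == 'Seriously?! Yes.'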
pysbd/lang/danish.py
ADDED
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
import re
from re import escape

from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard
from pysbd.utils import Rule

class Danish(Common, Standard):

iso_code = 'da'

MONTHS = ['Januar', 'Februar', 'Marts', 'April', 'Maj', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'December']

class Numbers(Common.Numbers):

NumberPeriodSpaceRule = Rule(r'(?<=\s[1-9][0-9])\.(?=\s)|(?<=\s[0-9])\.(?=\s)', '∯')

NegativeNumberPeriodSpaceRule = Rule(r'(?<=\s-[1-9][0-9])\.(?=\s)|(?<=\s-[0-9])\.(?=\s)', '∯')

All = Common.Numbers.All + [NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule]

class AbbreviationReplacer(AbbreviationReplacer):

SENTENCE_STARTERS = ("At De Dem Den Der Det Du En Et For Få Gjorde Han Hun Hvad Hvem"
" Hvilke Hvor Hvordan Hvorfor Hvorledes Hvornår I Jeg Mange Vi Være").split(' ')

def __init__(self, text, lang):
super().__init__(text, lang)

def replace_abbreviation_as_sentence_boundary(self):
sent_starters = "|".join((r"(?=\s{}\s)".format(word) for word in self.SENTENCE_STARTERS))
regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|s.u|s.U)∯({})".format(sent_starters)
self.text = re.sub(regex, '\\1.', self.text)
return self.text

class Abbreviation(Standard.Abbreviation):
ABBREVIATIONS = ['adm', 'adr', 'afd', 'afs', 'al', 'alm', 'alm', 'ang', 'ank', 'anm', 'ann', 'ansvh', 'apr', 'arr', 'ass', 'att', 'aud', 'aug', 'aut', 'bd', 'bdt', 'bet', 'bhk', 'bio', 'biol', 'bk', 'bl.a', 'bot', 'br', 'bto', 'ca', 'cal', 'cirk', 'cit', 'co', 'cpr-nr', 'cvr-nr', 'd.d', 'd.e', 'd.m', 'd.s', 'd.s.s', 'd.y', 'd.å', 'd.æ', 'da', 'dav', 'dec', 'def', 'del', 'dep', 'diam', 'din', 'dir', 'disp', 'distr', 'do', 'dobb', 'dr', 'ds', 'dvs', 'e.b', 'e.kr', 'e.l', 'e.o', 'e.v.t', 'eftf', 'eftm', 'egl', 'eks', 'eksam', 'ekskl', 'eksp', 'ekspl', 'el', 'emer', 'endv', 'eng', 'enk', 'etc', 'eur', 'evt', 'exam', 'f', 'f', 'f.eks', 'f.kr', 'f.m', 'f.n', 'f.o', 'f.o.m', 'f.s.v', 'f.t', 'f.v.t', 'f.å', 'fa', 'fakt', 'feb', 'fec', 'ff', 'fg', 'fg', 'fhv', 'fig', 'fl', 'flg', 'fm', 'fm', 'fmd', 'forb', 'foreg', 'foren', 'forf', 'forh', 'fork', 'form', 'forr', 'fors', 'forsk', 'forts', 'fp', 'fr', 'frk', 'fuldm', 'fuldm', 'fung', 'fung', 'fys', 'fær', 'g', 'g.d', 'g.m', 'gd', 'gdr', 'gg', 'gh', 'gl', 'gn', 'gns', 'gr', 'grdl', 'gross', 'h.a', 'h.c', 'hdl', 'henh', 'henv', 'hf', 'hft', 'hhv', 'hort', 'hosp', 'hpl', 'hr', 'hrs', 'hum', 'i', 'i.e', 'ib', 'ibid', 'if', 'ifm', 'ill', 'indb', 'indreg', 'ing', 'inkl', 'insp', 'instr', 'isl', 'istf', 'jan', 'jf', 'jfr', 'jnr', 'jr', 'jul', 'jun', 'jur', 'jvf', 'kal', 'kap', 'kat', 'kbh', 'kem', 'kgl', 'kin', 'kl', 'kld', 'km/t', 'knsp', 'komm', 'kons', 'korr', 'kp', 'kr', 'kr', 'kst', 'kt', 'ktr', 'kv', 'kvt', 'l', 'l.c', 'lab', 'lat', 'lb', 'lb.', 'lb.nr', 'lejl', 'lgd', 'lic', 'lign', 'lin', 'ling.merc', 'litt', 'lok', 'lrs', 'ltr', 'lø', 'm', 'm.a.o', 'm.fl.st', 'm.m', 'm/', 'ma', 'mag', 'maks', 'mar', 'mat', 'matr.nr', 'md', 'mdl', 'mdr', 'mdtl', 'med', 'medd', 'medflg', 'medl', 'merc', 'mezz', 'mf', 'mfl', 'mgl', 'mhp', 'mht', 'mi', 'mia', 'mio', 'ml', 'mods', 'modsv', 'modt', 'mr', 'mrk', 'mrs', 'ms', 'mul', 'mv', 'mvh', 'n', 'n.br', 'n.f', 'nat', 'ned', 'nedenn', 'nedenst', 'nederl', 'nkr', 'nl', 'no', 'nord', 'nov', 'nr', 'nr', 'nto', 'nuv', 'o', 'o.a', 'o.fl.st', 'o.g', 'o.h', 'o.m.a', 'obj', 'obl', 'obs', 'odont', 'oecon', 'off', 'ofl', 'okt', 'omg', 'omr', 'omtr', 'on', 'op.cit', 'opg', 'opl', 'opr', 'org', 'orig', 'osfr', 'osv', 'ovenn', 'ovenst', 'overs', 'ovf', 'oz', 'p', 'p.a', 'p.b.v', 'p.c', 'p.m.v', 'p.p', 'p.s', 'p.t', 'p.v.a', 'p.v.c', 'par', 'partc', 'pass', 'pct', 'pd', 'pens', 'perf', 'pers', 'pg', 'pga', 'pgl', 'ph', 'ph.d', 'pharm', 'phil', 'pinx', 'pk', 'pkt', 'pl', 'pluskv', 'polit', 'polyt', 'port', 'pos', 'pp', 'pr', 'prc', 'priv', 'prod', 'prof', 'pron', 'præd', 'præf', 'præp', 'præs', 'præt', 'psych', 'pt', 'pæd', 'q.e.d', 'rad', 'red', 'ref', 'reg', 'regn', 'rel', 'rep', 'repr', 'rest', 'rk', 'russ', 's', 's.br', 's.d', 's.e', 's.f', 's.m.b.a', 's.u', 's.å', 's/', 'sa', 'sb', 'sc', 'scient', 'sek', 'sek', 'sekr', 'sem', 'sen', 'sep', 'sept', 'sg', 'sign', 'sj', 'skr', 'skt', 'slutn', 'sml', 'smp', 'sms', 'smst', 'soc', 'soc', 'sort', 'sp', 'spec', 'spm', 'spr', 'spsk', 'st', 'stk', 'str', 'stud', 'subj', 'subst', 'suff', 'sup', 'suppl', 'sv', 'såk', 'sædv', 'sø', 't', 't.h', 't.o.m', 't.v', 'tab', 'td', 'tdl', 'tdr', 'techn', 'tekn', 'temp', 'th', 'ti', 'tidl', 'tilf', 'tilh', 'till', 'tilsv', 'tjg', 'tlf', 'tlgr', 'to', 'tr', 'trp', 'tv', 'ty', 'u', 'u.p', 'u.st', 'u.å', 'uafh', 'ubf', 'ubøj', 'udb', 'udbet', 'udd', 'udg', 'uds', 'ugtl', 'ulin', 'ult', 'undt', 'univ', 'v.f', 'var', 'vb', 'vbsb', 'vedk', 'vedl', 'vedr', 'vejl', 'vh', 'vol', 'vs', 'vsa', 'vær', 'zool', 'årg', 'årh', 'årl', 'ø.f', 'øv', 'øvr']
NUMBER_ABBREVIATIONS = ['nr', 's']
PREPOSITIVE_ABBREVIATIONS = ['adm', 'skt', 'dr', 'hr', 'fru', 'st']
pysbd/lang/deutsch.py
ADDED
@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-
import re
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.between_punctuation import BetweenPunctuation
from pysbd.lang.common import Common, Standard
from pysbd.punctuation_replacer import replace_punctuation
from pysbd.processor import Processor
from pysbd.utils import Text, Rule


class Deutsch(Common, Standard):

iso_code = 'de'

class Numbers(Common.Numbers):
# Rubular: http://rubular.com/r/hZxoyQwKT1
NumberPeriodSpaceRule = Rule(r'(?<=\s\d)\.(?=\s)|(?<=\s\d\d)\.(?=\s)', '∯')

# Rubular: http://rubular.com/r/ityNMwdghj
NegativeNumberPeriodSpaceRule = Rule(r'(?<=-\d)\.(?=\s)|(?<=-\d\d)\.(?=\s)', '∯')

All = Common.Numbers.All + [NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule]

class Processor(Processor):

def __init__(self, text, lang, char_span=False):
super().__init__(text, lang, char_span)

def replace_numbers(self):
self.text = Text(self.text).apply(*self.lang.Numbers.All)
self.replace_period_in_deutsch_dates()
return self.text

def replace_period_in_deutsch_dates(self):
MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August',
'September', 'Oktober', 'November', 'Dezember']
for month in MONTHS:
# Rubular: http://rubular.com/r/zlqgj7G5dA
self.text = re.sub(r'(?<=\d)\.(?=\s*{month})'.format(month=month), '∯', self.text)

class Abbreviation(Standard.Abbreviation):
ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt', 'univ.-prof', 'o.univ.-prof', 'ao.univ.prof', 'ass.prof', 'hon.prof', 'univ.-doz', 'univ.ass', 'stud.ass', 'projektass', 'ass', 'di', 'dipl.-ing', 'mag']
PREPOSITIVE_ABBREVIATIONS = []
NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp']

class AbbreviationReplacer(AbbreviationReplacer):

SENTENCE_STARTERS = ("Am Auch Auf Bei Da Das Der Die Ein Eine Es Für Heute Ich Im In "
"Ist Jetzt Mein Mit Nach So Und Warum Was Wenn Wer Wie Wir").split(' ')

def __init__(self, text, lang):
super().__init__(text, lang)

def replace(self):
# Rubular: http://rubular.com/r/B4X33QKIL8
SingleLowerCaseLetterRule = Rule(r'(?<=\s[a-z])\.(?=\s)', '∯')

# Rubular: http://rubular.com/r/iUNSkCuso0
SingleLowerCaseLetterAtStartOfLineRule = Rule(r'(?<=^[a-z])\.(?=\s)', '∯')
self.text = Text(self.text).apply(
self.lang.PossessiveAbbreviationRule,
*self.lang.SingleLetterAbbreviationRules.All,
SingleLowerCaseLetterRule,
SingleLowerCaseLetterAtStartOfLineRule)

self.text = self.search_for_abbreviations_in_string(self.text)
self.replace_multi_period_abbreviations()
self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
self.text = self.replace_abbreviation_as_sentence_boundary()
return self.text

def scan_for_replacements(self, txt, am, index, character_array):
txt = re.sub(r'(?<={am})\.(?=\s)'.format(am=am), '∯', txt)
return txt

class BetweenPunctuation(BetweenPunctuation):

def __init__(self, text):
super().__init__(text)

def sub_punctuation_between_double_quotes(self, txt):
# Rubular: http://rubular.com/r/OdcXBsub0w
BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = r',,(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)“'

# Rubular: http://rubular.com/r/2UskIupGgP
# SPLIT_DOUBLE_QUOTES_DE_REGEX = r'\A„(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)“'

# Rubular: http://rubular.com/r/TkZomF9tTM
BETWEEN_DOUBLE_QUOTES_DE_REGEX = r'„(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)“'

if '„' in txt:
return re.sub(BETWEEN_DOUBLE_QUOTES_DE_REGEX, replace_punctuation, txt)
elif ',,' in txt:
return re.sub(BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX,
replace_punctuation, txt)
else:
return txt
pysbd/lang/dutch.py
ADDED
@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard

class Dutch(Common, Standard):

iso_code = 'nl'

class Abbreviation(Standard.Abbreviation):
ABBREVIATIONS = ['a.2d', 'a.a', 'a.a.j.b', 'a.f.t', 'a.g.j.b', 'a.h.v', 'a.h.w', 'a.hosp', 'a.i', 'a.j.b', 'a.j.t', 'a.m', 'a.m.r', 'a.p.m', 'a.p.r', 'a.p.t', 'a.s', 'a.t.d.f', 'a.u.b', 'a.v.a', 'a.w', 'aanbev', 'aanbev.comm', 'aant', 'aanv.st', 'aanw', 'vnw', 'aanw.vnw', 'abd', 'abm', 'abs', 'acc.& fisc', 'acc.act', 'acc.bedr.m', 'acc.bedr.t', "acc.thema's m.", 'acc.thema’s m', 'achterv', 'act.dr', 'act.dr.fam', 'act.fisc', 'act.soc', 'adm.akk', 'adm.besl', 'adm.lex', 'adm.onderr', 'adm.ov', 'adv', 'adv', 'gen', 'adv.bl', 'afd', 'afl', 'aggl.verord', 'agr', 'al', 'alg', 'alg.richts', 'amén', 'ann.dr', 'ann.dr.lg', 'ann.dr.sc.pol', 'ann.ét.eur', 'ann.fac.dr.lg', 'ann.jur.créd', 'ann.jur.créd.règl.coll', 'ann.not', 'ann.parl', 'ann.prat.comm', 'app', 'arb', 'aud', 'arbbl', 'arbh', 'arbit.besl', 'arbrb', 'arr', 'arr.cass', 'arr.r.v.st', 'arr.verbr', 'arrondrb', 'art', 'artw', 'aud', 'b', 'b', 'en w', 'b.&w', 'b.a', 'b.a.s', 'b.b.o', 'b.best.dep', 'b.br.ex', 'b.coll.fr.gem.comm', 'b.coll.vl.gem.comm', 'b.d.cult.r', 'b.d.gem.ex', 'b.d.gem.reg', 'b.dep', 'b.e.b', 'b.f.r', 'b.fr.gem.ex', 'b.fr.gem.reg', 'b.i.h', 'b.inl.j.d', 'b.inl.s.reg', 'b.j', 'b.l', 'b.lid br.ex', 'b.lid d.gem.ex', 'b.lid fr.gem.ex', 'b.lid vl.ex', 'b.lid w.gew.ex', 'b.o.z', 'b.prov.r', 'b.r.h', 'b.s', 'b.sr', 'b.stb', 'b.t.i.r', 'b.t.s.z', 'b.t.w.rev', 'b.v', 'b.ver.coll.gem.gem.comm', 'b.verg.r.b', 'b.versl', 'b.vl.ex', 'b.voorl.reg', 'b.w', 'b.w.gew.ex', 'b.z.d.g', 'b.z.v', 'bab', 'bank fin', 'bank fin.r', 'bedr.org', 'begins', 'beheersov', 'bekendm.comm', 'bel', 'bel.besch', 'bel.w.p', 'beleidsov', 'belg', 'grondw', 'benelux jur', 'ber', 'ber.w', 'besch', 'besl', 'beslagr', 'besluitwet nr', 'bestuurswet', 'bet', 'betr', 'betr', 'vnw', 'bevest', 'bew', 'bijbl', 'ind', 'eig', 'bijbl.n.bijdr', 'bijl', 'bijv', 'bijw', 'bijz.decr', 'bin.b', 'bkh', 'bl', 'blz', 'bm', 'bn', 'bnlx merkw', 'bnlx tek', 'bnlx uitl', 'rh', 'bnw', 'bouwr', 'br drs', 'br.parl', 'bs', 'bt drs', 'btw rev', 'bull', 'bull.adm.pénit', 'bull.ass', 'bull.b.m.m', 'bull.bel', 'bull.best.strafinr', 'bull.bmm', 'bull.c.b.n', 'bull.c.n.c', 'bull.cbn', 'bull.centr.arb', 'bull.cnc', 'bull.contr', 'bull.doc.min.fin', 'bull.f.e.b', 'bull.feb', 'bull.fisc.fin.r', 'bull.i.u.m', 'bull.inf.ass.secr.soc', 'bull.inf.i.e.c', 'bull.inf.i.n.a.m.i', 'bull.inf.i.r.e', 'bull.inf.iec', 'bull.inf.inami', 'bull.inf.ire', 'bull.inst.arb', 'bull.ium', 'bull.jur.imm', 'bull.lég.b', 'bull.off', 'bull.trim.b.dr.comp', 'bull.us', 'bull.v.b.o', 'bull.vbo', 'bv i.o', 'bv', 'bw int.reg', 'bw', 'bxh', 'byz', 'c', 'c.& f', 'c.& f.p', 'c.a', 'c.a.-a', 'c.a.b.g', 'c.c', 'c.c.i', 'c.c.s', 'c.conc.jur', 'c.d.e', 'c.d.p.k', 'c.e', 'c.ex', 'c.f', 'c.h.a', 'c.i.f', 'c.i.f.i.c', 'c.j', 'c.l', 'c.n', 'c.o.d', 'c.p', 'c.pr.civ', 'c.q', 'c.r', 'c.r.a', 'c.s', 'c.s.a', 'c.s.q.n', 'c.v', 'c.v.a', 'c.v.o', 'ca', 'cadeaust', 'cah.const', 'cah.dr.europ', 'cah.dr.immo', 'cah.dr.jud', 'cal', '2d', 'cal', '3e', 'cal', 'rprt', 'cap', 'carg', 'cass', 'cass', 'verw', 'cert', 'cf', 'ch', 'chron', 'chron.d.s', 'chron.dr.not', 'cie', 'cie', 'verz.schr', 'cir', 'circ', 'circ.z', 'cit', 'cit.loc', 'civ', 'cl.et.b', 'cmt', 'co', 'cognoss.v', 'coll', 'v', 'b', 'colp.w', 'com', 'com', 'cas', 'com.v.min', 'comm', 'comm', 'v', 'comm.bijz.ov', 'comm.erf', 'comm.fin', 'comm.ger', 'comm.handel', 'comm.pers', 'comm.pub', 'comm.straf', 'comm.v', 'comm.v.en v', 'comm.venn', 'comm.verz', 'comm.voor', 'comp', 'compt.w', 'computerr', 'con.m', 'concl', 'concr', 'conf', 'confl.w', 'confl.w.huwbetr', 'cons', 'conv', 'coöp', 'ver', 
'corr', 'corr.bl', 'cour de cass', 'cour.fisc', 'cour.immo', 'cridon', 'crim', 'cur', 'cur', 'crt', 'curs', 'd', 'd.-g', 'd.a', 'd.a.v', 'd.b.f', 'd.c', 'd.c.c.r', 'd.d', 'd.d.p', 'd.e.t', 'd.gem.r', 'd.h', 'd.h.z', 'd.i', 'd.i.t', 'd.j', 'd.l.r', 'd.m', 'd.m.v', 'd.o.v', 'd.parl', 'd.w.z', 'dact', 'dat', 'dbesch', 'dbesl', 'de advoc', 'de belg.acc', 'de burg.st', 'de gem', 'de gerechtsd', 'de venn', 'de verz', 'decr', 'decr.d', 'decr.fr', 'decr.vl', 'decr.w', 'def', 'dep.opv', 'dep.rtl', 'derg', 'desp', 'det.mag', 'deurw.regl', 'dez', 'dgl', 'dhr', 'disp', 'diss', 'div', 'div.act', 'div.bel', 'dl', 'dln', 'dnotz', 'doc', 'hist', 'doc.jur.b', 'doc.min.fin', 'doc.parl', 'doctr', 'dpl', 'dpl.besl', 'dr', 'dr.banc.fin', 'dr.circ', 'dr.inform', 'dr.mr', 'dr.pén.entr', 'dr.q.m', 'drs', 'dtp', 'dwz', 'dyn', 'e cont', 'e', 'e.a', 'e.b', 'tek.mod', 'e.c', 'e.c.a', 'e.d', 'e.e', 'e.e.a', 'e.e.g', 'e.g', 'e.g.a', 'e.h.a', 'e.i', 'e.j', 'e.m.a', 'e.n.a.c', 'e.o', 'e.p.c', 'e.r.c', 'e.r.f', 'e.r.h', 'e.r.o', 'e.r.p', 'e.r.v', 'e.s.r.a', 'e.s.t', 'e.v', 'e.v.a', 'e.w', 'e&o.e', 'ec.pol.r', 'echos log', 'econ', 'ed', 'ed(s)', 'eeg verd.v', 'eex san s', 'eff', 'eg rtl', 'eig', 'eig.mag', 'eil', 'elektr', 'enmb', 'entr.et dr', 'enz', 'err', 'et al', 'et seq', 'etc', 'etq', 'eur', 'parl', 'eur.t.s', 'eur.verd.overdracht strafv', 'ev rechtsh', 'ev uitl', 'ev', 'evt', 'ex', 'ex.crim', 'exec', 'f', 'f.a.o', 'f.a.q', 'f.a.s', 'f.i.b', 'f.j.f', 'f.o.b', 'f.o.r', 'f.o.s', 'f.o.t', 'f.r', 'f.supp', 'f.suppl', 'fa', 'facs', 'fare act', 'fasc', 'fg', 'fid.ber', 'fig', 'fin.verh.w', 'fisc', 'fisc', 'tijdschr', 'fisc.act', 'fisc.koer', 'fl', 'form', 'foro', 'it', 'fr', 'fr.cult.r', 'fr.gem.r', 'fr.parl', 'fra', 'ft', 'g', 'g.a', 'g.a.v', 'g.a.w.v', 'g.g.d', 'g.m.t', 'g.o', 'g.omt.e', 'g.p', 'g.s', 'g.v', 'g.w.w', 'geb', 'gebr', 'gebrs', 'gec', 'gec.decr', 'ged', 'ged.st', 'gedipl', 'gedr.st', 'geh', 'gem', 'gem', 'en gew', 'gem', 'en prov', 'gem.gem.comm', 'gem.st', 'gem.stem', 'gem.w', 'gem.wet, gem.wet', 'gemeensch.optr', 'gemeensch.standp', 'gemeensch.strat', 'gemeent', 'gemeent.b', 'gemeent.regl', 'gemeent.verord', 'geol', 'geopp', 'gepubl', 'ger.deurw', 'ger.w', 'gerekw', 'gereq', 'gesch', 'get', 'getr', 'gev.m', 'gev.maatr', 'gew', 'ghert', 'gir.eff.verk', 'gk', 'gr', 'gramm', 'grat.w', 'gron,opm.en leermed', 'grootb.w', 'grs', 'grur ausl', 'grur int', 'grvm', 'grw', 'gst', 'gw', 'h.a', 'h.a.v.o', 'h.b.o', 'h.e.a.o', 'h.e.g.a', 'h.e.geb', 'h.e.gestr', 'h.l', 'h.m', 'h.o', 'h.r', 'h.t.l', 'h.t.m', 'h.w.geb', 'hand', 'handelsn.w', 'handelspr', 'handelsr.w', 'handelsreg.w', 'handv', 'harv.l.rev', 'hc', 'herald', 'hert', 'herz', 'hfdst', 'hfst', 'hgrw', 'hhr', 'hist', 'hooggel', 'hoogl', 'hosp', 'hpw', 'hr', 'hr', 'ms', 'hr.ms', 'hregw', 'hrg', 'hst', 'huis.just', 'huisv.w', 'huurbl', 'hv.vn', 'hw', 'hyp.w', 'i.b.s', 'i.c', 'i.c.m.h', 'i.e', 'i.f', 'i.f.p', 'i.g.v', 'i.h', 'i.h.a', 'i.h.b', 'i.l.pr', 'i.o', 'i.p.o', 'i.p.r', 'i.p.v', 'i.pl.v', 'i.r.d.i', 'i.s.m', 'i.t.t', 'i.v', 'i.v.m', 'i.v.s', 'i.w.tr', 'i.z', 'ib', 'ibid', 'icip-ing.cons', 'iem', 'ind prop', 'indic.soc', 'indiv', 'inf', 'inf.i.d.a.c', 'inf.idac', 'inf.r.i.z.i.v', 'inf.riziv', 'inf.soc.secr', 'ing', 'ing', 'cons', 'ing.cons', 'inst', 'int', 'int', 'rechtsh', 'strafz', "int'l & comp.l.q.", 'interm', 'intern.fisc.act', 'intern.vervoerr', 'inv', 'inv', 'f', 'inv.w', 'inv.wet', 'invord.w', 'inz', 'ir', 'irspr', 'iwtr', 'j', 'j.-cl', 'j.c.b', 'j.c.e', 'j.c.fl', 'j.c.j', 'j.c.p', 'j.d.e', 'j.d.f', 'j.d.s.c', 'j.dr.jeun', 'j.j.d', 'j.j.p', 'j.j.pol', 
'j.l', 'j.l.m.b', 'j.l.o', 'j.ordre pharm', 'j.p.a', 'j.r.s', 'j.t', 'j.t.d.e', 'j.t.dr.eur', 'j.t.o', 'j.t.t', 'jaarl', 'jb.hand', 'jb.kred', 'jb.kred.c.s', 'jb.l.r.b', 'jb.lrb', 'jb.markt', 'jb.mens', 'jb.t.r.d', 'jb.trd', 'jeugdrb', 'jeugdwerkg.w', 'jg', 'jis', 'jl', 'journ.jur', 'journ.prat.dr.fisc.fin', 'journ.proc', 'jrg', 'jur', 'jur.comm.fl', 'jur.dr.soc.b.l.n', 'jur.f.p.e', 'jur.fpe', 'jur.niv', 'jur.trav.brux', 'jura falc', 'jurambt', 'jv.cass', 'jv.h.r.j', 'jv.hrj', 'jw', 'k', 'k', 'en m', 'k.b', 'k.g', 'k.k', 'k.m.b.o', 'k.o.o', 'k.v.k', 'k.v.v.v', 'kadasterw', 'kaderb', 'kador', 'kbo-nr', 'kg', 'kh', 'kiesw', 'kind.bes.v', 'kkr', 'koopv', 'kr', 'krankz.w', 'ksbel', 'kt', 'ktg', 'ktr', 'kvdm', 'kw.r', 'kymr', 'kzr', 'kzw', 'l', 'l.b', 'l.b.o', 'l.bas', 'l.c', 'l.gew', 'l.j', 'l.k', 'l.l', 'l.o', 'l.r.b', 'l.u.v.i', 'l.v.r', 'l.v.w', 'l.w', "l'exp.-compt.b.", 'l’exp.-compt.b', 'landinr.w', 'landscrt', 'larcier cass', 'lat', 'law.ed', 'lett', 'levensverz', 'lgrs', 'lidw', 'limb.rechtsl', 'lit', 'litt', 'liw', 'liwet', 'lk', 'll', 'll.(l.)l.r', 'loonw', 'losbl', 'ltd', 'luchtv', 'luchtv.w', 'm', 'm', 'not', 'm.a.v.o', 'm.a.w', 'm.b', 'm.b.o', 'm.b.r', 'm.b.t', 'm.d.g.o', 'm.e.a.o', 'm.e.r', 'm.h', 'm.h.d', 'm.i.v', 'm.j.t', 'm.k', 'm.m', 'm.m.a', 'm.m.h.h', 'm.m.v', 'm.n', 'm.not.fisc', 'm.nt', 'm.o', 'm.r', 'm.s.a', 'm.u.p', 'm.v.a', 'm.v.h.n', 'm.v.t', 'm.z', 'maatr.teboekgest.luchtv', 'maced', 'mand', 'max', 'mbl.not', 'me', 'med', 'med', 'v.b.o', 'med.b.u.f.r', 'med.bufr', 'med.vbo', 'meerv', 'meetbr.w', 'mém.adm', 'mgr', 'mgrs', 'mhd', 'mi.verantw', 'mil', 'mil.bed', 'mil.ger', 'min', 'min', 'aanbev', 'min', 'circ', 'min', 'fin', 'min.j.omz', 'min.just.circ', 'mitt', 'mnd', 'mod', 'mon', 'monde ass', 'mouv.comm', 'mr', 'ms', 'muz', 'mv', 'mva ii inv', 'mva inv', 'n cont', 'n', 'chr', 'n.a', 'n.a.g', 'n.a.v', 'n.b', 'n.c', 'n.chr', 'n.d', 'n.d.r', 'n.e.a', 'n.g', 'n.h.b.c', 'n.j', 'n.j.b', 'n.j.w', 'n.l', 'n.m', 'n.m.m', 'n.n', 'n.n.b', 'n.n.g', 'n.n.k', 'n.o.m', 'n.o.t.k', 'n.rapp', 'n.tijd.pol', 'n.v', 'n.v.d.r', 'n.v.d.v', 'n.v.o.b', 'n.v.t', 'nat.besch.w', 'nat.omb', 'nat.pers', 'ned.cult.r', 'neg.verkl', 'nhd', 'nieuw arch', 'wisk', 'njcm-bull', 'nl', 'nnd', 'no', 'not.fisc.m', 'not.w', 'not.wet', 'nr', 'nrs', 'nste', 'nt', 'numism', 'o', 'o.a', 'o.b', 'o.c', 'o.g', 'o.g.v', 'o.i', 'o.i.d', 'o.m', 'o.o', 'o.o.d', 'o.o.v', 'o.p', 'o.r', 'o.regl', 'o.s', 'o.t.s', 'o.t.t', 'o.t.t.t', 'o.t.t.z', 'o.tk.t', 'o.v.t', 'o.v.t.t', 'o.v.tk.t', 'o.v.v', 'ob', 'obsv', 'octr', 'octr.gem.regl', 'octr.regl', 'oe', 'oecd mod', 'off.pol', 'ofra', 'ohd', 'omb', 'omnia frat', 'omnil', 'omz', 'on.ww', 'onderr', 'onfrank', 'onteig.w', 'ontw', 'b.w', 'onuitg', 'onz', 'oorl.w', 'op.cit', 'opin.pa', 'opm', 'or', 'ord.br', 'ord.gem', 'ors', 'orth', 'os', 'osm', 'ov', 'ov.w.i', 'ov.w.ii', 'ov.ww', 'overg.w', 'overw', 'ovkst', 'ow kadasterw', 'oz', 'p', 'p.& b', 'p.a', 'p.a.o', 'p.b.o', 'p.e', 'p.g', 'p.j', 'p.m', 'p.m.a', 'p.o', 'p.o.j.t', 'p.p', 'p.v', 'p.v.s', 'pachtw', 'pag', 'pan', 'pand.b', 'pand.pér', 'parl.gesch', 'parl.gesch', 'inv', 'parl.st', 'part.arb', 'pas', 'pasin', 'pat', 'pb.c', 'pb.l', 'pens', 'pensioenverz', 'per.ber.i.b.r', 'per.ber.ibr', 'pers', 'st', 'pft', 'pg wijz.rv', 'pk', 'pktg', 'pli jur', 'plv', 'po', 'pol', 'pol.off', 'pol.r', 'pol.w', 'politie j', 'postbankw', 'postw', 'pp', 'pr', 'preadv', 'pres', 'prf', 'prft', 'prg', 'prijz.w', 'pro jus', 'proc', 'procesregl', 'prof', 'prot', 'prov', 'prov.b', 'prov.instr.h.m.g', 'prov.regl', 'prov.verord', 'prov.w', 'publ', 
'publ.cour eur.d.h', 'publ.eur.court h.r', 'pun', 'pw', 'q.b.d', 'q.e.d', 'q.q', 'q.r', 'r', 'r.a.b.g', 'r.a.c.e', 'r.a.j.b', 'r.b.d.c', 'r.b.d.i', 'r.b.s.s', 'r.c', 'r.c.b', 'r.c.d.c', 'r.c.j.b', 'r.c.s.j', 'r.cass', 'r.d.c', 'r.d.i', 'r.d.i.d.c', 'r.d.j.b', 'r.d.j.p', 'r.d.p.c', 'r.d.s', 'r.d.t.i', 'r.e', 'r.f.s.v.p', 'r.g.a.r', 'r.g.c.f', 'r.g.d.c', 'r.g.f', 'r.g.z', 'r.h.a', 'r.i.c', 'r.i.d.a', 'r.i.e.j', 'r.i.n', 'r.i.s.a', 'r.j.d.a', 'r.j.i', 'r.k', 'r.l', 'r.l.g.b', 'r.med', 'r.med.rechtspr', 'r.n.b', 'r.o', 'r.orde apoth', 'r.ov', 'r.p', 'r.p.d.b', 'r.p.o.t', 'r.p.r.j', 'r.p.s', 'r.r.d', 'r.r.s', 'r.s', 'r.s.v.p', 'r.stvb', 'r.t.d.f', 'r.t.d.h', 'r.t.l', 'r.trim.dr.eur', 'r.v.a', 'r.verkb', 'r.w', 'r.w.d', 'rap.ann.c.a', 'rap.ann.c.c', 'rap.ann.c.e', 'rap.ann.c.s.j', 'rap.ann.ca', 'rap.ann.cass', 'rap.ann.cc', 'rap.ann.ce', 'rap.ann.csj', 'rapp', 'rb', 'rb.kh', 'rb.van kh', 'rdn', 'rdnr', 're.pers', 'rec', 'rec.c.i.j', 'rec.c.j.c.e', 'rec.cij', 'rec.cjce', 'rec.cour eur.d.h', 'rec.gén.enr.not', 'rec.lois decr.arr', 'rechtsk.t', 'rechtspl.zeem', 'rechtspr.arb.br', 'rechtspr.b.f.e', 'rechtspr.bfe', 'rechtspr.soc.r.b.l.n', 'recl.reg', 'rect', 'red', 'reg', 'reg.huiz.bew', 'reg.w', 'registr.w', 'regl', 'regl', 'r.v.k', 'regl.besl', 'regl.onderr', 'regl.r.t', 'rep', 'rep.eur.court h.r', 'rép.fisc', 'rép.not', 'rep.r.j', 'rep.rj', 'req', 'res', 'resp', 'rev', 'rev', 'de dr', 'comp', 'rev', 'trim', 'de dr', 'civ', 'rev', 'trim', 'de dr', 'comm', 'rev.acc.trav', 'rev.adm', 'rev.b.compt', 'rev.b.dr.const', 'rev.b.dr.intern', 'rev.b.séc.soc', 'rev.banc.fin', 'rev.comm', 'rev.cons.prud', 'rev.dr.b', 'rev.dr.commun', 'rev.dr.étr', 'rev.dr.fam', 'rev.dr.intern.comp', 'rev.dr.mil', 'rev.dr.min', 'rev.dr.pén', 'rev.dr.pén.mil', 'rev.dr.rur', 'rev.dr.u.l.b', 'rev.dr.ulb', 'rev.exp', 'rev.faill', 'rev.fisc', 'rev.gd', 'rev.hist.dr', 'rev.i.p.c', 'rev.ipc', 'rev.not.b', 'rev.prat.dr.comm', 'rev.prat.not.b', 'rev.prat.soc', 'rev.rec', 'rev.rw', 'rev.trav', 'rev.trim.d.h', 'rev.trim.dr.fam', 'rev.urb', 'richtl', 'riv.dir.int', 'riv.dir.int."le priv', 'riv.dir.int.priv.proc', 'rk', 'rln', 'roln', 'rom', 'rondz', 'rov', 'rtl', 'rubr', 'ruilv.wet', 'rv.verdr', 'rvkb', 's', 's', 'en s', 's.a', 's.b.n', 's.ct', 's.d', 's.e.c', 's.e.et.o', 's.e.w', 's.exec.rept', 's.hrg', 's.j.b', 's.l', 's.l.e.a', 's.l.n.d', 's.p.a', 's.s', 's.t', 's.t.b', 's.v', 's.v.p', 'samenw', 'sc', 'sch', 'scheidsr.uitspr', 'schepel.besl', 'secr.comm', 'secr.gen', 'sect.soc', 'sess', 'cas', 'sir', 'soc', 'best', 'soc', 'handv', 'soc', 'verz', 'soc.act', 'soc.best', 'soc.kron', 'soc.r', 'soc.sw', 'soc.weg', 'sofi-nr', 'somm', 'somm.ann', 'sp.c.c', 'sr', 'ss', 'st.doc.b.c.n.a.r', 'st.doc.bcnar', 'st.vw', 'stagever', 'stas', 'stat', 'stb', 'stbl', 'stcrt', 'stichting i.v', 'stud.dipl', 'su', 'subs', 'subst', 'succ.w', 'suppl', 'sv', 'sw', 't', 't.a', 't.a.a', 't.a.n', 't.a.p', 't.a.s.n', 't.a.v', 't.a.v.w', 't.aann', 't.acc', 't.agr.r', 't.app', 't.b.b.r', 't.b.h', 't.b.m', 't.b.o', 't.b.p', 't.b.r', 't.b.s', 't.b.v', 't.bankw', 't.belg.not', 't.desk', 't.e.m', 't.e.p', 't.f.r', 't.fam', 't.fin.r', 't.g.r', 't.g.t', 't.g.v', 't.gem', 't.gez', 't.huur', 't.i.n', 't.in b.z', 't.j.k', 't.l.l', 't.l.v', 't.m', 't.m.r', 't.m.w', 't.mil.r', 't.mil.strafr', 't.not', 't.o', 't.o.r.b', 't.o.v', 't.ontv', 't.orde geneesh', 't.p.r', 't.pol', 't.r', 't.r.d.& i', 't.r.g', 't.r.o.s', 't.r.v', 't.s.r', 't.strafr', 't.t', 't.u', 't.v.c', 't.v.g', 't.v.m.r', 't.v.o', 't.v.v', 't.v.v.d.b', 't.v.w', 't.verz', 't.vred', 't.vreemd', 't.w', 't.w.k', 't.w.v', 
't.w.v.r', 't.wrr', 't.z', 't.z.t', 't.z.v', 'taalk', 'tar.burg.z', 'td', 'techn', 'telecomm', 'toel', 'toel.st.v.w', 'toep', 'toep.regl', 'tom', 'top', 'trans.b', 'transp.r', 'trav.com.ét.et lég.not', 'trb', 'trib', 'trib.civ', 'trib.gr.inst', 'ts', 'ts', 'best', 'ts', 'verv', 'turnh.rechtsl', 'tvpol', 'tvpr', 'tvrechtsgesch', 'tw', 'u', 'u.a', 'u.a.r', 'u.a.v', 'u.c', 'u.c.c', 'u.g', 'u.p', 'u.s', 'u.s.d.c', 'uitdr', 'uitl.w', 'uitv.besch.div.b', 'uitv.besl', 'uitv.besl', 'succ.w', 'uitv.besl.bel.rv', 'uitv.besl.l.b', 'uitv.reg', 'inv.w', 'uitv.reg.bel.d', 'uitv.reg.afd.verm', 'uitv.reg.lb', 'uitv.reg.succ.w', 'univ', 'univ.verkl', 'v', 'v', 'chr', 'v.& f', 'v.a', 'v.a.v', 'v.bp prot', 'v.c', 'v.chr', 'v.h', 'v.huw.verm', 'v.i', 'v.i.o', 'v.k.a', 'v.m', 'v.o.f', 'v.o.n', 'v.onderh.verpl', 'v.p', 'v.r', 'v.s.o', 'v.t.t', 'v.t.t.t', 'v.tk.t', 'v.toep.r.vert', 'v.v.b', 'v.v.g', 'v.v.t', 'v.v.t.t', 'v.v.tk.t', 'v.w.b', 'v.z.m', 'vb', 'vb.bo', 'vbb', 'vc', 'vd', 'veldw', 'ver.k', 'ver.verg.gem', 'gem.comm', 'verbr', 'verd', 'verdr', 'verdr.v', 'verdrag benel.i.z', 'tek.mod', 'verenw', 'verg', 'verg.fr.gem', 'comm', 'verkl', 'verkl.herz.gw', 'verl', 'deelw', 'vern', 'verord', 'vers.r', 'versch', 'versl.c.s.w', 'versl.csw', 'vert', 'verw', 'verz', 'verz.w', 'verz.wett.besl', 'verz.wett.decr.besl', 'vgl', 'vid', 'vigiles jb', 'viss.w', 'vl.parl', 'vl.r', 'vl.t.gez', 'vl.w.reg', 'vl.w.succ', 'vlg', 'vn', 'vnl', 'vnw', 'vo', 'vo.bl', 'voegw', 'vol', 'volg', 'volt', 'deelw', 'voorl', 'voorz', 'vord.w', 'vorst.d', 'vr', 'en antw', 'vred', 'vrg', 'vnw', 'vrijgrs', 'vs', 'vt', 'vvsr jb', 'vw', 'vz', 'vzngr', 'vzr', 'w', 'w.a', 'w.b.r', 'w.c.h', 'w.conf.huw', 'w.conf.huwelijksb', 'w.consum.kr', 'w.f.r', 'w.g', 'w.gelijke beh', 'w.gew.r', 'w.ident.pl', 'w.just.doc', 'w.kh', 'w.l.r', 'w.l.v', 'w.mil.straf.spr', 'w.n', 'w.not.ambt', 'w.o', 'w.o.d.huurcomm', 'w.o.d.k', 'w.openb.manif', 'w.parl', 'w.r', 'w.reg', 'w.succ', 'w.u.b', 'w.uitv.pl.verord', 'w.v', 'w.v.k', 'w.v.m.s', 'w.v.r', 'w.v.w', 'w.venn', 'wac', 'wd', 'wet a.b', 'wet bel.rv', 'wet c.a.o', 'wet c.o', 'wet div.bel', 'wet ksbel', 'wet l.v', 'wetb', 'n.v.h', 'wgb', 'winkelt.w', 'wisk', 'wka-verkl', 'wnd', 'won.w', 'woningw', 'woonr.w', 'wrr', 'wrr.ber', 'wrsch', 'ws', 'wsch', 'wsr', 'wtvb', 'ww', 'x.d', 'z cont', 'z.a', 'z.g', 'z.i', 'z.j', 'z.o.z', 'z.p', 'z.s.m', 'zesde richtl', 'zg', 'zgn', 'zn', 'znw', 'zr', 'zr', 'ms', 'zr.ms']
PREPOSITIVE_ABBREVIATIONS = []
NUMBER_ABBREVIATIONS = []
pysbd/lang/english.py
ADDED
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard

class English(Common, Standard):

iso_code = 'en'

class AbbreviationReplacer(AbbreviationReplacer):
SENTENCE_STARTERS = "A Being Did For He How However I In It Millions "\
"More She That The There They We What When Where Who Why".split(" ")
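These language classes are looked up by their iso_code through pysbd/languages.py and driven by pysbd/segmenter.py, both added in this commit. A short hedged usage sketch of the segmenter with the English rules above; the sample sentence is only illustrative:

import pysbd

seg = pysbd.Segmenter(language="en", clean=False)  # selects the English class via iso_code 'en'
print(seg.segment("My name is Jonas E. Smith. Please turn to p. 55."))
# expected: ['My name is Jonas E. Smith.', 'Please turn to p. 55.']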
pysbd/lang/french.py
ADDED
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard

class French(Common, Standard):

iso_code = 'fr'

class AbbreviationReplacer(AbbreviationReplacer):
SENTENCE_STARTERS = []

class Abbreviation(Standard.Abbreviation):
ABBREVIATIONS = ['a.c.n', 'a.m', 'al', 'ann', 'apr', 'art', 'auj', 'av', 'b.p', 'boul', 'c.-à-d', 'c.n', 'c.n.s', 'c.p.i', 'c.q.f.d', 'c.s', 'ca', 'cf', 'ch.-l', 'chap', 'co', 'co', 'contr', 'dir', 'e.g', 'e.v', 'env', 'etc', 'ex', 'fasc', 'fig', 'fr', 'fém', 'hab', 'i.e', 'ibid', 'id', 'inf', 'l.d', 'lib', 'll.aa', 'll.aa.ii', 'll.aa.rr', 'll.aa.ss', 'll.ee', 'll.mm', 'll.mm.ii.rr', 'loc.cit', 'ltd', 'ltd', 'masc', 'mm', 'ms', 'n.b', 'n.d', 'n.d.a', 'n.d.l.r', 'n.d.t', 'n.p.a.i', 'n.s', 'n/réf', 'nn.ss', 'p.c.c', 'p.ex', 'p.j', 'p.s', 'pl', 'pp', 'r.-v', 'r.a.s', 'r.i.p', 'r.p', 's.a', 's.a.i', 's.a.r', 's.a.s', 's.e', 's.m', 's.m.i.r', 's.s', 'sec', 'sect', 'sing', 'sq', 'sqq', 'ss', 'suiv', 'sup', 'suppl', 't.s.v.p', 'tél', 'vb', 'vol', 'vs', 'x.o', 'z.i', 'éd']
PREPOSITIVE_ABBREVIATIONS = []
NUMBER_ABBREVIATIONS = []
pysbd/lang/greek.py
ADDED
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard

class Greek(Common, Standard):

iso_code = 'el'

SENTENCE_BOUNDARY_REGEX = r'.*?[\.;!\?]|.*?$'
Punctuations = ['.', '!', ';', '?']

class AbbreviationReplacer(AbbreviationReplacer):
SENTENCE_STARTERS = []
pysbd/lang/hindi.py
ADDED
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard

class Hindi(Common, Standard):

iso_code = 'hi'

SENTENCE_BOUNDARY_REGEX = r'.*?[।\|!\?]|.*?$'
Punctuations = ['।', '|', '.', '!', '?']

class AbbreviationReplacer(AbbreviationReplacer):
SENTENCE_STARTERS = []
pysbd/lang/italian.py
ADDED
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard

class Italian(Common, Standard):

iso_code = 'it'

class AbbreviationReplacer(AbbreviationReplacer):
SENTENCE_STARTERS = []

class Abbreviation(Standard.Abbreviation):
ABBREVIATIONS = ['1°', 'a.c', 'a.c/a', 'a.cam', 'a.civ', 'a.cor', 'a.d.r', 'a.gov', 'a.mil', 'a.mon', 'a.smv', 'a.v', 'a/a', 'a/c', 'a/i', 'aa', 'aaaa', 'aaal', 'aacst', 'aamct', 'aams', 'aar', 'aato', 'ab', 'abbigl', 'abbrev', 'abc', 'abi', 'abl', 'abm', 'abr', 'abs', 'absp', 'ac', 'acam', 'acb', 'acbi', 'acc', 'accorc', 'accr', 'acd', 'ace', 'acec', 'acep', 'aci', 'acli', 'acp', 'acro', 'acsit', 'actl', 'ad', 'ad.mil', 'ada', 'adap', 'adatt', 'adc', 'add', 'adei', 'adeion', 'adhd', 'adi', 'adisco', 'adj', 'adm', 'adp', 'adr', 'ads', 'adsi', 'adsl', 'adv', 'ae.b', 'aefi', 'aer', 'aerodin', 'aeron', 'afa', 'afc', 'afci', 'affl', 'afi', 'afic', 'afm', 'afp', 'ag', 'agcm', 'agcom', 'age', 'agecs', 'agesci', 'agg', 'agip', 'agis', 'agm', 'ago', 'agr', 'agric', 'agt', 'ai', 'aia', 'aiab', 'aiac', 'aiace', 'aiap', 'aias', 'aiat', 'aib', 'aic', 'aica', 'aicel', 'aici', 'aics', 'aid', 'aida', 'aidaa', 'aidac', 'aidama', 'aidda', 'aidim', 'aido', 'aids', 'aies', 'aif', 'aih', 'aiip', 'aimi', 'aip', 'aipsc', 'airi', 'ais', 'aisa', 'aism', 'aiss', 'aissca', 'aitc', 'aiti', 'aitr', 'aits', 'aka', 'al', 'alai', 'alch', 'alg', 'ali', 'alim', 'all', 'allev', 'allus', 'alp', 'alq', 'alt', 'am', 'ama', 'amaci', 'amag', 'amami', 'amc', 'ammec', 'amn', 'ampas', 'amps', 'an', 'ana', 'anaai', 'anac', 'anaci', 'anad', 'anai', 'anaoo', 'anart', 'anat', 'anat. comp', 'ancci', 'anci', 'ancip', 'ancsa', 'andit', 'anec', 'anee', 'anem', 'anes', 'anffas', 'ani', 'ania', 'anica', 'anie', 'animi', 'anis', 'anisc', 'anm', 'anmfit', 'anmig', 'anmil', 'anmli', 'anms', 'anpa', 'anpas', 'anpci', 'anpe', 'anpi', 'ansi', 'ansv', 'ant', 'anta', 'antifr', 'antlo', 'anton', 'antrop', 'anusca', 'anvi', 'anx', 'ao', 'ap', 'apa', 'apd', 'apea', 'apec', 'apet', 'api', 'apos', 'app', 'app.sc', 'apr', 'aps', 'apt', 'aq', 'ar', 'ar.ind', 'ar.rep', 'arald', 'arame', 'arc', 'arch', 'archeol', 'arci', 'ardsu', 'are', 'arg', 'aritm', 'arpa', 'arpat', 'arred', 'arrt', 'arsia', 'art', 'arti min', 'artig', 'artigl', 'artt', 'as', 'asa', 'asae', 'asc', 'asci', 'ascii', 'ascom', 'ascop', 'asd', 'ase', 'asf', 'asfer', 'asg', 'asic', 'asifa', 'asl', 'asmdc', 'asmi', 'asp', 'aspic', 'aspp', 'assi', 'assic', 'assol', 'asst', 'aster', 'astr', 'astrol', 'astron', 'at', 'ata', 'atb', 'atic', 'atm', 'ats', 'att', 'attrav', 'atv', 'au', 'auc', 'aus', 'auser', 'aut', 'autom', 'av', 'avi', 'avis', 'avo', 'avv', 'avvers', 'awb', 'awdp', 'az', 'azh', 'b.a', 'b2b', 'b2c', 'ba', 'bafta', 'bal', 'ball', 'ban', 'banc', 'bar', 'bart', 'bas', 'bat', 'batt', 'bban', 'bbc', 'bbl', 'bbs', 'bbtc', 'bcc', 'bce', 'bcf', 'bdf', 'bei', 'bep', 'bers', 'bg', 'bi', 'bibl', 'bic', 'bioch', 'biol', 'bl', 'bld', 'bldg', 'blpc', 'bm', 'bmps', 'bmw', 'bn', 'bna', 'bncf', 'bncrm', 'bni', 'bnl', 'bo', 'bot', 'bpl', 'bpm', 'bpn', 'bpr', 'br', 'brd', 'bre', 'bric', 'brig', 'brig.ca', 'brig.gen', 'bros', 'bs', 'bsc', 'bsp', 'bsu', 'bt', 'btc', 'btg', 'btg.l', 'btr', 'bts', 'bu', 'bur', 'bz', 'c.a', 'c.a.p', 'c.c.p', 'c.cost', 'c.d a', 'c.d', 'c.le', 'c.m', 'c.opv', 'c.p', 'c.s', 'c.v', 'c.v.d', 'c/a', 'c/c', 'c/pag', 'ca', 'ca.rep', 'ca.sm', 'ca.sz', 'ca.uf', 'caaf', 'cab', 'cad', 'cae', 'cai', 'cal', 'cam', 'cap', 'capol', 'capt', 'car', 'car.sc', 'carat', 'card', 'cas', 'casaca', 'casd', 'cass.civ', 'cat', 'caus', 'cav', 'cavg', 'cb', 'cbd', 'cbr', 'cbs', 'cc', 'cca', 'ccap', 'ccda', 'ccdp', 'ccee', 'cciaa', 'ccie', 'ccip', 'cciss', 'ccna', 'ccnl', 'ccnp', 'ccpb', 'ccs', 'ccsp', 'cctld', 'cctv', 'ccv', 'cd', 'cda', 'cdma', 'cdo', 'cdpd', 'cdr', 'cds', 'cdw', 'ce', 'ced', 'cee', 
'cei', 'cemat', 'cenelec', 'centr', 'cepis', 'ceps', 'cept', 'cerit', 'cese', 'cesis', 'cesvot', 'cet', 'cf', 'cfa', 'cfr', 'cg', 'cgi', 'cgil', 'cgs', 'ch', 'chf', 'chim', 'chim. ind', 'chir', 'ci', 'ci-europa', 'ciber', 'cicae', 'cid', 'cie', 'cif', 'cifej', 'cig', 'cigs', 'cii', 'cilea', 'cilo', 'cim', 'cime', 'cin', 'cinit', 'cio', 'cipe', 'cirm', 'cisal', 'ciscs', 'cisd', 'cisl', 'cism', 'citol', 'cl', 'class', 'cli', 'cm', 'cmdr', 'cme', 'cmo', 'cmr', 'cms', 'cmyk', 'cm²', 'cm³', 'cn', 'cna', 'cnb', 'cnc', 'cnel', 'cngei', 'cni', 'cnipa', 'cnit', 'cnn', 'cnr', 'cns', 'cnt', 'cnvvf', 'co', 'co.ing', 'co.sa', 'cobas', 'coc', 'cod', 'cod. civ', 'cod. deont. not', 'cod. pen', 'cod. proc. civ', 'cod. proc. pen', 'codec', 'coi', 'col', 'colf', 'coll', 'com', 'comdr', 'comm', 'comp', 'compar', 'compl', 'con', 'conai', 'conc', 'concl', 'condiz', 'confetra', 'confitarma', 'confr', 'cong', 'congeav', 'congiunt', 'coni', 'coniug', 'consec', 'consob', 'contab', 'contr', 'coreco', 'corp', 'corr', 'correl', 'corrisp', 'cosap', 'cospe', 'cost', 'costr', 'cpc', 'cpdel', 'cpe', 'cpi', 'cpl', 'cpt', 'cpu', 'cr', 'cral', 'credem', 'crf', 'cri', 'cric', 'cristall', 'crm', 'cro', 'cron', 'crsm', 'crt', 'cs', 'csa', 'csai', 'csc', 'csm', 'csn', 'css', 'ct', 'ctc', 'cti', 'ctr', 'ctsis', 'cuc', 'cud', 'cun', 'cup', 'cusi', 'cvb', 'cvbs', 'cwt', 'cz', 'd', 'd.c', 'd.i.a', 'dab', 'dac', 'dam', 'dams', 'dat', 'dau', 'db', 'dbms', 'dc', 'dca', 'dccc', 'dda', 'ddp', 'ddr', 'ddt', 'dea', 'decoraz', 'dect', 'dek', 'denom', 'deriv', 'derm', 'determ', 'df', 'dfp', 'dg', 'dga', 'dhcp', 'di', 'dia', 'dial', 'dic', 'dicomac', 'dif', 'difett', 'dig. iv', 'digos', 'dimin', 'dimostr', 'din', 'dipart', 'diplom', 'dir', 'dir. amm', 'dir. can', 'dir. civ', 'dir. d. lav', 'dir. giur', 'dir. internaz', 'dir. it', 'dir. pen', 'dir. priv', 'dir. proces', 'dir. pub', 'dir. rom', 'disus', 'diy', 'dl', 'dlf', 'dm', 'dme', 'dmf', 'dmo', 'dmoz', 'dm²', 'dm³', 'dnr', 'dns', 'doa', 'doc', 'docg', 'dom', 'dop', 'dos', 'dott', 'dpa', 'dpi', 'dpl', 'dpof', 'dps', 'dpt', 'dr', 'dra', 'drm', 'drs', 'dry pt', 'ds', 'dslam', 'dspn', 'dss', 'dtc', 'dtmf', 'dtp', 'dts', 'dv', 'dvb', 'dvb-t', 'dvd', 'dvi', 'dwdm', 'e.g', 'e.p.c', 'ead', 'eafrd', 'ean', 'eap', 'easw', 'eb', 'eban', 'ebr', 'ebri', 'ebtn', 'ecc', 'eccl', 'ecdl', 'ecfa', 'ecff', 'ecg', 'ecm', 'econ', 'econ. az', 'econ. dom', 'econ. pol', 'ecpnm', 'ed', 'ed agg', 'edge', 'edi', 'edil', 'edit', 'ef', 'efa', 'efcb', 'efp', 'efsa', 'efta', 'eg', 'egiz', 'egl', 'egr', 'ei', 'eisa', 'elab', 'elettr', 'elettron', 'ellitt', 'emap', 'emas', 'embr', 'emdr', 'emi', 'emr', 'en', 'enaip', 'enal', 'enaoli', 'enapi', 'encat', 'enclic', 'enea', 'enel', 'eni', 'enigm', 'enit', 'enol', 'enpa', 'enpaf', 'enpals', 'enpi', 'enpmf', 'ens', 'entom', 'epd', 'epigr', 'epirbs', 'epl', 'epo', 'ept', 'erc', 'ercom', 'ermes', 'erp', 'es', 'esa', 'escl', 'esist', 'eso', 'esp', 'estens', 'estr. 
min', 'etacs', 'etf', 'eti', 'etim', 'etn', 'etol', 'eu', 'eufem', 'eufic', 'eula', 'eva®', 'f.a', 'f.b', 'f.m', 'f.p', 'fa', 'fabi', 'fac', 'facl', 'facs', 'fad', 'fai', 'faile', 'failp', 'failpa', 'faisa', 'falcri', 'fam', 'famar', 'fans', 'fao', 'fapav', 'faq', 'farm', 'fasi', 'fasib', 'fatt', 'fbe', 'fbi', 'fc', 'fco', 'fcp', 'fcr', 'fcu', 'fdi', 'fe', 'feaog', 'feaosc', 'feb', 'fedic', 'fema', 'feoga', 'ferr', 'fesco', 'fesr', 'fess', 'fg', 'fi', 'fiaf', 'fiaip', 'fiais', 'fialtel', 'fiap', 'fiapf', 'fiat', 'fiavet', 'fic', 'ficc', 'fice', 'fidal', 'fidam', 'fidapa', 'fieg', 'fifa', 'fifo', 'fig', 'figc', 'figs', 'filat', 'filcams', 'file', 'filol', 'filos', 'fim', 'fima', 'fimmg', 'fin', 'finco', 'fio', 'fioto', 'fipe', 'fipresci', 'fis', 'fisar', 'fisc', 'fisg', 'fisiol', 'fisiopatol', 'fistel', 'fit', 'fita', 'fitav', 'fits', 'fiv', 'fivet', 'fivl', 'flo', 'flpd', 'fluid pt', 'fm', 'fmcg', 'fmi', 'fmth', 'fnas', 'fnomceo', 'fnsi', 'fob', 'fod', 'folcl', 'fon', 'fop', 'fotogr', 'fp', 'fpc', 'fpld', 'fr', 'fra', 'fs', 'fsc', 'fse', 'fsf', 'fsfi', 'fsh', 'ft', 'ftase', 'ftbcc', 'fte', 'ftp', 'fts', 'ft²', 'ft³', 'fuaav', 'fut', 'fv', 'fvg', 'g.fv', 'g.u', 'g.u.el', 'gal', 'gats', 'gatt', 'gb', 'gc', 'gccc', 'gco', 'gcost', 'gd', 'gdd', 'gdf', 'gdi', 'gdo', 'gdp', 'ge', 'gea', 'gel', 'gen', 'geneal', 'geod', 'geofis', 'geogr', 'geogr. antr', 'geogr. fis', 'geol', 'geom', 'gep', 'germ', 'gescal', 'gg', 'ggv', 'gi', 'gia', 'gides', 'gift', 'gio', 'giorn', 'gis', 'gisma', 'gismo', 'giu', 'gm', 'gmdss', 'gme', 'gmo', 'go', 'gov', 'gp', 'gpl', 'gprs', 'gps', 'gr', 'gr.sel.spec', 'gr.sel.tr', 'gr.sqd', 'gra', 'gram', 'grano', 'grd', 'grtn', 'grv', 'gsa', 'gsm', 'gsm-r', 'gsr', 'gtld', 'gu', 'guce', 'gui', 'gus', 'ha', 'haart', 'haccp', 'hba', 'hcg', 'hcrp', 'hd-dvd', 'hdcp', 'hdi', 'hdml', 'hdtv', 'hepa', 'hfpa', 'hg', 'hifi', 'hiperlan', 'hiv', 'hm', 'hmld', 'hon', 'hosp', 'hpv', 'hr', 'hrh', 'hrm', 'hrt', 'html', 'http', 'hvac', 'hz', 'i.e', 'i.g.m', 'iana', 'iasb', 'iasc', 'iass', 'iat', 'iata', 'iatse', 'iau', 'iban', 'ibid', 'ibm', 'icann', 'icao', 'icbi', 'iccu', 'ice', 'icf', 'ici', 'icm', 'icom', 'icon', 'ics', 'icsi', 'icstis', 'ict', 'icta', 'id', 'iden', 'idl', 'idraul', 'iec', 'iedm', 'ieee', 'ietf', 'ifat', 'ifel', 'ifla', 'ifrs', 'ifto', 'ifts', 'ig', 'igm', 'igmp', 'igp', 'iims', 'iipp', 'ilm', 'ilo', 'ilor', 'ils', 'im', 'imaie', 'imap', 'imc', 'imdb', 'imei', 'imi', 'imms', 'imo', 'imp', 'imper', 'imperf', 'impers', 'imq', 'ims', 'imsi', 'in', 'inail', 'inca', 'incb', 'inci', 'ind', 'ind. agr', 'ind. alim', 'ind. cart', 'ind. chim', 'ind. cuoio', 'ind. estratt', 'ind. graf', 'ind. mecc', 'ind. 
tess', 'indecl', 'indef', 'indeterm', 'indire', 'inea', 'inf', 'infea', 'infm', 'inform', 'ing', 'ingl', 'inmarsat', 'inpdai', 'inpdap', 'inpgi', 'inps', 'inr', 'inran', 'ins', 'insp', 'int', 'inter', 'intr', 'invar', 'invim', 'in²', 'in³', 'ioma', 'iosco', 'ip', 'ipab', 'ipasvi', 'ipi', 'ippc', 'ips', 'iptv', 'iq', 'ira', 'irap', 'ircc', 'ircs', 'irda', 'iref', 'ires', 'iron', 'irpef', 'irpeg', 'irpet', 'irreg', 'is', 'isae', 'isbd', 'isbn', 'isc', 'isdn', 'isee', 'isef', 'isfol', 'isg', 'isi', 'isia', 'ism', 'ismea', 'isnart', 'iso', 'isp', 'ispearmi', 'ispel', 'ispescuole', 'ispesl', 'ispo', 'ispro', 'iss', 'issn', 'istat', 'istol', 'isvap', 'it', 'iti', 'itt', 'ittiol', 'itu', 'iud', 'iugr', 'iulm', 'iva', 'iveco', 'ivg', 'ivr', 'ivs', 'iyhp', 'j', 'jal', 'jit', 'jr', 'jv', 'k', 'kb', 'kee', 'kg', 'kkk', 'klm', 'km', 'km/h', 'kmph', 'kmq', 'km²', 'kr', 'kw', 'kwh', 'l', 'l\'ing', 'l.n', 'l\'avv', 'la', 'lag', 'lan', 'lanc', 'larn', 'laser', 'lat', 'lav', 'lav. femm', 'lav. pubbl', 'laz', 'lb', 'lc', 'lcca', 'lcd', 'le', 'led', 'lett', 'lh', 'li', 'liaf', 'lib', 'lic', 'lic.ord', 'lic.strd', 'licd', 'lice', 'lida', 'lidci', 'liff', 'lifo', 'lig', 'liit', 'lila', 'lilt', 'linfa', 'ling', 'lipu', 'lis', 'lisaac', 'lism', 'lit', 'litab', 'lnp', 'lo', 'loc', 'loc. div', 'lolo', 'lom', 'long', 'lp', 'lrm', 'lrms', 'lsi', 'lsu', 'lt', 'ltd', 'lu', 'lug', 'luiss', 'lun', 'lwt', 'lww', 'm.a', 'm.b', 'm.o', 'm/s', 'ma', 'mac', 'macch', 'mag', 'magg.(maj)', 'magg.gen.(maj.gen.)', 'mai', 'maj', 'mar', 'mar.a', 'mar.ca', 'mar.ord', 'marc', 'mat', 'mater', 'max', 'mb', 'mbac', 'mc', 'mcl', 'mcpc', 'mcs', 'md', 'mdf', 'mdp', 'me', 'mec', 'mecc', 'med', 'mediev', 'mef', 'mer', 'merc', 'merid', 'mesa', 'messrs', 'metall', 'meteor', 'metr', 'metrol', 'mg', 'mgc', 'mgm', 'mi', 'mibac', 'mica', 'microb', 'mifed', 'miglio nautico', 'miglio nautico per ora', 'miglio nautico²', 'miglio²', 'mil', 'mile', 'miles/h', 'milesph', 'min', 'miner', 'mips', 'miptv', 'mit', 'mitol', 'miur', 'ml', 'mlle', 'mls', 'mm', 'mme', 'mms', 'mm²', 'mn', 'mnp', 'mo', 'mod', 'mol', 'mons', 'morf', 'mos', 'mpaa', 'mpd', 'mpeg', 'mpi', 'mps', 'mq', 'mr', 'mrs', 'ms', 'msgr', 'mss', 'mt', 'mto', 'murst', 'mus', 'mvds', 'mws', 'm²', 'm³', 'n.a', 'n.b', 'na', 'naa', 'nafta', 'napt', 'nars', 'nasa', 'nat', 'natas', 'nato', 'nb', 'nba', 'nbc', 'ncts', 'nd', 'nda', 'nde', 'ndr', 'ndt', 'ne', 'ned', 'neg', 'neol', 'netpac', 'neur', 'news!', 'ngcc', 'nhmf', 'nlcc', 'nmr', 'no', 'nodo', 'nom', 'nos', 'nov', 'novissdi', 'npi', 'nr', 'nt', 'nta', 'nts', 'ntsc', 'nu', 'nuct', 'numism', 'nwt', 'nyc', 'nz', 'o.m.i', 'oai-pmh', 'oav', 'oc', 'occ', 'occult', 'oci', 'ocr', 'ocse', 'oculist', 'od', 'odg', 'odp', 'oecd', 'oem', 'ofdm', 'oft', 'og', 'ogg', 'ogi', 'ogm', 'ohim', 'oic', 'oics', 'olaf', 'oland', 'ole', 'oled', 'omi', 'oms', 'on', 'ong', 'onig', 'onlus', 'onomat', 'onpi', 'onu', 'op', 'opac', 'opec', 'opord', 'opsosa', 'or', 'ord', 'ord. 
scol', 'ore', 'oref', 'orient', 'ornit', 'orogr', 'orp', 'ort', 'os', 'osa', 'osas', 'osd', 'ot', 'ote', 'ott', 'oz', 'p', 'p.a', 'p.c', 'p.c.c', 'p.es', 'p.f', 'p.m', 'p.r', 'p.s', 'p.t', 'p.v', 'pa', 'pac', 'pag./p', 'pagg./pp', 'pai', 'pal', 'paleobot', 'paleogr', 'paleont', 'paleozool', 'paletn', 'pamr', 'pan', 'papir', 'par', 'parapsicol', 'part', 'partic', 'pass', 'pat', 'patol', 'pb', 'pc', 'pci', 'pcm', 'pcmcia', 'pcs', 'pcss', 'pct', 'pd', 'pda', 'pdf', 'pdl', 'pds', 'pe', 'pec', 'ped', 'pedag', 'peg', 'pegg', 'per.ind', 'pers', 'pert', 'pesq', 'pet', 'petr', 'petrogr', 'pfc', 'pg', 'pga', 'pgp', 'pgut', 'ph', 'php', 'pi', 'pics', 'pie', 'pif', 'pii', 'pil', 'pime', 'pin', 'pine', 'pip', 'pir', 'pit', 'pitt', 'piuss', 'pkcs', 'pki', 'pko', 'pl', 'pli', 'plr', 'pm', 'pma', 'pmi', 'pmr', 'pn', 'pnf', 'pnl', 'po', 'poet', 'pof', 'pol', 'pop', 'popitt', 'popol', 'port', 'pos', 'poss', 'post', 'pots', 'pp', 'ppa', 'ppc', 'ppga', 'ppp', 'pps', 'pptt', 'ppv', 'pr', 'pra', 'praa', 'pref', 'preist', 'prep', 'pres', 'pret', 'prg', 'pri', 'priv', 'pro.civ', 'prof', 'pron', 'pronom', 'propr', 'prov', 'prs', 'prtl', 'prusst', 'ps', 'pse', 'psi', 'psicoan', 'psicol', 'pso', 'psp', 'pstn', 'pt', 'ptc', 'pti', 'ptsd', 'ptt', 'pu', 'pug', 'puk', 'put', 'pv', 'pvb', 'pvc', 'pvt', 'pz', 'qb', 'qcs', 'qfd', 'qg', 'qi', 'qlco', 'qlcu', 'qos', 'qualif', 'r-lan', 'r.s', 'ra', 'racc', 'radar', 'radc', 'radiotecn', 'raee', 'raf', 'rag', 'raid', 'ram', 'rar', 'ras', 'rass. avv. stato', 'rc', 'rca', 'rcdp', 'rcs', 'rdc', 'rdco', 'rdf', 'rdi', 'rdp', 'rds', 'rdt', 're', 'rea', 'recipr', 'recl', 'reg', 'region', 'rel', 'rem', 'rep', 'reps', 'res', 'retor', 'rev', 'rfi', 'rfid', 'rg', 'rgb', 'rgc', 'rge', 'rgi', 'rgi bdp', 'rgpt', 'rgt', 'ri', 'riaa', 'riaj', 'riba', 'ric', 'rid', 'rif', 'rifl', 'rina', 'rip', 'ris', 'rit', 'ritts', 'rm', 'rmn', 'rn', 'ro', 'roa', 'roc', 'roi', 'rom', 'roro', 'rov', 'rp', 'rpm', 'rr', 'rrf', 'rs', 'rsc', 'rspp', 'rss', 'rsu', 'rsvp', 'rt', 'rtdpc', 'rtg', 'rtn', 'rtp', 'rttt', 'rvm', 's-dab', 's.a', 's.b.f', 's.n.c', 's.p.a', 's.p.m', 's.r.l', 's.ten', 's.v', 's/m', 'sa', 'sab', 'saca', 'sace', 'sact', 'sad', 'sag', 'sahm', 'sai', 'saisa', 'sam', 'san', 'sanas', 'sape', 'sar', 'sars', 'sart', 'sas', 'sbaf', 'sbas', 'sbn', 'sc', 'sca.sm', 'scherz', 'scien', 'scn', 'scsi', 'scuba', 'scult', 'scut', 'sdds', 'sdiaf', 'sds', 'sdsl', 'se', 'seat', 'sebc', 'sec', 'seca', 'secam', 'secc', 'see', 'seg', 'segg', 'segredifesa', 'sem', 'sempo', 'sen', 'sens', 'seo', 'serg', 'serg.magg.(sgm)', 'serg.magg.ca', 'set', 'sfc', 'sfis', 'sfx', 'sg', 'sga', 'sgc', 'sgg', 'sgml', 'sgt', 'si', 'si@lt', 'sia', 'siae', 'siaic', 'siap', 'sias', 'sic', 'sicav', 'sid', 'sido', 'sie', 'sif', 'sig', 'sig.na', 'sig.ra', 'sige', 'sigg', 'sigill', 'sigo', 'siia', 'simb', 'simbdea', 'simg', 'simo', 'sin', 'sinalv', 'sing', 'sins', 'sinu', 'siocmf', 'siog', 'sioi', 'siommms', 'siot', 'sip', 'sipem', 'sips', 'sirf', 'sirm', 'sis', 'sisde', 'sismi', 'sissa', 'sit', 'siulp', 'siusa', 'sla', 'sldn', 'slm', 'slr', 'sm', 'sma', 'smau', 'smd', 'sme', 'smes', 'smm', 'smpt', 'sms', 'sn', 'snad', 'snai', 'snc', 'sncci', 'sncf', 'sngci', 'snit', 'so', 'soc', 'sociol', 'sogg', 'soho', 'soi', 'sol', 'somipar', 'somm', 'sonar', 'sp', 'spa', 'spe', 'spett', 'spi', 'spm', 'spot', 'spp', 'spreg', 'sq', 'sqd', 'sr', 'srd', 'srl', 'srr', 'ss', 'ssi', 'ssn', 'ssr', 'sss', 'st', 'st. d. arte', 'st. d. dir', 'st. d. filos', 'st. d. 
rel', 'stat', 'stg', 'stp', 'stw', 'su', 'suap', 'suem', 'suff', 'sup', 'superl', 'supt', 'surg', 'surl', 'susm', 'sut', 'suv', 'sv', 'svga', 'swics', 'swift', 'swot', 'sxga', 'sz', 't-dab', 't.sg', 'ta', 'taa', 'tac', 'tacan', 'tacs', 'taeg', 'tai', 'tan', 'tar', 'targa', 'tav', 'tb', 'tbt', 'tci', 'tcp', 'tcp/ip', 'tcsm', 'tdm', 'tdma', 'te', 'tecn', 'tecnol', 'ted', 'tel', 'telecom', 'temp', 'ten.(lt)', 'ten.col.(ltc)', 'ten.gen', 'teol', 'term', 'tesa', 'tese', 'tesol', 'tess', 'tet', 'tetra', 'tfr', 'tft', 'tfts', 'tgv', 'thx', 'tim', 'tipogr', 'tir', 'tit', 'tld', 'tm', 'tmc', 'tn', 'to', 'toefl', 'ton', 'top', 'topog', 'tos', 'tosap', 'tosc', 'tp', 'tpl', 'tr', 'trad', 'tramat', 'trasp', 'ts', 'tso', 'tuir', 'tuld', 'tv', 'twa', 'twain', 'u.ad', 'u.s', 'ucai', 'ucca', 'ucei', 'ucina', 'uclaf', 'ucoi', 'ucoii', 'ucsi', 'ud', 'udc', 'udi', 'udp', 'ue', 'uefa', 'uemri', 'ufo', 'ugc', 'uhci', 'uhf', 'uht', 'uibm', 'uic', 'uicc', 'uiga', 'uil', 'uilps', 'uisp', 'uits', 'uk', 'ul', 'ull', 'uma', 'umb', 'ummc', 'umss', 'umts', 'unac', 'unar', 'unasp', 'uncem', 'unctad', 'undp', 'unefa', 'unep', 'unesco', 'ungh', 'unhcr', 'uni', 'unicef', 'unitec', 'unpredep', 'unsa', 'upa', 'upc', 'urar', 'urban', 'url', 'urp', 'urss', 'usa', 'usb', 'usfi', 'usga', 'usl', 'usp', 'uspi', 'ussr', 'utap', 'v', 'v.brig', 'v.cte', 'v.m', 'v.p', 'v.r', 'v.s', 'va', 'vab', 'vaio', 'val', 'vas', 'vb', 'vbr', 'vc', 'vcc', 'vcr', 'vda', 've', 'ven', 'ves', 'vesa', 'veter', 'vezz', 'vfb', 'vfp', 'vfx', 'vga', 'vhf', 'vhs', 'vi', 'via', 'vip', 'vis', 'vn', 'vo', 'voc', 'voip', 'vol', 'volg', 'voll', 'vor', 'vpdn', 'vpn', 'vr', 'vs', 'vsp', 'vt', 'vtc', 'vts', 'vtt', 'vv', 'vvf', 'wai', 'wais', 'wan', 'wap', 'wasp', 'wc', 'wcdma', 'wcm', 'wga', 'wi-fi', 'wipo', 'wisp', 'wll', 'wml', 'wms', 'worm', 'wp', 'wpan', 'wssn', 'wto', 'wwan', 'wwf', 'www', 'wygiwys', 'xl', 'xml', 'xs', 'xxl', 'xxs', 'yaf', 'yb', 'yci', 'yd', 'yd²', 'yd³', 'ymca', 'zat', 'zb', 'zcs', 'zdf', 'zdg', 'zift', 'zool', 'zoot', 'ztc', 'ztl', '°c', '°f', '°n', '°ra', '°ré', 'µg']
PREPOSITIVE_ABBREVIATIONS = ['a.c', 'acc', 'adj', 'adm', 'adv', 'all', 'amn', 'arch', 'asst', 'avv', 'banc', 'bart', 'bcc', 'bldg', 'brig', 'bros', 'c.a', 'c.a.p', 'c.c.p', 'c.m', 'c.p', 'c.p', 'c.s', 'c.v', 'capt', 'cc', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'corr', 'cpl', 'dir', 'dott', 'dott', 'dr', 'dr', 'drs', 'e.p.c', 'ecc', 'egr', 'ens', 'es', 'fatt', 'gen', 'geom', 'gg', 'gov', 'hon', 'hosp', 'hr', 'id', 'ing', 'insp', 'int', "l'avv", "l'ing", 'lett', 'lt', 'maj', 'messrs', 'mlle', 'mm', 'mme', 'mo', 'mons', 'mr', 'mr', 'mrs', 'mrs', 'ms', 'ms', 'msgr', 'n.b', 'ogg', 'on', 'op', 'ord', 'p.c', 'p.c.c', 'p.es', 'p.f', 'p.r', 'p.s', 'p.t', 'p.v', 'pfc', 'ph', 'post', 'pp', 'prof', 'psicol', 'pvt', 'racc', 'rag', 'rep', 'reps', 'res', 'rev', 'ric', 'rif', 'rp', 'rsvp', 'rt', 's.a', 's.b.f', 's.n.c', 's.p.a', 's.p.m', 's.r.l', 'seg', 'sen', 'sens', 'sfc', 'sgg', 'sgt', 'sig', 'sigg', 'soc', 'spett', 'sr', 'ss', 'st', 'supt', 'surg', 'tel', 'u.s', 'v.p', 'v.r', 'v.s']
NUMBER_ABBREVIATIONS = ['art', 'no', 'nos', 'nr', 'pp']
pysbd/lang/japanese.py
ADDED
@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
import re
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.between_punctuation import BetweenPunctuation
from pysbd.lang.common import Common, Standard
from pysbd.punctuation_replacer import replace_punctuation
from pysbd.cleaner import Cleaner
from pysbd.utils import Text, Rule

class Japanese(Common, Standard):

iso_code = 'ja'

class Cleaner(Cleaner):

def __init__(self, text, lang, doc_type=None):
super().__init__(text, lang)

def clean(self):
self.remove_newline_in_middle_of_word()
return self.text

def remove_newline_in_middle_of_word(self):
NewLineInMiddleOfWordRule = Rule(r'(?<=の)\n(?=\S)', '')
self.text = Text(self.text).apply(NewLineInMiddleOfWordRule)

class AbbreviationReplacer(AbbreviationReplacer):
SENTENCE_STARTERS = []

class BetweenPunctuation(BetweenPunctuation):

def __init__(self, text):
super().__init__(text)

def replace(self):
self.sub_punctuation_between_quotes_and_parens()
return self.text

def sub_punctuation_between_parens_ja(self):
BETWEEN_PARENS_JA_REGEX = r'((?=(?P<tmp>[^()]+|\\{2}|\\.)*)(?P=tmp))'
self.text = re.sub(BETWEEN_PARENS_JA_REGEX, replace_punctuation,
self.text)

def sub_punctuation_between_quotes_ja(self):
BETWEEN_QUOTE_JA_REGEX = r'「(?=(?P<tmp>[^「」]+|\\{2}|\\.)*)(?P=tmp)」'
self.text = re.sub(BETWEEN_QUOTE_JA_REGEX, replace_punctuation,
self.text)

def sub_punctuation_between_quotes_and_parens(self):
self.sub_punctuation_between_parens_ja()
self.sub_punctuation_between_quotes_ja()
pysbd/lang/kazakh.py
ADDED
@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard
from pysbd.processor import Processor
from pysbd.utils import Text, Rule


class Kazakh(Common, Standard):

iso_code = 'kk'

# Handling Cyrillic characters in re module
# https://stackoverflow.com/a/10982308/5462100
MULTI_PERIOD_ABBREVIATION_REGEX = r'\b[\u0400-\u0500]+(?:\.\s?[\u0400-\u0500])+[.]|b[a-z](?:\.[a-z])+[.]'

class Processor(Processor):

def __init__(self, text, lang, char_span=False):
super().__init__(text, lang, char_span)

def between_punctuation(self, txt):
txt = self.between_punctuation_processor(txt).replace()
# Rubular: http://rubular.com/r/WRWy56Z5zp
QuestionMarkFollowedByDashLowercaseRule = Rule(r'(?<=)\?(?=\s*[-—]\s*)', '&ᓷ&')
# Rubular: http://rubular.com/r/lixxP7puSa
ExclamationMarkFollowedByDashLowercaseRule = Rule(r'(?<=)!(?=\s*[-—]\s*)', '&ᓴ&')

txt = Text(txt).apply(QuestionMarkFollowedByDashLowercaseRule,
ExclamationMarkFollowedByDashLowercaseRule)
return txt

class Abbreviation(Standard.Abbreviation):
ABBREVIATIONS = ['afp', 'anp', 'atp', 'bae', 'bg', 'bp', 'cam', 'cctv', 'cd', 'cez', 'cgi', 'cnpc', 'farc', 'fbi', 'eiti', 'epo', 'er', 'gp', 'gps', 'has', 'hiv', 'hrh', 'http', 'icu', 'idf', 'imd', 'ime', 'icu', 'idf', 'ip', 'iso', 'kaz', 'kpo', 'kpa', 'kz', 'kz', 'mri', 'nasa', 'nba', 'nbc', 'nds', 'ohl', 'omlt', 'ppm', 'pda', 'pkk', 'psm', 'psp', 'raf', 'rss', 'rtl', 'sas', 'sme', 'sms', 'tnt', 'udf', 'uefa', 'usb', 'utc', 'x', 'zdf', 'әқбк', 'әқбк', 'аақ', 'авг.', 'aбб', 'аек', 'ак', 'ақ', 'акцион.', 'акср', 'ақш', 'англ', 'аөсшк', 'апр', 'м.', 'а.', 'р.', 'ғ.', 'апр.', 'аум.', 'ацат', 'әч', 'т. б.', 'б. з. б.', 'б. з. б.', 'б. з. д.', 'б. з. д.', 'биікт.', 'б. т.', 'биол.', 'биохим', 'бө', 'б. э. д.', 'бта', 'бұұ', 'вич', 'всоонл', 'геогр.', 'геол.', 'гленкор', 'гэс', 'қк', 'км', 'г', 'млн', 'млрд', 'т', 'ғ. с.', 'ғ.', 'қ.', 'ғ.', 'дек.', 'днқ', 'дсұ', 'еақк', 'еқыұ', 'ембімұнайгаз', 'ео', 'еуразэқ', 'еуроодақ', 'еұу', 'ж.', 'ж.', 'жж.', 'жоо', 'жіө', 'жсдп', 'жшс', 'іім', 'инта', 'исаф', 'камаз', 'кгб', 'кеу', 'кг', 'км²', 'км²', 'км³', 'км³', 'кимеп', 'кср', 'ксро', 'кокп', 'кхдр', 'қазатомпром', 'қазкср', 'қазұу', 'қазмұнайгаз', 'қазпошта', 'қазтаг', 'қазұу', 'қкп', 'қмдб', 'қр', 'қхр', 'лат.', 'м²', 'м²', 'м³', 'м³', 'магатэ', 'май.', 'максам', 'мб', 'мвт', 'мемл', 'м', 'мсоп', 'мтк', 'мыс.', 'наса', 'нато', 'нквд', 'нояб.', 'обл.', 'огпу', 'окт.', 'оңт.', 'опек', 'оеб', 'өзенмұнайгаз', 'өф', 'пәк', 'пед.', 'ркфср', 'рнқ', 'рсфср', 'рф', 'свс', 'сву', 'сду', 'сес', 'сент.', 'см', 'снпс', 'солт.', 'солт.', 'сооно', 'ссро', 'сср', 'ссср', 'ссс', 'сэс', 'дк', 'т. б.', 'т', 'тв', 'тереңд.', 'тех.', 'тжқ', 'тмд', 'төм.', 'трлн', 'тр', 'т.', 'и.', 'м.', 'с.', 'ш.', 'т.', 'т. с. с.', 'тэц', 'уаз', 'уефа', 'еқыұ', 'ұқк', 'ұқшұ', 'февр.', 'фққ', 'фсб', 'хим.', 'хқко', 'шұар', 'шыұ', 'экон.', 'экспо', 'цтп', 'цас', 'янв.', 'dvd', 'жкт', 'ққс', 'км', 'ацат', 'юнеско', 'ббс', 'mgm', 'жск', 'зоо', 'бсн', 'өұқ', 'оар', 'боак', 'эөкк', 'хтқо', 'әөк', 'жэк', 'хдо', 'спбму', 'аф', 'сбд', 'амт', 'гсдп', 'гсбп', 'эыдұ', 'нұсжп', 'шыұ', 'жтсх', 'хдп', 'эқк', 'фкққ', 'пиқ', 'өгк', 'мбф', 'маж', 'кота', 'тж', 'ук', 'обб', 'сбл', 'жхл', 'кмс', 'бмтрк', 'жққ', 'бхооо', 'мқо', 'ржмб', 'гулаг', 'жко', 'еэы', 'еаэы', 'кхдр', 'рфкп', 'рлдп', 'хвқ', 'мр', 'мт', 'кту', 'ртж', 'тим', 'мемдум', 'ксро', 'т.с.с', 'с.ш.', 'ш.б.', 'б.б.', 'руб', 'мин', 'акад.', 'ғ.', 'мм', 'мм.']
|
34 |
+
PREPOSITIVE_ABBREVIATIONS = []
|
35 |
+
NUMBER_ABBREVIATIONS = []
|
36 |
+
|
37 |
+
class AbbreviationReplacer(AbbreviationReplacer):
|
38 |
+
|
39 |
+
SENTENCE_STARTERS = []
|
40 |
+
|
41 |
+
def __init__(self, text, lang):
|
42 |
+
super().__init__(text, lang)
|
43 |
+
|
44 |
+
def replace(self):
|
45 |
+
SingleUpperCaseCyrillicLetterAtStartOfLineRule = Rule(r'(?<=^[А-ЯЁ])\.(?=\s)', '∯')
|
46 |
+
SingleUpperCaseCyrillicLetterRule = Rule(r'(?<=\s[А-ЯЁ])\.(?=\s)', '∯')
|
47 |
+
self.text = Text(self.text).apply(SingleUpperCaseCyrillicLetterAtStartOfLineRule,
|
48 |
+
SingleUpperCaseCyrillicLetterRule)
|
49 |
+
self.replace_multi_period_abbreviations()
|
50 |
+
return self.text
|
pysbd/lang/marathi.py
ADDED
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
# Grammar rules from https://gopract.com/Pages/Marathi-Grammar-Viramchinah.aspx
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard

class Marathi(Common, Standard):

    iso_code = 'mr'

    SENTENCE_BOUNDARY_REGEX = r'.*?[.!?]|.*?$'
    Punctuations = ['.', '!', '?']

    class AbbreviationReplacer(AbbreviationReplacer):
        SENTENCE_STARTERS = []
pysbd/lang/persian.py
ADDED
@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
import re

from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard
from pysbd.utils import Rule

class Persian(Common, Standard):

    iso_code = 'fa'

    Punctuations = ['?', '!', ':', '.', '؟']
    SENTENCE_BOUNDARY_REGEX = r'.*?[:\.!\?؟]|.*?\Z|.*?$'

    # Rubular: http://rubular.com/r/RX5HpdDIyv
    ReplaceColonBetweenNumbersRule = Rule(r'(?<=\d):(?=\d)', '♭')

    # Rubular: http://rubular.com/r/kPRgApNHUg
    ReplaceNonSentenceBoundaryCommaRule = Rule(r'،(?=\s\S+،)', '♬')

    class AbbreviationReplacer(AbbreviationReplacer):

        SENTENCE_STARTERS = []

        def __init__(self, text, lang):
            super().__init__(text, lang)

        def scan_for_replacements(self, txt, am, index, character_array):
            txt = re.sub('(?<={0})\.'.format(am), '∯', txt)
            return txt
pysbd/lang/polish.py
ADDED
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard

class Polish(Common, Standard):

    iso_code = 'pl'

    class AbbreviationReplacer(AbbreviationReplacer):
        SENTENCE_STARTERS = []

    class Abbreviation(Standard.Abbreviation):
        ABBREVIATIONS = ['ags', 'alb', 'ang', 'aor', 'awest', 'bałt', 'bojkow', 'bret', 'brus', 'bsł', 'bułg', 'c.b.d.o', 'c.b.d.u', 'celt', 'chorw', 'cs', 'czakaw', 'czerw', 'czes', 'dłuż', 'dniem', 'dor', 'dubrow', 'duń', 'ekaw', 'fiń', 'franc', 'gal', 'germ', 'głuż', 'gniem', 'goc', 'gr', 'grudz', 'hebr', 'het', 'hol', 'I cont', 'ie', 'ikaw', 'irań', 'irl', 'islandz', 'itd', 'itd.', 'itp', 'jekaw', 'kajkaw', 'kasz', 'kirg', 'kwiec', 'łac', 'lip', 'listop', 'lit', 'łot', 'lp', 'maced', 'mar', 'młpol', 'moraw', 'n.e', 'nb.', 'ngr', 'niem', 'nord', 'norw', 'np', 'np.', 'ok.', 'orm', 'oset', 'osk', 'p.n', 'p.n.e', 'p.o', 'pazdz', 'pers', 'pie', 'pod red.', 'podhal', 'pol', 'połab', 'port', 'prekm', 'pskow', 'psł', 'R cont', 'rez', 'rom', 'rozdz.', 'rum', 'rus', 'rys.', 'sas', 'sch', 'scs', 'serb', 'sierp', 'śl', 'sła', 'słe', 'słi', 'słow', 'sp. z o.o', 'śrdniem', 'śrgniem', 'śrirl', 'stbułg', 'stind', 'stpol', 'stpr', 'str.', 'strus', 'stwniem', 'stycz', 'sztokaw', 'szwedz', 't.', 'tj.', 'tłum.', 'toch', 'tur', 'tzn', 'ukr', 'ul', 'umbr', 'wed', 'węg', 'wlkpol', 'włos', 'wrzes', 'wyd.', 'zakarp']
        PREPOSITIVE_ABBREVIATIONS = []
        NUMBER_ABBREVIATIONS = []
pysbd/lang/russian.py
ADDED
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
import re

from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard

class Russian(Common, Standard):

    iso_code = 'ru'

    class Abbreviation(Standard.Abbreviation):
        ABBREVIATIONS = ["y", "y.e", "а", "авт", "адм.-терр", "акад", "в", "вв", "вкз", "вост.-европ", "г", "гг", "гос", "гр", "д", "деп", "дисс", "дол", "долл", "ежедн", "ж", "жен", "з", "зап", "зап.-европ", "заруб", "и", "ин", "иностр", "инст", "к", "канд", "кв", "кг", "куб", "л", "л.h", "л.н", "м", "мин", "моск", "муж", "н", "нед", "о", "п", "пгт", "пер", "пп", "пр", "просп", "проф", "р", "руб", "с", "сек", "см", "спб", "стр", "т", "тел", "тов", "тт", "тыс", "у", "у.е", "ул", "ф", "ч"]
        PREPOSITIVE_ABBREVIATIONS = []
        NUMBER_ABBREVIATIONS = []

    class AbbreviationReplacer(AbbreviationReplacer):

        SENTENCE_STARTERS = []

        def __init__(self, text, lang):
            super().__init__(text, lang)

        def replace_period_of_abbr(self, txt, abbr):
            txt = re.sub(r'(?<=\s{abbr})\.'.format(abbr=abbr.strip()), '∯', txt)
            txt = re.sub(r'(?<=\A{abbr})\.'.format(abbr=abbr.strip()), '∯', txt)
            txt = re.sub(r'(?<=^{abbr})\.'.format(abbr=abbr.strip()), '∯', txt)
            return txt
pysbd/lang/slovak.py
ADDED
@@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
import re
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.between_punctuation import BetweenPunctuation
from pysbd.lang.common import Common, Standard
from pysbd.processor import Processor
from pysbd.utils import Text
from pysbd.punctuation_replacer import replace_punctuation
from pysbd.lists_item_replacer import ListItemReplacer


class Slovak(Common, Standard):

    iso_code = 'sk'

    class ListItemReplacer(ListItemReplacer):

        def add_line_break(self):
            # We've found alphabetical lists are causing a lot of problems with abbreviations
            # with multiple periods and spaces, such as 'Company name s. r. o.'. Disabling
            # alphabetical list parsing seems like a reasonable tradeoff.

            # self.format_alphabetical_lists()
            self.format_roman_numeral_lists()
            self.format_numbered_list_with_periods()
            self.format_numbered_list_with_parens()
            return self.text

    class AbbreviationReplacer(AbbreviationReplacer):
        SENTENCE_STARTERS = []

        def replace_period_of_abbr(self, txt, abbr):
            # This is a very simple version of the original function, which makes sure
            # all of the periods in the abbreviation get replaced, not only the last one.
            # In Slovak we use a lot of abbreviations like 'Company Name s. r. o.', so it
            # is important to handle this properly.

            abbr_new = abbr.replace(".", "∯") + "∯"
            txt = txt.replace(abbr + ".", abbr_new)
            return txt

    class Abbreviation(Standard.Abbreviation):
        ABBREVIATIONS = ['č', 'no', 'nr', 's. r. o', 'ing', 'p', 'a. d', 'o. k', 'pol. pr', 'a. s. a. p', 'p. n. l', 'red', 'o.k', 'a.d', 'm.o', 'pol.pr', 'a.s.a.p', 'p.n.l', 'pp', 'sl', 'corp', 'plgr', 'tz', 'rtg', 'o.c.p', 'o. c. p', 'c.k', 'c. k', 'n.a', 'n. a', 'a.m', 'a. m', 'vz', 'i.b', 'i. b', 'ú.p.v.o', 'ú. p. v. o', 'bros', 'rsdr', 'doc', 'tu', 'ods', 'n.w.a', 'n. w. a', 'nár', 'pedg', 'paeddr', 'rndr', 'naprk', 'a.g.p', 'a. g. p', 'prof', 'pr', 'a.v', 'a. v', 'por', 'mvdr', 'nešp', 'u.s', 'u. s', 'kt', 'vyd', 'e.t', 'e. t', 'al', 'll.m', 'll. m', 'o.f.i', 'o. f. i', 'mr', 'apod', 'súkr', 'stred', 's.e.g', 's. e. g', 'sr', 'tvz', 'ind', 'var', 'etc', 'atd', 'n.o', 'n. o', 's.a', 's. a', 'např', 'a.i.i', 'a. i. i', 'a.k.a', 'a. k. a', 'konkr', 'čsl', 'odd', 'ltd', 't.z', 't. z', 'o.z', 'o. z', 'obv', 'obr', 'pok', 'tel', 'št', 'skr', 'phdr', 'xx', 'š.p', 'š. p', 'ph.d', 'ph. d', 'm.n.m', 'm. n. m', 'zz', 'roz', 'atď.', 'ev', 'v.sp', 'v. sp', 'drsc', 'mudr', 't.č', 't. č', 'el', 'os', 'co', 'r.o', 'r. o', 'str', 'p.a', 'p. a', 'zdravot', 'prek', 'gen', 'viď', 'dr', 'cca', 'p.s', 'p. s', 'zák', 'slov', 'arm', 'inc', 'max', 'd.c', 'k.o', 'a. r. k', 'd. c', 'k. o', 'a. r. k', 'soc', 'bc', 'zs', 'akad', 'sz', 'pozn', 'tr', 'nám', 'kol', 'csc', 'ul', 'sp', 'o.i', 'jr', 'zb', 'sv', 'tj', 'čs', 'tzn', 'príp', 'iv', 'hl', 'st', 'pod', 'vi', 'tis', 'stor', 'rozh', 'mld', 'atď', 'mgr', 'a.s', 'a. s', 'phd', 'z.z', 'z. z', 'judr', 'ing', 'hod', 'vs', 'písm', 's.r.o', 'min', 'ml', 'iii', 't.j', 't. j', 'spol', 'mil', 'ii', 'napr', 'resp', 'tzv']
        PREPOSITIVE_ABBREVIATIONS = ['st', 'p', 'dr', 'mudr', 'judr', 'ing', 'mgr', 'bc', 'drsc', 'doc', 'prof']
        NUMBER_ABBREVIATIONS = ['č', 'no', 'nr']

    class BetweenPunctuation(BetweenPunctuation):
        # Rubular: https://rubular.com/r/rImWbaYFtHHtf4
        BETWEEN_SLOVAK_DOUBLE_QUOTES_REGEX = r'„(?>[^“\\]+|\\{2}|\\.)*“'
        BETWEEN_SLOVAK_DOUBLE_QUOTES_REGEX_2 = r'\„(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)\“'

        def sub_punctuation_between_slovak_double_quotes(self, txt):
            return re.sub(self.BETWEEN_SLOVAK_DOUBLE_QUOTES_REGEX_2, replace_punctuation, txt)

        def sub_punctuation_between_quotes_and_parens(self, txt):
            txt = self.sub_punctuation_between_single_quotes(txt)
            txt = self.sub_punctuation_between_single_quote_slanted(txt)
            txt = self.sub_punctuation_between_double_quotes(txt)
            txt = self.sub_punctuation_between_square_brackets(txt)
            txt = self.sub_punctuation_between_parens(txt)
            txt = self.sub_punctuation_between_quotes_arrow(txt)
            txt = self.sub_punctuation_between_em_dashes(txt)
            txt = self.sub_punctuation_between_quotes_slanted(txt)
            txt = self.sub_punctuation_between_slovak_double_quotes(txt)
            return txt

    class Processor(Processor):

        def __init__(self, text, lang, char_span=False):
            super().__init__(text, lang, char_span)

        def process(self):
            if not self.text:
                return self.text
            self.text = self.text.replace('\n', '\r')

            # Here we use the language specific ListItemReplacer:
            li = self.lang.ListItemReplacer(self.text)
            self.text = li.add_line_break()

            self.replace_abbreviations()
            self.replace_numbers()
            self.replace_continuous_punctuation()
            self.replace_periods_before_numeric_references()
            self.text = Text(self.text).apply(
                self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
                self.lang.GeoLocationRule, self.lang.FileFormatRule)
            postprocessed_sents = self.split_into_segments()
            return postprocessed_sents

        def replace_numbers(self):
            self.text = Text(self.text).apply(*self.lang.Numbers.All)
            self.replace_period_in_slovak_dates()
            self.replace_period_in_ordinal_numerals()
            self.replace_period_in_roman_numerals()
            return self.text

        def replace_period_in_ordinal_numerals(self):
            # Rubular: https://rubular.com/r/0HkmvzMGTqgWs6
            self.text = re.sub(r'(?<=\d)\.(?=\s*[a-z]+)', '∯', self.text)

        def replace_period_in_roman_numerals(self):
            # Rubular: https://rubular.com/r/XlzTIi7aBRThSl
            self.text = re.sub(r'((\s+[VXI]+)|(^[VXI]+))(\.)(?=\s+)', r'\1∯', self.text, flags=re.IGNORECASE)

        def replace_period_in_slovak_dates(self):
            MONTHS = ['Január', 'Február', 'Marec', 'Apríl', 'Máj', 'Jún', 'Júl', 'August', 'September', 'Október', 'November', 'December',
                      'Januára', 'Februára', 'Marca', 'Apríla', 'Mája', 'Júna', 'Júla', 'Augusta', 'Septembra', 'Októbra', 'Novembra', 'Decembra']
            for month in MONTHS:
                # Rubular: https://rubular.com/r/dGLZqsbjcdJvCd
                self.text = re.sub(r'(?<=\d)\.(?=\s*{month})'.format(month=month), '∯', self.text)
pysbd/lang/spanish.py
ADDED
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard

class Spanish(Common, Standard):

    iso_code = 'es'

    class AbbreviationReplacer(AbbreviationReplacer):
        SENTENCE_STARTERS = []

    class Abbreviation(Standard.Abbreviation):
        ABBREVIATIONS = ['a.c', 'a/c', 'abr', 'adj', 'admón', 'afmo', 'ago', 'almte', 'ap', 'apdo', 'arq', 'art', 'atte', 'av', 'avda', 'bco', 'bibl', 'bs. as', 'c', 'c.f', 'c.g', 'c/c', 'c/u', 'cap', 'cc.aa', 'cdad', 'cm', 'co', 'cra', 'cta', 'cv', 'd.e.p', 'da', 'dcha', 'dcho', 'dep', 'dic', 'dicc', 'dir', 'dn', 'doc', 'dom', 'dpto', 'dr', 'dra', 'dto', 'ee', 'ej', 'en', 'entlo', 'esq', 'etc', 'excmo', 'ext', 'f.c', 'fca', 'fdo', 'febr', 'ff. aa', 'ff.cc', 'fig', 'fil', 'fra', 'g.p', 'g/p', 'gob', 'gr', 'gral', 'grs', 'hnos', 'hs', 'igl', 'iltre', 'imp', 'impr', 'impto', 'incl', 'ing', 'inst', 'izdo', 'izq', 'izqdo', 'j.c', 'jue', 'jul', 'jun', 'kg', 'km', 'lcdo', 'ldo', 'let', 'lic', 'ltd', 'lun', 'mar', 'may', 'mg', 'min', 'mié', 'mm', 'máx', 'mín', 'mt', 'n. del t', 'n.b', 'no', 'nov', 'ntra. sra', 'núm', 'oct', 'p', 'p.a', 'p.d', 'p.ej', 'p.v.p', 'párrf', 'ppal', 'prev', 'prof', 'prov', 'ptas', 'pts', 'pza', 'pág', 'págs', 'párr', 'q.e.g.e', 'q.e.p.d', 'q.e.s.m', 'reg', 'rep', 'rr. hh', 'rte', 's', 's. a', 's.a.r', 's.e', 's.l', 's.r.c', 's.r.l', 's.s.s', 's/n', 'sdad', 'seg', 'sept', 'sig', 'sr', 'sra', 'sres', 'srta', 'sta', 'sto', 'sáb', 't.v.e', 'tamb', 'tel', 'tfno', 'ud', 'uu', 'uds', 'univ', 'v.b', 'v.e', 'vd', 'vds', 'vid', 'vie', 'vol', 'vs', 'vto', 'a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
        PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
        NUMBER_ABBREVIATIONS = ['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel']
pysbd/lang/urdu.py
ADDED
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.lang.common import Common, Standard

class Urdu(Common, Standard):

    iso_code = 'ur'

    SENTENCE_BOUNDARY_REGEX = r'.*?[۔؟!\?]|.*?$'
    Punctuations = ['?', '!', '۔', '؟']

    class AbbreviationReplacer(AbbreviationReplacer):
        SENTENCE_STARTERS = []
pysbd/languages.py
ADDED
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
from pysbd.lang.english import English
from pysbd.lang.hindi import Hindi
from pysbd.lang.marathi import Marathi
from pysbd.lang.chinese import Chinese
from pysbd.lang.spanish import Spanish
from pysbd.lang.amharic import Amharic
from pysbd.lang.arabic import Arabic
from pysbd.lang.armenian import Armenian
from pysbd.lang.bulgarian import Bulgarian
from pysbd.lang.urdu import Urdu
from pysbd.lang.russian import Russian
from pysbd.lang.polish import Polish
from pysbd.lang.persian import Persian
from pysbd.lang.dutch import Dutch
from pysbd.lang.danish import Danish
from pysbd.lang.french import French
from pysbd.lang.burmese import Burmese
from pysbd.lang.greek import Greek
from pysbd.lang.italian import Italian
from pysbd.lang.japanese import Japanese
from pysbd.lang.deutsch import Deutsch
from pysbd.lang.kazakh import Kazakh
from pysbd.lang.slovak import Slovak

LANGUAGE_CODES = {
    'en': English,
    'hi': Hindi,
    'mr': Marathi,
    'zh': Chinese,
    'es': Spanish,
    'am': Amharic,
    'ar': Arabic,
    'hy': Armenian,
    'bg': Bulgarian,
    'ur': Urdu,
    'ru': Russian,
    'pl': Polish,
    'fa': Persian,
    'nl': Dutch,
    'da': Danish,
    'fr': French,
    'my': Burmese,
    'el': Greek,
    'it': Italian,
    'ja': Japanese,
    'de': Deutsch,
    'kk': Kazakh,
    'sk': Slovak
}


class Language(object):

    def __init__(self, code):
        self.code = code

    @classmethod
    def get_language_code(cls, code):
        try:
            return LANGUAGE_CODES[code]
        except KeyError:
            raise ValueError("Provide valid language ID i.e. ISO code. "
                             "Available codes are : {}".format(set(LANGUAGE_CODES.keys())))
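
A quick sketch of how a caller resolves a language module through this table; it only exercises the code shown above:

from pysbd.languages import Language

lang_module = Language.get_language_code('hy')   # -> the Armenian language class
print(lang_module.iso_code)                      # 'hy'

try:
    Language.get_language_code('xx')
except ValueError as err:
    print(err)                                   # lists the available ISO codes
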
pysbd/lists_item_replacer.py
ADDED
@@ -0,0 +1,240 @@
# -*- coding: utf-8 -*-
import string
import re
from pysbd.utils import Rule, Text
from functools import partial


class ListItemReplacer(object):

    ROMAN_NUMERALS = "i ii iii iv v vi vii viii ix x xi xii xiii xiv x xi xii xiii xv xvi xvii xviii xix xx".split(' ')
    LATIN_NUMERALS = list(string.ascii_lowercase)

    # Rubular: http://rubular.com/r/XcpaJKH0sz
    ALPHABETICAL_LIST_WITH_PERIODS = r'(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)'

    # Rubular: http://rubular.com/r/Gu5rQapywf
    # TODO: Make sure below regex call is case-insensitive
    ALPHABETICAL_LIST_WITH_PARENS = r'(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

    # (pattern, replacement)
    SubstituteListPeriodRule = Rule('♨', '∯')
    ListMarkerRule = Rule('☝', '')

    # Rubular: http://rubular.com/r/Wv4qLdoPx7
    # https://regex101.com/r/62YBlv/1
    SpaceBetweenListItemsFirstRule = Rule(r'(?<=\S\S)\s(?=\S\s*\d+♨)', "\r")

    # Rubular: http://rubular.com/r/AizHXC6HxK
    # https://regex101.com/r/62YBlv/2
    SpaceBetweenListItemsSecondRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}♨)', "\r")

    # Rubular: http://rubular.com/r/GE5q6yID2j
    # https://regex101.com/r/62YBlv/3
    SpaceBetweenListItemsThirdRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}☝)', "\r")

    NUMBERED_LIST_REGEX_1 = r'\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)|(?<=s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))'
    # 1. abcd
    # 2. xyz
    NUMBERED_LIST_REGEX_2 = r'(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))'
    # 1) abcd
    # 2) xyz
    NUMBERED_LIST_PARENS_REGEX = r'\d{1,2}(?=\)\s)'

    # Rubular: http://rubular.com/r/NsNFSqrNvJ
    # TODO: Make sure below regex call is case-insensitive
    EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX = r'\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

    # Rubular: http://rubular.com/r/wMpnVedEIb
    # TODO: Make sure below regex call is case-insensitive
    ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX = r'(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\.'

    # Rubular: http://rubular.com/r/GcnmQt4a3I
    ROMAN_NUMERALS_IN_PARENTHESES = r'\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])'

    def __init__(self, text):
        self.text = text

    def add_line_break(self):
        self.format_alphabetical_lists()
        self.format_roman_numeral_lists()
        self.format_numbered_list_with_periods()
        self.format_numbered_list_with_parens()
        return self.text

    def replace_parens(self):
        text = re.sub(self.ROMAN_NUMERALS_IN_PARENTHESES,
                      r'&✂&\1&⌬&', self.text)
        return text

    def format_numbered_list_with_parens(self):
        self.replace_parens_in_numbered_list()
        self.add_line_breaks_for_numbered_list_with_parens()
        self.text = Text(self.text).apply(self.ListMarkerRule)

    def replace_periods_in_numbered_list(self):
        self.scan_lists(self.NUMBERED_LIST_REGEX_1, self.NUMBERED_LIST_REGEX_2,
                        '♨', strip=True)

    def format_numbered_list_with_periods(self):
        self.replace_periods_in_numbered_list()
        self.add_line_breaks_for_numbered_list_with_periods()
        self.text = Text(self.text).apply(self.SubstituteListPeriodRule)

    def format_alphabetical_lists(self):
        self.txt = self.add_line_breaks_for_alphabetical_list_with_periods(
            roman_numeral=False)
        self.txt = self.add_line_breaks_for_alphabetical_list_with_parens(
            roman_numeral=False)
        return self.txt

    def format_roman_numeral_lists(self):
        self.txt = self.add_line_breaks_for_alphabetical_list_with_periods(
            roman_numeral=True)
        self.txt = self.add_line_breaks_for_alphabetical_list_with_parens(
            roman_numeral=True)
        return self.txt

    def add_line_breaks_for_alphabetical_list_with_periods(
            self, roman_numeral=False):
        txt = self.iterate_alphabet_array(
            self.ALPHABETICAL_LIST_WITH_PERIODS,
            roman_numeral=roman_numeral)
        return txt

    def add_line_breaks_for_alphabetical_list_with_parens(self, roman_numeral=False):
        txt = self.iterate_alphabet_array(
            self.ALPHABETICAL_LIST_WITH_PARENS,
            parens=True,
            roman_numeral=roman_numeral)
        return txt

    def scan_lists(self, regex1, regex2, replacement, strip=False):
        list_array = re.findall(regex1, self.text)
        list_array = list(map(int, list_array))
        for ind, item in enumerate(list_array):
            # to avoid IndexError
            # ruby returns nil if index is out of range
            if (ind < len(list_array) - 1 and item + 1 == list_array[ind + 1]):
                self.substitute_found_list_items(regex2, item, strip, replacement)
            elif ind > 0:
                if (((item - 1) == list_array[ind - 1]) or
                        ((item == 0) and (list_array[ind - 1] == 9)) or
                        ((item == 9) and (list_array[ind - 1] == 0))):
                    self.substitute_found_list_items(regex2, item, strip, replacement)

    def substitute_found_list_items(self, regex, each, strip, replacement):

        def replace_item(match, val=None, strip=False, repl='♨'):
            match = match.group()
            if strip:
                match = str(match).strip()
            chomped_match = match if len(match) == 1 else match.strip('.])')
            if str(each) == chomped_match:
                return "{}{}".format(each, replacement)
            else:
                return str(match)

        self.text = re.sub(regex, partial(replace_item, val=each,
                                          strip=strip, repl=replacement), self.text)

    def add_line_breaks_for_numbered_list_with_periods(self):
        if ('♨' in self.text) and (not re.search(
                '♨.+(\n|\r).+♨', self.text)) and (not re.search(
                r'for\s\d{1,2}♨\s[a-z]', self.text)):
            self.text = Text(self.text).apply(self.SpaceBetweenListItemsFirstRule,
                                              self.SpaceBetweenListItemsSecondRule)

    def replace_parens_in_numbered_list(self):
        self.scan_lists(
            self.NUMBERED_LIST_PARENS_REGEX, self.NUMBERED_LIST_PARENS_REGEX, '☝')
        self.scan_lists(self.NUMBERED_LIST_PARENS_REGEX, self.NUMBERED_LIST_PARENS_REGEX, '☝')

    def add_line_breaks_for_numbered_list_with_parens(self):
        if '☝' in self.text and not re.search("☝.+\n.+☝|☝.+\r.+☝", self.text):
            self.text = Text(self.text).apply(
                self.SpaceBetweenListItemsThirdRule)

    def replace_alphabet_list(self, a):
        """
        Input: 'a. ffegnog b. fgegkl c.'
        Output: \ra∯ ffegnog \rb∯ fgegkl \rc∯
        """

        def replace_letter_period(match, val=None):
            match = match.group()
            match_wo_period = match.strip('.')
            if match_wo_period == val:
                return '\r{}∯'.format(match_wo_period)
            else:
                return match

        txt = re.sub(self.ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX,
                     partial(replace_letter_period, val=a),
                     self.text, flags=re.IGNORECASE)
        return txt

    def replace_alphabet_list_parens(self, a):
        """
        Input: "a) ffegnog (b) fgegkl c)"
        Output: "\ra) ffegnog \r&✂&b) fgegkl \rc)"
        """

        def replace_alphabet_paren(match, val=None):
            match = match.group()
            if '(' in match:
                match_wo_paren = match.strip('(')
                if match_wo_paren == val:
                    return '\r&✂&{}'.format(match_wo_paren)
                else:
                    return match
            else:
                if match == val:
                    return '\r{}'.format(match)
                else:
                    return match

        # Make it case-insensitive
        txt = re.sub(self.EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX,
                     partial(replace_alphabet_paren, val=a),
                     self.text, flags=re.IGNORECASE)
        return txt

    def replace_correct_alphabet_list(self, a, parens):
        if parens:
            a = self.replace_alphabet_list_parens(a)
        else:
            a = self.replace_alphabet_list(a)
        return a

    def last_array_item_replacement(self, a, i, alphabet, list_array, parens):
        if (len(alphabet) == 0) & (len(list_array) == 0) or (
                list_array[i - 1] not in alphabet) or (a not in alphabet):
            return self.text
        if abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1:
            return self.text
        result = self.replace_correct_alphabet_list(a, parens)
        return result

    def other_items_replacement(self, a, i, alphabet, list_array, parens):
        if (len(alphabet) == 0) & (len(list_array) == 0) or (
                list_array[i - 1] not in alphabet) or (a not in alphabet) or (
                list_array[i + 1] not in alphabet):
            return self.text
        if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 and \
                abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1:
            return self.text
        result = self.replace_correct_alphabet_list(a, parens)
        return result

    def iterate_alphabet_array(self, regex, parens=False, roman_numeral=False):
        list_array = re.findall(regex, self.text)
        alphabet = self.ROMAN_NUMERALS if roman_numeral else self.LATIN_NUMERALS
        list_array = [i for i in list_array if i in alphabet]
        for ind, each in enumerate(list_array):
            if ind == len(list_array) - 1:
                self.text = self.last_array_item_replacement(each, ind, alphabet, list_array, parens)
            else:
                self.text = self.other_items_replacement(
                    each, ind, alphabet, list_array, parens)
        return self.text
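
A small usage sketch for the list handling above; the sample string is made up, and the exact placement of the inserted \r breaks follows the rules shown:

from pysbd.lists_item_replacer import ListItemReplacer

text = "Shopping list 1. apples 2. oranges 3. pears"
# numbered list markers are detected and carriage-return breaks are inserted
# so each item can later be split into its own segment
print(repr(ListItemReplacer(text).add_line_break()))
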
pysbd/processor.py
ADDED
@@ -0,0 +1,210 @@
# -*- coding: utf-8 -*-
import re
from pysbd.utils import Text
from pysbd.lists_item_replacer import ListItemReplacer
from pysbd.exclamation_words import ExclamationWords
from pysbd.between_punctuation import BetweenPunctuation
from pysbd.abbreviation_replacer import AbbreviationReplacer

class Processor(object):

    def __init__(self, text, lang, char_span=False):
        """Process a text - do pre and post processing - to get proper sentences

        Parameters
        ----------
        text : str
            Original text
        lang : object
            Language module
        char_span : bool, optional
            Get start & end character offsets of each sentence
            within original text, by default False
        """
        self.text = text
        self.lang = lang
        self.char_span = char_span

    def process(self):
        if not self.text:
            return self.text
        self.text = self.text.replace('\n', '\r')
        li = ListItemReplacer(self.text)
        self.text = li.add_line_break()
        self.replace_abbreviations()
        self.replace_numbers()
        self.replace_continuous_punctuation()
        self.replace_periods_before_numeric_references()
        self.text = Text(self.text).apply(
            self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
            self.lang.GeoLocationRule, self.lang.FileFormatRule)
        postprocessed_sents = self.split_into_segments()
        return postprocessed_sents

    def rm_none_flatten(self, sents):
        """Remove None values and unpack list of list sents

        Parameters
        ----------
        sents : list
            list of sentences

        Returns
        -------
        list
            unpacked and None removed list of sents
        """
        sents = list(filter(None, sents))
        if not any(isinstance(s, list) for s in sents):
            return sents
        new_sents = []
        for sent in sents:
            if isinstance(sent, list):
                for s in sent:
                    new_sents.append(s)
            else:
                new_sents.append(sent)
        return new_sents

    def split_into_segments(self):
        self.check_for_parens_between_quotes()
        sents = self.text.split('\r')
        # remove empty and none values
        sents = self.rm_none_flatten(sents)
        sents = [
            Text(s).apply(self.lang.SingleNewLineRule, *self.lang.EllipsisRules.All)
            for s in sents
        ]

        # # THESE LINES ARE NOT PRESENT IN THE ORIGINAL CODE --> ONLY USE FOR HYW
        # sents = [self.post_process_segments(s) for s in sents]
        # sents = self.rm_none_flatten(sents)

        sents = [self.check_for_punctuation(s) for s in sents]
        # flatten list of list of sentences
        sents = self.rm_none_flatten(sents)
        postprocessed_sents = []
        for sent in sents:
            sent = Text(sent).apply(*self.lang.SubSymbolsRules.All)
            post_process_sent = self.post_process_segments(sent)
            if post_process_sent and isinstance(post_process_sent, str):
                postprocessed_sents.append(post_process_sent)
            elif isinstance(post_process_sent, list):
                for pps in post_process_sent:
                    postprocessed_sents.append(pps)
        postprocessed_sents = [Text(ns).apply(self.lang.SubSingleQuoteRule)
                               for ns in postprocessed_sents]
        return postprocessed_sents

    def post_process_segments(self, txt):
        if len(txt) > 2 and re.search(r'\A[a-zA-Z]*\Z', txt):
            return txt

        # below condition present in pragmatic segmenter
        # don't know significance of it yet.
        # if self.consecutive_underscore(txt) or len(txt) < 2:
        #     return txt

        if re.match(r'\t', txt):
            pass

        # TODO:
        # Decide on keeping or removing Standard.ExtraWhiteSpaceRule
        # removed to retain original text spans
        # txt = Text(txt).apply(*ReinsertEllipsisRules.All,
        #                       Standard.ExtraWhiteSpaceRule)
        txt = Text(txt).apply(*self.lang.ReinsertEllipsisRules.All)
        if re.search(self.lang.QUOTATION_AT_END_OF_SENTENCE_REGEX, txt):
            txt = re.split(
                self.lang.SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX, txt)
            return txt
        else:
            txt = txt.replace('\n', '')
            return txt.strip()

    def check_for_parens_between_quotes(self):
        def paren_replace(match):
            match = match.group()
            sub1 = re.sub(r'\s(?=\()', '\r', match)
            sub2 = re.sub(r'(?<=\))\s', '\r', sub1)
            return sub2
        self.text = re.sub(self.lang.PARENS_BETWEEN_DOUBLE_QUOTES_REGEX,
                           paren_replace, self.text)

    def replace_continuous_punctuation(self):
        def continuous_puncs_replace(match):
            match = match.group()
            sub1 = re.sub(re.escape('!'), '&ᓴ&', match)
            sub2 = re.sub(re.escape('?'), '&ᓷ&', sub1)
            return sub2
        self.text = re.sub(self.lang.CONTINUOUS_PUNCTUATION_REGEX,
                           continuous_puncs_replace, self.text)

    def replace_periods_before_numeric_references(self):
        # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703
        self.text = re.sub(self.lang.NUMBERED_REFERENCE_REGEX,
                           r"∯\2\r\7", self.text)

    def consecutive_underscore(self, txt):
        # Rubular: http://rubular.com/r/fTF2Ff3WBL
        txt = re.sub(r'_{3,}', '', txt)
        return len(txt) == 0

    def check_for_punctuation(self, txt):
        if any(p in txt for p in self.lang.Punctuations):
            sents = self.process_text(txt)
            return sents
        else:
            # NOTE: next steps of check_for_punctuation will unpack this list
            return [txt]

    def process_text(self, txt):
        if txt[-1] not in self.lang.Punctuations:
            txt += 'ȸ'
        txt = ExclamationWords.apply_rules(txt)
        txt = self.between_punctuation(txt)
        # handle text having only doublepunctuations
        if not re.match(self.lang.DoublePunctuationRules.DoublePunctuation, txt):
            txt = Text(txt).apply(*self.lang.DoublePunctuationRules.All)
        txt = Text(txt).apply(self.lang.QuestionMarkInQuotationRule,
                              *self.lang.ExclamationPointRules.All)
        txt = ListItemReplacer(txt).replace_parens()
        txt = self.sentence_boundary_punctuation(txt)
        return txt

    def replace_numbers(self):
        self.text = Text(self.text).apply(*self.lang.Numbers.All)

    def abbreviations_replacer(self):
        if hasattr(self.lang, "AbbreviationReplacer"):
            return self.lang.AbbreviationReplacer(self.text, self.lang)
        else:
            return AbbreviationReplacer(self.text, self.lang)

    def replace_abbreviations(self):
        self.text = self.abbreviations_replacer().replace()

    def between_punctuation_processor(self, txt):
        if hasattr(self.lang, "BetweenPunctuation"):
            return self.lang.BetweenPunctuation(txt)
        else:
            return BetweenPunctuation(txt)

    def between_punctuation(self, txt):
        txt = self.between_punctuation_processor(txt).replace()
        return txt

    def sentence_boundary_punctuation(self, txt):
        if hasattr(self.lang, 'ReplaceColonBetweenNumbersRule'):
            txt = Text(txt).apply(
                self.lang.ReplaceColonBetweenNumbersRule)
        if hasattr(self.lang, 'ReplaceNonSentenceBoundaryCommaRule'):
            txt = Text(txt).apply(
                self.lang.ReplaceNonSentenceBoundaryCommaRule)
        # retain exclamation mark if it is an ending character of a given text
        txt = re.sub(r'&ᓴ&$', '!', txt)
        txt = [
            m.group() for m in re.finditer(self.lang.SENTENCE_BOUNDARY_REGEX, txt)
        ]
        return txt
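
A minimal sketch of driving the Processor directly with the English rules; normally the Segmenter below does this for you, and the sample sentence is made up:

from pysbd.lang.english import English
from pysbd.processor import Processor

sents = Processor("Hello world. Mr. Smith arrived today.", English).process()
# expected: a list with the two sentences, since 'Mr.' is handled as an abbreviation
print(sents)
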
pysbd/punctuation_replacer.py
ADDED
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
import re
from pysbd.utils import Rule, Text


class EscapeRegexReservedCharacters(object):
    LeftParen = Rule(r'\(', '\\(')
    RightParen = Rule(r'\)', '\\)')
    # LeftParen = Rule(re.escape(r'('), '(')
    # RightParen = Rule(re.escape(r')'), ')')
    LeftBracket = Rule(r'\[', '\\[')
    RightBracket = Rule(r'\]', '\\]')
    Dash = Rule(r'\-', '\\-')

    All = [LeftParen, RightParen, LeftBracket, RightBracket, Dash]


class SubEscapedRegexReservedCharacters(object):
    SubLeftParen = Rule(r'\\\(', '(')
    SubRightParen = Rule(r'\\\)', ')')
    # SubLeftParen = Rule(re.escape(r"\\("), "(")
    # SubRightParen = Rule(re.escape(r'\\)'), ')')
    SubLeftBracket = Rule(r'\\\[', '[')
    SubRightBracket = Rule(r'\\\]', ']')
    SubDash = Rule(r'\\\-', '-')

    All = [
        SubLeftParen, SubRightParen, SubLeftBracket, SubRightBracket, SubDash
    ]


def replace_punctuation(match, match_type=None):
    text = Text(match.group()).apply(*EscapeRegexReservedCharacters.All)
    sub = re.sub(r'\.', '∯', text)
    sub = re.sub(r'։', '⍟', sub)  # ADDED FOR ARMENIAN
    sub_1 = re.sub(r'\。', '&ᓰ&', sub)
    sub_2 = re.sub(r'\.', '&ᓱ&', sub_1)
    sub_3 = re.sub(r'\!', '&ᓳ&', sub_2)
    sub_4 = re.sub(r'\!', '&ᓴ&', sub_3)
    sub_5 = re.sub(r'\?', '&ᓷ&', sub_4)
    last_sub = re.sub(r'\?', '&ᓸ&', sub_5)
    if match_type != 'single':
        last_sub = re.sub(r"'", '&⎋&', last_sub)
    text = Text(last_sub).apply(*SubEscapedRegexReservedCharacters.All)
    return text
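
replace_punctuation is meant to be used as a re.sub callback: whatever span the caller's regex matches has its sentence-ending punctuation swapped for internal placeholder symbols so the boundary regex ignores it. A small sketch; the sample text and the parenthesis pattern are only for illustration:

import re
from pysbd.punctuation_replacer import replace_punctuation

masked = re.sub(r'\([^)]*\)', replace_punctuation, 'He left early (see fig. 2!) yesterday.')
print(masked)  # the '.' and '!' inside the parentheses become placeholder symbols
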
pysbd/segmenter.py
ADDED
@@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-
import re

from pysbd.languages import Language
from pysbd.processor import Processor
from pysbd.cleaner import Cleaner
from pysbd.utils import TextSpan

class Segmenter(object):

    def __init__(self, language="en", clean=False, doc_type=None, char_span=False):
        """Segments a text into a list of sentences
        with or without character offsets from original text

        Parameters
        ----------
        language : str, required
            specify a language by its two character ISO 639-1 code,
            by default "en"
        clean : bool, optional
            cleans original text, by default False
        doc_type : [type], optional
            Normal text or OCRed text, by default None
            set to `pdf` for OCRed text
        char_span : bool, optional
            Get start & end character offsets of each sentence
            within original text, by default False
        """
        self.language = language
        self.language_module = Language.get_language_code(language)
        self.clean = clean
        self.doc_type = doc_type
        self.char_span = char_span
        if self.clean and self.char_span:
            raise ValueError("char_span must be False if clean is True. "
                             "Since `clean=True` will modify original text.")
        # when doctype is pdf then force user to clean the text
        # char_span func won't be provided with pdf doctype either
        elif self.doc_type == 'pdf' and not self.clean:
            raise ValueError("`doc_type='pdf'` should have `clean=True` & "
                             "`char_span` should be False since original"
                             "text will be modified.")

    def cleaner(self, text):
        if hasattr(self.language_module, "Cleaner"):
            return self.language_module.Cleaner(text, self.language_module,
                                                doc_type=self.doc_type)
        else:
            return Cleaner(text, self.language_module, doc_type=self.doc_type)

    def processor(self, text):
        if hasattr(self.language_module, "Processor"):
            return self.language_module.Processor(text, self.language_module,
                                                  char_span=self.char_span)
        else:
            return Processor(text, self.language_module,
                             char_span=self.char_span)

    def sentences_with_char_spans(self, sentences):
        # since SENTENCE_BOUNDARY_REGEX doesn't account
        # for trailing whitespaces, \s* is used as suffix
        # to keep non-destructive text after segments joins
        sent_spans = []
        prior_end_char_idx = 0
        for sent in sentences:
            for match in re.finditer('{0}\s*'.format(re.escape(sent)), self.original_text):
                match_str = match.group()
                match_start_idx, match_end_idx = match.span()
                if match_end_idx > prior_end_char_idx:
                    # making sure the current sentence and its span
                    # is either the first sentence along with its char spans
                    # or the current sent spans adjacent to prior sentence spans
                    sent_spans.append(
                        TextSpan(match_str, match_start_idx, match_end_idx))
                    prior_end_char_idx = match_end_idx
                    break
        return sent_spans

    def segment(self, text):
        self.original_text = text
        if not text:
            return []

        if self.clean or self.doc_type == 'pdf':
            text = self.cleaner(text).clean()

        postprocessed_sents = self.processor(text).process()
        sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents)
        if self.char_span:
            return sentence_w_char_spans
        elif self.clean:
            # clean and destructed sentences
            return postprocessed_sents
        else:
            # nondestructive with whitespaces
            return [textspan.sent for textspan in sentence_w_char_spans]
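
The Segmenter is the public entry point of the vendored package. A minimal usage sketch, importing it directly from the module shown above; the sample text is made up:

from pysbd.segmenter import Segmenter

seg = Segmenter(language="en", clean=False, char_span=True)
for span in seg.segment("Hello world. How are you?"):
    # each span is a TextSpan holding the sentence text and its character offsets
    print(span.sent, span.start, span.end)
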
pysbd/utils.py
ADDED
@@ -0,0 +1,81 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import pysbd

class Rule(object):

    def __init__(self, pattern, replacement):
        self.pattern = pattern
        self.replacement = replacement

    def __repr__(self):  # pragma: no cover
        return '<{} pattern="{}" and replacement="{}">'.format(
            self.__class__.__name__, self.pattern, self.replacement)


class Text(str):
    """Extending str functionality to apply regex rules

    https://stackoverflow.com/questions/4698493/can-i-add-custom-methods-attributes-to-built-in-python-types

    Parameters
    ----------
    str : str
        string content

    Returns
    -------
    str
        input as it is if rule pattern doesn't match,
        else replacing found pattern with replacement chars
    """
    def apply(self, *rules):
        for each_r in rules:
            self = re.sub(each_r.pattern, each_r.replacement, self)
        return self


class TextSpan(object):

    def __init__(self, sent, start, end):
        """
        Sentence text and its start & end character offsets within original text

        Parameters
        ----------
        sent : str
            Sentence text
        start : int
            start character offset of a sentence in original text
        end : int
            end character offset of a sentence in original text
        """
        self.sent = sent
        self.start = start
        self.end = end

    def __repr__(self):  # pragma: no cover
        return "{0}(sent={1}, start={2}, end={3})".format(
            self.__class__.__name__, repr(self.sent), self.start, self.end)

    def __eq__(self, other):
        if isinstance(self, other.__class__):
            return self.sent == other.sent and self.start == other.start and self.end == other.end


class PySBDFactory(object):
    """pysbd as a spacy component through entrypoints"""

    def __init__(self, nlp, language='en'):
        self.nlp = nlp
        self.seg = pysbd.Segmenter(language=language, clean=False,
                                   char_span=True)

    def __call__(self, doc):
        sents_char_spans = self.seg.segment(doc.text_with_ws)
        start_token_ids = [sent.start for sent in sents_char_spans]
        for token in doc:
            token.is_sent_start = (True if token.idx
                                   in start_token_ids else False)
        return doc
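
Rule and Text are the basic building blocks used throughout the files above: a Rule is a (pattern, replacement) pair and Text.apply runs the rules in order with re.sub. A tiny sketch with a made-up rule:

from pysbd.utils import Rule, Text

MaskPeriodRule = Rule(r'\.', '∯')  # hypothetical rule for illustration
print(Text('e.g. Dr. Smith').apply(MaskPeriodRule))  # -> 'e∯g∯ Dr∯ Smith'
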