""" Split text to sentences. Modifed from seg_text.seg_text.py, vtext sentence_splitter removed, use Use sentence_splitter if supported, else use polyglot.text.Text !apt install libicu-dev !install pyicu pycld2 !pip install polyglot sentence_splitter Use vtext and fastlid to rid of polyglot? from vtext.tokenize_sentence import UnicodeSentenceTokenizer, PunctuationTokenizer tok = UnicodeSentenceTokenizer() seg = tok.tokenize(''' Text ''') for langs not in LANG_S """ # pylint: disable=invalid-name import re from typing import List, Optional, Union import pysbd from fastlid import fastlid from loguru import logger from tqdm.auto import tqdm def _seg_text( text: str, lang: Optional[str] = None, ) -> List[str]: """ Split text to sentences. Switched to pysbd Args: ---- text: string to split lang: language, two-letter ISO (22 languages) Returns: ------- List of segmented sentences """ if lang is None: try: lang, _ = fastlid(text) except Exception as exc: logger.warning(" fastlid: %s, setting lang='en'", exc) lang = "en" if not text.strip(): return [] # pysbd only understands {'ja', 'am', 'bg', 'ur', 'hi', 'de', 'da', 'fr', 'el', 'fa', 'ru', 'ar', 'my', 'kk', 'pl', 'sk', 'en', 'hy', 'zh', 'mr', 'nl', 'it', 'es'}, 23 try: seg = pysbd.Segmenter(language=lang, clean=True) except Exception as exc: # fall back to 'en' logger.error(exc) logger.warning( f" pysbd probably does not understand {lang} " "fall back to 'en'" ) seg = pysbd.Segmenter(language="en", clean=True) try: # _ = tok.tokenize(text) _ = seg.segment(text) except Exception as exc: logger.exception(f"pysbd.Segmenter, {exc=}") raise return _ def seg_text( lst: Union[str, List[str]], lang: Optional[str] = None, maxlines: int = 1000, extra: Optional[str] = None, ) -> List[str]: """Split a list of text. Arguments: lst: text or text list lang: optional lang code maxlines: (default 1000), threshold for turn on tqdm progressbar, set to <1 or a large number to turn it off extra: re.split(rf"{extra}, text) first Returns: list of splitted text. """ if isinstance(lst, str): lst = [lst] if extra: # insert \n lst = [re.sub(rf"({extra})", r"\1\n", elm) for elm in lst] res = [] for elm in lst: res.extend( _seg_text( elm, lang=lang, # maxlines=maxlines, # flag=False, ) ) return res