#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import sys
import unicodedata


def Text(text):
    """Return *text* as an NFC-normalized ``str``.

    Acts as a thin wrapper around Python strings: byte input is decoded as
    UTF-8 first, and the result is always normalized to Unicode NFC so that
    composed and decomposed spellings of the same character compare equal.

    Args:
        text: A ``str`` (or ``str`` subclass), or a bytes-like object that
            provides ``decode`` and holds UTF-8 encoded text.

    Returns:
        The NFC-normalized text.

    Raises:
        UnicodeDecodeError: If byte input is not valid UTF-8.
        AttributeError: If *text* is neither a string nor has ``decode``.
    """
    if not is_unicode(text):
        text = text.decode("utf-8")
    return unicodedata.normalize("NFC", text)


def is_unicode(text):
    """Return ``True`` when *text* is a ``str`` instance (subclasses included)."""
    # isinstance (rather than an exact type comparison) keeps str subclasses
    # from being mistaken for bytes and sent through ``.decode`` in Text().
    return isinstance(text, str)


# Upper-case Latin letters plus the upper-case Vietnamese letters with
# diacritics; the lower-case set is derived from it.
UPCASE_CHARACTERS = "QWERTYUIOPASDFGHJKLZXCVBNMÀÁẠẢÃÂẦẤẬẨẪĂẰẮẶẲẴÈÉẸẺẼÊỀẾỆỂỄÌÍỊỈĨÒÓỌỎÕÔỒỐỘỔỖƠỜỚỢỞỠÙÚỤỦŨƯỪỨỰỬỮỲÝỴỶỸĐ"
LOWCASE_CHARACTERS = UPCASE_CHARACTERS.lower()

# Regex fragments for multi-character symbol sequences.
specials = [r"==>", r"->", r"\.\.\.", r">>", r"=\)\)"]
# Digit groups joined by '.', ',' or '_' (at least one separator required).
digit = r"\d+([\.,_]\d+)+"
email = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"

# urls pattern from nltk
# https://www.nltk.org/_modules/nltk/tokenize/casual.html
# with Vu Anh's modification to match the ftp protocol
#
# NOTE(review): the `urls` list below was cut off in the reviewed view (it
# stops inside a triple-quoted raw string), so the visible fragment is kept
# verbatim as a comment instead of being guessed at. Restore the complete
# definition, and everything after it, from the full source.
#
# urls = [
#     r"(ftp|http|https)?://(?:www\.)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!$&\'()*+,;=]+",
#     r"""(?:(https|http|ftp):(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:[a-z]{2,13}))
#     (?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)(...))+
#     (?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])
#     |(?:(?