import re import regex import inflect import unicodedata from lingua import Language, LanguageDetectorBuilder from builtins import str as unicode from tn.chinese.normalizer import Normalizer as ZhNormalizer from tn.english.normalizer import Normalizer as EnNormalizer from fireredtts.modules.text_normalizer.regex_common import * from fireredtts.modules.text_normalizer.utils import * def preprocess_text(sentence): # preprocessing sentence = bytes(sentence, "utf-8").decode("utf-8", "ignore") sentence = regex.sub("[\p{Cf}--[\u200d]]", "", sentence, flags=regex.V1) sentence = regex.sub("\p{Co}", "", sentence) sentence = sentence.replace("\u00a0", " ") sentence = sentence.replace("\ufffd", "") sentence = regex.sub("\p{Zl}", "\n", sentence) sentence = regex.sub("\p{Zp}", "\n", sentence) sentence = unicode(sentence) sentence = "".join( char for char in unicodedata.normalize("NFD", sentence) if unicodedata.category(char) != "Mn" ) # Strip accents sentence = strip_kaomoji(sentence) # full to half with exemption (to be converted after number TN): 。,: sentence = f2b(sentence, exemption="。,:") # clean spaces sentence = sentence.replace("\n", ",") sentence = sentence.replace("\t", ",") sentence = sentence.replace("\r", ",") sentence = re.sub(r"[。.]{3,}", "…", sentence) sentence = re.sub(r"[…⋯]{1,}", "…", sentence) sentence = re.sub(r"[ ]+", " ", sentence) sentence = sentence.strip() # punctuation reduction result = "" for idx, char in enumerate(sentence): if char in symbol_reduction: char = symbol_reduction[char] if char == " ": if idx == 0: continue if is_chinese(sentence[idx + 1]) and ( is_chinese(sentence[idx - 1]) or sentence[idx - 1] in '") ' ): result += "," else: result += " " continue if is_valid_char(char): result += char result = re.sub(r"[ ]+", " ", result) return result def rettt(sentence): # handle abbreviations for all languages sentence = sentence.replace("&nd", "and") sentence = sentence.replace("Jan.", "january") sentence = sentence.replace("Feb.", "febrary") sentence = sentence.replace("Mar.", "march") sentence = sentence.replace("Apr.", "april") sentence = sentence.replace("May.", "may") sentence = sentence.replace("Jun.", "june") sentence = sentence.replace("Jul.", "july") sentence = sentence.replace("Aug.", "august") sentence = sentence.replace("Sept.", "september") sentence = sentence.replace("Sep.", "september") sentence = sentence.replace("Oct.", "october") sentence = sentence.replace("Nov.", "november") sentence = sentence.replace("Dec.", "december") sentence = sentence.replace("Mon.", "monday") sentence = sentence.replace("Tues.", "tuesday") sentence = sentence.replace("Wed.", "wednesday") sentence = sentence.replace("Thur.", "thursday") sentence = sentence.replace("Fri.", "friday") sentence = sentence.replace("Sat.", "saturday") if sentence != "Sun.": sentence = sentence.replace("Sun.", "sunday") sentence = re.sub(r" St\. ([A-Z])", r" saint \1", sentence) sentence = re.sub(r" St\.", " street", sentence) sentence = re.sub(r" Rd\.", " road", sentence) sentence = re.sub(r"[Aa]\.[Mm]\.", "A_M", sentence) sentence = re.sub(r"[Pp]\.[Mm]\.", "P_M", sentence) sentence = re.sub(r"[Bb]\.[Cc]\.", "B_C", sentence) sentence = re.sub(r"[Ad]\.[Dd]\.", "A_D", sentence) sentence = sentence.replace("Mr.", "mister") sentence = sentence.replace("Ms.", "miss") sentence = sentence.replace("Mrs.", "misses") sentence = sentence.replace("Ph.D", "P_H_D") sentence = sentence.replace("i.e.", "that is") sentence = sentence.replace("e.g.", "for example") sentence = sentence.replace("btw.", "by the way") sentence = sentence.replace("btw", "by the way") sentence = sentence.replace("b.t.w.", "by the way") sentence = sentence.replace("@", " at ") return sentence class TextNormalizer: def __init__(self): self.language_detector = LanguageDetectorBuilder.from_languages( Language.ENGLISH, Language.CHINESE ).build() self.zh_normalizer = ZhNormalizer() self.en_normalizer = EnNormalizer() self.inflect_parser = inflect.engine() self.lang2token = {Language.ENGLISH: "en", Language.CHINESE: "zh"} def tn(self, text): text = preprocess_text(text) text = rettt(text) # regex replacements # for non chinese languages language = self.language_detector.detect_language_of(text) # enforce chinese if text contains any chinese character if contains_chinese(text): language = Language.CHINESE text_lang = self.lang2token.get(language, "zh") if is_upper_eng_and_digit(text): language = Language.CHINESE if language == Language.CHINESE: text = self.zh_normalizer.normalize(text) text = text.replace("\n", "") text = re.sub(r"[,,]+$", "。", text) else: text = re.sub(r"[^ 0-9A-Za-z\[\]'.,:?!_\-]", "", text) text = self.en_normalizer.normalize(text) # fallback number normalization pieces = re.split(r"(\d+)", text) text = "".join( [ self.inflect_parser.number_to_words(p) if p.isnumeric() else p for p in pieces if len(p) > 0 ] ) # cleanup text = text.replace("_", " ") text = re.sub(r"[ ]+", " ", text) # spell caplital words pieces = re.split(r"([A-Z]{2,4}|[ ])", text) for idx, p in enumerate(pieces): if re.match("[A-Z]{2,4}", p): pieces[idx] = " ".join(p) text = " ".join([p for p in pieces if p != " "]) # post TN full to half text = text.replace("。", ".") text = text.replace(",", ",") text = text.replace(":", ":") # model limitations text = text.lower().strip() text = text.replace('"', "") text = text.replace("·", " ") text = re.sub("[…~、!,?:;!?:;]+", ",", text) text = re.sub("[,]+", ",", text) text = re.sub(r"[,. ]+$", ".", text) if len(text) > 0 and text[-1] != ".": text = text + "." return text, text_lang