Spaces:
Running
on
Zero
Running
on
Zero
File size: 6,678 Bytes
37ced70 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
import re
import regex
import inflect
import unicodedata
from lingua import Language, LanguageDetectorBuilder
from builtins import str as unicode
from tn.chinese.normalizer import Normalizer as ZhNormalizer
from tn.english.normalizer import Normalizer as EnNormalizer
from fireredtts.modules.text_normalizer.regex_common import *
from fireredtts.modules.text_normalizer.utils import *
def preprocess_text(sentence):
    """Clean and canonicalize raw input text before text normalization.

    Steps: drop undecodable bytes and invisible/private-use code points,
    strip accents (NFD, remove combining marks), remove kaomoji, convert
    full-width to half-width characters, collapse whitespace, and reduce
    punctuation to a small valid set.

    Args:
        sentence: Raw input string (any Unicode).

    Returns:
        The cleaned string.
    """
    # Drop bytes that do not round-trip through UTF-8.
    sentence = bytes(sentence, "utf-8").decode("utf-8", "ignore")
    # Remove format characters (Cf) except ZWJ (\u200d), and private-use (Co).
    # regex.V1 enables the set-difference syntax [\p{Cf}--[...]].
    sentence = regex.sub(r"[\p{Cf}--[\u200d]]", "", sentence, flags=regex.V1)
    sentence = regex.sub(r"\p{Co}", "", sentence)
    sentence = sentence.replace("\u00a0", " ")  # non-breaking space -> space
    sentence = sentence.replace("\ufffd", "")  # drop replacement character
    sentence = regex.sub(r"\p{Zl}", "\n", sentence)  # line separators
    sentence = regex.sub(r"\p{Zp}", "\n", sentence)  # paragraph separators
    sentence = unicode(sentence)
    # Strip accents: decompose (NFD) and drop combining marks (Mn).
    sentence = "".join(
        char
        for char in unicodedata.normalize("NFD", sentence)
        if unicodedata.category(char) != "Mn"
    )
    sentence = strip_kaomoji(sentence)
    # full to half with exemption (to be converted after number TN): 。,:
    sentence = f2b(sentence, exemption="。,:")
    # clean spaces
    sentence = sentence.replace("\n", ",")
    sentence = sentence.replace("\t", ",")
    sentence = sentence.replace("\r", ",")
    sentence = re.sub(r"[。.]{3,}", "…", sentence)
    sentence = re.sub(r"[…⋯]{1,}", "…", sentence)
    sentence = re.sub(r"[ ]+", " ", sentence)
    sentence = sentence.strip()
    # punctuation reduction
    result = ""
    last_idx = len(sentence) - 1
    for idx, char in enumerate(sentence):
        if char in symbol_reduction:
            char = symbol_reduction[char]
        if char == " ":
            # BUGFIX: symbol_reduction may map the first/last character to a
            # space even after strip(); guard both ends so sentence[idx + 1]
            # cannot raise IndexError.
            if idx == 0 or idx == last_idx:
                continue
            # A space between Chinese characters acts as a pause -> comma.
            if is_chinese(sentence[idx + 1]) and (
                is_chinese(sentence[idx - 1]) or sentence[idx - 1] in '") '
            ):
                result += ","
            else:
                result += " "
            continue
        if is_valid_char(char):
            result += char
    result = re.sub(r"[ ]+", " ", result)
    return result
def rettt(sentence):
    """Expand common English abbreviations into speakable words.

    Handles month/weekday abbreviations, titles (Mr., Mrs., ...), and
    initialisms (a.m. -> A_M, etc.; the underscore form is spelled out
    letter-by-letter later in the pipeline).

    Args:
        sentence: Preprocessed text.

    Returns:
        The text with abbreviations expanded.
    """
    # handle abbreviations for all languages
    # NOTE(review): "&nd" looks like a typo for "&" or "and" — confirm intent.
    sentence = sentence.replace("&nd", "and")
    # Months. BUGFIX: "febrary" -> "february" (misspelling reached TTS output).
    sentence = sentence.replace("Jan.", "january")
    sentence = sentence.replace("Feb.", "february")
    sentence = sentence.replace("Mar.", "march")
    sentence = sentence.replace("Apr.", "april")
    sentence = sentence.replace("May.", "may")
    sentence = sentence.replace("Jun.", "june")
    sentence = sentence.replace("Jul.", "july")
    sentence = sentence.replace("Aug.", "august")
    sentence = sentence.replace("Sept.", "september")  # before "Sep." (prefix)
    sentence = sentence.replace("Sep.", "september")
    sentence = sentence.replace("Oct.", "october")
    sentence = sentence.replace("Nov.", "november")
    sentence = sentence.replace("Dec.", "december")
    # Weekdays.
    sentence = sentence.replace("Mon.", "monday")
    sentence = sentence.replace("Tues.", "tuesday")
    sentence = sentence.replace("Wed.", "wednesday")
    sentence = sentence.replace("Thur.", "thursday")
    sentence = sentence.replace("Fri.", "friday")
    sentence = sentence.replace("Sat.", "saturday")
    # A bare "Sun." is left alone — presumably ambiguous with the star; verify.
    if sentence != "Sun.":
        sentence = sentence.replace("Sun.", "sunday")
    # "St." before a capitalized name is "saint", otherwise "street".
    sentence = re.sub(r" St\. ([A-Z])", r" saint \1", sentence)
    sentence = re.sub(r" St\.", " street", sentence)
    sentence = re.sub(r" Rd\.", " road", sentence)
    sentence = re.sub(r"[Aa]\.[Mm]\.", "A_M", sentence)
    sentence = re.sub(r"[Pp]\.[Mm]\.", "P_M", sentence)
    sentence = re.sub(r"[Bb]\.[Cc]\.", "B_C", sentence)
    # BUGFIX: character class was "[Ad]" (matches 'A' or 'd'), so "a.d." was
    # never expanded; corrected to "[Aa]".
    sentence = re.sub(r"[Aa]\.[Dd]\.", "A_D", sentence)
    # Titles.
    sentence = sentence.replace("Mr.", "mister")
    sentence = sentence.replace("Ms.", "miss")
    sentence = sentence.replace("Mrs.", "misses")
    sentence = sentence.replace("Ph.D", "P_H_D")
    sentence = sentence.replace("i.e.", "that is")
    sentence = sentence.replace("e.g.", "for example")
    sentence = sentence.replace("btw.", "by the way")  # before bare "btw"
    sentence = sentence.replace("btw", "by the way")
    sentence = sentence.replace("b.t.w.", "by the way")
    sentence = sentence.replace("@", " at ")
    return sentence
class TextNormalizer:
    """Bilingual (Chinese/English) text normalizer for the TTS front end.

    Detects the input language with lingua, then routes the text through the
    matching WeTextProcessing normalizer, with extra cleanup passes tailored
    to the downstream model's character set.
    """

    def __init__(self):
        # Detector restricted to the two supported languages.
        self.language_detector = LanguageDetectorBuilder.from_languages(
            Language.ENGLISH, Language.CHINESE
        ).build()
        # Third-party TN engines (WeTextProcessing) for each language.
        self.zh_normalizer = ZhNormalizer()
        self.en_normalizer = EnNormalizer()
        # Fallback number-to-words engine for English digits the TN missed.
        self.inflect_parser = inflect.engine()
        # Maps detected Language to the short token returned to callers.
        self.lang2token = {Language.ENGLISH: "en", Language.CHINESE: "zh"}

    def tn(self, text):
        """Normalize `text` for synthesis.

        Returns:
            A tuple ``(normalized_text, lang_token)`` where ``lang_token``
            is "en" or "zh".
        """
        text = preprocess_text(text)
        text = rettt(text)  # regex replacements
        # for non chinese languages
        language = self.language_detector.detect_language_of(text)
        # enforce chinese if text contains any chinese character
        if contains_chinese(text):
            language = Language.CHINESE
        # Unknown/undetected languages fall back to "zh".
        text_lang = self.lang2token.get(language, "zh")
        # NOTE(review): text_lang is computed BEFORE this override, so an
        # all-caps/digit English string is normalized as Chinese but still
        # reported as "en" — confirm this asymmetry is intentional.
        if is_upper_eng_and_digit(text):
            language = Language.CHINESE
        if language == Language.CHINESE:
            text = self.zh_normalizer.normalize(text)
            text = text.replace("\n", "")
            # Trailing commas become a full stop (converted to "." below).
            text = re.sub(r"[,,]+$", "。", text)
        else:
            # Keep only characters the English normalizer/model can handle.
            text = re.sub(r"[^ 0-9A-Za-z\[\]'.,:?!_\-]", "", text)
            text = self.en_normalizer.normalize(text)
            # fallback number normalization
            pieces = re.split(r"(\d+)", text)
            text = "".join(
                [
                    self.inflect_parser.number_to_words(p) if p.isnumeric() else p
                    for p in pieces
                    if len(p) > 0
                ]
            )
            # cleanup
            text = text.replace("_", " ")
            text = re.sub(r"[ ]+", " ", text)
            # Spell out capitalized initialisms (2-4 letters) letter by letter.
            pieces = re.split(r"([A-Z]{2,4}|[ ])", text)
            for idx, p in enumerate(pieces):
                if re.match("[A-Z]{2,4}", p):
                    pieces[idx] = " ".join(p)
            text = " ".join([p for p in pieces if p != " "])
        # post TN full to half (the exemptions left by preprocess_text)
        text = text.replace("。", ".")
        text = text.replace(",", ",")
        text = text.replace(":", ":")
        # model limitations: lowercase, no quotes, punctuation reduced to ","/"."
        text = text.lower().strip()
        text = text.replace('"', "")
        text = text.replace("·", " ")
        text = re.sub("[…~、!,?:;!?:;]+", ",", text)
        text = re.sub("[,]+", ",", text)
        text = re.sub(r"[,. ]+$", ".", text)
        # Guarantee a sentence-final period.
        if len(text) > 0 and text[-1] != ".":
            text = text + "."
        return text, text_lang
|