File size: 6,994 Bytes
d232606 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
import logging
import regex as re
from tools.classify_language import classify_language, split_alpha_nonalpha
def check_is_none(item) -> bool:
    """Report whether ``item`` carries no usable text.

    Returns True for ``None``, for a ``str`` consisting solely of whitespace,
    and for any value whose ``str()`` form is the empty string. Every other
    value yields False.
    """
    if item is None:
        return True
    if isinstance(item, str) and str(item).isspace():
        return True
    # Covers the empty string itself as well as objects that stringify to "".
    return str(item) == ""
def markup_language(text: str, target_languages: list = None) -> str:
    """Insert ``[LANG]`` tags around each run of same-language text.

    The text is cut at punctuation, every non-blank segment is passed to
    ``classify_language``, and tags built from the upper-cased result are
    inserted in place:

    * an opening tag before the first classified segment,
    * the previous language's tag followed by the new language's tag wherever
      the classification changes,
    * a closing tag for the last language at the very end of the text.

    Args:
        text: The text to annotate.
        target_languages: Forwarded to ``classify_language``. When its sorted
            form is ``["en", "zh"]``, ``["en", "ja"]`` or
            ``["en", "ja", "zh"]``, each segment is additionally broken up
            with ``split_alpha_nonalpha`` before classification.

    Returns:
        ``text`` with the tags inserted. If no segment gets classified (empty
        or punctuation-only input) the only change is a trailing ``[]``.

    Raises:
        ValueError: If a segment cannot be located in ``text`` at or after the
            current cursor.
    """
    # Punctuation that delimits the segments handed to the classifier.
    # The double quote in the second fragment must stay escaped: unescaped it
    # terminates the raw string, and the rest of that line (``|``, ``~`` and
    # the CJK brackets/commas) silently becomes a comment instead of being
    # part of the character class.  ``[`` is escaped as well so the class
    # reads the same under engines that support nested sets.
    pattern = (
        r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
        r"\!?。\"#$%&'()*+,-/:;<=>@\[\]^_`{|}~⦅⦆「」、、〃》「」"
        r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
    )
    segments = re.split(pattern, text)

    if target_languages is not None:
        sorted_target_languages = sorted(target_languages)
        if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
            # For these language sets, refine every segment further.
            segments = [
                piece
                for segment in segments
                for piece in split_alpha_nonalpha(segment)
            ]

    pre_lang = ""
    p = 0  # everything before this index has already been processed
    for segment in segments:
        if check_is_none(segment):
            continue

        lang = classify_language(segment, target_languages)

        if pre_lang == "":
            tag = f"[{lang.upper()}]"
        elif pre_lang != lang:
            tag = f"[{pre_lang.upper()}][{lang.upper()}]"
        else:
            tag = ""

        # Locate the segment once, searching only the unprocessed tail, and
        # derive the new cursor from that position.  Searching a second time
        # after the insertion could match text inside the tag that was just
        # added (e.g. a segment "EN" right after "[EN]").
        at = text.index(segment, p)
        if tag:
            text = text[:at] + tag + text[at:]
        p = at + len(tag) + len(segment)
        pre_lang = lang

    # Close the last open language run.
    text += f"[{pre_lang.upper()}]"
    return text
def split_by_language(text: str, target_languages: list = None) -> list:
    """Split ``text`` into consecutive ``(substring, language)`` pieces.

    The text is cut at punctuation and every non-blank segment is classified
    with ``classify_language``. Adjacent segments with the same result are
    merged; a new piece starts at the first character of the segment where the
    classification changes, so punctuation between two runs stays attached to
    the earlier run.

    Args:
        text: The text to split.
        target_languages: Forwarded to ``classify_language``. When its sorted
            form is ``["en", "zh"]``, ``["en", "ja"]`` or
            ``["en", "ja", "zh"]``, each segment is additionally broken up
            with ``split_alpha_nonalpha`` before classification.

    Returns:
        A list of ``(substring, language)`` tuples whose substrings concatenate
        back to ``text``. If no segment gets classified the result is
        ``[(text, "")]``.

    Raises:
        ValueError: If a segment cannot be located in ``text`` at or after the
            current cursor.
    """
    # Punctuation that delimits the segments handed to the classifier.
    # The double quote in the second fragment must stay escaped: unescaped it
    # terminates the raw string, and the rest of that line (``|``, ``~`` and
    # the CJK brackets/commas) silently becomes a comment instead of being
    # part of the character class.  ``[`` is escaped as well so the class
    # reads the same under engines that support nested sets.
    pattern = (
        r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
        r"\!?\。\"#$%&'()*+,-/:;<=>@\[\]^_`{|}~⦅⦆「」、、〃》「」"
        r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
    )
    segments = re.split(pattern, text)

    if target_languages is not None:
        sorted_target_languages = sorted(target_languages)
        if sorted_target_languages in [["en", "zh"], ["en", "ja"], ["en", "ja", "zh"]]:
            # For these language sets, refine every segment further.
            segments = [
                piece
                for segment in segments
                for piece in split_alpha_nonalpha(segment)
            ]

    pre_lang = ""
    start = 0  # where the piece currently being accumulated begins
    end = 0  # end of the last segment that has been consumed
    sentences_list = []

    for segment in segments:
        if check_is_none(segment):
            continue

        lang = classify_language(segment, target_languages)

        # Jump to where this segment begins; searching from ``end`` skips any
        # identical text that was already consumed, without copying the tail.
        end = text.index(segment, end)
        if pre_lang != "" and pre_lang != lang:
            # Language changed: emit everything gathered so far.
            sentences_list.append((text[start:end], pre_lang))
            start = end
        end += len(segment)
        pre_lang = lang

    # The final run always extends to the end of the text.
    sentences_list.append((text[start:], pre_lang))
    return sentences_list
def sentence_split(text: str, max: int) -> list:
    """Cut ``text`` into chunks that end on punctuation.

    A chunk is closed at the end of a punctuation run as soon as it spans at
    least ``max`` characters (punctuation included). Closed chunks are
    stripped of surrounding whitespace. Text remaining after the last cut is
    appended unchanged as the final chunk.

    Args:
        text: The text to split.
        max: Minimum length a chunk must reach before it is closed.

    Returns:
        The list of chunks, in order. Empty when ``text`` is empty.
    """
    pattern = r"[!(),—+\-.:;??。,、;:]+"
    chunks = []
    chunk_start = 0
    # Walk over every punctuation run; its end is the only place a cut may fall.
    for separator in re.finditer(pattern, text):
        cut = separator.end()
        if cut - chunk_start >= max:
            chunks.append(text[chunk_start:cut].strip())
            chunk_start = cut
    # Append whatever text is left after the last cut.
    if chunk_start < len(text):
        chunks.append(text[chunk_start:])
    return chunks
def sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None):
    """Split ``text`` into chunks and wrap each one in language tags.

    Args:
        text: The text to process.
        max: Chunk length passed to ``sentence_split``. A value ``<= 0``
            disables splitting and the whole text is treated as one chunk.
        lang: ``"auto"`` tags each chunk with ``markup_language``; ``"mix"``
            returns the text untouched as a single element; any other value
            is upper-cased and used as a fixed ``[LANG]...[LANG]`` wrapper.
            Matching is case-insensitive.
        speaker_lang: Languages the speaker supports; also forwarded to
            ``markup_language``. When it holds exactly one entry, that entry
            replaces ``lang``.

    Returns:
        The list of marked-up chunks.
    """
    # The speaker supports a single language only: use it.
    # NOTE(review): the override is applied for "auto"/"mix" as well; only the
    # debug message is limited to an explicit, mismatching lang - confirm.
    if speaker_lang is not None and len(speaker_lang) == 1:
        only_lang = speaker_lang[0]
        if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != only_lang:
            logging.debug(
                f'lang "{lang}" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}'
            )
        lang = only_lang

    lang_code = lang.upper()

    if lang_code == "MIX":
        # Mixed-language input is passed through as-is.
        sentences_list = [text]
    else:
        if max <= 0:
            pieces = [text]
        else:
            pieces = [
                piece
                for piece in sentence_split(text, max)
                if not check_is_none(piece)
            ]

        if lang_code == "AUTO":
            sentences_list = [
                markup_language(piece, speaker_lang) for piece in pieces
            ]
        else:
            sentences_list = [
                f"[{lang_code}]{piece}[{lang_code}]" for piece in pieces
            ]

    for marked in sentences_list:
        logging.debug(marked)

    return sentences_list
if __name__ == "__main__":
    # Manual smoke run: print what each helper produces for a few samples.
    essay = "这几天心里颇不宁静。今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。我悄悄地披了大衫,带上门出去。"
    print(markup_language(essay, target_languages=None))
    print(sentence_split(essay, max=50))
    print(sentence_split_and_markup(essay, max=50, lang="auto", speaker_lang=None))

    # Chinese, Japanese and English mixed in one passage.
    trilingual = "你好,这是一段用来测试自动标注的文本。こんにちは,これは自動ラベリングのテスト用テキストです.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了GAN Duration predictor和transformer flow,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
    print(split_by_language(trilingual, ["zh", "ja", "en"]))

    # Latin and CJK characters written without separating spaces.
    unspaced = "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days"

    print(split_by_language(unspaced, ["zh", "ja", "en"]))
    # output: [('vits', 'en'), ('和', 'ja'), ('Bert-VITS', 'en'), ('2是', 'zh'), ('tts', 'en'), ('模型。花费3', 'zh'), ('days.', 'en'), ('花费3天。', 'zh'), ('Take 3 days', 'en')]

    print(split_by_language(unspaced, ["zh", "en"]))
    # output: [('vits', 'en'), ('和', 'zh'), ('Bert-VITS', 'en'), ('2是', 'zh'), ('tts', 'en'), ('模型。花费3', 'zh'), ('days.', 'en'), ('花费3天。', 'zh'), ('Take 3 days', 'en')]

    # The same sentence with spaces between the scripts.
    spaced = "vits 和 Bert-VITS2 是 tts 模型。花费 3 days. 花费 3天。Take 3 days"
    print(split_by_language(spaced, ["zh", "en"]))
    # output: [('vits ', 'en'), ('和 ', 'zh'), ('Bert-VITS2 ', 'en'), ('是 ', 'zh'), ('tts ', 'en'), ('模型。花费 ', 'zh'), ('3 days. ', 'en'), ('花费 3天。', 'zh'), ('Take 3 days', 'en')]
|